In [1]:
%%time
#import data
from scipy.io import arff
import pandas as pd

#import
data = arff.loadarff('../dataSets/currentData/FinalDataFull.csv.arff')
df = pd.DataFrame(data[0])

#correct for usertype boolean type
df['UserType'] = df['UserType'].astype(int)

#dataset overview
print(df.head(5))
# print(df.columns)
print("Data Import complete")

     UserID  UserType  NumberOfTweets  numOfFollowers  numOfFollowings  \
0    6301.0         0           861.0          3071.0           3269.0   
1   10836.0         0           226.0           793.0           1949.0   
2   10997.0         0         38674.0          9644.0           1119.0   
3  633293.0         0         12718.0          6029.0           2174.0   
4  717883.0         0           873.0          7029.0           7731.0   

   lengthAboutMe  lengthUsername  NumOfAnnotation  NumOfHttp  \
0          132.0             8.0         0.078978   0.163763   
1          134.0             9.0         0.261062   0.575221   
2          158.0            12.0         0.002870   0.001370   
3          121.0            11.0         0.007155   0.005032   
4           70.0             6.0         0.024055   0.158076   

   avgLengthOfTweets  totalNumOfUniqWords  
0           2.904762               1229.0  
1           7.955752                947.0  
2           0.083131               137

In [2]:
%%time
#decision tree
from sklearn.model_selection import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

#partition
y = df['UserType']
X = df.drop(['UserType'], axis=1)
X_mat = X.as_matrix()
X_train, X_test, y_train, y_test = train_test_split(X_mat, y, test_size=0.4, random_state=42, stratify=y)

#fit model via gridsearchCV
params = {'criterion': ['gini', 'entropy'],
          'splitter': ['best','random'],
          'max_depth': list(range(3, 10)),
          'min_samples_leaf': list(range(20, 60, 10))}

model = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=params, cv=10, n_jobs=-1)
model.fit(X_train, y_train)

#retrieve train/test accuracy
print("Train accuracy:", model.score(X_train, y_train))
print("Test accuracy:", model.score(X_test, y_test))

#classification report and confusion matrix
y_pred = model.predict(X_test)
print("Classification report:\n", classification_report(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

print("Decision Tree Complete")



Train accuracy: 0.946865335957
Test accuracy: 0.939518072289
Classification report:
              precision    recall  f1-score   support

          0       0.94      0.94      0.94      8889
          1       0.93      0.94      0.93      7711

avg / total       0.94      0.94      0.94     16600

Confusion matrix:
 [[8375  514]
 [ 490 7221]]
Decision Tree Complete
CPU times: user 2.15 s, sys: 528 ms, total: 2.68 s
Wall time: 28 s


In [3]:
%%time
#Random Forest ensemble
from sklearn.ensemble import RandomForestClassifier

#fit model via gridsearchCV
params = {'n_estimators': list(range(1,10)),
          'criterion': ['gini', 'entropy'],
          'max_depth': list(range(3, 10)),
          'min_samples_leaf': list(range(20, 60, 10))}

model = GridSearchCV(estimator=RandomForestClassifier(), param_grid=params, cv=10, n_jobs=-1)
model.fit(X_train, y_train)

#retrieve train/test accuracy
print("Train accuracy:", model.score(X_train, y_train))
print("Test accuracy:", model.score(X_test, y_test))

#classification report and confusion matrix
y_pred = model.predict(X_test)
print("Classification report:\n", classification_report(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

print("Random Forest Complete")

Train accuracy: 0.946785011446
Test accuracy: 0.943795180723
Classification report:
              precision    recall  f1-score   support

          0       0.95      0.95      0.95      8889
          1       0.94      0.94      0.94      7711

avg / total       0.94      0.94      0.94     16600

Confusion matrix:
 [[8424  465]
 [ 468 7243]]
Random Forest Complete
CPU times: user 20.8 s, sys: 1.21 s, total: 22 s
Wall time: 5min 7s


In [4]:
%%time
#Adaboost ensemble
from sklearn.ensemble import AdaBoostClassifier

#fit model via gridsearchCV, default classifier is decision tree
params = {'n_estimators': list(range(1,10)),
          'learning_rate': [pow(10, x) for x in range(-6, 3)],
          'random_state': [42]}

model = GridSearchCV(estimator=AdaBoostClassifier(), param_grid=params, cv=10, n_jobs=-1)
model.fit(X_train, y_train)

#retrieve train/test accuracy
print("Train accuracy:", model.score(X_train, y_train))
print("Test accuracy:", model.score(X_test, y_test))

#classification report and confusion matrix
y_pred = model.predict(X_test)
print("Classification report:\n", classification_report(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

print("Adaboost Decision Tree Complete")

  (estimator_weight < 0)))
  sample_weight /= sample_weight_sum
  (estimator_weight < 0)))
  sample_weight /= sample_weight_sum
  (estimator_weight < 0)))
  sample_weight /= sample_weight_sum
  proba[proba < np.finfo(proba.dtype).eps] = np.finfo(proba.dtype).eps
  proba[proba < np.finfo(proba.dtype).eps] = np.finfo(proba.dtype).eps
  return self.classes_.take(pred > 0, axis=0)
  proba[proba < np.finfo(proba.dtype).eps] = np.finfo(proba.dtype).eps
  proba[proba < np.finfo(proba.dtype).eps] = np.finfo(proba.dtype).eps
  proba[proba < np.finfo(proba.dtype).eps] = np.finfo(proba.dtype).eps
  proba[proba < np.finfo(proba.dtype).eps] = np.finfo(proba.dtype).eps
  return self.classes_.take(pred > 0, axis=0)
  (estimator_weight < 0)))
  return self.classes_.take(pred > 0, axis=0)
  sample_weight /= sample_weight_sum
  (estimator_weight < 0)))
  (estimator_weight < 0)))
  sample_weight /= sample_weight_sum
  proba[proba < np.finfo(proba.dtype).eps] = np.finfo(proba.dtype).eps
  sample_weight /=

Train accuracy: 0.931402867585
Test accuracy: 0.931204819277
Classification report:
              precision    recall  f1-score   support

          0       0.93      0.94      0.94      8889
          1       0.93      0.92      0.93      7711

avg / total       0.93      0.93      0.93     16600

Confusion matrix:
 [[8350  539]
 [ 603 7108]]
Adaboost Decision Tree Complete
CPU times: user 3.29 s, sys: 292 ms, total: 3.58 s
Wall time: 36.2 s


In [5]:
%%time
#SVC - utlizing libsvm for training efficiency
from sklearn.svm import LinearSVC

#fit model via gridsearchCV
params = {'C': [pow(10, x) for x in range(-4, 2)],
          'random_state': [42],
          'max_iter': [x for x in range(300, 1000, 100)]}

model = GridSearchCV(estimator=LinearSVC(), param_grid=params, cv=10, n_jobs=-1)
model.fit(X_train, y_train)

#retrieve train/test accuracy
print("Train accuracy:", model.score(X_train, y_train))
print("Test accuracy:", model.score(X_test, y_test))

#classification report and confusion matrix
y_pred = model.predict(X_test)
print("Classification report:\n", classification_report(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

print("Linear SVC Complete")

Train accuracy: 0.781316518736
Test accuracy: 0.78734939759
Classification report:
              precision    recall  f1-score   support

          0       0.78      0.84      0.81      8889
          1       0.79      0.73      0.76      7711

avg / total       0.79      0.79      0.79     16600

Confusion matrix:
 [[7430 1459]
 [2071 5640]]
Linear SVC Complete
CPU times: user 6.3 s, sys: 268 ms, total: 6.57 s
Wall time: 9min 2s
