In [25]:
#utility functions
from scipy.io import arff
import pandas as pd
from sklearn.model_selection import cross_val_score, ShuffleSplit, train_test_split
from sklearn.metrics import classification_report, confusion_matrix

def import_data(path='../dataSets/currentData/FinalDataFull.csv.arff'):
    """Load an ARFF data set into a DataFrame with an integer ``UserType`` column.

    Parameters
    ----------
    path : str or file-like, optional
        ARFF source handed straight to ``scipy.io.arff.loadarff``. Defaults to
        the project data set, so a bare ``import_data()`` call behaves exactly
        as before.

    Returns
    -------
    pandas.DataFrame
        One row per ARFF record, with ``UserType`` cast to ``int``.
    """
    # loadarff returns a (records, metadata) pair; only the records are used
    records, _meta = arff.loadarff(path)
    df = pd.DataFrame(records)

    #correct for usertype boolean type
    df['UserType'] = df['UserType'].astype(int)

    # TODO: "correct floating points to ints" was planned here but never
    # implemented; all other columns are returned with the dtypes loadarff gave them.

    return df

#entire x/y partition for cross fold validation
def course_split(df):
    """Separate the feature columns from the ``UserType`` target.

    Returns a ``(features, target)`` pair: a copy of ``df`` without the
    ``UserType`` column, and the ``UserType`` column itself. ``df`` is not
    modified.
    """
    target = df['UserType']
    features = df.drop(labels=['UserType'], axis='columns')
    return features, target

#x/y splits further partitioned into train/test
def fine_split(df):
    """Split ``df`` into stratified train/test partitions of features and labels.

    The ``UserType`` column is the label; every other column is a feature.
    40% of the rows go to the test partition, the split is seeded
    (``random_state=42``) and stratified on the label.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set containing a ``UserType`` column.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` as returned by
        ``train_test_split``; the feature parts are NumPy arrays.
    """
    #partition data
    y = df['UserType']
    X = df.drop(['UserType'], axis=1)
    # DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
    # .values gives the same ndarray on every pandas version.
    X_mat = X.values
    return train_test_split(X_mat, y, test_size=0.4, random_state=42, stratify=y)

#evaluate models performance using kfold cross validation
def kfold(model, model_name, X, y):
    """Print the cross-validated accuracy of ``model`` over all of ``X``/``y``.

    Scores come from 10 seeded random shuffles, each holding out 40% of the
    rows. The printed line shows the mean score and twice the standard
    deviation of the per-split scores. Nothing is returned.

    NOTE(review): ShuffleSplit does not stratify by class, whereas fine_split
    does; the original author left this as an open question - confirm intent.
    """
    splitter = ShuffleSplit(n_splits=10, test_size=0.4, random_state=42)
    fold_scores = cross_val_score(model, X, y, cv=splitter)

    # summarise the per-split scores as "mean (+/- 2 * std)"
    mean_score = fold_scores.mean()
    spread = fold_scores.std() * 2
    print("{0} Accuracy: {1:.2f} (+/- {2:.2f})".format(model_name, mean_score, spread))
    
#evaluate models performance using classification report and confusion matrix
def metrics(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training partition and print test-set diagnostics.

    The model is fitted in place on ``X_train``/``y_train``, then its
    predictions for ``X_test`` are compared with ``y_test`` and printed as a
    classification report followed by a confusion matrix. Nothing is returned.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    report = classification_report(y_test, predicted)
    print("Classification report:\n", report)

    matrix = confusion_matrix(y_test, predicted)
    print("Confusion matrix:\n", matrix)

In [26]:
%%time
#pseudo main
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import LinearSVC

# Seed every estimator so that Restart & Run All reproduces the same scores;
# the CV splitter in kfold and the train/test split in fine_split already use 42.
SEED = 42

models = [
    DecisionTreeClassifier(random_state=SEED),
    RandomForestClassifier(random_state=SEED),
    AdaBoostClassifier(random_state=SEED),
    LinearSVC(random_state=SEED),
]
# display names, kept in the same order as `models`
model_names = ["Decision Tree", "Random Forest", "Adaboost", "LinearSVC"]

#import, partition data
df = import_data()
X,y = course_split(df)
X_train,X_test,y_train,y_test = fine_split(df)

#evaluate all models
for model, model_name in zip(models, model_names):
    # cross-validated accuracy on the full data set
    kfold(model, model_name, X, y)
    # classification report / confusion matrix on the held-out test partition
    metrics(model,X_train,X_test,y_train,y_test)

Decision Tree Accuracy: 0.92 (+/- 0.00)
Classification report:
              precision    recall  f1-score   support

          0       0.92      0.92      0.92      8889
          1       0.91      0.90      0.91      7711

avg / total       0.91      0.91      0.91     16600

Confusion matrix:
 [[8202  687]
 [ 735 6976]]
Random Forest Accuracy: 0.94 (+/- 0.00)
Classification report:
              precision    recall  f1-score   support

          0       0.94      0.96      0.95      8889
          1       0.95      0.93      0.94      7711

avg / total       0.94      0.94      0.94     16600

Confusion matrix:
 [[8507  382]
 [ 556 7155]]
Adaboost Accuracy: 0.94 (+/- 0.00)
Classification report:
              precision    recall  f1-score   support

          0       0.95      0.95      0.95      8889
          1       0.94      0.94      0.94      7711

avg / total       0.94      0.94      0.94     16600

Confusion matrix:
 [[8424  465]
 [ 474 7237]]
LinearSVC Accuracy: 0.72 (+/- 