In [1]:
import sys
# Put the "functions/" directory on the import path so the project-local
# modules imported in the next cell can be found. The path is relative, so
# this presumably requires launching the notebook from the directory that
# contains "functions/" -- TODO confirm.
sys.path.append("functions/")

In [2]:
from datastore import DataStore
from searchgrid import SearchGrid
from crossvalidate import CrossValidate
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import balanced_accuracy_score
from sampleddatastore import SampledDataStore as sds
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import balanced_accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [5]:
# Cross-validation helper: the cells below give it a classifier with
# setClassifier(), call run(), and read the scores back through getMetrics().
crossvalidate = CrossValidate()

# Grid-search helper: reconfigured with setGridParameters()/setClassifier()
# and re-run by every grid-search cell that follows.
GridSpace = SearchGrid()

Let's start with a baseline model.

In [6]:
# Baseline: a decision tree with default hyper-parameters, seeded so that
# repeated runs of this cell give the same scores.
classifier = DecisionTreeClassifier(random_state=42)
crossvalidate.setClassifier(classifier)
crossvalidate.run()

# getScores() is unpacked as (F1, ROC-AUC), in that order.
baseline_metrics = crossvalidate.getMetrics()
f1, roc = baseline_metrics.getScores()
print(f"F1 score is: {f1}", f"ROC-AUC is: {roc}", sep="\n")

F1 score is: 0.20713950211366838
ROC-AUC is: 0.6098990846479329


Not bad. As with the previous classifiers, we want to try adjusting the class weights. Let's try `class_weight='balanced'` before moving on to GridSearch.

In [8]:
# Same seeded tree as the baseline, but with class_weight='balanced'.
classifier = DecisionTreeClassifier(class_weight='balanced', random_state=42)
crossvalidate.setClassifier(classifier)
crossvalidate.run()

# The score pair is unpacked as (F1, ROC-AUC).
balanced_scores = crossvalidate.getMetrics().getScores()
f1, roc = balanced_scores
print(f"F1 score is: {f1}", f"ROC-AUC is: {roc}", sep="\n")

F1 score is: 0.20095948827292112
ROC-AUC is: 0.5940319965611066


Let's use GridSearch to find the best weights.

In [9]:
# Candidate class weights, written as {class label: weight}. Going by the
# class names used for the tree plot further down, 0 is "Non gilded" and
# 1 is "gilded".
weight_grid = {
    'class_weight': [
        {0: 1, 1: 1},
        {0: 1, 1: 10},
        {0: 1, 1: 100},
        {0: 10, 1: 1},
    ],
}
classifier = DecisionTreeClassifier(random_state=42)

GridSpace.setGridParameters(weight_grid)
GridSpace.setClassifier(classifier)
GridSpace.run()

# Best parameter set plus its scores; scores[0] is F1 and scores[1] is ROC-AUC.
parameters, scores = GridSpace.getMetrics().getBestResults()
f1, roc = scores[0], scores[1]
print(f"F1 score is: {f1}")
print(f"ROC-AUC is: {roc}")
print(f"Best Parameters: {parameters}")

F1 score is: 0.23215108245048366
ROC-AUC is: 0.6257549721670259
Best Parameters: {'class_weight': {0: 10, 1: 1}}


In [10]:
# One line per parameter combination: F1 and ROC-AUC rounded to 3 decimals.
params, scores = GridSpace.getMetrics().getAllresults()
for param_set, score_pair in zip(params, scores):
    f1_rounded = round(score_pair[0], 3)
    roc_rounded = round(score_pair[1], 3)
    print(param_set, "F1: ", f1_rounded, "ROC_AUC: ", roc_rounded)

{'class_weight': {0: 1, 1: 1}} F1:  0.207 ROC_AUC:  0.61
{'class_weight': {0: 1, 1: 10}} F1:  0.211 ROC_AUC:  0.605
{'class_weight': {0: 1, 1: 100}} F1:  0.194 ROC_AUC:  0.595
{'class_weight': {0: 10, 1: 1}} F1:  0.232 ROC_AUC:  0.626


The results are somewhat surprising compared to our previous best weights (1:10 for non-gilded to gilded). Let's try experimenting with the features next; we will keep both weights for comparison.

We will also restrict the max_depth (to avoid overfitting and to limit the resources needed for training).

In [7]:
# Joint search over tree depth, the two class weightings kept for comparison,
# and the number of features considered per split.
depth_feature_grid = {
    'max_depth': [2, 4],
    'class_weight': [{0: 1, 1: 10}, {0: 10, 1: 1}],
    'max_features': [2, 4, 6, 8],
}
classifier = DecisionTreeClassifier(random_state=42)

GridSpace.setGridParameters(depth_feature_grid)
GridSpace.setClassifier(classifier)
GridSpace.run()
# Save the grid as well so it can be reused later.
GridSpace.save("DTGrid01")

# Best parameter set plus its scores; scores[0] is F1 and scores[1] is ROC-AUC.
parameters, scores = GridSpace.getMetrics().getBestResults()
f1, roc = scores[0], scores[1]
print(f"F1 score is: {f1}")
print(f"ROC-AUC is: {roc}")
print(f"Best Parameters: {parameters}")

F1 score is: 0.2825450623940841
ROC-AUC is: 0.7286708002744751
Best Parameters: {'class_weight': {0: 1, 1: 10}, 'max_depth': 4, 'max_features': 4}


In [8]:
# One line per parameter combination: F1 and ROC-AUC rounded to 3 decimals.
# NOTE(review): the earlier listing cell reaches getAllresults() through
# GridSpace.getMetrics(); here it is called on GridSpace directly -- confirm
# both entry points are intended.
params, scores = GridSpace.getAllresults()
print("\nAll Scores:")
for param_set, score_pair in zip(params, scores):
    f1_rounded = round(score_pair[0], 3)
    roc_rounded = round(score_pair[1], 3)
    print(param_set, "F1: ", f1_rounded, "ROC_AUC: ", roc_rounded)


All Scores:
{'class_weight': {0: 1, 1: 10}, 'max_depth': 2, 'max_features': 2} F1:  0.243 ROC_AUC:  0.757
{'class_weight': {0: 1, 1: 10}, 'max_depth': 2, 'max_features': 4} F1:  0.243 ROC_AUC:  0.757
{'class_weight': {0: 1, 1: 10}, 'max_depth': 2, 'max_features': 6} F1:  0.253 ROC_AUC:  0.724
{'class_weight': {0: 1, 1: 10}, 'max_depth': 2, 'max_features': 8} F1:  0.253 ROC_AUC:  0.724
{'class_weight': {0: 1, 1: 10}, 'max_depth': 4, 'max_features': 2} F1:  0.278 ROC_AUC:  0.72
{'class_weight': {0: 1, 1: 10}, 'max_depth': 4, 'max_features': 4} F1:  0.283 ROC_AUC:  0.729
{'class_weight': {0: 1, 1: 10}, 'max_depth': 4, 'max_features': 6} F1:  0.278 ROC_AUC:  0.735
{'class_weight': {0: 1, 1: 10}, 'max_depth': 4, 'max_features': 8} F1:  0.281 ROC_AUC:  0.737
{'class_weight': {0: 10, 1: 1}, 'max_depth': 2, 'max_features': 2} F1:  0.0 ROC_AUC:  0.5
{'class_weight': {0: 10, 1: 1}, 'max_depth': 2, 'max_features': 4} F1:  0.0 ROC_AUC:  0.5
{'class_weight': {0: 10, 1: 1}, 'max_depth': 2, 'max_fea

Looks like limiting the tree depth has produced results similar to our previous models. We can use graphviz to visualize the resulting tree (saved as `tree.png`).

In [10]:
from sklearn import tree
import graphviz
clf = tree.DecisionTreeClassifier(max_depth=4, class_weight={0:1, 1:10}, max_features=4)
clf = clf.fit(Data.getxTrain(), Data.getyTrain())

features_list=["ups", "comment_karma", "link_karma", "is_premium", "comment_age_days", "acc_age_days", "comment_length", 
                 "account_activity"]
tree.export_graphviz(clf, out_file='tree.dot', 
                     feature_names=features_list,  
                      class_names=['Non gilded', 'gilded'],  
                      filled=True, rounded=True,  
                      special_characters=True)
!dot -Tpng tree.dot -o tree.png #Convert dot file into png

Now that we have our best parameters, we can evaluate the results on our test set.

In [17]:
# Take the test split from the grid-search helper's data store, so the cell does
# not depend on a `Data` object that no cell in this notebook creates. It is
# fetched before GridSpace is rebound to the reloaded grid below.
test_data = GridSpace.getDataStore()
X_test, y_test = test_data.getxTest(), test_data.getyTest()

# Reload the grid saved by the depth/feature search and predict on the test set.
GridSpace = GridSpace.loadGrid("models/DTGrid01.pkl")
y_preds = GridSpace.grid.predict(X_test)

# NOTE: roc_auc_score is given hard class predictions rather than probability
# scores, which is why the value it prints equals the balanced accuracy. Pass
# predict_proba scores instead if a threshold-independent AUC is wanted.
print(f"F1 score: {f1_score(y_test, y_preds)}")
print(f"ROC_AUC score: {roc_auc_score(y_test, y_preds)}")
print(f"Balanced accuracy score: {balanced_accuracy_score(y_test, y_preds)}")
print("\n")

F1 score: 0.288021534320323
ROC_AUC score: 0.7133447487906208
Balanced accuracy score: 0.7133447487906208




Comparable results to both SVM (0.29) and Logistic Regression (0.297), before sampling. Let's apply resampling techniques, retrain the model on each resampled training set, and then evaluate each model on our test set.

In [3]:
# Create the sampled-data helper (the class is imported under the alias `sds`)
# and initialise its samplers; the resampling cell below calls its
# get*Sampled() methods to obtain each resampled training set.
SampledDataStore = sds()
SampledDataStore.initializeSamplers()

Loading Sampling Data


In [7]:
# Resampling strategies to compare. The bound methods are stored uncalled so
# that each resampled training set is only fetched inside the loop.
sampler_names = ["Random OverSampler", "SMOTE", "ADASYN", "SMOTE TOMEK", "SMOTE ENN"]
samplers = [
    SampledDataStore.getRandomSampled,
    SampledDataStore.getSMOTESampled,
    SampledDataStore.getADASYNSampled,
    SampledDataStore.getSMOTETOMEKSampled,
    SampledDataStore.getSMOTEENNSampled,
]

# Best depth/feature settings from the grid search. random_state is pinned to
# the seed used elsewhere in this notebook so the comparison is repeatable
# (max_features=4 makes the tree sample candidate features at random).
classifier = DecisionTreeClassifier(max_depth=4, max_features=4, random_state=42)

# Single-point grid, identical for every sampler.
# NOTE(review): the 1:10 class weight is applied on top of training data that
# has already been resampled -- confirm the double emphasis on class 1 is intended.
parameters = {'class_weight': [{0: 1, 1: 10}]}

data_store = GridSpace.getDataStore()
try:
    for sampler_name, get_resampled in zip(sampler_names, samplers):
        # Swap the resampled training split into the shared data store.
        X_resampled, y_resampled = get_resampled()
        data_store.setxTrain(X_resampled)
        data_store.setyTrain(y_resampled)

        GridSpace.setGridParameters(parameters)
        GridSpace.setClassifier(classifier)
        grid = GridSpace.run()
        GridSpace.save(f"DT_{sampler_name}")

        # Score on the test split held by the same data store.
        # NOTE: roc_auc_score is given hard class predictions, so the value it
        # prints coincides with the balanced accuracy.
        y_test = data_store.getyTest()
        y_preds = grid.predict(data_store.getxTest())
        print(f"{sampler_name} on test set:")
        print(f"F1 score: {f1_score(y_test, y_preds)}")
        print(f"ROC_AUC score: {roc_auc_score(y_test, y_preds)}")
        print(f"Balanced accuracy score: {balanced_accuracy_score(y_test, y_preds)}")
        print("\n")
finally:
    # Always put the original training split back, even if a sampler or a fit
    # fails part-way, so later cells never see a resampled training set.
    data_store.revertToOriginal()

Random OverSampler on test set:
F1 score: 0.007978623311128674
ROC_AUC score: 0.7195133303097084
Balanced accuracy score: 0.7195133303097084


SMOTE on test set:
F1 score: 0.030641466208476516
ROC_AUC score: 0.8768613188870834
Balanced accuracy score: 0.8768613188870835


ADASYN on test set:
F1 score: 0.03624519792884583
ROC_AUC score: 0.8907015234222002
Balanced accuracy score: 0.8907015234222002


SMOTE TOMEK on test set:
F1 score: 0.043701799485861184
ROC_AUC score: 0.9060292080658542
Balanced accuracy score: 0.9060292080658543


SMOTE ENN on test set:
F1 score: 0.030482870245481523
ROC_AUC score: 0.8976386960612805
Balanced accuracy score: 0.8976386960612805




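Although resampling raises ROC-AUC (up to 0.906 with SMOTE TOMEK), the F1 score collapses (0.044 at best, versus 0.288 without resampling). Overall, results are worse when compared to both Logistic Regression and SVM.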