In [1]:
import sys
sys.path.append("functions/")

In [2]:
from datastore import DataStore
from searchgrid import SearchGrid
from crossvalidate import CrossValidate
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import balanced_accuracy_score
from sampleddatastore import SampledDataStore as sds
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import balanced_accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
#Load object for CrossValidation
crossvalidate = CrossValidate()

#Load object for GridSearchCV
GridSpace = SearchGrid()

Let's start with a baseline model.

In [4]:
classifier = DecisionTreeClassifier(random_state=42)
crossvalidate.setClassifier(classifier)
crossvalidate.run()
f1, roc = crossvalidate.getMetrics().getScores()
print(f"F1 score is: {f1}")
print(f"ROC-AUC is: {roc}")

F1 score is: 0.17511289513296538
ROC-AUC is: 0.5899148573319357


Not bad, as with previous classifiers, we want to try adjusting the weights. Let's try balanced before moving onto to GridSearch.

In [5]:
classifier = DecisionTreeClassifier(random_state=42, class_weight='balanced')
crossvalidate.setClassifier(classifier)
crossvalidate.run()
f1, roc = crossvalidate.getMetrics().getScores()
print(f"F1 score is: {f1}")
print(f"ROC-AUC is: {roc}")

F1 score is: 0.1615720524017467
ROC-AUC is: 0.5762615873157263


Let's use GridSearch to find the best weights.

In [6]:
classifier = DecisionTreeClassifier(random_state=42)
parameters = {'class_weight':[{0:1,1:1}, {0:1,1:10}, {0:1,1:100}, {0:10,1:1}]}
GridSpace.setGridParameters(parameters)
GridSpace.setClassifier(classifier)
GridSpace.run()
parameters, scores = GridSpace.getMetrics().getBestResults()
f1 = scores[0]
roc = scores[1]
print(f"F1 score is: {f1}")
print(f"ROC-AUC is: {roc}")
print(f"Best Parameters: {parameters}")

F1 score is: 0.1771037181996086
ROC-AUC is: 0.5932689032547743
Best Parameters: {'class_weight': {0: 10, 1: 1}}


In [7]:
params, scores = GridSpace.getMetrics().getAllresults()
for i, j in zip(params, scores):
    print(i, "F1: ", round(j[0],3), "ROC_AUC: ", round(j[1],3))

{'class_weight': {0: 1, 1: 1}} F1:  0.175 ROC_AUC:  0.59
{'class_weight': {0: 1, 1: 10}} F1:  0.172 ROC_AUC:  0.586
{'class_weight': {0: 1, 1: 100}} F1:  0.163 ROC_AUC:  0.577
{'class_weight': {0: 10, 1: 1}} F1:  0.177 ROC_AUC:  0.593


The results are somewhat suprising, compared to our previous best weights (1:10 for nongilded to gilded). Let's try experimenting with the features next; we will keep both weights for comparison.

We will also restrict the max_depth (to avoid overfitting and to limit the resources needed for training).

In [13]:
classifier = DecisionTreeClassifier(random_state=42)
parameters = {'max_depth' : [2, 4],
    'class_weight':[{0:1,1:10}, {0:10,1:1}], 'max_features' : [2, 4, 6]}
GridSpace.setGridParameters(parameters)
GridSpace.setClassifier(classifier)
GridSpace.run()
GridSpace.save("DTGrid01") #Saving the grid as well, we can reuse it later
parameters, scores = GridSpace.getMetrics().getBestResults()
f1 = scores[0]
roc = scores[1]
print(f"F1 score is: {f1}")
print(f"ROC-AUC is: {roc}")
print(f"Best Parameters: {parameters}")

F1 score is: 0.264637366050629
ROC-AUC is: 0.7197968100839228
Best Parameters: {'class_weight': {0: 1, 1: 10}, 'max_depth': 2, 'max_features': 6}


In [14]:
params, scores = GridSpace.getMetrics().getAllresults()
print("\nAll Scores:")
for i, j in zip(params, scores):
    print(i, "F1: ", round(j[0],3), "ROC_AUC: ", round(j[1],3))


All Scores:
{'class_weight': {0: 1, 1: 10}, 'max_depth': 2, 'max_features': 2} F1:  0.244 ROC_AUC:  0.762
{'class_weight': {0: 1, 1: 10}, 'max_depth': 2, 'max_features': 4} F1:  0.247 ROC_AUC:  0.76
{'class_weight': {0: 1, 1: 10}, 'max_depth': 2, 'max_features': 6} F1:  0.265 ROC_AUC:  0.72
{'class_weight': {0: 1, 1: 10}, 'max_depth': 4, 'max_features': 2} F1:  0.22 ROC_AUC:  0.755
{'class_weight': {0: 1, 1: 10}, 'max_depth': 4, 'max_features': 4} F1:  0.25 ROC_AUC:  0.737
{'class_weight': {0: 1, 1: 10}, 'max_depth': 4, 'max_features': 6} F1:  0.257 ROC_AUC:  0.727
{'class_weight': {0: 10, 1: 1}, 'max_depth': 2, 'max_features': 2} F1:  0.0 ROC_AUC:  0.5
{'class_weight': {0: 10, 1: 1}, 'max_depth': 2, 'max_features': 4} F1:  0.0 ROC_AUC:  0.5
{'class_weight': {0: 10, 1: 1}, 'max_depth': 2, 'max_features': 6} F1:  0.0 ROC_AUC:  0.5
{'class_weight': {0: 10, 1: 1}, 'max_depth': 4, 'max_features': 2} F1:  0.002 ROC_AUC:  0.501
{'class_weight': {0: 10, 1: 1}, 'max_depth': 4, 'max_features':

Looks like limiting the tree depth has produced results similar to our previous models. We can use graphviz to visualize the results (saved as tree.png)

In [15]:
from sklearn import tree
import graphviz
Data = DataStore()
clf = tree.DecisionTreeClassifier(max_depth=4, class_weight={0:1, 1:10}, max_features=6)
clf = clf.fit(Data.getxTrain(), Data.getyTrain())

features_list=["comment_upvotes", "comment_karma", "link_karma", "is_premium", "comment_age", "edited_comment"]
tree.export_graphviz(clf, out_file='tree.dot', 
                     feature_names=features_list,  
                      class_names=['Non gilded', 'gilded'],  
                      filled=True, rounded=True,  
                      special_characters=True)
!dot -Tpng tree.dot -o tree.png #Convert dot file into png

Now that we have our best parameters, we can evaluate the results on our test set.

In [17]:
GridSpace = GridSpace.load("models/DTGrid01.pkl")
y_preds = GridSpace.grid.predict(Data.getxTest())
print(f"F1 score: {f1_score(Data.getyTest(), y_preds)}")
print(f"ROC_AUC score: {roc_auc_score(Data.getyTest(), y_preds)}")
print(f"Balanced accuracy score: {balanced_accuracy_score(Data.getyTest(), y_preds)}")
print("\n")

F1 score: 0.2881944444444445
ROC_AUC score: 0.6718979759057488
Balanced accuracy score: 0.671897975905749




Comparable results to both SVM (0.29) and Logistic Regression (0.297), before sampling. Let's apply resampling techniques, train the model and then evaluate them on our test set.

In [18]:
SampledDataStore = sds()
SampledDataStore.initializeSamplers()

Loading Sampling Data...


In [19]:
random = SampledDataStore.getRandomSampled
smote = SampledDataStore.getSMOTESampled
ada = SampledDataStore.getADASYNSampled
smote_tomek = SampledDataStore.getSMOTETOMEKSampled
smote_enn = SampledDataStore.getSMOTEENNSampled
samplers = [random, smote, ada, smote_tomek, smote_enn]
sampler_names = ["Random OverSampler", "SMOTE", "ADASYN", "SMOTE TOMEK", "SMOTE ENN"]

classifier = DecisionTreeClassifier(max_depth=4, max_features=4)

for i in range(len(samplers)):
    parameters = {'class_weight':[{0:1,1:10}]}
    X_resampled, y_resampled = samplers[i]()
    GridSpace.getDataStore().setxTrain(X_resampled)
    GridSpace.getDataStore().setyTrain(y_resampled) 
    GridSpace.setGridParameters(parameters)
    GridSpace.setClassifier(classifier)
    grid = GridSpace.run()
    GridSpace.save(f"DT_{sampler_names[i]}")
    y_preds = grid.predict(GridSpace.getDataStore().getxTest())
    print(f"{sampler_names[i]} on test set:")
    print(f"F1 score: {f1_score(GridSpace.getDataStore().getyTest(), y_preds)}")
    print(f"ROC_AUC score: {roc_auc_score(GridSpace.getDataStore().getyTest(), y_preds)}")
    print(f"Balanced accuracy score: {balanced_accuracy_score(GridSpace.getDataStore().getyTest(), y_preds)}")
    print("\n")

GridSpace.getDataStore().revertToOriginal()

Random OverSampler on test set:
F1 score: 0.03627962693591169
ROC_AUC score: 0.8964295492796688
Balanced accuracy score: 0.8964295492796688


SMOTE on test set:
F1 score: 0.04801623083859333
ROC_AUC score: 0.9098513536910428
Balanced accuracy score: 0.9098513536910428


ADASYN on test set:
F1 score: 0.03742166517457476
ROC_AUC score: 0.8922491423670668
Balanced accuracy score: 0.8922491423670669


SMOTE TOMEK on test set:
F1 score: 0.04928909952606635
ROC_AUC score: 0.9011539793362808
Balanced accuracy score: 0.9011539793362807


SMOTE ENN on test set:
F1 score: 0.058240946045824096
ROC_AUC score: 0.8849373183066379
Balanced accuracy score: 0.884937318306638




In [20]:
random = SampledDataStore.getRandomSampled
smote = SampledDataStore.getSMOTESampled
ada = SampledDataStore.getADASYNSampled
smote_tomek = SampledDataStore.getSMOTETOMEKSampled
smote_enn = SampledDataStore.getSMOTEENNSampled
samplers = [random, smote, ada, smote_tomek, smote_enn]
sampler_names = ["Random OverSampler", "SMOTE", "ADASYN", "SMOTE TOMEK", "SMOTE ENN"]

classifier = DecisionTreeClassifier(max_depth=4, max_features=6)

for i in range(len(samplers)):
    parameters = {'class_weight':[{0:1,1:10}]}
    X_resampled, y_resampled = samplers[i]()
    GridSpace.getDataStore().setxTrain(X_resampled)
    GridSpace.getDataStore().setyTrain(y_resampled) 
    GridSpace.setGridParameters(parameters)
    GridSpace.setClassifier(classifier)
    grid = GridSpace.run()
    GridSpace.save(f"DT_{sampler_names[i]}")
    y_preds = grid.predict(GridSpace.getDataStore().getxTest())
    print(f"{sampler_names[i]} on test set:")
    print(f"F1 score: {f1_score(GridSpace.getDataStore().getyTest(), y_preds)}")
    print(f"ROC_AUC score: {roc_auc_score(GridSpace.getDataStore().getyTest(), y_preds)}")
    print(f"Balanced accuracy score: {balanced_accuracy_score(GridSpace.getDataStore().getyTest(), y_preds)}")
    print("\n")

GridSpace.getDataStore().revertToOriginal()

Random OverSampler on test set:
F1 score: 0.04449623900836953
ROC_AUC score: 0.9013062796447064
Balanced accuracy score: 0.9013062796447064


SMOTE on test set:
F1 score: 0.04999396208187417
ROC_AUC score: 0.899706824422809
Balanced accuracy score: 0.899706824422809


ADASYN on test set:
F1 score: 0.049312470365101946
ROC_AUC score: 0.9011700851190619
Balanced accuracy score: 0.901170085119062


SMOTE TOMEK on test set:
F1 score: 0.05060211653083567
ROC_AUC score: 0.9020357709435572
Balanced accuracy score: 0.9020357709435574


SMOTE ENN on test set:
F1 score: 0.05144058365277607
ROC_AUC score: 0.9025753146667311
Balanced accuracy score: 0.9025753146667311




Slightly better results than with the original features set, but still worse when compared to SVM or Logistic Regression