In [1]:
import pandas as pd
import numpy as np
import joblib

import metrics_summary as ms

In [2]:
# Seed NumPy's legacy global RNG once, before any model is built, so that a
# Restart Kernel -> Run All starts every stochastic step from the same state.
np.random.seed(seed=170)

In [3]:
# Load the pre-split feature tables and label vectors.
# The directory is relative to the notebook's working directory; it is named
# once here instead of being repeated in every path.
DATA_DIR = "../data"

X_train = pd.read_csv(f"{DATA_DIR}/X_train.csv")
X_test = pd.read_csv(f"{DATA_DIR}/X_test.csv")

# The label files are flattened to 1-D arrays, which is the form passed to
# the estimators and metric helpers in the cells below.
y_train = pd.read_csv(f"{DATA_DIR}/y_train.csv").to_numpy().ravel()
y_test = pd.read_csv(f"{DATA_DIR}/y_test.csv").to_numpy().ravel()

# ravel() flattens *every* column of the label file, so an unexpected extra
# column (e.g. a saved index) would silently yield a label vector of the wrong
# length. Fail fast here rather than deep inside a fit/score call.
assert len(X_train) == len(y_train), (
    f"X_train has {len(X_train)} rows but y_train has {len(y_train)} labels"
)
assert len(X_test) == len(y_test), (
    f"X_test has {len(X_test)} rows but y_test has {len(y_test)} labels"
)
assert list(X_train.columns) == list(X_test.columns), (
    "X_train and X_test do not share the same columns in the same order"
)

## Decision Trees

In [4]:
from sklearn.tree import DecisionTreeClassifier
# Baseline: a decision tree with default hyper-parameters, fitted on the
# training data as loaded (no re-weighting or resampling).
# random_state is pinned on the estimator itself so this cell gives the same
# tree when re-run on its own, instead of depending on how much of the global
# NumPy random stream earlier cells have already consumed.
dt = DecisionTreeClassifier(random_state=170)
dt.fit(X_train, y_train)

# Hard label predictions, plus column 1 of predict_proba (the probability of
# the second entry in dt.classes_) for the probability-based metrics.
y_pred = dt.predict(X_test)
y_prob = dt.predict_proba(X_test)[:, 1]
ms.metrics_summary(y_test, y_pred, y_prob)

# NOTE(review): in this cell's saved output, crossval_summary prints an
# "Average F1 Score" that is identical to the "Average Accuracy Score" and far
# from the held-out F1 printed by metrics_summary. That looks like the helper
# reporting accuracy under the F1 label -- verify in the metrics_summary module.
ms.crossval_summary(dt, X_train, y_train)

# Persist the fitted model to the current working directory. Left as the last
# expression so the list of written file names is displayed as the cell output.
joblib.dump(dt, "dt.pkl")

[[46235  4503]
 [ 4076  1923]]
Kappa Score: 0.22475040008125913
Accuracy Score: 0.8487935562331459
Precision: 0.2992530345471522
Recall: 0.3205534255709285
F1 Score: 0.30953722334004025
AUC Score: 0.6159016881490971
Average Accuracy Score: 0.8468116151860906
Average Precision Score: 0.29340572583814084
Average Recall Score: 0.3187337824733323
Average F1 Score: 0.8468116151860906
[0.84432359 0.84511085 0.84896325 0.84631945 0.84934094]
Average AUC Score: 0.6139872138990532
[0.61347818 0.61017736 0.61894704 0.60880535 0.61852813]


['dt.pkl']

#### Decision Trees Using Cost-Sensitive Learning

In [5]:
# Cost-sensitive variant: class_weight="balanced" asks scikit-learn to weight
# the classes when fitting the tree; everything else matches the default tree.
# random_state is pinned on the estimator so this cell gives the same tree
# when re-run on its own, rather than depending on the state of the global
# NumPy random stream at the moment the cell executes.
dt_cs = DecisionTreeClassifier(class_weight="balanced", random_state=170)
dt_cs.fit(X_train, y_train)

# Hard label predictions, plus column 1 of predict_proba (the probability of
# the second entry in dt_cs.classes_) for the probability-based metrics.
y_pred = dt_cs.predict(X_test)
y_prob = dt_cs.predict_proba(X_test)[:, 1]
ms.metrics_summary(y_test, y_pred, y_prob)

# NOTE(review): in this cell's saved output, the "Average F1 Score" printed by
# crossval_summary equals the "Average Accuracy Score" exactly, which suggests
# the helper is scoring accuracy under the F1 label -- verify in the
# metrics_summary module before quoting that number.
ms.crossval_summary(dt_cs, X_train, y_train)

# Persist the fitted model to the current working directory. Left as the last
# expression so the list of written file names is displayed as the cell output.
joblib.dump(dt_cs, "dt_cs.pkl")

[[46549  4189]
 [ 4179  1820]]
Kappa Score: 0.22066033430091947
Accuracy Score: 0.8525124698168743
Precision: 0.3028790148111167
Recall: 0.3033838973162194
F1 Score: 0.3031312458361093
AUC Score: 0.6104112517445537
Average Accuracy Score: 0.8509585690402914
Average Precision Score: 0.2967047595220204
Average Recall Score: 0.2990855662736692
Average F1 Score: 0.8509585690402914
[0.8498376  0.85221135 0.84862333 0.85292896 0.8511916 ]
Average AUC Score: 0.6076431294498679
[0.60632665 0.60926422 0.59906717 0.60998047 0.61357714]


['dt_cs.pkl']

#### Decision Trees Using SMOTE

In [6]:
# implement SMOTE to oversample the minority class
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
# Oversampling variant: SMOTE followed by a default decision tree, wrapped in
# an imblearn Pipeline so the sampler and the model are fitted, evaluated and
# saved as one estimator.
# Both steps are stochastic, so each gets an explicit random_state. That makes
# this cell give the same result when re-run on its own, rather than depending
# on the state of the global NumPy random stream when the cell executes.
dt_os = Pipeline(
    steps=[
        ("over", SMOTE(random_state=170)),
        ("model", DecisionTreeClassifier(random_state=170)),
    ]
)
dt_os.fit(X_train, y_train)

# Hard label predictions, plus column 1 of predict_proba (the probability of
# the second entry in the fitted model's classes_) for the probability metrics.
y_pred = dt_os.predict(X_test)
y_prob = dt_os.predict_proba(X_test)[:, 1]
ms.metrics_summary(y_test, y_pred, y_prob)

# NOTE(review): in this cell's saved output, the "Average F1 Score" printed by
# crossval_summary equals the "Average Accuracy Score" exactly, which suggests
# the helper is scoring accuracy under the F1 label -- verify in the
# metrics_summary module before quoting that number.
ms.crossval_summary(dt_os, X_train, y_train)

# Persist the whole fitted pipeline to the current working directory. Left as
# the last expression so the list of written file names is the cell output.
joblib.dump(dt_os, "dt_os.pkl")

[[45951  4787]
 [ 4040  1959]]
Kappa Score: 0.22012248299024406
Accuracy Score: 0.8444225108835505
Precision: 0.29039430773791874
Recall: 0.32655442573762294
F1 Score: 0.30741467242055703
AUC Score: 0.616103496916271
Average Accuracy Score: 0.8460109276606707
Average Precision Score: 0.29427626802469564
Average Recall Score: 0.32637903843209315
Average F1 Score: 0.8460109276606707
[0.84334164 0.84798127 0.84813234 0.84597953 0.84461986]
Average AUC Score: 0.6169102726580604
[0.61497606 0.61540515 0.61895503 0.61507356 0.62014155]


['dt_os.pkl']