In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import metrics_summary as ms

In [2]:
np.random.seed(170)

In [3]:
X_train = pd.read_csv("data/X_train.csv")
X_test = pd.read_csv("data/X_test.csv")
y_train = pd.read_csv("data/y_train.csv").values.ravel()
y_test = pd.read_csv("data/y_test.csv").values.ravel()

## SMOTE

In [4]:
# implement SMOTE to oversample the minority class
from imblearn.over_sampling import SMOTE
os = SMOTE(random_state=0)
os_X, os_y = os.fit_resample(X_train, y_train)
print(os_X.shape, os_y.shape)

(236866, 251) (236866,)


## Decision Trees

In [5]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
ms.metrics_summary(y_test, y_pred)
ms.crossval_summary(dt, X_train, y_train)

[[46026  4669]
 [ 4130  1912]]
Kappa Score: 0.21587050210461334
Accuracy Score: 0.8449160160036661
Precision: 0.29053335359367877
Recall: 0.3164515061238001
F1 Score: 0.302939079458132
AUC Score: 0.6121758467595033
Average Accuracy Score: 0.8483298798530517
Average Precision Score: 0.29821508906636246
Average Recall Score: 0.32423157383756546
Average F1 Score: 0.8483298798530517
[0.84885565 0.85100276 0.84412887 0.84888771 0.84877441]
Average AUC Score: 0.6171536621158582
[0.61683026 0.62249708 0.6123312  0.62001025 0.61409952]


#### Decision Trees Using Cost-Sensitive Learning

In [6]:
dt_cs = DecisionTreeClassifier(class_weight="balanced")
dt_cs.fit(X_train, y_train)
y_pred = dt_cs.predict(X_test)
ms.metrics_summary(y_test, y_pred)
ms.crossval_summary(dt_cs, X_train, y_train)

[[46478  4217]
 [ 4128  1914]]
Kappa Score: 0.23209311588869008
Accuracy Score: 0.8529178490226836
Precision: 0.31218398303702494
Recall: 0.31678252234359483
F1 Score: 0.31446644212601665
AUC Score: 0.6167993882060218
Average Accuracy Score: 0.8526354910495095
Average Precision Score: 0.2997101585748312
Average Recall Score: 0.29778489229974997
Average F1 Score: 0.8526354910495095
[0.85051741 0.85217358 0.8498697  0.85610152 0.85451524]
Average AUC Score: 0.6078945979391864
[0.60132303 0.60655065 0.6041563  0.6145599  0.61288311]


#### Decision Trees Using SMOTE

In [7]:
dt_os = DecisionTreeClassifier()
dt_os.fit(os_X, os_y)
y_pred = dt_os.predict(X_test)
ms.metrics_summary(y_test, y_pred)
ms.crossval_summary(dt_os, os_X, os_y)

[[45959  4736]
 [ 4073  1969]]
Kappa Score: 0.2217476643094085
Accuracy Score: 0.8447397641750534
Precision: 0.2936614466815809
Recall: 0.325885468387951
F1 Score: 0.30893543578881305
AUC Score: 0.6162320132155753
Average Accuracy Score: 0.899226483520674
Average Precision Score: 0.9185622459768936
Average Recall Score: 0.8856249436837462
Average F1 Score: 0.899226483520674
[0.71898088 0.94389209 0.94378655 0.94450425 0.94496865]
Average AUC Score: 0.8992264882240983
[0.71898088 0.94389319 0.94378761 0.94450318 0.94496759]
