In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import metrics_summary as ms

In [10]:
np.random.seed(170)

In [11]:
X_train = pd.read_csv("data/X_train.csv")
X_test = pd.read_csv("data/X_test.csv")
y_train = pd.read_csv("data/y_train.csv").values.ravel()
y_test = pd.read_csv("data/y_test.csv").values.ravel()

## SMOTE

In [12]:
# implement SMOTE to oversample the minority class
from imblearn.over_sampling import SMOTE
os = SMOTE(random_state=0)
os_X, os_y = os.fit_resample(X_train, y_train)
print(os_X.shape, os_y.shape)

(236866, 53) (236866,)


## Decision Trees

In [13]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
ms.metrics_summary(y_test, y_pred)
ms.crossval_summary(dt, X_train, y_train)

[[50095   600]
 [  814  5228]]
Kappa Score: 0.866964435108484
Accuracy Score: 0.9750779914341612
Precision: 0.8970487302676733
Recall: 0.8652763985435287
F1 Score: 0.8808761583824769
AUC Score: 0.9267204559045683
Average Accuracy Score: 0.9749973604894542
Average Precision Score: 0.8901947017035505
Average Recall Score: 0.8701349832860101
Average F1 Score: 0.9749973604894542
Average AUC Score: 0.928743237447193


0.928743237447193

#### Decision Trees Using Cost-Sensitive Learning

In [14]:
dt_cs = DecisionTreeClassifier(class_weight="balanced")
dt_cs.fit(X_train, y_train)
y_pred = dt_cs.predict(X_test)
ms.metrics_summary(y_test, y_pred)
ms.crossval_summary(dt_cs, X_train, y_train)

[[50106   589]
 [  829  5213]]
Kappa Score: 0.8663307567803279
Accuracy Score: 0.9750074907027161
Precision: 0.8984832816270252
Recall: 0.8627937768950679
F1 Score: 0.8802769334684227
AUC Score: 0.9255876370420698
Average Accuracy Score: 0.9743024185647771
Average Precision Score: 0.8909239485682144
Average Recall Score: 0.8617498128264302
Average F1 Score: 0.9743024185647771
Average AUC Score: 0.9246561935599523


0.9246561935599523

#### Decision Trees Using SMOTE

In [15]:
dt_os = DecisionTreeClassifier()
dt_os.fit(os_X, os_y)
y_pred = dt_os.predict(X_test)
ms.metrics_summary(y_test, y_pred)
ms.crossval_summary(dt_os, os_X, os_y)

[[50058   637]
 [  738  5304]]
Kappa Score: 0.8717071867575602
Accuracy Score: 0.9757653735657508
Precision: 0.8927789934354485
Recall: 0.8778550148957299
F1 Score: 0.8852541099891513
AUC Score: 0.9326448365730252
Average Accuracy Score: 0.9701438248303218
Average Precision Score: 0.987182346727387
Average Recall Score: 0.9528010671243188
Average F1 Score: 0.9701438248303218
Average AUC Score: 0.9701438159155937


0.9701438159155937