In [1]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

import metrics_summary as ms

In [2]:
# Notebook-wide seed, named so every stochastic step can refer to one value.
SEED = 170

# Seeds NumPy's legacy global RNG. This makes a fresh Restart & Run All
# repeatable, but any code drawing from the global generator still depends on
# cell execution order; stochastic components should also receive an explicit
# random_state.
np.random.seed(SEED)

In [3]:
# Load the pre-split feature tables; these stay as DataFrames.
X_train = pd.read_csv("data/X_train.csv")
X_test = pd.read_csv("data/X_test.csv")

# read_csv returns a DataFrame even for the target files, so flatten each one
# into a 1-D array before handing it to the estimators.
y_train = pd.read_csv("data/y_train.csv").to_numpy().ravel()
y_test = pd.read_csv("data/y_test.csv").to_numpy().ravel()

# ravel() flattens *every* column: a target file that carries an extra column
# (e.g. a saved index) would silently produce a vector of the wrong length with
# labels interleaved with other values. Fail loudly here instead.
assert len(X_train) == len(y_train), (
    f"X_train has {len(X_train)} rows but y_train has {len(y_train)} values"
)
assert len(X_test) == len(y_test), (
    f"X_test has {len(X_test)} rows but y_test has {len(y_test)} values"
)

## Decision Trees

In [4]:
# Baseline: an unweighted decision tree with default hyperparameters.
# random_state is pinned (same value as the notebook-wide seed set earlier) so
# that re-running this cell on its own rebuilds the same tree instead of
# depending on how far the global NumPy RNG has already advanced.
dt = DecisionTreeClassifier(random_state=170)
dt.fit(X_train, y_train)

# Held-out evaluation on the test split.
y_pred = dt.predict(X_test)
ms.metrics_summary(y_test, y_pred)

# Cross-validated summary computed from the training split only.
# NOTE(review): in the recorded output the "Average F1 Score" is identical to
# the "Average Accuracy Score", and the per-fold array printed under it averages
# to the accuracy, not to a plausible F1 — crossval_summary probably reports the
# wrong scorer for F1; verify in metrics_summary.
ms.crossval_summary(dt, X_train, y_train)

[[46151  4587]
 [ 4071  1928]]
Kappa Score: 0.22254199769939165
Accuracy Score: 0.8474011667871054
Precision: 0.2959324635456638
Recall: 0.3213868978163027
F1 Score: 0.30813488892440466
AUC Score: 0.6154906423331976
Average Accuracy Score: 0.846773844577717
Average Precision Score: 0.29587501987953524
Average Recall Score: 0.32552183614655106
Average F1 Score: 0.846773844577717
[0.84450135 0.84685808 0.84896211]
Average AUC Score: 0.6169589062635611
[0.61409755 0.61600529 0.62077388]


#### Decision Trees Using Cost-Sensitive Learning

In [5]:
# Cost-sensitive variant: class_weight="balanced" makes the tree weight samples
# inversely to their class frequency, so no resampling is needed.
# random_state is pinned (same value as the notebook-wide seed set earlier) so
# the cell gives the same tree regardless of what ran before it.
dt_cs = DecisionTreeClassifier(class_weight="balanced", random_state=170)
dt_cs.fit(X_train, y_train)

# Held-out evaluation on the test split.
y_pred = dt_cs.predict(X_test)
ms.metrics_summary(y_test, y_pred)

# Cross-validated summary computed from the training split only.
ms.crossval_summary(dt_cs, X_train, y_train)

[[46598  4140]
 [ 4153  1846]]
Kappa Score: 0.226338549699016
Accuracy Score: 0.8538343585314697
Precision: 0.308386234547277
Recall: 0.30771795299216537
F1 Score: 0.3080517313308302
AUC Score: 0.6130611523800356
Average Accuracy Score: 0.8518574432652137
Average Precision Score: 0.29981135171558004
Average Recall Score: 0.3004431807514379
Average F1 Score: 0.8518574432652137
[0.85127694 0.85295384 0.85134155]
Average AUC Score: 0.6087442640138545
[0.60720876 0.6089227  0.61010133]


#### Decision Trees Using SMOTE

In [6]:
# Oversample the minority class with SMOTE before fitting the tree.
# The sampler lives inside an imblearn Pipeline rather than being applied to
# X_train up front, so resampling happens as part of fit() (and therefore
# separately for each cross-validation training fold) while predict() sees the
# data unmodified.
# Both stochastic steps get an explicit random_state (same value as the
# notebook-wide seed set earlier): SMOTE synthesises samples randomly and the
# tree breaks ties randomly, so without it the cell is not repeatable on its own.
pipeline = Pipeline(
    steps=[
        ("over", SMOTE(random_state=170)),
        ("model", DecisionTreeClassifier(random_state=170)),
    ]
)
pipeline.fit(X_train, y_train)

# Held-out evaluation on the (un-resampled) test split.
y_pred = pipeline.predict(X_test)
ms.metrics_summary(y_test, y_pred)

# Cross-validated summary computed from the training split only.
ms.crossval_summary(pipeline, X_train, y_train)

[[45970  4768]
 [ 4061  1938]]
Kappa Score: 0.21776591371330956
Accuracy Score: 0.8443872605178279
Precision: 0.2889949299135103
Recall: 0.3230538423070512
F1 Score: 0.30507674144037783
AUC Score: 0.6145404415918558
Average Accuracy Score: 0.8436390501649819
Average Precision Score: 0.2866579825470721
Average Recall Score: 0.32180659095840825
Average F1 Score: 0.8436390501649819
[0.84357225 0.8444107  0.84293419]
Average AUC Score: 0.6135682131786215
[0.61131036 0.61425891 0.61513537]
