In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [23]:
X_train = pd.read_csv("data/X_train.csv")
X_test = pd.read_csv("data/X_test.csv")
y_train = pd.read_csv("data/y_train.csv").values.ravel()
y_test = pd.read_csv("data/y_test.csv").values.ravel()

## SMOTE

In [24]:
# implement SMOTE to oversample the minority class
from imblearn.over_sampling import SMOTE
os = SMOTE(random_state=0)
os_X, os_y = os.fit_resample(X_train, y_train)
print(os_X.shape, os_y.shape)

(236866, 52) (236866,)


In [25]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_validate

# Custom function to print the metrics of the model
def display_metrics(y_test, y_pred):
    print(confusion_matrix(y_test, y_pred))
    print(f'Accuracy Score: {accuracy_score(y_test, y_pred)}')
    print(f'Precision: {precision_score(y_test, y_pred)}')
    print(f'Recall: {recall_score(y_test, y_pred)}')
    print(f'F1 Score: {f1_score(y_test, y_pred)}')

def display_crossval_scores(model, X_train, y_train):
    scoring = ['accuracy', 'precision', 'recall', 'f1_micro']

    results = cross_validate(model, X_train, y_train, scoring=scoring, cv=5)
    print(f"Average Accuracy Score: {np.mean(results['test_accuracy'])}")
    print(f"Average Precision Score: {np.mean(results['test_precision'])}")
    print(f"Average Recall Score: {np.mean(results['test_recall'])}")
    print(f"Average F1 Score: {np.mean(results['test_f1_micro'])}")

## Decision Trees

In [26]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
display_metrics(y_test, y_pred)
display_crossval_scores(dt, X_train, y_train)

[[50073   622]
 [  811  5231]]
Accuracy Score: 0.974743112959797
Precision: 0.8937297112591833
Recall: 0.8657729228732208
F1 Score: 0.8795292139554435
Average Accuracy Score: 0.9754430303160964
Average Precision Score: 0.8902157048610781
Average Recall Score: 0.8749377222485485
Average F1 Score: 0.9754430303160964


#### Decision Trees Using Cost-Sensitive Learning

In [27]:
dt_cs = DecisionTreeClassifier(class_weight="balanced")
dt_cs.fit(X_train, y_train)
y_pred = dt_cs.predict(X_test)
display_metrics(y_test, y_pred)
display_crossval_scores(dt_cs, X_train, y_train)

[[50092   603]
 [  837  5205]]
Accuracy Score: 0.9746197366797681
Precision: 0.8961776859504132
Recall: 0.8614697120158887
F1 Score: 0.8784810126582278
Average Accuracy Score: 0.9746952051782147
Average Precision Score: 0.8916217233627481
Average Recall Score: 0.8651179354016815
Average F1 Score: 0.9746952051782147


#### Decision Trees Using SMOTE

In [28]:
dt_os = DecisionTreeClassifier()
dt_os.fit(os_X, os_y)
y_pred = dt_os.predict(X_test)
display_metrics(y_test, y_pred)
display_crossval_scores(dt_os, os_X, os_y)

[[50074   621]
 [  732  5310]]
Accuracy Score: 0.9761531275886988
Precision: 0.8952959028831563
Recall: 0.8788480635551142
F1 Score: 0.8869957404159359
Average Accuracy Score: 0.9732087910289714
Average Precision Score: 0.9872020996209937
Average Recall Score: 0.9589816689549405
Average F1 Score: 0.9732087910289714
