In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
# Feature matrices are kept as DataFrames.
X_train = pd.read_csv("data/X_train.csv")
X_test = pd.read_csv("data/X_test.csv")

# Label files are flattened into 1-D arrays after loading.
y_train = pd.read_csv("data/y_train.csv").to_numpy().ravel()
y_test = pd.read_csv("data/y_test.csv").to_numpy().ravel()

## SMOTE

In [9]:
# Oversample the minority class with SMOTE.
# Only the training split is resampled; X_test / y_test are left untouched.
from imblearn.over_sampling import SMOTE

# Named `smote` rather than `os` so the standard-library `os` module name
# is not shadowed in the notebook namespace.
smote = SMOTE(random_state=0)
os_X, os_y = smote.fit_resample(X_train, y_train)

# Sanity check: shapes of the resampled features and labels.
print(os_X.shape, os_y.shape)

(236866, 768) (236866,)


In [10]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

def print_metrics(y_test, y_pred):
    """Print evaluation metrics for a set of predictions.

    Prints, in order: accuracy, the confusion matrix, precision, recall
    and F1. Every scikit-learn metric is called with its default arguments.

    Args:
        y_test: Ground-truth labels.
        y_pred: Labels predicted by the model.

    Returns:
        None. The metrics are written to standard output.
    """
    # Each metric is computed right before it is printed, one at a time.
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy Score: {accuracy}')

    matrix = confusion_matrix(y_test, y_pred)
    print(matrix)

    precision = precision_score(y_test, y_pred)
    print(f'Precision: {precision}')

    recall = recall_score(y_test, y_pred)
    print(f'Recall: {recall}')

    f1 = f1_score(y_test, y_pred)
    print(f'F1 Score: {f1}')

## Random Forest

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Baseline: random forest fitted on the original (non-resampled) training data.
# The forest is seeded so the metrics are reproducible on re-run; an unseeded
# forest gives slightly different numbers every time the cell is executed.
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = rf.predict(X_test)
print_metrics(y_test, y_pred)

Accuracy Score: 0.985494474505173
[[49950   745]
 [   78  5964]]
Precision: 0.8889551348934267
Recall: 0.9870903674280039
F1 Score: 0.9354560426633205


#### Random Forest Using Cost-Sensitive Learning

In [12]:
# Cost-sensitive variant: class_weight="balanced" reweights the classes
# instead of resampling the training data.
# The forest is seeded so the metrics are reproducible on re-run; an unseeded
# forest gives slightly different numbers every time the cell is executed.
cs_rf = RandomForestClassifier(class_weight="balanced", random_state=0)
cs_rf.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = cs_rf.predict(X_test)
print_metrics(y_test, y_pred)

Accuracy Score: 0.9847365916421383
[[49995   700]
 [  166  5876]]
Precision: 0.8935523114355232
Recall: 0.9725256537570341
F1 Score: 0.931367887145348


#### Random Forest Using SMOTE

In [13]:
# Random forest fitted on the SMOTE-resampled training data (os_X, os_y).
# The forest is seeded so the metrics are reproducible on re-run; an unseeded
# forest gives slightly different numbers every time the cell is executed.
rf_os = RandomForestClassifier(random_state=0)
rf_os.fit(os_X, os_y)

# Evaluate on the original, non-resampled test split.
y_pred = rf_os.predict(X_test)
print_metrics(y_test, y_pred)

Accuracy Score: 0.985882228528121
[[49927   768]
 [   33  6009]]
Precision: 0.8866755201416556
Recall: 0.9945382323733863
F1 Score: 0.9375146267259536
