In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the train and test feature tables as DataFrames.
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/X_train.csv", "data/X_test.csv")
)

# Read the matching label files and flatten each one to a 1-D array.
y_train, y_test = (
    pd.read_csv(csv_path).values.ravel()
    for csv_path in ("data/y_train.csv", "data/y_test.csv")
)

## SMOTE

In [3]:
# Oversample the minority class with SMOTE.
# Only the training split is passed to the sampler; X_test / y_test are not resampled here.
from imblearn.over_sampling import SMOTE

# Bound to `smote` rather than `os` so the standard-library `os` module name
# is not shadowed for the rest of the notebook.
smote = SMOTE(random_state=0)  # fixed seed so the synthetic samples are reproducible
os_X, os_y = smote.fit_resample(X_train, y_train)

# Sanity check: shapes of the resampled features and labels.
print(os_X.shape, os_y.shape)

(236866, 768) (236866,)


In [4]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Custom function to print the metrics of the model
def print_metrics(y_test, y_pred):
    """Print a short classification report for a set of predictions.

    Prints, in order: accuracy, the confusion matrix, precision, recall and
    F1 score, each computed by the corresponding scikit-learn metric function
    called with its default arguments.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels to compare against ``y_test``.

    Returns
    -------
    None
        The metrics are written to stdout only.
    """
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy Score: {accuracy}')

    matrix = confusion_matrix(y_test, y_pred)
    print(matrix)

    # The remaining metrics share one "<label>: <value>" line format.
    labelled_scorers = (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1 Score', f1_score),
    )
    for label, scorer in labelled_scorers:
        print(f'{label}: {scorer(y_test, y_pred)}')

## Random Forest

In [5]:
from sklearn.ensemble import RandomForestClassifier

# Baseline: Random Forest trained on the original (non-resampled) training data.
# random_state is fixed because forest training is randomised; without a seed
# the metrics below change on every re-run. 0 matches the seed used for SMOTE.
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)

# Evaluate on the test split.
y_pred = rf.predict(X_test)
print_metrics(y_test, y_pred)

Accuracy Score: 0.9855297248708955
[[49958   737]
 [   84  5958]]
Precision: 0.8899178491411501
Recall: 0.9860973187686196
F1 Score: 0.9355421213786606


### Random Forest Using SMOTE

In [6]:
# Random Forest trained on the SMOTE-resampled training data (os_X, os_y).
# random_state is fixed because forest training is randomised; without a seed
# a difference from another run could be noise rather than an effect of the
# resampling. 0 matches the seed used for SMOTE.
rf_os = RandomForestClassifier(random_state=0)
rf_os.fit(os_X, os_y)

# Evaluate on the same test split (X_test / y_test are not resampled).
y_pred = rf_os.predict(X_test)
print_metrics(y_test, y_pred)

Accuracy Score: 0.9859527292595661
[[49929   766]
 [   31  6011]]
Precision: 0.88697063597462
Recall: 0.994869248593181
F1 Score: 0.9378266635462984
