In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [74]:
X_train = pd.read_csv("data/X_train.csv")
X_test = pd.read_csv("data/X_test.csv")
y_train = pd.read_csv("data/y_train.csv").values.ravel()
y_test = pd.read_csv("data/y_test.csv").values.ravel()

## SMOTE

In [75]:
# implement SMOTE to oversample the minority class
from imblearn.over_sampling import SMOTE
os = SMOTE(random_state=0)
os_X, os_y = os.fit_resample(X_train, y_train)
print(os_X.shape, os_y.shape)

(236866, 52) (236866,)


In [76]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_validate

# Custom function to print the metrics of the model
def display_metrics(y_test, y_pred):
    print(confusion_matrix(y_test, y_pred))
    print(f'Accuracy Score: {accuracy_score(y_test, y_pred)}')
    print(f'Precision: {precision_score(y_test, y_pred)}')
    print(f'Recall: {recall_score(y_test, y_pred)}')
    print(f'F1 Score: {f1_score(y_test, y_pred)}')

def display_crossval_scores(model, X_train, y_train):
    scoring = ['accuracy', 'precision', 'recall', 'f1_micro']

    results = cross_validate(model, X_train, y_train, scoring=scoring, cv=5)
    print(f"Average Accuracy Score: {np.mean(results['test_accuracy'])}")
    print(f"Average Precision Score: {np.mean(results['test_precision'])}")
    print(f"Average Recall Score: {np.mean(results['test_recall'])}")
    print(f"Average F1 Score: {np.mean(results['test_f1_micro'])}")

## Random Forest

In [77]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
display_metrics(y_test, y_pred)
display_crossval_scores(rf, X_train, y_train)

[[49937   758]
 [   25  6017]]
Accuracy Score: 0.9861994818196239
Precision: 0.8881180811808118
Recall: 0.9958622972525654
F1 Score: 0.9389092611375517
Average Accuracy Score: 0.9857009010464107
Average Precision Score: 0.883425799958198
Average Recall Score: 0.9957715339500108
Average F1 Score: 0.9857009010464107


In [78]:
# from sklearn.model_selection import RepeatedStratifiedKFold
# from sklearn.model_selection import GridSearchCV
# from sklearn.ensemble import RandomForestClassifier
# model = RandomForestClassifier()
# n_estimators = [10, 100, 1000]
# max_features = ['sqrt', 'log2']
# # define grid search
# grid = dict(n_estimators=n_estimators,max_features=max_features)
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1_micro',error_score=0)
# grid_result = grid_search.fit(X_train, y_train)
# # summarize results
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# means = grid_result.cv_results_['mean_test_score']
# stds = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

#### Random Forest Using Cost-Sensitive Learning

In [79]:
rf_cs = RandomForestClassifier(random_state=0, class_weight="balanced")
rf_cs.fit(X_train, y_train)
y_pred = rf_cs.predict(X_test)
display_metrics(y_test, y_pred)
display_crossval_scores(rf_cs, X_train, y_train)

[[49969   726]
 [   68  5974]]
Accuracy Score: 0.9860056048081499
Precision: 0.8916417910447761
Recall: 0.9887454485269779
F1 Score: 0.937686391461309
Average Accuracy Score: 0.9854289703382975
Average Precision Score: 0.8867210570559202
Average Recall Score: 0.988031216570415
Average F1 Score: 0.9854289703382975


#### Random Forest Using SMOTE

In [80]:
rf_os = RandomForestClassifier(random_state=0)
rf_os.fit(os_X, os_y)
y_pred = rf_os.predict(X_test)
display_metrics(y_test, y_pred)
display_crossval_scores(rf_os, os_X, os_y)

[[49927   768]
 [   16  6026]]
Accuracy Score: 0.9861818566367626
Precision: 0.8869590815425376
Recall: 0.9973518702416418
F1 Score: 0.9389217824867561
Average Accuracy Score: 0.9864270184104594
Average Precision Score: 0.98510447054596
Average Recall Score: 0.9878414320091189
Average F1 Score: 0.9864270184104594
