In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [38]:
X_train = pd.read_csv("data/X_train.csv")
X_test = pd.read_csv("data/X_test.csv")
y_train = pd.read_csv("data/y_train.csv").values.ravel()
y_test = pd.read_csv("data/y_test.csv").values.ravel()

## SMOTE

In [39]:
# implement SMOTE to oversample the minority class
from imblearn.over_sampling import SMOTE
os = SMOTE(random_state=0)
os_X, os_y = os.fit_resample(X_train, y_train)
print(os_X.shape, os_y.shape)

(236866, 52) (236866,)


In [40]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_validate

# Custom function to print the metrics of the model
def display_metrics(y_test, y_pred):
    print(confusion_matrix(y_test, y_pred))
    print(f'Accuracy Score: {accuracy_score(y_test, y_pred)}')
    print(f'Precision: {precision_score(y_test, y_pred)}')
    print(f'Recall: {recall_score(y_test, y_pred)}')
    print(f'F1 Score: {f1_score(y_test, y_pred)}')

def display_crossval_scores(model, X_train, y_train):
    scoring = ['accuracy', 'precision', 'recall', 'f1_micro']

    results = cross_validate(model, X_train, y_train, scoring=scoring, cv=5)
    print(f"Average Accuracy Score: {np.mean(results['test_accuracy'])}")
    print(f"Average Precision Score: {np.mean(results['test_precision'])}")
    print(f"Average Recall Score: {np.mean(results['test_recall'])}")
    print(f"Average F1 Score: {np.mean(results['test_f1_micro'])}")

## Stochastic Gradient Descent

In [41]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier(random_state=0)
sgd.fit(X_train, y_train)
y_pred = sgd.predict(X_test)
display_metrics(y_test, y_pred)
display_crossval_scores(sgd, X_train, y_train)

[[50400   295]
 [ 3183  2859]]
Accuracy Score: 0.9386996140084953
Precision: 0.9064679771718452
Recall: 0.47318768619662366
F1 Score: 0.6217920835145716
Average Accuracy Score: 0.9526913611767526
Average Precision Score: 0.8778177944954283
Average Recall Score: 0.6445123020872261
Average F1 Score: 0.9526913611767526


#### Stochastic Gradient Descent Using SMOTE

In [42]:
sgd_os = SGDClassifier(random_state=0)
sgd_os.fit(os_X, os_y)
y_pred = sgd_os.predict(X_test)
display_metrics(y_test, y_pred)
display_crossval_scores(sgd_os, os_X, os_y)


[[47871  2824]
 [   24  6018]]
Accuracy Score: 0.9498034792110969
Precision: 0.6806152454195883
Recall: 0.9960278053624627
F1 Score: 0.8086535877452298
Average Accuracy Score: 0.9308088147717968
Average Precision Score: 0.9176799370518255
Average Recall Score: 0.9710893042559775
Average F1 Score: 0.9308088147717968
