In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
X_train = pd.read_csv("data/X_train.csv")
X_test = pd.read_csv("data/X_test.csv")
y_train = pd.read_csv("data/y_train.csv").values.ravel()
y_test = pd.read_csv("data/y_test.csv").values.ravel()

## SMOTE

In [9]:
# implement SMOTE to oversample the minority class
from imblearn.over_sampling import SMOTE
os = SMOTE(random_state=0)
os_X, os_y = os.fit_resample(X_train, y_train)
print(os_X.shape, os_y.shape)

(236866, 52) (236866,)


In [10]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_validate

# Custom function to print the metrics of the model
def display_metrics(y_test, y_pred):
    print(confusion_matrix(y_test, y_pred))
    print(f'Accuracy Score: {accuracy_score(y_test, y_pred)}')
    print(f'Precision: {precision_score(y_test, y_pred)}')
    print(f'Recall: {recall_score(y_test, y_pred)}')
    print(f'F1 Score: {f1_score(y_test, y_pred)}')

def display_crossval_scores(model, X_train, y_train):
    scoring = ['accuracy', 'precision', 'recall', 'f1_micro']

    results = cross_validate(model, X_train, y_train, scoring=scoring, cv=5)
    print(f"Average Accuracy Score: {np.mean(results['test_accuracy'])}")
    print(f"Average Precision Score: {np.mean(results['test_precision'])}")
    print(f"Average Recall Score: {np.mean(results['test_recall'])}")
    print(f"Average F1 Score: {np.mean(results['test_f1_micro'])}")

## KNN

In [11]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
display_metrics(y_test, y_pred)
display_crossval_scores(knn, X_train, y_train)

[[50107   588]
 [ 3617  2425]]
Accuracy Score: 0.9258861060683504
Precision: 0.8048456687686691
Recall: 0.40135716650115855
F1 Score: 0.5356156819436775
Average Accuracy Score: 0.9257776507012745
Average Precision Score: 0.7927085663411186
Average Recall Score: 0.40055919629017483
Average F1 Score: 0.9257776507012745


#### KNN Using SMOTE

In [12]:
knn_os = KNeighborsClassifier()
knn_os.fit(os_X, os_y)
y_pred = knn_os.predict(X_test)
display_metrics(y_test, y_pred)
display_crossval_scores(knn_os, os_X, os_y)

[[42935  7760]
 [ 1912  4130]]
Accuracy Score: 0.8295292313657754
Precision: 0.3473507148864592
Recall: 0.6835484938762
F1 Score: 0.46062904305152796
Average Accuracy Score: 0.9118657848167295
Average Precision Score: 0.8577066805055
Average Recall Score: 0.9875794989797366
Average F1 Score: 0.9118657848167295
