In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import metrics_summary as ms

In [16]:
np.random.seed(170)

In [17]:
X_train = pd.read_csv("data/X_train.csv")
X_test = pd.read_csv("data/X_test.csv")
y_train = pd.read_csv("data/y_train.csv").values.ravel()
y_test = pd.read_csv("data/y_test.csv").values.ravel()

## SMOTE

In [18]:
# implement SMOTE to oversample the minority class
from imblearn.over_sampling import SMOTE
os = SMOTE(random_state=0)
X_os, y_os = os.fit_resample(X_train, y_train)
print(X_os.shape, y_os.shape)

(236866, 53) (236866,)


## Stochastic Gradient Descent

In [19]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier(loss="hinge")
sgd.fit(X_train, y_train)
y_pred = sgd.predict(X_test)
ms.metrics_summary(y_test, y_pred)
ms.crossval_summary(sgd, X_train, y_train)

[[50325   370]
 [ 4727  1315]]
Kappa Score: 0.30823641035479366
Accuracy Score: 0.9101644429560957
Precision: 0.7804154302670623
Recall: 0.21764316451506124
F1 Score: 0.340364954057202
AUC Score: 0.605172307181093
Average Accuracy Score: 0.9057528549213941
Average Precision Score: 0.7049982325537856
Average Recall Score: 0.2919041106269641
Average F1 Score: 0.9057528549213941
Average AUC Score: 0.7761861761199264


0.7761861761199264

In [22]:
from sklearn.metrics import roc_curve, auc
probs = sgd.predict_proba(X_test)
preds = probs[:, 1]
fpr, tpr, threshold = roc_curve(y_test, preds)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(15, 9))
plt.title('ROC Curve')
plt.plot(fpr, tpr, 'b', label=f'{roc_auc:0.4f}')
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

AttributeError: probability estimates are not available for loss='hinge'

In [20]:
sgd_cs = SGDClassifier(class_weight="balanced")
sgd_cs.fit(X_train, y_train)
y_pred = sgd_cs.predict(X_test)
ms.metrics_summary(y_test, y_pred)
ms.crossval_summary(sgd_cs, X_train, y_train)

[[20333 30362]
 [  455  5587]]
Kappa Score: 0.10244757168461072
Accuracy Score: 0.45684473976417506
Precision: 0.1554146151492392
Recall: 0.9246938099966898
F1 Score: 0.2661046414707914
AUC Score: 0.6628893648070046
Average Accuracy Score: 0.7619574957111324
Average Precision Score: 0.46624204730697
Average Recall Score: 0.5210748835542817
Average F1 Score: 0.7619574957111324
Average AUC Score: 0.792124186663445


0.792124186663445

#### Stochastic Gradient Descent Using SMOTE

In [21]:
sgd_os = SGDClassifier()
sgd_os.fit(X_os, y_os)
y_pred = sgd_os.predict(X_test)
ms.metrics_summary(y_test, y_pred)
ms.crossval_summary(sgd_os, X_os, y_os)

[[ 5721 44974]
 [   38  6004]]
Kappa Score: 0.02492425116958541
Accuracy Score: 0.20665526904841638
Precision: 0.11777629565695005
Recall: 0.9937106918238994
F1 Score: 0.21059277446509994
AUC Score: 0.5532810289181633
Average Accuracy Score: 0.6679555377508726
Average Precision Score: 0.7055271018698127
Average Recall Score: 0.7965629964695393
Average F1 Score: 0.6679555377508726
Average AUC Score: 0.8355009256337336


0.8355009256337336