In [7]:
import pandas as pd
import numpy as np

import metrics_summary as ms

In [8]:
np.random.seed(170)

In [9]:
X_train = pd.read_csv("data/X_train.csv")
X_test = pd.read_csv("data/X_test.csv")
y_train = pd.read_csv("data/y_train.csv").values.ravel()
y_test = pd.read_csv("data/y_test.csv").values.ravel()

## Stochastic Gradient Descent

In [10]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier(loss="log_loss")
sgd.fit(X_train, y_train)
y_pred = sgd.predict(X_test)
ms.metrics_summary(y_test, y_pred)
ms.crossval_summary(sgd, X_train, y_train)

[[50575   163]
 [ 5936    63]]
Kappa Score: 0.012660811276536066
Accuracy Score: 0.8925040097291009
Precision: 0.27876106194690264
Recall: 0.010501750291715286
F1 Score: 0.020240963855421686
AUC Score: 0.5036445840031244
Average Accuracy Score: 0.8771091752511492
Average Precision Score: 0.3219030695424398
Average Recall Score: 0.1283279788706609
Average F1 Score: 0.8771091752511492
[0.8882141  0.89072945 0.85238397]
Average AUC Score: 0.6986051245842556
[0.68337767 0.68546549 0.72697221]


## Stochastic Gradient Descent Using Cost-Sensitive Learning

In [11]:
sgd_cs = SGDClassifier(loss="log_loss", class_weight="balanced")
sgd_cs.fit(X_train, y_train)
y_pred = sgd_cs.predict(X_test)
ms.metrics_summary(y_test, y_pred)
ms.crossval_summary(sgd_cs, X_train, y_train)

[[32055 18683]
 [ 1597  4402]]
Kappa Score: 0.16206261109124986
Accuracy Score: 0.6425612915734
Precision: 0.19068659302577431
Recall: 0.7337889648274712
F1 Score: 0.30270939348095177
AUC Score: 0.682781982906463
Average Accuracy Score: 0.6175390021486055
Average Precision Score: 0.20948580162019273
Average Recall Score: 0.433579854540543
Average F1 Score: 0.6175390021486055
[0.88646922 0.83027034 0.13587745]
Average AUC Score: 0.6373792488488573
[0.67037732 0.68021944 0.56154099]


#### Stochastic Gradient Descent Using SMOTE

In [12]:
# implement SMOTE to oversample the minority class
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
pipeline = Pipeline(steps=[('over', SMOTE()), ('model', SGDClassifier(loss="log_loss"))])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
ms.metrics_summary(y_test, y_pred)
ms.crossval_summary(pipeline, X_train, y_train)

[[48612  2126]
 [ 5008   991]]
Kappa Score: 0.15642424270136834
Accuracy Score: 0.8742619454676842
Precision: 0.3179339108116779
Recall: 0.1651941990331722
F1 Score: 0.21741992101799035
AUC Score: 0.5616463328328382
Average Accuracy Score: 0.5089333018505479
Average Precision Score: 0.18580770082566342
Average Recall Score: 0.7535974133190364
Average F1 Score: 0.5089333018505479
[0.82351741 0.5502504  0.15303209]
Average AUC Score: 0.7168307936281119
[0.74739133 0.76648203 0.63661902]
