In [2]:
import pandas as pd
import numpy as np
import joblib

import metrics_summary as ms

In [3]:
np.random.seed(170)

In [4]:
X_train = pd.read_csv("../data/X_train.csv")
X_test = pd.read_csv("../data/X_test.csv")
y_train = pd.read_csv("../data/y_train.csv").values.ravel()
y_test = pd.read_csv("../data/y_test.csv").values.ravel()

## Stochastic Gradient Descent

In [5]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier(loss="log_loss")
sgd.fit(X_train, y_train)
y_pred = sgd.predict(X_test)
y_prob = sgd.predict_proba(X_test)[:,1]
ms.metrics_summary(y_test, y_pred, y_prob)
ms.crossval_summary(sgd, X_train, y_train)
joblib.dump(sgd, "sgd.pkl")

[[15421 35317]
 [  444  5555]]
Kappa Score: 0.0645319669031651
Accuracy Score: 0.3697058356980454
Precision: 0.13591211587394794
Recall: 0.9259876646107684
F1 Score: 0.2370335601971368
AUC Score: 0.6151003339401877
Average Accuracy Score: 0.717843886087794
Average Precision Score: 0.2718740291352759
Average Recall Score: 0.35091407645587713
Average F1 Score: 0.717843886087794
[0.23714027 0.88873362 0.88922461 0.68055293 0.893568  ]
Average AUC Score: 0.6872090965358665
[0.65727072 0.67955938 0.68491631 0.75161797 0.6626811 ]


['sgd.pkl']

## Stochastic Gradient Descent Using Cost-Sensitive Learning

In [6]:
sgd_cs = SGDClassifier(class_weight="balanced", loss="log_loss")
sgd_cs.fit(X_train, y_train)
y_pred = sgd_cs.predict(X_test)
y_prob = sgd_cs.predict_proba(X_test)[:,1]
ms.metrics_summary(y_test, y_pred, y_prob)
ms.crossval_summary(sgd_cs, X_train, y_train)
joblib.dump(sgd_cs, "sgd_cs.pkl")

[[49591  1147]
 [ 5590   409]]
Kappa Score: 0.06766680257520519
Accuracy Score: 0.8812591430636093
Precision: 0.262853470437018
Recall: 0.06817802967161193
F1 Score: 0.10827266710787559
AUC Score: 0.5227766668720477
Average Accuracy Score: 0.7664636172773412
Average Precision Score: 0.24731552162090042
Average Recall Score: 0.42260618588271315
Average F1 Score: 0.7664636172773412
[0.71402674 0.82611323 0.87812063 0.86376855 0.55028893]
Average AUC Score: 0.7139384068020469
[0.73265564 0.71950075 0.69695045 0.69584877 0.72473642]


['sgd_cs.pkl']

#### Stochastic Gradient Descent Using SMOTE

In [7]:
# implement SMOTE to oversample the minority class
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
sgd_os = Pipeline(steps=[('over', SMOTE()), ('model', SGDClassifier(loss="log_loss"))])
sgd_os.fit(X_train, y_train)
y_pred = sgd_os.predict(X_test)
y_prob = sgd_os.predict_proba(X_test)[:,1]
ms.metrics_summary(y_test, y_pred, y_prob)
ms.crossval_summary(sgd_os, X_train, y_train)
joblib.dump(sgd_os, "sgd_os.pkl")

[[20158 30580]
 [  474  5525]]
Kappa Score: 0.09907416450169249
Accuracy Score: 0.45266757142605357
Precision: 0.1530258966902091
Recall: 0.9209868311385231
F1 Score: 0.26244537336120083
AUC Score: 0.659175378875706
Average Accuracy Score: 0.8180777991143859
Average Precision Score: 0.28232377091797367
Average Recall Score: 0.3077122696881539
Average F1 Score: 0.8180777991143859
[0.77056424 0.67073309 0.88314386 0.89149073 0.87445708]
Average AUC Score: 0.7283249199036355
[0.76362187 0.76047119 0.7114792  0.68717826 0.71887408]


['sgd_os.pkl']