In [7]:
import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier

import metrics_summary as ms

In [8]:
# Seed NumPy's legacy global RNG once, up front, so that anything drawing from
# np.random without its own seed is repeatable under Restart & Run All.
np.random.seed(seed=170)

In [9]:
# Load the pre-split data set. Feature files are kept as DataFrames; label
# files are flattened to 1-D arrays via .values.ravel().
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/X_train.csv", "../data/X_test.csv")
)
y_train, y_test = (
    pd.read_csv(csv_path).values.ravel()
    for csv_path in ("../data/y_train.csv", "../data/y_test.csv")
)

## XGBoost

In [10]:
# Baseline model: XGBoost with library-default hyper-parameters, fitted on the
# training split as loaded (no resampling).
# The seed is passed explicitly, using the same value given to np.random.seed,
# so the model's seed is visible in this cell instead of being left to the
# library default.
xgb = XGBClassifier(random_state=170)
xgb.fit(X_train, y_train)

# Hard class predictions plus the probability of class 1 (second column of
# predict_proba); both are handed to the hold-out metrics report below.
y_pred = xgb.predict(X_test)
y_prob = xgb.predict_proba(X_test)[:, 1]

# Report on the held-out test split, then on the training split via the
# project's cross-validation helper.
ms.metrics_summary(y_test, y_pred, y_prob)
# NOTE(review): in the recorded output "Average F1 Score" is identical to
# "Average Accuracy Score" and far above the hold-out F1 -- crossval_summary
# presumably reports accuracy under the F1 label; verify in metrics_summary.
ms.crossval_summary(xgb, X_train, y_train)

# Persist the fitted model next to the notebook for later reuse.
joblib.dump(xgb, "xgb.pkl")

[[49556  1182]
 [ 4464  1535]]
Kappa Score: 0.3065112549106489
Accuracy Score: 0.9004882175652572
Precision: 0.5649613544350387
Recall: 0.2558759793298883
F1 Score: 0.35222579164754475
AUC Score: 0.878215014300247
Average Accuracy Score: 0.89815388086446
Average Precision Score: 0.5408838034473414
Average Recall Score: 0.2422112744347471
Average F1 Score: 0.89815388086446
[0.89848176 0.89787363 0.89881784 0.89734487 0.89825131]
Average AUC Score: 0.8743988236333692
[0.87548496 0.87513395 0.8734569  0.87091087 0.87700744]


['xgb.pkl']

In [11]:
# ---------------------------------------------------------------------------
# DISABLED CELL: hyper-parameter grid search for the XGBoost baseline.
# Nothing below executes; it is kept for reference only.
#
# When uncommented it would:
#   * build a grid over max_depth and n_estimators (the "# #" lines --
#     learning_rate, colsample_bytree, subsample -- are candidate grids that
#     are NOT passed to `grid`),
#   * run GridSearchCV with RepeatedStratifiedKFold(n_splits=5, n_repeats=1),
#     scoring='roc_auc', n_jobs=-1, error_score=0, on X_train / y_train,
#   * print the best score/params and the mean (std) score of every combination.
#
# NOTE(review): `estimator=xgb` refers to the classifier created in the XGBoost
# cell above, so that cell must run first if this one is re-enabled.
# ---------------------------------------------------------------------------
# from sklearn.model_selection import RepeatedStratifiedKFold
# from sklearn.model_selection import GridSearchCV

# max_depth = [3, 5, 7, 10]
# n_estimators = [100, 500, 1000]
# # learning_rate = [0.01, 0.1, 0.3]
# # colsample_bytree = [0.5, 1]
# # subsample = [0.6, 1]
# # define grid search
# grid = dict(max_depth=max_depth, n_estimators=n_estimators)
# cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1)
# grid_search = GridSearchCV(estimator=xgb, param_grid=grid, n_jobs=-1, cv=cv, scoring='roc_auc',error_score=0)
# grid_result = grid_search.fit(X_train, y_train)
# # summarize results
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# means = grid_result.cv_results_['mean_test_score']
# stds = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

### XGBoost Using SMOTE

In [12]:
# Oversample the minority class with SMOTE, then fit XGBoost on the result.
# The sampler lives inside an imblearn Pipeline so that resampling is a step of
# the estimator itself rather than a one-off transformation of X_train.
#
# Both steps are seeded explicitly: with random_state=None SMOTE draws from
# NumPy's global RNG, so its synthetic samples would depend on which cells were
# executed before this one and the cell would not be repeatable on its own.
xgb_os = Pipeline(
    steps=[
        ("over", SMOTE(random_state=170)),
        ("model", XGBClassifier(random_state=170)),
    ]
)
xgb_os.fit(X_train, y_train)

# Hard class predictions plus the probability of class 1 (second column of
# predict_proba). NOTE: these reuse the names y_pred / y_prob and therefore
# replace the predictions of the baseline XGBoost cell.
y_pred = xgb_os.predict(X_test)
y_prob = xgb_os.predict_proba(X_test)[:, 1]

# Report on the held-out test split, then on the training split via the
# project's cross-validation helper (which receives the whole pipeline).
ms.metrics_summary(y_test, y_pred, y_prob)
# NOTE(review): in the recorded output "Average F1 Score" is identical to
# "Average Accuracy Score" -- crossval_summary presumably reports accuracy
# under the F1 label; verify in metrics_summary.
ms.crossval_summary(xgb_os, X_train, y_train)

# Persist the fitted pipeline (sampler + model) next to the notebook.
joblib.dump(xgb_os, "xgb_os.pkl")

[[49511  1227]
 [ 4410  1589]]
Kappa Score: 0.31419261890579686
Accuracy Score: 0.9006468442110087
Precision: 0.5642755681818182
Recall: 0.26487747957992996
F1 Score: 0.3605218377765173
AUC Score: 0.8789450113392505
Average Accuracy Score: 0.8980556832839307
Average Precision Score: 0.5390597878922299
Average Recall Score: 0.24685579033328228
Average F1 Score: 0.8980556832839307
[0.89836846 0.89776032 0.89711825 0.89927107 0.89776032]
Average AUC Score: 0.8748783349366598
[0.87800242 0.87524566 0.87368466 0.87294507 0.87451386]


['xgb_os.pkl']