In [1]:
import pandas as pd
import numpy as np
import joblib

import metrics_summary as ms

In [2]:
np.random.seed(170)

In [3]:
X_train = pd.read_csv("../data/X_train.csv")
X_test = pd.read_csv("../data/X_test.csv")
y_train = pd.read_csv("../data/y_train.csv").values.ravel()
y_test = pd.read_csv("../data/y_test.csv").values.ravel()

## Random Forest

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:,1]
ms.metrics_summary(y_test, y_pred, y_prob)
ms.crossval_summary(rf, X_train, y_train)
joblib.dump(rf, "rf.pkl")

[[50555   183]
 [ 5690   309]]
Kappa Score: 0.08046995805479862
Accuracy Score: 0.8964873010557485
Precision: 0.6280487804878049
Recall: 0.051508584764127355
F1 Score: 0.095208750577723
AUC Score: 0.8558698464801883
Average Accuracy Score: 0.8961295005880012
Average Precision Score: 0.6104944773085565
Average Recall Score: 0.04837135711733783
Average F1 Score: 0.8961295005880012
[0.89606466 0.89541867 0.89572081 0.89643842 0.89700495]
Average AUC Score: 0.8533453633600843
[0.85283244 0.85256834 0.85314364 0.85414836 0.85403403]


['rf.pkl']

#### Random Forest Using Cost-Sensitive Learning

In [5]:
rf_cs = RandomForestClassifier(class_weight="balanced")
rf_cs.fit(X_train, y_train)
y_pred = rf_cs.predict(X_test)
y_prob = rf_cs.predict_proba(X_test)[:,1]
ms.metrics_summary(y_test, y_pred, y_prob)
ms.crossval_summary(rf_cs, X_train, y_train)
joblib.dump(rf_cs, "rf_cs.pkl")

[[50551   187]
 [ 5703   296]]
Kappa Score: 0.07678246129144006
Accuracy Score: 0.8961876729471068
Precision: 0.6128364389233955
Recall: 0.049341556926154356
F1 Score: 0.09132983647022523
AUC Score: 0.8601272719247998
Average Accuracy Score: 0.8956989436334887
Average Precision Score: 0.5859920936010483
Average Recall Score: 0.04615605063032716
Average F1 Score: 0.8956989436334887
[0.89534708 0.89526759 0.8960985  0.8960985  0.89568305]
Average AUC Score: 0.8582239371386144
[0.85629115 0.85791747 0.85577059 0.86244753 0.85869294]


['rf_cs.pkl']

## Random Forest Using SMOTE

In [6]:
# implement SMOTE to oversample the minority class
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
rf_os = Pipeline(steps=[('over', SMOTE()), ('model', RandomForestClassifier())])
rf_os.fit(X_train, y_train)
y_pred = rf_os.predict(X_test)
y_prob = rf_os.predict_proba(X_test)[:,1]
ms.metrics_summary(y_test, y_pred, y_prob)
ms.crossval_summary(rf_os, X_train, y_train)
joblib.dump(rf_os, "rf_os.pkl")

[[50409   329]
 [ 5493   506]]
Kappa Score: 0.12548772070471836
Accuracy Score: 0.8973861853816734
Precision: 0.6059880239520958
Recall: 0.08434739123187197
F1 Score: 0.14808311384255196
AUC Score: 0.854351983427724
Average Accuracy Score: 0.895955762895888
Average Precision Score: 0.560069603683577
Average Recall Score: 0.07387819629459501
Average F1 Score: 0.895955762895888
[0.89632903 0.89541867 0.89632511 0.89572081 0.89598519]
Average AUC Score: 0.8523724617972961
[0.8547922  0.85323972 0.84853538 0.85439883 0.85089618]


['rf_os.pkl']

In [7]:
# from sklearn.model_selection import RepeatedStratifiedKFold
# from sklearn.model_selection import GridSearchCV

# n_estimators = [10, 100, 1000, 10000]
# max_features = ['sqrt', 'log2']
# # define grid search
# grid = dict(n_estimators=n_estimators,max_features=max_features)
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1)
# grid_search = GridSearchCV(estimator=rf_os, param_grid=grid, n_jobs=-1, cv=cv, scoring='roc_auc',error_score=0)
# grid_result = grid_search.fit(X_train, y_train)
# # summarize results
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# means = grid_result.cv_results_['mean_test_score']
# stds = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))