# Model Tuning

In [1]:
goog_drive=False

if goog_drive:
    from google.colab import drive
    drive.mount('/content/drive')
    goog_dir = '/content/drive/My Drive/lending_club_project/'
else:
    goog_dir = ''

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pickle


from utils import chunk_loader

from sklearn.metrics import classification_report, roc_curve, precision_recall_curve
from sklearn.model_selection import GridSearchCV

import xgboost as xgb

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE


%matplotlib inline

Using TensorFlow backend.


# 1.0 Download the Data

In [3]:
#directory to save files
save_path = os.path.join(goog_dir, 'models')

In [4]:
#get directory
df_train_path = os.path.join(goog_dir, 'data/df_train_scaled.csv.zip')
df_test_path = os.path.join(goog_dir, 'data/df_test_scaled.csv.zip')

#download in chunks
df_train = chunk_loader(df_train_path, index_col=0)
df_test = chunk_loader(df_test_path, index_col=0)

In [5]:
df_train.shape

(1292073, 42)

In [6]:
df_test.shape

(11534, 42)

In [7]:
target_col = 'loan_status'

#feature space
X_train = df_train.drop(columns=[target_col]).values
X_test = df_test.drop(columns=[target_col]).values

#target variable
y_train = df_train[target_col].values
y_test = df_test[target_col].values


# 2.0 GridSearchCV + Oversampling
In this section we will make use of an algorithm in order to oversample the minority class. This will be integrated as part of the pipeline in order to avoid leakage.

For oversampling we can make use of SMOTE or the more simple RandomOverSampler. The choice of oversampling boils  down to speed and efficiency considerations

In [8]:
#define the param grid
# Parameters of pipelines can be set using ‘__’ separated para
clf_grid = {
    'xgb__eta': [0.05, 0.1, 0.3],
    'xgb__max_depth': [3, 6, 12],
    'xgb__colsample_bytree': [0.9, 1.0],
    }

#define the pipeline
model_pipe = Pipeline([
    ('sampling', SMOTE()),
    ('xgb', xgb.XGBClassifier(n_estimators=200,
                              n_jobs=-1))
])

#define the classifer
clf = GridSearchCV(model_pipe,
                   clf_grid,
                   n_jobs=-1,
                   cv=3, 
                   verbose=50, 
                   scoring='roc_auc')


#fit to training data
clf.fit(X_train, y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 30.4min
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed: 30.5min
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed: 30.5min
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed: 52.3min
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed: 81.7min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed: 81.8min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed: 110.1min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed: 136.8min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 138.3min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 156.7min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed: 165.7min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed: 185.8min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed: 188.6min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elaps

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('sampling',
                                        SMOTE(k_neighbors=5, kind='deprecated',
                                              m_neighbors='deprecated',
                                              n_jobs=1, out_step='deprecated',
                                              random_state=None, ratio=None,
                                              sampling_strategy='auto',
                                              svm_estimator='deprecated')),
                                       ('xgb',
                                        XGBClassifier(base_score=0.5,
                                                      booster='gbtree',
                                                      colsample_bylevel=1,...
                                                      objective='binary:logistic',
                                             

In [9]:
print("Best model parameters: \n")
print(clf.best_params_ )
print("Best model score: \n")

print(clf.best_score_)

Best model parameters: 

{'xgb__colsample_bytree': 0.9, 'xgb__eta': 0.3, 'xgb__max_depth': 3}
Best model score: 

0.6755571785773941


# 3.0 Model Persistence
We wish to demonsrate how to save and load the model for later use.

## 3.1 Save

In [10]:
#define location to save trained model
save_model_dir = os.path.join(save_path, 'xgb_grid_2020-02-29.pickle')
print("Saving models at: {}".format(save_model_dir))
#save the model
with open(save_model_dir, 'wb') as handle:
    pickle.dump(clf,
                handle,
                protocol=pickle.HIGHEST_PROTOCOL)

Saving models at: models/xgb_grid_2020-02-29.pickle


## 3.2 Load

In [11]:
#demonstrate how to load
with open(save_model_dir, 'rb') as handle:
    clf_load = pickle.load(handle)