XG Boost
=======

Libraries

In [1]:
# remove warnings
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt; plt.style.use("ggplot")

from scipy.stats import randint as sp_randint
import xgboost as xgb

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix,\
                            precision_recall_fscore_support, auc, accuracy_score

from bayes_opt import BayesianOptimization

#### Read Data:

In [2]:
data = pd.read_csv("/media/juanan/DATA/loan_data_analysis/data/loans_processed.csv", sep = "^")\
                   .sample(100000)

In [3]:
data.head()

Unnamed: 0,addr_state,annual_inc,application_type,avg_cur_bal,bc_open_to_buy,bc_util,delinq_2yrs,delinq_amnt,disbursement_method,dti,...,revol_util,sub_grade,tax_liens,term,tot_coll_amt,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,verification_status
332160,AZ,105000.0,Individual,9520.0,59772.0,28.6,1.0,0.0,Cash,31.61,...,23.9,A5,0.0,36 months,0.0,418208.0,83949.0,83700.0,114511.0,Not Verified
274531,CO,148000.0,Individual,9464.0,37783.0,67.6,0.0,0.0,Cash,32.7,...,57.7,B3,0.0,36 months,0.0,588595.0,146120.0,116700.0,97852.0,Not Verified
493881,NJ,94000.0,Individual,9086.0,8304.0,30.5,3.0,0.0,Cash,17.44,...,27.3,C2,0.0,36 months,0.0,309321.0,76321.0,11700.0,76998.0,Verified
794516,VA,71500.0,Individual,7107.0,83019.0,0.6,0.0,0.0,Cash,9.01,...,4.4,A1,0.0,36 months,0.0,214450.0,12101.0,83500.0,17300.0,Source Verified
505865,NC,72000.0,Individual,3270.0,2350.0,85.8,0.0,0.0,Cash,22.32,...,68.0,C1,0.0,36 months,0.0,99592.0,39610.0,16600.0,40742.0,Source Verified


In [4]:
data.shape

(100000, 48)

Data balance:

In [5]:
data['loan_status'].value_counts()

0.0    79420
1.0    20580
Name: loan_status, dtype: int64

#### Pre-processing:

In [6]:
categorical_variables = data.select_dtypes(include="object").columns

In [7]:
def categorical_to_numeric(variable):
    
    variable_dict = dict(data.groupby(variable)['loan_status'].mean())
    
    result = data[variable].map(lambda i: variable_dict[i])
    
    return result

In [8]:
for variable in categorical_variables:
    data[variable] = categorical_to_numeric(variable)

In [9]:
data.head()

Unnamed: 0,addr_state,annual_inc,application_type,avg_cur_bal,bc_open_to_buy,bc_util,delinq_2yrs,delinq_amnt,disbursement_method,dti,...,revol_util,sub_grade,tax_liens,term,tot_coll_amt,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,verification_status
332160,0.216827,105000.0,0.205907,9520.0,59772.0,28.6,1.0,0.0,0.205428,31.61,...,23.9,0.091237,0.0,0.166162,0.0,418208.0,83949.0,83700.0,114511.0,0.152386
274531,0.157918,148000.0,0.205907,9464.0,37783.0,67.6,0.0,0.0,0.205428,32.7,...,57.7,0.133344,0.0,0.166162,0.0,588595.0,146120.0,116700.0,97852.0,0.152386
493881,0.210308,94000.0,0.205907,9086.0,8304.0,30.5,3.0,0.0,0.205428,17.44,...,27.3,0.201685,0.0,0.166162,0.0,309321.0,76321.0,11700.0,76998.0,0.2413
794516,0.208347,71500.0,0.205907,7107.0,83019.0,0.6,0.0,0.0,0.205428,9.01,...,4.4,0.037555,0.0,0.166162,0.0,214450.0,12101.0,83500.0,17300.0,0.219056
505865,0.206298,72000.0,0.205907,3270.0,2350.0,85.8,0.0,0.0,0.205428,22.32,...,68.0,0.185751,0.0,0.166162,0.0,99592.0,39610.0,16600.0,40742.0,0.219056


In [10]:
data.shape

(100000, 48)

Target:

In [11]:
X = data.loc[:, data.columns!='loan_status']

In [12]:
y = data['loan_status']

In [14]:
X_train = X_train.as_matrix()
X_test = X_test.as_matrix()

Prior:

In [15]:
y.value_counts()[0] / len(y)

0.7942

#### XG Boost:

In [16]:
xg_boost = xgb.XGBClassifier(n_estimators=200)

Hyperparameters to tune:

In [17]:
xg_params = {'max_depth': [6, 10, 15, 20],
             'learning_rate': [0.001, 0.01, 0.1, 0.2, 0,3],
             'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
             'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
             'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
             'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
             'gamma': [0, 0.25, 0.5, 1.0],
             'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0]}

Randomize search with Cross Validation:

In [18]:
random_search = RandomizedSearchCV(xg_boost, n_iter=50, param_distributions=xg_params,
                                   cv=5, scoring="roc_auc", n_jobs=2, verbose=1)

In [20]:
random_search.fit(X, y)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 74.7min
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed: 199.1min
[Parallel(n_jobs=2)]: Done 250 out of 250 | elapsed: 233.5min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=200,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
          fit_params=None, iid=True, n_iter=50, n_jobs=2,
          param_distributions={'max_depth': [6, 10, 15, 20], 'learning_rate': [0.001, 0.01, 0.1, 0.2, 0, 3], 'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0], 'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], 'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], 'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0], 'gamma': [0, 0.25, 0.5, 1.0], 'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_t

Best estimator and result:

In [21]:
random_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.8,
       colsample_bytree=0.5, gamma=0.25, learning_rate=0.1,
       max_delta_step=0, max_depth=6, min_child_weight=10.0, missing=None,
       n_estimators=200, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=50.0, scale_pos_weight=1, seed=None, silent=True,
       subsample=1.0)

In [22]:
random_search.best_score_

0.7196220705068009

__Bayesian Optimization__:

Parameters for optimization:

In [23]:
xg_params_bay = {'colsample_bytree': (0.4, 1.0),
                 'max_depth': (2, 15),
                 'subsample': (0.4, 1.0),
                 'n_estimators': (10, 1500),
                 'learning_rate': (0.001, 0.3)}

XG Boost evaluate function for __Bayesian Optimization__:

In [24]:
def xgb_evaluate(colsample_bytree, max_depth, subsample, n_estimators, learning_rate):
    
    # params:
    params = {'colsample_bytree': colsample_bytree,
              'max_depth': int(max_depth),
              'subsample': subsample,
              'n_estimators': int(n_estimators),
              'learning_rate': learning_rate}
    
    # gradient boosting model
    gbm = xgb.XGBClassifier(colsample_bytree=params['colsample_bytree'], 
                            max_depth=params['max_depth'], 
                            subsample=params['subsample'], 
                            n_estimators=params['n_estimators'],
                            learning_rate=params['learning_rate'])
    
    gbm.fit(X_train, y_train)
    
    # scores in the test set
    y_scores_test = gbm.predict_proba(X_test)
    y_scores_test = pd.DataFrame(y_scores_test).loc[:,1]
    
    
    # metric to optimize
    metric = roc_auc_score(y_test, y_scores_test)
    
    return metric

Bayesian Optimization:

In [25]:
bay_optimization = BayesianOptimization(xgb_evaluate, xg_params_bay)

In [26]:
bay_optimization.maximize()

[31mInitialization[0m
[94m-----------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   learning_rate |   max_depth |   n_estimators |   subsample | 
    1 | 02m58s | [35m   0.65278[0m | [32m            0.8553[0m | [32m         0.2842[0m | [32m     6.1750[0m | [32m      829.2753[0m | [32m     0.4185[0m | 
    2 | 00m12s | [35m   0.71494[0m | [32m            0.5165[0m | [32m         0.0564[0m | [32m     5.3417[0m | [32m      107.7978[0m | [32m     0.7367[0m | 
    3 | 00m09s | [35m   0.71534[0m | [32m            0.4020[0m | [32m         0.2112[0m | [32m     2.5910[0m | [32m      264.6536[0m | [32m     0.8324[0m | 
    4 | 02m03s |    0.70048 |             0.6912 |          0.1051 |      6.3792 |       658.2713 |      0.5685 | 
    5 | 03m36s |    0.70166 |             0.5814 |          0.0898 |      6.6901 |      1211.0768 |      0.7776 | 
