In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4


ImportError: cannot import name 'cross_validation'

In [3]:
from sklearn.model_selection import cross_val_score

In [None]:
def modelfit(alg, dtrain, predictors, useTrainCV = True, cv_folds=5, early_stopping_rounds = 50, target='Disbursed'):
    """Fit an XGBoost classifier on ``dtrain`` and report training metrics.

    When ``useTrainCV`` is true, ``xgb.cv`` is run first with AUC as the
    metric and ``alg``'s ``n_estimators`` is overwritten with the number of
    rows in the CV result (the boosting rounds kept after early stopping).
    ``alg`` is then fitted on all of ``dtrain``; training accuracy and AUC
    are printed and a bar chart of the booster's feature scores is drawn.

    Parameters
    ----------
    alg : XGBoost scikit-learn style classifier
        Must provide ``get_xgb_params``, ``get_params``, ``set_params``,
        ``fit``, ``predict``, ``predict_proba`` and ``get_booster``.
        It is modified in place (``n_estimators`` and the fitted state).
    dtrain : pandas.DataFrame
        Training data holding both the predictor columns and ``target``.
    predictors : list
        Column labels of ``dtrain`` used as features.
    useTrainCV : bool
        Whether to choose ``n_estimators`` with ``xgb.cv`` before fitting.
    cv_folds : int
        Number of folds passed to ``xgb.cv``.
    early_stopping_rounds : int
        Early-stopping patience passed to ``xgb.cv``.
    target : str
        Label column of ``dtrain``. Defaults to ``'Disbursed'``, the column
        that used to be hard-coded for fitting and scoring.

    Returns
    -------
    None
    """
    # Imported here because the notebook's top import cell fails before
    # `metrics` is bound (see the recorded ImportError).
    from sklearn import metrics

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)

        # `verbose_eval` replaces the removed `show_progress` keyword.
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds,
                          metrics='auc',
                          early_stopping_rounds=early_stopping_rounds,
                          verbose_eval=False)
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data, using the same label column as the CV step.
    alg.fit(dtrain[predictors], dtrain[target], eval_metric='auc')

    # Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]

    # Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain[target].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[target], dtrain_predprob))

    # `booster` is a string parameter on current wrappers; the fitted Booster
    # is reached through get_booster().
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    ax = feat_imp.plot(kind='bar', title='Feature Importances')
    ax.set_ylabel('Feature Importance Score')

## General Approach for Parameter Tuning
We will use an approach similar to that of GBM here. The various steps to be performed are:

1. Choose a relatively high learning rate. Generally a learning rate of 0.1 works, but somewhere between 0.05 and 0.3 should work for different problems. Determine the optimum number of trees for this learning rate. XGBoost has a very useful function called “cv” which performs cross-validation at each boosting iteration and thus returns the optimum number of trees required.
2. Tune tree-specific parameters (max_depth, min_child_weight, gamma, subsample, colsample_bytree) for the chosen learning rate and number of trees. Note that we can choose different parameters to define a tree and I’ll take up an example here.
3. Tune regularization parameters (lambda, alpha) for xgboost which can help reduce model complexity and enhance performance.
4. Lower the learning rate and decide the optimal parameters.

#### Step 1: Fix learning rate and number of estimators for tuning tree-based parameters.

In order to decide on boosting parameters, we need to set some initial values of other parameters. Let's take the following values:

1. **max_depth = 5 :** This should be between 3-10. I’ve started with 5 but you can choose a different number as well. 4-6 can be good starting points.
2. **min_child_weight = 1 :** A smaller value is chosen because it is a highly imbalanced class problem and leaf nodes can have smaller size groups.
3. **gamma = 0 :** A small value like 0.1-0.2 can also be chosen as a starting point. This will be tuned later anyway.
4. **subsample, colsample_bytree = 0.8 :** This is a commonly used start value. Typical values range between 0.5-0.9.
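5. **scale_pos_weight = 1:** This is the default and applies no re-weighting of the positive class; because of the high class imbalance, a larger value (around the ratio of negative to positive samples) is also worth trying.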

Please note that all the above are just initial estimates and will be tuned later. Let's take the default learning rate of 0.1 here and check the optimum number of trees using the cv function of xgboost. The function defined above will do it for us.

In [None]:
# Feature columns: every column of `train` except the target and ID columns.
excluded_cols = (target, IDcol)
predictors = [col for col in train.columns if col not in excluded_cols]

# Starting configuration for the classifier. n_estimators is only an upper
# bound here: modelfit replaces it with the round count found by xgb.cv.
xgb1_params = dict(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
xgb1 = XGBClassifier(**xgb1_params)
modelfit(xgb1, train, predictors)

In [None]:
import xgboost as xgb

In [70]:
# Baseline regressor: the starting values that the later cells tune.
xgb_model_params = {
    "learning_rate": 0.1,
    "n_estimators": 100,
    "max_depth": 5,
    "min_child_weight": 1,
    "gamma": 0,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": "reg:squarederror",
    "verbosity": 1,
}
xgb_model = xgb.XGBRegressor(**xgb_model_params)

In [35]:
# Booster-level parameters of the sklearn-style model, for use with xgb.cv.
xgb_param = xgb_model.get_xgb_params()

# Wrap the train / validation splits as DMatrix objects (features + labels).
dm_train = xgb.DMatrix(x_train, label=y_train)
dm_valid = xgb.DMatrix(x_valid, label=y_valid)

In [71]:
# 5-fold cross-validation with early stopping gives a good starting value
# for n_estimators. This cell is slow.
max_rounds = xgb_model.get_params()['n_estimators']
cvresult = xgb.cv(
    params=xgb_param,
    dtrain=dm_train,
    num_boost_round=max_rounds,
    nfold=5,
    metrics=["rmse", "mae"],
    early_stopping_rounds=5,
)

In [73]:
# Carry the number of rounds kept by the CV run over to the sklearn-style model.
n_rounds_from_cv = len(cvresult)
xgb_model.set_params(n_estimators=n_rounds_from_cv)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=0.8, verbosity=1)

In [74]:
# Evaluation sets and metrics tracked while fitting. They only take effect
# when handed to fit(); afterwards xgb_model.evals_result() holds the
# per-round value of every metric on every set.
eval_s = [ (x_train, y_train) , (x_valid,y_valid)]
eval_metric=["rmse","mae"]

# NOTE(review): passing eval_metric to fit() matches the xgboost release whose
# repr is recorded in this notebook; newer releases expect it on the
# constructor instead. Confirm against the installed version.
xgb_model.fit(x_train,y_train, eval_set=eval_s, eval_metric=eval_metric, verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=0.8, verbosity=1)

In [77]:
# Sanity check of the model fitted with the default parameters and the
# CV-chosen n_estimators. score() on a regressor is the R^2 coefficient;
# it is computed on the training data here, so it is an optimistic figure.
train_r2 = xgb_model.score(X=x_train, y=y_train)
train_r2

0.9706367474877502

In [81]:
from sklearn.metrics import mean_absolute_error

In [83]:
from sklearn.metrics import r2_score

### Tuning max_depth and min_child_weight

In [72]:
# Next, tune these two booster parameters (max_depth and min_child_weight)
# with a randomized search (RandomizedSearchCV).

In [None]:
# The regressor is already configured; only max_depth and min_child_weight
# are searched here.
import time

from sklearn.model_selection import RandomizedSearchCV

param = {'max_depth':[3,5,7,9],
      'min_child_weight':[1,3,5],
      }

# `iid` is not passed: the keyword was deprecated and then removed from
# scikit-learn. random_state pins which 10 of the 12 combinations are sampled
# so that a re-run evaluates the same candidates.
xgb_reg = RandomizedSearchCV(xgb_model, param, n_iter=10, verbose=2, cv=5, random_state=27)

print("Randomized search..")
search_time_start = time.perf_counter()
xgb_reg.fit(x_train, y_train)
print("Randomized search time:", time.perf_counter() - search_time_start)

Randomized search..
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] min_child_weight=5, max_depth=5 .................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .................. min_child_weight=5, max_depth=5, total= 1.1min
[CV] min_child_weight=5, max_depth=5 .................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.1min remaining:    0.0s


[CV] .................. min_child_weight=5, max_depth=5, total= 1.1min
[CV] min_child_weight=5, max_depth=5 .................................
[CV] .................. min_child_weight=5, max_depth=5, total= 1.0min
[CV] min_child_weight=5, max_depth=5 .................................
[CV] .................. min_child_weight=5, max_depth=5, total= 1.1min
[CV] min_child_weight=5, max_depth=5 .................................
[CV] .................. min_child_weight=5, max_depth=5, total= 1.0min
[CV] min_child_weight=1, max_depth=7 .................................
[CV] .................. min_child_weight=1, max_depth=7, total= 1.5min
[CV] min_child_weight=1, max_depth=7 .................................
[CV] .................. min_child_weight=1, max_depth=7, total= 1.6min
[CV] min_child_weight=1, max_depth=7 .................................
[CV] .................. min_child_weight=1, max_depth=7, total= 1.5min
[CV] min_child_weight=1, max_depth=7 .................................
[CV] .

In [None]:
# Summarise the finished search: best CV score first, then every winning
# parameter in alphabetical order.
best_score, best_params = xgb_reg.best_score_, xgb_reg.best_params_
report_lines = ["Best score: {}".format(best_score), "Best params: "]
report_lines.extend('%s: %r' % item for item in sorted(best_params.items()))
print("\n".join(report_lines))

In [None]:
# Finer grid: take values just around the best ones found by the coarse search.
param_test2 = dict(
    max_depth=[4, 5, 6],
    min_child_weight=[4, 5, 6],
)

In [31]:
# Header lines of the model report.
# NOTE(review): the output recorded for this cell is an AttributeError about
# predict_proba, which these two prints cannot raise; the output looks stale.
for report_line in ("Model Report:", "accuracy: "):
    print(report_line)

AttributeError: 'XGBRegressor' object has no attribute 'predict_proba'

In [None]:
# Tune gamma with a 5-fold grid search scored by ROC AUC.
# sklearn.grid_search was removed from scikit-learn; GridSearchCV lives in
# sklearn.model_selection.
from sklearn.model_selection import GridSearchCV

param_test3 = {
 'gamma':[0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
}
# NOTE(review): `estimator` is not defined in any visible cell. Presumably it
# is an XGBClassifier carrying the parameters tuned so far; confirm before
# running.
# `iid` is no longer passed: the keyword was removed from scikit-learn, and
# iid=False is how current releases behave.
gsearch3 = GridSearchCV(estimator, param_grid = param_test3, scoring='roc_auc',n_jobs=4, cv=5)
gsearch3.fit(train[predictors],train[target])

# grid_scores_ was removed from scikit-learn; cv_results_ holds the
# per-candidate scores.
pd.DataFrame(gsearch3.cv_results_)[['params', 'mean_test_score', 'std_test_score']], gsearch3.best_params_, gsearch3.best_score_

In [None]:
# Tune the row / column sampling ratios with a 5-fold grid search scored by
# ROC AUC.
# sklearn.grid_search was removed from scikit-learn; GridSearchCV lives in
# sklearn.model_selection.
from sklearn.model_selection import GridSearchCV

param_test4 = {
 'subsample':[0.6, 0.7, 0.8, 0.9],
 'colsample_bytree':[0.6, 0.7, 0.8, 0.9]
}
# NOTE(review): `estimator` is not defined in any visible cell. Presumably it
# is an XGBClassifier carrying the parameters tuned so far; confirm before
# running.
# `iid` is no longer passed: the keyword was removed from scikit-learn, and
# iid=False is how current releases behave.
gsearch4 = GridSearchCV(estimator, param_grid = param_test4, scoring='roc_auc',n_jobs=4, cv=5)
gsearch4.fit(train[predictors],train[target])

# grid_scores_ was removed from scikit-learn; cv_results_ holds the
# per-candidate scores.
pd.DataFrame(gsearch4.cv_results_)[['params', 'mean_test_score', 'std_test_score']], gsearch4.best_params_, gsearch4.best_score_

In [None]:
# If the previous search picks 0.8 for both parameters, refine around it in
# steps of 0.05. The bare 'key': [...] lines were not valid Python on their
# own, so they are collected into a grid dict.
param_test5 = {
 'subsample':[0.75, 0.8, 0.85],
 'colsample_bytree':[0.75, 0.8, 0.85]
}

In [None]:
# Coarse grid for the L1 regularisation term. The bare 'key': [...] line was
# not valid Python on its own, so it is wrapped in a grid dict.
param_test6 = {
 'reg_alpha':[0.0005, 0.02, 0.1, 1, 100]
}

In [None]:
# Finer reg_alpha grid after the coarse search. The bare 'key': [...] line
# was not valid Python on its own, so it is wrapped in a grid dict.
# NOTE(review): the original note said the optimum was 0.1, but these values
# bracket 0.01. Confirm which value the coarse search actually selected.
param_test7 = {
 'reg_alpha':[0.001, 0.005, 0.01, 0.05]
}
    

In [None]:
# Now decrease the learning rate and increase the n_estimators

In [None]:
def modelfit(alg, dtrain, predictors, useTrainCV = True, cv_folds=5, early_stopping_rounds = 50, target='Disbursed'):
    """Fit an XGBoost classifier on ``dtrain`` and report training metrics.

    NOTE: ``modelfit`` is also defined in an earlier cell of this notebook;
    this later definition shadows it. Keep the two in sync or delete one.

    When ``useTrainCV`` is true, ``xgb.cv`` is run first with AUC as the
    metric and ``alg``'s ``n_estimators`` is overwritten with the number of
    rows in the CV result (the boosting rounds kept after early stopping).
    ``alg`` is then fitted on all of ``dtrain``; training accuracy and AUC
    are printed and a bar chart of the booster's feature scores is drawn.

    Parameters
    ----------
    alg : XGBoost scikit-learn style classifier
        Must provide ``get_xgb_params``, ``get_params``, ``set_params``,
        ``fit``, ``predict``, ``predict_proba`` and ``get_booster``.
        It is modified in place (``n_estimators`` and the fitted state).
    dtrain : pandas.DataFrame
        Training data holding both the predictor columns and ``target``.
    predictors : list
        Column labels of ``dtrain`` used as features.
    useTrainCV : bool
        Whether to choose ``n_estimators`` with ``xgb.cv`` before fitting.
    cv_folds : int
        Number of folds passed to ``xgb.cv``.
    early_stopping_rounds : int
        Early-stopping patience passed to ``xgb.cv``.
    target : str
        Label column of ``dtrain``. Defaults to ``'Disbursed'``, the column
        that used to be hard-coded for fitting and scoring.

    Returns
    -------
    None
    """
    # Imported here because the notebook's top import cell fails before
    # `metrics` is bound (see the recorded ImportError).
    from sklearn import metrics

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)

        # `verbose_eval` replaces the removed `show_progress` keyword.
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds,
                          metrics='auc',
                          early_stopping_rounds=early_stopping_rounds,
                          verbose_eval=False)
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data, using the same label column as the CV step.
    alg.fit(dtrain[predictors], dtrain[target], eval_metric='auc')

    # Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]

    # Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain[target].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[target], dtrain_predprob))

    # `booster` is a string parameter on current wrappers; the fitted Booster
    # is reached through get_booster().
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    ax = feat_imp.plot(kind='bar', title='Feature Importances')
    ax.set_ylabel('Feature Importance Score')