In [134]:
#https://www.kaggle.com/tilii7/bayesian-optimization-of-xgboost-parameters
import pandas as pd
import numpy as np
import warnings

In [135]:
import time
timestr = time.strftime("%Y%m%d-%H%M%S")

In [136]:
init_points=2
n_iter=5
acq='ei'
xi=1e-4
folds = 5

In [137]:
#log_file
log_file = open('/home/kate/logs/BaysianOptimization/full_log_%s.log'%acq,  'w')
log_file_bestparam = open('/home/kate/logs/BaysianOptimization/bestparam_%s.log'%acq,  'w')

In [138]:
#log in csv file
csv_file='/home/kate/logs/BestModel_%s.log.csv'%acq

In [139]:
#data
dataset = pd.read_csv('/home/kate/data/ClaimPrediction/fdata_v1_encd.csv', index_col=None)

In [140]:
#features
target_column = 'hasclaim'
featureset=[
'accidentpreventioncourseind_encd',
'carpoolind_encd',
'classcd_encd',
'driverage',
'drivernumber',
'driverstatuscd_encd',
'drivertrainingind_encd',
'estimatedannualdistance',
'gooddriverind_encd',
'maturedriverind_encd',
'mvrstatus_encd',
'mvrstatusage',
'ratingvalue',
'relationshiptoinsuredcd_encd',
'scholasticdiscountind_encd',
'vehbodytypecd_encd',
'vehicleage',
'vehnumber'
]

In [141]:
#xgb library and parameters to tune later
import xgboost as xgb
xgb_params = {'eta': 0.02, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.9, 
          'objective': 'binary:logistic', 'eval_metric': 'auc', 'silent': True}

In [142]:
#Random Undersampler to balance the dataset
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42)

In [143]:
#splitting to train/test
from sklearn.model_selection import train_test_split
s=0.25
X_train, X_test, y_train, y_test = train_test_split(dataset.loc[:,featureset], dataset[target_column], test_size=s, random_state=42)
X_train = X_train.values
y_train = y_train.values
#balancing dataset
X_res, y_res = rus.fit_sample(X_train, y_train)
dtrain = xgb.DMatrix(X_res, y_res)

In [144]:
#best metric variables
AUCbest = -1.
ITERbest = 0

In [145]:
# cv fold for each parameters set
def xgb_evaluate(max_depth,
                 min_child_weight,
                 colsample_bytree,
                 subsample,
                 gamma,
                 colsample_bylevel,
                 max_delta_step,
                 eta,
                 reg_alpha,
                 reg_lambda
         ):

    global AUCbest
    global ITERbest

    params={}
    params['booster'] = 'gbtree'
    params['min_child_weight'] = int(min_child_weight)
    params['cosample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = gamma
    params['colsample_bylevel'] = max(min(colsample_bylevel, 1), 0)
    params['max_delta_step']=max(int(max_delta_step),0)
    params['eta']=max(min(eta,1), 0)
    params['reg_alpha'] = max(reg_alpha, 0)
    params['reg_lambda']=max(min(reg_lambda, 1), 0)
    params['eval_metric']='auc'
    params['silent']=True
    params['objective']='binary:logistic'
    params['seed'] =42

    
    

    print("\n Search parameters (%d-fold validation):\n %s" % (folds, params), file=log_file )
    log_file.flush()

    xgbc = xgb.cv(
                    params,
                    dtrain,
                    num_boost_round = 20000,
                    stratified = True,
                    nfold = folds,
                    early_stopping_rounds = 100,
                    metrics = 'auc',
                    show_stdv = True
               )


    val_score = xgbc['test-auc-mean'].iloc[-1]
    train_score = xgbc['train-auc-mean'].iloc[-1]
    print(' Stopped after %d iterations with train-auc = %f val-auc = %f ( diff = %f ) train-gini = %f val-gini = %f' 
          % ( len(xgbc), train_score, val_score, (train_score - val_score), (train_score*2-1),
(val_score*2-1)) , file=log_file)
    if ( val_score > AUCbest ):
        AUCbest = val_score
        ITERbest = len(xgbc)
        print('\n\nBest Valid AUC changed to %f'%AUCbest, file=log_file)
        log_file.flush()
        #
        print("\n Best parameters (%d-fold validation):\n %s" % (folds, params), file=log_file_bestparam )
        print('\n Best Valid AUC changed to %f'%AUCbest, file=log_file_bestparam)
        print('\n Train AUC is %f'%train_score, file=log_file_bestparam)
        log_file_bestparam.flush()
        #
    return (val_score*2) - 1

In [146]:
#bayesian optimization
from bayes_opt import BayesianOptimization
XGB_BO = BayesianOptimization(xgb_evaluate, {   'max_depth': (2, 12),
                                                'min_child_weight': (0.1, 20),
                                                'colsample_bytree': (0.2, 1.1),
                                                'subsample': (0.1, 1.1),
                                                'gamma': (0.001, 10),
                                                'colsample_bylevel': (0.2, 1.1),
                                                'max_delta_step':(0,10),
                                                'eta':(0.01,1.1),
                                                'reg_alpha': (0, 10),
                                                'reg_lambda':(1,10)
                                                })

In [147]:
#run optimization
print('-'*130)
print('-'*130, file=log_file)
log_file.flush()

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    XGB_BO.maximize(init_points=init_points, n_iter=n_iter, acq=acq, xi=xi)

----------------------------------------------------------------------------------------------------------------------------------
[31mInitialization[0m
[94m-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bylevel |   colsample_bytree |       eta |     gamma |   max_delta_step |   max_depth |   min_child_weight |   reg_alpha |   reg_lambda |   subsample | 
    1 | 00m10s | [35m   0.37405[0m | [32m             1.0210[0m | [32m            1.0346[0m | [32m   0.0149[0m | [32m   2.6369[0m | [32m          8.8963[0m | [32m     4.7117[0m | [32m           15.5839[0m | [32m     7.8881[0m | [32m      5.6246[0m | [32m     0.1635[0m | 
    2 | 00m01s |    0.35730 |              1.0485 |             0.3696 |    0.4980 |    3.5478 |           0.5715 |     10.8642 |            18.8045 |   

In [148]:
print('-'*130)
print('Final Results')
print('Maximum XGBOOST value: %f' % XGB_BO.res['max']['max_val'])
print('Best XGBOOST parameters: ', XGB_BO.res['max']['max_params'])
print('-'*130, file=log_file)
print('Final Result:', file=log_file)
print('Maximum XGBOOST value: %f' % XGB_BO.res['max']['max_val'], file=log_file)
print('Best XGBOOST parameters: ', XGB_BO.res['max']['max_params'], file=log_file)
log_file.flush()
log_file.close()

----------------------------------------------------------------------------------------------------------------------------------
Final Results
Maximum XGBOOST value: 0.383866
Best XGBOOST parameters:  {'max_depth': 2.0, 'min_child_weight': 20.0, 'colsample_bytree': 0.20000000000000001, 'subsample': 1.1000000000000001, 'gamma': 0.001, 'colsample_bylevel': 1.1000000000000001, 'max_delta_step': 0.0, 'eta': 0.01, 'reg_alpha': 10.0, 'reg_lambda': 10.0}


In [149]:
history_df = pd.DataFrame(XGB_BO.res['all']['params'])
history_df2 = pd.DataFrame(XGB_BO.res['all']['values'])
history_df = pd.concat((history_df, history_df2), axis=1)
history_df.rename(columns = { 0 : 'gini'}, inplace=True)
history_df['AUC'] = ( history_df['gini'] + 1 ) / 2
history_df.to_csv(csv_file)