In [15]:
import pandas as pd
import numpy as np
import pickle

In [16]:
#data
dataset = pd.read_csv('/home/kate/data/ClaimPrediction/fdata_v1_encd.csv', index_col=None)
target_column = 'hasclaim'

In [17]:
featureset=[
'acci_last_infractionage',
'carpoolind_encd',
'classcd_encd',
'driverage',
'drivernumber',
'estimatedannualdistance',
'gooddriverind_encd',
'maritalstatuscd_encd',
'mvrstatus_encd',
'mvrstatusage',
'ratingvalue',
'vehbodytypecd_encd',
'vehicleage',
'vehnumber',
'licenseage',
'gendercd_encd',
'external_length_in',
'external_width_in'
]
#add calculated column
dataset['licenseage']=dataset['driverage']-dataset['havelicenseage']

In [18]:
#models files dir
ModelsDir='/home/kate/Models/XGB/'

In [19]:
#xgb library and parameters
import xgboost as xgb

xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'silent': True,
        'booster': 'gbtree',
        'seed': 42,
        'scale_pos_weight':0.3,
        'colsample_bylevel': 0.232094506,
        'colsample_bytree': 0.978684648,
        'eta': 0.01208041,
        'max_depth': 4}


In [20]:
#Evaluation metric to be used in tuning
from sklearn.metrics import roc_auc_score,confusion_matrix
def gini(y, pred):
    g = np.asarray(np.c_[y, pred, np.arange(len(y)) ], dtype=np.float)
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)
def gini_xgb(pred, y):
    y = y.get_label()
    return 'gini', gini(y, pred) / gini(y, y)

In [21]:
#StratifiedKFold
from sklearn.model_selection import StratifiedKFold
nrounds=5000 # need to change to 2000
kfold = 10  # need to change to 10
skf = StratifiedKFold(n_splits=kfold, random_state=42)

In [22]:
#splitting to train/test 
from sklearn.model_selection import train_test_split
X, X_test, y, y_test = train_test_split(dataset.loc[:,featureset], dataset[target_column], test_size=0.2, random_state=42)

In [23]:
#prediction dataframes
y_pred_test=pd.DataFrame(index=y_test.index)
y_pred_test[target_column]=0

#Stratified Fold
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(' xgb kfold: {}  of  {} : '.format(i+1, kfold))
    #getting fold data
    X_train, X_valid = X.iloc[train_index,:].copy(), X.iloc[test_index,:].copy()
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    #preparing for XGB run
    X_train = X_train.values
    X_valid = X_valid.values
    #
    y_pred_train=pd.DataFrame(index=y_train.index)
    y_pred_train[target_column]=0
    #
    y_train = y_train.values
    y_valid = y_valid.values

    #applying XGB
    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_valid, y_valid)

    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    xgb_model = xgb.train(xgb_params, d_train, nrounds, watchlist, early_stopping_rounds=100, 
                          feval=gini_xgb, maximize=True, verbose_eval=1000)
    y_pred_test[target_column] +=  xgb_model.predict(xgb.DMatrix(X_test.values), ntree_limit=xgb_model.best_ntree_limit+50) / (kfold)
    y_pred_train[target_column] += xgb_model.predict(xgb.DMatrix(X_train), ntree_limit=xgb_model.best_ntree_limit+50) / (kfold)

    
    #save model
    #xgb_model.save_model('%sm_%s.model'%(ModelsDir,i))
    #pickle.dump(xgb_model, open('%sm_%s.model_protocol2'%(ModelsDir,i), 'wb'), protocol=2)
    pickle.dump(xgb_model, open('%sm_%s.model_with_all'%(ModelsDir,i), 'wb'))
    
    #dump model
    xgb_model.dump_model('%smodel_with_all_raw_%s.txt'%(ModelsDir,i))

 xgb kfold: 1  of  10 : 
[0]	train-auc:0.553876	valid-auc:0.574667	train-gini:0.111422	valid-gini:0.149707
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[1000]	train-auc:0.732396	valid-auc:0.708472	train-gini:0.464791	valid-gini:0.416944
Stopping. Best iteration:
[1766]	train-auc:0.747059	valid-auc:0.710901	train-gini:0.494118	valid-gini:0.421803

 xgb kfold: 2  of  10 : 
[0]	train-auc:0.556982	valid-auc:0.546742	train-gini:0.116147	valid-gini:0.178519
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[1000]	train-auc:0.732778	valid-auc:0.709573	train-gini:0.465556	valid-gini:0.419148
Stopping. Best iteration:
[1200]	train-auc:0.737085	valid-auc:0.710284	train-gini:0.474169	valid-gini:0.420569

 xgb kfold: 3  of  10 : 
[0]	train-auc:0.555479	valid-auc:0.560258	train-gini:0.113625	valid-gini:0.

In [24]:
#Prediction results
g=gini(y_test,y_pred_test)/gini(y_test,y_test)
print('Test Gini - %f'%g)

ROC_AUC=roc_auc_score(y_test, y_pred_test)
print('Test ROC_AUC - %f'%ROC_AUC)

Test Gini - 0.423917
Test ROC_AUC - 0.711958


In [25]:
#mean prediction value to convert to binary
m=y_pred_test.mean()

In [26]:
y_pred_test[y_pred_test > m] = 1
y_pred_test[y_pred_test <= m] = 0

print ('Confusion matrix\n')    
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_test).ravel()
print('TP=%d FP=%d'%(tp,fp))
print('FN=%d TN=%d'%(fn,tn))

Confusion matrix

TP=924 FP=9552
FN=426 TN=15673
