In [8]:
# Run configuration for this notebook.
# Both directories live under the same project root.
_PROJECT_ROOT = '/home/kate/Research/Property/'
# Directory holding the pickled per-fold model files.
ModelsDir = _PROJECT_ROOT + 'Models/'
# Directory holding the input CSV files (also used for the saved results).
DataDir = _PROJECT_ROOT + 'Data/'
# Base name of the model files; the fold index is appended when saving.
ModelName = 'basemodel0_class_XGB'
# When True, an existing model file is loaded instead of retraining the fold.
UseSavedIfExists = True

In [9]:
import sys

sys.path.append('/home/kate/code/Utils/')

from MyFunctions import NormalizedWeightedGini

In [10]:
import pandas as pd
import numpy as np
import pickle
import xgboost as xgb
import os

In [11]:
def _read_csv_tolerant(path):
    """Read a CSV file, skipping malformed rows, on both old and new pandas.

    The original call used ``error_bad_lines=False``, which pandas deprecated
    in 1.3 and removed in 2.0 (the call then raises ``TypeError``).  With the
    default ``warn_bad_lines=True`` its documented replacement is
    ``on_bad_lines='warn'``: bad lines are reported and skipped.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed file; ``index_col=False`` keeps the first column as data.
    """
    try:
        return pd.read_csv(path, on_bad_lines='warn', index_col=False)
    except TypeError:
        # pandas older than 1.3 does not accept `on_bad_lines`.
        return pd.read_csv(path, error_bad_lines=False, index_col=False)


# Frame the fold models are trained on.
training_dataset = _read_csv_tolerant('%sproperty_wcf_class_training_basemodel0.csv' % DataDir)
# Hold-out frame used for the test scores.
testing_dataset = _read_csv_tolerant('%sproperty_wcf_testing.csv' % DataDir)
# Frame that only receives predictions.
prediction_dataset = _read_csv_tolerant('%sproperty_water_claims_non_cat_fs.csv' % DataDir)

In [12]:
# Column names used throughout the notebook.
_prediction_prefix = 'basemodel0_class_xgb'
# Label column read from each dataset.
target_column = 'hasclaim'
# Column receiving the prediction averaged over all fold models.
prediction_column_cv = _prediction_prefix + '_cv'
# Prefix of the per-fold prediction columns (the fold index is appended).
prediction_column_fold = _prediction_prefix + '_fold'

In [13]:
# Columns fed to the model, in the order they are passed to XGBoost.
featureset = [
    'roofcd_encd',
    'sqft',
    'usagetype_encd',
    'yearbuilt',
    'water_risk_3_blk',
    'landlordind',
    'multipolicyind',
]

In [14]:
#Evaluation metric to be used in tuning
from sklearn.metrics import roc_auc_score,confusion_matrix

In [15]:
def gini(y, pred):
    """Compute the (unnormalised) Gini coefficient of a set of predictions.

    Rows are ordered by ``pred`` descending, ties broken by original position;
    the result is derived from the cumulative sum of ``y`` in that order.
    Divide by ``gini(y, y)`` to obtain the normalised Gini.

    Parameters
    ----------
    y : array-like
        Actual values (e.g. 0/1 labels), one per observation.
    pred : array-like
        Predicted scores, same length as ``y``.

    Returns
    -------
    float
        The Gini value.  The sum of ``y`` is used as a divisor and is not
        guarded, so an all-zero ``y`` does not yield a meaningful result.
    """
    # Builtin `float` replaces the `np.float` alias, which NumPy deprecated in
    # 1.20 and removed in 1.24 (it was only ever an alias for `float`).
    g = np.asarray(np.c_[y, pred, np.arange(len(y))], dtype=float)
    # lexsort uses the LAST key as primary: sort by -pred, then by row index.
    g = g[np.lexsort((g[:, 2], -1 * g[:, 1]))]
    gs = g[:, 0].cumsum().sum() / g[:, 0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)
def gini_xgb(pred, y):
    """Custom evaluation callback returning the normalised Gini.

    Parameters
    ----------
    pred : array-like
        Predictions for the evaluated data.
    y : object exposing ``get_label()``
        Evaluated data; its labels are obtained through ``get_label()``.

    Returns
    -------
    tuple
        ``('gini', value)`` where ``value`` is the Gini of the predictions
        divided by the Gini of the labels against themselves.
    """
    labels = y.get_label()
    normalized = gini(labels, pred) / gini(labels, labels)
    return 'gini', normalized

In [16]:
# Upper bound on the number of boosting rounds passed to xgb.train.
nrounds = 5000
# Booster parameters passed to xgb.train.
xgb_params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    silent=True,
    booster='gbtree',
    seed=42,
    scale_pos_weight=0.3,
    colsample_bylevel=0.8,
    colsample_bytree=0.8,
    eta=0.01,
    max_depth=6,
)

In [17]:
# Number of cross-validation folds. The modelling cell reads one `fold_<i>`
# column of the training frame per fold (i = 0 .. kfold-1) and averages the
# fold predictions by dividing each by this value.
kfold = 5

In [18]:
# Cross-validated scoring: for every fold, train (or reload) an XGBoost model,
# score the training / testing / prediction frames with it, and keep both the
# per-fold prediction and the average over all fold models.

# Create the output columns on all three frames. They are initialised as
# floats because fractional fold predictions are accumulated into the
# averaged column below.
for dataset in (training_dataset, testing_dataset, prediction_dataset):
    dataset[prediction_column_cv] = 0.0
    for i in range(kfold):
        dataset['%s_%s' % (prediction_column_fold, i)] = 0.0

Train_Gini_l = list()
Test_Gini_l = list()
Train_ROC_l = list()
Test_ROC_l = list()

# Full-frame DMatrix objects; every fold model scores all three of them.
X = training_dataset[featureset]
y = training_dataset[target_column]
Dtrain = xgb.DMatrix(X.values, y)
#
X_test = testing_dataset[featureset]
y_test = testing_dataset[target_column]
Dtest = xgb.DMatrix(X_test.values)
#
X_pred = prediction_dataset[featureset]
y_pred = prediction_dataset[target_column]
Dpred = xgb.DMatrix(X_pred.values)
#-----------------------------------------------------------------------------------------------------------
# CV-folds modeling
for i in range(kfold):
    print(' fold: {}  of  {} : '.format(i+1, kfold))
    # Rows with fold_<i> > 0 are used for fitting, rows with fold_<i> == 0
    # for validation / early stopping.
    training_dataset_fold = training_dataset[training_dataset['fold_%s' % i] > 0]
    validation_dataset = training_dataset[training_dataset['fold_%s' % i] == 0]

    X_train = training_dataset_fold[featureset].copy()
    X_valid = validation_dataset[featureset].copy()
    y_train = training_dataset_fold[target_column].copy()
    y_valid = validation_dataset[target_column].copy()

    # Preparing for the XGB run: plain numpy arrays for the DMatrix objects.
    X_train = X_train.values
    X_valid = X_valid.values
    # Not read anywhere else in this cell; kept so the cell defines the same
    # names as before.
    y_pred_train = pd.DataFrame(index=y_train.index)
    y_pred_train[prediction_column_cv] = 0
    #
    y_train = y_train.values
    y_valid = y_valid.values

    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_valid, y_valid)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    # Applying XGB: reuse the saved fold model when allowed, otherwise train.
    xgb_model_file = '%s%s_%s.model' % (ModelsDir, ModelName, i)
    if UseSavedIfExists and os.path.exists(xgb_model_file):
        print('%s file exists. Reading model from the file'%xgb_model_file)
        # NOTE: unpickling can execute arbitrary code; only load model files
        # from a location you trust.
        with open(xgb_model_file, 'rb') as model_fh:
            xgb_model = pickle.load(model_fh)
    else:
        print('%s file does not exists. Training model...'%xgb_model_file)
        xgb_model = xgb.train(xgb_params, d_train, nrounds, watchlist, early_stopping_rounds=100,
                              feval=gini_xgb, maximize=True, verbose_eval=100)
        # `with` guarantees the file is flushed and closed before moving on.
        with open(xgb_model_file, 'wb') as model_fh:
            pickle.dump(xgb_model, model_fh)

    # Score every frame with this fold's model; the tree limit is the model's
    # best_ntree_limit plus 50, as in the original cell.
    # NOTE: the training frame is scored in full, so its averaged column also
    # contains predictions on rows the fold model was fitted on.
    tree_limit = xgb_model.best_ntree_limit + 50
    for dataset, dmatrix in ((training_dataset, Dtrain),
                             (testing_dataset, Dtest),
                             (prediction_dataset, Dpred)):
        pred = xgb_model.predict(dmatrix, ntree_limit=tree_limit)
        dataset[prediction_column_cv] += pred / kfold
        dataset['%s_%s' % (prediction_column_fold, i)] = pred

# Scores of the fold-averaged predictions (one entry per list).
Train_Gini_l.append(gini(training_dataset[target_column], training_dataset[prediction_column_cv])
                    / gini(training_dataset[target_column], training_dataset[target_column]))
Test_Gini_l.append(gini(testing_dataset[target_column], testing_dataset[prediction_column_cv])
                   / gini(testing_dataset[target_column], testing_dataset[target_column]))
Train_ROC_l.append(roc_auc_score(training_dataset[target_column], training_dataset[prediction_column_cv]))
Test_ROC_l.append(roc_auc_score(testing_dataset[target_column], testing_dataset[prediction_column_cv]))
#-----------------------------------------------------------------------------------------------------------
# Saving the frames, now including the prediction columns, as CSV under DataDir.
training_dataset.to_csv('%sproperty_wcf_class_training_basemodel0.csv'%DataDir, header=True, index=False)
testing_dataset.to_csv('%sproperty_wcf_testing.csv'%DataDir, header=True, index=False)
prediction_dataset.to_csv('%sproperty_water_claims_non_cat_fs.csv'%DataDir, header=True, index=False)

 fold: 1  of  5 : 
/home/kate/Research/Property/Models/basemodel0_class_XGB_0.model file exists. Reading model from the file
 fold: 2  of  5 : 
/home/kate/Research/Property/Models/basemodel0_class_XGB_1.model file exists. Reading model from the file
 fold: 3  of  5 : 
/home/kate/Research/Property/Models/basemodel0_class_XGB_2.model file exists. Reading model from the file
 fold: 4  of  5 : 
/home/kate/Research/Property/Models/basemodel0_class_XGB_3.model file exists. Reading model from the file
 fold: 5  of  5 : 
/home/kate/Research/Property/Models/basemodel0_class_XGB_4.model file exists. Reading model from the file


In [19]:
# Summary table: one row per recorded score entry, one column per metric.
score_columns = ['Train_Gini', 'Test_Gini', 'Train_ROC_AUC', 'Test_ROC_AUC']
score_rows = list(zip(Train_Gini_l, Test_Gini_l, Train_ROC_l, Test_ROC_l))
Scores = pd.DataFrame(score_rows, columns=score_columns)
Scores

Unnamed: 0,Train_Gini,Test_Gini,Train_ROC_AUC,Test_ROC_AUC
0,0.43063,0.346558,0.715316,0.673279
