In [171]:
# Run configuration for this notebook.
# NOTE(review): both directories are absolute, machine-specific paths.
ModelsDir = '/home/kate/Research/Property/Models/'  # per-fold model files are pickled here
DataDir = '/home/kate/Research/Property/Data/'      # input CSVs are read from (and written back to) here
ModelName = 'wc_Gamma_Reg_XGB_mae'                  # prefix of every per-fold model file name
UseSavedIfExists = False                            # True: load an existing per-fold model file instead of retraining

In [172]:
import sys

sys.path.append('/home/kate/code/Utils/')

from MyFunctions import NormalizedWeightedGini
from MyFunctions import mae
from MyFunctions import rmse

In [173]:
import pandas as pd
import numpy as np
import pickle
import xgboost as xgb
import os

In [174]:
# Load the three input tables from DataDir. All three reads use the same
# options, so the options are written once and reused.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 in favour of
# `on_bad_lines` -- confirm the pinned pandas version before upgrading.
_read_options = {'error_bad_lines': False, 'index_col': False}

training_dataset = pd.read_csv('%sproperty_wcs_training_for_gamma.csv' % DataDir, **_read_options)
testing_dataset = pd.read_csv('%sproperty_wcf_testing.csv' % DataDir, **_read_options)
prediction_dataset = pd.read_csv('%sproperty_water_claims_non_cat_fs.csv' % DataDir, **_read_options)

In [175]:
# Column holding the value the model is fitted to.
target_column = 'cova_il_nc_water'
# Column that receives this notebook's fold-averaged predictions in all three frames.
prediction_column_cv = 'gamma_reg_xgb_mae'

In [176]:
# Model inputs, in the column order used to build every DMatrix in this notebook.
features = [
    'cova_deductible',
    'roofcd_encd',
    'water_risk_sev_3_blk',
    'sqft',
    'rep_cost_3_blk',
    'yearbuilt',
    'ecy',
    'usagetype_encd',
]

In [177]:
# Full training table: feature matrix with the target attached as the label.
X = training_dataset[features]
y = training_dataset[target_column]
Dtrain = xgb.DMatrix(X.values, label=y)

# Test table: the DMatrix carries features only (no label is attached).
X_test = testing_dataset[features]
y_test = testing_dataset[target_column]
Dtest = xgb.DMatrix(X_test.values)

# Table to be scored: again a features-only DMatrix.
X_pred = prediction_dataset[features]
y_pred = prediction_dataset[target_column]
Dpred = xgb.DMatrix(X_pred.values)

In [178]:
# Training-loop settings.
nrounds = 600  # number of boosting rounds requested from xgb.train
esr = 100      # value passed as early_stopping_rounds
kfold = 5      # number of folds iterated over (columns fold_0 .. fold_<kfold-1>)

# Booster parameters handed to xgb.train unchanged.
xgb_params = dict(
    seed=42,
    eta=0.02,
    colsample_bytree=0.9,
    silent=1,
    subsample=0.9,
    max_depth=6,
    gamma=0.9,
    min_child_weight=4,
    objective='reg:gamma',
)

In [179]:
# Redundant: repeats the `kfold = 5` assignment already made next to nrounds/esr in the previous cell.
kfold = 5

In [180]:
from sklearn.metrics import mean_absolute_error
def evalerror(preds, dtrain):
    """Custom evaluation metric for ``xgb.train``: mean absolute error.

    Parameters
    ----------
    preds : array-like
        Predictions for the rows held by ``dtrain``.
    dtrain : object exposing ``get_label()``
        The evaluated data set; its labels are the ground truth.

    Returns
    -------
    tuple
        ``('mae', value)`` -- the metric name followed by its value.
    """
    labels = dtrain.get_label()
    # scikit-learn's signature is (y_true, y_pred): ground truth goes first.
    # MAE is symmetric, so the value is the same as with the swapped order.
    return 'mae', mean_absolute_error(labels, preds)

In [181]:
# Reset the prediction column in all three frames; the loop below accumulates into it,
# so starting from zero keeps this cell safe to re-run.
training_dataset[prediction_column_cv] = 0
testing_dataset[prediction_column_cv] = 0
prediction_dataset[prediction_column_cv] = 0

# Score accumulators; they are filled by the evaluation cell that follows the loop.
trn_gini_l = list()
trn_mae_l = list()
trn_rmse_l = list()
test_gini_l = list()
test_mae_l = list()
test_rmse_l = list()

# One model per fold. Each model scores the FULL training, testing and prediction
# tables, and its output is divided by kfold, so after the last fold the prediction
# column holds the average over all fold models.
for i in range(0, kfold):
    print(' fold: {}  of  {} : '.format(i+1, kfold))

    # Column fold_<i>: rows with a value > 0 train this fold's model, rows equal to 0 validate it.
    fold_flag = training_dataset['fold_%s' % i]
    training_dataset_fold = training_dataset[fold_flag > 0]
    validation_dataset = training_dataset[fold_flag == 0]

    # Plain numpy arrays for XGBoost.
    X_train = training_dataset_fold[features].values
    X_valid = validation_dataset[features].values
    y_train = training_dataset_fold[target_column].values
    y_valid = validation_dataset[target_column].values

    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_valid, y_valid)
    # 'valid' is listed last; the captured training log shows valid-mae driving early stopping.
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    xgb_model_file = '%s%s_%s.model' % (ModelsDir, ModelName, i)
    # Logical `and` (not bitwise `&`): the filesystem is only consulted when reuse is enabled.
    if UseSavedIfExists and os.path.exists(xgb_model_file):
        print('%s file exists. Reading model from the file'%xgb_model_file)
        # NOTE(review): pickle.load can execute arbitrary code -- only load model files you produced.
        with open(xgb_model_file, 'rb') as model_fh:
            xgb_model = pickle.load(model_fh)
    else:
        # This branch also runs when the file exists but UseSavedIfExists is False;
        # the existing file is then overwritten by the freshly trained model.
        print('%s file does not exists. Training model...'%xgb_model_file)
        xgb_model = xgb.train(xgb_params, d_train, nrounds, watchlist,
                              feval=evalerror, verbose_eval=100, early_stopping_rounds=esr)
        with open(xgb_model_file, 'wb') as model_fh:
            pickle.dump(xgb_model, model_fh)

    # NOTE(review): predictions use best_ntree_limit + 50, i.e. 50 more than the limit
    # recorded on the model -- confirm this offset is intentional.
    ntree_limit = xgb_model.best_ntree_limit + 50
    training_dataset[prediction_column_cv] += xgb_model.predict(Dtrain, ntree_limit=ntree_limit) / kfold
    testing_dataset[prediction_column_cv] += xgb_model.predict(Dtest, ntree_limit=ntree_limit) / kfold
    prediction_dataset[prediction_column_cv] += xgb_model.predict(Dpred, ntree_limit=ntree_limit) / kfold

 fold: 1  of  5 : 
/home/kate/Research/Property/Models/wc_Gamma_Reg_XGB_mae_0.model file does not exists. Training model...
[0]	train-gamma-nloglik:19752.7	valid-gamma-nloglik:19532.7	train-mae:10075.7	valid-mae:9963.48
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 100 rounds.
[100]	train-gamma-nloglik:2675.51	valid-gamma-nloglik:2645.73	train-mae:10072.4	valid-mae:9960.22
[200]	train-gamma-nloglik:366.092	valid-gamma-nloglik:362.052	train-mae:10048.4	valid-mae:9936.21
[300]	train-gamma-nloglik:55.2584	valid-gamma-nloglik:54.7021	train-mae:9875.06	valid-mae:9763
[400]	train-gamma-nloglik:14.8031	valid-gamma-nloglik:14.7266	train-mae:8883.5	valid-mae:8782.51
[500]	train-gamma-nloglik:10.4447	valid-gamma-nloglik:10.4574	train-mae:7035.08	valid-mae:6987.2
[599]	train-gamma-nloglik:10.1616	valid-gamma-nloglik:10.2038	train-mae:6765.47	valid-mae:6939.4
 fold: 2  of  5 : 
/home/kate/Research/Property/Models

In [182]:
# Self-assignment: every frame's prediction column is written back onto itself, so the
# values are unchanged. The averaging over folds already happens inside the training
# loop, where each fold's prediction is divided by kfold before being added.
for _frame in (training_dataset, testing_dataset, prediction_dataset):
    _frame[prediction_column_cv] = _frame[prediction_column_cv]

In [183]:
# --- Training scores: computed on every row of the training table -------------------
trn_actual = training_dataset[target_column]
trn_predicted = training_dataset[prediction_column_cv]

# --- Test scores: only rows with cova_ic_nc_water > 0 are scored ---------------------
# The filter is applied once and reused instead of re-filtering the frame for every argument.
test_scored = testing_dataset[testing_dataset.cova_ic_nc_water > 0]
test_actual = test_scored[target_column]
test_predicted = test_scored[prediction_column_cv]

# Each list is (re)built with exactly one entry rather than appended to, so re-running
# this cell does not add duplicate rows to the ScoresFinal table.
# 'ecy' is passed as the third argument of NormalizedWeightedGini (presumably the weight -- TODO confirm).
trn_gini_l = [NormalizedWeightedGini(trn_actual, trn_predicted, training_dataset['ecy'])]
trn_mae_l = [mae(trn_actual, trn_predicted)]
trn_rmse_l = [rmse(trn_actual, trn_predicted)]

test_gini_l = [NormalizedWeightedGini(test_actual, test_predicted, test_scored['ecy'])]
test_mae_l = [mae(test_actual, test_predicted)]
test_rmse_l = [rmse(test_actual, test_predicted)]

In [184]:
# One row per recorded score set: zip the six score lists together position by position.
_score_columns = ['trn_gini', 'trn_mae', 'trn_rmse', 'test_gini', 'test_mae', 'test_rmse']
_score_rows = list(zip(trn_gini_l, trn_mae_l, trn_rmse_l, test_gini_l, test_mae_l, test_rmse_l))
ScoresFinal = pd.DataFrame(_score_rows, columns=_score_columns)
# Bare last expression so the notebook renders the table.
ScoresFinal

Unnamed: 0,trn_gini,trn_mae,trn_rmse,test_gini,test_mae,test_rmse
0,0.407958,6784.743397,9514.662891,0.240904,11294.061518,21936.495222


In [186]:
#-----------------------------------------------------------------------------------------------------------
# Write each frame, now carrying the prediction column, back to the CSV it was loaded from.
# NOTE: this overwrites the input files in DataDir.
for _frame, _csv_pattern in (
    (training_dataset, '%sproperty_wcs_training_for_gamma.csv'),
    (testing_dataset, '%sproperty_wcf_testing.csv'),
    (prediction_dataset, '%sproperty_water_claims_non_cat_fs.csv'),
):
    _frame.to_csv(_csv_pattern % DataDir, header=True, index=False)