In [16]:
#compare to a base model
import pandas as pd
import numpy as np

In [17]:
#data
dataset = pd.read_csv('/home/kate/data/ClaimPrediction/fdata_v1_encd.csv', index_col=None)
target_column = 'hasclaim'

In [18]:
#models and analyzing results directory
ModelsDir='/home/kate/data/ClaimPrediction/fe6_trgtencd0/'

In [19]:
#comparing model metrics with t-test
#and save results
import scipy.stats as stats
def AnalyzeAndSaveModelsResults(result_df,result_lst,ModelName,filename):
    df=pd.DataFrame([result_lst])
    TestSizeColumns=['S0.45','S0.4','S0.35','S0.3','S0.25','S0.2','S0.15','S0.1']
    #TestSizeColumns=['S0.2','S0.15','S0.1']
    df.columns=['Model']+TestSizeColumns
    #mean
    df['Mean'] = df.drop('Model', axis=1).mean(axis=1)
    df['t-pvalue'] = 1
    df['t-statistic'] = 0
    df['Group'] = 1
    #t-test with base model
    if ModelName!='BaseModel':
        base_model=result_df[result_df['Model'] == 'BaseModel'].iloc[0]
        current_model=df.iloc[0]
        t=stats.ttest_ind(base_model[TestSizeColumns].tolist(),current_model[TestSizeColumns].tolist())
        line_to_save=[current_model['Model']]
        line_to_save.extend(current_model[TestSizeColumns].tolist())
        line_to_save.append(current_model[TestSizeColumns].mean())
        line_to_save.append(t.pvalue)
        line_to_save.append(t.statistic)
        if ((t.pvalue<=0.05) and (base_model['Mean']<current_model['Mean'])):
            line_to_save.append(2)
        elif ((t.pvalue<=0.05) and (base_model['Mean']>current_model['Mean'])):
            line_to_save.append(3)    
        else:
            line_to_save.append(1)                  
        df_to_save=pd.DataFrame([line_to_save])
        df_to_save.columns=['Model']+TestSizeColumns+['Mean','t-pvalue','t-statistic','Group']
        result_df=result_df.append(df_to_save, ignore_index=True)
    else:
        result_df=result_df.append(df, ignore_index=True)
    result_df.to_csv(ModelsDir+filename, index = False)
    return result_df

In [20]:
def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))
def target_encode(trn_series=None,
                  tst_series=None,
                  target=None,
                  fmin_samples_leaf=1.0,
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    fmin_samples_leaf (float) : minimum samples to take category average into account as a fraction of count
    KD: original min_samples_leaf = level count * fmin_samples_leaf
    smoothing (int) : smoothing effect to balance categorical average vs prior
    """
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    # 
    smoothing = 1 / (1 + np.exp(-(averages["count"] - averages["count"]*fmin_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

In [21]:
#xgb library and parameters to tune later
import xgboost as xgb
xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'silent': True,
        'booster': 'gbtree',
        'seed': 42,
        'scale_pos_weight':0.3,
        'colsample_bylevel': 0.232094506,
        'colsample_bytree': 0.978684648,
        'eta': 0.01208041,
        'max_depth': 4}

In [22]:
#Evaluation metric to be used in tuning
from sklearn.metrics import roc_auc_score
def gini(y, pred):
    g = np.asarray(np.c_[y, pred, np.arange(len(y)) ], dtype=np.float)
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)
def gini_xgb(pred, y):
    y = y.get_label()
    return 'gini', gini(y, pred) / gini(y, y)

In [23]:
#StratifiedKFold
from sklearn.model_selection import StratifiedKFold
nrounds=5000 # need to change to 2000
kfold = 10  # need to change to 10
skf = StratifiedKFold(n_splits=kfold, random_state=42)

In [24]:
#splitting to train/test in the loop below
from sklearn.model_selection import train_test_split

In [25]:
#each line in the file contains the model name and set of features to analize
models = pd.read_csv(ModelsDir+'Models.csv', index_col=None)

In [26]:
#names of labels in new bin columns created
group_names = [1,2,3,4,5,6,7,8,9,10]

In [27]:
#summary for test and train metrics for each model to test overfitting
models_test_gini_df=pd.DataFrame()
models_test_roc_auc_df=pd.DataFrame()
#
models_train_gini_df=pd.DataFrame()
models_train_roc_auc_df=pd.DataFrame()
#
base_model_df=pd.DataFrame()

In [28]:
for index, row in models.iterrows():
    #for test and train metrics for each model to test overfitting
    gini_test_lst=[]
    roc_auc_test_lst=[]
    gini_train_lst=[]
    roc_auc_train_lst=[]
    #Starting analyzing metric
    print (index, ': Analyzing model %s'%row['Model'])
    #add model name to metric storage
    gini_test_lst.append(row['Model'])
    roc_auc_test_lst.append(row['Model'])
    gini_train_lst.append(row['Model'])
    roc_auc_train_lst.append(row['Model'])
    #getting model parameters
    #first element names of rows in a creating current row dataframe
    featureset=['feature']
    fmin_samples_leaf_set=['fmin_samples_leaf']
    smoothing_set=['smoothing']
    noise_level_set=['noise_level']
    for c in row.index:
        if 'fmin_samples_leaf' in c:
            fmin_samples_leaf_set.append(row[c])
        elif 'smoothing' in c:
            smoothing_set.append(row[c])
        elif 'noise_level' in c:
            noise_level_set.append(row[c])
        elif 'F' in c:
            featureset.append(row[c])
    featureset=[x for x in featureset if str(x) != 'nan']
    fmin_samples_leaf_set=[x for x in fmin_samples_leaf_set if str(x) != 'nan']
    smoothing_set=[x for x in smoothing_set if str(x) != 'nan']
    noise_level_set=[x for x in noise_level_set if str(x) != 'nan']
    #into a dataframe with index as names of rows: fmin_samples_leaf, smoothing, noise_level
    #and columns as feature names
    analyzed_model=pd.DataFrame([fmin_samples_leaf_set,smoothing_set,noise_level_set])
    analyzed_model.columns=featureset #first column is now names of rows
    analyzed_model.set_index('feature', inplace=True)
    #calculating metrics for the current featureset and other parameters and 
    #several data sizes
    for s in (0.45,0.4,0.35,0.3,0.25,0.2,0.15,0.1):
        print ('Test size %s'%s)
        X, X_test, y, y_test = train_test_split(dataset.loc[:,analyzed_model.columns], dataset[target_column], test_size=s, random_state=42)
        #prediction dataframes
        y_pred_test=pd.DataFrame(index=y_test.index)
        y_pred_test[target_column]=0
        #
        X_test_origin=X_test.copy(deep=True)
        #Stratified Fold
        for i, (train_index, test_index) in enumerate(skf.split(X, y)):
            print(' xgb kfold: {}  of  {} : '.format(i+1, kfold))
            #getting fold data
            X_train, X_valid = X.iloc[train_index,:].copy(), X.iloc[test_index,:].copy()
            y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
            #adding targeting encoding 
            #for each train, test and valid part currently analyzing model columns
            #if all parameters are not -1
            X_test=X_test_origin.copy(deep=True)
            for c in analyzed_model.columns:
                fmin_samples_leaf=analyzed_model[c]['fmin_samples_leaf']
                smoothing=analyzed_model[c]['smoothing']
                noise_level=analyzed_model[c]['noise_level']
                if (fmin_samples_leaf>-1.0 and smoothing>-1.0 and noise_level>-1.0):
                    print ('--------------TARGETING ENCODING---------------------------------------')
                    X_train[c.replace('_encd','')+ "_trgenc"], X_test[c.replace('_encd','')+"_trgenc"] = target_encode(
                                         trn_series=X_train[c],
                                         tst_series=X_test[c],
                                         target=y_train,
                                         fmin_samples_leaf=fmin_samples_leaf,
                                         smoothing=smoothing,
                                         noise_level=noise_level)
                    X_train[c.replace('_encd','')+ "_trgenc"], X_valid[c.replace('_encd','')+ "_trgenc"] = target_encode(
                                         trn_series=X_train[c],
                                         tst_series=X_valid[c],
                                         target=y_train,
                                         fmin_samples_leaf=fmin_samples_leaf,
                                         smoothing=smoothing,
                                         noise_level=noise_level)
                    X_train.drop(c, axis=1, inplace=True)
                    X_valid.drop(c, axis=1, inplace=True)
                    X_test.drop(c, axis=1, inplace=True)
                    #binning if there are more then N levels
                    BinsNum=len(group_names)
                    if len(X_train[c.replace('_encd','')+ "_trgenc"].unique())>BinsNum:
                        print('--------------------------------BINNING------------------------------')
                        trgencd_column_name=c.replace('_encd','')+ '_trgenc'
                        #binning data to N levels
                        max_val=max([X_train[trgencd_column_name].max(),X_test[trgencd_column_name].max(),X_valid[trgencd_column_name].max()])
                        min_val=min([X_train[trgencd_column_name].min(),X_test[trgencd_column_name].min(),X_valid[trgencd_column_name].min()])
                        d=(max_val-min_val)/(BinsNum+1)
                        bins=np.linspace(min_val-d, max_val+d, BinsNum+1)
                        bin_column_name=c.replace('_encd','')+ '_trgencbin'
                        X_train[bin_column_name] = pd.cut(X_train[trgencd_column_name], bins, labels=group_names)
                        X_valid[bin_column_name] = pd.cut(X_valid[trgencd_column_name], bins, labels=group_names)
                        X_test[bin_column_name] = pd.cut(X_test[trgencd_column_name], bins, labels=group_names)
                        #deleting target encoded column
                        X_train.drop(trgencd_column_name, axis=1, inplace=True)
                        X_valid.drop(trgencd_column_name, axis=1, inplace=True)
                        X_test.drop(trgencd_column_name, axis=1, inplace=True)
            #
            X_train = X_train.values
            X_valid = X_valid.values
            #
            y_pred_train=pd.DataFrame(index=y_train.index)
            y_pred_train[target_column]=0
            #
            y_train = y_train.values
            y_valid = y_valid.values
            #applying XGB
            d_train = xgb.DMatrix(X_train, y_train) 
            d_valid = xgb.DMatrix(X_valid, y_valid) 
            watchlist = [(d_train, 'train'), (d_valid, 'valid')]
            xgb_model = xgb.train(xgb_params, d_train, nrounds, watchlist, early_stopping_rounds=100, 
                          feval=gini_xgb, maximize=True, verbose_eval=1000)
            y_pred_test[target_column] +=  xgb_model.predict(xgb.DMatrix(X_test.values), ntree_limit=xgb_model.best_ntree_limit+50) / (kfold)
            y_pred_train[target_column] += xgb_model.predict(xgb.DMatrix(X_train), ntree_limit=xgb_model.best_ntree_limit+50) / (kfold)
        #Prediction results
        #test
        g=gini(y_test,y_pred_test)/gini(y_test,y_test)
        print('Test Gini - %f'%g)
        gini_test_lst.append(g)
        ROC_AUC=roc_auc_score(y_test, y_pred_test)
        print('Test ROC_AUC - %f'%ROC_AUC)
        roc_auc_test_lst.append(ROC_AUC)
        #train
        g=gini(y_train,y_pred_train)/gini(y_train,y_train)
        print('Train Gini - %f'%g)
        gini_train_lst.append(g)
        ROC_AUC=roc_auc_score(y_train, y_pred_train)
        print('Train ROC_AUC - %f'%ROC_AUC)
        roc_auc_train_lst.append(ROC_AUC)
    #save model analysis results
    models_test_gini_df=AnalyzeAndSaveModelsResults(models_test_gini_df,gini_test_lst,row['Model'],'models_test_gini.csv')
    models_test_roc_auc_df=AnalyzeAndSaveModelsResults(models_test_roc_auc_df,roc_auc_test_lst,row['Model'],'models_test_roc_auc.csv')
    models_train_gini_df=AnalyzeAndSaveModelsResults(models_train_gini_df,gini_train_lst,row['Model'],'models_train_gini.csv')
    models_train_roc_auc_df=AnalyzeAndSaveModelsResults(models_train_roc_auc_df,roc_auc_train_lst,row['Model'],'models_train_roc_auc.csv')

0 : Analyzing model BaseModel
['feature', 'driverage', 'drivernumber', 'mvrstatusage', 'ratingvalue', 'vehicleage', 'vehnumber', 'acci_pointschargedterm', 'acci_last_infractionage']
['fmin_samples_leaf', -1, -1, -1, -1, -1, -1, -1, -1]
['smoothing', -1, -1, -1, -1, -1, -1, -1, -1]
['noise_level', -1, -1, -1, -1, -1, -1, -1, -1]
                   0  1  2  3  4  5  6  7  8
0  fmin_samples_leaf -1 -1 -1 -1 -1 -1 -1 -1
1          smoothing -1 -1 -1 -1 -1 -1 -1 -1
2        noise_level -1 -1 -1 -1 -1 -1 -1 -1
                   driverage  drivernumber  mvrstatusage  ratingvalue  \
feature                                                                 
fmin_samples_leaf         -1            -1            -1           -1   
smoothing                 -1            -1            -1           -1   
noise_level               -1            -1            -1           -1   

                   vehicleage  vehnumber  acci_pointschargedterm  \
feature                                                 

 xgb kfold: 6  of  10 : 
[0]	train-auc:0.5	valid-auc:0.5	train-gini:-0.004498	valid-gini:-0.009634
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[1000]	train-auc:0.67654	valid-auc:0.644725	train-gini:0.353077	valid-gini:0.289452
Stopping. Best iteration:
[1194]	train-auc:0.679715	valid-auc:0.645242	train-gini:0.359427	valid-gini:0.290486

 xgb kfold: 7  of  10 : 
[0]	train-auc:0.5	valid-auc:0.5	train-gini:-0.005067	valid-gini:0.033867
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[1000]	train-auc:0.674579	valid-auc:0.662305	train-gini:0.349157	valid-gini:0.324621
Stopping. Best iteration:
[955]	train-auc:0.673948	valid-auc:0.662494	train-gini:0.347894	valid-gini:0.324998

 xgb kfold: 8  of  10 : 
[0]	train-auc:0.5	valid-auc:0.5	train-gini:-0.005496	valid-gini:0.067697
Multiple eval metrics

Will train until valid-gini hasn't improved in 100 rounds.
[1000]	train-auc:0.674268	valid-auc:0.651304	train-gini:0.348535	valid-gini:0.302591
Stopping. Best iteration:
[1823]	train-auc:0.683559	valid-auc:0.655549	train-gini:0.367118	valid-gini:0.311093

 xgb kfold: 5  of  10 : 
[0]	train-auc:0.5	valid-auc:0.5	train-gini:-0.001525	valid-gini:-0.155245
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[1000]	train-auc:0.671876	valid-auc:0.662369	train-gini:0.343749	valid-gini:0.324721
[2000]	train-auc:0.684188	valid-auc:0.667658	train-gini:0.368376	valid-gini:0.335307
Stopping. Best iteration:
[2722]	train-auc:0.689255	valid-auc:0.668762	train-gini:0.378511	valid-gini:0.337522

 xgb kfold: 6  of  10 : 
[0]	train-auc:0.5	valid-auc:0.5	train-gini:-0.002599	valid-gini:-0.074355
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini has

 xgb kfold: 2  of  10 : 
[0]	train-auc:0.5	valid-auc:0.5	train-gini:0.005137	valid-gini:0.176106
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[1000]	train-auc:0.671838	valid-auc:0.66128	train-gini:0.343677	valid-gini:0.322575
[2000]	train-auc:0.681979	valid-auc:0.665776	train-gini:0.363957	valid-gini:0.331558
Stopping. Best iteration:
[2782]	train-auc:0.687644	valid-auc:0.667327	train-gini:0.375287	valid-gini:0.334656

 xgb kfold: 3  of  10 : 
[0]	train-auc:0.5	valid-auc:0.5	train-gini:0.00588	valid-gini:0.11785
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[1000]	train-auc:0.669961	valid-auc:0.656716	train-gini:0.339921	valid-gini:0.31345
[2000]	train-auc:0.681082	valid-auc:0.663354	train-gini:0.362164	valid-gini:0.326717
Stopping. Best iteration:
[2353]	train-auc:0.683801	valid-auc:0.66

Will train until valid-gini hasn't improved in 100 rounds.
Stopping. Best iteration:
[683]	train-auc:0.665531	valid-auc:0.633388	train-gini:0.331059	valid-gini:0.266797

 xgb kfold: 10  of  10 : 
[0]	train-auc:0.5	valid-auc:0.5	train-gini:-7.6e-05	valid-gini:0.071888
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[1000]	train-auc:0.670404	valid-auc:0.659623	train-gini:0.340808	valid-gini:0.319245
[2000]	train-auc:0.681681	valid-auc:0.664262	train-gini:0.363363	valid-gini:0.328522
Stopping. Best iteration:
[2080]	train-auc:0.682176	valid-auc:0.664498	train-gini:0.364352	valid-gini:0.328995

Test Gini - 0.322863
Test ROC_AUC - 0.661431
Train Gini - 0.365241
Train ROC_AUC - 0.682620
Test size 0.1
 xgb kfold: 1  of  10 : 
[0]	train-auc:0.5	valid-auc:0.5	train-gini:-0.00119	valid-gini:-0.066848
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train un

Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
Stopping. Best iteration:
[773]	train-auc:0.740687	valid-auc:0.638664	train-gini:0.481374	valid-gini:0.277323

 xgb kfold: 3  of  10 : 
--------------TARGETING ENCODING---------------------------------------
--------------------------------BINNING------------------------------
[0]	train-auc:0.5	valid-auc:0.5	train-gini:0.006466	valid-gini:-0.094089
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
Stopping. Best iteration:
[238]	train-auc:0.698928	valid-auc:0.610335	train-gini:0.39783	valid-gini:0.220619

 xgb kfold: 4  of  10 : 
--------------TARGETING ENCODING---------------------------------------
--------------------------------BINNING------------------------------
[0]	train-auc:0.5	valid-auc:0.5	train-gini:0.005656	valid-gini:-0.030243
Multipl

KeyboardInterrupt: 

Red and Green bars below indicates models where mean of gini is likely different from a previous model

In [29]:
models_test_gini_df

Unnamed: 0,Model,S0.45,S0.4,S0.35,S0.3,S0.25,S0.2,S0.15,S0.1,Mean,t-pvalue,t-statistic,Group
0,BaseModel,0.310927,0.312015,0.311775,0.31328,0.314376,0.314284,0.322863,0.312099,0.313952,1,0,1


In [None]:
models_test_gini_df.set_index('Model', inplace=True)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
colors = {1: 'blue', 2: 'green',3: 'red'}

In [None]:
customcmap = tuple(models_test_gini_df['Group'].map(colors))
models_test_gini_df['Mean'].plot(kind='barh',  color=[customcmap], figsize=(10, 12))