In [164]:
import pandas as pd
import numpy as np

In [165]:
# Load the encoded feature table. A raw string keeps the Windows backslashes
# literal instead of relying on unrecognised escapes such as '\K' being passed
# through unchanged (and on '\\f' being escaped by hand to avoid a form feed).
dataset = pd.read_csv(r'C:\Kate\Python\ClaimPrediction\Data\fdata_v1_encd.csv', index_col=None)

In [166]:
# Name of the target column that the features are encoded against.
target_column = 'hasclaim'

# Feature columns carrying the '_encd' suffix.
categorical = [
    'external_make_encd',
    'external_model_encd',
    'postalcode_encd',
    'city_encd',
    'manufacturer_encd',
    'model_encd',
    'registrationstateprovcd_encd',
    'vehbodytypecd_encd',
    'performancecd_encd',
    'restraintcd_encd',
    'antibrakingsystemcd_encd',
    'antitheftcd_encd',
    'enginecylinders_encd',
    'enginetype_encd',
    'vehusecd_encd',
    'classcd_encd',
    'leasedvehind_encd',
    'statedamtind_encd',
    'neworusedind_encd',
    'carpoolind_encd',
    'daylightrunninglightsind_encd',
    'passiveseatbeltind_encd',
    'customizingequipmentind_encd',
    'programtypecd_encd',
    'mileage_encd',
    'driverstatuscd_encd',
    'licensedstateprovcd_encd',
    'relationshiptoinsuredcd_encd',
    'scholasticdiscountind_encd',
    'mvrrequestind_encd',
    'mvrstatus_encd',
    'maturedriverind_encd',
    'drivertrainingind_encd',
    'gooddriverind_encd',
    'accidentpreventioncourseind_encd',
    'newtostateind_encd',
    'persontypecd_encd',
    'gendercd_encd',
    'maritalstatuscd_encd',
    'occupationclasscd_encd',
]

# Feature columns without the '_encd' suffix that are processed together with
# `categorical` (presumably numeric columns treated as categories -- confirm).
categorical_numbers = [
    'garageterritory',
    'vehnumber',
    'drivernumber',
    'viol_pointschargedterm',
    'acci_pointschargedterm',
    'viol_driverpointsnumbercountterm',
    'acci_driverpointsnumbercountterm',
    'viol_infractioncdcountterm',
    'acci_infractioncdcountterm',
    'viol_last_infractionage',
    'acci_last_infractionage',
    'viol_last_convictionage',
    'acci_last_convictionage',
    'external_engine',
    'enginesize',
    'enginehorsepower',
]

In [167]:
def add_noise(series, noise_level):
    """
    Return `series` with multiplicative Gaussian noise applied.

    series : values to perturb (one random draw is made per element)
    noise_level : scale of the noise; every element is multiplied by
                  (1 + noise_level * z) with z drawn from np.random.randn
    """
    draws = np.random.randn(len(series))
    factors = 1 + noise_level * draws
    return series * factors
def target_encode(trn_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Replace each category of `trn_series` by a smoothed mean of `target`.

    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf

    trn_series : training categorical feature as a pd.Series (must be named)
    target : target data as a pd.Series (must be named, same index as trn_series)
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    noise_level (float) : scale of the multiplicative noise passed to add_noise;
                          any value <= 0 (including the -1 used by callers as
                          "off") leaves the encoded values untouched

    Returns a pd.Series named '<trn_series.name>_mean' with the index of
    trn_series; categories without a computed average get the global target mean.
    """
    temp = pd.concat([trn_series, target], axis=1)
    # Per-category target mean and number of samples
    stats = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Sigmoid weight in (0, 1): close to 1 when a category has many more samples
    # than min_samples_leaf, close to 0 when it has far fewer. Kept in its own
    # name so the `smoothing` argument is not overwritten.
    weight = 1 / (1 + np.exp(-(stats["count"] - min_samples_leaf) / smoothing))
    # Global target mean, used as the prior
    prior = target.mean()
    # The bigger the count, the less the prior is taken into account
    smoothed = (prior * (1 - weight) + stats["mean"] * weight).rename('average')
    # Look the smoothed average up for every row of the training series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        smoothed.reset_index(),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index
    # Add noise only when a positive level is requested (used to reduce
    # overfitting when the encoding feeds a predictive model). With the old
    # `> -1` test the default level 0 still drew random numbers for no effect.
    if noise_level > 0:
        ft_trn_series = add_noise(ft_trn_series, noise_level)
    return ft_trn_series

In [168]:
# For this statistical research (how HasClaim depends on each feature) overfitting
# is not a concern, so no noise is wanted: noise_level=-1 makes target_encode
# skip the noise step.
# No more than 5 levels per feature are recommended for the research.
group_names = [1,2,3,4,5]
n_bins = len(group_names)
for f in categorical+categorical_numbers:
    # Count the distinct values once; features that already have few enough
    # levels are left as they are.
    n_levels = len(dataset[f].unique())
    if n_levels <= n_bins:
        continue
    # Single-argument print(...) gives the same output under Python 2 and 3.
    print('%s: %i'%(f,n_levels))
    # Label used when reporting the intermediate target-encoded values.
    trgencd_column_name = ('%s_trgencd'%f).replace('_encd','')
    # The encoded values stay in a local Series: writing them into `dataset`
    # and dropping them afterwards would destroy any existing column that
    # happens to have the same name.
    encoded = target_encode(trn_series=dataset[f],
                            target=dataset[target_column],
                            min_samples_leaf=200,
                            smoothing=10,
                            noise_level=-1)
    print('%s: %i'%(trgencd_column_name,len(encoded.unique())))
    # Bin the encoded values into len(group_names) equal-width levels. The
    # range is widened by d on both sides so the minimum and maximum fall
    # strictly inside the outer bins (pd.cut intervals are open on the left).
    max_val = encoded.max()
    min_val = encoded.min()
    d = (max_val-min_val)/6
    bins = np.linspace(min_val-d, max_val+d, n_bins+1)
    bin_column_name = ('%s_trgbin'%f).replace('_trgencd','').replace('_encd','')
    dataset[bin_column_name] = pd.cut(encoded, bins, labels=group_names)
    print('%s: %i'%(bin_column_name,len(dataset[bin_column_name].unique())))

external_make_encd: 106
external_make_trgencd: 61
external_make_trgbin: 5
external_model_encd: 1050
external_model_trgencd: 489
external_model_trgbin: 5
postalcode_encd: 1984
postalcode_trgencd: 785
postalcode_trgbin: 3
city_encd: 2465
city_trgencd: 593
city_trgbin: 2
manufacturer_encd: 219
manufacturer_trgencd: 78
manufacturer_trgbin: 5
model_encd: 6115
model_trgencd: 635
model_trgbin: 5
registrationstateprovcd_encd: 30
registrationstateprovcd_trgencd: 15
registrationstateprovcd_trgbin: 4
vehbodytypecd_encd: 103
vehbodytypecd_trgencd: 69
vehbodytypecd_trgbin: 5
performancecd_encd: 6
performancecd_trgencd: 6
performancecd_trgbin: 4
restraintcd_encd: 8
restraintcd_trgencd: 8
restraintcd_trgbin: 5
antibrakingsystemcd_encd: 10
antibrakingsystemcd_trgencd: 10
antibrakingsystemcd_trgbin: 4
antitheftcd_encd: 9
antitheftcd_trgencd: 9
antitheftcd_trgbin: 4
enginecylinders_encd: 10
enginecylinders_trgencd: 10
enginecylinders_trgbin: 4
enginetype_encd: 12
enginetype_trgencd: 12
enginetype_trgbin

In [169]:
# Save the table with the added *_trgbin columns. A raw string keeps the Windows
# backslashes literal instead of relying on unrecognised escapes such as '\K'.
# NOTE(review): to_csv writes the row index as an extra first column by default;
# left unchanged here because downstream readers may expect that layout.
dataset.to_csv(r'C:\Kate\Python\ClaimPrediction\Data\fdata_v1_trgencd.csv')