In [1]:
import pandas as pd
import numpy as np
import time

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
%%time
def data_loader(data_dir='.'):
    """Read the competition CSV files and return them as DataFrames.

    Parameters
    ----------
    data_dir : str or path-like, default '.'
        Directory that contains ``train.csv``, ``test.csv`` and
        ``SampleSubmission.csv``. The default keeps the original behaviour
        of reading from the current working directory.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, sub)`` in that order.
    """
    from pathlib import Path

    base = Path(data_dir)
    train = pd.read_csv(base / 'train.csv')
    test = pd.read_csv(base / 'test.csv')
    sub = pd.read_csv(base / 'SampleSubmission.csv')
    # Not loaded at the moment:
    #policy = pd.read_csv('policies.csv')
    #data_dict = pd.read_csv('DataDictionary.csv')
    return train, test, sub

Wall time: 0 ns


In [3]:
%%time
# Load the three frames returned by data_loader(): train, test, sample submission.
(train, test, sub) = data_loader()

Wall time: 67.6 ms


In [4]:
# Remember how many rows belong to train, then stack train on top of test
# with a fresh 0..n-1 index so features can be built on both at once.
ntrain = len(train)
df = pd.concat([train, test], ignore_index=True)

In [5]:
# NOTE(review): same assignment as in the preceding cell; this cell is redundant.
ntrain = train.shape[0]

In [6]:
# Feature engineering on the stacked train+test frame.
#
# Differences from the earlier version of this cell:
#   * `.dt.week` / `.dt.weekofyear` are deprecated (removed in pandas 2.0);
#     the ISO week now comes from `.dt.isocalendar().week`.
#   * `product_class` was computed twice; it is computed once.
#   * `year` is taken from the parsed date and no temporary `id_claim` column
#     is added/dropped, so the cell can be re-run on the same `df`.

# Pairwise string interactions between categorical columns.
df['risk_business'] = df['class_of_business'] + '_' + df['risk_type']
df['product_agent'] = df['agent'] + '_' + df['product']
df['renwal_business'] = df['renewal_frequency'] + '_' + df['class_of_business']
df['product_class'] = df['class_of_business'] + '_' + df['product']

# New features
df['claim_policy'] = df['claim_number'] + '_' + df['policy_number']
df['business_agent'] = df['class_of_business'] + '_' + df['agent']
df['primary_risk'] = df['primary_cause'] + '_' + df['risk_type']
df['secondary_risk'] = df['secondary_cause'] + '_' + df['risk_type']

# Parse the loss date once; every date-derived feature below reads from it.
df['loss_date'] = pd.to_datetime(df['loss_date'], format='%Y-%m-%d')
loss_dt = df['loss_date'].dt

# Year expressed as an offset from 2018.
df['year'] = loss_dt.year - 2018
df['cause'] = df['secondary_cause'] + '_' + df['primary_cause']

# How often the part of `claim_number` after its last 'm' occurs in the frame.
id_claim = df['claim_number'].str.split('m').str[-1]
df['idclaim_freq'] = id_claim.map(id_claim.value_counts())

# ISO week number, used for both the 'week' and 'weekofyear' columns.
iso_week = loss_dt.isocalendar().week.astype(int)
for attr in ['day', 'month', 'week', 'dayofweek', 'weekofyear', 'days_in_month', 'is_month_start', 'is_month_end', 'dayofyear']:
    if attr in ('week', 'weekofyear'):
        df[attr] = iso_week
    else:
        df[attr] = getattr(loss_dt, attr)

df['is_weekend'] = (df['dayofweek'] >= 5)*1
df['fortnight'] = df['day']%15
df['which_fortnight'] = df['day']//15

df['branch_product'] = df['branch'] + '_' + df['product']

  df[attr] = getattr(df['loss_date'].dt, attr)


In [7]:
# Keep only the columns that are fed to the model (raw fields first,
# then the engineered interaction and date features).
model_columns = [
    'sum_insured',
    'product',
    'class_of_business',
    'risk_type',
    'client_type',
    'renewal_frequency',
    'primary_cause',
    'secondary_cause',
    'branch',
    'renwal_business',
    'product_class',
    'cause',
    'year',
    'idclaim_freq',
    'day',
    'month',
    'week',
    'dayofweek',
    'weekofyear',
    'days_in_month',
    'fortnight',
    'which_fortnight',
    'branch_product',
    'secondary_risk',
    'primary_risk',
    'claim_policy',
    'business_agent',
]
df = df[model_columns]

In [8]:
#df = df.drop(['claim_id', 'loss_date', 'target','client_type', 'risk_business'], axis = 1)

In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
# All columns except the target are model inputs.
features = [column for column in df.columns if column != 'target']
# Positions (within `features`) of every column whose dtype is not float.
is_not_float = (df[features].dtypes != float).to_numpy()
cat_feat = np.nonzero(is_not_float)[0]

In [11]:
# Translate the positions in `cat_feat` back to column names, then replace
# each of those columns with its LabelEncoder integer codes.
to_encode = [features[position] for position in cat_feat]
encoder = LabelEncoder()
df[to_encode] = df[to_encode].apply(encoder.fit_transform)

In [12]:
# The first len(train) rows of `df` came from train, the remainder from test.
split_at = len(train)
new_train = df.iloc[:split_at]
new_test = df.iloc[split_at:].reset_index(drop=True)

In [13]:
from sklearn import model_selection
from sklearn import metrics
from math import sqrt

In [14]:
from lightgbm import LGBMRegressor

In [25]:
# Model and cross-validation settings. No shuffle/random_state is passed to
# KFold and no other hyper-parameters to LightGBM, so library defaults apply.
N_ESTIMATORS = 80
N_SPLITS = 10

lgbm_model = LGBMRegressor(n_estimators=N_ESTIMATORS)
fold = model_selection.KFold(n_splits=N_SPLITS)

def metric(y_val, pred):
    """Return the mean absolute error between `y_val` (truth) and `pred`."""
    mae = metrics.mean_absolute_error(y_val, pred)
    return mae

In [26]:
%%time
def modelling_tree(algorithm,yy,X, test_df):
    """Cross-validate `algorithm` and collect one test prediction per fold.

    For every split produced by the module-level `fold` splitter the
    estimator is re-fitted on the training part, scored with the module-level
    `metric` function (mean absolute error) on both the training and the
    validation part, and used to predict `test_df`.

    Parameters
    ----------
    algorithm : estimator
        Object with `fit(X, y, eval_set=...)` and `predict(X)`.
    yy : array-like
        Target values aligned row-by-row with `X`. Scores are reported on
        whatever scale `yy` is in.
    X : pandas.DataFrame
        Training features (indexed positionally via `.iloc`).
    test_df : pandas.DataFrame
        Features to predict after each fold's fit.

    Returns
    -------
    list
        One array of `test_df` predictions per fold, in fold order.
    """
    # Build the Series once instead of once per use inside the loop.
    target = pd.Series(yy)

    cv_score = []
    test_pred = []
    train_score = []

    for train_index, test_index in fold.split(X, target):
        X_trainx, X_val = X.iloc[train_index], X.iloc[test_index]
        y_trainx, y_val = target.iloc[train_index], target.iloc[test_index]

        algorithm.fit(X_trainx, y_trainx, eval_set=[(X_trainx, y_trainx), (X_val, y_val)])

        train_preds = algorithm.predict(X_trainx)
        train_score.append(metric(y_trainx, train_preds))

        val_preds = algorithm.predict(X_val)
        cv_score.append(metric(y_val, val_preds))

        test_pred.append(algorithm.predict(test_df))

    # `metric` is mean absolute error, so label the averages accordingly
    # (the previous labels said RMSE).
    print(f'Training MAE Score {np.mean(train_score)}')
    print(f'Validation MAE Score {np.mean(cv_score)}')

    return test_pred

Wall time: 0 ns


In [27]:
%%time
# Fit on the log1p-transformed target; `test_pred` holds one prediction array per fold.
log_target = np.log1p(train['target'])
test_pred = modelling_tree(
    algorithm=lgbm_model,
    yy=log_target,
    X=new_train,
    test_df=new_test,
)

[1]	training's l2: 11.7254	valid_1's l2: 12.532
[2]	training's l2: 11.1921	valid_1's l2: 12.1159
[3]	training's l2: 10.7608	valid_1's l2: 11.8072
[4]	training's l2: 10.4074	valid_1's l2: 11.551
[5]	training's l2: 10.0966	valid_1's l2: 11.3137
[6]	training's l2: 9.82208	valid_1's l2: 11.1657
[7]	training's l2: 9.59446	valid_1's l2: 11.0516
[8]	training's l2: 9.40113	valid_1's l2: 10.9286
[9]	training's l2: 9.22866	valid_1's l2: 10.8371
[10]	training's l2: 9.09329	valid_1's l2: 10.7896
[11]	training's l2: 8.9592	valid_1's l2: 10.7096
[12]	training's l2: 8.85047	valid_1's l2: 10.6811
[13]	training's l2: 8.75	valid_1's l2: 10.6468
[14]	training's l2: 8.65128	valid_1's l2: 10.623
[15]	training's l2: 8.56387	valid_1's l2: 10.5961
[16]	training's l2: 8.48134	valid_1's l2: 10.5656
[17]	training's l2: 8.40157	valid_1's l2: 10.5562
[18]	training's l2: 8.32319	valid_1's l2: 10.5367
[19]	training's l2: 8.25218	valid_1's l2: 10.4936
[20]	training's l2: 8.19416	valid_1's l2: 10.4714
[21]	training's 

[7]	training's l2: 9.79018	valid_1's l2: 9.72623
[8]	training's l2: 9.58919	valid_1's l2: 9.58136
[9]	training's l2: 9.42537	valid_1's l2: 9.44576
[10]	training's l2: 9.27538	valid_1's l2: 9.36687
[11]	training's l2: 9.14341	valid_1's l2: 9.30209
[12]	training's l2: 9.03119	valid_1's l2: 9.21801
[13]	training's l2: 8.91369	valid_1's l2: 9.13265
[14]	training's l2: 8.81544	valid_1's l2: 9.09251
[15]	training's l2: 8.72203	valid_1's l2: 9.03082
[16]	training's l2: 8.63528	valid_1's l2: 8.99743
[17]	training's l2: 8.5488	valid_1's l2: 8.95168
[18]	training's l2: 8.47726	valid_1's l2: 8.951
[19]	training's l2: 8.40851	valid_1's l2: 8.93615
[20]	training's l2: 8.34065	valid_1's l2: 8.90279
[21]	training's l2: 8.2682	valid_1's l2: 8.84827
[22]	training's l2: 8.2011	valid_1's l2: 8.82499
[23]	training's l2: 8.14529	valid_1's l2: 8.79299
[24]	training's l2: 8.08462	valid_1's l2: 8.76769
[25]	training's l2: 8.0305	valid_1's l2: 8.74363
[26]	training's l2: 7.97692	valid_1's l2: 8.72262
[27]	trai

[61]	training's l2: 6.90999	valid_1's l2: 8.71136
[62]	training's l2: 6.88385	valid_1's l2: 8.71989
[63]	training's l2: 6.86242	valid_1's l2: 8.71618
[64]	training's l2: 6.8408	valid_1's l2: 8.72024
[65]	training's l2: 6.81898	valid_1's l2: 8.7195
[66]	training's l2: 6.79862	valid_1's l2: 8.7232
[67]	training's l2: 6.77472	valid_1's l2: 8.71856
[68]	training's l2: 6.74691	valid_1's l2: 8.7105
[69]	training's l2: 6.72998	valid_1's l2: 8.70747
[70]	training's l2: 6.7099	valid_1's l2: 8.70839
[71]	training's l2: 6.68578	valid_1's l2: 8.713
[72]	training's l2: 6.66797	valid_1's l2: 8.70588
[73]	training's l2: 6.64003	valid_1's l2: 8.70121
[74]	training's l2: 6.61689	valid_1's l2: 8.71345
[75]	training's l2: 6.60081	valid_1's l2: 8.71183
[76]	training's l2: 6.58439	valid_1's l2: 8.71002
[77]	training's l2: 6.56482	valid_1's l2: 8.71878
[78]	training's l2: 6.54255	valid_1's l2: 8.70715
[79]	training's l2: 6.52357	valid_1's l2: 8.71069
[80]	training's l2: 6.50863	valid_1's l2: 8.70792
[1]	tra

[1]	training's l2: 11.8268	valid_1's l2: 11.6443
[2]	training's l2: 11.3437	valid_1's l2: 11.3167
[3]	training's l2: 10.9296	valid_1's l2: 11.0112
[4]	training's l2: 10.5783	valid_1's l2: 10.7686
[5]	training's l2: 10.2422	valid_1's l2: 10.5161
[6]	training's l2: 10.0031	valid_1's l2: 10.3926
[7]	training's l2: 9.75972	valid_1's l2: 10.227
[8]	training's l2: 9.5539	valid_1's l2: 10.1054
[9]	training's l2: 9.3699	valid_1's l2: 9.97477
[10]	training's l2: 9.21283	valid_1's l2: 9.8557
[11]	training's l2: 9.0911	valid_1's l2: 9.81243
[12]	training's l2: 8.97315	valid_1's l2: 9.7487
[13]	training's l2: 8.85846	valid_1's l2: 9.68841
[14]	training's l2: 8.76188	valid_1's l2: 9.64456
[15]	training's l2: 8.67645	valid_1's l2: 9.61042
[16]	training's l2: 8.5893	valid_1's l2: 9.54884
[17]	training's l2: 8.51007	valid_1's l2: 9.52851
[18]	training's l2: 8.4415	valid_1's l2: 9.49935
[19]	training's l2: 8.3787	valid_1's l2: 9.49513
[20]	training's l2: 8.31017	valid_1's l2: 9.47401
[21]	training's l2

[57]	training's l2: 6.89898	valid_1's l2: 9.46634
[58]	training's l2: 6.87479	valid_1's l2: 9.46777
[59]	training's l2: 6.84242	valid_1's l2: 9.46742
[60]	training's l2: 6.8212	valid_1's l2: 9.46057
[61]	training's l2: 6.79198	valid_1's l2: 9.46542
[62]	training's l2: 6.77494	valid_1's l2: 9.46597
[63]	training's l2: 6.75493	valid_1's l2: 9.46653
[64]	training's l2: 6.73	valid_1's l2: 9.47152
[65]	training's l2: 6.70673	valid_1's l2: 9.47823
[66]	training's l2: 6.68847	valid_1's l2: 9.47285
[67]	training's l2: 6.67159	valid_1's l2: 9.46817
[68]	training's l2: 6.64832	valid_1's l2: 9.47247
[69]	training's l2: 6.63026	valid_1's l2: 9.47359
[70]	training's l2: 6.61081	valid_1's l2: 9.4697
[71]	training's l2: 6.59041	valid_1's l2: 9.4539
[72]	training's l2: 6.57736	valid_1's l2: 9.45266
[73]	training's l2: 6.55846	valid_1's l2: 9.44367
[74]	training's l2: 6.54066	valid_1's l2: 9.43805
[75]	training's l2: 6.51731	valid_1's l2: 9.42736
[76]	training's l2: 6.49697	valid_1's l2: 9.42792
[77]	t

In [28]:
# The model was trained on log1p(target), so predictions are mapped back with
# expm1 (= exp(x) - 1). The previous `np.exp(x) + 1` shifted every prediction
# up by 2. Per-fold predictions are averaged after the back-transform.
cat_sub = np.mean(np.expm1(test_pred), axis=0)
sub['target'] = cat_sub

sub.to_csv('3rd_model.csv', index=False)

# Last expression of the cell so the summary is actually displayed.
sub['target'].describe()