# AllState Claims Severity

Download data [Here](https://www.kaggle.com/c/allstate-claims-severity/data).

In [1]:
%matplotlib inline

import os

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_selection import RFECV

In [7]:
def load_data(train_path, test_path):
    return pd.read_csv(train_path), pd.read_csv(test_path)

def make_submission(pred, dst='lb.csv', src='submission.csv'):
    submission = pd.read_csv(src)
    submission.iloc[:, 1] = pred
    submission.to_csv(dst, index=None)
    
    os.system('zip sub.csv.zip {0}'.format(dst))

In [8]:
train, test = load_data('train.csv', 'test.csv')

In [9]:
print('Observations:', len(train))
print('Features:', len(train.columns))
train.head()

Observations: 188318
Features: 132


Unnamed: 0,id,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
0,1,A,B,A,B,A,A,A,A,B,...,0.718367,0.33506,0.3026,0.67135,0.8351,0.569745,0.594646,0.822493,0.714843,2213.18
1,2,A,B,A,A,A,A,A,A,B,...,0.438917,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496,1283.6
2,5,A,B,A,A,B,A,A,A,B,...,0.289648,0.315545,0.2732,0.26076,0.32446,0.381398,0.373424,0.195709,0.774425,3005.09
3,10,B,B,A,B,A,A,A,A,B,...,0.440945,0.391128,0.31796,0.32128,0.44467,0.327915,0.32157,0.605077,0.602642,939.85
4,11,A,B,A,B,A,A,A,A,B,...,0.178193,0.247408,0.24564,0.22089,0.2123,0.204687,0.202213,0.246011,0.432606,2763.85


In [10]:
test.drop(['id'], axis=1, inplace=True)
train.drop(['id'], axis=1, inplace=True)

#for col in (col for col in test.columns if col.startswith('cat')):
#    test[col] = pd.factorize(test[col], sort=True)[0]
#    train[col] = pd.factorize(train[col], sort=True)[0]

train['loss'].apply(np.log)
x_train, y_train = train.ix[:, 'cat1':'cont14'], train.ix[:, 'loss'].ravel()

In [11]:
for col in (col for col in train.columns if col.startswith('cat')):
    tmp = pd.DataFrame(train['loss'].groupby([train[col]]).mean())
    tmp[col] = tmp.index
    x_train[col] = pd.merge(left=x_train, right=tmp, how='left', on=col)['loss']
    test[col] = pd.merge(left=test, right=tmp, how='left', on=col)['loss']

In [12]:
x_train.head()

Unnamed: 0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14
0,3408.091515,3800.104171,2902.219751,3488.392225,2814.648335,3259.916396,2908.944342,2975.558559,3827.621974,2694.135528,...,0.310061,0.718367,0.33506,0.3026,0.67135,0.8351,0.569745,0.594646,0.822493,0.714843
1,3408.091515,3800.104171,2902.219751,2826.826831,2814.648335,3259.916396,2908.944342,2975.558559,3827.621974,4993.767052,...,0.885834,0.438917,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496
2,3408.091515,3800.104171,2902.219751,2826.826831,3464.009806,3259.916396,2908.944342,2975.558559,3827.621974,4993.767052,...,0.397069,0.289648,0.315545,0.2732,0.26076,0.32446,0.381398,0.373424,0.195709,0.774425
3,1915.198521,3800.104171,2902.219751,3488.392225,2814.648335,3259.916396,2908.944342,2975.558559,3827.621974,2694.135528,...,0.422268,0.440945,0.391128,0.31796,0.32128,0.44467,0.327915,0.32157,0.605077,0.602642
4,3408.091515,3800.104171,2902.219751,3488.392225,2814.648335,3259.916396,2908.944342,2975.558559,3827.621974,4993.767052,...,0.704268,0.178193,0.247408,0.24564,0.22089,0.2123,0.204687,0.202213,0.246011,0.432606


In [13]:
x_train = np.array(x_train)

In [14]:
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(np.array(test))

In [15]:
params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'subsample': 0.7,
    'learning_rate': 0.075,
    'objective': 'reg:linear',
    'max_depth': 6,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'silent': 1
    'eval_metric': ['mae']
}

cv = xgb.cv(params, dtrain, verbose_eval=20, show_stdv=True, stratified=False,
            early_stopping_rounds=25, num_boost_round=750, nfold=4)

[0]	train-mae:2809.45+4.04896	test-mae:2809.49+12.3357
[20]	train-mae:1279.38+2.21768	test-mae:1288.15+6.78602
[40]	train-mae:1217.07+2.86709	test-mae:1236.68+3.25867
[60]	train-mae:1193+2.45354	test-mae:1220.2+3.11923
[80]	train-mae:1174.74+2.05567	test-mae:1208.04+3.30164
[100]	train-mae:1162.06+2.04096	test-mae:1200.62+3.55653
[120]	train-mae:1152.01+1.88895	test-mae:1195.59+3.62391
[140]	train-mae:1144.03+2.0898	test-mae:1192.25+3.78117
[160]	train-mae:1137.55+2.21707	test-mae:1190.1+3.81969
[180]	train-mae:1131.65+1.99951	test-mae:1188.51+3.89268
[200]	train-mae:1126.41+1.67168	test-mae:1187.24+4.30746
[220]	train-mae:1121.41+1.87848	test-mae:1186.09+4.07848
[240]	train-mae:1116.37+2.11059	test-mae:1185.14+4.03597
[260]	train-mae:1111.55+2.20934	test-mae:1184.32+4.09598
[280]	train-mae:1107.31+2.22984	test-mae:1183.63+3.95682
[300]	train-mae:1102.74+2.06178	test-mae:1183+3.95375
[320]	train-mae:1098.64+2.07136	test-mae:1182.82+3.89532
[340]	train-mae:1094.03+2.10473	test-mae:1182.

In [16]:
best_nrounds = cv.shape[0] - 1
cv_mean = cv.iloc[-1, 0]
cv_std = cv.iloc[-1, 1]

print('Ensemble-CV: {0}+{1}'.format(cv_mean, cv_std))
print('Best_nround:', best_nrounds)

Ensemble-CV: 1180.12384025+3.738860221979952
Best_nround: 506


In [17]:
gb = xgb.train(params, dtrain, best_nrounds)

In [18]:
make_submission(np.exp(gb.predict(dtest)))

  if __name__ == '__main__':
