# Allstate Claims Severity

Download the data [here](https://www.kaggle.com/c/allstate-claims-severity/data).

In [15]:
%matplotlib inline

import os

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_selection import RFECV

In [16]:
def load_data(train_path, test_path):
    """Read the training and test CSV files.

    Parameters
    ----------
    train_path : str
        Path of the training CSV.
    test_path : str
        Path of the test CSV.

    Returns
    -------
    tuple
        ``(train_frame, test_frame)`` as two pandas DataFrames, in that order.
    """
    train_frame = pd.read_csv(train_path)
    test_frame = pd.read_csv(test_path)
    return train_frame, test_frame

def make_submission(dst, pred, src='submission.csv', archive='sub.csv.zip'):
    """Write predictions into a copy of the template submission and zip it.

    Parameters
    ----------
    dst : str
        Path of the CSV file to write.
    pred : array-like
        Values for the second column of the submission, taken positionally,
        one per row of `src`.
    src : str
        Path of the template CSV whose layout (columns, first column) is reused.
    archive : str
        Path of the zip archive to create. An existing archive at this path
        is overwritten, so it always holds exactly the file just written.
    """
    # Function-scope stdlib import: replaces the external `zip` command, so
    # there is no shell quoting of `dst` and failures raise instead of being
    # silently ignored.
    import zipfile

    submission = pd.read_csv(src)
    # Replace the whole second column rather than writing into it in place,
    # so float predictions are not forced into the template column's dtype.
    target_col = submission.columns[1]
    submission[target_col] = np.asarray(pred)
    submission.to_csv(dst, index=False)

    with zipfile.ZipFile(archive, 'w', compression=zipfile.ZIP_DEFLATED) as bundle:
        bundle.write(dst)

In [17]:
# Read both competition files from the working directory.
train, test = load_data(train_path='train.csv', test_path='test.csv')

In [18]:
# Row and column counts of the training frame, followed by its first rows.
n_rows, n_cols = train.shape
print('Observations:', n_rows)
print('Features:', n_cols)
train.head()

Observations: 188318
Features: 132


Unnamed: 0,id,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
0,1,A,B,A,B,A,A,A,A,B,...,0.718367,0.33506,0.3026,0.67135,0.8351,0.569745,0.594646,0.822493,0.714843,2213.18
1,2,A,B,A,A,A,A,A,A,B,...,0.438917,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496,1283.6
2,5,A,B,A,A,B,A,A,A,B,...,0.289648,0.315545,0.2732,0.26076,0.32446,0.381398,0.373424,0.195709,0.774425,3005.09
3,10,B,B,A,B,A,A,A,A,B,...,0.440945,0.391128,0.31796,0.32128,0.44467,0.327915,0.32157,0.605077,0.602642,939.85
4,11,A,B,A,B,A,A,A,A,B,...,0.178193,0.247408,0.24564,0.22089,0.2123,0.204687,0.202213,0.246011,0.432606,2763.85


In [19]:
# Remove the identifier column; errors='ignore' keeps this cell re-runnable
# once 'id' has already been dropped.
test.drop(['id'], axis=1, inplace=True, errors='ignore')

# Encode each categorical test column with the codes that
# pd.factorize(..., sort=True) assigns to the matching *training* column, so a
# given level maps to the same integer in both frames. Factorizing `test` on
# its own only agrees with the training codes when both frames contain exactly
# the same set of levels. Levels absent from the training column become -1.
# Assumes every 'cat*' column of `test` also exists in `train`.
for col in (col for col in test.columns if col.startswith('cat')):
    if test[col].dtype.kind in 'iu':
        continue  # already integer-encoded by an earlier run of this cell
    train_levels = pd.Index(pd.factorize(train[col], sort=True)[1])
    test[col] = train_levels.get_indexer(test[col])

In [20]:
# Feature frame: every column from 'cat1' through 'cont14' (label slice, both
# ends inclusive). `.loc` replaces the deprecated `.ix` indexer, and `.copy()`
# gives an independent frame so later column assignments on X_train do not
# act on a slice of `train`.
X_train = train.loc[:, 'cat1':'cont14'].copy()
# Target as a 1-D numpy array.
Y_train = train['loss'].values

In [21]:
# Report container type and shape of the features and the target.
for tag, obj in (('x:', X_train), ('y:', Y_train)):
    print(tag, type(obj), obj.shape)

x: <class 'pandas.core.frame.DataFrame'> (188318, 130)
y: <class 'numpy.ndarray'> (188318,)


In [22]:
# Replace every 'cat*' column of X_train with integer codes; sort=True numbers
# the levels in sorted order.
cat_columns = [col for col in X_train.columns if col.startswith('cat')]
for col in cat_columns:
    codes = pd.factorize(X_train[col], sort=True)[0]
    X_train[col] = codes

# ndarray copy of the encoded feature frame.
x_train = np.array(X_train)

In [23]:
# Wrap the encoded frames in XGBoost's DMatrix container; only the training
# matrix carries labels.
dtrain = xgb.DMatrix(data=X_train, label=Y_train)
dtest = xgb.DMatrix(data=test)

In [35]:
# Booster settings shared by cross-validation and the final fit.
# The former 'nrounds': 400 entry is dropped: the number of boosting rounds is
# set by `num_boost_round` below (and by the explicit argument to xgb.train),
# which is why the recorded run continued past iteration 400.
params_cv = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'subsample': 0.7,
    'learning_rate': 0.075,
    'objective': 'reg:linear',
    'max_depth': 7,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'silent': 1,
    'eval_metric': 'mae'
}

# Same settings as an independent dict (previously a verbatim duplicate), so
# either one can be tuned without affecting the other.
params_xgb = dict(params_cv)

# 4-fold CV for up to 500 rounds, stopping once the test metric has not
# improved for 25 rounds; progress is printed every 25 rounds.
cv = xgb.cv(params_cv, dtrain, verbose_eval=25, show_stdv=True, stratified=False,
            early_stopping_rounds=25, num_boost_round=500, nfold=4)

[0]	train-mae:2809.68+4.01167	test-mae:2809.67+12.5095
[25]	train-mae:1237.84+1.63162	test-mae:1257.21+4.59282
[50]	train-mae:1189.53+2.70712	test-mae:1226.89+2.77691
[75]	train-mae:1157.92+2.29312	test-mae:1208.59+2.01974
[100]	train-mae:1137.07+2.47067	test-mae:1198.26+1.69326
[125]	train-mae:1122.12+2.22191	test-mae:1192.97+1.86267
[150]	train-mae:1110.91+2.38254	test-mae:1189.87+1.9996
[175]	train-mae:1101.64+2.35801	test-mae:1188.18+2.12147
[200]	train-mae:1092.54+2.33433	test-mae:1187.02+2.29211
[225]	train-mae:1084.06+2.15724	test-mae:1186.06+2.32529
[250]	train-mae:1075.51+2.32399	test-mae:1185.33+2.04589
[275]	train-mae:1066.87+2.48412	test-mae:1184.65+2.12893
[300]	train-mae:1059.03+2.17813	test-mae:1184.12+2.21149
[325]	train-mae:1051.64+2.36195	test-mae:1183.89+2.19757
[350]	train-mae:1043.85+2.59	test-mae:1183.7+2.28873
[375]	train-mae:1036.03+2.72643	test-mae:1183.36+2.21246
[400]	train-mae:1028.48+2.58415	test-mae:1183.31+2.24676


In [36]:
# NOTE(review): if `cv` holds one row per boosting round up to and including
# the best one, the matching round count would be cv.shape[0], not
# cv.shape[0] - 1 -- confirm against the installed xgboost before changing.
best_nrounds = cv.shape[0] - 1

# Select the held-out metric by column name: positional access silently picks
# the train metric on xgboost versions that order the result columns
# train-first.
cv_mean = cv['test-mae-mean'].iloc[-1]
cv_std = cv['test-mae-std'].iloc[-1]

print('Ensemble-CV: {0}+{1}'.format(cv_mean, cv_std))
print('Best_nround:', best_nrounds)

Ensemble-CV: 1183.2326355+2.257073656034313
Best_nround: 391


In [39]:
# Fit the final booster on the full training matrix for the round count
# chosen from cross-validation.
gb = xgb.train(params=params_cv, dtrain=dtrain, num_boost_round=best_nrounds)

In [40]:
# Predict on the test matrix and write the submission file.
test_pred = gb.predict(dtest)
make_submission('xgboost_02.csv', test_pred)