# AllState Claims Severity

Download the data from the [Kaggle competition page](https://www.kaggle.com/c/allstate-claims-severity/data).

In [27]:
%matplotlib inline

import os
import zipfile

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error
from scipy.stats import skew, boxcox

# Processing data

In [28]:
# Notebook-level configuration: random seed, fold count and working directory.
# NOTE(review): none of these names is referenced by the cells visible in this
# notebook (the seed and the fold count are passed as literals further down) --
# confirm whether they are meant to be wired in.
SEED, NFOLDS, DIR = 0, 5, '.'

In [29]:
# Load the raw competition files from the current working directory.
df_train, df_test = pd.read_csv('train.csv'), pd.read_csv('test.csv')

# The model is fitted on log(loss); predictions are mapped back with np.exp()
# when they are scored and when the submission is written.
y_train = np.log(df_train['loss'])

# Mean (target) encoding of the categorical features: every level of a `cat*`
# column is replaced by the mean training `loss` observed for that level.
# The mapping is learned on the training frame only and then reused for the
# test frame, so a level that never occurs in train becomes NaN in df_test.
cat_cols = [col for col in df_train.columns if col.startswith('cat')]
for col in cat_cols:
    level_mean_loss = df_train.groupby(col)['loss'].mean()
    df_train[col] = df_train[col].map(level_mean_loss)
    df_test[col] = df_test[col].map(level_mean_loss)

# Drop the identifier and the raw target so only features remain.
# Re-assignment is used instead of inplace=True.
df_train = df_train.drop(['id', 'loss'], axis=1)
df_test = df_test.drop(['id'], axis=1)

ntrain = df_train.shape[0]
ntest = df_test.shape[0]

# Stack train on top of test so the skew correction below is applied to both
# with the same transform; rows [0, ntrain) are train, rows [ntrain, end) test.
df_all = pd.concat((df_train, df_test)).reset_index(drop=True)

# Box-Cox transform the continuous features whose skewness exceeds 0.25.
skewed_feats = [col for col in df_all if col.startswith('cont')]
skewed_cols = df_all[skewed_feats].apply(lambda x: skew(x.dropna()))
skewed_cols = skewed_cols[skewed_cols > 0.25].index
for col in skewed_cols:
    # scipy's boxcox requires strictly positive input, hence the +1 shift.
    df_all[col], _ = boxcox(df_all[col] + 1)

# Split the stacked frame back into its train and test parts.
# The test rows are the ones *after* the first ntrain rows; slicing with
# [:ntest] would silently return training rows instead.
x_train = np.array(df_all.iloc[:ntrain, :])
x_test = np.array(df_all.iloc[ntrain:, :])

assert x_train.shape[0] == ntrain, "train split lost or gained rows"
assert x_test.shape[0] == ntest, "test split lost or gained rows"

dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)

# Cross-Validation

In [30]:
# Booster configuration shared by the cross-validation run and the final fit.
params = dict(
    seed=0,
    colsample_bytree=0.7,
    silent=1,
    subsample=0.7,
    learning_rate=0.075,
    objective='reg:linear',
    max_depth=6,
    num_parallel_tree=1,
    min_child_weight=1,
    eval_metric='mae',
)

def custom_mae(yhat, dtrain):
    """Evaluation callback: mean absolute error after exponentiating.

    Both the labels stored in ``dtrain`` and the predictions ``yhat`` are
    passed through ``np.exp`` before the error is computed, so the score is
    reported on the exponentiated scale rather than the scale the model is
    fitted on.

    Parameters
    ----------
    yhat : predictions for the rows of ``dtrain``.
    dtrain : object exposing ``get_label()`` (the labels to score against).

    Returns
    -------
    tuple
        ``('mae', value)`` -- the metric name and its value.
    """
    actual = np.exp(dtrain.get_label())
    predicted = np.exp(yhat)
    return 'mae', mean_absolute_error(actual, predicted)

# Cross-validate with early stopping, scoring with custom_mae.
# NOTE(review): nfold=4 is used here while the config cell defines NFOLDS = 5 --
# confirm which fold count is intended.
cv = xgb.cv(
    params, dtrain,
    num_boost_round=750, nfold=4, seed=0, stratified=False,
    early_stopping_rounds=20, verbose_eval=25, show_stdv=True,
    feval=custom_mae, maximize=False
)

# xgb.cv returns one row per boosting round it kept; when early stopping
# triggers, the history is truncated to end at the best iteration. The row
# count is therefore already the number of trees to train -- subtracting one
# would fit one round fewer than the best cross-validated model.
best_rounds = cv.shape[0]

[0]	train-mae:3034.5+4.23324	test-mae:3034.5+12.699
[25]	train-mae:2118.98+2.68874	test-mae:2120.19+12.6238
[50]	train-mae:1258.56+1.93532	test-mae:1271.85+7.99047
[75]	train-mae:1160.28+1.84739	test-mae:1182.7+6.53358
[100]	train-mae:1138.05+1.50493	test-mae:1167.18+5.6351
[125]	train-mae:1126.02+1.35728	test-mae:1160.99+5.44647
[150]	train-mae:1117.03+1.65688	test-mae:1157.3+5.4599
[175]	train-mae:1109.29+1.31711	test-mae:1154.49+5.52512
[200]	train-mae:1102.67+1.04539	test-mae:1152.48+5.49301
[225]	train-mae:1096.56+1.07843	test-mae:1151.03+5.53475
[250]	train-mae:1090.82+1.01109	test-mae:1149.5+5.55211
[275]	train-mae:1085.64+1.19867	test-mae:1148.72+5.5693
[300]	train-mae:1080.79+1.32893	test-mae:1147.95+5.61412
[325]	train-mae:1075.88+1.31393	test-mae:1147.11+5.77183
[350]	train-mae:1070.96+1.34184	test-mae:1146.62+5.67829
[375]	train-mae:1066.5+1.41666	test-mae:1146.25+5.64086
[400]	train-mae:1062.32+1.3974	test-mae:1145.98+5.70688
[425]	train-mae:1058.04+1.41774	test-mae:1145.7

# Training

In [31]:
# Fit the final booster on the full training matrix, using the number of
# rounds selected by the cross-validation cell.
gb = xgb.train(params, dtrain, num_boost_round=best_rounds)

# Prediction

In [32]:
# Use the existing submission file as a template and overwrite its second
# column with the predictions. The model is fitted on log(loss), so np.exp
# maps its output back before writing.
sub = pd.read_csv('submission.csv')
sub.iloc[:, 1] = np.exp(gb.predict(dtest))

# index=False keeps the output to the template's own columns; without it
# pandas prepends the row index as an extra unnamed column.
sub.to_csv('output.csv', index=False)

In [33]:
# Compress the submission with the standard library instead of shelling out to
# an external `zip` binary: this works wherever Python does and raises on
# failure instead of returning an exit status that nobody checks.
with zipfile.ZipFile('sub.csv.zip', 'w', zipfile.ZIP_DEFLATED) as archive:
    archive.write('output.csv')

0