# AllState Claims Severity

Download data [Here](https://www.kaggle.com/c/allstate-claims-severity/data).

In [119]:
%matplotlib inline

import os

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_selection import RFECV

In [111]:
def load_data(train_path, test_path):
    return pd.read_csv(train_path), pd.read_csv(test_path)

def make_submission(src, dst, pred):
    submission = pd.read_csv(src)
    submission.iloc[:, 1] = pred
    submission.to_csv(dst, index=None)

In [103]:
train, test = load_data('train.csv', 'test.csv')

In [104]:
print('Observations:', len(train))
print('Features:', len(train.columns))
train.head()

Observations: 188318
Features: 132


Unnamed: 0,id,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
0,1,A,B,A,B,A,A,A,A,B,...,0.718367,0.33506,0.3026,0.67135,0.8351,0.569745,0.594646,0.822493,0.714843,2213.18
1,2,A,B,A,A,A,A,A,A,B,...,0.438917,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496,1283.6
2,5,A,B,A,A,B,A,A,A,B,...,0.289648,0.315545,0.2732,0.26076,0.32446,0.381398,0.373424,0.195709,0.774425,3005.09
3,10,B,B,A,B,A,A,A,A,B,...,0.440945,0.391128,0.31796,0.32128,0.44467,0.327915,0.32157,0.605077,0.602642,939.85
4,11,A,B,A,B,A,A,A,A,B,...,0.178193,0.247408,0.24564,0.22089,0.2123,0.204687,0.202213,0.246011,0.432606,2763.85


In [105]:
X_train, Y_train = train[train.columns[1:130]], train[train.columns[131]].ravel()

In [106]:
print('x:', type(X_train), x_train.shape)
print('y:', type(y_train), Y_train.shape)

x: <class 'pandas.core.frame.DataFrame'> (188318, 129)
y: <class 'numpy.ndarray'> (188318,)


In [107]:
for col in (col for col in X_train.columns if col.startswith('cat')):
    X_train[col] = pd.factorize(X_train[col], sort=True)[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [108]:
x_train = np.array(X_train)

In [109]:
x_train, x_test, y_train, y_test = train_test_split(x_train, Y_train,
                                                    random_state=0,
                                                    test_size=0.2)

In [126]:
gb = GradientBoostingRegressor(max_features='sqrt')
gb.fit(x_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [135]:
test.drop(['id', 'cont14'], axis=1, inplace=True)

for col in (col for col in test.columns if col.startswith('cat')):
    test[col] = pd.factorize(test[col], sort=True)[0]

In [127]:
gb.score(x_test, y_test)

0.49372083858413873

In [113]:
forest = RandomForestRegressor()
forest.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [114]:
forest.score(x_test, y_test)

0.47001449367112852

In [136]:
make_submission('submission.csv', 'sub_sklearn_rf.csv', forest.predict(test))

In [131]:
rfecv = RFECV(GradientBoostingRegressor(max_features='sqrt'), cv=KFold(n_splits=3), verbose=2)
rfecv.fit(x_train, y_train)

Fitting estimator with 129 features.
Fitting estimator with 128 features.
Fitting estimator with 127 features.
Fitting estimator with 126 features.
Fitting estimator with 125 features.
Fitting estimator with 124 features.
Fitting estimator with 123 features.
Fitting estimator with 122 features.
Fitting estimator with 121 features.
Fitting estimator with 120 features.
Fitting estimator with 119 features.
Fitting estimator with 118 features.
Fitting estimator with 117 features.
Fitting estimator with 116 features.
Fitting estimator with 115 features.
Fitting estimator with 114 features.
Fitting estimator with 113 features.
Fitting estimator with 112 features.
Fitting estimator with 111 features.
Fitting estimator with 110 features.
Fitting estimator with 109 features.
Fitting estimator with 108 features.
Fitting estimator with 107 features.
Fitting estimator with 106 features.
Fitting estimator with 105 features.
Fitting estimator with 104 features.
Fitting estimator with 103 features.
F

RFECV(cv=KFold(n_splits=3, random_state=None, shuffle=False),
   estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False),
   n_jobs=1, scoring=None, step=1, verbose=2)

In [132]:
rfecv.score(x_test, y_test)

0.50458094497176842

In [137]:
make_submission('submission.csv', 'sub_sklearn_rfecv.csv', rfecv.predict(test))