# Insurance Claim Prediction


## Load the data
This section loads the dataset and prints the first 5 entries.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Pretty display for notebooks
%matplotlib inline


# Load the provided datasets
data = pd.read_csv('data/train.csv')

# Drop 'loss' and 'id' from the feature set and save target feature 'loss' in its own variable
claims = data['loss']
features = data.drop(['id','loss'], axis = 1)

print data.head()

   id cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9   ...        cont6  \
0   1    A    B    A    B    A    A    A    A    B   ...     0.718367   
1   2    A    B    A    A    A    A    A    A    B   ...     0.438917   
2   5    A    B    A    A    B    A    A    A    B   ...     0.289648   
3  10    B    B    A    B    A    A    A    A    B   ...     0.440945   
4  11    A    B    A    B    A    A    A    A    B   ...     0.178193   

      cont7    cont8    cont9   cont10    cont11    cont12    cont13  \
0  0.335060  0.30260  0.67135  0.83510  0.569745  0.594646  0.822493   
1  0.436585  0.60087  0.35127  0.43919  0.338312  0.366307  0.611431   
2  0.315545  0.27320  0.26076  0.32446  0.381398  0.373424  0.195709   
3  0.391128  0.31796  0.32128  0.44467  0.327915  0.321570  0.605077   
4  0.247408  0.24564  0.22089  0.21230  0.204687  0.202213  0.246011   

     cont14     loss  
0  0.714843  2213.18  
1  0.304496  1283.60  
2  0.774425  3005.09  
3  0.602642   939.85  
4  0.432606  

## Explore the Data
To get a better understanding of the provided data, some analysis and visualization is provided.
### Statistical Analysis
This section shall detail some of the statistical properties of the target feature.

In [None]:
#Minimum claim of the data
minimum_claim = np.amin(claims)

#Maximum claim of the data
maximum_claim = np.amax(claims)

#Mean claim of the data
mean_claim = np.mean(claims)

#Median price of the data
median_claim = np.median(claims)

#25% Quantile
percentile_25 = np.percentile(claims, 25)

#50% Percentile
percentile_50 = np.percentile(claims, 50)

#75% Quantile
percentile_75 = np.percentile(claims, 75)

#Standard deviation of the claims in the data
std_claim = np.std(claims)

# Show the calculated statistics
print "Statistics for Insurence claims dataset:\n"
print "Minimum claim: ${:,.2f}".format(minimum_claim)
print "Maximum claim: ${:,.2f}".format(maximum_claim)
print "Mean claim: ${:,.2f}".format(mean_claim)
print "Median claim ${:,.2f}".format(median_claim)
print "25% percentile ${:,.2f}".format(percentile_25)
print "50% percentile ${:,.2f}".format(percentile_50)
print "75% percentile ${:,.2f}".format(percentile_75)
print "Standard deviation of claims: ${:,.2f}".format(std_claim)

This information helps to give context to future predictions. Also the Minimum of \$0.67 and Maximum of \$121,012.25, are already showing that some outliers will have to be removed.

### Visualizing the Data

A scatterplot matrix helps to get a better understanding of the data.
For performance reasons, the data is split to only contain the first 500 points. **It might still take a while to compute!**

In [None]:
scatterdata = data[:500].drop("id", axis = 1)

pd.scatter_matrix(scatterdata, alpha = 0.3, figsize = (14,8), diagonal = 'kde');

One can already see correlations, that can be used to trim down the dataset using feature selection. Most notably are cont11 <-> cont12, which seem to correlate strictly linearly. Some other correlations can also be seen, but not as clearly, like cont1 <-> cont9.

This is the scatterplot matrix over **ALL** the datapoints:

<img src="files/img/scatterplot_all.png">

 It can be produced by the following code:

**THIS WILL TAKE QUITE A LONG TIME**

In [None]:
#Activate if wished. Again: This will be slow
if False:
    pd.scatter_matrix(data.drop('id', axis = 1), alpha = 0.3, figsize = (14,8), diagonal = 'kde');

The scatterplot matrix also shows the right skewedness of the target feature 'loss':

In [None]:
claims.plot.kde()

## Preprocessing
### Normalization

The target feature needs to be normalized for the regression model to yield the best results possible.

In [2]:
claims_normalized = np.log(claims)

The normalized data can then be visualized:

In [None]:
claims_normalized.plot.kde()

The categorical features need to be label encoded and then one hot encoded.

### Outlier Detection

The big difference between Maximum and Minimum, and their distance to the nearest quartile already show some outliers need to be removed.

The following code will remove all the data points with a bigger distance to the nearest quartile than 1.5 * the interquartile range.

In [5]:
#USE NORMALIZED AND QUANTILE STEP!!!
q1 = np.percentile(claims_normalized, 25)
q3 = np.percentile(claims_normalized, 75)


# Use the interquartile range to calculate an outlier step (1.5 times the interquartile range)
step = 1.5 * (q3 - q1)

#Print of Q1 and Q3 and Step to better understand the outliers:
print "Q1:", q1
print "Q3:", q3
print "Step:", step

log_data = pd.concat([features, claims_normalized], axis = 1)
outliers = log_data[~((log_data['loss'] >= q1 - step) & (log_data['loss'] <= q3 + step))]

print "\nExample of outliers:"
print outliers.head()['loss']

# Remove the outliers. Unfortunately had problems with Dataframe Subtraction, so negated the query above
good_data = log_data[((log_data['loss'] >= q1 - step) & (log_data['loss'] <= q3 + step))]

# Save num of Records. OneHotEncoder adds several values with NaN leading to errors
numOfRecords = good_data.shape[0]

print "\nOriginal data had {:} rows. \n{:} outliers were removed and good_data now holds {:} rows".format(
    log_data.shape[0], outliers.shape[0], numOfRecords
)

claims_clean = good_data['loss']
features_clean = good_data.drop('loss', axis = 1)


# Free memory
del outliers
del log_data

Q1: 7.0937866127
Q3: 8.25946984122
Step: 1.74852484278

Example of outliers:
89       3.648057
470      5.260408
713     10.164725
867     10.295591
1015    10.124239
Name: loss, dtype: float64

Original data had 188318 rows. 
521 outliers were removed and good_data now holds 187797 rows


### Label Encoding

In [7]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
# Label Encode all the categorical features
for i in range(1,130):
    key = 'cat' + str(i)
    #Failsafe IF-Clause for column adressing
    if key in features_clean.columns:
        features_clean[key] = le.fit_transform(features_clean[key])
    else:
        break
        
print features_clean.head()[['cat1','cat2','cat3']]

   cat1  cat2  cat3
0     0     1     0
1     0     1     0
2     0     1     0
3     1     1     0
4     0     1     0


## Onehot Encoding

In [8]:
# Make Encoder
enc = preprocessing.OneHotEncoder()

#Split Categorical Columns
#print features_clean.iloc[:,0:116].head()
catF = features_clean.iloc[:,0:116]
#print catF.shape
#Fit Categorigal Columns
enc.fit(catF)

#Encode Categorical Columns
catEnc = enc.transform(catF).toarray()
catEnc = pd.DataFrame(catEnc)
#print catEnc.shape
#catEnc2 = pd.concat([catEnc, pd.DataFrame(enc.transform(catF.iloc[90001:]).toarray())], axis = 0)
#catEnc2 = pd.DataFrame(enc.transform(catF.iloc[90001:]).toarray())
print catEnc.head()
print catEnc.tail()

# Concatinate Categorical and Continous Columns
features_encoded = pd.concat([catEnc, features_clean.iloc[:,117:]], axis = 1)[:numOfRecords]
#print features_encoded.head()

   0     1     2     3     4     5     6     7     8     9     ...   1124  \
0   1.0   0.0   0.0   1.0   1.0   0.0   0.0   1.0   1.0   0.0  ...    0.0   
1   1.0   0.0   0.0   1.0   1.0   0.0   1.0   0.0   1.0   0.0  ...    0.0   
2   1.0   0.0   0.0   1.0   1.0   0.0   1.0   0.0   0.0   1.0  ...    0.0   
3   0.0   1.0   0.0   1.0   1.0   0.0   0.0   1.0   1.0   0.0  ...    0.0   
4   1.0   0.0   0.0   1.0   1.0   0.0   0.0   1.0   1.0   0.0  ...    0.0   

   1125  1126  1127  1128  1129  1130  1131  1132  1133  
0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
1   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
2   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
3   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
4   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  

[5 rows x 1134 columns]
        0     1     2     3     4     5     6     7     8     9     ...   \
187792   1.0   0.0   0.0   1.0   1.0   0.0   1.0   0.0   1.0   0.0  ...    
187793   1.0

In [14]:
print features_encoded.head()
print features_encoded.tail()

print catEnc.shape
print features_clean.shape
print features_encoded.shape

# Check for NaN Values
print features_encoded.isnull().sum().sum()
#print features_encoded.isnull()

     0    1    2    3    4    5    6    7    8    9    ...        cont5  \
0  1.0  0.0  0.0  1.0  1.0  0.0  0.0  1.0  1.0  0.0    ...     0.310061   
1  1.0  0.0  0.0  1.0  1.0  0.0  1.0  0.0  1.0  0.0    ...     0.885834   
2  1.0  0.0  0.0  1.0  1.0  0.0  1.0  0.0  0.0  1.0    ...     0.397069   
3  0.0  1.0  0.0  1.0  1.0  0.0  0.0  1.0  1.0  0.0    ...     0.422268   
4  1.0  0.0  0.0  1.0  1.0  0.0  0.0  1.0  1.0  0.0    ...     0.704268   

      cont6     cont7    cont8    cont9   cont10    cont11    cont12  \
0  0.718367  0.335060  0.30260  0.67135  0.83510  0.569745  0.594646   
1  0.438917  0.436585  0.60087  0.35127  0.43919  0.338312  0.366307   
2  0.289648  0.315545  0.27320  0.26076  0.32446  0.381398  0.373424   
3  0.440945  0.391128  0.31796  0.32128  0.44467  0.327915  0.321570   
4  0.178193  0.247408  0.24564  0.22089  0.21230  0.204687  0.202213   

     cont13    cont14  
0  0.822493  0.714843  
1  0.611431  0.304496  
2  0.195709  0.774425  
3  0.605077  0.60264

### Dimensionality Reduction

In [10]:
# Apply PCA by fitting the reduced data
from sklearn.decomposition import PCA
pca = PCA(n_components=5).fit(features_encoded)
reduced_data = pca.transform(features_encoded)

print pca.explained_variance_ratio_

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

## Creation of Training and Testing Sets

In [None]:
from sklearn.cross_validation import train_test_split

#Simple Switch to influence testing set size for performance
if True:
    size = 20000
else:
    size = None
    
X_train, X_test, y_train, y_test = train_test_split(
    reduced_data[:size], claims_clean[:size], test_size=0.25, random_state=1)

## Scorer Function

As I work with normalized log(data), define a custome scoring function with integrated np.exp()

In [None]:
from sklearn.metrics import mean_absolute_error

def scorerFunc(test, pred):
    return mean_absolute_error(np.exp(test), np.exp(pred))

## Benchmark

This is the benchmark, the models will compete against.
The benchmark model will always predict the mean of the training set. The performance is measured with the mean absolute error as requested by the Kaggle competition.

In [None]:
benchMAE = scorerFunc(y_test, np.full_like(y_test, np.mean(y_train)))
print "The benchmark performance is {:}".format(benchMAE)

## Model Creation
### Decision Tree Regressor

In [None]:
from sklearn import tree
clf = tree.DecisionTreeRegressor()

clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print scorerFunc(y_test, pred)

## SVM

In [None]:
from sklearn import svm
clf = svm.SVR()

clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print scorerFunc(y_test, pred)

# NOTES