In [101]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [102]:
data_frame = pd.read_excel("dataset/insurance_claims_data.xlsx")

data_frame.head()
data_frame.columns

Index(['months_as_customer', 'age', 'policy_number', 'policy_bind_date',
       'policy_state', 'policy_csl', 'policy_deductable',
       'policy_annual_premium', 'umbrella_limit', 'insured_zip', 'insured_sex',
       'insured_education_level', 'insured_occupation', 'insured_hobbies',
       'insured_relationship', 'capital-gains', 'capital-loss',
       'incident_date', 'incident_type', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_state', 'incident_city',
       'incident_location', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'property_damage', 'bodily_injuries',
       'witnesses', 'police_report_available', 'total_claim_amount',
       'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make',
       'auto_model', 'auto_year', 'fraud_reported'],
      dtype='object')

In [103]:
data_frame.describe()

Unnamed: 0,months_as_customer,age,policy_number,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,capital-gains,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_year
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,203.954,38.948,546238.648,1136.0,1256.40615,1101000.0,501214.488,25126.1,-26793.7,11.644,1.839,0.992,1.487,52761.94,7433.42,7399.57,37928.95,2005.103
std,115.113174,9.140287,257063.005276,611.864673,244.167395,2297407.0,71701.610941,27872.187708,28104.096686,6.951373,1.01888,0.820127,1.111335,26401.53319,4880.951853,4824.726179,18886.252893,6.015861
min,0.0,19.0,100804.0,500.0,433.33,-1000000.0,430104.0,0.0,-111100.0,0.0,1.0,0.0,0.0,100.0,0.0,0.0,70.0,1995.0
25%,115.75,32.0,335980.25,500.0,1089.6075,0.0,448404.5,0.0,-51500.0,6.0,1.0,0.0,1.0,41812.5,4295.0,4445.0,30292.5,2000.0
50%,199.5,38.0,533135.0,1000.0,1257.2,0.0,466445.5,0.0,-23250.0,12.0,1.0,1.0,1.0,58055.0,6775.0,6750.0,42100.0,2005.0
75%,276.25,44.0,759099.75,2000.0,1415.695,0.0,603251.0,51025.0,0.0,17.0,3.0,2.0,2.0,70592.5,11305.0,10885.0,50822.5,2010.0
max,479.0,64.0,999435.0,2000.0,2047.59,10000000.0,620962.0,100500.0,0.0,23.0,4.0,2.0,3.0,114920.0,21450.0,23670.0,79560.0,2015.0


Now that we have looked at the meta data. Let's clean this data and encode it to make it suitable for training a DecisionTreeClassifier.
My approach to normalize this data is:
- Drop all null valued data points.
- Convert features with type of timestamp into a meaning full categorical feature.
- Encode every categorical feature to make it Training ready.
- Reduce the variance for features to the maximum extend possible.

In [104]:
#Let's begin with removing all the rows that have null/no values in them
data_frame = data_frame.dropna()

In [105]:
print(data_frame.describe())
#this would give us all continous features.

       months_as_customer          age  policy_number  policy_deductable  \
count         1000.000000  1000.000000    1000.000000        1000.000000   
mean           203.954000    38.948000  546238.648000        1136.000000   
std            115.113174     9.140287  257063.005276         611.864673   
min              0.000000    19.000000  100804.000000         500.000000   
25%            115.750000    32.000000  335980.250000         500.000000   
50%            199.500000    38.000000  533135.000000        1000.000000   
75%            276.250000    44.000000  759099.750000        2000.000000   
max            479.000000    64.000000  999435.000000        2000.000000   

       policy_annual_premium  umbrella_limit    insured_zip  capital-gains  \
count            1000.000000    1.000000e+03    1000.000000    1000.000000   
mean             1256.406150    1.101000e+06  501214.488000   25126.100000   
std               244.167395    2.297407e+06   71701.610941   27872.187708   
min

In [106]:
#lets see which features are categorical:

categorical_features = data_frame.select_dtypes(include=['object', 'category'])
categorical_features.columns

Index(['policy_state', 'policy_csl', 'insured_sex', 'insured_education_level',
       'insured_occupation', 'insured_hobbies', 'insured_relationship',
       'incident_type', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_state', 'incident_city',
       'incident_location', 'property_damage', 'police_report_available',
       'auto_make', 'auto_model', 'fraud_reported'],
      dtype='object')

All the columns that were returned within the variable categorical_features, are to be encoded.

Lets perfom some encoding for better fitting.

There are only three states, let encode this.


In [108]:
from sklearn.preprocessing import OneHotEncoder
y = data_frame["fraud_reported"]
data_frame = data_frame.drop(columns = ["fraud_reported"])

encoder = OneHotEncoder()
string_features = data_frame.select_dtypes(include='object').columns
encoded_data = encoder.fit_transform(data_frame[string_features])
data_frame = data_frame.drop(columns=string_features)

# concatenate encoded data with original dataframe
data_frame = pd.concat([data_frame, pd.DataFrame(encoded_data.toarray())], axis=1)
data_frame.columns= data_frame.columns.astype(str)
print(data_frame.head())
data_frame = pd.concat([data_frame, pd.DataFrame(y)])
data_frame = data_frame.dropna()
print(data_frame.columns)

   months_as_customer  age  policy_number policy_bind_date  policy_deductable  \
0                 328   48         521585       2014-10-17               1000   
1                 228   42         342868       2006-06-27               2000   
2                 134   29         687698       2000-09-06               2000   
3                 256   41         227811       1990-05-25               2000   
4                 228   44         367455       2014-06-06               1000   

   policy_annual_premium  umbrella_limit  insured_zip  capital-gains  \
0                1406.91               0       466132          53300   
1                1197.22         5000000       468176              0   
2                1413.14         5000000       430632          35100   
3                1415.74         6000000       608117          48900   
4                1583.91         6000000       610706          66000   

   capital-loss  ... 1137  1138  1139  1140  1141  1142  1143  1144  1145  \
0  

I decided to choose Logistic Regression as my baseline model.
Lets fit this.


In [73]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
model = LogisticRegression()

In [97]:

X = data_frame.drop(columns=["fraud_reported"])
print(X.describe())
y = data_frame["fraud_reported"]
X_train, X_test, y_train , y_test = train_test_split(X, y, test_size=0.3, random_state=42)

       months_as_customer  age  policy_number  policy_deductable  \
count                 0.0  0.0            0.0                0.0   
mean                  NaN  NaN            NaN                NaN   
std                   NaN  NaN            NaN                NaN   
min                   NaN  NaN            NaN                NaN   
25%                   NaN  NaN            NaN                NaN   
50%                   NaN  NaN            NaN                NaN   
75%                   NaN  NaN            NaN                NaN   
max                   NaN  NaN            NaN                NaN   

       policy_annual_premium  umbrella_limit  insured_zip  capital-gains  \
count                    0.0             0.0          0.0            0.0   
mean                     NaN             NaN          NaN            NaN   
std                      NaN             NaN          NaN            NaN   
min                      NaN             NaN          NaN            NaN   
25%    

ValueError: With n_samples=0, test_size=0.3 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [38]:
model = model.fit(X_train, y_train)

TypeError: Feature names are only supported if all input features have string names, but your input has ['int', 'str'] as feature name / column name types. If you want feature names to be stored and validated, you must convert them all to strings, by using X.columns = X.columns.astype(str) for example. Otherwise you can remove feature / column names from your input data, or convert them all to a non-string data type.