# Insurance Fraud Claims Detection

This notebook summarizes the steps I did for building the model and the information I need to put our model in production

**Premise: Setting the seed**

Because now we have to move the model in production for reproducibility it's important to **set the seed**

## Libraries and Environments

In [48]:
# Read the data
import pandas as pd
import numpy as np

# Data Science
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Model Deployment
import onnxmltools
import skl2onnx
import onnxruntime
import onnx
from onnx.tools.net_drawer import GetPydotGraph, GetOpNodeProducer
import onnxruntime as rt
from skl2onnx import convert_sklearn, update_registered_converter
from skl2onnx.common.shape_calculator import calculate_linear_classifier_output_shapes  # noqa
from onnxmltools.convert.lightgbm.operator_converters.LightGbm import convert_lightgbm  # noqa
import onnxmltools.convert.common.data_types
from skl2onnx.common.data_types import FloatTensorType

#Utils
import os

# Set notebook
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import warnings
warnings.simplefilter(action='ignore')

In [49]:
raw = '../data/raw/insurance_claims.csv'
interim = '../data/interim/'
processed= '../data/processed/'
models= '../models/'

## Data Preparation

In [50]:
# load dataset
data = pd.read_csv(raw)
print(data.shape)
data.head()

(1000, 40)


Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,insured_sex,insured_education_level,insured_occupation,insured_hobbies,insured_relationship,capital-gains,capital-loss,incident_date,incident_type,collision_type,incident_severity,authorities_contacted,incident_state,incident_city,incident_location,incident_hour_of_the_day,number_of_vehicles_involved,property_damage,bodily_injuries,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported,_c39
0,328,48,521585,2014-10-17,OH,250/500,1000,1406.91,0,466132,MALE,MD,craft-repair,sleeping,husband,53300,0,2015-01-25,Single Vehicle Collision,Side Collision,Major Damage,Police,SC,Columbus,9935 4th Drive,5,1,YES,1,2,YES,71610,6510,13020,52080,Saab,92x,2004,Y,
1,228,42,342868,2006-06-27,IN,250/500,2000,1197.22,5000000,468176,MALE,MD,machine-op-inspct,reading,other-relative,0,0,2015-01-21,Vehicle Theft,?,Minor Damage,Police,VA,Riverwood,6608 MLK Hwy,8,1,?,0,0,?,5070,780,780,3510,Mercedes,E400,2007,Y,
2,134,29,687698,2000-09-06,OH,100/300,2000,1413.14,5000000,430632,FEMALE,PhD,sales,board-games,own-child,35100,0,2015-02-22,Multi-vehicle Collision,Rear Collision,Minor Damage,Police,NY,Columbus,7121 Francis Lane,7,3,NO,2,3,NO,34650,7700,3850,23100,Dodge,RAM,2007,N,
3,256,41,227811,1990-05-25,IL,250/500,2000,1415.74,6000000,608117,FEMALE,PhD,armed-forces,board-games,unmarried,48900,-62400,2015-01-10,Single Vehicle Collision,Front Collision,Major Damage,Police,OH,Arlington,6956 Maple Drive,5,1,?,1,2,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y,
4,228,44,367455,2014-06-06,IL,500/1000,1000,1583.91,6000000,610706,MALE,Associate,sales,board-games,unmarried,66000,-46000,2015-02-17,Vehicle Theft,?,Minor Damage,,NY,Arlington,3041 3rd Ave,20,1,NO,0,1,NO,6500,1300,650,4550,Accura,RSX,2009,N,


## Variable settings

In [51]:
drop = ['policy_number', 'policy_bind_date', 'policy_state', 'policy_csl', 
        'insured_zip', 'insured_occupation', 'incident_date', 
        'insured_education_level', 'insured_relationship', 'injury_claim', 'property_claim', 'vehicle_claim', 
        'incident_state', 'incident_city', 'incident_location', 'auto_model', 'auto_make', 'auto_year', '_c39']

data.drop(drop, axis=1, inplace=True)
data.rename(columns={"capital-gains": "capital_gains", "capital-loss": "capital_loss"}, inplace=True)

In [52]:
# general
target = 'fraud_reported'
predictors = [col for col in data.columns if col not in drop and col != target]

## Selected features

In [53]:
# load selected features
features = pd.read_csv(processed + 'selected_features.csv', names=['features'])

print('Number of features: ', len(features))
print(features)

Number of features:  14
                        features
0                              0
1                 umbrella_limit
2                  capital_gains
3                   capital_loss
4              incident_severity
5       incident_hour_of_the_day
6                bodily_injuries
7                      witnesses
8             total_claim_amount
9        policy_deductable_group
10  policy_annual_premium_groups
11         insured_hobbies_chess
12     insured_hobbies_cross-fit
13        insured_hobbies_others


## Feature engineering

### Engineer Missing

In [54]:
# missing
missing_predictors = [col for col in data.select_dtypes(include='object').columns if any(data[col].str.contains('?', regex=False))]

In [55]:
def missing_encoder(data, variable):
    data[variable] = data[variable].replace('?', 'missing')
    print('\n', data[variable].value_counts())

In [56]:
for var in missing_predictors:
    missing_encoder(data, var)


 Rear Collision     292
Side Collision     276
Front Collision    254
missing            178
Name: collision_type, dtype: int64

 missing    360
NO         338
YES        302
Name: property_damage, dtype: int64

 NO         343
missing    343
YES        314
Name: police_report_available, dtype: int64


### Engineer Target

In [57]:
data[target].head()

0    Y
1    Y
2    N
3    Y
4    N
Name: fraud_reported, dtype: object

In [58]:
def target_encoder(data, target):
    
    #Create the list of target labels
    target_labels = sorted(set(data[target]), reverse = True)
    #Create encoding dictionary
    target_labels_dic = {label: index for index, label in enumerate(target_labels, 0)}
    #Encode the data
    data[target] = data[target].map(target_labels_dic).astype('category')
    
    print(data[target].cat.categories)

In [59]:
target_encoder(data, target)

Int64Index([0, 1], dtype='int64')


### Engineer Numerical Variables

I divided them in: 

- Discrete variables
- Continuous variables


In [60]:
# numerical
numerical_predictors = [col for col in predictors if data[col].dtypes != 'object']
discrete_predictors = [col for col in numerical_predictors if len(data[col].unique()) < 30]
continuous_predictors = [col for col in numerical_predictors if col not in discrete_predictors]

#### Discrete Variables

In [61]:
data[discrete_predictors].head()

Unnamed: 0,policy_deductable,umbrella_limit,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses
0,1000,0,5,1,1,2
1,2000,5000000,8,1,0,0
2,2000,5000000,7,3,2,3
3,2000,6000000,5,1,1,2
4,1000,6000000,20,1,0,1


In [62]:
def discrete_encoder(data, variable):
    
    if variable not in ['umbrella_limit', 'policy_deductable']:
        pass
    
    else:
        #umbrella
        if variable == 'umbrella_limit':
            data = data[data[variable]>=0]
            print('\n', data[variable].value_counts())
        
        #policy_deductable
        elif variable == 'policy_deductable':
            bins = list(np.linspace(0,2000, 5, dtype = int))
            bin_labels = ['0-500', '501-1000', '1001-1500', '1501-2000']
            new_variable_name = "_".join([variable, 'groups'])
            data[new_variable_name] = pd.cut(data[variable], bins = bins, labels = bin_labels)
            data[new_variable_name].astype('object', copy=False)
            data.drop(variable, axis=1, inplace=True)
            print('\n', data[new_variable_name].value_counts())

In [63]:
for var in discrete_predictors:
    discrete_encoder(data, var)


 501-1000     351
0-500        342
1501-2000    307
1001-1500      0
Name: policy_deductable_groups, dtype: int64

 0           798
6000000      57
5000000      46
4000000      39
7000000      29
3000000      12
8000000       8
9000000       5
2000000       3
10000000      2
Name: umbrella_limit, dtype: int64


#### Continuous variables

In [64]:
data[continuous_predictors].head()

Unnamed: 0,months_as_customer,age,policy_annual_premium,capital_gains,capital_loss,total_claim_amount
0,328,48,1406.91,53300,0,71610
1,228,42,1197.22,0,0,5070
2,134,29,1413.14,35100,0,34650
3,256,41,1415.74,48900,-62400,63400
4,228,44,1583.91,66000,-46000,6500


In [66]:
def continuous_encoder(data, variable):
    
    if variable not in ['age', 'policy_annual_premium', 'months_as_customer']:
        pass
    
    else: 
        #age
        if variable == 'age':
            bin_labels = ['15-20', '21-25', '26-30', '31-35', '36-40', '41-45', '46-50', '51-55', '56-60', '61-65']
            bins = [15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65]
            new_variable_name = "_".join([variable, 'groups'])
            data[new_variable_name] = pd.cut(data[variable], bins = bins, labels = bin_labels, include_lowest = True)
            data.drop(variable, axis=1, inplace=True)
            print('\n', data[new_variable_name].cat.categories)
    
        #policy_annual_premium
        elif variable == 'policy_annual_premium':
            bins = list(np.linspace(0,2500, 6, dtype = int))
            bin_labels = ['very low', 'low', 'medium', 'high', 'very high']
            new_variable_name = "_".join([variable, 'groups'])
            data[new_variable_name] = pd.cut(data[variable], bins = bins, labels=bin_labels)
            data.drop(variable, axis=1, inplace=True)
            print('\n', data[new_variable_name].cat.categories)

        #month_as_customer
        elif variable == 'months_as_customer':
            bins = [0, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
            bin_labels = ['0-50','51-100','101-150','151-200','201-250','251-300','301-350','351-400','401-450','451-500']
            new_variable_name = "_".join([variable, 'groups'])
            data[new_variable_name] = pd.cut(data[variable], bins = 10, labels = bin_labels, include_lowest= True)
            data.drop([variable], axis=1, inplace=True)
            print('\n', data[new_variable_name].cat.categories)

In [67]:
for var in continuous_predictors:
    continuous_encoder(data, var)


 Index(['0-50', '51-100', '101-150', '151-200', '201-250', '251-300', '301-350', '351-400', '401-450', '451-500'], dtype='object')

 Index(['15-20', '21-25', '26-30', '31-35', '36-40', '41-45', '46-50', '51-55', '56-60', '61-65'], dtype='object')

 Index(['very low', 'low', 'medium', 'high', 'very high'], dtype='object')


### Engineer Categorical variables

In [68]:
# non numerical
categorical_predictors = [col for col in data.columns if data[col].dtypes != 'int' and col != target]
ordinal_predictors = ['incident_severity', 'policy_annual_premium_groups', 'policy_deductable_groups']
nominal_predictors = [col for col in categorical_predictors if col not in ordinal_predictors]

#### Nominal

In [69]:
def nominal_encoder(data, variable):
    
    #insured_hobbies
    if variable == 'insured_hobbies':
        replace_labels = [label for label in data[variable].unique() if label not in ['chess', 'cross-fit']]
        data[variable] = data[variable].replace(replace_labels, 'others')

    dummies = pd.get_dummies(data[[variable]])
    data = pd.concat([data.drop([variable], axis=1), dummies], axis=1)
    return data

In [70]:
for var in nominal_predictors:
    data = nominal_encoder(data, var)

#### Ordinal

In [73]:
def ordinal_encoder(data, variable):
    
    categories = []
    
    if variable not in ['incident_severity', 'policy_annual_premium_groups', 'policy_deductable_group']:
        pass
    
    else:
        
        if variable == 'incident_severity':
            categories = pd.Categorical(data[ordinal_predictors]['incident_severity'],
                                              categories=['Trivial Damage',
                                                          'Minor Damage',
                                                          'Major Damage',
                                                          'Total Loss'],
                                              ordered=True)

        elif variable == 'policy_annual_premium_groups':
            categories = pd.Categorical(data[ordinal_predictors]['policy_annual_premium_groups'],
                                                      categories=['very low',
                                                              'low',
                                                              'medium',
                                                              'high',
                                                              'very high'],
                                                      ordered=True)

        elif variable == 'policy_deductable_group':
            categories = pd.Categorical(data[ordinal_predictors]['policy_deductable_group'],
                                           categories=['0-500', '501-1000', '1001-1500', '1501-2000'],
                                           ordered=True)
            
    
    labels, unique = pd.factorize(categories, sort=True)
    data[variable] = labels

In [75]:
for var in ordinal_predictors:
    print('var')
    ordinal_encoder(data, var)

var
var
var


ValueError: Length of values does not match length of index

### Feature Scaling

For use in linear models, features need to be either scaled or normalised. In the next section, I will scale features between the min and max values:

In [None]:
# capture the target
y_train = X_train['SalePrice']
y_test = X_test['SalePrice']

In [None]:
# set up scaler
scaler = MinMaxScaler()

# train scaler
scaler.fit(X_train[features])

In [None]:
# explore maximum values of variables
scaler.data_max_

In [None]:
# explore minimum values of variables
scaler.data_min_

In [None]:
# transform the train and test set, and add on the Id and SalePrice variables
X_train = scaler.transform(X_train[features])
X_test = scaler.transform(X_test[features])

## Train the Linear Regression: Lasso

In [None]:
# set up the model
# remember to set the random_state / seed

lin_model = Lasso(alpha=0.005, random_state=0)

# train the model
lin_model.fit(X_train, y_train)

# we persist the model for future use
joblib.dump(lin_model, 'lasso_regression.pkl')

In [None]:
# evaluate the model:
# ====================

# remember that we log transformed the output (SalePrice)
# in our feature engineering notebook (step 2).

# In order to get the true performance of the Lasso
# we need to transform both the target and the predictions
# back to the original house prices values.

# We will evaluate performance using the mean squared error and
# the root of the mean squared error and r2

# make predictions for train set
pred = lin_model.predict(X_train)

# determine mse and rmse
print('train mse: {}'.format(int(
    mean_squared_error(np.exp(y_train), np.exp(pred)))))
print('train rmse: {}'.format(int(
    sqrt(mean_squared_error(np.exp(y_train), np.exp(pred))))))
print('train r2: {}'.format(
    r2_score(np.exp(y_train), np.exp(pred))))
print()

# make predictions for test set
pred = lin_model.predict(X_test)

# determine mse and rmse
print('test mse: {}'.format(int(
    mean_squared_error(np.exp(y_test), np.exp(pred)))))
print('test rmse: {}'.format(int(
    sqrt(mean_squared_error(np.exp(y_test), np.exp(pred))))))
print('test r2: {}'.format(
    r2_score(np.exp(y_test), np.exp(pred))))
print()

print('Average house price: ', int(np.exp(y_train).median()))

That is all for this notebook. And that is all for this section too.

**In the next section, we will show you how to productionise this code for model deployment**.