# FNOL Model Build

In [83]:
import pandas as pd
from catboost import CatBoostRegressor, Pool, cv
from sklearn.metrics import mean_absolute_error
import numpy as np
# from sklearn.model_selection import train_test_split

In [14]:
# Load the raw claims extract; every later cell derives from this frame.
raw_csv_path = './data/Data_Scientist_Interview_Task.csv'
df = pd.read_csv(raw_csv_path)

In [15]:
# Columns judged not to provide useful information for modelling.
uninformative_cols = [
    'Claim Number',
    'Notifier',
    'Loss_code',
    'Loss_description',
    'Inception_to_loss',
]

# Columns deliberately left out of this round of modelling. Possible future use:
#   - date_of_loss: could arguably provide day-of-the-week information
#   - Time_hour:    could be banded to split out rush hours, late nights, etc.
deferred_cols = ['date_of_loss', 'Time_hour']

# Remove both groups in a single pass.
df = df.drop(columns=uninformative_cols + deferred_cols)

In [43]:
# Recode missing / placeholder values as explicit "not known" categories.
# NOTE(review): '#' is presumably a placeholder for an unknown answer - confirm
# against the data dictionary.
df = df.assign(
    Weather_conditions=df['Weather_conditions'].fillna('N/K'),
    PH_considered_TP_at_fault=df['PH_considered_TP_at_fault'].replace('#', 'n/k'),
)

## Create Attritional Model

In [45]:
# Modelling frame for the attritional claims model: the uncapped 'Incurred'
# column is removed, every other column of df is kept.
df_att = df.drop(columns='Incurred')

In [46]:

# Split predictors from the label once and reuse the result below.
target_col = 'Capped Incurred'
features_att = df_att.drop(columns=[target_col])

feature_names = list(features_att)
# Object-dtype columns are handed to CatBoost as categorical features.
cat_features = features_att.select_dtypes(include=['object']).columns.tolist()

# CatBoost dataset wrapper used for both cross-validation and the final fit.
data_pool = Pool(
    data=features_att,
    label=df_att[target_col],
    feature_names=feature_names,
    cat_features=cat_features,
)

In [53]:
# Training parameters for the cross-validation run.
# The iteration count is an upper bound; early stopping in the cv call
# decides how many rounds are actually used.
params = dict(
    objective="Tweedie:variance_power=1.99",
    iterations=1000,
    random_seed=69,
    verbose=False,
)

In [54]:
# 4-fold cross-validation on the full pool, with early stopping after
# 15 rounds without improvement.
cv_options = {"fold_count": 4, "early_stopping_rounds": 15}
scores = cv(pool=data_pool, params=params, **cv_options)

Stopped by overfitting detector  (15 iterations wait)


In [64]:
# Choose the number of boosting rounds from the CV curve itself instead of
# subtracting the early-stopping patience from len(scores). The subtraction
# silently under-counts when early stopping never triggers, and it duplicates
# the patience value set in the cv call.
# Assumes `scores` is the DataFrame returned by catboost.cv, with one
# "test-<metric>-mean" column per metric, and that lower is better (a loss).
test_loss_cols = [
    col for col in scores.columns
    if col.startswith("test-") and col.endswith("-mean")
]
# Zero-based row position of the lowest mean held-out loss.
best_round = scores[test_loss_cols[0]].to_numpy().argmin()
# Rows are zero-based, so the number of trees to train is position + 1.
optimal_iterations = int(best_round) + 1
optimal_iterations

541

In [70]:
# Refit on the full pool with the same objective and seed as the CV run,
# using the iteration count chosen from cross-validation.
params = dict(
    objective="Tweedie:variance_power=1.99",
    iterations=optimal_iterations,
    random_seed=69,
    verbose=False,
)

att_model = CatBoostRegressor(**params)

# fit() is left as the last expression so the cell displays the fitted model.
att_model.fit(data_pool)

<catboost.core.CatBoostRegressor at 0x7fa2c2c05e20>

In [78]:
# Calculate the mean absolute error (MAE) of the model.
# NOTE: data_pool is the same data the model was fitted on, so this is an
# in-sample error and will be optimistic compared with held-out data.
# NOTE(review): the model uses a Tweedie objective - confirm that predict()
# returns values on the claim-amount scale for the installed CatBoost version.

att_model_preds = att_model.predict(data_pool)

# scikit-learn's signature is mean_absolute_error(y_true, y_pred).
mae = mean_absolute_error(df_att['Capped Incurred'], att_model_preds)

mae

4453.762070983822

In [80]:
# Persist the fitted attritional model in CatBoost's binary "cbm" format.
model_name = "fnol_attritional_model.cbm"
att_model.save_model(fname=model_name, format="cbm")

## Create Large Loss Propensity Model

In [81]:
# Show the column names currently in df.
df.columns.tolist()

['Notification_period',
 'Location_of_incident',
 'Weather_conditions',
 'Vehicle_mobile',
 'Main_driver',
 'PH_considered_TP_at_fault',
 'Vechile_registration_present',
 'Incident_details_present',
 'Injury_details_present',
 'TP_type_insd_pass_back',
 'TP_type_insd_pass_front',
 'TP_type_driver',
 'TP_type_pass_back',
 'TP_type_pass_front',
 'TP_type_bike',
 'TP_type_cyclist',
 'TP_type_pass_multi',
 'TP_type_pedestrian',
 'TP_type_other',
 'TP_type_nk',
 'TP_injury_whiplash',
 'TP_injury_traumatic',
 'TP_injury_fatality',
 'TP_injury_unclear',
 'TP_injury_nk',
 'TP_region_eastang',
 'TP_region_eastmid',
 'TP_region_london',
 'TP_region_north',
 'TP_region_northw',
 'TP_region_outerldn',
 'TP_region_scotland',
 'TP_region_southe',
 'TP_region_southw',
 'TP_region_wales',
 'TP_region_westmid',
 'TP_region_yorkshire',
 'Incurred',
 'Capped Incurred']

In [82]:
# Compare the uncapped and capped incurred amounts side by side.
df.loc[:, ['Incurred', 'Capped Incurred']]

Unnamed: 0,Incurred,Capped Incurred
0,0,0
1,2801,2801
2,1221,1221
3,3530,3530
4,3156,3156
...,...,...
7686,703,703
7687,42981,42981
7688,5175,5175
7689,30072,30072


In [84]:
# Flag claims whose incurred amount exceeds the capped amount:
# 1 where 'Incurred' is above 'Capped Incurred', 0 otherwise.
exceeds_cap = df['Incurred'] > df['Capped Incurred']
df['Large_Prop'] = np.where(exceeds_cap, 1, 0)

## Create Large Loss Model

In [85]:
# Amount by which 'Incurred' exceeds 'Capped Incurred' for each claim.
df['Large_Incurred'] = df['Incurred'].sub(df['Capped Incurred'])

In [88]:
# Average excess over the cap, taken only over claims flagged as large.
df.loc[df['Large_Prop'] == 1, 'Large_Incurred'].mean()

128818.101010101