In [432]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Matplotlib visual settings: ggplot theme and a wide default figure size
plt.style.use("ggplot")
plt.rcParams["figure.figsize"] = (15, 7)

In [433]:
# Load the raw training data
df = pd.read_csv('data/training.csv')

# Every column except the ones listed below.
# NOTE(review): `preds` is not referenced by any cell visible in this notebook.
excluded = {'id_policy', 'pol_pay_freq', 'drv_sex2', 'drv_age2',
            'drv_age_lic2', 'vh_make_model', 'claim_amount'}
preds = [col for col in df.columns if col not in excluded]

In [434]:
df.head()

Unnamed: 0,id_policy,year,pol_no_claims_discount,pol_coverage,pol_duration,pol_sit_duration,pol_pay_freq,pol_payd,pol_usage,drv_sex1,...,vh_make_model,vh_age,vh_fuel,vh_type,vh_speed,vh_value,vh_weight,population,town_surface_area,claim_amount
0,PL000000,1.0,0.332,Med2,5,1,Monthly,No,WorkPrivate,M,...,aparvvfowrjncdhp,8.0,Gasoline,Tourism,174.0,11040.0,1143.0,1270.0,33.1,0.0
1,PL042495,1.0,0.0,Med2,6,1,Monthly,No,WorkPrivate,M,...,aparvvfowrjncdhp,10.0,Diesel,Tourism,174.0,11040.0,1143.0,1290.0,51.3,0.0
2,PL042496,1.0,0.196,Med1,2,1,Yearly,Yes,Retired,M,...,iwhqpdfuhrsxyqxe,8.0,Diesel,Commercial,150.0,14159.0,1193.0,1020.0,262.8,0.0
3,PL042497,1.0,0.0,Med2,8,5,Yearly,No,WorkPrivate,F,...,kvcddisqpkysmvvo,4.0,Gasoline,Tourism,149.0,17233.0,1012.0,180.0,219.7,0.0
4,PL042498,1.0,0.0,Med1,2,2,Yearly,No,Retired,F,...,tdgkjlphosocwbgu,13.0,Gasoline,Tourism,200.0,19422.0,1315.0,30.0,70.3,0.0


In [435]:
df.columns

Index(['id_policy', 'year', 'pol_no_claims_discount', 'pol_coverage',
       'pol_duration', 'pol_sit_duration', 'pol_pay_freq', 'pol_payd',
       'pol_usage', 'drv_sex1', 'drv_age1', 'drv_age_lic1', 'drv_drv2',
       'drv_sex2', 'drv_age2', 'drv_age_lic2', 'vh_make_model', 'vh_age',
       'vh_fuel', 'vh_type', 'vh_speed', 'vh_value', 'vh_weight', 'population',
       'town_surface_area', 'claim_amount'],
      dtype='object')

In [436]:
# Separate the target from the features, then hold out 30% of the rows
# (fixed seed so the split is reproducible).
y = df['claim_amount']
x = df.drop(columns='claim_amount')
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=2021
)

In [437]:
# Re-attach the target to the training rows (aligned on the row index) so
# features and target can be aggregated together.
# NOTE: this rebinds `df`, which previously held the full raw dataset.
df = x_train.assign(claim_amount=y_train)

# Collapse the training rows to one row per policy.
# Aggregators are given as strings rather than np.mean / np.max: pandas >= 2.1
# warns that numpy callables passed to agg() will change meaning in a future
# release, and the string names keep the current behaviour.
# NOTE: 'max' on the text columns picks the lexicographically largest value.
agg = df.groupby('id_policy').agg({
    'pol_no_claims_discount': 'mean',
    'pol_coverage': 'max',
    'pol_payd': 'max',
    'pol_usage': 'max',
    'drv_sex1': 'max',
    'drv_age1': 'mean',
    'drv_age_lic1': 'mean',
    'drv_drv2': 'max',
    'vh_age': 'mean',
    'vh_fuel': 'max',
    'vh_type': 'max',
    'vh_speed': 'max',
    'vh_value': 'mean',
    'vh_weight': 'max',
    'population': 'mean',
    'town_surface_area': 'mean',
    'claim_amount': 'mean'
})

In [438]:
# Keep only policies with a strictly positive population and vehicle weight
# (comparisons with NaN are False, so missing values are removed here too).
has_population = agg['population'] > 0
has_weight = agg['vh_weight'] > 0
agg = agg[has_population & has_weight]

# Drop policies that still have a NaN in any column
agg = agg.dropna()

# Keep only policies whose mean claim is below 2500
# (presumably to trim outliers — confirm the threshold).
agg = agg[agg['claim_amount'] < 2500]

In [439]:
agg.head()

Unnamed: 0_level_0,pol_no_claims_discount,pol_coverage,pol_payd,pol_usage,drv_sex1,drv_age1,drv_age_lic1,drv_drv2,vh_age,vh_fuel,vh_type,vh_speed,vh_value,vh_weight,population,town_surface_area,claim_amount
id_policy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
PL000000,0.25075,Med2,No,WorkPrivate,M,36.5,17.5,Yes,9.5,Gasoline,Tourism,174.0,11040.0,1143.0,1270.0,33.1,0.0
PL000001,0.0,Max,No,WorkPrivate,F,47.0,29.0,No,13.0,Diesel,Tourism,182.0,12820.0,1036.0,80.0,139.3,0.0
PL000003,0.0,Min,No,WorkPrivate,M,55.5,36.5,No,29.5,Diesel,Tourism,156.0,25378.0,2554.0,420.0,673.7,0.0
PL000005,0.0,Max,No,Professional,M,75.5,55.5,Yes,4.5,Gasoline,Tourism,181.0,21171.0,1305.0,420.0,184.0,542.1325
PL000007,0.0,Max,No,Retired,F,86.666667,66.666667,No,2.666667,Diesel,Tourism,182.0,12820.0,1036.0,360.0,131.7,0.0


In [440]:
# Target: mean claim per policy; features: every other aggregated column
agg_y = agg['claim_amount']
agg_x = agg.drop(columns='claim_amount')

In [441]:
# One-hot encode the categorical features; each source column maps to a short
# prefix used in the generated column names (e.g. pol_coverage -> coverage_Max).
dummy_prefixes = {
    'pol_coverage': 'coverage',
    'pol_payd': 'payd',
    'pol_usage': 'usage',
    'drv_sex1': 'sex',
    'drv_drv2': 'drv2',
    'vh_fuel': 'fuel',
    'vh_type': 'type',
}
agg_x = pd.get_dummies(agg_x, columns=list(dummy_prefixes), prefix=dummy_prefixes)

In [442]:
agg_x.head()

Unnamed: 0_level_0,pol_no_claims_discount,drv_age1,drv_age_lic1,vh_age,vh_speed,vh_value,vh_weight,population,town_surface_area,coverage_Max,...,usage_WorkPrivate,sex_F,sex_M,drv2_No,drv2_Yes,fuel_Diesel,fuel_Gasoline,fuel_Hybrid,type_Commercial,type_Tourism
id_policy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PL000000,0.25075,36.5,17.5,9.5,174.0,11040.0,1143.0,1270.0,33.1,0,...,1,0,1,0,1,0,1,0,0,1
PL000001,0.0,47.0,29.0,13.0,182.0,12820.0,1036.0,80.0,139.3,1,...,1,1,0,1,0,1,0,0,0,1
PL000003,0.0,55.5,36.5,29.5,156.0,25378.0,2554.0,420.0,673.7,0,...,1,0,1,1,0,1,0,0,0,1
PL000005,0.0,75.5,55.5,4.5,181.0,21171.0,1305.0,420.0,184.0,1,...,0,0,1,0,1,0,1,0,0,1
PL000007,0.0,86.666667,66.666667,2.666667,182.0,12820.0,1036.0,360.0,131.7,1,...,0,1,0,1,0,1,0,0,0,1


In [443]:
agg_y.head()

id_policy
PL000000      0.0000
PL000001      0.0000
PL000003      0.0000
PL000005    542.1325
PL000007      0.0000
Name: claim_amount, dtype: float64

In [444]:
def prepare_test_data(x, transformer):
    """Build the model-ready feature matrix for held-out rows.

    Selects the feature columns, one-hot encodes the categorical ones, aligns
    the resulting columns with those the transformer was fitted on (when it
    exposes them), fills remaining NaNs with the column mean and applies
    ``transformer.transform``.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows containing at least the columns listed in ``params``.
    transformer : object
        Fitted object with a ``transform`` method. If it exposes
        ``feature_names_in_`` (scikit-learn >= 1.0 sets it when fitting on a
        DataFrame), the encoded frame is reindexed to exactly those columns.

    Returns
    -------
    The result of ``transformer.transform`` on the prepared frame.
    """
    params = ['pol_no_claims_discount',
              'pol_coverage',
              'pol_payd',
              'pol_usage',
              'drv_sex1',
              'drv_age1',
              'drv_age_lic1',
              'drv_drv2',
              'vh_age',
              'vh_fuel',
              'vh_type',
              'vh_speed',
              'vh_value',
              'vh_weight',
              'population',
              'town_surface_area']

    x = x[params]

    x = pd.get_dummies(x, prefix=['coverage', 'payd', 'usage', 'sex', 'drv2', 'fuel', 'type'],
                       columns=['pol_coverage', 'pol_payd', 'pol_usage', 'drv_sex1', 'drv_drv2', 'vh_fuel', 'vh_type'])

    # get_dummies only creates columns for categories present in *this* frame,
    # so the layout can differ from the training matrix. Align to the training
    # columns: categories unseen here become all-zero columns, categories
    # unseen in training are dropped, and the column order is made identical.
    expected_columns = getattr(transformer, 'feature_names_in_', None)
    if expected_columns is not None:
        x = x.reindex(columns=list(expected_columns), fill_value=0)

    # Replace NaNs with column mean (computed on the frame passed in)
    x = x.fillna(x.mean())

    return transformer.transform(x)

In [445]:
# Normalize data: Normalizer rescales each sample (row) to unit norm.
# NOTE(review): this is per-row scaling, not per-feature standardisation —
# confirm that is the intent.
# NOTE: x_train / x_test are rebound to the transformed data here, so this
# cell cannot be re-run without first re-running the train/test split.
transformer = Normalizer()
transformer.fit(agg_x)
y_train = agg_y
x_train = transformer.transform(agg_x)
x_test = prepare_test_data(x_test, transformer)

### Linear Regression

In [446]:
# Ordinary least squares on the prepared training matrix
reg = LinearRegression()
reg.fit(x_train, y_train)

# Claim amounts cannot be negative, so predictions are floored at zero
train_preds = np.clip(reg.predict(x_train), 0, None)
test_preds = np.clip(reg.predict(x_test), 0, None)

In [590]:
# Training RMSE. np.sqrt of the MSE replaces `squared=False`, which is
# deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_train, train_preds))

279.4894257614839

In [591]:
# Held-out RMSE (row-level and unfiltered, unlike the aggregated training
# target). np.sqrt of the MSE replaces `squared=False`, which is deprecated
# in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_test, test_preds))

803.9185923740134

### Penalized Linear Regression (Ridge and Lasso)

In [608]:
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV

# Candidate regularisation strengths; RidgeCV selects among them by
# cross-validation.
alphas = [0.001, 0.01, 0.05, 0.1, 0.15, 0.2, 0.4, 0.5, 0.8, 1, 1.5]
ridge = RidgeCV(alphas=alphas).fit(x_train, y_train)

# Floor predictions at zero, since claim amounts cannot be negative
train_preds = np.clip(ridge.predict(x_train), 0, None)
test_preds = np.clip(ridge.predict(x_test), 0, None)

In [605]:
# Training RMSE. np.sqrt of the MSE replaces `squared=False`, which is
# deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_train, train_preds))

280.15266951391925

In [606]:
# Held-out RMSE. np.sqrt of the MSE replaces `squared=False`, which is
# deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_test, test_preds))

804.1028769237731

In [610]:
# Lasso with the regularisation strength chosen by cross-validation
# (LassoCV defaults throughout).
lasso = LassoCV().fit(x_train, y_train)

# Floor predictions at zero, since claim amounts cannot be negative
train_preds = np.clip(lasso.predict(x_train), 0, None)
test_preds = np.clip(lasso.predict(x_test), 0, None)

In [611]:
# Training RMSE. np.sqrt of the MSE replaces `squared=False`, which is
# deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_train, train_preds))

280.32271935601676

In [612]:
# Held-out RMSE. np.sqrt of the MSE replaces `squared=False`, which is
# deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_test, test_preds))

804.1820733288081

### Random Forest

In [449]:
from sklearn.ensemble import RandomForestRegressor

In [585]:
# Random-forest hyperparameters, passed to RandomForestRegressor in the next cell
n_estimators = 25        # number of trees
max_depth = 7            # maximum depth of each tree
max_features = 'log2'    # features considered at each split

In [586]:
# Build the forest from the hyperparameters defined above; the fixed seed
# keeps the fitted model reproducible.
regr = RandomForestRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    max_features=max_features,
    random_state=2021,
)

In [592]:
# Fit the random forest on the prepared training matrix
regr.fit(x_train, y_train)

# Floor predictions at zero, for consistency with the other models
train_preds = np.clip(regr.predict(x_train), 0, None)
test_preds = np.clip(regr.predict(x_test), 0, None)

In [593]:
# Training RMSE. np.sqrt of the MSE replaces `squared=False`, which is
# deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_train, train_preds))

272.15352273352994

In [594]:
# Held-out RMSE. np.sqrt of the MSE replaces `squared=False`, which is
# deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_test, test_preds))

802.4536992091906

### XGBoost

In [613]:
import xgboost as xgb

In [736]:
# Small, regularised gradient-boosted ensemble: 25 boosting rounds of depth-4
# trees, with a split-loss threshold (gamma) and an L2 weight penalty
# (reg_lambda).
xgb_params = dict(n_estimators=25, max_depth=4, gamma=2, reg_lambda=0.1)
regressor = xgb.XGBRegressor(**xgb_params)

In [737]:
# Fit the boosted trees on the prepared training matrix
regressor.fit(x_train, y_train)

# Floor predictions at zero, since claim amounts cannot be negative
train_preds = np.clip(regressor.predict(x_train), 0, None)
test_preds = np.clip(regressor.predict(x_test), 0, None)

In [738]:
# Training RMSE. np.sqrt of the MSE replaces `squared=False`, which is
# deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_train, train_preds))

267.2583423278272

In [739]:
# Held-out RMSE. np.sqrt of the MSE replaces `squared=False`, which is
# deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_test, test_preds))

802.5065079969723