In [11]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import re

In [348]:
# Load the raw training table, then separate the regression target
# (`claim_amount`) from the feature columns in a single step.
df = pd.read_csv('data/training.csv')
y, df = df['claim_amount'], df.drop(columns='claim_amount')

def wrangle(df, base_discount=0.631, claim_discount_step=0.2):
    """Clean the raw policy table and derive the model features.

    The caller's frame is left untouched; all work happens on a copy.

    Steps, in order:
      1. Replace zero vehicle weights with the mean of the positive weights.
      2. Fill NaNs in the numeric vehicle columns with the column mean.
      3. Attach each policy's year-1 ``pol_no_claims_discount`` (left join on
         ``id_policy``) and derive discount-change features from it.
      4. Combine driver 1 / driver 2 information (sexes, mean age, mean
         licence age).
      5. Drop identifier / redundant columns and one-hot encode categoricals.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw policy rows. Must contain the columns referenced below
        (``id_policy``, ``year``, ``pol_no_claims_discount``, the ``drv_*``
        and ``vh_*`` columns, and the categorical columns that get encoded).
    base_discount : float, default 0.631
        Reference level subtracted from ``pol_no_claims_discount`` to build
        ``discount_base_change`` (the original code called this the
        "beginning discount level").
    claim_discount_step : float, default 0.2
        Discount increase treated as one claim when approximating
        ``no_claims``. NOTE(review): presumably the per-claim malus of the
        pricing scheme -- confirm against the data documentation.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same number of rows as ``df``, a fresh
        ``RangeIndex`` (a side effect of the merge) and one-hot encoded
        categorical columns.

    Raises
    ------
    AssertionError
        If the first-year join changes the row count, i.e. some policy has
        more than one ``year == 1`` row.
    """
    original_len = len(df)

    # Work on a copy: assigning columns on the argument itself would silently
    # modify the caller's raw frame and make re-running the cell non-idempotent.
    df = df.copy()

    # Replace 0 vehicle weight with the mean of the positive weights
    positive_weight_mean = df.loc[df['vh_weight'] > 0, 'vh_weight'].mean()
    df['vh_weight'] = df['vh_weight'].replace(0.0, positive_weight_mean)

    # Replace NaNs with column mean
    nans = ['vh_age', 'vh_speed', 'vh_value', 'vh_weight']
    df[nans] = df[nans].fillna(df[nans].mean())

    # Join first year data; policies without a year-1 row get NaN in
    # `pol_no_claims_discount_first` (left join).
    first_year = df.loc[df['year'] == 1.0, ['id_policy', 'pol_no_claims_discount']]
    df = df.merge(first_year, on='id_policy', suffixes=('', '_first'), how='left')

    # Change from beginning discount level
    df['discount_base_change'] = df['pol_no_claims_discount'] - base_discount
    # Yearly discount change over licence ownership.
    # NOTE(review): a zero `drv_age_lic1` would produce +/-inf here -- confirm
    # the data never contains one.
    df['discount_yearly_change'] = df['discount_base_change'] / df['drv_age_lic1']

    # Discount change from policy beginning
    df['discount_change'] = df['pol_no_claims_discount'] - df['pol_no_claims_discount_first']
    # Approx. no. of claims since first year, floored at zero (NaN stays NaN)
    df['no_claims'] = np.ceil(df['discount_change'] / claim_discount_step).clip(lower=0)

    # Driver 1 and 2 combined info. '0' marks "no second driver" in drv_sex2;
    # a missing value is treated the same way so the concatenation cannot fail.
    df['drv_sex2'] = df['drv_sex2'].replace('0', '').fillna('')
    # Sort the letters so that 'MF' and 'FM' collapse into a single category.
    # Mapping over one Series avoids the much slower row-wise DataFrame.apply.
    df['drv_sexes'] = (df['drv_sex1'] + df['drv_sex2']).map(
        lambda letters: ''.join(sorted(letters)))
    # Row means skip NaN, so single-driver policies keep driver 1's value.
    df['drv_avg_age'] = df[['drv_age1', 'drv_age2']].mean(axis=1)
    df['drv_avg_lic'] = df[['drv_age_lic1', 'drv_age_lic2']].mean(axis=1)

    # Drop unnecessary cols
    df = df.drop(columns=['id_policy', 'drv_drv2', 'drv_sex2', 'drv_age2', 'drv_age_lic2',
                          'vh_make_model', 'pol_pay_freq', 'pol_no_claims_discount_first'])

    # One-hot encoding for categorical variables.
    # NOTE(review): the dummy columns depend on the categories present in this
    # particular frame, so frames wrangled separately may end up with
    # different column sets.
    cats = ['pol_coverage', 'pol_payd', 'pol_usage', 'drv_sex1', 'vh_fuel', 'vh_type',
            'drv_sexes']
    df = pd.get_dummies(df, prefix=cats, columns=cats)

    assert len(df) == original_len, (
        "first-year join changed the row count: {} -> {}".format(original_len, len(df)))
    return df

def normalize(df, normalizer=None):
    """Standardise the continuous columns of a wrangled feature frame.

    One-hot (dummy) columns and the discount-based columns listed in
    ``dont_normalize`` are passed through unchanged. The input frame is not
    modified; a scaled copy is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame, typically the output of ``wrangle``.
    normalizer : object with a ``transform`` method, optional
        An already fitted scaler to reuse (e.g. the one fitted on the training
        set). When ``None`` a new ``StandardScaler`` is fitted on ``df``.

    Returns
    -------
    (pandas.DataFrame, scaler)
        The scaled copy of ``df`` and the scaler that was applied, so it can
        be reused on other frames.
    """
    cats = ['pol_coverage', 'pol_pay_freq', 'pol_payd', 'pol_usage', 'drv_sex1', 'vh_fuel', 'vh_type',
            'drv_sexes']
    dont_normalize = ['pol_no_claims_discount', 'discount_base_change', 'discount_yearly_change',
                      'discount_change']

    def _is_categorical(column):
        # A dummy column is named '<categorical>_<value>'. Matching on the
        # prefix (rather than stripping only the last '_' token) also covers
        # category values that themselves contain underscores.
        return column in cats or any(column.startswith(cat + '_') for cat in cats)

    # Don't normalize categorical variables nor those in dont_normalize
    to_normalize = [column for column in df.columns
                    if column not in dont_normalize and not _is_categorical(column)]

    if normalizer is None:
        normalizer = StandardScaler().fit(df[to_normalize])

    # Write the scaled values into a copy so the caller's frame (often a slice
    # produced by train_test_split) is neither mutated nor triggers
    # SettingWithCopyWarning.
    scaled = df.copy()
    scaled[to_normalize] = normalizer.transform(scaled[to_normalize])
    return scaled, normalizer

# Feature engineering runs on the full table first: wrangle() reads the
# year-1 rows to attach each policy's first-year discount.
df = wrangle(df)

# Year-1 rows are then excluded from modelling, and `year` itself is not
# used as a feature. The target is filtered with the same condition so the
# two stay row-aligned.
y = y.loc[df['year'] != 1]
df = df.loc[df['year'] != 1].drop(columns=['year'])

# Random 80/20 hold-out with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    df, y, test_size=0.2, random_state=2021)

# Disabled alternatives kept for reference:
#   * hold out year 4 as the test set instead of a random split
#     (train on df.year != 4, test on df.year == 4; requires keeping `year`);
#   * scale the features with normalize(): fit on x_train, then reuse the
#     returned scaler on x_test.

In [401]:
# LinearRegression was used without ever being imported, so this cell raised
# NameError on a fresh kernel.
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline on the wrangled features.
reg = LinearRegression().fit(x_train, y_train)

# Negative predictions are floored at zero (presumably because a claim amount
# cannot be negative).
train_preds = np.clip(reg.predict(x_train), 0, None)
lr_test_preds = np.clip(reg.predict(x_test), 0, None)

# RMSE computed as sqrt(MSE): the `squared=False` shortcut is deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form works on every version.
train_error = np.sqrt(mean_squared_error(y_train, train_preds))
test_error = np.sqrt(mean_squared_error(y_test, lr_test_preds))

print("Train: {}\nTest: {}".format(train_error, test_error))

Train: 696.3628338194085
Test: 664.7418476669355


In [381]:
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV

# Ridge regression; RidgeCV picks the regularisation strength from this grid
# by its built-in cross-validation.
alphas = [0.001, 0.01, 0.05, 0.1, 0.15, 0.2, 0.4, 0.5, 0.8, 1, 1.5]
ridge = RidgeCV(alphas=alphas)
ridge.fit(x_train, y_train)

# Negative predictions are floored at zero.
# NOTE: `test_preds` is a shared name that the XGBoost cell also assigns;
# whichever of the two cells ran last determines its value.
train_preds = np.clip(ridge.predict(x_train), 0, None)
test_preds = np.clip(ridge.predict(x_test), 0, None)

# RMSE computed as sqrt(MSE): the `squared=False` shortcut is deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form works on every version.
train_error = np.sqrt(mean_squared_error(y_train, train_preds))
test_error = np.sqrt(mean_squared_error(y_test, test_preds))

print("Train: {}\nTest: {}".format(train_error, test_error))

Train: 696.368654728987
Test: 664.7460102457505


In [398]:
from sklearn.ensemble import RandomForestRegressor

# Random-forest hyper-parameters, kept as named values so they are easy to tune.
n_estimators = 100
max_depth = 8
max_features = 'log2'
min_samples_split = 200
max_leaf_nodes = None

reg = RandomForestRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    max_leaf_nodes=max_leaf_nodes,
    max_features=max_features,
    min_samples_split=min_samples_split,
    random_state=2021,  # fixed seed so the forest is reproducible
).fit(x_train, y_train)

# Negative predictions are floored at zero.
train_preds = np.clip(reg.predict(x_train), 0, None)
rf_test_preds = np.clip(reg.predict(x_test), 0, None)

# RMSE computed as sqrt(MSE): the `squared=False` shortcut is deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form works on every version.
train_error = np.sqrt(mean_squared_error(y_train, train_preds))
test_error = np.sqrt(mean_squared_error(y_test, rf_test_preds))

print("Train: {}\nTest: {}".format(train_error, test_error))

Train: 692.0164200246895
Test: 664.5876614766811


In [383]:
# List every training column next to the importance the fitted model in `reg`
# assigned to it, one "name = value" line per feature.
for column_and_score in zip(x_train.columns, reg.feature_importances_):
    print(*column_and_score, sep=" = ")

pol_no_claims_discount = 0.02185229886195946
pol_duration = 0.01934444875792556
pol_sit_duration = 0.03889296164747536
drv_age1 = 0.028830085165734477
drv_age_lic1 = 0.024833315610418412
vh_age = 0.13854436003960954
vh_speed = 0.03366538693796143
vh_value = 0.08559991610900552
vh_weight = 0.07577918785446702
population = 0.044777588654106276
town_surface_area = 0.04971922750832976
discount_base_change = 0.022147245343338794
discount_yearly_change = 0.024662015377118058
discount_change = 0.04192945396717478
no_claims = 0.008809912432687073
drv_avg_age = 0.043669803029470745
drv_avg_lic = 0.030686526476394464
pol_coverage_Max = 0.1182591170369297
pol_coverage_Med1 = 0.016472684307511847
pol_coverage_Med2 = 0.031038846304237453
pol_coverage_Min = 0.021211849291579012
pol_payd_No = 0.001726124335330552
pol_payd_Yes = 0.001787944437608641
pol_usage_AllTrips = 0.004909735822445731
pol_usage_Professional = 0.012411278402981814
pol_usage_Retired = 0.004674117428713538
pol_usage_WorkPrivate = 0

In [399]:
import xgboost as xgb

# Small, shallow gradient-boosted ensemble: 15 trees of depth 3.
reg = xgb.XGBRegressor(
    n_estimators=15,
    reg_lambda=0.001,
    gamma=1,
    max_depth=3
)

reg.fit(x_train, y_train)

# Negative predictions are floored at zero.
# NOTE: `test_preds` is a shared name that the Ridge cell also assigns;
# whichever of the two cells ran last determines its value.
train_preds = np.clip(reg.predict(x_train), 0, None)
test_preds = np.clip(reg.predict(x_test), 0, None)

# RMSE computed as sqrt(MSE): the `squared=False` shortcut is deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form works on every version.
train_error = np.sqrt(mean_squared_error(y_train, train_preds))
test_error = np.sqrt(mean_squared_error(y_test, test_preds))

print("Train: {}\nTest: {}".format(train_error, test_error))

Train: 693.3443824513934
Test: 664.6502019517762


In [385]:
# Label the importances with the columns of the matrix the model was actually
# fitted on (x_train) rather than `df`, which only matches as long as `df` is
# not modified after the train/test split.
# NOTE: `reg` is whichever model was fitted most recently (the XGBoost
# regressor when the notebook is run top to bottom).
for name, importance in zip(x_train.columns, reg.feature_importances_):
    print(name, "=", importance)

pol_no_claims_discount = 0.090079434
pol_duration = 0.0
pol_sit_duration = 0.0
drv_age1 = 0.0
drv_age_lic1 = 0.0
vh_age = 0.24531934
vh_speed = 0.0
vh_value = 0.10367675
vh_weight = 0.0
population = 0.0
town_surface_area = 0.0
discount_base_change = 0.0
discount_yearly_change = 0.0
discount_change = 0.06049785
no_claims = 0.0
drv_avg_age = 0.0
drv_avg_lic = 0.0
pol_coverage_Max = 0.50042665
pol_coverage_Med1 = 0.0
pol_coverage_Med2 = 0.0
pol_coverage_Min = 0.0
pol_payd_No = 0.0
pol_payd_Yes = 0.0
pol_usage_AllTrips = 0.0
pol_usage_Professional = 0.0
pol_usage_Retired = 0.0
pol_usage_WorkPrivate = 0.0
drv_sex1_F = 0.0
drv_sex1_M = 0.0
vh_fuel_Diesel = 0.0
vh_fuel_Gasoline = 0.0
vh_fuel_Hybrid = 0.0
vh_type_Commercial = 0.0
vh_type_Tourism = 0.0
drv_sexes_F = 0.0
drv_sexes_FF = 0.0
drv_sexes_FM = 0.0
drv_sexes_M = 0.0
drv_sexes_MM = 0.0


In [402]:
# Simple ensemble: unweighted mean of the random-forest, `test_preds` and
# linear-regression test predictions.
# NOTE: `test_preds` is assigned by both the Ridge and the XGBoost cells; when
# the notebook is run top to bottom it holds the XGBoost predictions.
ensemble_test_preds = (rf_test_preds + test_preds + lr_test_preds) / 3.0
a = ensemble_test_preds  # original short name, kept for any cell that still uses it

# RMSE computed as sqrt(MSE): the `squared=False` shortcut is deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form works on every version.
np.sqrt(mean_squared_error(y_test, ensemble_test_preds))

664.4390947437056