In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import re

In [87]:
# Load the raw training table, then split the target off the features:
# `pop` returns the `claim_amount` column and removes it from `df`.
df = pd.read_csv('data/training.csv')
y = df.pop('claim_amount')

def wrangle(df):
    """Build the model feature table from the raw policy rows.

    The input frame is copied first, so the caller's frame is never modified.

    Processing steps, in order:
      1. zero ``vh_weight`` values are replaced by the mean of the positive
         weights, then NaNs in ``vh_age``/``vh_speed``/``vh_value``/``vh_weight``
         are filled with the column mean;
      2. each row is joined (left join on ``id_policy``) with the
         ``pol_no_claims_discount`` of that policy's ``year == 1.0`` row;
      3. discount-derived, driver-pair and population-density features are
         added;
      4. identifier / helper columns are dropped and the categorical columns
         are one-hot encoded with ``pd.get_dummies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows containing every column referenced in the body (``year``,
        ``id_policy``, ``pol_*``, ``drv_*``, ``vh_*``, ``population``,
        ``town_surface_area``).

    Returns
    -------
    pandas.DataFrame
        A new frame with the same number of rows as ``df``. ``year`` is kept;
        dropping it is left to the caller.

    Raises
    ------
    AssertionError
        If the first-year join changed the row count (for example when a
        policy has more than one ``year == 1.0`` row).
    """
    original_len = len(df)

    # Work on a copy: the imputation below assigns columns, and without the
    # copy those assignments would land on the caller's frame.
    df = df.copy()

    # Replace 0 vehicle weight with the mean of the positive weights
    positive_weight_mean = df.loc[df.vh_weight > 0, 'vh_weight'].mean()
    df['vh_weight'] = df.vh_weight.replace(0.0, positive_weight_mean)

    # Replace NaNs with column mean
    nans = ['vh_age', 'vh_speed', 'vh_value', 'vh_weight']
    df[nans] = df[nans].fillna(df[nans].mean())

    # Join first year data; rows whose policy has no year-1 row get NaN in
    # `pol_no_claims_discount_first`.
    first_year = df.loc[df.year == 1.0, ['id_policy', 'pol_no_claims_discount']]
    df = df.merge(first_year, on='id_policy', suffixes=('', '_first'), how='left')

    # Constants taken over unchanged from the original analysis.
    base_discount = 0.631  # "beginning discount level" -- TODO confirm origin
    discount_step = 0.2    # discount change treated as one claim -- TODO confirm

    # Change from beginning discount level
    df['discount_base_change'] = df.pol_no_claims_discount - base_discount
    # Yearly discount change over licence ownership
    # NOTE(review): a `drv_age_lic1` of 0 yields inf/NaN here -- confirm the
    # data never contains it.
    df['discount_yearly_change'] = df.discount_base_change / df.drv_age_lic1

    # Discount change from policy beginning
    df['discount_change'] = df.pol_no_claims_discount - df.pol_no_claims_discount_first
    # Approx. no. of claims since first year (never below zero)
    df['no_claims'] = np.ceil(df.discount_change / discount_step).clip(lower=0)

    # Driver 1 and 2 combined info: '0' in drv_sex2 becomes '', so the sorted
    # concatenation yields e.g. 'F', 'M', 'FF', 'FM', 'MM'.
    df['drv_sex2'] = df.drv_sex2.replace('0', '')
    df['drv_sexes'] = (df.drv_sex1 + df.drv_sex2).map(lambda pair: ''.join(sorted(pair)))
    # Row-wise means skip NaN, so a missing second driver leaves driver 1's value.
    df['drv_avg_age'] = df[['drv_age1', 'drv_age2']].mean(axis=1)
    df['drv_avg_lic'] = df[['drv_age_lic1', 'drv_age_lic2']].mean(axis=1)

    # Population density
    df['pop_dens'] = df.population / df.town_surface_area

    # Drop unnecessary cols
    df = df.drop(columns=['id_policy', 'drv_drv2', 'drv_sex2', 'drv_age2', 'drv_age_lic2',
                          'vh_make_model', 'pol_pay_freq', 'pol_no_claims_discount_first'])

    # One-hot encoding for categorical variables
    cats = ['pol_coverage', 'pol_payd', 'pol_usage', 'drv_sex1', 'vh_fuel', 'vh_type',
            'drv_sexes']
    df = pd.get_dummies(df, prefix=cats, columns=cats)

    assert len(df) == original_len, "first-year join changed the number of rows"
    return df

def normalize(df, normalizer=None):
    """Scale the continuous feature columns and return a new frame.

    A column is left unscaled when it is listed in ``dont_normalize`` or when
    its name minus the last ``_``-separated token is one of ``cats`` (i.e. it
    looks like a ``pd.get_dummies`` column such as ``pol_coverage_Max``).

    The input frame is not modified; the scaled values are written to a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table, e.g. the output of ``wrangle``.
    normalizer : object with ``transform``, optional
        Already-fitted scaler to reuse (pass the one returned for the training
        set when scaling the test set). When ``None``, a ``StandardScaler`` is
        fitted on the selected columns of ``df``.

    Returns
    -------
    (pandas.DataFrame, normalizer)
        The scaled copy of ``df`` and the scaler that was used.
    """
    cats = ['pol_coverage', 'pol_pay_freq', 'pol_payd', 'pol_usage', 'drv_sex1', 'vh_fuel', 'vh_type',
            'drv_sexes']
    dont_normalize = ['pol_no_claims_discount', 'discount_base_change', 'discount_yearly_change',
                      'discount_change']

    # Don't normalize categorical (one-hot) variables nor those in dont_normalize.
    # rpartition('_')[0] is the name without its last '_'-token ('' if no '_').
    to_normalize = [col for col in df.columns
                    if col not in dont_normalize and col.rpartition('_')[0] not in cats]

    if normalizer is None:
        normalizer = StandardScaler().fit(df[to_normalize])

    # Copy before assigning so the caller's frame (often a train_test_split
    # slice) is left untouched.
    scaled = df.copy()
    scaled[to_normalize] = normalizer.transform(df[to_normalize])
    return scaled, normalizer

# Engineer the features, then discard `year`: wrangle() needs it for the
# first-year join, but it is not used as a model input.
df = wrangle(df).drop(columns=['year'])

# Random 80/20 hold-out split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    df, y, test_size=0.2, random_state=2021
)

# Disabled experiments, kept for reference. The first two read `df.year`, so
# they only work if run before `year` is dropped above.
#
#   Exclude first-year rows:
#     y = y[df.year != 1]
#     df = df[df.year != 1]
#
#   Hold out year 4 instead of a random split:
#     x_train = df[df.year!=4]
#     y_train = y[df.year!=4]
#     x_test = df[df.year==4]
#     y_test = y[df.year==4]
#
#   Scale features (fit on train, reuse on test):
#     x_train, norm = normalize(x_train)
#     x_test, blah = normalize(x_test, norm)

In [88]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares on the features as produced by wrangle().
reg = LinearRegression().fit(x_train, y_train)

# Negative predictions are floored at 0 (presumably because a claim amount
# cannot be negative -- confirm against the data).
train_preds = np.clip(reg.predict(x_train), 0, None)
lr_test_preds = np.clip(reg.predict(x_test), 0, None)

# RMSE as sqrt(MSE). The `squared=False` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6; this form works on every
# version.
train_error = np.sqrt(mean_squared_error(y_train, train_preds))
test_error = np.sqrt(mean_squared_error(y_test, lr_test_preds))

print("Train: {}\nTest: {}".format(train_error, test_error))

Train: 733.7015619729996
Test: 674.9681929709905


In [89]:
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV

# Ridge regression; RidgeCV picks the regularisation strength from `alphas`
# by cross-validation on the training set.
alphas = [0.001, 0.01, 0.05, 0.1, 0.15, 0.2, 0.4, 0.5, 0.8, 1, 1.5]
ridge = RidgeCV(alphas=alphas)
ridge.fit(x_train, y_train)

# Negative predictions are floored at 0.
train_preds = np.clip(ridge.predict(x_train), 0, None)
# `test_preds` (the Ridge test predictions) is read again by the blending cells.
test_preds = np.clip(ridge.predict(x_test), 0, None)

# RMSE as sqrt(MSE): `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6, whereas this form works on every version.
train_error = np.sqrt(mean_squared_error(y_train, train_preds))
test_error = np.sqrt(mean_squared_error(y_test, test_preds))

print("Train: {}\nTest: {}".format(train_error, test_error))

Train: 733.7021775033223
Test: 674.9912326196772


In [90]:
from sklearn.ensemble import RandomForestRegressor

# Forest hyper-parameters
n_estimators = 150
max_depth = 12
max_features = 'log2'
min_samples_split = 200
max_leaf_nodes = None

# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples and feature sub-sampling -- and therefore the scores and feature
# importances -- are reproducible across runs.
reg = RandomForestRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    max_leaf_nodes=max_leaf_nodes,
    max_features=max_features,
    min_samples_split=min_samples_split,
    bootstrap=True,
    random_state=2021,
).fit(x_train, y_train)

# Negative predictions are floored at 0.
train_preds = np.clip(reg.predict(x_train), 0, None)
rf_test_preds = np.clip(reg.predict(x_test), 0, None)

# RMSE as sqrt(MSE): `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6, whereas this form works on every version.
train_error = np.sqrt(mean_squared_error(y_train, train_preds))
test_error = np.sqrt(mean_squared_error(y_test, rf_test_preds))

print("Train: {}\nTest: {}".format(train_error, test_error))

Train: 722.5819187855234
Test: 675.0912266326844


In [91]:
# Print one "<feature> = <importance>" line per training column for the model
# currently bound to `reg`. The importances are looked up once, before the loop.
importances = reg.feature_importances_
for feature, weight in zip(x_train.columns, importances):
    print(feature, "=", weight)

pol_no_claims_discount = 0.017800710076417967
pol_duration = 0.06193091702156504
pol_sit_duration = 0.0287562402152678
drv_age1 = 0.041109266238808354
drv_age_lic1 = 0.03315155399931589
vh_age = 0.08977832284098487
vh_speed = 0.030130355225949423
vh_value = 0.07987693283907299
vh_weight = 0.06355765099377612
population = 0.03304257498528113
town_surface_area = 0.07139374235792058
discount_base_change = 0.019046468750082337
discount_yearly_change = 0.04276612547125053
discount_change = 0.029768494263464063
no_claims = 0.007014586537455016
drv_avg_age = 0.0528665290285469
drv_avg_lic = 0.03384663228950876
pop_dens = 0.08716321251273039
pol_coverage_Max = 0.06232204881311778
pol_coverage_Med1 = 0.007705612463177845
pol_coverage_Med2 = 0.020990502425692528
pol_coverage_Min = 0.014771397593092282
pol_payd_No = 0.0012642949904980731
pol_payd_Yes = 0.0012120133192975884
pol_usage_AllTrips = 0.003980929041167347
pol_usage_Professional = 0.008603260334895248
pol_usage_Retired = 0.00408887565651

In [92]:
import xgboost as xgb

# Small, shallow boosted ensemble: 15 trees of depth 3.
reg = xgb.XGBRegressor(
    n_estimators=15,
    reg_lambda=0.001,
    gamma=1,
    max_depth=3
)

reg.fit(x_train, y_train)

# Negative predictions are floored at 0.
xg_train_preds = np.clip(reg.predict(x_train), 0, None)
xg_test_preds = np.clip(reg.predict(x_test), 0, None)

# RMSE as sqrt(MSE): `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6, whereas this form works on every version.
train_error = np.sqrt(mean_squared_error(y_train, xg_train_preds))
test_error = np.sqrt(mean_squared_error(y_test, xg_test_preds))

print("Train: {}\nTest: {}".format(train_error, test_error))

Train: 730.9118827303463
Test: 675.5593009529223


In [93]:
# Print one "<feature> = <importance>" line for the model currently bound to
# `reg`. Names come from `x_train` -- the frame the model was fitted on --
# rather than `df`, so the labels stay correct even if `df` is changed after
# the split.
for name, importance in zip(x_train.columns, reg.feature_importances_):
    print(name, "=", importance)

pol_no_claims_discount = 0.03141623
pol_duration = 0.16082229
pol_sit_duration = 0.047738925
drv_age1 = 0.02446534
drv_age_lic1 = 0.0
vh_age = 0.06508392
vh_speed = 0.014568703
vh_value = 0.08143593
vh_weight = 0.038490303
population = 0.015958367
town_surface_area = 0.01719497
discount_base_change = 0.0
discount_yearly_change = 0.01768867
discount_change = 0.023430578
no_claims = 0.0
drv_avg_age = 0.014936874
drv_avg_lic = 0.021794667
pop_dens = 0.022748753
pol_coverage_Max = 0.33590415
pol_coverage_Med1 = 0.0
pol_coverage_Med2 = 0.0
pol_coverage_Min = 0.015666245
pol_payd_No = 0.0
pol_payd_Yes = 0.0
pol_usage_AllTrips = 0.010912179
pol_usage_Professional = 0.026105046
pol_usage_Retired = 0.0
pol_usage_WorkPrivate = 0.0
drv_sex1_F = 0.0
drv_sex1_M = 0.0
vh_fuel_Diesel = 0.013637887
vh_fuel_Gasoline = 0.0
vh_fuel_Hybrid = 0.0
vh_type_Commercial = 0.0
vh_type_Tourism = 0.0
drv_sexes_F = 0.0
drv_sexes_FF = 0.0
drv_sexes_FM = 0.0
drv_sexes_M = 0.0
drv_sexes_MM = 0.0


In [94]:
# Blend: plain average of the random-forest and Ridge test predictions.
# `test_preds` holds the Ridge predictions at this point; the gradient-boosting
# cell later rebinds the name, so run this cell before that one.
a = (rf_test_preds + test_preds) / 2.0
# RMSE as sqrt(MSE): `squared=False` was removed in scikit-learn 1.6.
np.sqrt(mean_squared_error(y_test, a))

674.8572369219102

In [95]:
# Blend: element-wise minimum of the random-forest and Ridge test predictions.
# `test_preds` holds the Ridge predictions at this point; the gradient-boosting
# cell later rebinds the name, so run this cell before that one.
b = np.minimum(rf_test_preds, test_preds)
# RMSE as sqrt(MSE): `squared=False` was removed in scikit-learn 1.6.
np.sqrt(mean_squared_error(y_test, b))

674.8610697715943

In [96]:
from sklearn.ensemble import GradientBoostingRegressor

# Boosted decision stumps (max_depth=1) with a fixed seed.
# NOTE(review): loss='ls' and criterion='mse' are the pre-1.0 scikit-learn
# spellings; they were renamed to 'squared_error' in 1.0 and the old names were
# removed in 1.2. They are kept so the cell still runs on the version that
# produced the recorded output -- switch to 'squared_error' when upgrading.
reg = GradientBoostingRegressor(
    n_estimators=65,
    learning_rate=0.5,
    max_depth=1,
    random_state=2021,
    loss='ls',
    criterion='mse',
).fit(x_train, y_train)

# Negative predictions are floored at 0.
train_preds = np.clip(reg.predict(x_train), 0, None)
# This rebinds `test_preds`, which previously held the Ridge predictions.
test_preds = np.clip(reg.predict(x_test), 0, None)

# RMSE as sqrt(MSE): `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6, whereas this form works on every version.
train_error = np.sqrt(mean_squared_error(y_train, train_preds))
test_error = np.sqrt(mean_squared_error(y_test, test_preds))

print("Train: {}\nTest: {}".format(train_error, test_error))

Train: 733.3463944876357
Test: 674.8719602573025
