In [37]:
from sklearn.metrics import mean_squared_error, confusion_matrix, ConfusionMatrixDisplay
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
df = pd.read_excel('2025 - BEST Hackathon - dataset.xlsx')

# Data preparation

In [None]:
prepared = df[df['POLICYHOLDER_AGE'] > 0].copy()
prepared.drop(columns=['POLICYHOLDER_GENDER', 'PREMIUM_AMOUNT_PAID', 'CLAIM_ID', 'VEHICLE_MODEL'], inplace=True)
prepared.loc[prepared['CLAIM_PROVINCE'] == 'VS', 'CLAIM_REGION'] = 'SARDEGNA'
prepared.loc[prepared['CLAIM_PROVINCE'] == 'PS', 'CLAIM_REGION'] = 'MARCHE'
prepared.loc[prepared['CLAIM_PROVINCE'] == 'CI', 'CLAIM_REGION'] = 'SARDEGNA'
prepared.loc[prepared['CLAIM_PROVINCE'] == 'SU', 'CLAIM_REGION'] = 'SARDEGNA'
prepared.loc[prepared['CLAIM_PROVINCE'] == 'EN', 'CLAIM_REGION'] = 'SICILIA'

prepared = prepared.loc[prepared['CLAIM_REGION'].notna() | prepared['CLAIM_PROVINCE'].notna(), :]
prepared = prepared.loc[prepared['VEHICLE_BRAND'].notna(), :]
prepared = prepared.loc[prepared['POLICYHOLDER_AGE'] < 100, :]
# prepared = prepared.loc[prepared[],:]

prepared['WARRANTY'] = prepared['WARRANTY'].astype('category').cat.codes
prepared['CLAIM_REGION'] = prepared['CLAIM_REGION'].astype('category').cat.codes
prepared['VEHICLE_BRAND'] = prepared['VEHICLE_BRAND'].astype('category').cat.codes
prepared['CLAIM_PROVINCE'] = prepared['CLAIM_PROVINCE'].astype('category').cat.codes
# prepared['VEHICLE_MODEL'] = prepared['VEHICLE_MODEL'].str.upper().astype('category').cat.codes

# prepared.to_csv('prepared.csv', index=False)

prepared.drop(columns=['CLAIM_DATE'], inplace=True)

prepared.dtypes

POLICYHOLDER_AGE       int64
WARRANTY                int8
CLAIM_REGION            int8
CLAIM_PROVINCE          int8
VEHICLE_BRAND          int16
VEHICLE_MODEL          int16
CLAIM_AMOUNT_PAID    float64
dtype: object

In [59]:
prepared.corr()

Unnamed: 0,POLICYHOLDER_AGE,WARRANTY,CLAIM_REGION,CLAIM_PROVINCE,VEHICLE_BRAND,VEHICLE_MODEL,CLAIM_AMOUNT_PAID
POLICYHOLDER_AGE,1.0,-0.076937,0.000162,0.003551,0.009937,0.034787,-0.004903
WARRANTY,-0.076937,1.0,-0.019473,0.03324,-0.017557,-0.022454,-0.300613
CLAIM_REGION,0.000162,-0.019473,1.0,0.204507,0.002499,0.00246,0.023572
CLAIM_PROVINCE,0.003551,0.03324,0.204507,1.0,0.000911,0.00242,0.029508
VEHICLE_BRAND,0.009937,-0.017557,0.002499,0.000911,1.0,0.119551,-0.008529
VEHICLE_MODEL,0.034787,-0.022454,0.00246,0.00242,0.119551,1.0,0.008413
CLAIM_AMOUNT_PAID,-0.004903,-0.300613,0.023572,0.029508,-0.008529,0.008413,1.0


In [6]:
# macchine = prepared[['VEHICLE_BRAND', 'VEHICLE_MODEL']]
# macchine.to_csv('macchine.csv', index=False)

In [None]:
# reg = prepared.loc[:, ['CLAIM_PROVINCE', 'CLAIM_REGION']].drop_duplicates()

In [None]:
# macc = prepared[['VEHICLE_MODEL']].copy()
# macc.loc[:, 'n'] = 1
# macc.loc[:, 'VEHICLE_MODEL'] = macc['VEHICLE_MODEL'].str.upper()
# macc = macc.groupby(['VEHICLE_MODEL']).count().sort_values('n', ascending=False)
# macc.loc[macc['n'] <= 5, :].sum()

# Models

In [55]:
X = prepared.iloc[:, 1:-1]
y = prepared.iloc[:, -1]

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

xgb_train = xgb.DMatrix(X_train, y_train, enable_categorical=True)
xgb_test = xgb.DMatrix(X_test, y_test, enable_categorical=True)

# Random forest

In [51]:
regr = RandomForestRegressor(random_state=42, n_jobs=-1)

In [57]:
regr.fit(X_train, y_train)

In [58]:
preds_forest = regr.predict(X_test)
mean_squared_error(y_test, preds_forest)

1728163.130376711

# XGBoost

In [34]:
# n = 1000
params = {
    "eta": 0.1,
    "max_depth": 50,
    "tree_method": "hist",
    "device": "cuda",  # switch between "cpu" and "GPU"
    # "nthread": 24,  # increase this to increase CPU threads
    "seed": 42
}

# model = xgb.XGBRegressor(enable_categorical=True, device='cuda')

# model = xgb.XGBRegressor(
#     n_estimators=1000,
#     enable_categorical=True,
#     device='cuda',
#     max_depth=3,
#     objective='reg:squarederror',
#     learning_rate=0.1,
#     early_stopping_rounds=5,
# )

In [35]:
model = xgb.train(params, xgb_train, 1000, evals=[(xgb_train, 'train'), (xgb_test, 'test')], early_stopping_rounds=10)

[0]	train-rmse:1429.62611	test-rmse:1435.94213
[1]	train-rmse:1380.56829	test-rmse:1394.72222
[2]	train-rmse:1339.15240	test-rmse:1360.80165
[3]	train-rmse:1304.27586	test-rmse:1333.25592
[4]	train-rmse:1274.95389	test-rmse:1310.82650
[5]	train-rmse:1250.40495	test-rmse:1292.76500
[6]	train-rmse:1229.90391	test-rmse:1278.30390
[7]	train-rmse:1212.82397	test-rmse:1266.83774
[8]	train-rmse:1198.60186	test-rmse:1257.77152
[9]	train-rmse:1186.77134	test-rmse:1250.78048
[10]	train-rmse:1176.89061	test-rmse:1245.32381
[11]	train-rmse:1168.68928	test-rmse:1241.20245
[12]	train-rmse:1161.86140	test-rmse:1238.19534
[13]	train-rmse:1156.21020	test-rmse:1235.97937
[14]	train-rmse:1151.49618	test-rmse:1234.46287
[15]	train-rmse:1147.56916	test-rmse:1233.53823
[16]	train-rmse:1144.29985	test-rmse:1233.06660
[17]	train-rmse:1141.57279	test-rmse:1232.91309
[18]	train-rmse:1139.29405	test-rmse:1232.93650
[19]	train-rmse:1137.38425	test-rmse:1233.06614
[20]	train-rmse:1135.78069	test-rmse:1233.37237
[2

In [36]:
preds = model.predict(xgb_test)
mean_squared_error(y_test, preds)

1531056.5816954747