In [None]:
import numpy as np
import pandas as pd
import pickle
import statsmodels.api as sm
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import xgboost as xg

In [16]:
# Load dataset
df = pd.read_csv("feature_selected_data.csv")
X = df.drop(columns=['ClaimFrequency', "isClaim"], axis=1)
y = df['ClaimFrequency']

In [17]:
# prepare GLM predictions
with open("glm_model.pkl", "rb") as f:
    glm_model = pickle.load(f)

with open("preprocessor_glm.pkl", "rb") as f:
    glm_preprocessor = pickle.load(f)

X_transformed = pd.DataFrame(glm_preprocessor.transform(X).toarray(), columns=glm_preprocessor.get_feature_names_out()) 

glm_columns = glm_model.model.exog_names
col_names_X = [x for x in glm_columns if not x.startswith("inflate_") and x != 'const']
col_names_X_infl = [x.replace("inflate_", "") for x in glm_columns if x.startswith("inflate_") and x != 'inflate_const']

X_glm = X_transformed[col_names_X]
X_glm_infl = X_transformed[col_names_X_infl]
glm_preds = glm_model.predict(exog=sm.add_constant(X_glm), exog_infl = sm.add_constant(X_glm_infl)) 

In [18]:
# prepare ML predictions
with open("best_classifier.pkl", "rb") as f:
    best_classifier = pickle.load(f)

with open("best_regressor.pkl", "rb") as f:
    best_regressor = pickle.load(f)

class_preds = best_classifier.predict(X)
ml_preds = np.where(class_preds == 1, best_regressor.predict(X), 0)

In [26]:
df

Unnamed: 0,VehPower,VehAge,DrivAge,Area,VehBrand,VehGas,Region,ClaimFrequency,VehPowerDriverAge,TransformBonusMalus,TransformDensity,isClaim,Prediction_Diff,Prediction_GLM,Prediction_ML
0,5.0,0.0,55.0,D,B12,Regular,R82,1.000000,0.090909,3.912023,7.104144,1,1.379472,0.096035,1.475507
1,5.0,0.0,55.0,D,B12,Regular,R82,1.298701,0.090909,3.912023,7.104144,1,1.379472,0.096035,1.475507
2,6.0,2.0,52.0,B,B12,Diesel,R22,1.333333,0.115385,3.912023,3.988984,1,0.067744,0.067744,0.000000
3,7.0,0.0,46.0,B,B12,Diesel,R72,1.190476,0.152174,3.912023,4.330733,1,1.603318,0.056279,1.659596
4,6.0,2.0,38.0,E,B12,Regular,R31,1.923077,0.157895,3.912023,8.007367,1,0.053822,0.053822,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
677997,4.0,0.0,54.0,E,B12,Regular,R93,0.000000,0.074074,3.912023,8.106816,0,1.449262,0.080859,1.530120
677998,4.0,0.0,41.0,E,B12,Regular,R11,0.000000,0.097561,4.553877,9.195227,0,1.251910,0.186562,1.438472
677999,6.0,2.0,45.0,D,B12,Diesel,R82,0.000000,0.133333,3.912023,7.187657,0,0.075685,0.075685,0.000000
678000,4.0,0.0,60.0,B,B12,Regular,R26,0.000000,0.066667,3.912023,4.553877,0,0.076140,0.076140,0.000000


In [30]:
# Compute absolute prediction differences
df['Prediction_GLM'] = glm_preds
df['Prediction_ML'] = ml_preds
df['Prediction_Diff'] = np.abs(glm_preds - ml_preds)

# Identify customer segments with largest differences
categorical_features = X.select_dtypes(include=['object']).columns
for i in categorical_features:
    segment_differences = df.groupby([i])['Prediction_Diff'].mean().sort_values(ascending=False)
    print(segment_differences.head(10))




Area
F    0.267930
A    0.243156
D    0.232958
C    0.232858
E    0.232439
B    0.220872
Name: Prediction_Diff, dtype: float64
VehBrand
B1     0.282405
B2     0.273378
B5     0.223480
B3     0.211044
B12    0.208371
B4     0.166063
B6     0.163998
B11    0.147842
B10    0.135946
B13    0.134165
Name: Prediction_Diff, dtype: float64
VehGas
Regular    0.243219
Diesel     0.224337
Name: Prediction_Diff, dtype: float64
Region
R24    0.384500
R82    0.277802
R53    0.266724
R11    0.254541
R42    0.202896
R74    0.192122
R94    0.176433
R93    0.173473
R52    0.161825
R22    0.148004
Name: Prediction_Diff, dtype: float64


In [28]:
segment_differences = df.groupby(["VehPower"])['Prediction_Diff'].mean().sort_values(ascending=False)
print(segment_differences.head(10))

VehPower
6.0     0.308808
5.0     0.263459
7.0     0.252936
9.0     0.187441
10.0    0.186393
4.0     0.171652
8.0     0.153575
11.0    0.137231
13.0    0.136328
14.0    0.134456
Name: Prediction_Diff, dtype: float64


In [None]:
drivage_bins = list(range(18, 98, 10)) 
drivage_labels = [f"{start}-{start+9}" for start in drivage_bins[:-1]]

vehage_bins = list(range(0, 30, 5)) 
vehage_labels = [f"{start}-{start+4}" for start in vehage_bins[:-1]]

df["DrivAge_cat"] = pd.cut(df["DrivAge"], bins=drivage_bins, labels=drivage_labels, right=False)
df["VehAge_cat"] = pd.cut(df["VehAge"], bins=vehage_bins, labels=vehage_labels, right=False)

   DrivAge DrivAge_cat  VehAge VehAge_cat
0     55.0       48-57     0.0        0-4
1     55.0       48-57     0.0        0-4
2     52.0       48-57     2.0        0-4
3     46.0       38-47     0.0        0-4
4     38.0       38-47     2.0        0-4


In [32]:
segment_differences = df.groupby(["DrivAge_cat"])['Prediction_Diff'].mean().sort_values(ascending=False)
print(segment_differences.head(10))

DrivAge_cat
78-87    0.342025
68-77    0.327372
48-57    0.311176
58-67    0.287178
18-27    0.255935
38-47    0.194373
28-37    0.135145
Name: Prediction_Diff, dtype: float64


  segment_differences = df.groupby(["DrivAge_cat"])['Prediction_Diff'].mean().sort_values(ascending=False)


In [33]:
segment_differences = df.groupby(["VehAge_cat"])['Prediction_Diff'].mean().sort_values(ascending=False)
print(segment_differences.head(10))

VehAge_cat
5-9      0.290755
10-14    0.243019
0-4      0.225890
15-19    0.114538
20-24    0.095566
Name: Prediction_Diff, dtype: float64


  segment_differences = df.groupby(["VehAge_cat"])['Prediction_Diff'].mean().sort_values(ascending=False)
