In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import shap
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('..\\Data\\cleaned_df.csv')
df.head()

  df = pd.read_csv('..\\Data\\cleaned_df.csv')


Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,...,ExcessSelected,CoverCategory,CoverType,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims
0,145249,12827,2015-03-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
1,145249,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
2,145249,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0
3,145255,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,512.84807,0.0
4,145255,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0


In [3]:
# Add a dummy 'HasClaim' column if not present
def add_claim_flags(df):
    if 'HasClaim' not in df.columns:
        df['HasClaim'] = df['TotalClaims'] > 0
    return df

In [4]:
df_claim = add_claim_flags(df)

In [5]:
# Automatically select categorical and numerical columns
categorical_cols = df_claim.select_dtypes(include=['object', 'category']).columns
numerical_cols = df_claim.select_dtypes(include=['int64', 'float64']).columns

In [6]:
numerical_cols

Index(['UnderwrittenCoverID', 'PolicyID', 'PostalCode', 'mmcode',
       'RegistrationYear', 'Cylinders', 'cubiccapacity', 'kilowatts',
       'NumberOfDoors', 'SumInsured', 'CalculatedPremiumPerTerm',
       'TotalPremium', 'TotalClaims'],
      dtype='object')

In [7]:
# Remove 'TotalClaims' from numerical columns if it exists
if 'TotalClaims' in numerical_cols:
    numerical_cols = numerical_cols.drop('TotalClaims')

In [8]:
numerical_cols

Index(['UnderwrittenCoverID', 'PolicyID', 'PostalCode', 'mmcode',
       'RegistrationYear', 'Cylinders', 'cubiccapacity', 'kilowatts',
       'NumberOfDoors', 'SumInsured', 'CalculatedPremiumPerTerm',
       'TotalPremium'],
      dtype='object')

In [9]:
# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # fill missing with mode
    ('encoder', OneHotEncoder(handle_unknown='ignore'))    # one-hot encode
])

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),         # fill missing with median
    ('scaler', StandardScaler())                           # scale
])


In [10]:
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])


In [11]:
X = df_claim.drop('TotalClaims', axis=1)
y = df_claim['TotalClaims']

In [12]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Force all categorical columns to string type
X_train[categorical_cols] = X_train[categorical_cols].astype(str)
X_test[categorical_cols] = X_test[categorical_cols].astype(str)

### Liner model prediction

In [28]:
pipeline_lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [29]:
pipeline_lr.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [None]:
# Predict on test set
y_pred = pipeline_lr.predict(X_test)  # or pipeline_rf.predict(X_test)

# RMSE
rmse = mean_squared_error(y_test, y_pred)

# R²
r2 = r2_score(y_test, y_pred)

# Print results
print("Evaluation Metrics:")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.4f}")

Evaluation Metrics:
RMSE: 4864407.97
R² Score: 0.0045


### Random Forest

In [14]:
pipeline_r = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(
        n_estimators=50,
        max_depth=6,
        n_jobs=-1,
        random_state=42
    ))
])

In [None]:
pipeline_r.fit(X_train, y_train)

In [16]:
# Predict on test set
y_pred = pipeline_r.predict(X_test)  # or pipeline_rf.predict(X_test)

# RMSE
rmse = mean_squared_error(y_test, y_pred)

# R²
r2 = r2_score(y_test, y_pred)

# Print results
print("Evaluation Metrics:")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.4f}")

NotFittedError: Pipeline is not fitted yet.

### XGboost

In [None]:
pipeline_xg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('xgb', XGBRegressor())
])

In [None]:
pipeline_xg.fit(X_train, y_train)

In [None]:
# Predict on test set
y_pred = pipeline_xg.predict(X_test)  # or pipeline_rf.predict(X_test)

# RMSE
rmse = mean_squared_error(y_test, y_pred)

# R²
r2 = r2_score(y_test, y_pred)

# Print results
print("Evaluation Metrics:")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.4f}")