In [1]:
# Cell 1: imports & dataset
import sys
import pandas as pd
import numpy as np
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.models import default_feature_engineering, ModelTrainer, regression_report, classification_report_metrics

# Load preprocessed data
df = pd.read_csv("../data/clean/clean_portfolio_v2.csv", parse_dates=["TransactionMonth"], low_memory=False)
# compute KPIs if not present
if "has_claim" not in df.columns:
    from src.hypothesis_tests import compute_kpis
    df = compute_kpis(df)

# basic feature engineering
df = default_feature_engineering(df)
df.shape


  from .autonotebook import tqdm as notebook_tqdm


(981812, 66)

In [2]:
# Cell 2: choose features
# Select a compact but meaningful feature set (tweak as appropriate)
features = [
    "Province", "VehicleType", "Make", "Model", "Gender", "RegistrationYear",
    "SumInsured", "CalculatedPremiumPerTerm", "Kilowatts", "cubiccapacity", "NumberOfDoors", "VehicleAge"
]

# Ensure columns exist
features = [c for c in features if c in df.columns]
# categorize features
categorical = [c for c in features if df[c].dtype == "object" or df[c].dtype.name == "string"]
numeric = [c for c in features if c not in categorical]

print("features:", features)
print("categorical:", categorical)
print("numeric:", numeric)


features: ['Province', 'VehicleType', 'Make', 'Model', 'Gender', 'RegistrationYear', 'SumInsured', 'CalculatedPremiumPerTerm', 'cubiccapacity', 'NumberOfDoors', 'VehicleAge']
categorical: ['Province', 'VehicleType', 'Make', 'Model', 'Gender']
numeric: ['RegistrationYear', 'SumInsured', 'CalculatedPremiumPerTerm', 'cubiccapacity', 'NumberOfDoors', 'VehicleAge']


In [3]:
# Cell 3: severity model
# subset to rows with claims > 0
df_sev = df[df["has_claim"] == True].copy()
trainer = ModelTrainer(df_sev, features=features, categorical=categorical, numeric=numeric, target_reg="TotalClaims")
res_rf = trainer.train_regression(model_type="rf")
print("Severity RF metrics:", res_rf["metrics"])
res_xgb = trainer.train_regression(model_type="xgb")
print("Severity XGB metrics:", res_xgb["metrics"])
res_lin = trainer.train_regression(model_type="linear")
print("Severity Linear metrics:", res_lin["metrics"])


Severity RF metrics: {'rmse': 34494.16636811715, 'r2': 0.22164290356454752}
Severity XGB metrics: {'rmse': 38655.7360697038, 'r2': 0.022502902013075365}
Severity Linear metrics: {'rmse': 38539.20224200684, 'r2': 0.028387657440095415}


In [4]:
# Cell 4: classification (claim probability) on full dataset
trainer_full = ModelTrainer(df, features=features, categorical=categorical, numeric=numeric, target_claim_flag="has_claim")
clf_rf = trainer_full.train_classifier(model_type="rf")
print("Classifier RF metrics:", clf_rf["metrics"])
clf_xgb = trainer_full.train_classifier(model_type="xgb")
print("Classifier XGB metrics:", clf_xgb["metrics"])


Classifier RF metrics: {'accuracy': 0.9971786945605843, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'classification_report': {'0': {'precision': 0.9971939295172133, 'recall': 0.9999846791820726, 'f1-score': 0.9985873545281152, 'support': 195812.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 551.0}, 'accuracy': 0.9971786945605843, 'macro avg': {'precision': 0.49859696475860665, 'recall': 0.4999923395910363, 'f1-score': 0.4992936772640576, 'support': 196363.0}, 'weighted avg': {'precision': 0.9943957758163431, 'recall': 0.9971786945605843, 'f1-score': 0.995785290838189, 'support': 196363.0}}, 'confusion_matrix': [[195809, 3], [551, 0]], 'auc': 0.7060218331202012, 'classification_report_text': '              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00    195812\n           1       0.00      0.00      0.00       551\n\n    accuracy                           1.00    196363\n   macro avg       0.50      0.50      0.50    1963

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Classifier XGB metrics: {'accuracy': 0.9971939723878734, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'classification_report': {'0': {'precision': 0.9971939723878734, 'recall': 1.0, 'f1-score': 0.9985950149805571, 'support': 195812.0}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 551.0}, 'accuracy': 0.9971939723878734, 'macro avg': {'precision': 0.4985969861939367, 'recall': 0.5, 'f1-score': 0.49929750749027857, 'support': 196363.0}, 'weighted avg': {'precision': 0.994395818566707, 'recall': 0.9971939723878734, 'f1-score': 0.9957929297951897, 'support': 196363.0}}, 'confusion_matrix': [[195812, 0], [551, 0]], 'auc': 0.8888074955632654, 'classification_report_text': '              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00    195812\n           1       0.00      0.00      0.00       551\n\n    accuracy                           1.00    196363\n   macro avg       0.50      0.50      0.50    196363\nweighted avg       0.99  

In [None]:
# Cell 5: premium naive regression and risk-based premium
# naive premium predictor: predict CalculatedPremiumPerTerm (if exists)
if "CalculatedPremiumPerTerm" in df.columns:
    trainer_prem = ModelTrainer(df.dropna(subset=["CalculatedPremiumPerTerm"]), features=features, categorical=categorical, numeric=numeric, target_reg="CalculatedPremiumPerTerm")
    prem_rf = trainer_prem.train_regression(model_type="rf")
    print("Premium RF metrics (naive):", prem_rf["metrics"])

# compute risk-based premium using classifier and severity models (best performing keys)
classifier_key = clf_rf["model_key"]    # e.g., 'class_rf'
severity_key = res_xgb["model_key"]     # e.g., 'reg_xgb' from severity training (use the best)
X_for_premium = df[features].copy()
risk_prem = trainer_full.compute_risk_based_premium(classifier_key=classifier_key, severity_key=severity_key, X=X_for_premium, expense_loading=0.05, profit_margin=0.10)
risk_prem.describe()



KeyboardInterrupt



In [None]:
# Cell 6: feature importance and SHAP
# Feature importance for severity model (xgb or rf)
imp_df = trainer.feature_importance(res_xgb["model_key"])
display(imp_df.head(20))

# SHAP explain example on a small sample
X_sample = res_xgb["X_test"].sample(200, random_state=42)
explainer, shap_vals = trainer.explain_shap(res_xgb["model_key"], X_sample)
# summary plot (if running locally)
import matplotlib.pyplot as plt
shap.summary_plot(shap_vals, explainer.data if hasattr(explainer, "data") else X_sample, show=True)


In [None]:
# Cell 7: save models
trainer.save_model(res_xgb["model_key"], "models/severity_xgb.joblib")
trainer_full.save_model(clf_rf["model_key"], "models/claimprob_rf.joblib")
