# Task 4: Predictive Modeling for Risk-Based Pricing

This notebook builds and evaluates predictive models to support dynamic, risk-based insurance pricing. The workflow is modular: data preparation, model training, evaluation, and SHAP plotting are delegated to utility functions in `src/utils/task4_utils.py`, which keeps the notebook itself short and the code reusable.

In [1]:
# --- Standard library ---
import importlib
import os
import sys

# --- Third-party ---
import matplotlib.pyplot as plt
import pandas as pd
import shap

# Make the project's `src` directory importable so `utils.task4_utils` resolves.
# The absolute path keeps the entry valid even if the working directory changes
# later, and the membership checks stop re-runs of this cell from stacking
# duplicate entries on sys.path.
_src_dir = os.path.abspath('../src')
if _src_dir not in sys.path:
    sys.path.insert(0, _src_dir)
if '../src/utils' not in sys.path:
    sys.path.append('../src/utils')

# Reload BEFORE binding the helper names. `from module import name` copies the
# function objects that exist at import time, so names imported ahead of a
# reload would keep pointing at the stale, pre-edit code.
import utils.task4_utils as t4u
t4u = importlib.reload(t4u)

from utils.task4_utils import (
    prepare_data, train_linear_regression, train_random_forest, train_xgboost,
    regression_metrics, classification_metrics, get_feature_importance, shap_summary_plot,
    filter_flat_numeric_columns,
)

## Data Loading and Preparation
- Loads data
- Handles missing values and encodes categoricals
- Performs feature engineering
- Splits into train/test sets


In [2]:
# Load the pipe-delimited source file.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The saved output of this cell shows a warning
# pointing at this read_csv call (presumably pandas' mixed-type DtypeWarning —
# confirm); whole-file inference avoids columns that mix Python types.
df = pd.read_csv('../data/MachineLearningRating_v3.txt', sep='|', low_memory=False)

# For claim severity model: only use rows where TotalClaims > 0.
# .copy() gives an independent frame rather than a view onto `df`.
df_claims = df[df['TotalClaims'] > 0].copy()

# Prepare data for regression (claim severity).
# Identifier, date, and premium columns handed to prepare_data via drop_cols
# (presumably removed from the feature matrix — see prepare_data).
drop_cols = ['UnderwrittenCoverID', 'PolicyID', 'TransactionMonth', 'TotalPremium', 'CalculatedPremiumPerTerm']
# Do NOT include 'TotalClaims' in drop_cols: it is the target and is passed
# separately through `target=`.
X_train, X_test, y_train, y_test = prepare_data(df_claims, target='TotalClaims', drop_cols=drop_cols, regression=True)
print(df_claims.columns)

  df = pd.read_csv('../data/MachineLearningRating_v3.txt', sep='|')


Index(['UnderwrittenCoverID', 'PolicyID', 'TransactionMonth',
       'IsVATRegistered', 'Citizenship', 'LegalType', 'Title', 'Language',
       'Bank', 'AccountType', 'MaritalStatus', 'Gender', 'Country', 'Province',
       'PostalCode', 'MainCrestaZone', 'SubCrestaZone', 'ItemType', 'mmcode',
       'VehicleType', 'RegistrationYear', 'make', 'Model', 'Cylinders',
       'cubiccapacity', 'kilowatts', 'bodytype', 'NumberOfDoors',
       'VehicleIntroDate', 'CustomValueEstimate', 'AlarmImmobiliser',
       'TrackingDevice', 'CapitalOutstanding', 'NewVehicle', 'WrittenOff',
       'Rebuilt', 'Converted', 'CrossBorder', 'NumberOfVehiclesInFleet',
       'SumInsured', 'TermFrequency', 'CalculatedPremiumPerTerm',
       'ExcessSelected', 'CoverCategory', 'CoverType', 'CoverGroup', 'Section',
       'Product', 'StatutoryClass', 'StatutoryRiskType', 'TotalPremium',
       'TotalClaims'],
      dtype='object')


## Model Building: Claim Severity (Regression)
Train and evaluate Linear Regression, Random Forest, and XGBoost models. Compare RMSE and R2.


In [3]:
# --- XGBoost -----------------------------------------------------------------
# The test frame is first passed through filter_flat_numeric_columns and then
# restricted/reordered to the feature list reported by the fitted model, so the
# columns line up with what the model was trained on.
xgb_model = train_xgboost(X_train, y_train, regression=True)
X_test_filtered = filter_flat_numeric_columns(X_test)[list(xgb_model.feature_names_in_)]
xgb_pred = xgb_model.predict(X_test_filtered)
xgb_metrics = regression_metrics(y_test, xgb_pred)
print('XGBoost:', xgb_metrics)

# --- Linear Regression -------------------------------------------------------
# Fit on the training split, score on the untouched test split.
lr_model = train_linear_regression(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_metrics = regression_metrics(y_test, lr_pred)
print('Linear Regression:', lr_metrics)

# --- Random Forest -----------------------------------------------------------
# Same fit -> predict -> score -> report sequence as above; rf_model is reused
# by the SHAP interpretation cell.
rf_model = train_random_forest(X_train, y_train, regression=True)
rf_pred = rf_model.predict(X_test)
rf_metrics = regression_metrics(y_test, rf_pred)
print('Random Forest:', rf_metrics)



XGBoost: {'RMSE': np.float64(39834.462843018184), 'R2': np.float64(0.013345926347972825)}
Linear Regression: {'RMSE': np.float64(37213.81225950447), 'R2': np.float64(0.13889659295005807)}
Random Forest: {'RMSE': np.float64(37078.7649885877), 'R2': np.float64(0.1451350647989721)}




## Model Interpretation (SHAP)
Interpret the best-performing regression model (Random Forest, which has the lowest RMSE and the highest R2 in the results above) using SHAP to identify top features.


In [None]:
# Draw the SHAP summary plot for the Random Forest regressor (rf_model).
# Use the exact preprocessed training DataFrame for SHAP: X_train_prepared is
# just another name for X_train (plain assignment, no copy), so the explainer
# receives the same object that was passed to train_random_forest.
X_train_prepared = X_train
shap_summary_plot(rf_model, X_train_prepared)



## Premium Optimization: Claim Probability Model (Classification)
Train and evaluate models to predict the probability of a claim.


In [None]:
# Binary target: True for rows with a positive claim amount.
df['HasClaim'] = df['TotalClaims'] > 0

# Columns excluded from the classifier's inputs. 'TotalClaims' is listed because
# HasClaim is computed directly from it, so keeping it would leak the target.
# The target itself ('HasClaim') is deliberately NOT listed: it is supplied via
# `target=`, following the same convention as the regression call, where the
# target must not appear in drop_cols.
drop_cols_class = ['UnderwrittenCoverID', 'PolicyID', 'TransactionMonth', 'TotalPremium', 'CalculatedPremiumPerTerm', 'TotalClaims']
X_train_c, X_test_c, y_train_c, y_test_c = prepare_data(df, target='HasClaim', drop_cols=drop_cols_class, regression=False)

# Random Forest Classifier
rf_clf = train_random_forest(X_train_c, y_train_c, regression=False)
rf_clf_pred = rf_clf.predict(X_test_c)
rf_clf_metrics = classification_metrics(y_test_c, rf_clf_pred)
print('Random Forest Classifier:', rf_clf_metrics)

# XGBoost Classifier (filter and align test set columns).
# Mirrors the regression cell: the test frame is filtered and then restricted
# to the feature list reported by the fitted model before predicting, so the
# columns match what train_xgboost actually fitted on.
xgb_clf = train_xgboost(X_train_c, y_train_c, regression=False)
X_test_c_filtered = filter_flat_numeric_columns(X_test_c)
X_test_c_filtered = X_test_c_filtered[list(xgb_clf.feature_names_in_)]
xgb_clf_pred = xgb_clf.predict(X_test_c_filtered)
xgb_clf_metrics = classification_metrics(y_test_c, xgb_clf_pred)
print('XGBoost Classifier:', xgb_clf_metrics)


## Model Interpretation (SHAP)
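Interpret the claim-probability classifier using SHAP to identify top features. The cell below explains the Random Forest classifier; if XGBoost scores better in the metrics above, pass that model instead.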


In [None]:
# Draw the SHAP summary plot for the Random Forest classifier (rf_clf) using
# its training features.
# NOTE(review): X_train_c is derived from the full dataset rather than the
# claims-only subset, so this call may be slow unless shap_summary_plot
# subsamples internally — confirm before running.
shap_summary_plot(rf_clf, X_train_c)
