In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor


In [None]:
# Locate ../data/train.csv relative to the current working directory and load it.
DATA_PATH = Path().resolve().joinpath("..", "data", "train.csv").resolve()
df = pd.read_csv(DATA_PATH)

# Model log(1 + loss) instead of the raw loss.
target_col = "loss"
df["log_loss"] = np.log1p(df[target_col])

# Feature columns are picked purely by name prefix: "cat*" and "cont*".
cat_cols = df.columns[df.columns.str.startswith("cat")].tolist()
cont_cols = df.columns[df.columns.str.startswith("cont")].tolist()

# Categorical columns first, continuous columns after.
X = df[[*cat_cols, *cont_cols]]
y = df["log_loss"]

print("X shape:", X.shape, "y shape:", y.shape)


In [None]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, X_val.shape)


In [None]:
# Shared feature preprocessing:
#   * "cat"  -> one-hot encode the categorical columns; handle_unknown="ignore" means a
#               category not seen during fit does not raise at transform time.
#   * "cont" -> forward the continuous columns unchanged.
preprocess = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("cont", "passthrough", cont_cols),
    ]
)


In [None]:
# Linear baseline: preprocessing followed by an L2-regularised (Ridge) regression.
ridge = Ridge(alpha=10.0, random_state=42)
baseline_model = Pipeline([("preprocess", preprocess), ("model", ridge)])
baseline_model


In [None]:
# Fit the baseline on the full training split and score it on the validation split.
# All metrics are computed on the log_loss target, not on the raw loss.
baseline_model.fit(X_train, y_train)
pred_val_base = baseline_model.predict(X_val)

mae_base = mean_absolute_error(y_val, pred_val_base)
mse_base = mean_squared_error(y_val, pred_val_base)
rmse_base = np.sqrt(mse_base)
r2_base = r2_score(y_val, pred_val_base)

print("Baseline (Ridge) on log_loss")
for metric_label, metric_value in (
    ("MAE:", mae_base),
    ("RMSE:", rmse_base),
    ("R^2:", r2_base),
):
    print(metric_label, metric_value)


In [None]:
# Residuals (actual minus predicted) of the baseline against its own predictions.
resid = y_val - pred_val_base

fig, ax = plt.subplots()
ax.scatter(pred_val_base, resid, s=5)
ax.axhline(0)  # zero-error reference line
ax.set_title("Baseline Residuals (log_loss)")
ax.set_xlabel("Predicted log_loss")
ax.set_ylabel("Residual (actual - pred)")
plt.show()


In [None]:
import numpy as np

# Draw a reproducible random subset of the training split to keep the forest's
# fit time and memory manageable (25,000 rows: good starting point for 8GB).
# The size is clamped to the number of available rows, because sampling without
# replacement raises ValueError when more rows are requested than exist.
N = min(25000, len(X_train))
rng = np.random.RandomState(42)
idx = rng.choice(len(X_train), size=N, replace=False)

# `idx` holds positional row numbers, hence .iloc; X and y stay aligned.
X_train_s = X_train.iloc[idx]
y_train_s = y_train.iloc[idx]

print("Training sample:", X_train_s.shape)

In [None]:
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

# NOTE: despite the `gb_` prefix this pipeline wraps a random forest; the name is
# kept because other cells refer to `gb_model`.
#
# The preprocessing step is an unfitted copy (`clone`) of `preprocess`. A Pipeline
# fits its steps in place, so handing it the very ColumnTransformer instance that
# `baseline_model` holds would refit the baseline's encoder on the training sample
# as soon as this pipeline is fit, leaving the baseline's Ridge coefficients out of
# sync with its own encoder.
gb_model = Pipeline(
    steps=[
        ("preprocess", clone(preprocess)),
        ("model", RandomForestRegressor(
            n_estimators=80,        # keep small
            max_depth=16,           # limits tree growth
            max_features=0.3,       # reduces split cost
            min_samples_leaf=20,    # smoother trees
            n_jobs=-1,              # use all available cores
            random_state=42
        )),
    ]
)

In [None]:
# Fit the forest pipeline on the down-sampled training rows only.
gb_model.fit(X=X_train_s, y=y_train_s)

In [None]:
# Score the full validation split (predictions are on the log_loss scale).
pred_val_gb = gb_model.predict(X=X_val)



In [None]:

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Same validation metrics as the baseline, again on the log_loss target.
mae_gb = mean_absolute_error(y_val, pred_val_gb)
mse_gb = mean_squared_error(y_val, pred_val_gb)
rmse_gb = np.sqrt(mse_gb)
r2_gb = r2_score(y_val, pred_val_gb)

print("RandomForest (sample-trained) on log_loss")
for metric_label, metric_value in (
    ("MAE:", mae_gb),
    ("RMSE:", rmse_gb),
    ("R^2:", r2_gb),
):
    print(metric_label, metric_value)


In [None]:
# Relative error reduction versus the Ridge baseline.
# Positive = the forest has lower error; negative = the forest is worse.
mae_gain = (mae_base - mae_gb) / mae_base
rmse_gain = (rmse_base - rmse_gb) / rmse_base

print("Improvement vs baseline (MAE):", mae_gain)
print("Improvement vs baseline (RMSE):", rmse_gain)


In [None]:
import matplotlib.pyplot as plt

# Validation actuals against forest predictions, both on the log_loss scale.
fig, ax = plt.subplots()
ax.scatter(y_val, pred_val_gb, s=5)
ax.set_xlabel("Actual log_loss")
ax.set_ylabel("Predicted log_loss")
ax.set_title("Actual vs Predicted (log_loss) — Random Forest")
plt.show()



In [None]:

# Per-row validation frame: log-scale actual/predicted values, their back-transformed
# (expm1) counterparts, and a 1..10 decile label derived from the predicted loss
# (1 = lowest risk, 10 = highest risk).
# NOTE(review): expm1 of a log-scale prediction is not an estimate of the mean loss,
# so avg_pred_loss may sit systematically below avg_actual_loss — confirm whether a
# retransformation correction is wanted before reading the two as comparable.
decile_df = pd.DataFrame(
    {"y_true_log": y_val.values, "y_pred_log": pred_val_gb}
).assign(
    y_true_loss=lambda frame: np.expm1(frame["y_true_log"]),
    y_pred_loss=lambda frame: np.expm1(frame["y_pred_log"]),
    decile=lambda frame: pd.qcut(frame["y_pred_loss"], 10, labels=False) + 1,
)

# One row per decile: row count, mean actual / predicted loss, and total actual loss.
decile_summary = (
    decile_df.groupby("decile")
    .agg(
        count=pd.NamedAgg(column="y_true_loss", aggfunc="size"),
        avg_actual_loss=pd.NamedAgg(column="y_true_loss", aggfunc="mean"),
        avg_pred_loss=pd.NamedAgg(column="y_pred_loss", aggfunc="mean"),
        total_actual_loss=pd.NamedAgg(column="y_true_loss", aggfunc="sum"),
    )
    .reset_index()
)

decile_summary


In [None]:

# Lift chart: average actual and average predicted loss for each predicted-severity decile.
fig, ax = plt.subplots(figsize=(8, 4))

for summary_col, legend_label in (
    ("avg_actual_loss", "Avg Actual Loss"),
    ("avg_pred_loss", "Avg Predicted Loss"),
):
    ax.plot(
        decile_summary["decile"],
        decile_summary[summary_col],
        marker="o",
        label=legend_label,
    )

ax.set_xlabel("Predicted Severity Decile (1 = Lowest, 10 = Highest)")
ax.set_ylabel("Average Loss ($)")
ax.set_title("Decile Lift Chart — Random Forest Severity Ranking")
ax.legend()
plt.show()


In [None]:

# Share of total actual loss that falls in the highest predicted-severity deciles.
decile_ids = decile_summary["decile"]
actual_totals = decile_summary["total_actual_loss"]

total_loss = actual_totals.sum()
top10_loss = actual_totals[decile_ids == 10].values[0]   # decile 10 only
top20_loss = actual_totals[decile_ids.isin([9, 10])].sum()  # deciles 9 and 10

print("Share of total loss in top decile:", top10_loss / total_loss)
print("Share of total loss in top 2 deciles:", top20_loss / total_loss)


In [None]:

# Pull the fitted estimator and the fitted one-hot encoder out of the pipeline.
rf = gb_model.named_steps["model"]
fitted_preprocess = gb_model.named_steps["preprocess"]
ohe = fitted_preprocess.named_transformers_["cat"]

# Rebuild the transformed column names in the order the ColumnTransformer was
# declared: one-hot categorical columns first, then the continuous columns.
cat_feature_names = ohe.get_feature_names_out(cat_cols)
feature_names = np.concatenate((cat_feature_names, cont_cols))

# Forest importances labelled by feature name, largest first.
importances = pd.Series(data=rf.feature_importances_, index=feature_names)
importances = importances.sort_values(ascending=False)

importances.head(20)


In [None]:
# Horizontal bar chart of the 15 largest importances; reversed so the largest is drawn on top.
fig, ax = plt.subplots(figsize=(8, 4))
importances.head(15).iloc[::-1].plot(kind="barh", ax=ax)
ax.set_xlabel("Feature Importance")
ax.set_title("Top Feature Drivers — Random Forest")
plt.show()


Modeling Results & Interpretation
Modeling Approach

Severity was modeled on a log-transformed target, log(1 + loss), to stabilize variance; all error metrics reported below are therefore in log units, not dollars. Two models were evaluated:

Ridge Regression as a linear baseline, chosen for stability and interpretability in high-dimensional categorical data

Random Forest Regression, trained on a random 25,000-row sample of the training split to explore non-linear effects while managing computational constraints

Both models were evaluated with point-prediction metrics on a held-out 20% validation split. The Random Forest was additionally evaluated on decile-based loss concentration, aligning with real insurance decision-making workflows.

Performance Summary

Baseline — Ridge Regression

MAE: 0.440

RMSE: 0.560

R²: 0.519

The baseline model explains roughly half of the variance in log-loss (R² ≈ 0.52), providing a reliable reference point.

Random Forest (Sample-Trained)

MAE: 0.449

RMSE: 0.574

R²: 0.494

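The Random Forest does not outperform the linear baseline on global error metrics. This is consistent with severity in this dataset being driven largely by additive effects rather than complex interactions, but the comparison is not like-for-like: the forest was trained on a 25,000-row sample with deliberately constrained depth and tree count, whereas the Ridge model was fit on the full training split.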

Ranking & Business Performance

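Despite weaker point accuracy, the Random Forest concentrates a large share of actual losses in its highest predicted-severity deciles (the same decile analysis was not run for the Ridge baseline, so these figures are not a head-to-head comparison):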

Top decile captures ~25.7% of total loss

Top two deciles capture ~41.5% of total loss

This concentration supports operational use cases where prioritization and triage matter more than exact dollar prediction.

Modeling Takeaways

Linear models provide stable severity estimates suitable for portfolio analysis

Non-linear models may add value through ranking and segmentation; confirming this requires running the same decile analysis on the linear baseline

Insurance model evaluation should emphasize loss concentration, not just MAE/RMSE