In [None]:
import sys
import os

# Make the parent of the current working directory importable so that the
# `scripts` package used by the import cell can be resolved.
# The membership check keeps the cell idempotent: re-running it does not
# append the same entry to sys.path again.
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from scripts.data_preprocessing import load_and_prepare_data, prepare_data_for_claim_probability, prepare_data_for_claim_severity





In [None]:

# Read the raw insurance file. The loader returns two objects: `df` is later
# passed to the claim-probability preparation and `claim_data` to the
# claim-severity preparation.
RAW_DATA_PATH = "../data/raw/insurance_data.txt"
df, claim_data = load_and_prepare_data(RAW_DATA_PATH)



In [None]:
# Build one (features, target) pair per model:
#   * claim probability is prepared from `df`
#   * claim severity is prepared from `claim_data`
probability_inputs = prepare_data_for_claim_probability(df)
X_prob, y_prob = probability_inputs

severity_inputs = prepare_data_for_claim_severity(claim_data)
X_sev, y_sev = severity_inputs

In [None]:
# Train the claim-probability and claim-severity models.

# Shared settings, named once so both models stay in sync.
TEST_SIZE = 0.2
RANDOM_STATE = 42
N_ESTIMATORS = 100

# --- Probability model ---
# Stratify on the label so the train and test splits keep the class ratio of y_prob.
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
    X_prob, y_prob, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y_prob
)
clf = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE)
clf.fit(X_train_p, y_train_p)

# Use the held-out split so the model is validated before it drives pricing.
# NOTE(review): accuracy is a coarse metric if the classes are imbalanced.
print(f"Claim-probability model, held-out accuracy: {clf.score(X_test_p, y_test_p):.4f}")

# Column 1 of predict_proba is the probability of clf.classes_[1]
# (presumably the "claim" class — confirm against the label encoding).
# NOTE(review): this scores every row of X_prob, including rows the model was
# trained on, so those probabilities are in-sample.
prob_preds = clf.predict_proba(X_prob)[:, 1]

# --- Severity model ---
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_sev, y_sev, test_size=TEST_SIZE, random_state=RANDOM_STATE
)
reg = RandomForestRegressor(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE)
reg.fit(X_train_s, y_train_s)
print(f"Claim-severity model, held-out R^2: {reg.score(X_test_s, y_test_s):.4f}")

# Predict severity for the same full dataset used for the probabilities, so
# the two prediction arrays line up row by row. The regressor was fitted on
# X_sev, so select its training columns (in training order) from X_prob; a
# missing column raises a KeyError naming it instead of a late mismatch error.
severity_columns = getattr(reg, "feature_names_in_", None)
if severity_columns is not None and isinstance(X_prob, pd.DataFrame):
    X_all_for_severity = X_prob.loc[:, list(severity_columns)]
else:
    # No recorded feature names (e.g. array inputs): fall back to X_prob as-is.
    X_all_for_severity = X_prob
sev_preds = reg.predict(X_all_for_severity)

In [None]:
# Pricing assumptions
EXPENSE_LOADING = 100  # flat operating cost added to every policy
PROFIT_MARGIN = 0.15   # 15% mark-up applied on top of cost

# Expected loss per policy = claim probability x predicted claim severity
expected_loss = prob_preds * sev_preds

# Cost-based premium, then the margin-loaded premium
premium = expected_loss + EXPENSE_LOADING
premium_with_margin = (1 + PROFIT_MARGIN) * premium

# Store the model premium next to the existing premium column
df["PredictedPremium"] = premium_with_margin

# Show the first rows of both premium columns side by side
df.head()[["PredictedPremium", "CalculatedPremiumPerTerm"]]


In [None]:
# Summary statistics for the model premium vs. the original premium.
# The premium calculation itself lives in the previous cell; repeating it here
# would define EXPENSE_LOADING / PROFIT_MARGIN in a second place and show the
# same preview twice, so this cell only summarises the columns already in `df`.
df[["PredictedPremium", "CalculatedPremiumPerTerm"]].describe()


In [None]:
# Visualize the distribution of model vs. original premiums.
# Both histograms share one set of bin edges; binning each series separately
# gives bars of different widths, so overlaid frequencies are not comparable.
premium_series = {
    "Model Predicted": df["PredictedPremium"].dropna(),
    "Original Premium": df["CalculatedPremiumPerTerm"].dropna(),
}  # NaNs are dropped so they cannot break the bin-range detection

shared_bins = np.histogram_bin_edges(
    np.concatenate([values.to_numpy() for values in premium_series.values()]),
    bins=50,
)

fig, ax = plt.subplots(figsize=(10, 6))
for label, values in premium_series.items():
    ax.hist(values, bins=shared_bins, alpha=0.6, label=label)
ax.set_xlabel("Premium Amount")
ax.set_ylabel("Frequency")
ax.set_title("Distribution: Model vs. Original Premium")
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# Business comparison: model premium vs. the original premium.

# Compare mean premiums
mean_model = df["PredictedPremium"].mean()
mean_actual = df["CalculatedPremiumPerTerm"].mean()

print(f"Mean Model Premium: R {mean_model:.2f}")
print(f"Mean Actual Premium: R {mean_actual:.2f}")

# Over/under-pricing insight.
# PriceDiff > 0 means the model premium is above the original premium.
df["PriceDiff"] = df["PredictedPremium"] - df["CalculatedPremiumPerTerm"]

# The mean of a boolean mask is a fraction in [0, 1]; format it as a percentage
# so the printed value matches its "Percentage" label. Rows with a zero or
# missing difference fall in neither bucket, so the two need not sum to 100%.
overpriced_share = (df["PriceDiff"] > 0).mean()
underpriced_share = (df["PriceDiff"] < 0).mean()
print(f"Percentage Overpriced: {overpriced_share:.2%}")
print(f"Percentage Underpriced: {underpriced_share:.2%}")
