 Select Key Metrics

In [3]:
import pandas as pd

df = pd.read_parquet("../data/MachineLearningRating_v3.parquet")

# Create derived metrics
# A row counts as a claim when its total claim amount is positive.
df["ClaimOccurred"] = df["TotalClaims"] > 0
# Margin is premium collected minus claims paid, per row.
df["Margin"] = df["TotalPremium"] - df["TotalClaims"]
# Severity is only defined on rows with a claim: keep the claim amount there
# and leave NaN elsewhere. Masking with `where` keeps the column numeric,
# whereas dividing by a boolean column whose False values were replaced with
# pd.NA produces an object-dtype column.
df["ClaimSeverity"] = df["TotalClaims"].where(df["ClaimOccurred"])


Statistical Testing Functions

In [4]:
from scipy.stats import ttest_ind, chi2_contingency
import numpy as np

def t_test_between_groups(df, group_col, metric):
    """Compare ``metric`` between the two groups found in ``group_col``.

    Runs ``ttest_ind`` with ``equal_var=False`` on the non-null ``metric``
    values of each group.

    Parameters
    ----------
    df : DataFrame holding both columns.
    group_col : name of the grouping column; after dropping nulls it must
        contain exactly two distinct values.
    metric : name of the column whose values are compared.

    Returns
    -------
    tuple
        ``(label, t_stat, p_value)`` where ``label`` is ``"<a> vs <b>"`` with
        the groups in order of first appearance.

    Raises
    ------
    ValueError
        If ``group_col`` does not contain exactly two distinct non-null values.
    """
    labels = df[group_col].dropna().unique()
    if len(labels) != 2:
        raise ValueError("This test assumes exactly 2 groups.")
    a, b = labels

    # One sample per group: the metric values for that group, nulls removed.
    samples = [df.loc[df[group_col] == label, metric].dropna() for label in (a, b)]
    t_stat, p_value = ttest_ind(*samples, equal_var=False)
    return f"{a} vs {b}", t_stat, p_value

def chi_square_test(df, cat_col, binary_metric):
    """Chi-square test on the cross-tabulation of two columns of ``df``.

    Parameters
    ----------
    df : DataFrame holding both columns.
    cat_col : name of the column whose values form the rows of the table.
    binary_metric : name of the column whose values form the table columns.

    Returns
    -------
    tuple
        ``(chi2, p)``: the test statistic and p-value reported by
        ``chi2_contingency``; the remaining outputs are discarded.
    """
    observed = pd.crosstab(df[cat_col], df[binary_metric])
    statistic, p_value, *_unused = chi2_contingency(observed)
    return statistic, p_value


Test Hypotheses

H₀: No risk differences across provinces

In [5]:
# Claim frequency by province: cross-tabulate Province against the
# ClaimOccurred flag and run the chi-square test on that table.
chi2, p = chi_square_test(df, cat_col="Province", binary_metric="ClaimOccurred")
print(f"Province vs ClaimOccurred: Chi2 = {chi2:.2f}, p = {p:.4f}")


Province vs ClaimOccurred: Chi2 = 104.19, p = 0.0000


H₀: No risk differences between zip codes

In [6]:
# Claim frequency by postal code: same chi-square test, with PostalCode as
# the categorical column.
chi2, p = chi_square_test(df, cat_col="PostalCode", binary_metric="ClaimOccurred")
print(f"PostalCode vs ClaimOccurred: Chi2 = {chi2:.2f}, p = {p:.4f}")


PostalCode vs ClaimOccurred: Chi2 = 1454.47, p = 0.0000


H₀: No significant margin difference between zip codes (tested on the first two postal codes in the data only)

In [7]:
# t_test_between_groups needs exactly two groups, so only the first two
# postal codes (in order of appearance) are compared here.
first_two_codes = df["PostalCode"].unique()[:2]
two_code_rows = df[df["PostalCode"].isin(first_two_codes)]
result, t_stat, p_value = t_test_between_groups(two_code_rows, "PostalCode", "Margin")
print(f"{result}: t = {t_stat:.2f}, p = {p_value:.4f}")


1459 vs 1513: t = -0.44, p = 0.6630


H₀: No significant risk difference between Women and Men

In [8]:
# Keep only rows whose Gender is exactly "Female" or "Male" before testing
# claim frequency against gender.
is_female_or_male = df["Gender"].isin(["Female", "Male"])
chi2, p = chi_square_test(df[is_female_or_male], "Gender", "ClaimOccurred")
print(f"Gender vs ClaimOccurred: Chi2 = {chi2:.2f}, p = {p:.4f}")


Gender vs ClaimOccurred: Chi2 = 0.00, p = 0.9515
