In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols, logit
from statsmodels.stats.anova import anova_lm


In [None]:
 #--- Step 0: Load CSV properly ---
# Location of the processed insurance dataset; fields are separated by '|'
file_path = '/mnt/data/processed_insurance_data.csv'
df = pd.read_csv(
    file_path,
    sep='|',
)

# Sanity check: show the leading rows, then the parsed column names
print(df.head(n=5))
print(df.columns.tolist())

In [None]:
# --- Step 1: Create Metrics ---
# Claim Frequency: binary flag, 1 when totalclaims > 0, otherwise 0
has_claim = df['totalclaims'].gt(0)
df['ClaimFrequency'] = has_claim.astype(int)

# Claim Severity: rows with a zero frequency get a NaN divisor, so only
# policies with a claim keep a (non-NaN) severity value
severity_divisor = df['ClaimFrequency'].replace(0, np.nan)
df['ClaimSeverity'] = df['totalclaims'].div(severity_divisor)

# Margin: premium minus claims
df['Margin'] = df['totalpremium'].sub(df['totalclaims'])


In [None]:
# --- Step 2: Hypothesis Testing ---

def _fit_ols_anova(heading, formula, data):
    """Print `heading`, fit OLS for `formula` on `data`, print its ANOVA table.

    Returns the fitted model and the ANOVA table as a pair.
    """
    print(heading)
    fitted = ols(formula, data=data).fit()
    table = anova_lm(fitted)
    print(table)
    return fitted, table


def _fit_logit(heading, formula, data):
    """Print `heading`, fit a logit model for `formula` on `data`, print its summary.

    Returns the fitted model.
    """
    print(heading)
    fitted = logit(formula, data=data).fit(disp=0)
    print(fitted.summary())
    return fitted


# Severity models are fitted only on rows whose ClaimSeverity is not NaN
claims_only = df.dropna(subset=['ClaimSeverity'])

# 2a. Risk differences across Provinces
prov_model, prov_anova = _fit_ols_anova(
    "\n--- Provinces: Claim Severity ---",
    'ClaimSeverity ~ C(province)',
    claims_only,
)
prov_freq_model = _fit_logit(
    "\n--- Provinces: Claim Frequency ---",
    'ClaimFrequency ~ C(province)',
    df,
)

# 2b. Risk differences across Zip Codes
zip_model, zip_anova = _fit_ols_anova(
    "\n--- Zip Codes: Claim Severity ---",
    'ClaimSeverity ~ C(postalcode)',
    claims_only,
)
zip_freq_model = _fit_logit(
    "\n--- Zip Codes: Claim Frequency ---",
    'ClaimFrequency ~ C(postalcode)',
    df,
)

# 2c. Margin differences across Zip Codes (all rows, not just claimants)
margin_model, margin_anova = _fit_ols_anova(
    "\n--- Zip Codes: Margin ---",
    'Margin ~ C(postalcode)',
    df,
)

# 2d. Risk differences by Gender
gender_model, gender_anova = _fit_ols_anova(
    "\n--- Gender: Claim Severity ---",
    'ClaimSeverity ~ C(gender)',
    claims_only,
)
gender_freq_model = _fit_logit(
    "\n--- Gender: Claim Frequency ---",
    'ClaimFrequency ~ C(gender)',
    df,
)


In [None]:
# --- Step 3: Automated Interpretation ---
def interpret_pval(model, feature_name, is_anova=True):
    """Print whether H0 is rejected at the 5% level for one fitted test.

    Parameters
    ----------
    model : pandas.DataFrame or fitted results object
        When ``is_anova`` is True, a table with a ``'PR(>F)'`` column whose
        first row holds the p-value of the tested factor. Otherwise a fitted
        results object exposing ``llr_pvalue`` and/or ``pvalues``.
    feature_name : str
        Label used in the printed message.
    is_anova : bool, default True
        Selects how the p-value is read from ``model``.

    Returns
    -------
    None
        The verdict is printed, not returned.
    """
    if is_anova:
        # Use .iloc for positional access: the column is a label-indexed
        # Series, and integer keys on such a Series are not reliable positions.
        p_val = model['PR(>F)'].iloc[0] if 'PR(>F)' in model.columns else None
    else:
        # A categorical factor expands into one coefficient per non-reference
        # level, so the p-value of the first coefficient only compares a
        # single level with the reference. The likelihood-ratio p-value
        # tests all coefficients jointly against the intercept-only model.
        p_val = getattr(model, 'llr_pvalue', None)
        if p_val is None:
            # Fallback for result objects without an overall LR test:
            # first coefficient after the intercept, read by position.
            p_val = np.asarray(model.pvalues)[1]

    if p_val is None or pd.isna(p_val):
        # Do not report a missing/NaN p-value as "no significant effect".
        print(f"Cannot evaluate H0 for {feature_name}: p-value is unavailable.")
        return

    if p_val < 0.05:
        print(f"Reject H0 for {feature_name} (p = {p_val:.4f}) → significant effect.")
    else:
        print(f"Fail to reject H0 for {feature_name} (p = {p_val:.4f}) → no significant effect.")

# Interpret all tests
# Each entry: (result object, label for the message, True if it is an ANOVA table)
_tests_to_interpret = [
    (prov_anova, "Province Claim Severity", True),
    (prov_freq_model, "Province Claim Frequency", False),
    (zip_anova, "Zip Code Claim Severity", True),
    (zip_freq_model, "Zip Code Claim Frequency", False),
    (margin_anova, "Zip Code Margin", True),
    (gender_anova, "Gender Claim Severity", True),
    (gender_freq_model, "Gender Claim Frequency", False),
]

for _result, _label, _anova_flag in _tests_to_interpret:
    interpret_pval(_result, _label, is_anova=_anova_flag)