In [4]:
import sys
print(sys.executable)
import pandas as pd
from scipy import stats
import numpy as np

# Load the cleaned dataset into `df`, which the later cells read.
# The path is relative, so it is resolved against the kernel's working directory.
try:
    df = pd.read_csv('data/cleaned_insurance_data.csv')
except FileNotFoundError:
    print("ERROR: Could not find 'data/cleaned_insurance_data.csv'.")
    # Re-raise instead of continuing: without a freshly loaded `df` the later
    # cells would either fail with a NameError or silently run against a stale
    # `df` left over in the kernel from an earlier session.
    raise
else:
    # Only report success when the read actually succeeded.
    print("Cleaned data loaded successfully.")
    print("Data shape:", df.shape)

# Significance level that each hypothesis test's p-value is compared against.
alpha = 0.05

D:\Tenx\acis-insurance-risk-analytics-Modeling\venv\Scripts\python.exe
Cleaned data loaded successfully.
Data shape: (617736, 48)


In [5]:
# Claim SEVERITY test: data preparation

# Keep only the rows where a claim occurred (TotalClaims > 0). The subset is
# copied so it is an independent frame rather than a view onto `df`.
has_positive_claim = df['TotalClaims'].gt(0)
df_claims_only = df.loc[has_positive_claim].copy()

print(f"Number of policies with a claim: {df_claims_only.shape[0]}")

# Build one Series of claim amounts per province, following the order in
# which the provinces first appear in the claims-only frame.
provinces = df_claims_only['Province'].unique()
claim_groups_by_province = []
for province_name in provinces:
    in_province = df_claims_only['Province'] == province_name
    claim_groups_by_province.append(df_claims_only.loc[in_province, 'TotalClaims'])

print(f"\nNumber of province groups to compare: {len(claim_groups_by_province)}")

Number of policies with a claim: 2628

Number of province groups to compare: 9


In [6]:
# Conduct the ANOVA Test
#
# One-way ANOVA over the per-province claim-amount groups built in
# `claim_groups_by_province`. The null hypothesis is that the mean claim
# amount is the same in every group.

f_statistic, p_value = stats.f_oneway(*claim_groups_by_province)

print("ANOVA Test Results for Claim Severity across Provinces:")
print(f"F-statistic: {f_statistic:.4f}")
print(f"P-value: {p_value}")

# Decide once, so the conclusion and the business recommendation below can
# never disagree with each other.
reject_h0_severity = p_value < alpha

print("\n--- Conclusion ---")
if reject_h0_severity:
    print(f"Result: The p-value ({p_value}) is less than our alpha ({alpha}).")
    print("Decision: We REJECT the null hypothesis.")
    print("Insight: There IS a statistically significant difference in the average claim amount (severity) across provinces.")
else:
    print(f"Result: The p-value ({p_value}) is greater than our alpha ({alpha}).")
    print("Decision: We FAIL TO REJECT the null hypothesis.")
    print("Insight: There is NO statistically significant difference in the average claim amount (severity) across provinces.")

# The recommendation must follow from the test outcome; printing the
# "strong evidence" text unconditionally would contradict a FAIL TO REJECT.
print("\n--- Business Interpretation & Recommendation ---")
if reject_h0_severity:
    print("This statistical test provides strong evidence that the differences in claim severity we saw in our EDA are real.")
    print("Recommendation: ACIS should proceed with developing a region-based pricing strategy, as risk is not uniform across the country.")
else:
    print("This statistical test does not provide evidence that the differences in claim severity we saw in our EDA are real.")
    print("Recommendation: ACIS should not base a region-based pricing strategy on claim severity alone, as the observed differences may be due to chance.")

ANOVA Test Results for Claim Severity across Provinces:
F-statistic: 4.6675
P-value: 1.0943454529653844e-05

--- Conclusion ---
Result: The p-value (1.0943454529653844e-05) is less than our alpha (0.05).
Decision: We REJECT the null hypothesis.
Insight: There IS a statistically significant difference in the average claim amount (severity) across provinces.

--- Business Interpretation & Recommendation ---
This statistical test provides strong evidence that the differences in claim severity we saw in our EDA are real.
Recommendation: ACIS should proceed with developing a region-based pricing strategy, as risk is not uniform across the country.


In [7]:
# --- Hypothesis 1 (Part 2): Claim Frequency Across Provinces ---
print("--- H₀: Claim frequency is the same across provinces ---")

# Binary claim indicator stored on `df`: 1 when TotalClaims is positive, else 0.
df['HasClaim'] = df['TotalClaims'].gt(0).astype(int)

# Observed counts, one row per province and one column per HasClaim value.
contingency_table = pd.crosstab(index=df['Province'], columns=df['HasClaim'])
print("Contingency Table (Observed Frequencies):\n", contingency_table)

# Chi-squared test on the observed table; it also returns the degrees of
# freedom and the expected frequencies.
chi2_stat, p_value_freq, dof, expected_freqs = stats.chi2_contingency(contingency_table)

print("\nChi-Squared Test Results:")
print("Chi2 Statistic: {:.4f}".format(chi2_stat))
print("P-value: {}".format(p_value_freq))

# --- Conclusion ---
# Pick the pair of messages matching the outcome, then emit them one per line.
if p_value_freq < alpha:
    conclusion_lines = (
        "\nDecision: We REJECT the null hypothesis.",
        "Insight: Claim frequency IS significantly different across provinces.",
    )
else:
    conclusion_lines = (
        "\nDecision: We FAIL TO REJECT the null hypothesis.",
        "Insight: No significant difference found in claim frequency across provinces.",
    )
print(*conclusion_lines, sep="\n")

--- H₀: Claim frequency is the same across provinces ---
Contingency Table (Observed Frequencies):
 HasClaim            0     1
Province                   
Eastern Cape    19647    47
Free State       5923     9
Gauteng        239229  1243
KwaZulu-Natal  111443   453
Limpopo         17943    66
Mpumalanga      31514   125
North West      89465   334
Northern Cape    3635     8
Western Cape    96309   343

Chi-Squared Test Results:
Chi2 Statistic: 98.5540
P-value: 8.428614727758814e-18

Decision: We REJECT the null hypothesis.
Insight: Claim frequency IS significantly different across provinces.


In [8]:
# --- Hypothesis 2: Risk Differences Across Top 10 Zip Codes (Severity) ---
print("\n--- H₀: Claim severity is the same across the top 10 zip codes ---")

# Find the top 10 zip codes by number of policies (counted on the full frame)
top_10_zipcodes = df['PostalCode'].value_counts().nlargest(10).index

# Filter the claims data to only include these top zip codes
df_claims_top_zips = df_claims_only[df_claims_only['PostalCode'].isin(top_10_zipcodes)]

# Create the groups for the ANOVA test.
# The zip codes are ranked on `df`, but the claim amounts come from the
# claims-only frame, so a top zip code may have no rows here. Empty groups are
# skipped because an empty sample leaves the ANOVA result undefined.
claim_groups_by_zip = [
    zip_claims
    for zip_claims in (
        df_claims_top_zips.loc[df_claims_top_zips['PostalCode'] == zc, 'TotalClaims']
        for zc in top_10_zipcodes
    )
    if len(zip_claims) > 0
]

n_skipped_zips = len(top_10_zipcodes) - len(claim_groups_by_zip)
if n_skipped_zips:
    print(f"Note: skipped {n_skipped_zips} zip code(s) with no claims.")

# Conduct the ANOVA Test (it needs at least two groups to compare)
if len(claim_groups_by_zip) < 2:
    f_stat_zip, p_value_zip = float('nan'), float('nan')
else:
    f_stat_zip, p_value_zip = stats.f_oneway(*claim_groups_by_zip)

print("\nANOVA Test Results for Top 10 Zip Codes:")
print(f"F-statistic: {f_stat_zip:.4f}")
print(f"P-value: {p_value_zip}")

# --- Conclusion ---
# A NaN p-value compares False against alpha, which would otherwise be
# reported as "FAIL TO REJECT"; report it as an undefined result instead.
if pd.isna(p_value_zip):
    print("\nDecision: The test could not be evaluated (the p-value is undefined).")
    print("Insight: No conclusion can be drawn about claim severity among the top 10 zip codes.")
elif p_value_zip < alpha:
    print("\nDecision: We REJECT the null hypothesis.")
    print("Insight: There IS a statistically significant difference in claim severity among the top 10 zip codes.")
else:
    print("\nDecision: We FAIL TO REJECT the null hypothesis.")
    print("Insight: No significant difference found in claim severity among the top 10 zip codes.")


--- H₀: Claim severity is the same across the top 10 zip codes ---

ANOVA Test Results for Top 10 Zip Codes:
F-statistic: 5.9382
P-value: 4.158134818707754e-08

Decision: We REJECT the null hypothesis.
Insight: There IS a statistically significant difference in claim severity among the top 10 zip codes.
