In [None]:
import pandas as pd
from scipy import stats # You'll need this for statistical tests
import numpy as np

# Read the insurance extract (swap the path if you work from processed_insurance_v1.csv).
# PostalCode is read as text and TransactionMonth is handed to the date parser;
# make sure that column name matches the one produced by the Task 1 fix.
df = pd.read_csv(
    "../data/insurance.csv",
    dtype={'PostalCode': 'str'},
    parse_dates=['TransactionMonth'],
)

# Make sure TransactionMonth ends up as datetime, whatever read_csv produced
df['TransactionMonth'] = pd.to_datetime(df['TransactionMonth'])

# Force the monetary columns to a numeric dtype; unparseable entries become NaN
numeric_cols = ['TotalPremium', 'TotalClaims', 'CustomValueEstimate', 'CalculatedPremiumPerTerm', 'SumInsured']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

In [None]:
# Claim indicator: 1 where the row has a positive TotalClaims, 0 otherwise
df['has_claim'] = df['TotalClaims'].gt(0).astype(int)
# Claim amount on rows flagged as claims, NaN on every other row
df['claim_amount_if_claimed'] = df['TotalClaims'].where(df['has_claim'].eq(1))
# Margin: premium minus claims
df['margin'] = df['TotalPremium'].sub(df['TotalClaims'])

In [None]:
# Cross-tabulate Province (rows) against the claim indicator (columns)
contingency_table_province_claim = pd.crosstab(index=df['Province'], columns=df['has_claim'])
print("\nContingency Table (Province vs. Has Claim):\n", contingency_table_province_claim)

# Chi-squared test on the table; only the statistic and the p-value are kept
chi2_freq_province, p_freq_province = stats.chi2_contingency(contingency_table_province_claim)[:2]
print(f"\nChi-squared test for Claim Frequency across Provinces:")
print(f"Chi-squared statistic: {chi2_freq_province:.4f}")
print(f"P-value: {p_freq_province:.4f}")

In [None]:
# Severity is only defined on rows that carry a claim
df_claimed = df[df['has_claim'] == 1].dropna(subset=['claim_amount_if_claimed'])

# One Series of claim amounts per province.
# groupby skips rows whose Province is missing. Masking with `== p` for each
# value of .unique() would instead create an empty group for a NaN province
# (NaN never compares equal to anything), and f_oneway returns NaN as soon as
# one sample is empty. sort=False keeps provinces in order of first appearance.
province_groups = [
    amounts
    for _, amounts in df_claimed.groupby('Province', sort=False)['claim_amount_if_claimed']
]

# One-way ANOVA across the per-province samples
f_stat_severity_province, p_severity_province = stats.f_oneway(*province_groups)
print(f"\nANOVA test for Claim Severity across Provinces:")
print(f"F-statistic: {f_stat_severity_province:.4f}")
print(f"P-value: {p_severity_province:.4f}")

# Optional: If p_severity_province < 0.05, consider post-hoc tests (e.g., Tukey HSD)
# from statsmodels.stats.multicomp import pairwise_tukeyhsd
# tukey_results = pairwise_tukeyhsd(endog=df_claimed['claim_amount_if_claimed'], groups=df_claimed['Province'], alpha=0.05)
# print("\nTukey's HSD Post-Hoc Test for Claim Severity:\n", tukey_results)

In [None]:
# Identify top/bottom zipcodes by claim frequency or severity from EDA
# Example: take the 5 zipcodes with the largest and the 5 with the smallest summed TotalClaims.
# The per-zipcode totals are aggregated once and reused for both selections.
claims_by_zipcode = df.groupby('PostalCode')['TotalClaims'].sum()
top_n_zipcodes = claims_by_zipcode.nlargest(5).index
bottom_n_zipcodes = claims_by_zipcode.nsmallest(5).index # Exclude 0 claim zipcodes if many

# Filter data for selected zipcodes for testing
df_selected_zipcodes = df[df['PostalCode'].isin(list(top_n_zipcodes) + list(bottom_n_zipcodes))]

# Perform Chi-squared for Claim Frequency
contingency_table_zip_claim = pd.crosstab(df_selected_zipcodes['PostalCode'], df_selected_zipcodes['has_claim'])
chi2_freq_zip, p_freq_zip, _, _ = stats.chi2_contingency(contingency_table_zip_claim)
print(f"\nChi-squared test for Claim Frequency across selected Zipcodes:")
print(f"Chi-squared statistic: {chi2_freq_zip:.4f}")
print(f"P-value: {p_freq_zip:.4f}")

# Perform ANOVA for Claim Severity
# Only zipcodes that still have at least one claim row contribute a sample, so
# the number of groups can be smaller than the number of selected zipcodes.
df_claimed_zip = df_selected_zipcodes[df_selected_zipcodes['has_claim'] == 1].dropna(subset=['claim_amount_if_claimed'])
zip_groups_severity = [
    amounts
    for _, amounts in df_claimed_zip.groupby('PostalCode', sort=False)['claim_amount_if_claimed']
]
if len(zip_groups_severity) >= 2:
    f_stat_severity_zip, p_severity_zip = stats.f_oneway(*zip_groups_severity)
else:
    # f_oneway needs at least two samples; report NaN instead of raising
    f_stat_severity_zip, p_severity_zip = np.nan, np.nan
print(f"\nANOVA test for Claim Severity across selected Zipcodes:")
print(f"F-statistic: {f_stat_severity_zip:.4f}")
print(f"P-value: {p_severity_zip:.4f}")

In [None]:
# Use the same selected zipcodes as in Hypothesis 2, or broaden if data allows
# margin is TotalPremium - TotalClaims, and both inputs are coerced with
# errors='coerce', so margin can be NaN. A single NaN in any sample makes
# f_oneway return NaN, so rows without a margin are dropped first (the severity
# tests drop their missing values the same way).
margin_rows = df_selected_zipcodes.dropna(subset=['margin'])

# Group margin data by PostalCode (sort=False keeps zipcodes in order of first appearance)
zip_groups_margin = [
    margins
    for _, margins in margin_rows.groupby('PostalCode', sort=False)['margin']
]

# Perform ANOVA test for Margin
f_stat_margin_zip, p_margin_zip = stats.f_oneway(*zip_groups_margin)
print(f"\nANOVA test for Margin across selected Zipcodes:")
print(f"F-statistic: {f_stat_margin_zip:.4f}")
print(f"P-value: {p_margin_zip:.4f}")

In [None]:
# Map the alternative gender spellings onto the two canonical labels
gender_aliases = {'F': 'Female', 'M': 'Male', 'Woman': 'Female', 'Man': 'Male'}
df['Gender'] = df['Gender'].replace(gender_aliases)

# Keep only the rows whose gender is one of the two canonical labels
is_known_gender = df['Gender'].isin(['Female', 'Male'])
df_gender_clean = df[is_known_gender]

In [None]:
# Cross-tabulate Gender (rows) against the claim indicator (columns)
contingency_table_gender_claim = pd.crosstab(
    index=df_gender_clean['Gender'],
    columns=df_gender_clean['has_claim'],
)
print("\nContingency Table (Gender vs. Has Claim):\n", contingency_table_gender_claim)

# Chi-squared test on the table; only the statistic and the p-value are kept
chi2_freq_gender, p_freq_gender = stats.chi2_contingency(contingency_table_gender_claim)[:2]
print(f"\nChi-squared test for Claim Frequency between Genders:")
print(f"Chi-squared statistic: {chi2_freq_gender:.4f}")
print(f"P-value: {p_freq_gender:.4f}")

In [None]:
# Restrict the gender-clean rows to those that carry a claim amount
rows_with_claim = df_gender_clean['has_claim'] == 1
df_claimed_gender = df_gender_clean[rows_with_claim].dropna(subset=['claim_amount_if_claimed'])

# Claim amounts split into the two gender samples
women_claims = df_claimed_gender.loc[df_claimed_gender['Gender'] == 'Female', 'claim_amount_if_claimed']
men_claims = df_claimed_gender.loc[df_claimed_gender['Gender'] == 'Male', 'claim_amount_if_claimed']

# Independent two-sample t-test; equal_var=False selects Welch's variant,
# which does not assume the two samples share a variance
welch_result = stats.ttest_ind(women_claims, men_claims, equal_var=False)
t_stat_severity_gender = welch_result.statistic
p_severity_gender = welch_result.pvalue
print(f"\nT-test for Claim Severity between Genders:")
print(f"T-statistic: {t_stat_severity_gender:.4f}")
print(f"P-value: {p_severity_gender:.4f}")