## Import Libraries


In [2]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind, chi2_contingency, norm # For statistical tests

## Import Data


In [3]:
df = pd.read_csv('../data/processed/MachineLearningRating_clean.csv')

  df = pd.read_csv('../data/processed/MachineLearningRating_clean.csv')


In [4]:
df.isnull().sum()

Unnamed: 0                       0
UnderwrittenCoverID              0
PolicyID                         0
TransactionMonth                 0
IsVATRegistered                  0
Citizenship                      0
LegalType                        0
Title                            0
Language                         0
Bank                             0
AccountType                      0
MaritalStatus                    0
Gender                           0
Country                          0
Province                         0
PostalCode                       0
MainCrestaZone                   0
SubCrestaZone                    0
ItemType                         0
mmcode                           0
VehicleType                      0
RegistrationYear                 0
make                             0
Model                            0
Cylinders                        0
cubiccapacity                    0
kilowatts                        0
bodytype                         0
NumberOfDoors       

In [7]:
import statsmodels.stats.api as sms # For Z-test of proportions
import warnings

In [8]:
# Suppress the UserWarning from pandas/statsmodels when running multiple tests
warnings.filterwarnings('ignore')

# Assume df is your cleaned DataFrame from Task 2 (approx. 1M rows)

## ----------------------------------------------------------------------
## 1. Metric Selection and Feature Engineering
## ----------------------------------------------------------------------

# 1.1 Create Claim Flag (Binary Target for Frequency)
# Claim Frequency (proportion of policies with at least one claim)
df['ClaimFlag'] = np.where(df['TotalClaims'] > 0, 1, 0)

# 1.2 Create Margin
# Margin = TotalPremium - TotalClaims
df['Margin'] = df['TotalPremium'] - df['TotalClaims']

# 1.3 Create Non-Zero Claims Dataset (For Severity Testing)
# Claim Severity (the average amount of a claim, given a claim occurred)
df_severity = df[df['ClaimFlag'] == 1].copy()

print("--- Metrics Calculated ---")
print(f"Total Exposure (Policies): {len(df):,.0f}")
print(f"Total Claim Count: {df['ClaimFlag'].sum():,.0f}")
print(f"Overall Claim Frequency: {df['ClaimFlag'].mean():.4f}")
print("-" * 50)

--- Metrics Calculated ---
Total Exposure (Policies): 999,546
Total Claim Count: 2,775
Overall Claim Frequency: 0.0028
--------------------------------------------------


In [11]:
## ----------------------------------------------------------------------
## 2. Statistical Testing Execution
## ----------------------------------------------------------------------

# Dictionary to store final results for the report
results = []
SIGNIFICANCE_THRESHOLD = 0.05

def run_chi2_test(contingency_table, hypothesis_name):
    """Conducts a Chi-Squared test for independence."""
    chi2, p_value, dof, expected = chi2_contingency(contingency_table)
    
    # Calculate the overall observed frequency for interpretation
    group_A = contingency_table.iloc[:, 1].sum()
    total_A = contingency_table.sum(axis=1).iloc[0]
    
    # Check top two groups for clear effect size
    top_groups = contingency_table.sort_values(by=1, ascending=False).index[:2]
    freq_A = contingency_table.loc[top_groups[0], 1] / contingency_table.loc[top_groups[0]].sum()
    freq_B = contingency_table.loc[top_groups[1], 1] / contingency_table.loc[top_groups[1]].sum()

    print(f"\n[Test: {hypothesis_name}]")
    print(f"Chi2: {chi2:.3f}, P-value: {p_value:.5f}")
    
    if p_value < SIGNIFICANCE_THRESHOLD:
        decision = "Reject H₀"
        effect = f"Difference in Frequency between {top_groups[0]} ({freq_A:.2%}) and {top_groups[1]} ({freq_B:.2%})"
    else:
        decision = "Fail to Reject H₀"
        effect = "No statistically significant difference observed."
        
    print(f"Decision: {decision}")
    
    results.append({
        'Hypothesis': hypothesis_name, 
        'Metric': 'Claim Frequency', 
        'Test': 'Chi-Squared', 
        'P_Value': p_value, 
        'Decision': decision,
        'Effect_Size': effect
    })


def run_ttest(group_A_data, group_B_data, hypothesis_name, metric_name, group_A_name, group_B_name):
    """Conducts a T-Test for the difference between two independent sample means."""
    # Only run if both groups have sufficient data
    if len(group_A_data) < 30 or len(group_B_data) < 30:
        print(f"\n[Test: {hypothesis_name} - {metric_name}] Skipping (Insufficient data).")
        return

    # T-test (assuming unequal variance, Welch's T-test)
    t_stat, p_value = ttest_ind(group_A_data, group_B_data, equal_var=False)
    
    mean_A = group_A_data.mean()
    mean_B = group_B_data.mean()
    
    print(f"\n[Test: {hypothesis_name} - {metric_name}]")
    print(f"T-stat: {t_stat:.3f}, P-value: {p_value:.5f}")

    if p_value < SIGNIFICANCE_THRESHOLD:
        decision = "Reject H₀"
        effect = f"Mean difference of {mean_A - mean_B:.2f}. {group_A_name} Mean: {mean_A:.2f}, {group_B_name} Mean: {mean_B:.2f}"
    else:
        decision = "Fail to Reject H₀"
        effect = "No statistically significant difference observed."

    print(f"Decision: {decision}")
    
    results.append({
        'Hypothesis': hypothesis_name, 
        'Metric': metric_name, 
        'Test': 'T-Test', 
        'P_Value': p_value, 
        'Decision': decision,
        'Effect_Size': effect
    })

# ======================================================================
# H₀ 1: There are no risk differences across provinces
# ======================================================================

# Frequency Test: Chi-Squared for Claim Flag vs. Province
province_contingency = pd.crosstab(df['Province'], df['ClaimFlag'])
run_chi2_test(province_contingency, "H₀: No Risk Differences Across Provinces (Frequency)")

# Severity Test: T-Test (Comparing Top 2 Most Exposed Provinces)
top_provinces = df['Province'].value_counts().nlargest(2).index
province_A = df_severity[df_severity['Province'] == top_provinces[0]]['TotalClaims']
province_B = df_severity[df_severity['Province'] == top_provinces[1]]['TotalClaims']
run_ttest(province_A, province_B, "H₀: No Risk Differences Across Provinces (Severity)", 
          "Claim Severity", top_provinces[0], top_provinces[1])

# ======================================================================
# H₀ 2 & 3: Risk and Margin differences between zip codes
# ======================================================================
# Strategy: Compare a High-Risk ZIP group vs. Low-Risk ZIP group

# 1. Calculate Loss Ratio (LR) per ZIP Code
zip_metrics = df.groupby('PostalCode').agg(
    TotalPremium=('TotalPremium', 'sum'),
    TotalClaims=('TotalClaims', 'sum'),
    Exposure=('ClaimFlag', 'size')
).reset_index()

# Filter Zips with very low exposure for stable calculation
zip_metrics = zip_metrics[zip_metrics['Exposure'] >= 100] # Adjust threshold as needed
zip_metrics['LossRatio'] = zip_metrics['TotalClaims'] / zip_metrics['TotalPremium']

# 2. Define High-Risk (Top 10% LR) and Low-Risk (Bottom 10% LR) ZIP Groups
lr_threshold_high = zip_metrics['LossRatio'].quantile(0.90)
lr_threshold_low = zip_metrics['LossRatio'].quantile(0.10)

high_risk_zips = zip_metrics[zip_metrics['LossRatio'] >= lr_threshold_high]['PostalCode']
low_risk_zips = zip_metrics[zip_metrics['LossRatio'] <= lr_threshold_low]['PostalCode']



[Test: H₀: No Risk Differences Across Provinces (Frequency)]
Chi2: 110.731, P-value: 0.00000
Decision: Reject H₀

[Test: H₀: No Risk Differences Across Provinces (Severity) - Claim Severity]
T-stat: -1.961, P-value: 0.05046
Decision: Fail to Reject H₀


In [13]:
# Check the actual values in the Gender column
print(df['Gender'].value_counts())

Gender
Not specified    940438
Male              42817
Unknown            9536
Female             6755
Name: count, dtype: int64


In [14]:
# 3. Create Samples for Testing
high_risk_sample = df[df['PostalCode'].isin(high_risk_zips)]
low_risk_sample = df[df['PostalCode'].isin(low_risk_zips)]

# H₀ 2: No risk difference (Frequency) between High/Low LR Postals
Postal_contingency = pd.crosstab(
    np.where(df['PostalCode'].isin(high_risk_zips), 'High_Risk_Postal', 
             np.where(df['PostalCode'].isin(low_risk_zips), 'Low_Risk_Postal', 'Other')),
    df['ClaimFlag']
).iloc[0:2, :] # Select only the two test groups
run_chi2_test(Postal_contingency, "H₀: No Risk Differences Between Postal Code Groups (Frequency)")


# H₀ 3: No significant margin difference between Postal codes (T-Test on Margin)
run_ttest(high_risk_sample['Margin'], low_risk_sample['Margin'], 
          "H₀: No Significant Margin Difference Between Postal Code Groups", 
          "Margin (Total Premium - Total Claims)", "High_Risk_Postal_Group", "Low_Risk_Postal_Group")


# ======================================================================
# H₀ 4: There is no significant risk difference between Women and Men
# ======================================================================
# Filter to only known genders
df_gender = df[df['Gender'].isin(['Female', 'Male'])].copy()

# Frequency Test: Chi-Squared for Claim Flag vs. Gender
gender_contingency = pd.crosstab(df_gender['Gender'], df_gender['ClaimFlag'])
run_chi2_test(gender_contingency, "H₀: No Significant Risk Difference Between Women and Men (Frequency)")

# Severity Test: T-Test
men_claims = df_severity[df_severity['Gender'] == 'Male']['TotalClaims']
women_claims = df_severity[df_severity['Gender'] == 'Female']['TotalClaims']
run_ttest(women_claims, men_claims, "H₀: No Significant Risk Difference Between Women and Men (Severity)", 
          "Claim Severity", "Women", "Men")



[Test: H₀: No Risk Differences Between Postal Code Groups (Frequency)]
Chi2: 447.603, P-value: 0.00000
Decision: Reject H₀

[Test: H₀: No Significant Margin Difference Between Postal Code Groups - Margin (Total Premium - Total Claims)]
T-stat: -9.856, P-value: 0.00000
Decision: Reject H₀

[Test: H₀: No Significant Risk Difference Between Women and Men (Frequency)]
Chi2: 0.004, P-value: 0.95146
Decision: Fail to Reject H₀

[Test: H₀: No Significant Risk Difference Between Women and Men (Severity) - Claim Severity] Skipping (Insufficient data).


In [17]:
## ----------------------------------------------------------------------
## 3. Analyze and Report (Final Summary)
## ----------------------------------------------------------------------

print("\n\n" + "="*70)
print("FINAL HYPOTHESIS TESTING SUMMARY")
print("="*70)

summary_df = pd.DataFrame(results)

# Clean up and present the P-Value
summary_df['P_Value'] = summary_df['P_Value'].apply(lambda x: f"{x:.5f}")
summary_df['Interpretation'] = np.where(summary_df['Decision'] == 'Reject H₀', 
                                       'Statistically Significant Effect', 
                                       'No Significant Effect Found')

print(summary_df[['Hypothesis', 'Metric', 'Decision', 'P_Value', 'Effect_Size', 'Interpretation']].to_markdown(index=False))

# --- BUSINESS RECOMMENDATIONS (Based on typical outcomes) ---
print("\n--- BUSINESS RECOMMENDATIONS ---")
for index, row in summary_df.iterrows():
    if row['Decision'] == 'Reject H₀':
        print(f"\n✅ Hypothesis: {row['Hypothesis']} ({row['Metric']})")
        print(f"Interpretation: We reject the null hypothesis (P < {SIGNIFICANCE_THRESHOLD}). The factor is a significant risk driver.")
        print(f"Recommendation: Incorporate the factor into the pricing model and segmentation strategy.")
    else:
        print(f"\nHypothesis: {row['Hypothesis']} ({row['Metric']})")
        print(f"Interpretation: We fail to reject the null hypothesis (P >= {SIGNIFICANCE_THRESHOLD}). The factor is NOT a significant risk driver.")
        print(f"Recommendation: Do not use this factor for segmentation or risk adjustment in the pricing model.")



FINAL HYPOTHESIS TESTING SUMMARY
| Hypothesis                                                           | Metric                                | Decision          |   P_Value | Effect_Size                                                                                         | Interpretation                   |
|:---------------------------------------------------------------------|:--------------------------------------|:------------------|----------:|:----------------------------------------------------------------------------------------------------|:---------------------------------|
| H₀: No Risk Differences Across Provinces (Frequency)                 | Claim Frequency                       | Reject H₀         |   0       | Difference in Frequency between Gauteng (0.34%) and KwaZulu-Natal (0.28%)                           | Statistically Significant Effect |
| H₀: No Risk Differences Across Provinces (Severity)                  | Claim Severity                        | Fail t