# Task 3: Statistical Hypothesis Testing

We test four key business hypotheses to understand risk and margin differences using the full dataset.

In [1]:
import sys
import os
import pandas as pd
import numpy as np

# Setup path to allow imports from src
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from src.data.loader import load_data
from src.stats.hypothesis import check_chi2_independence, check_ttest_means, check_anova

# TARGET DATA FILE
# First candidate is relative to the parent directory (same '..' hop used for
# the src import path); the second is used when that file is not found.
DATA_PATH = '../data/raw/MachineLearningRating.txt'
if not os.path.exists(DATA_PATH):
    # Try alternative location if running from root
    DATA_PATH = 'data/raw/MachineLearningRating.txt'

print(f"Loading data from: {DATA_PATH}")
df = load_data(DATA_PATH)

# Feature Engineering for Tests
# 1. Define 'Risk' (Claim vs No Claim): 1 when TotalClaims > 0, otherwise 0.
#    Vectorized comparison instead of a per-row Python lambda; missing values
#    compare as False and therefore map to 0, exactly as the lambda did.
df['HasClaim'] = (df['TotalClaims'] > 0).astype(int)

# 2. Define 'Margin' as premium collected minus claims paid, per row.
df['Margin'] = df['TotalPremium'] - df['TotalClaims']

print(f"Data Ready. Shape: {df.shape}")

Loading data from: ../data/raw/MachineLearningRating.txt


Data Ready. Shape: (1000098, 54)


## Test 1: Risk Differences Across Provinces
**Null Hypothesis**: Risk (Claim Frequency) is independent of Province.

In [2]:
# Run the test only when the frame actually carries a Province column.
if 'Province' in df:
    # Independence check between Province and the binary HasClaim flag.
    # The helper hands back the p-value, a contingency table and a verdict text.
    p_val, contingency, interp = check_chi2_independence(df, 'Province', 'HasClaim')
    print(f"P-Value: {p_val}", interp, sep="\n")

    # Business insight is reported only when the result is significant at 5%.
    if p_val < 0.05:
        print("\nInsight: Risk is NOT uniform. Some provinces have significantly higher claim rates. Stratify pricing by Province.")

P-Value: 5.925510718204678e-19
Reject Null Hypothesis: Significant difference exists.

Insight: Risk is NOT uniform. Some provinces have significantly higher claim rates. Stratify pricing by Province.


## Test 2: Risk Differences Across Zip Codes
**Null Hypothesis**: Risk is independent of Zip Code, i.e. mean `TotalClaims` is the same across all `PostalCode` groups (tested with ANOVA).

In [3]:
# Run the test only when the frame actually carries a PostalCode column.
if 'PostalCode' in df:
    # ANOVA of TotalClaims grouped by PostalCode, run on the full frame
    # (no sub-sampling is applied here).
    # NOTE(review): TotalClaims presumably includes zero-claim rows, so this
    # compares mean claim amount per zip rather than severity given a claim
    # - confirm against check_anova.
    p_val, interp = check_anova(df, 'PostalCode', 'TotalClaims')
    if p_val is None:
        # The helper returned no p-value, so there is nothing to interpret.
        print("Insufficient data levels for test.")
    else:
        print(f"ANOVA P-Value: {p_val}", interp, sep="\n")

        # Insight is reported only when the result is significant at 5%.
        if p_val < 0.05:
            print("\nInsight: Location (Zip) drives claim severity. Granular geo-rating is justified.")

ANOVA P-Value: 0.8906511279164051
Fail to Reject Null: No significant difference across groups.


## Test 3: Margin Differences Between Zip Codes
**Null Hypothesis**: Mean margin (`TotalPremium` − `TotalClaims`) is equal across Zip Codes.

In [4]:
# Run the test only when the frame actually carries a PostalCode column.
if 'PostalCode' in df:
    # ANOVA of the engineered Margin column grouped by PostalCode.
    p_val, interp = check_anova(df, 'PostalCode', 'Margin')
    if p_val is None:
        # The helper returned no p-value, so there is nothing to interpret.
        print("Insufficient data levels for test.")
    else:
        print(f"ANOVA P-Value: {p_val}", interp, sep="\n")

        # Insight is reported only when the result is significant at 5%.
        if p_val < 0.05:
            print("\nInsight: Profitability varies by Zip. Review underwriting in low-margin areas.")

ANOVA P-Value: 0.9976859758015036
Fail to Reject Null: No significant difference across groups.


## Test 4: Risk Difference Between Women and Men
**Null Hypothesis**: Mean Claims (Risk) are equal for Men and Women.

In [5]:
# Run the test only when the frame actually carries a Gender column.
if 'Gender' in df:
    # Show the raw category labels so the reader can see which groups exist
    # before two of them are compared.
    print(f"Gender Groups: {df['Gender'].unique()}")

    # Compare TotalClaims between the 'Male' and 'Female' labels only; any
    # other label in the column is not passed to the helper.
    p_val, interp = check_ttest_means(df, 'Gender', 'TotalClaims', 'Male', 'Female')
    if p_val is None:
        # The helper returned no p-value, so there is nothing to interpret.
        print("Insufficient data for Gender test.")
    else:
        print(f"T-Test P-Value: {p_val}", interp, sep="\n")

        # Unlike the other tests, the insight here fires when the result is
        # NOT significant at 5%.
        if p_val > 0.05:
            print("\nInsight: Gender is NOT a significant risk factor. Consider removing it from the rating model (fairness).")

Gender Groups: ['Not specified' 'Male' 'Female' nan]
T-Test P-Value: 0.7669656471629474
Fail to Reject Null: No significant difference in means.

Insight: Gender is NOT a significant risk factor. Consider removing it from the rating model (fairness).
