In [None]:
# Cell 1: Import necessary libraries
import pandas as pd
from scipy import stats
import numpy as np


In [None]:
# Cell 2: Load the dataset
# The rating extract is a pipe-separated text file stored relative to the notebook.
file_path = './data/MachineLearningRating_v3.txt'
df = pd.read_csv(
    file_path,
    sep='|',  # change this if the export uses another field separator
)
df.head()


In [None]:
# Cell 3: Function to calculate claim frequency, severity, and margin
def calculate_risk_metrics(df):
    """Return a copy of ``df`` with the claim risk metric columns added.

    The input frame is left untouched; the metrics are attached to a new
    DataFrame so that the raw data stays available and the function can be
    called safely on slices of a larger frame.

    Added columns:
        ClaimOccurred  -- 1 where ``TotalClaims`` > 0, otherwise 0 (a missing
                          ``TotalClaims`` compares as False and yields 0).
        ClaimFrequency -- same values as ``ClaimOccurred``.
        ClaimSeverity  -- ``TotalClaims`` on rows where a claim occurred,
                          NaN on every other row.
        Margin         -- ``TotalPremium`` minus ``TotalClaims``.

    Args:
        df: DataFrame containing ``TotalPremium`` and ``TotalClaims`` columns.

    Returns:
        A new DataFrame holding the original columns plus the four metrics.

    Raises:
        KeyError: if ``TotalPremium`` or ``TotalClaims`` is missing.
    """
    # Validate up front so a missing column never leaves a half-built result.
    missing = [col for col in ('TotalPremium', 'TotalClaims') if col not in df.columns]
    if missing:
        raise KeyError(f"calculate_risk_metrics: missing required column(s): {missing}")

    has_claim = df['TotalClaims'] > 0
    claim_flag = has_claim.astype(int)

    # assign() builds a new frame, so the caller's DataFrame is not mutated.
    return df.assign(
        ClaimOccurred=claim_flag,
        ClaimFrequency=claim_flag,
        ClaimSeverity=df['TotalClaims'].where(has_claim),
        Margin=df['TotalPremium'] - df['TotalClaims'],
    )

# Attach the derived risk metrics, then preview the inputs next to the new columns
df = calculate_risk_metrics(df)
df.loc[
    :,
    ['TotalPremium', 'TotalClaims', 'ClaimOccurred', 'ClaimFrequency', 'ClaimSeverity', 'Margin'],
].head()


In [None]:
# Cell 4: T-test for numerical comparisons
def perform_t_test(group1_data, group2_data, alpha=0.05):
    """Compare the means of two numeric samples with Welch's t-test.

    Missing values are dropped from each sample before testing. Welch's
    variant (``equal_var=False``) is used, so the two groups are not assumed
    to share a variance.

    Args:
        group1_data: pandas Series of numeric observations for the first group.
        group2_data: pandas Series of numeric observations for the second group.
        alpha: significance level used for the reject decision.

    Returns:
        Tuple ``(statistic, p_value, reject_null)``. ``reject_null`` is a
        plain ``bool``. When either sample has fewer than two non-null
        observations the test is undefined and ``(nan, nan, False)`` is
        returned.
    """
    sample1 = group1_data.dropna()
    sample2 = group2_data.dropna()

    # A sample variance needs at least two points per group; bail out with an
    # explicit "inconclusive" result instead of letting SciPy warn and emit NaN.
    if len(sample1) < 2 or len(sample2) < 2:
        return float('nan'), float('nan'), False

    stat, p_value = stats.ttest_ind(sample1, sample2, equal_var=False)  # Welch's t-test
    # A NaN p-value (e.g. zero variance in both groups) compares False, so the
    # null hypothesis is never rejected on an undefined result.
    return stat, p_value, bool(p_value < alpha)


In [None]:
# Cell 5: Chi-squared test for categorical comparisons
def perform_chi_squared_test(group1_claims_occurred, group1_no_claims,
                             group2_claims_occurred, group2_no_claims, alpha=0.05):
    """Chi-squared test of independence on a 2x2 claim / no-claim table.

    The four counts are arranged as one row per group and one column per
    outcome (claim occurred, no claim) and passed to
    ``scipy.stats.chi2_contingency`` with its default settings.

    Args:
        group1_claims_occurred: number of group-1 records with a claim.
        group1_no_claims: number of group-1 records without a claim.
        group2_claims_occurred: number of group-2 records with a claim.
        group2_no_claims: number of group-2 records without a claim.
        alpha: significance level used for the reject decision.

    Returns:
        Tuple ``(chi2_statistic, p_value, reject_null)``. ``reject_null`` is a
        plain ``bool``. When a whole row or column of the table is zero (an
        empty group, or no record with / without a claim in either group) the
        test is undefined and ``(nan, nan, False)`` is returned.

    Raises:
        ValueError: propagated from SciPy for invalid tables such as negative
            counts.
    """
    contingency_table = np.array([[group1_claims_occurred, group1_no_claims],
                                  [group2_claims_occurred, group2_no_claims]])

    # A zero row/column total makes an expected frequency zero, which
    # chi2_contingency rejects with ValueError. Report "inconclusive" instead
    # of aborting the whole analysis. Tables with negative entries are still
    # handed to SciPy so its own validation error is preserved.
    row_totals = contingency_table.sum(axis=1)
    col_totals = contingency_table.sum(axis=0)
    has_empty_margin = bool((row_totals == 0).any() or (col_totals == 0).any())
    if has_empty_margin and bool((contingency_table >= 0).all()):
        return float('nan'), float('nan'), False

    chi2_stat, p_value, _, _ = stats.chi2_contingency(contingency_table)
    return chi2_stat, p_value, bool(p_value < alpha)


In [None]:
# Cell 6: Core function to conduct pairwise risk comparison
def conduct_hypothesis_test(df, group_col, alpha=0.05):
    """Compare the two most frequent groups of ``group_col`` on three risk metrics.

    The two groups with the most rows are selected, then compared on:
      1. claim frequency -- chi-squared test on the claim / no-claim counts,
      2. claim severity  -- t-test on the ``ClaimSeverity`` column,
      3. margin          -- t-test on the ``Margin`` column.

    Args:
        df: DataFrame holding ``group_col`` plus the ``ClaimOccurred``,
            ``ClaimSeverity`` and ``Margin`` columns.
        group_col: name of the column that defines the groups.
        alpha: significance level forwarded to each test.

    Returns:
        Dict keyed by ``'<Metric>_<group1>_vs_<group2>'``; each value holds
        ``p_value``, ``reject_null`` and a sentence-style ``interpretation``.
        An empty dict is returned (after printing a notice) when fewer than
        two groups are available.
    """
    # unique() also counts a missing value as a group ...
    if len(df[group_col].unique()) < 2:
        print(f"Not enough groups in '{group_col}' to perform tests.")
        return {}

    # ... whereas value_counts() ignores missing values, hence the second guard.
    ranked_groups = df[group_col].value_counts().index.tolist()
    if len(ranked_groups) < 2:
        print(f"Insufficient groups for {group_col} comparison.")
        return {}

    g1_name, g2_name = ranked_groups[:2]
    cohort_1 = df[df[group_col] == g1_name]
    cohort_2 = df[df[group_col] == g2_name]

    # 1. Claim frequency: rows with a claim versus rows without, per group
    claims_1 = cohort_1['ClaimOccurred'].sum()
    claims_2 = cohort_2['ClaimOccurred'].sum()
    _, p_freq, reject_freq = perform_chi_squared_test(
        claims_1, len(cohort_1) - claims_1,
        claims_2, len(cohort_2) - claims_2,
        alpha,
    )

    # 2. Claim severity
    _, p_sev, reject_sev = perform_t_test(
        cohort_1['ClaimSeverity'], cohort_2['ClaimSeverity'], alpha)

    # 3. Margin
    _, p_margin, reject_margin = perform_t_test(
        cohort_1['Margin'], cohort_2['Margin'], alpha)

    return {
        f'Claim_Frequency_{g1_name}_vs_{g2_name}': {
            'p_value': p_freq,
            'reject_null': reject_freq,
            'interpretation': f"Claim frequency difference between {g1_name} and {g2_name} is {'significant' if reject_freq else 'not significant'}"
        },
        f'Claim_Severity_{g1_name}_vs_{g2_name}': {
            'p_value': p_sev,
            'reject_null': reject_sev,
            'interpretation': f"Claim severity difference between {g1_name} and {g2_name} is {'significant' if reject_sev else 'not significant'}"
        },
        f'Margin_Difference_{g1_name}_vs_{g2_name}': {
            'p_value': p_margin,
            'reject_null': reject_margin,
            'interpretation': f"Margin difference between {g1_name} and {g2_name} is {'significant' if reject_margin else 'not significant'}"
        },
    }


In [None]:
# Cell 7: Province test
def test_province_risk_differences(df, alpha=0.05):
    print("\n--- Testing Risk Differences Across Provinces ---")
    province_results = {}
    top_provinces = df['Province'].value_counts().index.tolist()
    if len(top_provinces) >= 2:
        p1, p2 = top_provinces[0], top_provinces[1]
        print(f"Comparing {p1} vs {p2}...")
        province_results.update(conduct_hypothesis_test(df[df['Province'].isin([p1, p2])], 'Province', alpha))
    else:
        print("Not enough unique provinces for comparison.")
    return province_results


In [None]:
# Cell 8: Zipcode risk difference test
def test_zipcode_risk_differences(df, alpha=0.05):
    print("\n--- Testing Risk Differences Between Zipcodes ---")
    zipcode_results = {}
    top_zipcodes = df['PostalCode'].value_counts().head(2).index.tolist()
    if len(top_zipcodes) >= 2:
        z1, z2 = top_zipcodes[0], top_zipcodes[1]
        print(f"Comparing {z1} vs {z2}...")
        zipcode_results.update(conduct_hypothesis_test(df[df['PostalCode'].isin([z1, z2])], 'PostalCode', alpha))
    else:
        print("Not enough unique zipcodes for comparison.")
    return zipcode_results


In [None]:
# Cell 9: Zipcode margin test
def test_zipcode_margin_differences(df, alpha=0.05):
    print("\n--- Testing Margin Differences Between Zipcodes ---")
    margin_results = {}
    top_zipcodes = df['PostalCode'].value_counts().head(2).index.tolist()
    if len(top_zipcodes) >= 2:
        z1, z2 = top_zipcodes[0], top_zipcodes[1]
        print(f"Comparing Margin for {z1} vs {z2}...")
        g1_margin = df[df['PostalCode'] == z1]['Margin']
        g2_margin = df[df['PostalCode'] == z2]['Margin']
        stat_margin, p_margin, reject_margin = perform_t_test(g1_margin, g2_margin, alpha)
        margin_results[f'Margin_Difference_{z1}_vs_{z2}'] = {
            'p_value': p_margin,
            'reject_null': reject_margin,
            'interpretation': f"Margin difference between {z1} and {z2} is {'significant' if reject_margin else 'not significant'}"
        }
    else:
        print("Not enough unique zipcodes for margin comparison.")
    return margin_results


In [None]:
# Cell 10: Gender test
def test_gender_risk_differences(df, alpha=0.05):
    print("\n--- Testing Risk Differences Between Women and Men ---")
    gender_results = {}
    if 'Female' in df['Gender'].unique() and 'Male' in df['Gender'].unique():
        print("Comparing Female vs Male...")
        gender_results.update(conduct_hypothesis_test(df[df['Gender'].isin(['Female', 'Male'])], 'Gender', alpha))
    else:
        print("Gender categories 'Female'/'Male' not found or insufficient for comparison.")
    return gender_results


In [None]:
# Cell 11: Run all tests and display results
# Run every comparison; each call prints its own progress banner.
province_results = test_province_risk_differences(df)
zipcode_results = test_zipcode_risk_differences(df)
zipcode_margin_results = test_zipcode_margin_differences(df)
gender_results = test_gender_risk_differences(df)

# Merge in call order; when two result sets share a key, the later one wins.
all_results = {}
all_results.update(province_results)
all_results.update(zipcode_results)
all_results.update(zipcode_margin_results)
all_results.update(gender_results)

# Report each test name followed by its result dictionary.
for test, result in all_results.items():
    print(f"\n{test}:")
    print(result)
