# Hypothesis Analysis

## Import modules

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from scripts.data_loader import DataLoader

In [2]:
# Loads a dataset from a CSV file.
dataLoader = DataLoader()
df = dataLoader.load_csv('../data/cleaned/CleanedMachineLearningRating_v3.csv')

  data = pd.read_csv(csv_path)


Loaded data from ../data/cleaned/CleanedMachineLearningRating_v3.csv


In [3]:
# Create the 'HasClaim' column
# If 'TotalClaims' > 0, set 'HasClaim' to 1, otherwise set it to 0
df['HasClaim'] = (df['TotalClaims'] > 0).astype(int)

--- Data Segmentaion and Metrics Selection ---

Metrics:
- **Claim Frequency**: mean of 'HasClaim'
- **Claim Severity**: mean of 'TotalClaims' where 'HasClaims' is 1
- **Margin**: mean of 'TotalPremium' - mean of 'TotalClaims'

In [4]:
# --- Statistical Testing ---

# H₀:There are no risk differences across provinces
# For Claim Frequency (categorical outcome), we can use Chi-Squared test.
# For Claim Severity (numerical outcome), we can use ANOVA 

# Example for Claim Frequency across provinces using Chi-Squared
if 'Province' in df.columns and 'HasClaim' in df.columns:
  contingency_table_province_claim = pd.crosstab(df['Province'], df['HasClaim'])
  chi2_province_claim, p_province_claim, dof_province_claim, expected_province_claim = stats.chi2_contingency(contingency_table_province_claim)
  print(f"\nH₀: No risk differences across provinces (Claim Frequency)")
  print(f"Chi-squared statistic: {chi2_province_claim:.4f}")
  print(f"P-value: {p_province_claim:.4f}")
  print(f"Degrees of Freedom: {dof_province_claim}")
  print()

  # Interpretation for Claim Frequency across provinces
  if p_province_claim < 0.05:
    print("✅ Reject H₀: There are statistically significant differences in claim frequency across provinces.")
  else:
    print("❌ Fail to reject H₀: There is no statistically significant difference in claim frequency across provinces.")

  # Show the expected frequencies
  print(f"\nExpected Frequencies:")
  expected_df = pd.DataFrame(expected_province_claim, 
                            index=contingency_table_province_claim.index, 
                            columns=contingency_table_province_claim.columns)
  print(expected_df.round(2))

  # Example for Claim Severity across provinces (assuming at least 2 provinces and numerical 'TotalClaims')
  if 'TotalClaims' in df.columns:
    claim_severity_by_province = {}
    for province in df['Province'].unique():
      claims_in_province = df[(df['Province'] == province) & (df['HasClaim'] == 1)]['TotalClaims']
      if len(claims_in_province) > 0:
        claim_severity_by_province[province] = claims_in_province

    fvalue_province_severity, p_province_severity = stats.f_oneway(*claim_severity_by_province.values())
    print(f"\nH₀: No risk differences across provinces (Claim Severity - ANOVA)")
    print(f"F-statistic: {fvalue_province_severity:.4f}")
    print(f"P-value: {p_province_severity:.4f}")
    print()

    if p_province_severity < 0.05:
      print("✅ Reject H₀: There are statistically significant differences in claim severity across provinces.")
    else:
      print("❌ Fail to reject H₀: There is no statistically significant difference in claim severity across provinces.")


H₀: No risk differences across provinces (Claim Frequency)
Chi-squared statistic: 104.1909
P-value: 0.0000
Degrees of Freedom: 8

✅ Reject H₀: There are statistically significant differences in claim frequency across provinces.

Expected Frequencies:
HasClaim               0        1
Province                         
Eastern Cape    30251.43    84.57
Free State       8076.42    22.58
Gauteng        392767.01  1097.99
KwaZulu-Natal  169307.70   473.30
Limpopo         24766.76    69.24
Mpumalanga      52571.04   146.96
North West     142887.55   399.45
Northern Cape    6362.21    17.79
Western Cape   170319.87   476.13

H₀: No risk differences across provinces (Claim Severity - ANOVA)
F-statistic: 4.8302
P-value: 0.0000

✅ Reject H₀: There are statistically significant differences in claim severity across provinces.


In [5]:
# H₀:There are no risk differences between postal codes

# Example for Claim Frequency between two specific postal codes
if 'PostalCode' in df.columns and 'HasClaim' in df.columns:
  # Select two postal codes for comparison
  postal_codes_to_compare = df['PostalCode'].unique()
  if len(postal_codes_to_compare) >= 2:
    postal_code_a = postal_codes_to_compare[2]
    postal_code_b = postal_codes_to_compare[13]

    df_postal_a = df[df['PostalCode'] == postal_code_a]
    df_postal_b = df[df['PostalCode'] == postal_code_b]

    contingency_table_postal_claim = pd.crosstab(df['PostalCode'], df['HasClaim']).loc[[postal_code_a, postal_code_b]]
    print(contingency_table_postal_claim)
    chi2_postal_claim, p_postal_claim, dof_postal_claim, expected_postal_claim = stats.chi2_contingency(contingency_table_postal_claim)

    print(f"\nH₀: No risk differences between Postal Code {postal_code_a} and {postal_code_b} (Claim Frequency)")
    print(f"Chi-squared statistic: {chi2_postal_claim:.4f}")
    print(f"P-value: {p_postal_claim:.4f}")
    print()
    
    # Interpretation
    if p_postal_claim < 0.05:
      print(f"✅ Reject H₀: There are statistically significant differences in claim frequency between Postal Code {postal_code_a} and {postal_code_b}.")
    else:
      print(f"❌ Fail to reject H₀: There is no statistically significant difference in claim frequency between Postal Code {postal_code_a} and {postal_code_b}.")

HasClaim       0  1
PostalCode         
1619        2108  2
2410        4122  7

H₀: No risk differences between Postal Code 1619 and 2410 (Claim Frequency)
Chi-squared statistic: 0.1470
P-value: 0.7014

❌ Fail to reject H₀: There is no statistically significant difference in claim frequency between Postal Code 1619 and 2410.


In [6]:
# H₀:There are no significant margin (profit) difference between postal codes

# Example for Margin between two specific postal codes
if 'PostalCode' in df.columns and 'TotalPremium' in df.columns and 'TotalClaims' in df.columns:
  df['Margin'] = df['TotalPremium'] - df['TotalClaims']

  postal_codes_to_compare = df['PostalCode'].unique()
  if len(postal_codes_to_compare) >= 2:
    postal_code_a = postal_codes_to_compare[2]
    postal_code_b = postal_codes_to_compare[13]

    margin_postal_a = df[df['PostalCode'] == postal_code_a]['Margin']
    margin_postal_b = df[df['PostalCode'] == postal_code_b]['Margin']

    # Perform t-test for independent samples
    tstat_postal_margin, p_postal_margin = stats.ttest_ind(margin_postal_a, margin_postal_b)

    print(f"\nH₀: No significant margin difference between Postal Code {postal_code_a} and {postal_code_b}")
    print(f"T-statistic: {tstat_postal_margin:.4f}")
    print(f"P-value: {p_postal_margin:.4f}")
    print()
    
    # Interpretation
    if p_postal_margin < 0.05:
      print(f"✅ Reject H₀: There is a statistically significant difference in margin between Postal Code {postal_code_a} and {postal_code_b}.")
    else:
      print(f"❌ Fail to reject H₀: There is no statistically significant difference in margin between Postal Code {postal_code_a} and {postal_code_b}.")



H₀: No significant margin difference between Postal Code 1619 and 2410
T-statistic: -0.3568
P-value: 0.7212

❌ Fail to reject H₀: There is no statistically significant difference in margin between Postal Code 1619 and 2410.


In [7]:
# H₀:There are not significant risk difference between Female and Male

# Example for Claim Frequency between genders using Chi-Squared
if 'Gender' in df.columns and 'HasClaim' in df.columns:
  contingency_table_gender_claim = pd.crosstab(df['Gender'], df['HasClaim'])
  print("Contingency Table:")
  print(contingency_table_gender_claim)
  print()

  chi2_gender_claim, p_gender_claim, dof_gender_claim, expected_gender_claim = stats.chi2_contingency(contingency_table_gender_claim)
  print(f"\nH₀: No significant risk difference between Female and Male (Claim Frequency)")
  print(f"Chi-squared statistic: {chi2_gender_claim:.4f}")
  print(f"P-value: {p_gender_claim:.4f}")
  print()
  
  # Interpretation
  if p_gender_claim < 0.05:
    print("✅ Reject H₀: There is a statistically significant difference in claim frequency between Female and Male.")
  else:
    print("❌ Fail to reject H₀: There is no statistically significant difference in claim frequency between Female and Male.")

# Example for Claim Severity between genders using t-test
if 'Gender' in df.columns and 'TotalClaims' in df.columns and 'HasClaim' in df.columns:
  claims_female = df[(df['Gender'] == 'Female') & (df['HasClaim'] == 1)]['TotalClaims']
  claims_male = df[(df['Gender'] == 'Male') & (df['HasClaim'] == 1)]['TotalClaims']

  if len(claims_female) > 0 and len(claims_male) > 0:
    tstat_gender_severity, p_gender_severity = stats.ttest_ind(claims_female, claims_male)
    print(f"\nH₀: No significant risk difference between Female and Male (Claim Severity - t-test)")
    print(f"T-statistic: {tstat_gender_severity:.4f}")
    print(f"P-value: {p_gender_severity:.4f}")
    print()

    # Interpretation
    if p_gender_severity < 0.05:
      print("✅ Reject H₀: There is a statistically significant difference in claim severity between Female and Male.")
    else:
      print("❌ Fail to reject H₀: There is no statistically significant difference in claim severity between Female and Male.")
  elif len(claims_female) == 0 and len(claims_male) == 0:
    print("\nNo claims reported for either Female or Male to compare claim severity.")
  elif len(claims_female) == 0:
    print("\nNo claims reported for Female to compare claim severity.")
  else: # len(claims_male) == 0
    print("\nNo claims reported for Male to compare claim severity.")

  print(f"\nDescriptive Statistics:")
  print(f"Male average claim: ${claims_male.mean():.2f}")
  print(f"Female average claim: ${claims_female.mean():.2f}")
  print(f"Difference: ${claims_male.mean() - claims_female.mean():.2f}")


Contingency Table:
HasClaim            0     1
Gender                     
Female          65603   130
Male           930899  2656
Not specified     808     2


H₀: No significant risk difference between Female and Male (Claim Frequency)
Chi-squared statistic: 16.6474
P-value: 0.0002

✅ Reject H₀: There is a statistically significant difference in claim frequency between Female and Male.

H₀: No significant risk difference between Female and Male (Claim Severity - t-test)
T-statistic: -0.0484
P-value: 0.9614

❌ Fail to reject H₀: There is no statistically significant difference in claim severity between Female and Male.

Descriptive Statistics:
Male average claim: $23269.46
Female average claim: $23101.23
Difference: $168.24


In [8]:
# Aggregate at policy level

policy_agg = df.groupby('PolicyID').agg(
    Province=('Province', 'first'),  # assume policy only belongs to one province
    PostalCode=('PostalCode', 'first'),
    Gender=('Gender', 'first'),
    TotalPremium=('TotalPremium', 'first'),
    TotalClaims=('TotalClaims', 'sum')
).reset_index()

policy_agg['HasClaim'] = (policy_agg['TotalClaims'] > 0).astype(int)
policy_agg['Margin'] = policy_agg['TotalPremium'] - policy_agg['TotalClaims']

claims = df[df['TotalClaims'] > 0]  # individual claims data


In [9]:
# --- Statistical Testing ---

# H₀:There are no risk differences across provinces
# For Claim Frequency (categorical outcome), we can use Chi-Squared test.
# For Claim Severity (numerical outcome), we can use ANOVA 

# Example for Claim Frequency across provinces using Chi-Squared
if 'Province' in policy_agg.columns and 'HasClaim' in policy_agg.columns:
  contingency_table_province_claim = pd.crosstab(policy_agg['Province'], policy_agg['HasClaim'])
  chi2_province_claim, p_province_claim, dof_province_claim, expected_province_claim = stats.chi2_contingency(contingency_table_province_claim)
  print(f"\nH₀: No risk differences across provinces (Claim Frequency)")
  print(f"Chi-squared statistic: {chi2_province_claim:.4f}")
  print(f"P-value: {p_province_claim:.4f}")
  print()

  # Interpretation for Claim Frequency across provinces
  if p_province_claim < 0.05:
    print("✅ Reject H₀: There are statistically significant differences in claim frequency across provinces.")
  else:
    print("❌ Fail to reject H₀: There is no statistically significant difference in claim frequency across provinces.")

  # Show the expected frequencies
  print(f"\nExpected Frequencies:")
  expected_df = pd.DataFrame(expected_province_claim, 
                            index=contingency_table_province_claim.index, 
                            columns=contingency_table_province_claim.columns)
  print(expected_df.round(2))

  # Example for Claim Severity across provinces (assuming at least 2 provinces and numerical 'TotalClaims')
  if 'TotalClaims' in policy_agg.columns:
    claim_severity_by_province = {}
    for province in policy_agg['Province'].unique():
      claims_in_province = policy_agg[(policy_agg['Province'] == province) & (policy_agg['HasClaim'] == 1)]['TotalClaims']
      if len(claims_in_province) > 0:
        claim_severity_by_province[province] = claims_in_province

    # Perform ANOVA test
    fvalue_province_severity, p_province_severity = stats.f_oneway(*claim_severity_by_province.values())
    print(f"\nH₀: No risk differences across provinces (Claim Severity - ANOVA)")
    print(f"F-statistic: {fvalue_province_severity:.4f}")
    print(f"P-value: {p_province_severity:.4f}")
    print()


H₀: No risk differences across provinces (Claim Frequency)
Chi-squared statistic: 35.7951
P-value: 0.0000

✅ Reject H₀: There are statistically significant differences in claim frequency across provinces.

Expected Frequencies:
HasClaim             0       1
Province                      
Eastern Cape    254.24   64.76
Free State       43.04   10.96
Gauteng        2053.87  523.13
KwaZulu-Natal  1090.30  277.70
Limpopo         144.26   36.74
Mpumalanga      304.45   77.55
North West      890.25  226.75
Northern Cape    43.84   11.16
Western Cape    754.76  192.24

H₀: No risk differences across provinces (Claim Severity - ANOVA)
F-statistic: 5.3504
P-value: 0.0000



In [10]:
# H₀:There are no risk differences between postal codes

# Example for Claim Frequency between two specific postal codes 
if 'PostalCode' in policy_agg.columns and 'HasClaim' in policy_agg.columns:
  # Select two postal codes for comparison
  postal_codes_to_compare = policy_agg['PostalCode'].unique()
  if len(postal_codes_to_compare) >= 2:
    postal_code_a = postal_codes_to_compare[2]
    postal_code_b = postal_codes_to_compare[13]

    policy_agg_postal_a = policy_agg[policy_agg['PostalCode'] == postal_code_a]
    policy_agg_postal_a = policy_agg[policy_agg['PostalCode'] == postal_code_b]

    contingency_table_postal_claim = pd.crosstab(policy_agg['PostalCode'], policy_agg['HasClaim']).loc[[postal_code_a, postal_code_b]]
    print(contingency_table_postal_claim)
    chi2_postal_claim, p_postal_claim, dof_postal_claim, expected_postal_claim = stats.chi2_contingency(contingency_table_postal_claim)

    print(f"\nH₀: No risk differences between Postal Code {postal_code_a} and {postal_code_b} (Claim Frequency)")
    print(f"Chi-squared statistic: {chi2_postal_claim:.4f}")
    print(f"P-value: {p_postal_claim:.4f}")
    print()
    
    # Interpretation
    if p_postal_claim < 0.05:
      print(f"✅ Reject H₀: There are statistically significant differences in claim frequency between Postal Code {postal_code_a} and {postal_code_b}.")
    else:
      print(f"❌ Fail to reject H₀: There is no statistically significant difference in claim frequency between Postal Code {postal_code_a} and {postal_code_b}.")

HasClaim     0  1
PostalCode       
7888        14  0
2735         8  1

H₀: No risk differences between Postal Code 7888 and 2735 (Claim Frequency)
Chi-squared statistic: 0.0519
P-value: 0.8199

❌ Fail to reject H₀: There is no statistically significant difference in claim frequency between Postal Code 7888 and 2735.


In [11]:
# H₀:There are no significant margin (profit) difference between postal codes.

# Example for Margin between two specific postal codes
if 'PostalCode' in policy_agg.columns and 'TotalPremium' in policy_agg.columns and 'TotalClaims' in policy_agg.columns:
  policy_agg['Margin'] = policy_agg['TotalPremium'] - policy_agg['TotalClaims']

  postal_codes_to_compare = policy_agg['PostalCode'].unique()
  if len(postal_codes_to_compare) >= 2:
    postal_code_a = postal_codes_to_compare[2]
    postal_code_b = postal_codes_to_compare[13]

    margin_postal_a = policy_agg[policy_agg['PostalCode'] == postal_code_a]['Margin']
    margin_postal_b = policy_agg[policy_agg['PostalCode'] == postal_code_b]['Margin']

    # Perform t-test for independent samples
    tstat_postal_margin, p_postal_margin = stats.ttest_ind(margin_postal_a, margin_postal_b)

    print(f"\nH₀: No significant margin difference between Postal Code {postal_code_a} and {postal_code_b}")
    print(f"T-statistic: {tstat_postal_margin:.4f}")
    print(f"P-value: {p_postal_margin:.4f}")
    print()
    
    # Interpretation
    if p_postal_margin < 0.05:
      print(f"✅ Reject H₀: There is a statistically significant difference in margin between Postal Code {postal_code_a} and {postal_code_b}.")
    else:
      print(f"❌ Fail to reject H₀: There is no statistically significant difference in margin between Postal Code {postal_code_a} and {postal_code_b}.")



H₀: No significant margin difference between Postal Code 7888 and 2735
T-statistic: 1.3650
P-value: 0.1867

❌ Fail to reject H₀: There is no statistically significant difference in margin between Postal Code 7888 and 2735.


In [12]:
# H₀:There are not significant risk difference between Female and Male

# Example for Claim Frequency between genders using Chi-Squared
if 'Gender' in policy_agg.columns and 'HasClaim' in policy_agg.columns:
  contingency_table_gender_claim = pd.crosstab(policy_agg['Gender'], policy_agg['HasClaim'])
  print("Contingency Table:")
  print(contingency_table_gender_claim)
  print()

  chi2_gender_claim, p_gender_claim, dof_gender_claim, expected_gender_claim = stats.chi2_contingency(contingency_table_gender_claim)
  print(f"\nH₀: No significant risk difference between Female and Male (Claim Frequency)")
  print(f"Chi-squared statistic: {chi2_gender_claim:.4f}")
  print(f"P-value: {p_gender_claim:.4f}")

  # Interpretation
  if p_gender_claim < 0.05:
    print("✅ Reject H₀: There is a statistically significant difference in claim frequency between Female and Male.")
  else:
    print("❌ Fail to reject H₀: There is no statistically significant difference in claim frequency between Female and Male.")

# Example for Claim Severity between genders using t-test
if 'Gender' in policy_agg.columns and 'TotalClaims' in policy_agg.columns and 'HasClaim' in policy_agg.columns:
  claims_female = policy_agg[(policy_agg['Gender'] == 'Female') & (policy_agg['HasClaim'] == 1)]['TotalClaims']
  claims_male = policy_agg[(policy_agg['Gender'] == 'Male') & (policy_agg['HasClaim'] == 1)]['TotalClaims']

  if len(claims_female) > 0 and len(claims_male) > 0:
    tstat_gender_severity, p_gender_severity = stats.ttest_ind(claims_female, claims_male)
    print(f"\nH₀: No significant risk difference between Female and Male (Claim Severity - t-test)")
    print(f"T-statistic: {tstat_gender_severity:.4f}")
    print(f"P-value: {p_gender_severity:.4f}")
    print()

    # Interpretation
    if p_gender_severity < 0.05:
      print("✅ Reject H₀: There is a statistically significant difference in claim severity between Female and Male.")
    else:
      print("❌ Fail to reject H₀: There is no statistically significant difference in claim severity between Female and Male.")
  elif len(claims_female) == 0 and len(claims_male) == 0:
    print("\nNo claims reported for either Female or Male to compare claim severity.")
  elif len(claims_female) == 0:
    print("\nNo claims reported for Female to compare claim severity.")
  else: # len(claims_male) == 0
    print("\nNo claims reported for Male to compare claim severity.")

  print(f"\nDescriptive Statistics:")
  print(f"Male average claim: ${claims_male.mean():.2f}")
  print(f"Female average claim: ${claims_female.mean():.2f}")
  print(f"Difference: ${claims_male.mean() - claims_female.mean():.2f}")

Contingency Table:
HasClaim          0     1
Gender                   
Female          433    92
Male           5143  1328
Not specified     3     1


H₀: No significant risk difference between Female and Male (Claim Frequency)
Chi-squared statistic: 2.7533
P-value: 0.2524
❌ Fail to reject H₀: There is no statistically significant difference in claim frequency between Female and Male.

H₀: No significant risk difference between Female and Male (Claim Severity - t-test)
T-statistic: -1.9596
P-value: 0.0502

❌ Fail to reject H₀: There is no statistically significant difference in claim severity between Female and Male.

Descriptive Statistics:
Male average claim: $46529.89
Female average claim: $32609.67
Difference: $13920.22
