# Hypothesis Analysis

## Import modules

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from scripts.data_loader import DataLoader

In [2]:
# Loads a dataset from a CSV file.
dataLoader = DataLoader()
df = dataLoader.load_csv('../data/cleaned/CleanedMachineLearningRating_v3.csv')

  data = pd.read_csv(csv_path)


Loaded data from ../data/cleaned/CleanedMachineLearningRating_v3.csv


In [3]:
# Create the 'HasClaim' column
# If 'TotalClaims' > 0, set 'HasClaim' to 1, otherwise set it to 0
df['HasClaim'] = (df['TotalClaims'] > 0).astype(int)

--- Data Segmentation and Metrics Selection ---

Metrics:
Claim Frequency: mean of 'HasClaim'
Claim Severity: mean of 'TotalClaims' where 'HasClaims' is 1
Margin: mean of 'TotalPremium' - mean of 'TotalClaims'

Since the data is already provided, we'll perform the analysis based on the columns.
For the purpose of demonstrating the A/B testing framework, we'll treat
each category within the features (Province, PostalCode, Gender) as separate "groups"
and perform pairwise comparisons where appropriate for demonstration.

In [4]:
# --- Statistical Testing ---

# H₀:There are no risk differences across provinces
# We can compare Claim Frequency and Claim Severity across provinces.
# For Claim Frequency (categorical outcome), we can use Chi-Squared test.
# For Claim Severity (numerical outcome), we can use ANOVA (if comparing more than 2 provinces) or t-test (if comparing 2 provinces).

# Example for Claim Frequency across provinces using Chi-Squared
if 'Province' in df.columns and 'HasClaim' in df.columns:
  contingency_table_province_claim = pd.crosstab(df['Province'], df['HasClaim'])
  chi2_province_claim, p_province_claim, dof_province_claim, expected_province_claim = stats.chi2_contingency(contingency_table_province_claim)
  print(f"\nH₀: No risk differences across provinces (Claim Frequency)")
  print(f"Chi-squared statistic: {chi2_province_claim:.4f}")
  print(f"P-value: {p_province_claim:.4f}")

  # Interpretation for Claim Frequency across provinces
  if p_province_claim < 0.05:
    print("Reject H₀: There are statistically significant differences in claim frequency across provinces.")
  else:
    print("Fail to reject H₀: There is no statistically significant difference in claim frequency across provinces.")

  # Example for Claim Severity across provinces (assuming at least 2 provinces and numerical 'TotalClaims')
  # This requires filtering for policies with claims and then comparing the 'TotalClaims'.
  if 'TotalClaims' in df.columns:
    claim_severity_by_province = {}
    for province in df['Province'].unique():
      claims_in_province = df[(df['Province'] == province) & (df['HasClaim'] == 1)]['TotalClaims']
      if len(claims_in_province) > 0:
        claim_severity_by_province[province] = claims_in_province

    if len(claim_severity_by_province) > 1:
      # Perform ANOVA if more than 2 provinces
      if len(claim_severity_by_province) > 2:
        fvalue_province_severity, p_province_severity = stats.f_oneway(*claim_severity_by_province.values())
        print(f"\nH₀: No risk differences across provinces (Claim Severity - ANOVA)")
        print(f"F-statistic: {fvalue_province_severity:.4f}")
        print(f"P-value: {p_province_severity:.4f}")
        if p_province_severity < 0.05:
          print("Reject H₀: There are statistically significant differences in claim severity across provinces.")
        else:
          print("Fail to reject H₀: There is no statistically significant difference in claim severity across provinces.")
      # Perform t-test if exactly 2 provinces
      elif len(claim_severity_by_province) == 2:
        province1 = list(claim_severity_by_province.keys())[0]
        province2 = list(claim_severity_by_province.keys())[1]
        tstat_province_severity, p_province_severity = stats.ttest_ind(claim_severity_by_province[province1], claim_severity_by_province[province2])
        print(f"\nH₀: No risk differences between {province1} and {province2} (Claim Severity - t-test)")
        print(f"T-statistic: {tstat_province_severity:.4f}")
        print(f"P-value: {p_province_severity:.4f}")
        if p_province_severity < 0.05:
          print(f"Reject H₀: There is a statistically significant difference in claim severity between {province1} and {province2}.")
        else:
          print(f"Fail to reject H₀: There is no statistically significant difference in claim severity between {province1} and {province2}.")



H₀: No risk differences across provinces (Claim Frequency)
Chi-squared statistic: 104.1909
P-value: 0.0000
Reject H₀: There are statistically significant differences in claim frequency across provinces.

H₀: No risk differences across provinces (Claim Severity - ANOVA)
F-statistic: 4.8302
P-value: 0.0000
Reject H₀: There are statistically significant differences in claim severity across provinces.


In [5]:
# H₀:There are no risk differences between postal codes
# This can be done similarly to provinces, potentially comparing specific postal codes or groups of postal codes.
# Due to the potential large number of postal codes, comparing all pairs might not be feasible.
# We'll demonstrate for a subset or a common comparison.

# Example for Claim Frequency between two specific postal codes (replace 'PostalCode1' and 'PostalCode2' with actual postal codes)
if 'PostalCode' in df.columns and 'HasClaim' in df.columns:
  # Select two postal codes for comparison
  postal_codes_to_compare = df['PostalCode'].unique()
  if len(postal_codes_to_compare) >= 2:
    postal_code_a = postal_codes_to_compare[10]
    postal_code_b = postal_codes_to_compare[13]

    df_postal_a = df[df['PostalCode'] == postal_code_a]
    df_postal_b = df[df['PostalCode'] == postal_code_b]

    contingency_table_postal_claim = pd.crosstab(df['PostalCode'], df['HasClaim']).loc[[postal_code_a, postal_code_b]]
    print(contingency_table_postal_claim)
    chi2_postal_claim, p_postal_claim, dof_postal_claim, expected_postal_claim = stats.chi2_contingency(contingency_table_postal_claim)

    print(f"\nH₀: No risk differences between Postal Code {postal_code_a} and {postal_code_b} (Claim Frequency)")
    print(f"Chi-squared statistic: {chi2_postal_claim:.4f}")
    print(f"P-value: {p_postal_claim:.4f}")

    # Interpretation
    if p_postal_claim < 0.05:
      print(f"Reject H₀: There are statistically significant differences in claim frequency between Postal Code {postal_code_a} and {postal_code_b}.")
    else:
      print(f"Fail to reject H₀: There is no statistically significant difference in claim frequency between Postal Code {postal_code_a} and {postal_code_b}.")

HasClaim         0    1
PostalCode             
2000        133012  486
2410          4122    7

H₀: No risk differences between Postal Code 2000 and 2410 (Claim Frequency)
Chi-squared statistic: 3.7182
P-value: 0.0538
Fail to reject H₀: There is no statistically significant difference in claim frequency between Postal Code 2000 and 2410.


In [6]:
# H₀:There are no significant margin (profit) difference between postal codes
# We can compare the mean margin between postal codes using t-tests (for 2 postal codes) or ANOVA (for more than 2).

# Example for Margin between two specific postal codes
if 'PostalCode' in df.columns and 'TotalPremium' in df.columns and 'TotalClaims' in df.columns:
  df['Margin'] = df['TotalPremium'] - df['TotalClaims']

  postal_codes_to_compare = df['PostalCode'].unique()
  if len(postal_codes_to_compare) >= 2:
    postal_code_a = postal_codes_to_compare[10]
    postal_code_b = postal_codes_to_compare[13]

    margin_postal_a = df[df['PostalCode'] == postal_code_a]['Margin']
    margin_postal_b = df[df['PostalCode'] == postal_code_b]['Margin']

    # Perform t-test for independent samples
    tstat_postal_margin, p_postal_margin = stats.ttest_ind(margin_postal_a, margin_postal_b)

    print(f"\nH₀: No significant margin difference between Postal Code {postal_code_a} and {postal_code_b}")
    print(f"T-statistic: {tstat_postal_margin:.4f}")
    print(f"P-value: {p_postal_margin:.4f}")

    # Interpretation
    if p_postal_margin < 0.05:
      print(f"Reject H₀: There is a statistically significant difference in margin between Postal Code {postal_code_a} and {postal_code_b}.")
    else:
      print(f"Fail to reject H₀: There is no statistically significant difference in margin between Postal Code {postal_code_a} and {postal_code_b}.")



H₀: No significant margin difference between Postal Code 2000 and 2410
T-statistic: -0.8669
P-value: 0.3860
Fail to reject H₀: There is no statistically significant difference in margin between Postal Code 2000 and 2410.


In [7]:
# H₀:There are not significant risk difference between Women and Men
# We'll compare Claim Frequency and Claim Severity between genders.

# Example for Claim Frequency between genders using Chi-Squared
if 'Gender' in df.columns and 'HasClaim' in df.columns:
  contingency_table_gender_claim = pd.crosstab(df['Gender'], df['HasClaim'])
  chi2_gender_claim, p_gender_claim, dof_gender_claim, expected_gender_claim = stats.chi2_contingency(contingency_table_gender_claim)
  print(f"\nH₀: No significant risk difference between Women and Men (Claim Frequency)")
  print(f"Chi-squared statistic: {chi2_gender_claim:.4f}")
  print(f"P-value: {p_gender_claim:.4f}")

  # Interpretation
  if p_gender_claim < 0.05:
    print("Reject H₀: There is a statistically significant difference in claim frequency between Women and Men.")
  else:
    print("Fail to reject H₀: There is no statistically significant difference in claim frequency between Women and Men.")

# Example for Claim Severity between genders using t-test
if 'Gender' in df.columns and 'TotalClaims' in df.columns and 'HasClaim' in df.columns:
  claims_women = df[(df['Gender'] == 'Women') & (df['HasClaim'] == 1)]['TotalClaims']
  claims_men = df[(df['Gender'] == 'Men') & (df['HasClaim'] == 1)]['TotalClaims']

  if len(claims_women) > 0 and len(claims_men) > 0:
    tstat_gender_severity, p_gender_severity = stats.ttest_ind(claims_women, claims_men)
    print(f"\nH₀: No significant risk difference between Women and Men (Claim Severity - t-test)")
    print(f"T-statistic: {tstat_gender_severity:.4f}")
    print(f"P-value: {p_gender_severity:.4f}")

    # Interpretation
    if p_gender_severity < 0.05:
      print("Reject H₀: There is a statistically significant difference in claim severity between Women and Men.")
    else:
      print("Fail to reject H₀: There is no statistically significant difference in claim severity between Women and Men.")
  elif len(claims_women) == 0 and len(claims_men) == 0:
    print("\nNo claims reported for either Women or Men to compare claim severity.")
  elif len(claims_women) == 0:
    print("\nNo claims reported for Women to compare claim severity.")
  else: # len(claims_men) == 0
    print("\nNo claims reported for Men to compare claim severity.")

# --- Analyze and Report ---

# This section is for interpreting the results from the statistical tests.
# Based on the p-values calculated above, you would summarize whether each
# null hypothesis was rejected or failed to be rejected.
# For example:
# - For H₀: No risk differences across provinces (Claim Frequency): If p < 0.05, state that there's evidence of risk differences by province.
# - For H₀: No significant margin difference between postal codes: If p >= 0.05, state that the margin differences between the compared postal codes are not statistically significant.

# You would then discuss the business implications:
# - If risk differs by province/postal code/gender, it might indicate areas for
#   premium adjustment, targeted risk mitigation strategies, or further investigation.
# - If margin differs by postal code, it might point to areas of higher or lower profitability.

# Remember to consider the limitations of the data and the tests performed.
# For a real-world scenario, you would also consider effect sizes and practical significance
# in addition to statistical significance.



H₀: No significant risk difference between Women and Men (Claim Frequency)
Chi-squared statistic: 16.6474
P-value: 0.0002
Reject H₀: There is a statistically significant difference in claim frequency between Women and Men.

No claims reported for either Women or Men to compare claim severity.


In [8]:
# Aggregate at policy level

policy_agg = df.groupby('PolicyID').agg(
    Province=('Province', 'first'),  # assume policy only belongs to one province
    PostalCode=('PostalCode', 'first'),
    Gender=('Gender', 'first'),
    TotalPremium=('TotalPremium', 'first'),
    TotalClaims=('TotalClaims', 'sum')
).reset_index()

policy_agg['HasClaim'] = (policy_agg['TotalClaims'] > 0).astype(int)
policy_agg['Margin'] = policy_agg['TotalPremium'] - policy_agg['TotalClaims']

# Now, for Claim Severity, calculate average claim amount **per claim** given that claim occurred,
# But here we only have aggregated claim amount per policy,
# So we need to return to the original df filtered for claims.

claims = df[df['TotalClaims'] > 0]  # individual claims data

# Proceed with tests on policy_agg for frequency and margin,
# Use 'claims' dataframe for claim severity tests grouped by Province, ZipCode, Gender.


In [9]:
# --- Statistical Testing ---

# H₀:There are no risk differences across provinces
# We can compare Claim Frequency and Claim Severity across provinces.
# For Claim Frequency (categorical outcome), we can use Chi-Squared test.
# For Claim Severity (numerical outcome), we can use ANOVA (if comparing more than 2 provinces) or t-test (if comparing 2 provinces).

# Example for Claim Frequency across provinces using Chi-Squared
if 'Province' in policy_agg.columns and 'HasClaim' in policy_agg.columns:
  contingency_table_province_claim = pd.crosstab(policy_agg['Province'], policy_agg['HasClaim'])
  chi2_province_claim, p_province_claim, dof_province_claim, expected_province_claim = stats.chi2_contingency(contingency_table_province_claim)
  print(f"\nH₀: No risk differences across provinces (Claim Frequency)")
  print(f"Chi-squared statistic: {chi2_province_claim:.4f}")
  print(f"P-value: {p_province_claim:.4f}")

  # Interpretation for Claim Frequency across provinces
  if p_province_claim < 0.05:
    print("Reject H₀: There are statistically significant differences in claim frequency across provinces.")
  else:
    print("Fail to reject H₀: There is no statistically significant difference in claim frequency across provinces.")

  # Example for Claim Severity across provinces (assuming at least 2 provinces and numerical 'TotalClaims')
  # This requires filtering for policies with claims and then comparing the 'TotalClaims'.
  if 'TotalClaims' in policy_agg.columns:
    claim_severity_by_province = {}
    for province in policy_agg['Province'].unique():
      claims_in_province = policy_agg[(policy_agg['Province'] == province) & (policy_agg['HasClaim'] == 1)]['TotalClaims']
      if len(claims_in_province) > 0:
        claim_severity_by_province[province] = claims_in_province

    if len(claim_severity_by_province) > 1:
      # Perform ANOVA if more than 2 provinces
      if len(claim_severity_by_province) > 2:
        fvalue_province_severity, p_province_severity = stats.f_oneway(*claim_severity_by_province.values())
        print(f"\nH₀: No risk differences across provinces (Claim Severity - ANOVA)")
        print(f"F-statistic: {fvalue_province_severity:.4f}")
        print(f"P-value: {p_province_severity:.4f}")
        if p_province_severity < 0.05:
          print("Reject H₀: There are statistically significant differences in claim severity across provinces.")
        else:
          print("Fail to reject H₀: There is no statistically significant difference in claim severity across provinces.")
      # Perform t-test if exactly 2 provinces
      elif len(claim_severity_by_province) == 2:
        province1 = list(claim_severity_by_province.keys())[0]
        province2 = list(claim_severity_by_province.keys())[1]
        tstat_province_severity, p_province_severity = stats.ttest_ind(claim_severity_by_province[province1], claim_severity_by_province[province2])
        print(f"\nH₀: No risk differences between {province1} and {province2} (Claim Severity - t-test)")
        print(f"T-statistic: {tstat_province_severity:.4f}")
        print(f"P-value: {p_province_severity:.4f}")
        if p_province_severity < 0.05:
          print(f"Reject H₀: There is a statistically significant difference in claim severity between {province1} and {province2}.")
        else:
          print(f"Fail to reject H₀: There is no statistically significant difference in claim severity between {province1} and {province2}.")



H₀: No risk differences across provinces (Claim Frequency)
Chi-squared statistic: 35.7951
P-value: 0.0000
Reject H₀: There are statistically significant differences in claim frequency across provinces.

H₀: No risk differences across provinces (Claim Severity - ANOVA)
F-statistic: 5.3504
P-value: 0.0000
Reject H₀: There are statistically significant differences in claim severity across provinces.


In [11]:
# H₀:There are no risk differences between postal codes
# This can be done similarly to provinces, potentially comparing specific postal codes or groups of postal codes.
# Due to the potential large number of postal codes, comparing all pairs might not be feasible.
# We'll demonstrate for a subset or a common comparison.

# Example for Claim Frequency between two specific postal codes (replace 'PostalCode1' and 'PostalCode2' with actual postal codes)
if 'PostalCode' in policy_agg.columns and 'HasClaim' in policy_agg.columns:
  # Select two postal codes for comparison
  postal_codes_to_compare = policy_agg['PostalCode'].unique()
  if len(postal_codes_to_compare) >= 2:
    postal_code_a = postal_codes_to_compare[1]
    postal_code_b = postal_codes_to_compare[2]

    policy_agg = policy_agg[policy_agg['PostalCode'] == postal_code_a]
    policy_agg = policy_agg[policy_agg['PostalCode'] == postal_code_b]

    contingency_table_postal_claim = pd.crosstab(policy_agg['PostalCode'], policy_agg['HasClaim']).loc[[postal_code_a, postal_code_b]]
    print(contingency_table_postal_claim)
    chi2_postal_claim, p_postal_claim, dof_postal_claim, expected_postal_claim = stats.chi2_contingency(contingency_table_postal_claim)

    print(f"\nH₀: No risk differences between Postal Code {postal_code_a} and {postal_code_b} (Claim Frequency)")
    print(f"Chi-squared statistic: {chi2_postal_claim:.4f}")
    print(f"P-value: {p_postal_claim:.4f}")

    # Interpretation
    if p_postal_claim < 0.05:
      print(f"Reject H₀: There are statistically significant differences in claim frequency between Postal Code {postal_code_a} and {postal_code_b}.")
    else:
      print(f"Fail to reject H₀: There is no statistically significant difference in claim frequency between Postal Code {postal_code_a} and {postal_code_b}.")

In [12]:
# H₀:There are no significant margin (profit) difference between postal codes
# We can compare the mean margin between postal codes using t-tests (for 2 postal codes) or ANOVA (for more than 2).

# Example for Margin between two specific postal codes
if 'PostalCode' in policy_agg.columns and 'TotalPremium' in policy_agg.columns and 'TotalClaims' in policy_agg.columns:
  policy_agg['Margin'] = policy_agg['TotalPremium'] - policy_agg['TotalClaims']

  postal_codes_to_compare = policy_agg['PostalCode'].unique()
  if len(postal_codes_to_compare) >= 2:
    postal_code_a = postal_codes_to_compare[10]
    postal_code_b = postal_codes_to_compare[13]

    margin_postal_a = policy_agg[policy_agg['PostalCode'] == postal_code_a]['Margin']
    margin_postal_b = policy_agg[policy_agg['PostalCode'] == postal_code_b]['Margin']

    # Perform t-test for independent samples
    tstat_postal_margin, p_postal_margin = stats.ttest_ind(margin_postal_a, margin_postal_b)

    print(f"\nH₀: No significant margin difference between Postal Code {postal_code_a} and {postal_code_b}")
    print(f"T-statistic: {tstat_postal_margin:.4f}")
    print(f"P-value: {p_postal_margin:.4f}")

    # Interpretation
    if p_postal_margin < 0.05:
      print(f"Reject H₀: There is a statistically significant difference in margin between Postal Code {postal_code_a} and {postal_code_b}.")
    else:
      print(f"Fail to reject H₀: There is no statistically significant difference in margin between Postal Code {postal_code_a} and {postal_code_b}.")


In [None]:
# H₀:There are not significant risk difference between Women and Men
# We'll compare Claim Frequency and Claim Severity between genders.

# Example for Claim Frequency between genders using Chi-Squared
if 'Gender' in policy_agg.columns and 'HasClaim' in policy_agg.columns:
  contingency_table_gender_claim = pd.crosstab(policy_agg['Gender'], policy_agg['HasClaim'])
  chi2_gender_claim, p_gender_claim, dof_gender_claim, expected_gender_claim = stats.chi2_contingency(contingency_table_gender_claim)
  print(f"\nH₀: No significant risk difference between Women and Men (Claim Frequency)")
  print(f"Chi-squared statistic: {chi2_gender_claim:.4f}")
  print(f"P-value: {p_gender_claim:.4f}")

  # Interpretation
  if p_gender_claim < 0.05:
    print("Reject H₀: There is a statistically significant difference in claim frequency between Women and Men.")
  else:
    print("Fail to reject H₀: There is no statistically significant difference in claim frequency between Women and Men.")

# Example for Claim Severity between genders using t-test
if 'Gender' in policy_agg.columns and 'TotalClaims' in policy_agg.columns and 'HasClaim' in policy_agg.columns:
  claims_women = policy_agg[(policy_agg['Gender'] == 'Women') & (policy_agg['HasClaim'] == 1)]['TotalClaims']
  claims_men = policy_agg[(policy_agg['Gender'] == 'Men') & (policy_agg['HasClaim'] == 1)]['TotalClaims']

  if len(claims_women) > 0 and len(claims_men) > 0:
    tstat_gender_severity, p_gender_severity = stats.ttest_ind(claims_women, claims_men)
    print(f"\nH₀: No significant risk difference between Women and Men (Claim Severity - t-test)")
    print(f"T-statistic: {tstat_gender_severity:.4f}")
    print(f"P-value: {p_gender_severity:.4f}")

    # Interpretation
    if p_gender_severity < 0.05:
      print("Reject H₀: There is a statistically significant difference in claim severity between Women and Men.")
    else:
      print("Fail to reject H₀: There is no statistically significant difference in claim severity between Women and Men.")
  elif len(claims_women) == 0 and len(claims_men) == 0:
    print("\nNo claims reported for either Women or Men to compare claim severity.")
  elif len(claims_women) == 0:
    print("\nNo claims reported for Women to compare claim severity.")
  else: # len(claims_men) == 0
    print("\nNo claims reported for Men to compare claim severity.")

# --- Analyze and Report ---

# This section is for interpreting the results from the statistical tests.
# Based on the p-values calculated above, you would summarize whether each
# null hypothesis was rejected or failed to be rejected.
# For example:
# - For H₀: No risk differences across provinces (Claim Frequency): If p < 0.05, state that there's evidence of risk differences by province.
# - For H₀: No significant margin difference between postal codes: If p >= 0.05, state that the margin differences between the compared postal codes are not statistically significant.

# You would then discuss the business implications:
# - If risk differs by province/postal code/gender, it might indicate areas for
#   premium adjustment, targeted risk mitigation strategies, or further investigation.
# - If margin differs by postal code, it might point to areas of higher or lower profitability.

# Remember to consider the limitations of the data and the tests performed.
# For a real-world scenario, you would also consider effect sizes and practical significance
# in addition to statistical significance.



H₀: No significant risk difference between Women and Men (Claim Frequency)
Chi-squared statistic: 16.6474
P-value: 0.0002
Reject H₀: There is a statistically significant difference in claim frequency between Women and Men.

No claims reported for either Women or Men to compare claim severity.
