In [None]:
# Imports and helper functions
import sys
import os
import pandas as pd
sys.path.append(os.path.abspath('../src'))
import hypothesis_tests as ht

In [None]:
# Load cleaned data from task 2 DVC tracked file.
# low_memory=False makes pandas infer every column's dtype from the whole file
# in a single pass instead of chunk by chunk, so a column cannot end up with
# mixed Python types depending on which chunk a row landed in.
# NOTE(review): the recorded output echoes this read_csv line the way pandas
# warnings do (looks like a DtypeWarning, but the message text is not shown) —
# confirm, and prefer an explicit dtype= mapping once the offending columns
# are known.
df = pd.read_csv('../data/cleaned_data.txt', low_memory=False)

In [None]:
# Step 1: Compute KPIs
# Rebinds df to whatever ht.compute_kpis returns. The cells below read the
# columns 'HasClaim', 'ClaimSeverity' and 'Margin' from df — presumably they
# are added here; confirm in src/hypothesis_tests.py.
# NOTE(review): the input name is reused for the result, so re-running this
# cell feeds the already-processed frame back into compute_kpis.
df = ht.compute_kpis(df)

In [None]:
# Hypothesis 1: Risk difference across provinces (Claim Frequency and Claim Severity)
# Only two provinces are compared: 'Gauteng' (group A) and 'Western Cape' (group B).
# Presumably segment_data returns the rows of df whose 'Province' equals each
# value, in that order — TODO confirm against src/hypothesis_tests.py.
group_a_prov, group_b_prov = ht.segment_data(df, 'Province', 'Gauteng', 'Western Cape')

In [None]:
# Claim Frequency - Chi-Square Test
# Runs the project's chi-squared helper on the 'HasClaim' column of the two
# province groups. p_val_freq is read again by the summary function later in
# the notebook; chi2 is stored but not read by any cell visible here.
chi2, p_val_freq = ht.chi_squared_test(group_a_prov, group_b_prov, 'HasClaim')
# interpret_result turns the p-value into the verdict text that gets printed.
print("Claim Frequency Province Test:", ht.interpret_result(p_val_freq))

In [None]:
# Claim Severity - t-test
# Runs the project's t-test helper on the 'ClaimSeverity' column of the two
# province groups. p_val_sev is read again by the summary function later in
# the notebook; t_stat_sev is stored but not read by any cell visible here.
# NOTE(review): how rows without a claim are handled (NaN / zero severity) is
# decided inside ht.t_test and ht.compute_kpis — verify there.
t_stat_sev, p_val_sev = ht.t_test(group_a_prov, group_b_prov, 'ClaimSeverity')
print("Claim Severity Province Test:", ht.interpret_result(p_val_sev))

In [None]:
# Hypothesis 2: Risk difference between zip codes
# value_counts() orders values by descending frequency, so zipcodes holds the
# (at most) two most common 'PostalCode' values in df. Only these two codes
# are compared in the zip-code tests below; zipcodes is also read by the
# summary function to label its output.
zipcodes = df['PostalCode'].value_counts().index[:2]
# zipcodes[0] becomes group A and zipcodes[1] group B; indexing [1] fails if
# df contains fewer than two distinct postal codes.
group_a_zip, group_b_zip = ht.segment_data(df, 'PostalCode', zipcodes[0], zipcodes[1])

In [None]:
# Hypothesis 2a: Claim Frequency between the two selected zip codes.
# Same chi-squared helper as the province test, applied to 'HasClaim'.
# p_val_freq_zip is read again by the summary function later in the notebook.
chi2_zip, p_val_freq_zip = ht.chi_squared_test(group_a_zip, group_b_zip, 'HasClaim')
print("Claim Frequency ZipCode Test:", ht.interpret_result(p_val_freq_zip))

In [None]:
# Hypothesis 2b: Claim Severity between the two selected zip codes.
# Same t-test helper as the province test, applied to 'ClaimSeverity'.
# p_val_sev_zip is read again by the summary function later in the notebook.
t_stat_sev_zip, p_val_sev_zip = ht.t_test(group_a_zip, group_b_zip, 'ClaimSeverity')
print("Claim Severity ZipCode Test:", ht.interpret_result(p_val_sev_zip))

In [None]:
# Hypothesis 3: Margin difference between zip codes
# Reuses the two zip-code groups built for Hypothesis 2 and runs the t-test
# helper on the 'Margin' column. p_val_margin_zip is read again by the summary
# function later in the notebook.
t_stat_margin_zip, p_val_margin_zip = ht.t_test(group_a_zip, group_b_zip, 'Margin')
print("Margin ZipCode Test:", ht.interpret_result(p_val_margin_zip))

In [4]:
# Hypothesis 4: Risk difference between Women and Men (Claim Frequency)
# Group A is 'Female', group B is 'Male'; rows with any other 'Gender' value
# are presumably left out of both groups — TODO confirm in segment_data.
group_a_gender, group_b_gender = ht.segment_data(df, 'Gender', 'Female', 'Male')

# Chi-squared helper on 'HasClaim'; p_val_freq_gender feeds the summary function.
chi2_gender, p_val_freq_gender = ht.chi_squared_test(group_a_gender, group_b_gender, 'HasClaim')
print("Claim Frequency Gender Test:", ht.interpret_result(p_val_freq_gender))

# Optionally test Claim Severity difference by Gender
# t-test helper on 'ClaimSeverity'; p_val_sev_gender feeds the summary function.
t_stat_sev_gender, p_val_sev_gender = ht.t_test(group_a_gender, group_b_gender, 'ClaimSeverity')
print("Claim Severity Gender Test:", ht.interpret_result(p_val_sev_gender))

  df = pd.read_csv('../data/cleaned_data.txt')


Claim Frequency Province Test: Reject Null Hypothesis (p = 0.0000) — statistically significant difference found.
Claim Severity Province Test: Reject Null Hypothesis (p = 0.0255) — statistically significant difference found.
Claim Frequency ZipCode Test: Reject Null Hypothesis (p = 0.0016) — statistically significant difference found.
Claim Severity ZipCode Test: Fail to Reject Null Hypothesis (p = 0.5253) — no statistically significant difference.
Margin ZipCode Test: Fail to Reject Null Hypothesis (p = 0.5206) — no statistically significant difference.
Claim Frequency Gender Test: Fail to Reject Null Hypothesis (p = 0.7061) — no statistically significant difference.
Claim Severity Gender Test: Fail to Reject Null Hypothesis (p = 0.4311) — no statistically significant difference.


In [5]:
# Summarize business implications as markdown or final print statements in notebook
def summarize_business_implications(alpha=0.05):
    """Print a plain-text summary of every hypothesis test run in this notebook.

    Reads the notebook-level results produced by the earlier cells
    (``p_val_freq``, ``p_val_sev``, ``p_val_freq_zip``, ``p_val_sev_zip``,
    ``p_val_margin_zip``, ``p_val_freq_gender``, ``p_val_sev_gender`` and
    ``zipcodes``), so those cells must have been executed first. A missing
    name raises ``NameError`` before anything is printed.

    Args:
        alpha: Significance level. A p-value strictly below ``alpha`` is
            reported as a significant difference. Defaults to 0.05, the
            threshold this summary has always used.

    Returns:
        None. The summary is written to stdout.
    """
    zip_scope = f"between zip codes {zipcodes[0]} and {zipcodes[1]}"
    gender_scope = "between Women and Men"

    # One row per test, in print order:
    #   (p-value, metric, scope, suggestion if significant, suggestion if not)
    # A suggestion of None means no suggestion line is printed for that outcome.
    findings = [
        (p_val_freq, "Claim Frequency", "across provinces",
         "Consider adjusting premiums regionally based on observed risk differences.",
         "Uniform risk treatment by province is appropriate for Claim Frequency."),
        (p_val_sev, "Claim Severity", "across provinces",
         "Region-specific claim severity should be considered in pricing.",
         None),
        (p_val_freq_zip, "Claim Frequency", zip_scope,
         "Zip code based segmentation may improve risk assessment.",
         None),
        (p_val_sev_zip, "Claim Severity", zip_scope,
         "Consider zip code based differentiation in claim severity.",
         None),
        (p_val_margin_zip, "Margin", zip_scope,
         "Profitability differs by zip code; consider adjusting margins accordingly.",
         None),
        (p_val_freq_gender, "Claim Frequency", gender_scope,
         "Gender-based segmentation could enhance risk differentiation.",
         None),
        (p_val_sev_gender, "Claim Severity", gender_scope,
         "Consider gender differences in claim severity for pricing.",
         None),
    ]

    print("\n### Business Implications Summary\n")
    for p_value, metric, scope, if_significant, if_not in findings:
        if pd.isna(p_value):
            # NaN compares False against alpha, so without this guard a test
            # that produced no usable p-value would be reported as
            # "No significant difference".
            print(f"- {metric} {scope} could not be assessed (p-value is undefined).")
            continue

        if p_value < alpha:
            print(f"- {metric} differs significantly {scope} (p = {p_value:.4f}).")
            suggestion = if_significant
        else:
            print(f"- No significant difference in {metric} {scope} (p = {p_value:.4f}).")
            suggestion = if_not

        if suggestion is not None:
            print(f"  Suggestion: {suggestion}")

In [6]:
# Print the business-implications summary.
# The function reads the p-values and zipcodes bound by the test cells above,
# so every one of those cells must have run in this kernel session first.
summarize_business_implications()


### Business Implications Summary

- Claim Frequency differs significantly across provinces (p = 0.0000).
  Suggestion: Consider adjusting premiums regionally based on observed risk differences.
- Claim Severity differs significantly across provinces (p = 0.0255).
  Suggestion: Region-specific claim severity should be considered in pricing.
- Claim Frequency differs significantly between zip codes 2000 and 122 (p = 0.0016).
  Suggestion: Zip code based segmentation may improve risk assessment.
- No significant difference in Claim Severity between zip codes 2000 and 122 (p = 0.5253).
- No significant difference in Margin between zip codes 2000 and 122 (p = 0.5206).
- No significant difference in Claim Frequency between Women and Men (p = 0.7061).
- No significant difference in Claim Severity between Women and Men (p = 0.4311).
