In [1]:
# load the autoreload extension for IPython
# This allows you to automatically reload modules before executing code
%load_ext autoreload
%autoreload 2

import sys
import os

# Add the parent directory to the system path
sys.path.append(os.path.abspath(os.path.join('..')))

# Hypothesis Testing for Insurance Risk Drivers
This notebook statistically validates or rejects key hypotheses about risk drivers, using the cleaned insurance dataset. We use modular functions from `scripts/hypothesis_testing.py` for all calculations and tests.

In [2]:
# Import required libraries and hypothesis testing functions
import pandas as pd
import sys
sys.path.append('../scripts')
from hypothesis_testing import compute_metrics, test_hypothesis

## 1. Load and Prepare Data
We use the cleaned insurance dataset: `data/insurance_cleaned.csv`.

In [3]:
# Load the cleaned insurance data
df = pd.read_csv('../data/insurance_cleaned.csv')
df = compute_metrics(df)
df.head()

  df = pd.read_csv('../data/insurance_cleaned.csv')


Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,...,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims,ClaimFrequency,NumClaims,ClaimSeverity,Margin
0,145249,12827,2015-03-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0,0,0,0.0,21.929825
1,145249,12827,2015-05-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0,0,0,0.0,21.929825
2,145249,12827,2015-07-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0,0,0,0.0,0.0
3,145255,12827,2015-05-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,512.84807,0.0,0,0,0.0,512.84807
4,145255,12827,2015-07-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0,0,0,0.0,0.0


## 2. Hypothesis 1: Risk Differences Across Provinces
**Null Hypothesis (H₀):** There are no risk differences across provinces.

We compare claim frequency and claim severity between two provinces with large sample sizes.

In [4]:
# Select two provinces with the most data for comparison
province_counts = df['Province'].value_counts()
prov_a, prov_b = province_counts.index[:2]

# Test claim frequency (proportion z-test)
result_freq = test_hypothesis(df, 'Province', prov_a, prov_b, 'ClaimFrequency', test_type='z-test')
print(f"Claim Frequency: {result_freq['conclusion']}")

# Test claim severity (t-test)
result_sev = test_hypothesis(df, 'Province', prov_a, prov_b, 'ClaimSeverity', test_type='t-test')
print(f"Claim Severity: {result_sev['conclusion']}")

Claim Frequency: Reject the null hypothesis (p-value=0.0000 < 0.05). Statistically significant difference detected by z-test.
Claim Severity: Fail to reject the null hypothesis (p-value=0.0627 >= 0.05). No statistically significant difference detected by t-test.
Claim Severity: Fail to reject the null hypothesis (p-value=0.0627 >= 0.05). No statistically significant difference detected by t-test.


## 3. Hypothesis 2: Risk Differences Between Zip Codes
**Null Hypothesis (H₀):** There are no risk differences between zip codes.

We compare claim frequency and margin between two zip codes with the most data.

In [5]:
# Select two zip codes with the most data
zip_counts = df['PostalCode'].value_counts()
zip_a, zip_b = zip_counts.index[:2]

# Test claim frequency (proportion z-test)
result_zip_freq = test_hypothesis(df, 'PostalCode', zip_a, zip_b, 'ClaimFrequency', test_type='z-test')
print(f"Claim Frequency: {result_zip_freq['conclusion']}")

# Test margin (t-test)
result_zip_margin = test_hypothesis(df, 'PostalCode', zip_a, zip_b, 'Margin', test_type='t-test')
print(f"Margin: {result_zip_margin['conclusion']}")

Claim Frequency: Fail to reject the null hypothesis (p-value=0.0525 >= 0.05). No statistically significant difference detected by z-test.
Margin: Fail to reject the null hypothesis (p-value=0.2445 >= 0.05). No statistically significant difference detected by t-test.


## 4. Hypothesis 3: Margin Differences Between Zip Codes
**Null Hypothesis (H₀):** There are no significant margin (profit) differences between zip codes.

We use the same two zip codes as above and test the margin.

In [6]:
# Already tested margin above; print again for clarity
print(f"Margin Difference Between Zip Codes: {result_zip_margin['conclusion']}")

Margin Difference Between Zip Codes: Fail to reject the null hypothesis (p-value=0.2445 >= 0.05). No statistically significant difference detected by t-test.


## 5. Hypothesis 4: Risk Differences Between Women and Men
**Null Hypothesis (H₀):** There are not significant risk differences between Women and Men.

We compare claim frequency and claim severity between genders.

In [7]:
# Ensure gender column is present and select two most common values
if 'Gender' in df.columns:
    gender_counts = df['Gender'].value_counts()
    gender_a, gender_b = gender_counts.index[:2]
    # Test claim frequency (proportion z-test)
    result_gender_freq = test_hypothesis(df, 'Gender', gender_a, gender_b, 'ClaimFrequency', test_type='z-test')
    print(f"Claim Frequency: {result_gender_freq['conclusion']}")
    # Test claim severity (t-test)
    result_gender_sev = test_hypothesis(df, 'Gender', gender_a, gender_b, 'ClaimSeverity', test_type='t-test')
    print(f"Claim Severity: {result_gender_sev['conclusion']}")
else:
    print("Gender column not found in data.")

Claim Frequency: Reject the null hypothesis (p-value=0.0166 < 0.05). Statistically significant difference detected by z-test.
Claim Severity: Reject the null hypothesis (p-value=0.0000 < 0.05). Statistically significant difference detected by t-test.
Claim Severity: Reject the null hypothesis (p-value=0.0000 < 0.05). Statistically significant difference detected by t-test.


## 6. Summary Table
Below is a summary of the statistical test results for all hypotheses.

In [8]:
import pandas as pd
summary = pd.DataFrame([
    {"Hypothesis": "Risk difference across provinces (Claim Frequency)", "Conclusion": result_freq['conclusion']},
    {"Hypothesis": "Risk difference across provinces (Claim Severity)", "Conclusion": result_sev['conclusion']},
    {"Hypothesis": "Risk difference between zip codes (Claim Frequency)", "Conclusion": result_zip_freq['conclusion']},
    {"Hypothesis": "Margin difference between zip codes", "Conclusion": result_zip_margin['conclusion']},
    {"Hypothesis": "Risk difference between Women and Men (Claim Frequency)", "Conclusion": result_gender_freq['conclusion'] if 'result_gender_freq' in locals() else 'N/A'},
    {"Hypothesis": "Risk difference between Women and Men (Claim Severity)", "Conclusion": result_gender_sev['conclusion'] if 'result_gender_sev' in locals() else 'N/A'},
])
display(summary)

Unnamed: 0,Hypothesis,Conclusion
0,Risk difference across provinces (Claim Freque...,Reject the null hypothesis (p-value=0.0000 < 0...
1,Risk difference across provinces (Claim Severity),Fail to reject the null hypothesis (p-value=0....
2,Risk difference between zip codes (Claim Frequ...,Fail to reject the null hypothesis (p-value=0....
3,Margin difference between zip codes,Fail to reject the null hypothesis (p-value=0....
4,Risk difference between Women and Men (Claim F...,Reject the null hypothesis (p-value=0.0166 < 0...
5,Risk difference between Women and Men (Claim S...,Reject the null hypothesis (p-value=0.0000 < 0...


---
**Interpretation:**
- If the null hypothesis is rejected, the feature has a statistically significant impact on the KPI.
- If not rejected, there is no significant impact detected.
- Use these findings to inform segmentation and business strategy.