In [2]:
# Cell 1: Imports & load
import sys, os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, Markdown

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from src.hypothesis_tests import (
    compute_kpis, run_core_hypotheses, summarize_test_result,
    test_frequency_by_group, test_numeric_by_group, pairwise_comparisons,
    logistic_regression, ols_regression
)

sns.set(style="whitegrid")
# Load cleaned data
df = pd.read_csv("../data/clean/clean_portfolio_v2.csv", parse_dates=["TransactionMonth"], low_memory=False)
df = compute_kpis(df)
print("Rows:", len(df))
df.head(2)


Rows: 981812


Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,...,CustomValueEstimate_outlier_flag,SumInsured_outlier_flag,LossRatio,Trans_Year,Trans_Month,Trans_YearMonth,VehicleAge,has_claim,claim_severity,margin
0,145249,12827,2015-03-01,True,Unknown,Close Corporation,Mr,English,First National Bank,Current account,...,,,0.0,2015,3,2015-03,21,False,,21.929825
1,145249,12827,2015-05-01,True,Unknown,Close Corporation,Mr,English,First National Bank,Current account,...,,,0.0,2015,5,2015-05,21,False,,21.929825


In [3]:
# Cell 2: KPI quick summary
total_policies = len(df)
n_with_claim = df["has_claim"].sum()
overall_freq = n_with_claim / total_policies
overall_loss_ratio = df["TotalClaims"].sum() / df["TotalPremium"].sum()

display(Markdown(f"**Total policies:** {total_policies:,}  \n**Policies with claims:** {n_with_claim:,}  \n**Overall claim frequency:** {overall_freq:.4f}  \n**Overall loss ratio:** {overall_loss_ratio:.4f}"))


**Total policies:** 981,812  
**Policies with claims:** 2,755  
**Overall claim frequency:** 0.0028  
**Overall loss ratio:** 1.0456

In [4]:
# Cell 3: Run core hypothesis tests
results = run_core_hypotheses(df, alpha=0.05)
# Summarize in markdown
md_lines = ["# Hypothesis Test Results\n"]
for k, v in results.items():
    md_lines.append(f"## {k}")
    md_lines.append("```\n" + str(v) + "\n```")
    # quick one-line
    md_lines.append(summarize_test_result(v))
display(Markdown("\n\n".join(md_lines)))


# Hypothesis Test Results


## province_freq

```
{'table': has_claim       False  True 
Province                    
Eastern Cape    29202     47
Free State       8088     11
Gauteng        385120   1309
KwaZulu-Natal  166654    475
Limpopo         24300     67
Mpumalanga      51830    127
North West     141233    347
Northern Cape    6308      8
Western Cape   166322    364, 'chi2': 105.11134478837724, 'p_value': 3.837835235091429e-19, 'dof': 8, 'expected': array([[2.91669262e+04, 8.20737524e+01],
       [8.07627391e+03, 2.27260871e+01],
       [3.85344666e+05, 1.08433376e+03],
       [1.66660030e+05, 4.68970022e+02],
       [2.42986253e+04, 6.83746837e+01],
       [5.18112068e+04, 1.45793222e+02],
       [1.41182721e+05, 3.97278603e+02],
       [6.29827708e+03, 1.77229246e+01],
       [1.66218273e+05, 4.67726948e+02]]), 'cramers_v': 0.010346908970583591, 'reject_H0': np.True_}
```

p=3.838e-19 -> REJECT H0

## province_severity

```
{'test': 'kruskal', 'statistic': 101.62245808829827, 'p_value': 1.9887278741890022e-18, 'levene_p': 0.0008311593689027535, 'reject_H0': np.True_}
```

p=1.989e-18 -> REJECT H0

## province_margin

```
{'test': 'kruskal', 'statistic': 4521.833211532078, 'p_value': 0.0, 'levene_p': 2.0147567779413292e-15, 'reject_H0': np.True_}
```

p=0 -> REJECT H0

## zipcode_freq

```
{'table': has_claim   False  True 
PostalCode              
1            5271     12
2            1451      6
4              77      0
5             396      4
6             438      2
...           ...    ...
9781          640      3
9830           56      0
9868          100      0
9869         1414      1
9870          220      0

[888 rows x 2 columns], 'chi2': 1449.4839515218407, 'p_value': 8.384445322361946e-30, 'dof': 887, 'expected': array([[5.26817571e+03, 1.48242892e+01],
       [1.45291161e+03, 4.08839472e+00],
       [7.67839352e+01, 2.16064786e-01],
       ...,
       [9.97193964e+01, 2.80603619e-01],
       [1.41102946e+03, 3.97054120e+00],
       [2.19382672e+02, 6.17327961e-01]], shape=(888, 2)), 'cramers_v': 0.03842311208590997, 'reject_H0': np.True_}
```

p=8.384e-30 -> REJECT H0

## zipcode_severity

```
{'test': 'anova', 'statistic': 1.137805522088133, 'p_value': 0.04019459982244671, 'levene_p': 0.9949834627480213, 'reject_H0': np.True_}
```

p=0.04019 -> REJECT H0

## zipcode_margin

```
{'test': 'kruskal', 'statistic': 89074.74980138706, 'p_value': 0.0, 'levene_p': 0.0006116057154116108, 'reject_H0': np.True_}
```

p=0 -> REJECT H0

## gender_freq

```
{'table': has_claim   False  True 
Gender                  
Female       6560     13
Male        42343     94
Unknown    930154   2648, 'chi2': 7.265377391679415, 'p_value': 0.026444986212228135, 'dof': 2, 'expected': array([[6.55455592e+03, 1.84440759e+01],
       [4.23179202e+04, 1.19079758e+02],
       [9.30184524e+05, 2.61747617e+03]]), 'cramers_v': 0.0027202882218328316, 'reject_H0': np.True_}
```

p=0.02644 -> REJECT H0

## gender_severity

```
{'test': 'anova', 'statistic': 2.3630677658151935, 'p_value': 0.09432198442087561, 'levene_p': 0.07357665771834956, 'reject_H0': np.False_}
```

p=0.09432 -> FAIL TO REJECT H0

## gender_margin

```
{'test': 'kruskal', 'statistic': 5694.720093496273, 'p_value': 0.0, 'levene_p': 1.7987209419738608e-06, 'reject_H0': np.True_}
```

p=0 -> REJECT H0

In [5]:
# Cell 4: Province frequency details (chi-square + Cramer's V)
prov_freq = test_frequency_by_group(df, "Province")
display(Markdown("### Province vs Claim Frequency (Chi-square)"))
display(prov_freq["table"])
display(Markdown(f"- chi2 = {prov_freq['chi2']:.4g}, p = {prov_freq['p_value']:.4g}, dof = {prov_freq['dof']}, Cramer's V = {prov_freq['cramers_v']:.4f}"))


### Province vs Claim Frequency (Chi-square)

has_claim,False,True
Province,Unnamed: 1_level_1,Unnamed: 2_level_1
Eastern Cape,29202,47
Free State,8088,11
Gauteng,385120,1309
KwaZulu-Natal,166654,475
Limpopo,24300,67
Mpumalanga,51830,127
North West,141233,347
Northern Cape,6308,8
Western Cape,166322,364


- chi2 = 105.1, p = 3.838e-19, dof = 8, Cramer's V = 0.0103

In [6]:
# Cell 5: Province severity test
prov_sev = test_numeric_by_group(df, "claim_severity", "Province")
display(Markdown("### Province: Claim Severity Test"))
display(Markdown(summarize_test_result(prov_sev)))
display(Markdown(f"Details: {prov_sev}"))


### Province: Claim Severity Test

p=1.989e-18 -> REJECT H0

Details: {'test': 'kruskal', 'statistic': 101.62245808829827, 'p_value': 1.9887278741890022e-18, 'levene_p': 0.0008311593689027535, 'reject_H0': np.True_}

In [7]:
# Cell 6: Gender frequency & severity
g_freq = test_frequency_by_group(df, "Gender")
g_sev = test_numeric_by_group(df, "claim_severity", "Gender")
display(Markdown("### Gender: Claim Frequency"))
display(Markdown(summarize_test_result(g_freq)))
display(Markdown("### Gender: Claim Severity"))
display(Markdown(summarize_test_result(g_sev)))


### Gender: Claim Frequency

p=0.02644 -> REJECT H0

### Gender: Claim Severity

p=0.09432 -> FAIL TO REJECT H0

In [8]:
# Cell 7: PostalCode caveat & example top-N vs rest approach
display(Markdown("### PostalCode: high-cardinality caution"))
if "PostalCode" in df.columns:
    counts = df["PostalCode"].value_counts().nlargest(10)
    df_top = df[df["PostalCode"].isin(counts.index)]
    display(Markdown("Top 10 PostalCodes by volume (used for pairwise tests):"))
    display(counts)
    # Example: compare top postal code vs rest for frequency
    top_code = counts.index[0]
    df["is_top"] = (df["PostalCode"] == top_code)
    top_freq = test_frequency_by_group(df, "is_top")
    display(Markdown(f"Top postal code ({top_code}) vs rest -- frequency test: {summarize_test_result(top_freq)}"))
else:
    display(Markdown("PostalCode column not found in dataset."))


### PostalCode: high-cardinality caution

Top 10 PostalCodes by volume (used for pairwise tests):

PostalCode
2000    130576
122      48840
7784     27746
299      25080
7405     18382
458      13705
8000     11596
2196     10937
470      10044
1724      9994
Name: count, dtype: int64

Top postal code (2000) vs rest -- frequency test: p=1.205e-10 -> REJECT H0

In [9]:
# Cell 8: Regression example: logistic regression (has_claim)
display(Markdown("### Adjusted effect estimates (logistic regression)"))
# Simple model; adjust formula per available features
formula = "has_claim ~ C(Province) + C(VehicleType) + C(Gender)"
try:
    logit = logistic_regression(df, formula)
    display(Markdown(logit.summary().as_text()))
except Exception as e:
    display(Markdown(f"Logit failed: {e}"))


### Adjusted effect estimates (logistic regression)

                           Logit Regression Results                           
==============================================================================
Dep. Variable:              has_claim   No. Observations:               981812
Model:                          Logit   Df Residuals:                   981796
Method:                           MLE   Df Model:                           15
Date:                Tue, 09 Dec 2025   Pseudo R-squ.:                0.003979
Time:                        21:38:41   Log-Likelihood:                -18864.
converged:                       True   LL-Null:                       -18939.
Covariance Type:            nonrobust   LLR p-value:                 1.743e-24
=======================================================================================================
                                          coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
Intercept                              -7.2604      1.050     -6.913      0.000      -9.319      -5.202
C(Province)[T.Free State]              -0.1663      0.335     -0.496      0.620      -0.823       0.491
C(Province)[T.Gauteng]                  0.7494      0.149      5.041      0.000       0.458       1.041
C(Province)[T.KwaZulu-Natal]            0.5789      0.153      3.781      0.000       0.279       0.879
C(Province)[T.Limpopo]                  0.5443      0.191      2.849      0.004       0.170       0.919
C(Province)[T.Mpumalanga]               0.4105      0.171      2.399      0.016       0.075       0.746
C(Province)[T.North West]               0.4275      0.156      2.746      0.006       0.122       0.733
C(Province)[T.Northern Cape]           -0.2415      0.383     -0.631      0.528      -0.992       0.509
C(Province)[T.Western Cape]             0.3169      0.155      2.038      0.042       0.012       0.622
C(VehicleType)[T.Heavy Commercial]      0.6815      1.025      0.665      0.506      -1.328       2.691
C(VehicleType)[T.Light Commercial]      0.2946      1.062      0.277      0.781      -1.787       2.376
C(VehicleType)[T.Medium Commercial]     0.6138      1.005      0.611      0.541      -1.355       2.583
C(VehicleType)[T.Passenger Vehicle]     0.5482      1.002      0.547      0.584      -1.415       2.511
C(VehicleType)[T.Unknown]               2.7183      1.040      2.614      0.009       0.680       4.757
C(Gender)[T.Male]                       0.0629      0.297      0.212      0.832      -0.518       0.644
C(Gender)[T.Unknown]                    0.2787      0.279      1.000      0.317      -0.268       0.825
=======================================================================================================

In [10]:
# Cell 9: Pairwise comparisons example (claim severity across top 6 provinces)
display(Markdown("### Pairwise t-tests (claim severity) for top provinces"))
top_provs = df["Province"].value_counts().index[:6]
df_small = df[df["Province"].isin(top_provs)]
pairwise_df = pairwise_comparisons(df_small, "claim_severity", "Province", correction_method="fdr_bh")
display(pairwise_df.sort_values("p_adj").head(20))


### Pairwise t-tests (claim severity) for top provinces

Unnamed: 0,group1,group2,t_stat,p_value,cohen_d,p_adj,reject_H0
10,KwaZulu-Natal,North West,4.705509,3e-06,0.320459,4.5e-05,True
9,KwaZulu-Natal,Mpumalanga,4.080676,5.9e-05,0.334711,0.000441,True
14,North West,Western Cape,-3.587537,0.000359,-0.266829,0.001796,True
5,Gauteng,KwaZulu-Natal,-3.350416,0.000848,-0.193493,0.003122,True
13,Mpumalanga,Western Cape,-3.306039,0.001041,-0.272108,0.003122,True
7,Gauteng,North West,2.500055,0.012694,0.144524,0.031734,True
8,Gauteng,Western Cape,-2.216832,0.027105,-0.15562,0.058082,False
6,Gauteng,Mpumalanga,2.152311,0.032837,0.171468,0.061569,False
2,Eastern Cape,Mpumalanga,1.534492,0.130432,0.334924,0.217386,False
3,Eastern Cape,North West,1.459017,0.150687,0.31976,0.226031,False
