## Import Libraries

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from statsmodels.stats.proportion import proportions_ztest

## Loading the Dataset

In [3]:
# Load the pipe-delimited raw extract into `df`.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up holding mixed types.
# NOTE(review): the saved output of this cell ends with an echo of this call,
# which looks like the tail of a pandas warning (presumably DtypeWarning) — confirm
# it is gone after re-running.
df = pd.read_csv(
    "../data/raw/MachineLearningRating_v3.txt",
    sep="|",
    low_memory=False,
)
df.head()

  df = pd.read_csv("../data/raw/MachineLearningRating_v3.txt", sep="|")


Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,...,ExcessSelected,CoverCategory,CoverType,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims
0,145249,12827,2015-03-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
1,145249,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
2,145249,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0
3,145255,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,512.84807,0.0
4,145255,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0


## Step 0: Data Preparation

In [10]:
# Columns that must be present on a row for it to be kept.
required_cols = ["PostalCode", "TotalClaims", "Province", "Gender", "TotalPremium"]

# Drop incomplete rows, then derive the two analysis columns on the new frame:
#   HasClaim - 1 when TotalClaims is positive, otherwise 0
#   Margin   - TotalPremium minus TotalClaims
df_clean = df.dropna(subset=required_cols).assign(
    HasClaim=lambda frame: frame["TotalClaims"].gt(0).astype(int),
    Margin=lambda frame: frame["TotalPremium"] - frame["TotalClaims"],
)

print("Step 0: Data cleaned")
print("Total rows after cleaning:", df_clean.shape[0])
print(df_clean.head())

Step 0: Data cleaned
Total rows after cleaning: 990562
   UnderwrittenCoverID  PolicyID     TransactionMonth  IsVATRegistered  \
0               145249     12827  2015-03-01 00:00:00             True   
1               145249     12827  2015-05-01 00:00:00             True   
2               145249     12827  2015-07-01 00:00:00             True   
3               145255     12827  2015-05-01 00:00:00             True   
4               145255     12827  2015-07-01 00:00:00             True   

  Citizenship          LegalType Title Language                 Bank  \
0              Close Corporation    Mr  English  First National Bank   
1              Close Corporation    Mr  English  First National Bank   
2              Close Corporation    Mr  English  First National Bank   
3              Close Corporation    Mr  English  First National Bank   
4              Close Corporation    Mr  English  First National Bank   

       AccountType  ...            CoverGroup              Section 

## Step 1: Hypothesis 1 - Province vs Claim Frequency

In [11]:
# Hypothesis 1: compare claim frequency between the two provinces that have
# the most rows in df_clean.
top_provinces = df_clean["Province"].value_counts().head(2).index.tolist()
prov_a, prov_b = top_provinces[0], top_provinces[1]

# HasClaim indicator (0/1) for every row of each selected province.
group_a = df_clean.loc[df_clean["Province"] == prov_a, "HasClaim"]
group_b = df_clean.loc[df_clean["Province"] == prov_b, "HasClaim"]

# Series.sum() returns a NumPy scalar; cast to plain int so the printed list
# shows bare numbers instead of np.int64(...) reprs.
count = [int(group_a.sum()), int(group_b.sum())]
nobs = [len(group_a), len(group_b)]

# z-test on the two claim proportions (claims / rows per province).
z_stat_prov, p_value_prov = proportions_ztest(count, nobs)

print("\nStep 1: Hypothesis 1 - Province vs Claim Frequency")
print("Provinces selected:", prov_a, "and", prov_b)
print("Group sizes:", nobs)
print("Claim counts:", count)
print("Z-statistic:", z_stat_prov)
print("P-value:", p_value_prov)
# Decision at the 5% significance level.
if p_value_prov < 0.05:
    print("Reject H0: Provinces differ in risk.")
else:
    print("Fail to reject H0: No significant difference.")


Step 1: Hypothesis 1 - Province vs Claim Frequency
Provinces selected: Gauteng and Western Cape
Group sizes: [386057, 170796]
Claim counts: [np.int64(1308), np.int64(370)]
Z-statistic: 7.67034092573736
P-value: 1.7153978879462923e-14
Reject H0: Provinces differ in risk.


## Step 2: Hypothesis 2 - Zip Codes vs Claim Frequency

In [12]:
# Hypothesis 2: compare claim frequency between the two postal codes that have
# the most rows in df_clean. zip_a / zip_b are reused by the Step 3 cell.
top_zips = df_clean["PostalCode"].value_counts().head(2).index.tolist()
zip_a, zip_b = top_zips[0], top_zips[1]

# HasClaim indicator (0/1) for every row of each selected postal code.
group_a = df_clean.loc[df_clean["PostalCode"] == zip_a, "HasClaim"]
group_b = df_clean.loc[df_clean["PostalCode"] == zip_b, "HasClaim"]

# Series.sum() returns a NumPy scalar; cast to plain int so the printed list
# shows bare numbers instead of np.int64(...) reprs.
count = [int(group_a.sum()), int(group_b.sum())]
nobs = [len(group_a), len(group_b)]

# z-test on the two claim proportions (claims / rows per postal code).
z_stat_zip, p_value_zip = proportions_ztest(count, nobs)

print("\nStep 2: Hypothesis 2 - Zip Codes vs Claim Frequency")
print("Zip codes selected:", zip_a, "and", zip_b)
print("Group sizes:", nobs)
print("Claim counts:", count)
print("Z-statistic:", z_stat_zip)
print("P-value:", p_value_zip)
# Decision at the 5% significance level.
if p_value_zip < 0.05:
    print("Reject H0: Zip codes differ in risk.")
else:
    print("Fail to reject H0: No significant difference.")


Step 2: Hypothesis 2 - Zip Codes vs Claim Frequency
Zip codes selected: 2000 and 122
Group sizes: [133498, 49171]
Claim counts: [np.int64(486), np.int64(210)]
Z-statistic: -1.9394014055477224
P-value: 0.052452479518764646
Fail to reject H0: No significant difference.


## Step 3: Hypothesis 3 - Zip Codes vs Margin

In [13]:
# Hypothesis 3: compare Margin between the same two postal codes selected in
# Step 2 (zip_a and zip_b must already be defined by that cell).
group_a = df_clean.loc[df_clean["PostalCode"] == zip_a, "Margin"]
group_b = df_clean.loc[df_clean["PostalCode"] == zip_b, "Margin"]

# equal_var=False: the two samples are not assumed to share a variance.
t_stat_margin, p_value_margin = stats.ttest_ind(
    group_a,
    group_b,
    equal_var=False,
)

print("\nStep 3: Hypothesis 3 - Zip Codes vs Margin")
print("Zip codes selected:", zip_a, "and", zip_b)
print("Group sizes:", len(group_a), len(group_b))
print("T-statistic:", t_stat_margin)
print("P-value:", p_value_margin)
# Decision at the 5% significance level.
print(
    "Reject H0: Margins differ between zip codes."
    if p_value_margin < 0.05
    else "Fail to reject H0: No significant difference."
)


Step 3: Hypothesis 3 - Zip Codes vs Margin
Zip codes selected: 2000 and 122
Group sizes: 133498 49171
T-statistic: 1.1639145988804174
P-value: 0.24446241842452016
Fail to reject H0: No significant difference.


## Step 4: Hypothesis 4 - Gender vs Claim Frequency

In [16]:
# Hypothesis 4: compare claim frequency between Female and Male rows.
#
# Keep only rows whose Gender is exactly "Male" or "Female". The subset goes
# into its own frame so df_clean is left untouched: rebinding df_clean here
# would make Steps 1-3 run on the gender-filtered rows if they were re-executed
# after this cell.
df_gender = df_clean[df_clean["Gender"].isin(["Male", "Female"])]

# HasClaim indicator (0/1) for every row of each gender.
female_group = df_gender.loc[df_gender["Gender"] == "Female", "HasClaim"]
male_group = df_gender.loc[df_gender["Gender"] == "Male", "HasClaim"]

# The test needs observations in both groups; bail out with a message otherwise.
if len(female_group) == 0 or len(male_group) == 0:
    print("One of the gender groups has no observations. Cannot perform z-test.")
else:
    # Series.sum() returns a NumPy scalar; cast to plain int so the printed
    # list shows bare numbers instead of np.int64(...) reprs.
    count = [int(female_group.sum()), int(male_group.sum())]
    nobs = [len(female_group), len(male_group)]

    # z-test on the two claim proportions (claims / rows per gender).
    z_stat_gender, p_value_gender = proportions_ztest(count, nobs)

    print("\nStep 4: Hypothesis 4 - Gender vs Claim Frequency")
    print("Group sizes (F/M):", nobs)
    print("Claim counts (F/M):", count)
    print("Z-statistic:", z_stat_gender)
    print("P-value:", p_value_gender)

    # Decision at the 5% significance level.
    if p_value_gender < 0.05:
        print("Reject H0: Claim frequency differs by gender.")
    else:
        print("Fail to reject H0: No significant difference.")



Step 4: Hypothesis 4 - Gender vs Claim Frequency
Group sizes (F/M): [6755, 42817]
Claim counts (F/M): [np.int64(14), np.int64(94)]
Z-statistic: -0.20126144446741484
P-value: 0.8404941485359676
Fail to reject H0: No significant difference.
