In [1]:
import pandas as pd    
import numpy as np 
from scipy.stats import ttest_ind, mannwhitneyu, chi2_contingency
from statsmodels.stats.proportion import proportions_ztest 
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [2]:
df = pd.read_csv("../data/df_cleaned.csv")

In [3]:
df['Margin'] = df['TotalPremium'] - df['TotalClaims']

# Proportion test; claim frequancy vs gender

In [4]:
# Create binary column for claim
df['has_claim'] = df['TotalClaims'] > 0

# Get claim counts and total counts by gender
claim_counts = df.groupby('Gender')['has_claim'].sum()
total_counts = df.groupby('Gender')['has_claim'].count()

# Prepare inputs for z-test
count = [claim_counts['Male'], claim_counts['Female']]
nobs = [total_counts['Male'], total_counts['Female']]

# Perform z-test
z_stat, p_value = proportions_ztest(count, nobs)

# Calculate sample proportions
proportions = [c / n for c, n in zip(count, nobs)]

# Print detailed output
print("=== Claim Frequency by Gender ===")
print(f"Male: {claim_counts['Male']} out of {total_counts['Male']} "
      f"→ Proportion = {proportions[0]:.5f}")
print(f"Female: {claim_counts['Female']} out of {total_counts['Female']} "
      f"→ Proportion = {proportions[1]:.5f}")
print("\n=== Hypothesis Test ===")
print(f"Z = {z_stat:.3f}, p = {p_value:.5f}")

=== Claim Frequency by Gender ===
Male: 94 out of 42576 → Proportion = 0.00221
Female: 14 out of 6755 → Proportion = 0.00207

=== Hypothesis Test ===
Z = 0.221, p = 0.82510


# Independent t_test: claim severity vs gender 

In [5]:
# Only consider rows with at least one claim
claimed = df[df['TotalClaims'] > 0]

# Split into gender groups
female_sev = claimed[claimed['Gender'] == 'Female']['TotalClaims']
male_sev = claimed[claimed['Gender'] == 'Male']['TotalClaims']

# Compute sample means
female_mean = female_sev.mean()
male_mean = male_sev.mean()

# Perform independent t-test
t_stat, p_value = ttest_ind(female_sev, male_sev, equal_var=False)

# Print output
print("=== Claim Severity (TotalClaims) by Gender ===")
print(f"Female mean: {female_mean:.2f}")
print(f"Male mean:   {male_mean:.2f}")
print("\n=== Hypothesis Test ===")
print(f"T = {t_stat:.3f}, p = {p_value:.3f}")

=== Claim Severity (TotalClaims) by Gender ===
Female mean: 17874.72
Male mean:   14858.55

=== Hypothesis Test ===
T = 0.579, p = 0.568


# Independent t_test: margin vs Gender

In [6]:
female_sev = claimed[claimed['Gender'] == 'Female']['Margin']
male_sev = claimed[claimed['Gender'] == 'Male']['Margin']

# Compute sample means
female_mean = female_sev.mean()
male_mean = male_sev.mean()

# Perform independent t-test
t_stat, p_value = ttest_ind(female_sev, male_sev, equal_var=False)

# Print output
print("=== Margin( total premium-total claim) by Gender ===")
print(f"Female mean: {female_mean:.2f}")
print(f"Male mean:   {male_mean:.2f}")
print("\n=== Hypothesis Test ===")
print(f"T = {t_stat:.3f}, p = {p_value:.3f}")

=== Margin( total premium-total claim) by Gender ===
Female mean: -17539.83
Male mean:   -14623.82

=== Hypothesis Test ===
T = -0.562, p = 0.579


# Chi-square test of association claim frequancy and province

In [7]:
# Create binary column for claim
df['has_claim'] = df['TotalClaims'] > 0 

# Get number of claims and total observations per province
claim_counts = df.groupby('Province')['has_claim'].sum()
total_counts = df.groupby('Province')['has_claim'].count()

# Combine into a contingency table
contingency_table = pd.concat([claim_counts, total_counts - claim_counts], axis=1)
contingency_table.columns = ['Claimed', 'Not_Claimed']

# Perform chi-square test
chi2, p, dof, expected = chi2_contingency(contingency_table)

# Convert expected into a readable DataFrame
expected_df = pd.DataFrame(expected, 
                           index=contingency_table.index, 
                           columns=contingency_table.columns)

# Output

print("\n=== Observed Counts ===")
print(contingency_table)

print("\n=== Expected Counts ===")
print(expected_df.round(2))

print("=== Chi-Square Test for Risk by Province ===")
print(f"Chi2 = {chi2:.3f}, p-value = {p:.5f}")
print(f"Degrees of Freedom = {dof}")



=== Observed Counts ===
               Claimed  Not_Claimed
Province                           
Eastern Cape        50        30282
Free State          11         8088
Gauteng           1322       392520
KwaZulu-Natal      483       169052
Limpopo             67        24769
Mpumalanga         128        52588
North West         349       142938
Northern Cape        8         6372
Western Cape       370       170408

=== Expected Counts ===
               Claimed  Not_Claimed
Province                           
Eastern Cape     84.58     30247.42
Free State       22.58      8076.42
Gauteng        1098.25    392743.75
KwaZulu-Natal   472.76    169062.24
Limpopo          69.26     24766.74
Mpumalanga      147.00     52569.00
North West      399.56    142887.44
Northern Cape    17.79      6362.21
Western Cape    476.22    170301.78
=== Chi-Square Test for Risk by Province ===
Chi2 = 104.190, p-value = 0.00000
Degrees of Freedom = 8


In [8]:
# Only consider customers who have made at least one claim
claimed = df[df['TotalClaims'] > 0]

# === ANOVA Table ===
model = smf.ols('TotalClaims ~ C(Province)', data=claimed).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

print("=== ANOVA Table ===")
print(anova_table.round(3))

# === Group Summary ===
summary = claimed.groupby('Province')['TotalClaims'].agg(['count', 'mean']).rename(
    columns={'count': 'Sample Size', 'mean': 'Mean Claim Severity'}
)
print("\n=== Group Summary ===")
print(summary.round(2))

# === Tukey's HSD for Pairwise Comparison ===
tukey = pairwise_tukeyhsd(endog=claimed['TotalClaims'],
                          groups=claimed['Province'],
                          alpha=0.05)

print("\n=== Pairwise Comparison (Tukey HSD) ===")
print(tukey.summary())

=== ANOVA Table ===
                   sum_sq      df     F  PR(>F)
C(Province)  5.730111e+10     8.0  4.83     0.0
Residual     4.120971e+12  2779.0   NaN     NaN

=== Group Summary ===
               Sample Size  Mean Claim Severity
Province                                       
Eastern Cape            50             27128.53
Free State              11             32265.66
Gauteng               1322             22243.88
KwaZulu-Natal          483             29609.49
Limpopo                 67             15171.29
Mpumalanga             128             15979.55
North West             349             16963.47
Northern Cape            8             11186.31
Western Cape           370             28095.85

=== Pairwise Comparison (Tukey HSD) ===
            Multiple Comparison of Means - Tukey HSD, FWER=0.05             
    group1        group2      meandiff  p-adj     lower      upper    reject
----------------------------------------------------------------------------
 Eastern Cape

# ANOVA:Vehicle type vs claim severity

In [9]:
# Only consider customers who have made at least one claim
claimed = df[df['TotalClaims'] > 0]

# === ANOVA Table ===
model = smf.ols('TotalClaims ~ C(VehicleType)', data=claimed).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

print("=== ANOVA Table ===")
print(anova_table.round(3))

# === Group Summary ===
summary = claimed.groupby('VehicleType')['TotalClaims'].agg(['count', 'mean']).rename(
    columns={'count': 'Sample Size', 'mean': 'Mean Claim Severity'}
)
print("\n=== Group Summary ===")
print(summary.round(2))

# === Tukey's HSD for Pairwise Comparison ===
tukey = pairwise_tukeyhsd(endog=claimed['TotalClaims'],
                          groups=claimed['VehicleType'],
                          alpha=0.05)

print("\n=== Pairwise Comparison (Tukey HSD) ===")
print(tukey.summary())

=== ANOVA Table ===
                      sum_sq      df      F  PR(>F)
C(VehicleType)  1.193816e+10     5.0  1.594   0.158
Residual        4.166334e+12  2782.0    NaN     NaN

=== Group Summary ===
                   Sample Size  Mean Claim Severity
VehicleType                                        
Bus                          1              7996.54
Heavy Commercial            21             35736.88
Light Commercial             8              7556.56
Medium Commercial          158             26075.11
Passenger Vehicle         2587             22957.37
Unknown_Category            13             42822.00

=== Pairwise Comparison (Tukey HSD) ===
                 Multiple Comparison of Means - Tukey HSD, FWER=0.05                  
      group1            group2        meandiff  p-adj     lower        upper    reject
--------------------------------------------------------------------------------------
              Bus  Heavy Commercial  27740.3498 0.9819  -85214.1459 140694.8455  Fa