In [2]:
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm

In [3]:
# Load the dataset
df = pd.read_csv('../data/MachineLearningRating_v3.txt')

  df = pd.read_csv('../data/MachineLearningRating_v3.txt')


In [27]:
# Define Helper Functions for Statistical Testing
def perform_ttest(group_a, group_b, metric):
    """
    Perform a t-test between two groups on a specified metric.
    """
    group_a_values = group_a[metric].dropna()
    group_b_values = group_b[metric].dropna()
    t_stat, p_value = stats.ttest_ind(group_a_values, group_b_values, equal_var=False)
    return t_stat, p_value

def perform_chi2_test(group_a, group_b, categorical_column):
    """
    Perform a chi-squared test between two groups on a categorical column.
    """
    contingency_table = pd.crosstab(group_a[categorical_column], group_b[categorical_column])
    chi2_stat, p_value, _, _ = stats.chi2_contingency(contingency_table)
    return chi2_stat, p_value

def analyze_results(p_value, alpha=0.05):
    if p_value < alpha:
        return 'Reject the null hypothesis (significant difference)'
    else:
        return 'Fail to reject the null hypothesis (no significant difference)'


#### Risk Differences Across Provinces

In [5]:
# Data Segmentation
province_a = df[df['Province'] == 'Gauteng']
province_b = df[df['Province'] == 'KwaZulu-Natal']

# Perform T-Test for TotalClaims
t_stat, p_value = perform_ttest(province_a, province_b, 'TotalClaims')
print(f'Province T-Test: t-statistic = {t_stat}, p-value = {p_value}')


Province T-Test: t-statistic = -1.246201702257548, p-value = 0.21269135333580677


In [28]:
print('Province Test Result:', analyze_results(0.21269135333580677))

Province Test Result: Fail to reject the null hypothesis (no significant difference)


#### Risk Differences Between Zip Codes

In [24]:
# Choose two zip codes for comparison
#zip_code_a = df[df['PostalCode'] == '2000']
#zip_code_b = df[df['PostalCode'] == '122']

# Combine zip codes for a larger sample size
zip_code_a = df[df['PostalCode'].isin([4093, 1852])]  # Group 1: Combine two zip codes
zip_code_b = df[df['PostalCode'].isin([1619, 2000])]  # Group 2: Combine two other zip codes

# Perform T-Test for TotalClaims
t_stat, p_value = perform_ttest(zip_code_a, zip_code_b, 'TotalClaims')
print(f'Zip Code T-Test: t-statistic = {t_stat}, p-value = {p_value}')

Zip Code T-Test: t-statistic = 0.5241840959722943, p-value = 0.6001818944372983


In [29]:
print('Zip Code Test Result:', analyze_results(0.6001818944372983))

Zip Code Test Result: Fail to reject the null hypothesis (no significant difference)


#### Margin (Profit) Differences Between Zip Codes

In [25]:
# Perform T-Test for TotalPremium (Margin)
t_stat, p_value = perform_ttest(zip_code_a, zip_code_b, 'TotalPremium')
print(f'Margin T-Test: t-statistic = {t_stat}, p-value = {p_value}')

Margin T-Test: t-statistic = -3.3086968162781236, p-value = 0.0009462764058919544


In [30]:
print('Margin Test Result:', analyze_results(0.0009462764058919544))

Margin Test Result: Reject the null hypothesis (significant difference)


#### Risk Differences Between Women and Men

In [12]:
# Data Segmentation
women = df[df['Gender'] == 'Female']
men = df[df['Gender'] == 'Male']

# Perform T-Test for TotalClaims
t_stat, p_value = perform_ttest(women, men, 'TotalClaims')
print(f'Gender T-Test: t-statistic = {t_stat}, p-value = {p_value}')


Gender T-Test: t-statistic = 0.296353891400699, p-value = 0.7669656471629474


In [31]:
print('Gender Test Result:', analyze_results(0.7669656471629474))

Gender Test Result: Fail to reject the null hypothesis (no significant difference)
