In [2]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency, ttest_ind

In [8]:
# Relative location of the '|'-separated dataset read in the next cell
file_path = "../data/MachineLearningRating_v3.txt"

In [9]:
# Load the '|'-separated text file into a DataFrame.
# Reuse `file_path` (defined in the previous cell) so the dataset location
# lives in exactly one place instead of being repeated as a literal here.
# low_memory=False parses the file in a single pass rather than in chunks,
# which avoids mixed-dtype inference on columns.
data = pd.read_csv(file_path, sep='|', low_memory=False)

In [None]:
# Function to perform chi-squared test
def chi_squared_test(group_a, group_b):
    """Run a chi-squared test of independence between two aligned label series.

    Parameters
    ----------
    group_a, group_b
        Values passed straight to ``pd.crosstab`` as the row and column
        factors of the contingency table.

    Returns
    -------
    tuple
        ``(chi2, p)`` — the test statistic and its p-value, i.e. the first
        two values returned by ``scipy.stats.chi2_contingency``.
    """
    observed_counts = pd.crosstab(group_a, group_b)
    # Only the statistic and the p-value are needed; the degrees of freedom
    # and expected frequencies that follow them are discarded.
    statistic, p_value = chi2_contingency(observed_counts)[:2]
    return statistic, p_value

In [None]:
# Function to perform t-test
def t_test(group_a, group_b):
    """Compare the means of two independent samples with Welch's t-test.

    ``equal_var=False`` is passed to ``scipy.stats.ttest_ind``, so the two
    samples are not assumed to share a variance.

    Parameters
    ----------
    group_a, group_b
        The two samples of observations to compare.

    Returns
    -------
    tuple
        ``(t_stat, p)`` — the t statistic and its p-value.
    """
    outcome = ttest_ind(group_a, group_b, equal_var=False)
    return outcome.statistic, outcome.pvalue

In [None]:
# Hypothesis 1: Risk differences across provinces
print("Hypothesis 1: Risk differences across provinces")

# Mean of the 'risk' column for each distinct 'province' value.
province_risks = (
    data
    .groupby(by='province')['risk']
    .mean()
)

# Chi-squared test of independence on the province x risk contingency table.
chi2, p = chi_squared_test(group_a=data['province'], group_b=data['risk'])
print(f"Chi-squared test statistic: {chi2}, p-value: {p}")

In [None]:
# Hypothesis 2: Risk differences between zipcodes
print("\nHypothesis 2: Risk differences between zipcodes")

# Mean of the 'risk' column for each distinct 'zipcode' value.
zipcode_risks = (
    data
    .groupby(by='zipcode')['risk']
    .mean()
)

# Chi-squared test of independence on the zipcode x risk contingency table.
chi2, p = chi_squared_test(group_a=data['zipcode'], group_b=data['risk'])
print(f"Chi-squared test statistic: {chi2}, p-value: {p}")

In [None]:
# Hypothesis 3: Margin differences between zipcodes
print("\nHypothesis 3: Margin differences between zipcodes")

# Mean of the 'margin' column for each distinct 'zipcode' value.
zipcode_margins = (
    data
    .groupby(by='zipcode')['margin']
    .mean()
)

# NOTE(review): 'zip_a' and 'zip_b' look like placeholder labels — confirm they
# match real values in the 'zipcode' column; rows are selected by exact equality,
# so labels that match nothing leave both samples empty.
group_a = data.loc[data['zipcode'] == 'zip_a', 'margin']
group_b = data.loc[data['zipcode'] == 'zip_b', 'margin']

# Welch's t-test on the margins of the two selected zipcodes.
t_stat, p = t_test(group_a=group_a, group_b=group_b)
print(f"T-test statistic: {t_stat}, p-value: {p}")

In [None]:
# Hypothesis 4: Risk differences between Women and Men
print("\nHypothesis 4: Risk differences between Women and Men")

# 'risk' values for rows whose 'gender' is exactly 'Female' / 'Male'.
group_a = data.loc[data['gender'] == 'Female', 'risk']
group_b = data.loc[data['gender'] == 'Male', 'risk']

# Welch's t-test on the two gender samples.
t_stat, p = t_test(group_a=group_a, group_b=group_b)
print(f"T-test statistic: {t_stat}, p-value: {p}")

In [None]:
# Analyze and Report
def analyze_results(p_value, hypothesis, alpha=0.05):
    """Print the decision for one hypothesis test based on its p-value.

    Parameters
    ----------
    p_value
        Scalar p-value produced by a statistical test. May be NaN/None when
        the test could not be computed (for example, ``scipy.stats.ttest_ind``
        yields NaN for an empty sample).
    hypothesis
        Human-readable label inserted into the printed message.
    alpha
        Significance level; the null hypothesis is rejected when
        ``p_value < alpha``. Defaults to 0.05.

    Returns
    -------
    None
        The verdict is printed, not returned.
    """
    # A missing p-value compares False against any threshold, which would be
    # reported as "no significant effect" even though no test result exists.
    # Surface that case explicitly instead.
    if pd.isna(p_value):
        print(f"Cannot evaluate {hypothesis}: the p-value is missing (NaN), so the test produced no usable result.")
        return

    if p_value < alpha:
        print(f"Reject the null hypothesis for {hypothesis}. There is a significant effect.")
    else:
        print(f"Fail to reject the null hypothesis for {hypothesis}. No significant effect detected.")

# Every hypothesis cell above assigns its result to the same name `p`, so by the
# time this cell runs `p` only holds the Hypothesis 4 p-value. Passing `p` to all
# four reports would judge hypotheses 1-3 on the wrong number, so each test is
# re-evaluated here and kept under its own name.
# (`[1]` picks the p-value out of the `(statistic, p)` tuple the helpers return.)
p_province_risk = chi_squared_test(data['province'], data['risk'])[1]
p_zipcode_risk = chi_squared_test(data['zipcode'], data['risk'])[1]
# NOTE(review): 'zip_a' / 'zip_b' mirror the labels used in the Hypothesis 3
# cell and look like placeholders — confirm they match real 'zipcode' values.
p_zipcode_margin = t_test(
    data.loc[data['zipcode'] == 'zip_a', 'margin'],
    data.loc[data['zipcode'] == 'zip_b', 'margin'],
)[1]
p_gender_risk = t_test(
    data.loc[data['gender'] == 'Female', 'risk'],
    data.loc[data['gender'] == 'Male', 'risk'],
)[1]

analyze_results(p_province_risk, "Hypothesis 1: Risk differences across provinces")
analyze_results(p_zipcode_risk, "Hypothesis 2: Risk differences between zipcodes")
analyze_results(p_zipcode_margin, "Hypothesis 3: Margin differences between zipcodes")
analyze_results(p_gender_risk, "Hypothesis 4: Risk differences between Women and Men")