## Import Libraries

In [2]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

### Loading the Dataset

In [3]:
df = pd.read_csv("../data/raw/MachineLearningRating_v3.txt", sep="|")
df.head()

  df = pd.read_csv("../data/raw/MachineLearningRating_v3.txt", sep="|")


Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,...,ExcessSelected,CoverCategory,CoverType,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims
0,145249,12827,2015-03-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
1,145249,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
2,145249,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0
3,145255,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,512.84807,0.0
4,145255,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0


## Create Business Metrics

In [4]:
df["HasClaim"] = df["TotalClaims"] > 0
df["ClaimFrequency"] = df.groupby("PolicyID")["HasClaim"].transform("max")
df["Margin"] = df["TotalPremium"] - df["TotalClaims"]


### Hypothesis 1
H₀: There are NO risk differences across provinces

In [None]:
province_groups = [
    df[df["Province"] == p]["TotalClaims"]
    for p in df["Province"].dropna().unique()
]

f_stat, p_value = stats.f_oneway(*province_groups)

f_stat, p_value

(np.float64(5.849413762408303), np.float64(1.6782057588675903e-07))

### Hypothesis 2
H₀: There are NO risk differences between Zip Codes

In [6]:
zip_groups = [
    df[df["PostalCode"] == z]["TotalClaims"]
    for z in df["PostalCode"].dropna().unique()[:2]
]

t_stat, p_value = stats.ttest_ind(zip_groups[0], zip_groups[1], equal_var=False)

t_stat, p_value


(np.float64(nan), np.float64(nan))

### Hypothesis 3
H₀: There is NO margin difference between Zip Codes

In [7]:
zip_margin = [
    df[df["PostalCode"] == z]["Margin"]
    for z in df["PostalCode"].dropna().unique()[:2]
]

t_stat, p_value = stats.ttest_ind(zip_margin[0], zip_margin[1], equal_var=False)

t_stat, p_value


(np.float64(-0.4370784074657527), np.float64(0.6630316429729602))

### Hypothesis 4
H₀: There is NO risk difference between Women and Men

In [8]:
male_claims = df[df["Gender"] == "Male"]["TotalClaims"]
female_claims = df[df["Gender"] == "Female"]["TotalClaims"]

t_stat, p_value = stats.ttest_ind(male_claims, female_claims, equal_var=False)

t_stat, p_value


(np.float64(-0.296353891400699), np.float64(0.7669656471629474))