In [1]:
import pandas as pd

# Column names for the pipe-delimited export, listed in file order.
columns = [
    "UnderwrittenCoverID", "PolicyID", "TransactionMonth", "IsVATRegistered", "Citizenship", "LegalType",
    "Title", "Language", "Bank", "AccountType", "MaritalStatus", "Gender", "Country", "Province", "PostalCode",
    "MainCrestaZone", "SubCrestaZone", "ItemType", "mmcode", "VehicleType", "RegistrationYear", "make", "Model",
    "Cylinders", "cubiccapacity", "kilowatts", "bodytype", "NumberOfDoors", "VehicleIntroDate",
    "CustomValueEstimate", "AlarmImmobiliser", "TrackingDevice", "CapitalOutstanding", "NewVehicle", "WrittenOff",
    "Rebuilt", "Converted", "CrossBorder", "NumberOfVehiclesInFleet", "SumInsured", "TermFrequency",
    "CalculatedPremiumPerTerm", "ExcessSelected", "CoverCategory", "CoverType", "CoverGroup", "Section",
    "Product", "StatutoryClass", "StatutoryRiskType", "TotalPremium", "TotalClaims"
]

# Load the dataset from the parent directory (adjust the path as necessary).
# A forward slash is used as the separator so the same path resolves on
# Windows, macOS and Linux; a backslash is only a path separator on Windows.
#
# header=None combined with names=columns means every line of the file is
# treated as a data record.
# NOTE(review): this assumes the file carries no header row -- confirm;
# otherwise the header line would be loaded as the first record.
#
# low_memory=False parses the file in one pass instead of in chunks, which
# avoids mixed-dtype inference within a column.
df = pd.read_csv('../MachineLearningRating.txt', sep='|', header=None, names=columns, low_memory=False)

# Print the first record as a quick sanity check of the parse.
print(df.head(1))


   UnderwrittenCoverID  PolicyID     TransactionMonth  IsVATRegistered  \
0               145249     12827  2015-03-01 00:00:00             True   

  Citizenship          LegalType Title Language                 Bank  \
0              Close Corporation    Mr  English  First National Bank   

       AccountType  ...         ExcessSelected CoverCategory   CoverType  \
0  Current account  ...  Mobility - Windscreen    Windscreen  Windscreen   

             CoverGroup              Section                          Product  \
0  Comprehensive - Taxi  Motor Comprehensive  Mobility Metered Taxis: Monthly   

  StatutoryClass StatutoryRiskType  TotalPremium TotalClaims  
0     Commercial     IFRS Constant     21.929825         0.0  

[1 rows x 52 columns]


In [2]:
from scipy import stats

# One-way ANOVA: does the mean of TotalClaims (the risk metric) differ
# between provinces?  The null hypothesis is that all provinces share the
# same mean.

# Distinct province labels; kept defined because later cells may reference it.
provinces = df['Province'].unique()

# Build one TotalClaims sample per province in a single groupby pass instead
# of re-scanning the whole frame with a boolean mask for every province.
# sort=False keeps groups in first-appearance order (the same order
# .unique() yields).  groupby skips rows whose Province is missing, whereas
# comparing against a NaN label from .unique() matches nothing and would hand
# f_oneway an empty sample.
claims_by_province = [
    claims for _, claims in df.groupby('Province', sort=False)['TotalClaims']
]

# NOTE(review): NaN values inside TotalClaims would presumably propagate to a
# NaN p-value, which then falls into the "fail to reject" branch below --
# confirm the column is complete.
anova_result = stats.f_oneway(*claims_by_province)

print(f"ANOVA F-statistic: {anova_result.statistic}")
print(f"ANOVA p-value: {anova_result.pvalue}")

# Interpretation at the 5% significance level.
alpha = 0.05
if anova_result.pvalue < alpha:
    print("Reject the Null Hypothesis: There are significant risk differences across provinces.")
else:
    print("Fail to reject the Null Hypothesis: There are no significant risk differences across provinces.")


ANOVA F-statistic: 5.8494137624076075
ANOVA p-value: 1.6782057588675903e-07
Reject the Null Hypothesis: There are significant risk differences across provinces.


In [3]:
# One-way ANOVA: does the mean of TotalClaims differ between postal codes?

# Distinct postal codes; kept defined because later cells may reference it.
zip_codes = df['PostalCode'].unique()

# Build one TotalClaims sample per postal code in a single groupby pass.
# Filtering the full frame with a boolean mask once per postal code costs
# (rows x distinct codes) comparisons; groupby partitions the rows once.
# sort=False keeps groups in first-appearance order (the same order
# .unique() yields), and groupby skips rows whose PostalCode is missing
# instead of producing an empty sample for a NaN label.
claims_by_zip = [
    claims for _, claims in df.groupby('PostalCode', sort=False)['TotalClaims']
]

anova_zip_result = stats.f_oneway(*claims_by_zip)

print(f"ANOVA F-statistic: {anova_zip_result.statistic}")
print(f"ANOVA p-value: {anova_zip_result.pvalue}")

# Interpretation; alpha is the significance level set in the province test cell.
if anova_zip_result.pvalue < alpha:
    print("Reject the Null Hypothesis: There are significant risk differences between zip codes.")
else:
    print("Fail to reject the Null Hypothesis: There are no significant risk differences between zip codes.")


ANOVA F-statistic: 0.9419762214391849
ANOVA p-value: 0.8906511279164051
Fail to reject the Null Hypothesis: There are no significant risk differences between zip codes.


In [4]:
# One-way ANOVA: does the mean profit margin differ between postal codes?

# Profit margin per record, defined here as TotalPremium - TotalClaims.
# The column is added to df so later cells can reuse it; re-running this cell
# simply recomputes the same values.
df['ProfitMargin'] = df['TotalPremium'] - df['TotalClaims']

# Build one ProfitMargin sample per postal code in a single groupby pass
# rather than filtering the whole frame once per postal code.  Grouping
# directly on the column also means this cell no longer relies on a
# postal-code list computed in an earlier cell.
# sort=False keeps groups in first-appearance order; groupby skips rows whose
# PostalCode is missing.
margin_by_zip = [
    margin for _, margin in df.groupby('PostalCode', sort=False)['ProfitMargin']
]

anova_margin_result = stats.f_oneway(*margin_by_zip)

print(f"ANOVA F-statistic: {anova_margin_result.statistic}")
print(f"ANOVA p-value: {anova_margin_result.pvalue}")

# Interpretation; alpha is the significance level set in the province test cell.
if anova_margin_result.pvalue < alpha:
    print("Reject the Null Hypothesis: There are significant margin differences between zip codes.")
else:
    print("Fail to reject the Null Hypothesis: There are no significant margin differences between zip codes.")


ANOVA F-statistic: 0.8707474893589263
ANOVA p-value: 0.9976859758015036
Fail to reject the Null Hypothesis: There are no significant margin differences between zip codes.
