Loading and Inspecting the Dataset

In [None]:
import pandas as pd

# Location of the tab-separated dataset, relative to the notebook's working directory.
file_path = './data/MachineLearningRating_v3.txt'

try:
    # Reader options:
    #   sep='\t'             -> the file is tab-delimited
    #   on_bad_lines='skip'  -> rows that fail to parse are dropped without any notice,
    #                           so the row count reported below may be lower than the
    #                           number of lines in the file
    #   encoding='latin1'    -> decode the bytes as Latin-1 (helps with special characters)
    #   low_memory=False     -> can help with mixed data type issues in large files
    df = pd.read_csv(file_path, sep='\t', on_bad_lines='skip', encoding='latin1', low_memory=False)
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found. Please ensure it's in the correct directory.")
    # Re-raise after the friendly message: every later cell needs `df`, so carrying on
    # would either fail with a NameError far from the real cause or, on a kernel that
    # still holds a `df` from an earlier run, silently analyse stale data.
    raise
else:
    # Only reached when the file was read successfully.
    print("--- Data Loaded Successfully ---")
    print(f"Dataset contains {df.shape[0]} rows and {df.shape[1]} columns.")

    # Display the first few rows to get a feel for the data
    print("\n--- First 5 Rows of the Dataset ---")
    print(df.head())

    # Display column names and data types
    print("\n--- Dataset Info ---")
    df.info()


Data Cleaning and Feature Engineering

In [None]:
print("\n--- Starting Step 2: Data Cleaning and Feature Engineering ---")

# Step 1 - coerce the two monetary columns to numeric dtype; any value that cannot
# be parsed becomes NaN instead of raising.
for col in ('TotalClaims', 'TotalPremium'):
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Step 2 - rows lacking either monetary value cannot contribute to the analysis,
# so remove them (in place, on the shared `df`).
df.dropna(subset=['TotalClaims', 'TotalPremium'], inplace=True)

# Step 3 - 'HasClaim' is the binary risk indicator: 1 when TotalClaims is positive, else 0.
df['HasClaim'] = (df['TotalClaims'] > 0).astype(int)

# Step 4 - 'Margin' is premium minus claims, i.e. the profit (or loss) on the policy.
df['Margin'] = df['TotalPremium'] - df['TotalClaims']

# Step 5 - tidy the categorical columns used by the hypothesis tests.
# NOTE(review): rows missing ANY of these columns are removed from `df` itself, so a
# row with a missing Gender is also excluded from the Province and PostalCode tests.
# Confirm that this listwise deletion is intended.
for col in ['Gender', 'Province', 'PostalCode']:
    if col not in df.columns:
        continue
    # Drop rows with no value for this column, then normalise to stripped strings.
    df.dropna(subset=[col], inplace=True)
    df[col] = df[col].astype(str).str.strip()

print("--- Data Cleaning and Feature Engineering Complete ---")
print(f"Dataset shape after cleaning: {df.shape}")
print("New columns 'HasClaim' and 'Margin' created.")

Hypothesis Tests

In [None]:
from scipy import stats

# Significance level shared by all hypothesis tests in this notebook.
alpha = 0.05
print(f"\n--- Starting Test 1: Risk Differences Across Provinces (alpha={alpha}) ---")

if 'Province' in df.columns and df['Province'].nunique() > 1:
    # Create a contingency table (crosstab) of Province vs. HasClaim
    contingency_table_province = pd.crosstab(df['Province'], df['HasClaim'])

    # Perform the Chi-Squared test of independence. The expected frequencies are
    # kept (rather than discarded) so the validity of the approximation can be checked.
    chi2, p_value, _, expected = stats.chi2_contingency(contingency_table_province)

    print(f"Chi-Squared Statistic: {chi2:.2f}, P-value: {p_value}")

    # The chi-squared approximation is commonly considered reliable only when every
    # expected cell count is at least 5; claims are rare, so small provinces can
    # fall below that threshold.
    if (expected < 5).any():
        print("Warning: some expected cell counts are below 5; the chi-squared approximation may be unreliable.")

    # Interpret the result
    if p_value < alpha:
        print("\nConclusion: Reject the null hypothesis.")
        print("Finding: There is a statistically significant difference in claim risk across provinces.")
        print("Recommendation: Prioritize marketing in lower-risk provinces and review underwriting rules for higher-risk ones.")
    else:
        print("\nConclusion: Fail to reject the null hypothesis.")
        print("Finding: There is no statistically significant evidence that risk differs by province.")
else:
    # Make the skip visible instead of printing only the banner above.
    print("Skipping test: 'Province' column is missing or has fewer than two distinct values.")

In [None]:
print(f"\n--- Starting Test 2: Risk Differences Between Zip Codes (alpha={alpha}) ---")

if 'PostalCode' in df.columns and df['PostalCode'].nunique() > 1:
    # Identify the top 20 zip codes with the most policies; restricting the test to
    # these keeps the contingency table to at most 20 rows.
    top_zipcodes = df['PostalCode'].value_counts().nlargest(20).index
    df_top_zips = df[df['PostalCode'].isin(top_zipcodes)]

    # Create the contingency table for these top zip codes
    contingency_table_zip = pd.crosstab(df_top_zips['PostalCode'], df_top_zips['HasClaim'])

    # Perform the Chi-Squared test of independence. The expected frequencies are
    # kept (rather than discarded) so the validity of the approximation can be checked.
    chi2, p_value, _, expected = stats.chi2_contingency(contingency_table_zip)

    print(f"Chi-Squared Statistic (top 20 zips): {chi2:.2f}, P-value: {p_value}")

    # The chi-squared approximation is commonly considered reliable only when every
    # expected cell count is at least 5; at zip-code level, with rare claims, sparse
    # cells are a realistic possibility.
    if (expected < 5).any():
        print("Warning: some expected cell counts are below 5; the chi-squared approximation may be unreliable.")

    if p_value < alpha:
        print("\nConclusion: Reject the null hypothesis.")
        print("Finding: There is a significant difference in claim risk even at the zip code level.")
        print("Recommendation: Develop location-based risk profiles for more accurate premium pricing and targeted marketing.")
    else:
        print("\nConclusion: Fail to reject the null hypothesis.")
        print("Finding: No significant evidence of risk differences among the top 20 zip codes.")
else:
    # Make the skip visible instead of printing only the banner above.
    print("Skipping test: 'PostalCode' column is missing or has fewer than two distinct values.")

In [None]:
# --- Step 5 Code ---
print(f"\n--- Starting Test 3: Margin Differences Between Zip Codes (alpha={alpha}) ---")

# This cell reuses `df_top_zips` and `top_zipcodes`, which are only created when the
# zip-code risk test (Test 2) has actually run. Check for BOTH names explicitly;
# globals() is used because, unlike locals(), it means the same thing inside the
# generator expression as at cell level.
zip_inputs_ready = all(name in globals() for name in ('df_top_zips', 'top_zipcodes'))

if 'PostalCode' in df.columns and zip_inputs_ready:
    # Create a list of margin values for each of the top 20 zip codes
    groups = [df_top_zips.loc[df_top_zips['PostalCode'] == zip_code, 'Margin'] for zip_code in top_zipcodes]

    # Perform the one-way ANOVA test across the per-zip-code margin samples.
    # NOTE(review): ANOVA assumes roughly normal, equal-variance groups; confirm that
    # 'Margin' is close enough to that for the p-value to be trusted.
    f_stat, p_value = stats.f_oneway(*groups)

    print(f"F-Statistic (top 20 zips): {f_stat:.2f}, P-value: {p_value}")

    if p_value < alpha:
        print("\nConclusion: Reject the null hypothesis.")
        print("Finding: There is a significant difference in profitability across top zip codes.")
        print("Recommendation: Analyze zip codes with low margins to check if premiums are too low or claims are too high. High-margin areas are safe markets for expansion.")
    else:
        print("\nConclusion: Fail to reject the null hypothesis.")
        print("Finding: No significant evidence of profitability differences among top zip codes.")
else:
    # Make the skip visible instead of printing only the banner above.
    print("Skipping test: top zip code data is unavailable; run the zip code risk test (Test 2) first.")

In [None]:
# --- Step 6 Code ---
print(f"\n--- Starting Test 4: Risk Differences Between Genders (alpha={alpha}) ---")

if 'Gender' in df.columns:
    # Standardize gender to 'M' and 'F' for a clean comparison.
    # Only recognised spellings are mapped. Taking just the first letter would also
    # classify unrelated values that happen to start with M or F (for example 'Mrs'
    # or 'Missing') as a gender. Anything not listed here becomes NaN and is excluded.
    gender_labels = {'M': 'M', 'MALE': 'M', 'F': 'F', 'FEMALE': 'F'}
    df['Gender_Clean'] = df['Gender'].astype(str).str.strip().str.upper().map(gender_labels)
    df_gender_filtered = df[df['Gender_Clean'].isin(['M', 'F'])]

    # The comparison needs both categories to be present after filtering.
    if df_gender_filtered['Gender_Clean'].nunique() == 2:
        contingency_table_gender = pd.crosstab(df_gender_filtered['Gender_Clean'], df_gender_filtered['HasClaim'])
        print("Contingency Table (Gender vs. HasClaim):")
        print(contingency_table_gender)

        # Chi-Squared test of independence between gender and claim occurrence.
        chi2, p_value, _, _ = stats.chi2_contingency(contingency_table_gender)

        print(f"\nChi-Squared Statistic: {chi2:.2f}, P-value: {p_value}")

        if p_value < alpha:
            print("\nConclusion: Reject the null hypothesis.")
            print("Finding: There is a statistically significant difference in claim risk between men and women.")
            print("Recommendation: Gender can be considered a valid factor in our risk models. Further analysis is needed to quantify the effect on premiums.")
        else:
            print("\nConclusion: Fail to reject the null hypothesis.")
            print("Finding: No significant evidence of a risk difference between men and women in this dataset.")
    else:
        print("Skipping test: Not enough distinct gender categories ('M', 'F') found after cleaning.")
else:
    # Make the skip visible instead of printing only the banner above.
    print("Skipping test: 'Gender' column not found in the dataset.")