Loading and Inspecting the Dataset

In [None]:
import pandas as pd

file_path = './data/MachineLearningRating_v3.txt'

try:
    # The file is tab-separated. Rows that fail to parse are skipped, text is
    # decoded as latin1, and low_memory=False is set to reduce mixed-dtype
    # problems when pandas infers column types on a large file.
    df = pd.read_csv(
        file_path,
        sep='\t',
        on_bad_lines='skip',
        encoding='latin1',
        low_memory=False,
    )
except FileNotFoundError:
    # Report the missing file instead of raising; `df` is not bound in this case.
    print(f"Error: The file '{file_path}' was not found. Please ensure it's in the correct directory.")
else:
    row_count, column_count = df.shape
    print("--- Data Loaded Successfully ---")
    print(f"Dataset contains {row_count} rows and {column_count} columns.")

    # Preview the leading rows to get a feel for the data.
    print("\n--- First 5 Rows of the Dataset ---")
    print(df.head())

    # Summarise column names and data types.
    print("\n--- Dataset Info ---")
    df.info()


Data Cleaning and Feature Engineering

In [None]:
print("\n--- Starting Step 2: Data Cleaning and Feature Engineering ---")


def _strip_category(values: pd.Series) -> pd.Series:
    """Return `values` as whitespace-stripped strings, with missing cells as NaN.

    Cells that are NaN in the input, or that are empty once stripped, come back
    as NaN. Checking for blanks *after* stripping means whitespace-only cells are
    treated as missing instead of surviving as an empty-string category.
    """
    # NOTE(review): if a column such as PostalCode was parsed as float, str()
    # renders values like '1459.0' -- confirm the dtype produced by the loader.
    text = values.astype(str).str.strip()
    return text.where(values.notna() & text.ne(''))


def clean_and_engineer(raw: pd.DataFrame) -> pd.DataFrame:
    """Build the cleaned analysis frame from the loaded dataset.

    Steps:
      1. Coerce 'TotalClaims' and 'TotalPremium' to numbers (unparseable -> NaN).
      2. Strip whitespace from 'Gender', 'Province' and 'PostalCode' (whichever
         of them exist), treating blank cells as missing.
      3. Drop rows missing any of the columns above.
      4. Add 'HasClaim' (1 if TotalClaims > 0, else 0) and
         'Margin' (TotalPremium - TotalClaims).

    Parameters:
        raw: frame containing at least 'TotalClaims' and 'TotalPremium'.

    Returns:
        A new DataFrame; `raw` is not modified. The original index labels of the
        surviving rows are kept.

    Raises:
        KeyError: if 'TotalClaims' or 'TotalPremium' is absent from `raw`.
    """
    numeric_cols = ['TotalClaims', 'TotalPremium']
    category_cols = [
        name for name in ('Gender', 'Province', 'PostalCode') if name in raw.columns
    ]

    cleaned = (
        raw
        .assign(
            TotalClaims=pd.to_numeric(raw['TotalClaims'], errors='coerce'),
            TotalPremium=pd.to_numeric(raw['TotalPremium'], errors='coerce'),
            **{name: _strip_category(raw[name]) for name in category_cols},
        )
        .dropna(subset=numeric_cols + category_cols)
    )

    # Features are derived after the row filter so they only describe kept rows.
    # The trailing assign() also returns a fresh copy, so later cells can add
    # columns to the result without chained-assignment warnings.
    engineered = cleaned.assign(
        HasClaim=(cleaned['TotalClaims'] > 0).astype(int),
        Margin=cleaned['TotalPremium'] - cleaned['TotalClaims'],
    )

    # Invariant: the key numeric columns contain no missing values.
    assert not engineered[numeric_cols].isna().any().any()
    return engineered


# Rebind `df` in a single step: if anything above raises, the frame from the
# loading cell is left untouched rather than half-cleaned.
df = clean_and_engineer(df)

print("--- Data Cleaning and Feature Engineering Complete ---")
print(f"Dataset shape after cleaning: {df.shape}")
print("New columns 'HasClaim' and 'Margin' created.")