In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the insurance extract from disk. TransactionMonth is parsed as a
# datetime column and PostalCode is read as a string column.
csv_path = "../data/insurance_data.csv"
read_options = {
    "parse_dates": ['TransactionMonth'],
    "dtype": {'PostalCode': 'str'},
}
df = pd.read_csv(csv_path, **read_options)

# First look: overall shape, then null counts with the worst columns first
missing_per_column = df.isnull().sum().sort_values(ascending=False)
print(f"Dataset dimensions: {df.shape}")
print("\nMissing values per column:")
print(missing_per_column)

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# Calculate Loss Ratio (Primary KPI): TotalClaims divided by TotalPremium.
# Zero premiums are swapped for NaN before dividing. Dividing by 0 would give
# +/-inf on those rows, and a single inf turns any later mean of LossRatio
# into inf, whereas pandas aggregations skip NaN by default.
premium_nonzero = df['TotalPremium'].replace(0, np.nan)
df['LossRatio'] = df['TotalClaims'] / premium_nonzero

# Add risk flags, both derived from the same "positive claim amount" mask
has_claim = df['TotalClaims'] > 0

# HasClaim: 1 when TotalClaims > 0, otherwise 0 (missing claims count as 0)
df['HasClaim'] = has_claim.astype(int)

# ClaimSeverity: the claim amount on rows with a claim, NaN everywhere else,
# so that averaging it ignores rows without a claim
df['ClaimSeverity'] = df['TotalClaims'].where(has_claim)

: 

In [None]:
# Mean of HasClaim per month of TransactionMonth, plus the whole-dataset mean
monthly_claim_rate = df.resample('M', on='TransactionMonth')['HasClaim'].mean()
overall_claim_rate = df['HasClaim'].mean()

fig, ax = plt.subplots(figsize=(12, 6))
monthly_claim_rate.plot(
    ax=ax,
    title='Monthly Claim Frequency',
    ylabel='Claim Rate',
)

# Dashed horizontal reference line at the overall claim rate
reference_style = {'color': 'red', 'linestyle': '--', 'label': 'Overall Mean'}
ax.axhline(overall_claim_rate, **reference_style)
ax.legend()

: 

In [None]:
# Province-level analysis: per-province mean of each risk metric, ordered
# from the highest to the lowest mean LossRatio
risk_metrics = ['LossRatio', 'HasClaim', 'TotalPremium']
province_means = df.groupby('Province')[risk_metrics].mean()
province_stats = province_means.sort_values('LossRatio', ascending=False)

# Heatmap of the transposed table: one row per metric, one column per province
fig, ax = plt.subplots(figsize=(10, 6))
heatmap_options = {'annot': True, 'fmt': ".2f", 'cmap': "YlOrRd"}
sns.heatmap(province_stats.T, ax=ax, **heatmap_options)
ax.set_title("Province-Level Risk Metrics")

In [None]:
# Mean HasClaim and mean ClaimSeverity per vehicle Make, keeping the ten
# makes with the largest mean HasClaim
make_risk = df.groupby('Make')[['HasClaim', 'ClaimSeverity']].mean()
top_risky = make_risk.nlargest(10, 'HasClaim')

# One point per make, coloured by the make name taken from the index
ax = sns.scatterplot(
    x='HasClaim',
    y='ClaimSeverity',
    hue=top_risky.index,
    s=100,
    data=top_risky,
)
ax.set_title("High-Risk Vehicle Identification")

In [None]:
# Imported at the top of the cell so the dependency is visible before any
# plotting starts
import scipy.stats as stats

premium = df['TotalPremium']

# Check premium distribution
fig_hist, ax_hist = plt.subplots(figsize=(10, 6))
sns.histplot(premium, kde=True, bins=50, ax=ax_hist)
# Clip the visible x-range at the 95th percentile so the right tail does not
# squash the rest of the plot. This only changes the view: no rows are
# dropped and the 50 bins are still computed over the full value range.
ax_hist.set_xlim(0, premium.quantile(0.95))
ax_hist.set_title("TotalPremium Distribution (view clipped at 95th percentile)")

# QQ-Plot for normality check.
# It gets its own figure: passing plot=plt makes probplot draw on the current
# axes, which would overlay the QQ points on the histogram and inherit its
# clipped x-limits. Missing values are dropped first because probplot has no
# NaN handling of its own.
fig_qq, ax_qq = plt.subplots(figsize=(10, 6))
stats.probplot(premium.dropna(), dist="norm", plot=ax_qq)
# Trailing semicolon keeps the notebook from echoing the return value
ax_qq.set_title("TotalPremium Normal Q-Q Plot");