1. Import Libraries and Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide presentation: never truncate the column list of wide frames,
# and draw every figure with seaborn's white-grid style.
pd.options.display.max_columns = None
sns.set(style="whitegrid")

# Read the raw insurance dataset into `df`, the frame every later cell works on.
# The path is relative to the notebook's location; adjust it as needed.
DATA_PATH = '../data/raw/insurance_data.csv'
df = pd.read_csv(DATA_PATH)


2. Data Structure & Overview

In [None]:
# Print the (rows, columns) tuple, then the dtype of every column.
print(df.shape, df.dtypes, sep='\n')

# First rows of the frame, rendered as the cell's output.
df.head()


3. Descriptive Statistics

In [None]:
# Summary for numerical columns.
# A notebook cell only renders its *last* expression automatically, so the summary
# is shown explicitly with display() (supplied by the Jupyter/IPython kernel).
# As a bare statement in the middle of the cell its result would be discarded.
display(df.describe())

# Variability (standard deviation) of the key monetary columns.
# This is the cell's last expression, so it is rendered as the cell output.
df[['TotalPremium', 'TotalClaims', 'CustomValueEstimate']].std()


4. Check for Missing Values

In [None]:
# Null count per column; show only the columns that actually have gaps,
# ordered from most to fewest missing entries.
missing = df.isna().sum()
missing.loc[missing.gt(0)].sort_values(ascending=False)


5. Loss Ratio Analysis

In [None]:
# Per-row loss ratio (claims / premium), stored on df so it can be plotted directly.
# A row with zero premium divides to +/-inf; a single such row would turn any mean
# that includes it into inf, so infinities are mapped to NaN (0/0 is already NaN).
df['LossRatio'] = (df['TotalClaims'] / df['TotalPremium']).replace([np.inf, -np.inf], np.nan)


def loss_ratio_by(frame, column):
    """Return the loss ratio for each distinct value of `column`.

    The ratio is computed as sum(TotalClaims) / sum(TotalPremium) within each
    group, so every row is weighted by its premium instead of averaging
    per-row ratios. Groups whose premium sums to zero come back as NaN.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain `column`, 'TotalClaims' and 'TotalPremium'.
    column : str
        Name of the column to group by.

    Returns
    -------
    pandas.Series
        Loss ratio indexed by the values of `column`.
    """
    totals = frame.groupby(column)[['TotalClaims', 'TotalPremium']].sum()
    ratio = totals['TotalClaims'] / totals['TotalPremium']
    return ratio.replace([np.inf, -np.inf], np.nan)


# Overall loss ratio: total claims over total premium for the whole frame.
overall_loss_ratio = df['TotalClaims'].sum() / df['TotalPremium'].sum()
print(f"Overall Loss Ratio: {overall_loss_ratio:.2f}")

# By Province
loss_by_province = loss_ratio_by(df, 'Province').sort_values()
loss_by_province.plot(kind='barh', title='Loss Ratio by Province', figsize=(10, 6))
plt.xlabel("Loss Ratio")
plt.tight_layout()
plt.show()

# By VehicleType
loss_by_vehicle = loss_ratio_by(df, 'VehicleType').sort_values()
loss_by_vehicle.plot(kind='barh', title='Loss Ratio by Vehicle Type', figsize=(10, 6))
plt.xlabel("Loss Ratio")
plt.tight_layout()
plt.show()

# By Gender (left in group-key order, not sorted by value)
loss_by_gender = loss_ratio_by(df, 'Gender')
loss_by_gender.plot(kind='bar', title='Loss Ratio by Gender', figsize=(6, 4), color='orchid')
plt.ylabel("Loss Ratio")
plt.tight_layout()
plt.show()


6. Univariate Distributions

In [None]:
# One histogram per monetary column: 50 bins with a KDE curve overlaid.
# Each entry pairs the column to plot with the title of its figure.
histogram_specs = [
    ('TotalPremium', "Distribution of TotalPremium"),
    ('TotalClaims', "Distribution of TotalClaims"),
    ('CustomValueEstimate', "Distribution of CustomValueEstimate"),
]

for hist_column, hist_title in histogram_specs:
    sns.histplot(df[hist_column], bins=50, kde=True)
    plt.title(hist_title)
    plt.show()


7. Outlier Detection (Box Plots)

In [None]:
# Horizontal box plot per column, each on its own 12x6 figure.
boxplot_specs = [
    ('TotalClaims', "Outlier Detection in TotalClaims"),
    ('CustomValueEstimate', "Outlier Detection in CustomValueEstimate"),
]

for box_column, box_title in boxplot_specs:
    plt.figure(figsize=(12, 6))
    sns.boxplot(x=df[box_column])
    plt.title(box_title)
    plt.show()


8. Temporal Trends

In [None]:
# Convert to datetime
df['TransactionMonth'] = pd.to_datetime(df['TransactionMonth'])

# Group by month
monthly = df.groupby(df['TransactionMonth'].dt.to_period('M')).agg({
    'TotalClaims': 'sum',
    'TotalPremium': 'sum'
}).reset_index()

monthly['TransactionMonth'] = monthly['TransactionMonth'].astype(str)

# Plot trend
plt.figure(figsize=(14, 6))
sns.lineplot(data=monthly, x='TransactionMonth', y='TotalClaims', label='Claims')
sns.lineplot(data=monthly, x='TransactionMonth', y='TotalPremium', label='Premium')
plt.xticks(rotation=45)
plt.title("Monthly Trends in Claims and Premiums")
plt.legend()
plt.tight_layout()
plt.show()


9. Correlation Analysis

In [None]:
# Correlation matrix
corr = df[['TotalClaims', 'TotalPremium', 'CustomValueEstimate']].corr()

# Heatmap
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix")
plt.show()


10. Vehicle Make/Model Insights

In [None]:
# Top claims by make
top_makes = df.groupby('Make')['TotalClaims'].mean().sort_values(ascending=False).head(10)
top_makes.plot(kind='bar', title="Top 10 Makes by Average Claim", figsize=(10, 5), color='salmon')
plt.ylabel("Average Claims")
plt.tight_layout()
plt.show()

# Lowest claims by make
low_makes = df.groupby('Make')['TotalClaims'].mean().sort_values().head(10)
low_makes.plot(kind='bar', title="Lowest 10 Makes by Average Claim", figsize=(10, 5), color='skyblue')
plt.ylabel("Average Claims")
plt.tight_layout()
plt.show()


11. Geographical Comparison Example

In [None]:
# Average Premium by Province
df.groupby('Province')['TotalPremium'].mean().sort_values().plot(
    kind='barh', title="Average Premium by Province", figsize=(10, 6), color='goldenrod'
)
plt.xlabel("Average Premium")
plt.tight_layout()
plt.show()


12. Three Beautiful Visuals to Capture Insight

CustomValueEstimate vs TotalClaims

In [None]:
# Scatter of estimated vehicle value against claims, coloured by vehicle type.
scatter_ax = sns.scatterplot(
    data=df,
    x='CustomValueEstimate',
    y='TotalClaims',
    hue='VehicleType',
    alpha=0.6,
)
scatter_ax.set_title("CustomValueEstimate vs TotalClaims by VehicleType")
# Both axes are logarithmic.
# NOTE(review): a log axis cannot place zero or negative values, so rows with
# non-positive TotalClaims or CustomValueEstimate will not be visible. Confirm
# that is intended.
scatter_ax.set_xscale("log")
scatter_ax.set_yscale("log")
plt.tight_layout()
plt.show()


Loss Ratio Distribution

In [None]:
# Violin plot of the LossRatio column of df, one violin per Gender value.
violin_ax = sns.violinplot(data=df, x='Gender', y='LossRatio')
violin_ax.set_title("Distribution of Loss Ratio by Gender")
plt.tight_layout()
plt.show()


Premium Over Time by Province

In [None]:
# Mean premium for every (TransactionMonth, Province) pair, flattened to columns.
mean_premium = df.groupby(['TransactionMonth', 'Province'])['TotalPremium'].mean()
premium_trends = mean_premium.reset_index()

# Make sure the month column is datetime so the x-axis is a time axis.
premium_trends['TransactionMonth'] = pd.to_datetime(premium_trends['TransactionMonth'])

# One line per province.
plt.figure(figsize=(12, 6))
sns.lineplot(
    data=premium_trends,
    x='TransactionMonth',
    y='TotalPremium',
    hue='Province',
)
plt.title("Average Premium Over Time by Province")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
