1. Import Libraries and Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display configuration: seaborn "whitegrid" theme for plots,
# and no cap on how many DataFrame columns pandas renders.
sns.set(style="whitegrid")
pd.options.display.max_columns = None

# Read the insurance CSV into `df` (relative path -- adjust as needed)
RAW_DATA_PATH = '../data/raw/insurance_data.csv'
df = pd.read_csv(RAW_DATA_PATH)


2. Data Structure & Overview

In [None]:
# Dimensions as (rows, columns), then the dtype of every column
print(df.shape, df.dtypes, sep="\n")

# Preview the first five rows
df.head(5)


3. Descriptive Statistics

In [None]:
# Summary statistics for the numerical columns.
# A notebook cell only renders its LAST expression automatically, so the
# summary is passed to display() explicitly; as a bare expression in the
# middle of the cell its result would be silently discarded.
display(df.describe())

# Variability (standard deviation) of the premium, claims and value columns.
# Left as the final expression so it is rendered as the cell output.
df[['TotalPremium', 'TotalClaims', 'CustomValueEstimate']].std()


4. Check for Missing Values

In [None]:
# Number of null entries in each column
missing = df.isna().sum()

# Keep only the columns that actually contain nulls, largest count first
missing.loc[missing.gt(0)].sort_values(ascending=False)


5. Loss Ratio Analysis

In [None]:
# Loss ratio = claims / premium.
#
# Two things are handled deliberately here:
#   * A zero premium would divide to +/-inf (or NaN for 0/0), and a single inf
#     turns any mean over the column into inf/NaN. Zero denominators are
#     therefore masked to NaN before dividing.
#   * Overall and per-group figures are aggregate ratios (sum of claims over
#     sum of premium), not the mean of per-row ratios, so every unit of
#     premium carries the same weight regardless of policy size.


def _loss_ratio_by(frame, column):
    """Return the aggregate loss ratio for each group of `column`.

    Parameters
    ----------
    frame : DataFrame with 'TotalClaims' and 'TotalPremium' columns.
    column : name of the column to group by.

    Returns
    -------
    Series named 'LossRatio', indexed by the group keys. Groups whose summed
    premium is zero get NaN instead of inf.
    """
    totals = frame.groupby(column)[['TotalClaims', 'TotalPremium']].sum()
    premium = totals['TotalPremium']
    return (totals['TotalClaims'] / premium.where(premium != 0)).rename('LossRatio')


def _plot_loss_ratio(ratios, title, kind='barh', figsize=(10, 6), **plot_kwargs):
    """Draw one bar chart of `ratios` and label the value axis "Loss Ratio".

    `kind` is 'barh' (values on the x axis) or 'bar' (values on the y axis);
    extra keyword arguments are forwarded to `Series.plot`.
    """
    fig, ax = plt.subplots(figsize=figsize)
    ratios.plot(kind=kind, ax=ax, title=title, **plot_kwargs)
    # The value axis is horizontal for 'barh' and vertical otherwise.
    if kind == 'barh':
        ax.set_xlabel("Loss Ratio")
    else:
        ax.set_ylabel("Loss Ratio")
    fig.tight_layout()
    plt.show()


# Per-row ratio, kept as a column on `df`; rows with a zero premium become NaN.
df['LossRatio'] = df['TotalClaims'] / df['TotalPremium'].where(df['TotalPremium'] != 0)

# Overall loss ratio
total_premium = df['TotalPremium'].sum()
overall_loss_ratio = df['TotalClaims'].sum() / total_premium if total_premium else np.nan
print(f"Overall Loss Ratio: {overall_loss_ratio:.2f}")

# By Province (sorted ascending)
loss_by_province = _loss_ratio_by(df, 'Province').sort_values()
_plot_loss_ratio(loss_by_province, 'Loss Ratio by Province')

# By VehicleType (sorted ascending)
loss_by_vehicle = _loss_ratio_by(df, 'VehicleType').sort_values()
_plot_loss_ratio(loss_by_vehicle, 'Loss Ratio by Vehicle Type')

# By Gender (group-key order, vertical bars)
loss_by_gender = _loss_ratio_by(df, 'Gender')
_plot_loss_ratio(loss_by_gender, 'Loss Ratio by Gender', kind='bar', figsize=(6, 4), color='orchid')
