In [1]:
# Standard imports
import pandas as pd
import numpy as np
import os
import sys

# Append the project root directory to the system path to import modules
# The path is relative to the notebook's location in 'notebooks/'
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# Import custom modules
from src.loader import DataLoader
from src.eda_utils import EDAPlotter

print("Custom modules (DataLoader, EDAPlotter) imported successfully.")

Custom modules (DataLoader, EDAPlotter) imported successfully.


In [25]:
# Instantiate and Load Data (The DVC-tracked file)
loader = DataLoader('../data/raw/insurance_claims.csv')
df = loader.load_data()
df = loader.clean_column_names()
df = loader.optimize_dtypes()

print(f"Data successfully loaded. Shape: {df.shape}")
print("\n--- Data Quality Check ---")
# Print a concise summary of column types and memory usage
df.info(verbose=False, memory_usage="deep")
print("\nTop 5 Missing Columns:")
# Identify critical missing data points for your interim report
print(df.isnull().sum().sort_values(ascending=False).head())

Data successfully loaded. Shape: (1000098, 52)

--- Data Quality Check ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000098 entries, 0 to 1000097
Columns: 52 entries, underwrittencoverid to totalclaims
dtypes: bool(1), category(36), float64(11), int64(4)
memory usage: 152.7 MB

Top 5 Missing Columns:
numberofvehiclesinfleet    1000098
crossborder                 999400
customvalueestimate         779642
rebuilt                     641901
converted                   641901
dtype: int64


In [26]:
# In Cell 3: Feature Engineering (The KPI Metric)

# CRITICAL FIX: Ensure 'transactionmonth' is the correct datetime type, overriding any previous type issues.
if 'transactionmonth' in df.columns:
    # Coercing errors will turn invalid date strings into NaT (Not a Time)
    df['transactionmonth'] = pd.to_datetime(df['transactionmonth'], errors='coerce')
    # Drop any rows where the date failed to parse (safeguards against resample issues)
    df.dropna(subset=['transactionmonth'], inplace=True)
    
    # DIAGNOSTIC CHECK: Print the type to confirm the fix
    print(f"TransactionMonth Dtype after fix: {df['transactionmonth'].dtype}")


# Calculate the industry-standard KPI: LOSS RATIO
# Loss Ratio = Total Claims / Total Premium
df['loss_ratio'] = df['totalclaims'] / df['totalpremium']

# CRITICAL DATA CLEANING: Handle division by zero/NaN/Inf values.
df['loss_ratio'].replace([np.inf, -np.inf], np.nan, inplace=True)
df['loss_ratio'] = df['loss_ratio'].fillna(0) 

# Cap the loss ratio for robust visualization. Loss Ratio > 1.0 means unprofitable.
df['capped_loss_ratio'] = df['loss_ratio'].clip(upper=5.0) 

print(f"\nLoss Ratio calculated.")
print(f"Mean Loss Ratio (Profitability Indicator): {df['loss_ratio'].mean():.4f}")
print(f"Max Capped Loss Ratio: {df['capped_loss_ratio'].max():.2f}")

TransactionMonth Dtype after fix: category

Loss Ratio calculated.
Mean Loss Ratio (Profitability Indicator): 0.2164
Max Capped Loss Ratio: 5.00


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['loss_ratio'].replace([np.inf, -np.inf], np.nan, inplace=True)


In [27]:
plotter = EDAPlotter(df)

# Check distribution of the core money variables
plotter.plot_univariate_distribution(column='totalpremium', title_suffix="($) - Skewed")
plotter.plot_univariate_distribution(column='totalclaims', title_suffix="($) - Heavy Right Tail")

# Plot the engineered KPI metric
plotter.plot_univariate_distribution(column='capped_loss_ratio', title_suffix=" (Capped at 5.0)")

print("Cell 4: Univariate plots generated and saved to reports/figures/")

Cell 4: Univariate plots generated and saved to reports/figures/


In [28]:
# Insight 1: Geography (Province) vs. Profitability (Loss Ratio)
plotter.plot_risk_by_category(x_col='province', y_col='loss_ratio', sort=True)

# Insight 2: Vehicle Body Type vs. Claim Severity (totalclaims)
plotter.plot_risk_by_category(x_col='bodytype', y_col='totalclaims', sort=True)

# Insight 3: Demographics (Marital Status) vs. Loss Ratio
plotter.plot_risk_by_category(x_col='maritalstatus', y_col='loss_ratio', sort=True)

print("Cell 5: Bivariate plots generated and saved to reports/figures/")

  risk_df = self.df.groupby(x_col)[y_col].mean().reset_index()
  risk_df = self.df.groupby(x_col)[y_col].mean().reset_index()


Cell 5: Bivariate plots generated and saved to reports/figures/


  risk_df = self.df.groupby(x_col)[y_col].mean().reset_index()


In [32]:
# In Cell 6: Correlation and Time Series - FINAL SOLUTION

# 6.1 Correlation Matrix
numerical_cols = [
    'totalpremium', 
    'totalclaims', 
    'loss_ratio', 
    'customvalueestimate', 
    'registrationyear',    
    'cylinders',           
    'termfrequency'        
]
plotter.plot_correlation_heatmap(numerical_cols)

# 6.2 Time Series of Claims (Creative Plotting)
# FINAL SOLUTION: Convert categorical datetime to regular datetime

# Method 1: Extract datetime from categorical (most reliable)
# Since the categories are already datetime64[ns], we can convert them directly
try:
    # Extract the datetime values from the categorical
    # Convert categorical to its underlying datetime categories
    datetime_values = df['transactionmonth'].cat.categories[df['transactionmonth'].cat.codes]
    
    # Create a new DataFrame with proper datetime
    df_time = df.copy()
    df_time['transactionmonth'] = pd.DatetimeIndex(datetime_values)
    
    # Drop any NaT values (just in case)
    df_time = df_time.dropna(subset=['transactionmonth'])
    
    # Set index and resample - use 'ME' (Month End) instead of deprecated 'M'
    time_series_df = df_time.set_index('transactionmonth').resample('ME')['totalclaims'].sum().reset_index()
    
    print("✓ Time series created using categorical datetime extraction")
    
except Exception as e:
    print(f"Method 1 failed: {e}")
    
    # Method 2: Fallback - convert to string then to datetime
    df_time = df.copy()
    # Convert categorical to string then to datetime
    df_time['transactionmonth'] = pd.to_datetime(df_time['transactionmonth'].astype(str), errors='coerce')
    df_time = df_time.dropna(subset=['transactionmonth'])
    
    # Set index and resample
    time_series_df = df_time.set_index('transactionmonth').resample('ME')['totalclaims'].sum().reset_index()
    
    print("✓ Time series created using string conversion fallback")

# Generate the plot
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid", palette="viridis")

plt.figure(figsize=(14, 6))
sns.lineplot(x='transactionmonth', y='totalclaims', data=time_series_df, marker='o', color=sns.color_palette("viridis")[3])
plt.title('Monthly Total Claims Over Time', fontsize=16)
plt.xlabel("Month", fontsize=12)
plt.ylabel("Total Claims (USD)", fontsize=12)
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig("../reports/figures/claims_time_series.png")
plt.close()

print("✓ Time series plot generated and saved to reports/figures/claims_time_series.png")
print(f"✓ Data points: {len(time_series_df)} months from {time_series_df['transactionmonth'].min()} to {time_series_df['transactionmonth'].max()}")

✓ Time series created using categorical datetime extraction
✓ Time series plot generated and saved to reports/figures/claims_time_series.png
✓ Data points: 23 months from 2013-10-31 00:00:00 to 2015-08-31 00:00:00
