# Exploratory Data Analysis - Insurance Claims Complexity

This notebook contains the exploratory analysis of the claims and policies data to understand patterns related to claim complexity.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

# Make the parent of the current working directory importable so the
# `from src...` imports below can resolve. The membership check keeps
# repeated executions of this cell from piling duplicate entries onto sys.path.
_parent_dir = os.path.abspath('..')
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

from src.utils.config import load_config
from src.data.loader import DataLoader
from src.preprocessing.merging import DataMerger

# Read the YAML configuration, then let the loader fetch everything at once.
config = load_config('../config/config.yaml')
loader = DataLoader(config)
data = loader.load_all_data()

# Pull out the two training tables used throughout the rest of the notebook.
df_claims, df_policies = (
    data[table_key] for table_key in ('train_claims', 'train_policies')
)

## 1. Data Overview

In [None]:
# Report table dimensions; the policies table may be absent (None).
print(f"Claims Shape: {df_claims.shape}")
if df_policies is None:
    print("Policies Shape: N/A")
else:
    print(f"Policies Shape: {df_policies.shape}")

# Last expression of the cell: the notebook renders the first claims rows.
df_claims.head()

## 2. Target Variable Analysis

In [None]:
# Name of the label column, as declared in the configuration.
target_col = config['data']['target_col']

# Count plot: number of claims per label value.
sns.countplot(data=df_claims, x=target_col)
plt.title('Distribution of Claim Complexity Labels')
plt.show()

# Same distribution expressed as relative frequencies.
label_shares = df_claims[target_col].value_counts(normalize=True)
print(label_shares)

## 3. Numeric Feature Analysis

In [None]:
# Each pair is (claims column to plot, title of its figure); one box plot
# per column, grouped by the target label.
numeric_plots = (
    ('ReportedDamage', 'Reported Damage vs Complexity'),
    ('NumParties', 'Number of Parties vs Complexity'),
)
for feature_name, plot_title in numeric_plots:
    sns.boxplot(data=df_claims, x=target_col, y=feature_name)
    plt.title(plot_title)
    plt.show()