"""
# Exploratory Data Analysis (EDA) Notebook

### Objective:
- Load `insurance_featured.csv`
- Perform data visualization & statistical analysis
- Identify key trends and relationships
"""

# In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Load dataset
# The CSV location is resolved relative to the current working directory.
file_path = os.path.join("..", "data", "insurance_featured.csv")
if not os.path.exists(file_path):
    # Stop immediately with a message that names the missing path.
    raise FileNotFoundError(f"File not found: {file_path}")
df = pd.read_csv(file_path)

# Display dataset info
print("Dataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line.
df.info()

# Convert categorical variables to numerical for correlation analysis
# Two-level columns are recoded to 0/1 flags; any value missing from a
# mapping becomes NaN (Series.map semantics).
binary_codes = {
    'sex': {'male': 0, 'female': 1},
    'smoker': {'no': 0, 'yes': 1},
}

# Work on a copy so `df` keeps its original string labels for plotting.
df_numeric = df.copy()
for column_name, codes in binary_codes.items():
    df_numeric[column_name] = df_numeric[column_name].map(codes)

# One-hot encode the remaining multi-level categorical columns.
df_numeric = pd.get_dummies(
    df_numeric,
    columns=['region', 'bmi_category', 'age_group'],
)

# 1️⃣ Distribution of Age, BMI, and Charges
# One 20-bin histogram with a KDE overlay per column, drawn side by side.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# (column in df, panel title) for each subplot, left to right.
panels = (
    ('age', "Age Distribution"),
    ('bmi', "BMI Distribution"),
    ('charges', "Charges Distribution"),
)
for panel_ax, (column_name, panel_title) in zip(axes, panels):
    sns.histplot(df[column_name], bins=20, kde=True, ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

# 2️⃣ Correlation Analysis
# Annotated heatmap of the pairwise correlations of the encoded frame.
plt.figure(figsize=(8, 5))
corr_matrix = df_numeric.corr()
sns.heatmap(
    corr_matrix,
    annot=True,       # print each coefficient inside its cell
    fmt=".2f",        # two decimal places
    cmap="coolwarm",
    linewidths=0.5,
)
plt.title("Correlation Matrix")
plt.show()

# 3️⃣ Boxplot for Smoker vs. Charges
# Charges distribution per smoker level; hue mirrors x so each level gets
# its own palette colour, and the redundant legend is suppressed.
plt.figure(figsize=(6, 4))
smoker_ax = sns.boxplot(
    data=df,
    x="smoker",
    y="charges",
    hue="smoker",
    palette=['blue', 'red'],
    legend=False,
)
# sns.boxplot returns the Axes it drew on (the current axes here).
smoker_ax.set_title("Insurance Charges: Smokers vs. Non-Smokers")
smoker_ax.set_xlabel("Smoker")
smoker_ax.set_ylabel("Charges")
plt.show()
