In [None]:
# HEART-MLOPS/notebooks/eda.py

from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from src.data import load_raw, clean_and_preprocess

# 1. Load the raw data and pass it through the project's cleaning step.
#    `df` is the frame every plot below reads from.
raw_df = load_raw()
df = clean_and_preprocess(raw_df)

# Global plot defaults: seaborn "whitegrid" theme, then a 12x8 default figure
# size (individual sections below override the size per figure).
sns.set_theme(style="whitegrid")
plt.rc('figure', figsize=(12, 8))

# --- Task 1.1: Class Balance ---
# Bar chart of how many rows fall under each value of the `target` column.
# savefig() does not create missing directories, so make sure the output
# folder exists before the figure is written.
Path('../reports/figures').mkdir(parents=True, exist_ok=True)
plt.figure(figsize=(6, 4))
# NOTE(review): seaborn >= 0.13 emits a FutureWarning when `palette` is passed
# without `hue`; left as-is to stay compatible with older seaborn releases.
sns.countplot(x='target', data=df, palette='viridis')
plt.title('Class Balance: Heart Disease Presence (1) vs Absence (0)')
plt.xlabel('Diagnosis (Target)')
plt.ylabel('Count')
plt.savefig('../reports/figures/class_balance.png')
plt.show()

# --- Task 1.2: Histograms (Feature Distributions) ---
# One histogram with a KDE overlay per key numeric feature, drawn side by
# side: age, resting blood pressure (trestbps), cholesterol (chol).
# savefig() does not create missing directories, so make sure the output
# folder exists before the figure is written.
Path('../reports/figures').mkdir(parents=True, exist_ok=True)
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
# (column, bar colour, panel title) for each of the three panels, left to right.
histogram_specs = [
    ('age', 'skyblue', 'Age Distribution'),
    ('trestbps', 'salmon', 'Resting Blood Pressure'),
    ('chol', 'green', 'Cholesterol Distribution'),
]
for hist_ax, (column, color, title) in zip(axes, histogram_specs):
    sns.histplot(df[column], kde=True, ax=hist_ax, color=color)
    hist_ax.set_title(title)
plt.tight_layout()
plt.savefig('../reports/figures/histograms.png')
plt.show()

# --- Task 1.3: Correlation Heatmap ---
# Annotated heatmap of the pairwise correlations between the columns of `df`.
# savefig() does not create missing directories, so make sure the output
# folder exists before the figure is written.
Path('../reports/figures').mkdir(parents=True, exist_ok=True)
plt.figure(figsize=(12, 10))
# DataFrame.corr() computes Pearson correlation by default; pass
# method='spearman' for a rank-based alternative.
# Only numeric/boolean columns are kept: pandas >= 2.0 no longer drops
# non-numeric columns silently and raises on them instead.
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corr_matrix, annot=True, cmap='RdBu_r', fmt='.2f', linewidths=0.5)
plt.title('Feature Correlation Heatmap')
plt.savefig('../reports/figures/correlation_heatmap.png')
plt.show()

# --- Task 1.4: Feature vs Target (e.g., Age vs Disease) ---
# Box plot of `age`, one box per value of `target`. The figure is displayed
# only; this section does not write it to disk.
plt.figure(figsize=(10, 6))
age_ax = sns.boxplot(data=df, x='target', y='age', palette='Set2')
age_ax.set_title('Age Distribution by Heart Disease Status')
plt.show()

ModuleNotFoundError: No module named 'pandas'