# Exploratory Data Analysis

This notebook summarises missing values, outliers, distributions, correlations, and medical insights for the heart disease survival dataset.


In [None]:
from IPython.display import Image, display
import pandas as pd
import sys
from pathlib import Path

def find_project_root(start: Path, marker: str = "src") -> Path:
    """Return the nearest directory at or above ``start`` that contains ``marker``.

    The notebook imports from the ``src`` package, so the directory holding
    ``src/`` must be on ``sys.path``. Searching upward makes this work whether
    the kernel starts in the repository root or in a sub-folder of it.

    Args:
        start: Directory to begin the search from.
        marker: Name of the sub-directory that identifies the project root.

    Returns:
        The first matching directory; if none matches, the parent of ``start``
        (the location this notebook previously always assumed).
    """
    start = start.resolve()
    for candidate in (start, *start.parents):
        if (candidate / marker).is_dir():
            return candidate
    # No marker found anywhere: keep the historical "one level up" behaviour.
    return start.parent


PROJECT_ROOT = find_project_root(Path.cwd())
# Make `src` importable without installing the project as a package.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

from src.data_loader import load_data
from src.eda import (
    detect_outliers_iqr,
    medical_insights,
    missing_value_summary,
    run_eda,
)

# Load the dataset through the project's loader; every later cell reads from `df`.
df = load_data()
# Trailing expression so the notebook renders a preview
# (assumes `df` is a pandas DataFrame — TODO confirm in src.data_loader).
df.head()


ModuleNotFoundError: No module named 'src'

In [None]:
# Summarise missing values in `df` via the project helper
# (the shape of the result is defined in src.eda — not visible from this notebook).
missing = missing_value_summary(df)
# Trailing expression: rendered as this cell's output.
missing


In [None]:
# Run the project's outlier detection on every column except the label.
# The label is excluded by name rather than by position, so the feature list
# stays correct even if "target" is not the last column of the frame.
feature_columns = df.columns.drop("target").tolist()
outliers = detect_outliers_iqr(df, features=feature_columns)
# Trailing expression: rendered as this cell's output.
outliers


In [None]:
# Produce the EDA plots. `run_eda` returns a mapping of plot name -> image
# path (presumably files it wrote to disk — confirm in src.eda).
figures = run_eda(df)

# Show each plot inline, preceded by a line identifying its name and location.
# (A bare `figures` expression placed mid-cell is never rendered, so it was dropped.)
for name, path in figures.items():
    print(f"Plot: {name} -> {path}")
    display(Image(filename=path))


In [None]:
# Collect the findings from the project helper and print them as a dash-bulleted list.
insights = medical_insights(df)
for finding in insights:
    print(f"- {finding!s}")


In [None]:
# Reload the dataset if this cell is executed without `df` in the namespace.
if "df" not in globals():
    df = load_data()

# Label each row: a target of exactly 0 is "Survived", anything else is "High Risk".
target_labels = df["target"].map(
    lambda value: "Survived" if value == 0 else "High Risk"
)
# Count how many patients fall under each label.
survival_summary = target_labels.value_counts()
survival_summary


NameError: name 'df' is not defined

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Work on a copy so the helper label column is not added to the shared `df`.
survival_df = df.copy()
# Vectorised form of the labelling rule: exactly 0 -> "Survived",
# every other value (NaN included) -> "High Risk".
survival_df["survival_label"] = (
    survival_df["target"].eq(0).map({True: "Survived", False: "High Risk"})
)

fig, ax = plt.subplots(figsize=(6, 4))
# seaborn 0.13 deprecates `palette` without `hue` (removal planned for 0.14):
# assign the x variable to `hue` as well. `dodge=False` keeps one full-width
# bar per category instead of side-by-side hue slots.
sns.countplot(
    data=survival_df,
    x="survival_label",
    hue="survival_label",
    palette="Set2",
    dodge=False,
    ax=ax,
)
# The hue legend only repeats the x tick labels; remove it if seaborn drew one
# (whether it does depends on the installed seaborn version).
legend = ax.get_legend()
if legend is not None:
    legend.remove()

ax.set_title("Survival vs High Risk Counts")
ax.set_xlabel("Category")
ax.set_ylabel("Number of Patients")
plt.show()

