# Preprocessing + EDA


# In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from ucimlrepo import fetch_ucirepo

# Fetch dataset id 45 from the UCI ML repository and join its feature and
# target frames column-wise into a single DataFrame.
heart_disease = fetch_ucirepo(id=45)
X = heart_disease.data.features
y = heart_disease.data.targets
df = pd.concat([X, y], axis=1)

# The remainder of this script addresses the label column as 'target'
# (drop(columns=['target']), df['target'], x='target'), but nothing above
# gives it that name; the repository ships it under its own column name
# ('num' for this dataset -- NOTE(review): confirm against the dataset page).
# Normalise the name here so the later lookups do not raise KeyError.
if y.shape[1] == 1 and 'target' not in df.columns:
    df = df.rename(columns={y.columns[0]: 'target'})

# Quick structural overview: dimensions, first rows, dtypes, summary
# statistics and per-column missing-value counts.
print("Shape:", df.shape)
print(df.head())
df.info()  # info() prints its own report and returns None; no print() wrapper
print(df.describe())
print(df.isna().sum())

# Fill missing values one column at a time: object-dtype columns receive
# their most frequent value, every other column receives its median.
for column_name in df.columns:
    series = df[column_name]
    if series.dtype == 'object':
        replacement = series.mode()[0]
    else:
        replacement = series.median()
    df[column_name] = series.fillna(replacement)

# One-hot encode the remaining categorical columns; drop_first=True omits the
# first level of each so the indicator columns are not perfectly collinear.
df = pd.get_dummies(df, drop_first=True)

# Standardise every feature column with StandardScaler; the label column is
# excluded from scaling and re-attached afterwards with its original values.
scaler = StandardScaler()
features = df.drop(columns=['target'])
scaled = scaler.fit_transform(features)
# Carry over the original row index: the assignment below aligns on index
# labels, so a default RangeIndex would produce NaN or misaligned targets
# whenever df's index is not exactly 0..n-1.
df_scaled = pd.DataFrame(scaled, columns=features.columns, index=features.index)
df_scaled['target'] = df['target']

# Histogram grid covering every column of the scaled frame.
df_scaled.hist(figsize=(15, 12))
plt.suptitle("Feature Distributions", fontsize=16)
plt.show()

# Pairwise correlations between all columns, drawn without cell annotations.
correlations = df_scaled.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlations, annot=False, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()

# One box plot per feature, grouped by target value; only the first six
# feature columns are plotted.
boxplot_columns = features.columns[:6]
for feature_name in boxplot_columns:
    _, axis = plt.subplots(figsize=(6, 4))
    sns.boxplot(data=df_scaled, x='target', y=feature_name, ax=axis)
    axis.set_title(f"{feature_name} vs Target")
    plt.show()

# Persist the scaled features together with the target column as CSV,
# without writing the index.
output_path = "data/processed/heart_processed.csv"
# to_csv does not create missing parent directories, so make sure the
# destination folder exists before writing.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df_scaled.to_csv(output_path, index=False)
print(f"Processed dataset saved to {output_path}")

