In [None]:
# ===============================
# Diabetes EDA Notebook (Updated)
# ===============================

import sys
from pathlib import Path
import warnings

# -------------------------------
# Project root setup
# -------------------------------
# The project root is taken to be the parent of the current working
# directory; it is put on sys.path so the `src` package imported further
# down can be resolved.
project_root = Path().resolve().parent
# Notebook cells get re-run: only add the entry when it is not already
# there, otherwise sys.path grows by one duplicate per execution.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# -------------------------------
# Suppress warnings
# -------------------------------
# Ignore every FutureWarning for the rest of the session.
warnings.simplefilter(action="ignore", category=FutureWarning)

# -------------------------------
# Imports
# -------------------------------
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from src.data.preprocess import load_data, clean_data
from src.data.feature_engineering import add_age_bins, add_bmi_features

# Apply seaborn's "whitegrid" style to the figures created below.
sns.set(style="whitegrid")

# -------------------------------
# Create plots directory
# -------------------------------
# Figures are written to <project_root>/plots. An already existing
# directory is reused; missing parent directories are not created.
plots_dir = project_root.joinpath("plots")
plots_dir.mkdir(parents=False, exist_ok=True)

# -------------------------------
# Load and preprocess data
# -------------------------------
# Load the data, then pass it through each preparation step in order:
# cleaning first, followed by the two feature-engineering helpers.
df = load_data()
for _step in (clean_data, add_age_bins, add_bmi_features):
    df = _step(df)

# ===============================
# 1. Missing / Invalid Value Analysis
# ===============================
# Columns in which a zero is regarded as an invalid reading (see the
# variable name and the cleaning step's log message).
cols_invalid_zero = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

# Build the mask from freshly loaded data rather than from `df`: clean_data()
# reports that it replaces zeros with the median, so the cleaned frame no
# longer contains the gaps this section is meant to visualise and a heatmap
# of df.isnull() would be empty.
# NOTE(review): assumes load_data() returns the unmodified raw table - confirm.
raw_invalid = load_data()[cols_invalid_zero]
# A cell counts as missing when it is NaN or holds an invalid zero.
invalid_mask = raw_invalid.isnull() | raw_invalid.eq(0)

plt.figure(figsize=(8,6))
sns.heatmap(invalid_mask, cbar=False)
plt.title("Missing Values Heatmap")
plt.tight_layout()
plt.savefig(plots_dir / "missing_values_heatmap.png")
plt.close()

# ===============================
# 2. Outcome Distribution
# ===============================
# Relative frequency of each Outcome value, drawn as a pie chart with
# one-decimal percentage labels.
outcome_share = df["Outcome"].value_counts(normalize=True)

plt.figure(figsize=(6, 6))
outcome_share.plot.pie(autopct="%1.1f%%", startangle=90)
plt.title("Diabetes Outcome Distribution (%)")
plt.ylabel("")  # no y-axis label on the pie
plt.tight_layout()

outcome_png = plots_dir / "outcome_distribution_percentage.png"
plt.savefig(outcome_png)
plt.close()

# ===============================
# 3. Univariate Analysis (Numeric Features)
# ===============================
# Every numeric column except the target gets its own histogram.
numeric_cols = df.select_dtypes(include=np.number).columns.drop("Outcome")

# Three histograms per row. At least three rows are kept so the usual grid
# is unchanged, but rows are added when there are more than nine numeric
# columns: DataFrame.hist raises ValueError if the layout has fewer cells
# than there are columns to draw.
hist_cols = 3
hist_rows = max(3, -(-len(numeric_cols) // hist_cols))  # ceiling division

df[numeric_cols].hist(
    bins=30,
    layout=(hist_rows, hist_cols),
    figsize=(15, 10),
)
plt.suptitle("Distribution of Numeric Features", y=1.02)
plt.tight_layout()
# y=1.02 places the title above the figure's top edge; bbox_inches="tight"
# expands the saved area to include it instead of cropping it off.
plt.savefig(plots_dir / "numeric_features_histograms.png", bbox_inches="tight")
plt.close()

# ===============================
# 4. Bivariate Analysis: Feature vs Outcome
# ===============================
# One box plot per numeric feature, split by Outcome; each figure is saved
# as "<column>_vs_outcome.png" and closed before the next one is drawn.
for feature in numeric_cols:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.boxplot(data=df, x="Outcome", y=feature, ax=ax)
    ax.set_title(f"{feature} vs Outcome")
    fig.tight_layout()
    fig.savefig(plots_dir / f"{feature}_vs_outcome.png")
    plt.close(fig)

# ===============================
# 5. Feature Engineering Visualizations
# ===============================

# BMI Category vs BMI
# Box plot of the BMI values within each BMI_Category level.
bmi_fig, bmi_ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x="BMI_Category", y="BMI", ax=bmi_ax)
bmi_ax.set_title("BMI by Category")
bmi_ax.set_xlabel("BMI Category")
bmi_ax.set_ylabel("BMI")
bmi_fig.tight_layout()
bmi_fig.savefig(plots_dir / "bmi_by_category.png")
plt.close(bmi_fig)

# Age Group vs Outcome (if available)
# Count of rows per AgeGroup, with one bar per Outcome value; skipped
# entirely when the frame has no AgeGroup column.
has_age_group = "AgeGroup" in df.columns
if has_age_group:
    age_fig, age_ax = plt.subplots(figsize=(8, 6))
    sns.countplot(data=df, x="AgeGroup", hue="Outcome", ax=age_ax)
    age_ax.set_title("Diabetes Outcome by Age Group")
    age_fig.tight_layout()
    age_fig.savefig(plots_dir / "outcome_by_age_group.png")
    plt.close(age_fig)

# ===============================
# 6. Correlation Analysis
# ===============================
# Only numeric columns take part in the correlation computations.
numeric_df = df.select_dtypes(include=np.number)

# Full correlation matrix
# Annotated heatmap of the pairwise correlations, two decimals per cell.
corr_matrix = numeric_df.corr()
heatmap_options = {"annot": True, "fmt": ".2f", "cmap": "coolwarm"}

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title("Correlation Matrix (Numeric Features Only)")
plt.tight_layout()
plt.savefig(plots_dir / "correlation_matrix.png")
plt.close()

# Correlation with target
# Correlation of each numeric feature with Outcome, strongest positive first.
# Outcome's correlation with itself is always 1.0; it carries no information
# and would only compress the bars of the real features, so it is dropped
# before plotting.
corr_target = (
    numeric_df.corr()["Outcome"]
    .drop("Outcome")
    .sort_values(ascending=False)
)

plt.figure(figsize=(6,4))
sns.barplot(
    x=corr_target.values,
    y=corr_target.index
)
plt.title("Correlation of Features with Outcome")
plt.tight_layout()
plt.savefig(plots_dir / "feature_correlation_target.png")
plt.close()

# ===============================
# 7. Pairplot (Key Features)
# ===============================
# Pairwise scatter plots of a few selected features, coloured by Outcome,
# with KDE curves on the diagonal.
pairplot_cols = ["Glucose", "BMI", "Age", "Insulin", "Outcome"]
pairplot_data = df[pairplot_cols]
sns.pairplot(data=pairplot_data, hue="Outcome", diag_kind="kde")

# The pairplot's figure is the current figure here, so the pyplot-level
# save/close calls act on it.
pairplot_png = plots_dir / "pairplot_features.png"
plt.savefig(pairplot_png)
plt.close()

print(f"✅ All EDA plots saved successfully in: {plots_dir}")


2026-01-31 09:43:26,462 [INFO] Preprocessing: Loading data from ../data/raw/diabetes.csv
2026-01-31 09:43:26,465 [INFO] Preprocessing: Cleaning data: replacing zeros with median and removing duplicates
2026-01-31 09:43:26,471 [INFO] FeatureEngineering: FeatureEngineering: Adding AgeGroup feature
2026-01-31 09:43:26,473 [INFO] FeatureEngineering: FeatureEngineering: Adding BMI features
2026-01-31 09:43:27,214 [INFO] matplotlib.category: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2026-01-31 09:43:27,217 [INFO] matplotlib.category: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
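2026-01-31 09:43:27,280 [INFO] matplotlib.category: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.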

✅ All EDA plots saved successfully in: /home/thewoman/Downloads/diabetes-ml-main/plots
