# 01 - EDA for ISIC 2018 Task 3 / HAM10000

This notebook checks data quality and computes exploratory statistics before modeling.

Checks included:
- class distribution
- age by diagnosis
- sex distribution by diagnosis
- localization heatmap
- images-per-lesion distribution
- sample image grid
- missing-value analysis
- metadata correlation analysis
- image dimension/quality analysis

In [None]:
from __future__ import annotations

from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image

sns.set_theme(style="whitegrid")

# Locate the project root.
#   1. On Colab the repository is cloned to /content/DermaFusion.
#   2. Otherwise, a working directory that contains src/ is the repo root.
#   3. Otherwise (e.g. running from notebooks/), use the parent directory.
if Path("/content/DermaFusion").exists():
    PROJECT_ROOT = Path("/content/DermaFusion")
elif (Path.cwd() / "src").exists():
    PROJECT_ROOT = Path.cwd()
else:
    PROJECT_ROOT = Path.cwd().resolve().parents[0]

# Raw data layout: <root>/data/raw/metadata/*.csv
RAW_DIR = PROJECT_ROOT.joinpath("data", "raw")
META_DIR = RAW_DIR.joinpath("metadata")
MERGED_CSV = META_DIR.joinpath("metadata_merged.csv")
HAM_META_CSV = META_DIR.joinpath("HAM10000_metadata.csv")
TRAIN_GT = META_DIR.joinpath("ISIC2018_Task3_Training_GroundTruth.csv")
GROUPINGS_CSV = META_DIR.joinpath("ISIC2018_Task3_Training_LesionGroupings.csv")

# Load metadata from the first available source, in priority order:
# merged CSV, then HAM10000 CSV, then ISIC 2018 ground truth joined with
# the lesion groupings file.
for candidate in (MERGED_CSV, HAM_META_CSV):
    if candidate.exists():
        df = pd.read_csv(candidate)
        print(f"Loaded metadata: {candidate}")
        break
else:
    # Neither single-file source exists; fall back to the two ISIC 2018 files.
    if not (TRAIN_GT.exists() and GROUPINGS_CSV.exists()):
        raise FileNotFoundError(
            "No metadata found. Put metadata_merged.csv, HAM10000_metadata.csv, or "
            "ISIC2018_Task3_Training_GroundTruth.csv + ISIC2018_Task3_Training_LesionGroupings.csv in data/raw/metadata/."
        )
    gt = pd.read_csv(TRAIN_GT)
    grp = pd.read_csv(GROUPINGS_CSV)
    # Left join keeps every ground-truth row; the key column is then renamed
    # to "image_id".
    df = pd.merge(gt, grp, on="image", how="left")
    df = df.rename(columns={"image": "image_id"})
    print(f"Built metadata from ISIC 2018: {TRAIN_GT.name} + {GROUPINGS_CSV.name}")

# Image dir: raw images first, then preprocessed (for Colab hair-removed layout).
# IMAGE_DIR must be the directory that either holds a train/ subfolder or
# holds the .jpg files directly; the image-analysis cell tries
# IMAGE_DIR / "train" first and falls back to IMAGE_DIR itself.
raw_images = RAW_DIR / "images"
preprocessed = PROJECT_ROOT / "data" / "preprocessed_hair_removed" / "images"
if raw_images.exists():
    # Covers both data/raw/images/train/*.jpg and flat data/raw/images/*.jpg.
    # (Previously the flat layout selected RAW_DIR, where no images live.)
    IMAGE_DIR = raw_images
elif preprocessed.exists():
    IMAGE_DIR = preprocessed
else:
    IMAGE_DIR = raw_images  # may fail later if no images

print(f"Image dir: {IMAGE_DIR}")
print(f"Rows: {len(df):,}")
print(df.head())

In [None]:
# Class distribution
# Derive a single "dx" label column from the one-hot class columns when the
# loaded metadata does not already provide one.
class_cols = ["MEL", "NV", "BCC", "AKIEC", "BKL", "DF", "VASC"]
if "dx" not in df.columns:
    if not set(class_cols).issubset(df.columns):
        raise KeyError(
            "Metadata has neither a 'dx' column nor the one-hot class columns "
            f"{class_cols}; cannot compute class distribution."
        )
    # The column holding the row maximum is the label; lower-cased so it
    # compares equal to the lowercase class names used in the sample grid.
    df["dx"] = df[class_cols].idxmax(axis=1).str.lower()

class_counts = df["dx"].value_counts().sort_values(ascending=False)
class_pct = (class_counts / class_counts.sum()) * 100
summary = pd.DataFrame({"count": class_counts, "percent": class_pct.round(2)})
# A bare expression in the middle of a cell is not rendered by Jupyter,
# so print the table explicitly.
print(summary.to_string())

plt.figure(figsize=(10, 5))
sns.barplot(x=summary.index, y=summary["count"], palette="viridis")
plt.title("Class Distribution")
plt.xlabel("Diagnosis")
plt.ylabel("Count")
plt.tight_layout()
plt.show()

In [None]:
# Per-diagnosis breakdowns. Each plot is produced only when the metadata
# column it needs is present.

# Age: one box per diagnosis.
if "age" in df.columns:
    fig, ax = plt.subplots(figsize=(11, 5))
    sns.boxplot(data=df, x="dx", y="age", ax=ax)
    ax.set_title("Age Distribution by Diagnosis")
    ax.set_xlabel("Diagnosis")
    ax.set_ylabel("Age")
    plt.xticks(rotation=20)
    fig.tight_layout()
    plt.show()

# Sex: row-normalised crosstab, so each diagnosis bar stacks to 100%.
if "sex" in df.columns:
    sex_dx = pd.crosstab(df["dx"], df["sex"], normalize="index").mul(100)
    ax = sex_dx.plot(kind="bar", stacked=True, figsize=(11, 5), colormap="Set2")
    ax.set_title("Sex Distribution by Diagnosis (%)")
    ax.set_ylabel("Percent")
    plt.tight_layout()
    plt.show()

# Localization: raw counts of diagnosis x body site.
if "localization" in df.columns:
    loc_dx = pd.crosstab(df["dx"], df["localization"])
    fig, ax = plt.subplots(figsize=(14, 6))
    sns.heatmap(loc_dx, cmap="magma", ax=ax)
    ax.set_title("Diagnosis vs Localization Heatmap")
    fig.tight_layout()
    plt.show()

In [None]:
# Images-per-lesion distribution
# value_counts() on lesion_id gives the number of metadata rows (images)
# recorded for each lesion.
if "lesion_id" in df.columns:
    lesion_counts = df["lesion_id"].value_counts()
    plt.figure(figsize=(10, 4))
    sns.histplot(lesion_counts, bins=30, kde=False)
    plt.title("Images Per Lesion Distribution")
    plt.xlabel("Images per lesion")
    plt.ylabel("Number of lesions")
    plt.tight_layout()
    plt.show()

# Missing value analysis
missing_pct = (df.isna().sum() / len(df) * 100).sort_values(ascending=False)
missing_df = pd.DataFrame({"missing_pct": missing_pct})
# A bare expression in the middle of a cell is not rendered by Jupyter,
# so print the columns that actually have missing values.
missing_nonzero = missing_df[missing_df["missing_pct"] > 0]
if missing_nonzero.empty:
    print("No missing values in any column.")
else:
    print(missing_nonzero.round(2).to_string())

plt.figure(figsize=(10, 4))
missing_df["missing_pct"].plot(kind="bar")
plt.title("Missing Value Percentage by Column")
plt.ylabel("% Missing")
plt.tight_layout()
plt.show()

In [None]:
# Correlation analysis (numeric metadata)
# Only columns that exist in the loaded metadata are used; the encoded
# columns (age_norm, sex_idx, localization_idx) may be absent.
numeric_cols = [c for c in ["age", "age_norm", "sex_idx", "localization_idx"] if c in df.columns]
corr = df[numeric_cols].corr(numeric_only=True) if numeric_cols else pd.DataFrame()
# A correlation heatmap needs at least two numeric columns: a 1x1 matrix is
# always 1.0, and an empty matrix makes sns.heatmap raise.
if corr.shape[0] >= 2:
    plt.figure(figsize=(6, 5))
    sns.heatmap(corr, annot=True, cmap="coolwarm", vmin=-1, vmax=1)
    plt.title("Metadata Correlation Heatmap")
    plt.tight_layout()
    plt.show()
else:
    print(
        "Skipping correlation heatmap: fewer than two numeric metadata columns "
        f"available (found: {list(corr.columns)})."
    )

In [None]:
# Image dimension / quality analysis and sample image grid
# Prefer a train/ subfolder; otherwise assume the images sit directly in IMAGE_DIR.
train_image_dir = IMAGE_DIR / "train"
if not train_image_dir.exists():
    train_image_dir = IMAGE_DIR

image_files = sorted(train_image_dir.glob("*.jpg"))
if not image_files:
    raise FileNotFoundError(f"No images found in {train_image_dir}")

# Collect size and overall pixel variance for the first 1000 files (sorted by
# name) to keep the cell fast.
shapes = []
for path in image_files[:1000]:
    with Image.open(path) as img:
        w, h = img.size
        arr = np.asarray(img.convert("RGB"))
        variance = float(np.var(arr))
    shapes.append({"file": path.name, "width": w, "height": h, "variance": variance})

shape_df = pd.DataFrame(shapes)
# A bare expression in the middle of a cell is not rendered by Jupyter,
# so print the summary statistics explicitly.
print(shape_df[["width", "height", "variance"]].describe().to_string())

plt.figure(figsize=(10, 4))
sns.histplot(shape_df["width"], color="steelblue", label="width", bins=30, alpha=0.7)
sns.histplot(shape_df["height"], color="orange", label="height", bins=30, alpha=0.5)
plt.legend()
plt.title("Image Width/Height Distribution (sample)")
plt.tight_layout()
plt.show()

plt.figure(figsize=(10, 4))
sns.histplot(shape_df["variance"], bins=40)
plt.title("Image Pixel Variance Distribution (sample)")
plt.tight_layout()
plt.show()

# 3x7 sample images by class, if possible
classes = ["mel", "nv", "bcc", "akiec", "bkl", "df", "vasc"]
if "dx" in df.columns:
    # Normalise labels once instead of once per class.
    dx_lower = df["dx"].astype(str).str.lower()
    fig, axes = plt.subplots(3, len(classes), figsize=(18, 8))
    for col_idx, cls in enumerate(classes):
        class_rows = df[dx_lower == cls].head(3)
        for row_idx in range(3):
            ax = axes[row_idx, col_idx]
            if row_idx == 0:
                # Label every column, even when its first image is missing.
                ax.set_title(cls)
            if row_idx < len(class_rows):
                image_id = str(class_rows.iloc[row_idx]["image_id"])
                img_path = train_image_dir / f"{image_id}.jpg"
                if not img_path.exists():
                    # Fall back to using image_id as the file name as-is.
                    img_path = train_image_dir / image_id
                if img_path.exists():
                    # convert() returns an independent in-memory copy, so the
                    # file handle can be closed as soon as the block exits.
                    with Image.open(img_path) as img:
                        ax.imshow(img.convert("RGB"))
                else:
                    ax.text(0.5, 0.5, "Missing", ha="center", va="center")
            ax.axis("off")
    plt.tight_layout()
    plt.show()

## Verification Checklist

After running all cells, confirm:

- class distribution is consistent with HAM10000 profile
- age missing values are visible and quantified
- images-per-lesion distribution plotted
- sample images render for all 7 classes where available
- no runtime errors