In [None]:
# ================================================
# Notebook: QC Check - XAI-CT Project
# ================================================

import json
from pathlib import Path
import yaml
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import numpy as np

# -----------------------------
# Load Config
# -----------------------------
# Read the preprocessing config. The encoding is pinned to UTF-8 so that
# decoding does not depend on the platform's locale default.
CONFIG_PATH = Path("config/preprocess.yaml")
with open(CONFIG_PATH, encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

# Derive the active dataset name and the paths used by this notebook.
DATASET_NAME = cfg["datasets"]["active"]
CURATED_DIR = Path(cfg["paths"]["curated_data"]) / DATASET_NAME
META_FILE = CURATED_DIR / cfg["metadata"]["output_file"]
FIG_DIR = Path(cfg["paths"]["figures"])
# Create the figure directory (and any missing parents) before plots are saved.
FIG_DIR.mkdir(parents=True, exist_ok=True)

# -----------------------------
# Load Metadata
# -----------------------------
# Parse the metadata JSON. The encoding is pinned to UTF-8 so that decoding
# does not depend on the platform's locale default.
with open(META_FILE, encoding="utf-8") as f:
    meta = json.load(f)

# meta is iterated as a sequence of per-image records; every record must
# provide a "label" key (a missing key raises KeyError here).
labels = [record["label"] for record in meta]
total_images = len(meta)
print(f"Total images: {total_images}")
print("Label counts:", Counter(labels))

# -----------------------------
# Plot Class Distribution
# -----------------------------
# Bar chart of the number of images per label, with labels in sorted order.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=labels, order=sorted(set(labels)), ax=ax)
ax.set_title(f"{DATASET_NAME} - Class Distribution")
ax.set_xlabel("Labels")
ax.set_ylabel("Count")
# Save to disk first, then display.
fig.savefig(FIG_DIR / "class_distribution.png", dpi=200)
plt.show()

# -----------------------------
# Basic Image Shape / Intensity QC
# -----------------------------
# Tally image dimensions for the records that carry a "dimensions" entry.
# shape_counts is only defined when at least one such record exists.
shapes = [tuple(rec["dimensions"]) for rec in meta if "dimensions" in rec]
if shapes:
    shape_counts = Counter(shapes)
    print("Most common image shapes:", shape_counts.most_common(5))

# Collect per-image intensity ranges; element 0 is read as the minimum and
# element 1 as the maximum. mins / maxs are only defined when data exists.
intensity_ranges = [rec["intensity_range"] for rec in meta if "intensity_range" in rec]
if intensity_ranges:
    mins = [bounds[0] for bounds in intensity_ranges]
    maxs = [bounds[1] for bounds in intensity_ranges]
    lowest = min(mins)
    highest = max(maxs)
    print(f"Pixel intensity ranges across dataset: min={lowest}, max={highest}")

    # Overlay the distributions of per-image maxima (red) and minima (blue).
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(maxs, color='r', label='max', kde=True, ax=ax)
    sns.histplot(mins, color='b', label='min', kde=True, ax=ax)
    ax.set_title("Pixel Intensity Distribution")
    ax.set_xlabel("Pixel value")
    ax.set_ylabel("Frequency")
    ax.legend()
    fig.savefig(FIG_DIR / "pixel_intensity.png", dpi=200)
    plt.show()

# -----------------------------
# Check for missing files (path existence only; file contents are not validated)
# -----------------------------
from pathlib import Path

# Collect every recorded path that does not exist on disk. This is an
# existence test only; the contents of the files are never opened.
missing_files = [
    file_path
    for file_path in (rec["file"] for rec in meta)
    if not Path(file_path).exists()
]
if not missing_files:
    print("✅ All files exist.")
else:
    print(f"⚠️ Missing / corrupted files: {len(missing_files)}")

# Save a simple QC report
# Sections with no underlying data are recorded as None. The guards matter:
# shape_counts and mins/maxs only exist when their source lists are non-empty.
if shapes:
    top_shapes = shape_counts.most_common(5)
else:
    top_shapes = None

if intensity_ranges:
    intensity_bounds = (min(mins), max(maxs))
else:
    intensity_bounds = None

# Summary of all checks performed above.
qc_report = {
    "total_images": total_images,
    "labels_count": dict(Counter(labels)),
    "most_common_shapes": top_shapes,
    "pixel_intensity_min_max": intensity_bounds,
    "missing_files": missing_files,
}

import json
# The report is written to <qc_reports>/<dataset>_qc.json.
qc_reports_root = Path(cfg["paths"]["qc_reports"])
QC_REPORT_FILE = qc_reports_root / f"{DATASET_NAME}_qc.json"
# Create the file's own parent directory, which may differ from
# qc_reports_root if the dataset name contains a path separator.
QC_REPORT_FILE.parent.mkdir(parents=True, exist_ok=True)
with QC_REPORT_FILE.open("w") as report_fh:
    json.dump(qc_report, report_fh, indent=2)

print(f"✅ QC report saved at: {QC_REPORT_FILE}")
