In [1]:
# ==========================================
# Cell 1) Detection datasets discovery & inspection (FINAL+++)
#   - How many datasets are found
#   - train/val image count per dataset
#   - Whether label cases (original/scale/side) exist
#   - Output estimated class count/names (multiclass-based)
#
# [UPDATED+++]
#   ✅ Reflect split structure rules per dataset name
#   ✅ For Cell 2 acceleration
#       - Store confirmed train/val labels dir
#       - Calculate train/val label file count, box (line) count
#       - Store n_train_groups_est (=box count)
#   ✅ NEW: Also load noise label cases under refine(10) / refine(sam) / refine(*)
#       - refine(*)/labels_uniform_scaling_{S}, refine(*)/labels_boundary_jitter_{K} (+ optional refine(*)/labels)
#       - Confirm train/val labels dir per case + (optional) file/box statistics
# ==========================================

from __future__ import annotations

import os, sys, random, re
from pathlib import Path
from typing import List, Tuple, Optional, Dict, Any

# -------------------------------------------------------------------------
# 0) Register PROJECT_MODULE_DIR
# -------------------------------------------------------------------------
PROJECT_MODULE_DIR = Path("/home/ISW/project/Project_Module")
if str(PROJECT_MODULE_DIR) not in sys.path:
    sys.path.insert(0, str(PROJECT_MODULE_DIR))

# -------------------------------------------------------------------------
# 1) ultra_det_loader
# -------------------------------------------------------------------------
from ultra_det_loader import discover_det_datasets

# -------------------------------------------------------------------------
# 2) noisy_insection (use only scale/boundary jitter case list)
# -------------------------------------------------------------------------
try:
    from noisy_insection import UNIFORM_SCALING_FACTORS, JITTER_PATTERNS
except Exception:
    UNIFORM_SCALING_FACTORS = [0.6, 0.7, 0.8, 0.9, 1.1, 1.2, 1.3, 1.4]
    JITTER_PATTERNS = [1, 3, 5, 7, 9]

# -------------------------------------------------------------------------
# User config
# -------------------------------------------------------------------------
# Directory handed to discover_det_datasets() to locate dataset roots.
LOAD_DIR = "/home/ISW/project/datasets"
# Seed used for set_seed() and for the SKU-110K virtual split helper.
SEED = 42

# Refine variant folder names looked up under each dataset root.
# Any other existing "refine(...)" directory is picked up automatically as well.
REFINE_VARIANT_NAMES = ["refine(10)", "refine(sam)"]

# Counting box lines means reading every label file, which is expensive.
# By default only the base "original" labels get a box count; every other
# case reports file counts only (its box count is stored as 0).
COUNT_BOX_LINES_FOR_ALL_CASES = False   # Default: box count only for base(original), only file count for rest
COUNT_BOX_LINES_FOR_BASE_ORIGINAL = True

# File suffixes (lower-case, with the dot) treated as images by list_images().
_IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp"}

def set_seed(seed: int = 42):
    """Seed Python's global ``random`` generator so later draws are repeatable."""
    random.seed(a=seed)

def list_images(dir_path: Optional[Path]) -> List[Path]:
    """Return all image files found recursively under ``dir_path``, sorted.

    A file counts as an image when its lower-cased suffix is in ``_IMG_EXTS``.
    ``None`` or a path that does not exist yields an empty list.
    """
    if dir_path is None:
        return []
    root = Path(dir_path)
    if not root.exists():
        return []
    return sorted(
        entry
        for entry in root.rglob("*")
        if entry.is_file() and entry.suffix.lower() in _IMG_EXTS
    )

def normalize_name(name: str) -> str:
    """Canonicalise a dataset name: trim, lower-case, then map ``_`` and spaces to ``-``."""
    # One translate pass replaces both separator characters with a hyphen.
    return name.strip().lower().translate(str.maketrans("_ ", "--"))

# -------------------------------------------------------------------------
# Legacy heuristic (fallback)
# -------------------------------------------------------------------------
def _fallback_train_dir(images_root: Path) -> Path:
    """Return ``images_root/train`` when that directory exists, else ``images_root`` itself."""
    train_candidate = images_root / "train"
    return train_candidate if train_candidate.is_dir() else images_root

def _fallback_val_dir(images_root: Path) -> Optional[Path]:
    """Return the first existing directory among ``val`` and ``valid``, or ``None``."""
    # "val" is tried before "valid", so it wins when both exist.
    for split_name in ("val", "valid"):
        candidate = images_root / split_name
        if candidate.is_dir():
            return candidate
    return None

# -------------------------------------------------------------------------
# ✅ Dataset-specific split rules
# -------------------------------------------------------------------------
# Datasets whose images live under images/train and images/val (no test split).
# Entries are compared against normalize_name(ds_root.name), so they must be
# written lower-case with hyphens.
_SIMPLE_TRAIN_VAL = {
    "bccd",
    "brain-tumor",
    "custom-blood",
    "homeobjects-3k",
    "kitti",
    "medical-pills",
    "signature",
}

# Datasets that additionally carry an images/test directory.
# Same normalised-name convention as above.
_TRAIN_TEST_VAL = {
    "construction-ppe",
    "african-wildlife",
}

def detect_split_dirs(ds_root: Path) -> Dict[str, Optional[Path]]:
    """
    Map a dataset root to its image split directories using name-based rules.

    The dataset name is ``normalize_name(ds_root.name)``; rules are tried in
    this order: VOC, COCO/LVIS, plain train/val, train/val/test, SKU-110K
    (flat folder, virtual split), then a directory-probing fallback.

    Returns a dict with the keys:
        "train_img_dir": Path
        "val_img_dir":   Path | None
        "test_img_dir":  Path | None
        "split_mode":    "explicit" | "sku_virtual_8_2" | "fallback"
        "train_tag":     str  (sub-folder name used for the train split)
        "val_tag":       str  (sub-folder name used for the val split)

    For the explicit rules the returned directories are not checked for existence.
    """
    images_root = ds_root / "images"
    ds_name = normalize_name(ds_root.name)

    def _explicit(train_tag: str, val_tag: str, test_tag: Optional[str] = None):
        # Shared builder for every rule whose splits are named sub-folders of images/.
        return {
            "train_img_dir": images_root / train_tag,
            "val_img_dir": images_root / val_tag,
            "test_img_dir": (images_root / test_tag) if test_tag else None,
            "split_mode": "explicit",
            "train_tag": train_tag,
            "val_tag": val_tag,
        }

    # 1) VOC: only train2012 / val2012 are used.
    if ds_name == "voc":
        return _explicit("train2012", "val2012")

    # 2) COCO / LVIS: any name containing either token uses the 2017 folders.
    if "coco" in ds_name or "lvis" in ds_name:
        return _explicit("train2017", "val2017", "test2017")

    # 3) Plain train/val layout.
    if ds_name in _SIMPLE_TRAIN_VAL:
        return _explicit("train", "val")

    # 4) train/val layout that also has a test folder.
    if ds_name in _TRAIN_TEST_VAL:
        return _explicit("train", "val", "test")

    # 5) SKU-110K: images sit directly in images/, so both splits point at
    #    the same folder and the split itself is virtual.
    if "sku" in ds_name and "110k" in ds_name:
        return {
            "train_img_dir": images_root,
            "val_img_dir": images_root,
            "test_img_dir": None,
            "split_mode": "sku_virtual_8_2",
            "train_tag": "virtual_8_2",
            "val_tag": "virtual_8_2",
        }

    # 6) Unknown dataset: probe the directory tree.
    tr = _fallback_train_dir(images_root)
    va = _fallback_val_dir(images_root)
    return {
        "train_img_dir": tr,
        "val_img_dir": va,
        "test_img_dir": None,
        "split_mode": "fallback",
        "train_tag": tr.name if tr else "unknown",
        "val_tag": va.name if va else "missing",
    }

# -------------------------------------------------------------------------
# Class name estimation
# -------------------------------------------------------------------------
def infer_class_names_from_labels(label_root: Path, max_files: int = 2000) -> List[str]:
    """Infer placeholder class names from YOLO-style label files.

    Scans up to ``max_files`` ``*.txt`` files found recursively under
    ``label_root``, reads the first whitespace-separated token of every
    non-blank line as a class id, and returns ``["class_0", ..., "class_<max>"]``
    where ``<max>`` is the largest id seen.

    Args:
        label_root: Directory containing label files (``None`` is tolerated).
        max_files: Upper bound on the number of label files inspected.

    Returns:
        The generated class-name list, or ``["class_0"]`` when the directory is
        missing, holds no label files, or no valid class id could be parsed.
    """
    if label_root is None:
        return ["class_0"]
    label_root = Path(label_root)
    if not label_root.exists():
        return ["class_0"]

    # Sort before truncating: directory traversal order is arbitrary, so an
    # unsorted prefix would make the inferred class list vary between runs.
    label_files = sorted(label_root.rglob("*.txt"))[:max_files]

    class_ids = set()
    for label_file in label_files:
        try:
            with open(label_file, "r", encoding="utf-8") as fh:
                for line in fh:
                    fields = line.split()
                    if not fields:
                        continue
                    # Parse per line so a single malformed row does not hide
                    # the valid rows that follow it in the same file.
                    try:
                        class_id = int(float(fields[0]))
                    except (ValueError, OverflowError):
                        continue
                    # Negative ids cannot index a class list; ignore them.
                    if class_id >= 0:
                        class_ids.add(class_id)
        except (OSError, UnicodeDecodeError):
            # Unreadable / undecodable file: best-effort, move on to the next.
            continue

    if not class_ids:
        return ["class_0"]

    return [f"class_{i}" for i in range(max(class_ids) + 1)]

# -------------------------------------------------------------------------
# ✅ Determine label split dir (applicable to any labels_root)
# -------------------------------------------------------------------------
def resolve_split_label_dirs_from_root(labels_root: Path, train_tag: str, val_tag: str) -> Tuple[Path, Path]:
    """
    Pick the train and val label directories below ``labels_root``.

    For each split, ``labels_root/<tag>`` is used when it is a directory;
    otherwise ``labels_root`` itself is returned for that split (flat layout).
    e.g.) labels_root/train, labels_root/val
          labels_root/train2017, labels_root/val2017

    A ``None`` root yields ``(None, None)``.
    """
    if labels_root is None:
        return None, None
    base = Path(labels_root)

    def _pick(tag: str) -> Path:
        split_dir = base / tag
        if split_dir.is_dir():
            return split_dir
        return base

    return _pick(train_tag), _pick(val_tag)

# -------------------------------------------------------------------------
# ✅ Label statistics (quick group count estimation)
# -------------------------------------------------------------------------
def count_label_files_and_boxes(label_dir: Optional[Path], count_boxes: bool = True) -> Tuple[int, int]:
    """Count label files and, optionally, non-blank label lines under a directory.

    Args:
        label_dir: Directory searched recursively for ``*.txt`` entries.
            ``None`` or a missing path yields ``(0, 0)``.
        count_boxes: When ``False`` the files are not opened and the box count
            is reported as 0.

    Returns:
        ``(n_files, n_boxes)`` where ``n_boxes`` is the number of lines that
        are not blank after stripping whitespace (one line is taken as one box).
    """
    if label_dir is None or not Path(label_dir).exists():
        return 0, 0
    label_dir = Path(label_dir)

    n_files = 0
    n_boxes = 0
    # Single streaming pass: neither ordering nor a materialised list is
    # needed just to count.
    for label_file in label_dir.rglob("*.txt"):
        n_files += 1
        if not count_boxes:
            continue
        try:
            # Count on raw bytes: blank-line detection does not need decoding,
            # so a stray non-UTF-8 byte can no longer drop a file's lines.
            payload = label_file.read_bytes()
        except OSError:
            # Unreadable entry (e.g. permissions, or a directory named *.txt):
            # it still counts as a match but contributes no boxes.
            continue
        n_boxes += sum(1 for row in payload.splitlines() if row.strip())
    return n_files, n_boxes

# -------------------------------------------------------------------------
# SKU-110K virtual split count
# -------------------------------------------------------------------------
def compute_sku_virtual_counts(images_root: Path, seed: int = 42, ratio: float = 0.8) -> Tuple[int, int]:
    """Return ``(n_train, n_val)`` for a virtual split of a flat image folder.

    Args:
        images_root: Folder whose images (via ``list_images``) are counted.
        seed: Accepted for interface compatibility only. The counts depend
            solely on the number of images and ``ratio``, so no shuffling is
            performed here.
        ratio: Fraction of images assigned to train; ``n_train`` is
            ``int(n * ratio)`` and the remainder goes to val.
    """
    n_images = len(list_images(images_root))
    if n_images == 0:
        return 0, 0
    n_train = int(n_images * ratio)
    return n_train, n_images - n_train

# -------------------------------------------------------------------------
# ✅ Detect label cases under base ds_root or refine_variant_root
#   - Assumes structure with labels / labels_uniform_scaling_* / labels_boundary_jitter_* directly under root_dir
# -------------------------------------------------------------------------
def list_label_cases_under_root(root_dir: Path) -> List[Tuple[str, Path]]:
    """
    List the label cases that exist as directories directly under ``root_dir``.

    Returns: [(case_name, labels_root_path), ...]
      case_name e.g.: "original", "scale_0.6", "side_7"

    Order is: "original" (``labels``), then one ``scale_<s>`` per entry of
    UNIFORM_SCALING_FACTORS (``labels_uniform_scaling_<s>``), then one
    ``side_<k>`` per entry of JITTER_PATTERNS (``labels_boundary_jitter_<k>``).
    Candidates whose directory is absent are left out.
    """
    base = Path(root_dir)

    candidates: List[Tuple[str, Path]] = [("original", base / "labels")]
    candidates.extend(
        (f"scale_{s}", base / f"labels_uniform_scaling_{s}")
        for s in UNIFORM_SCALING_FACTORS
    )
    candidates.extend(
        (f"side_{k}", base / f"labels_boundary_jitter_{k}")
        for k in JITTER_PATTERNS
    )

    return [(case_name, case_dir) for case_name, case_dir in candidates if case_dir.is_dir()]

# -------------------------------------------------------------------------
# ✅ Detect refine variant dirs: refine(10), refine(sam), + auto-include existing refine(*)
# -------------------------------------------------------------------------
def discover_refine_variant_dirs(ds_root: Path) -> List[Path]:
    """Return the refine variant directories under ``ds_root``, sorted by name.

    Includes every existing directory named in REFINE_VARIANT_NAMES plus any
    other child directory whose name has the form ``refine(...)``; a directory
    found both ways is listed once.
    """
    base = Path(ds_root)

    # Explicitly configured variants that are actually present.
    variants: List[Path] = [base / nm for nm in REFINE_VARIANT_NAMES if (base / nm).is_dir()]

    # Any further refine(*) folders, skipping ones already collected.
    for child in base.iterdir():
        if not child.is_dir():
            continue
        if child.name.startswith("refine(") and child.name.endswith(")") and child not in variants:
            variants.append(child)

    variants.sort(key=lambda d: d.name)
    return variants

# -------------------------------------------------------------------------
# Discover dataset roots
# -------------------------------------------------------------------------
# Seed the global RNG before anything else in this cell runs.
set_seed(SEED)

specs = discover_det_datasets(LOAD_DIR)
# Several specs may share one root; dict keys drop repeats while keeping
# first-seen order.
roots: List[Path] = list(dict.fromkeys(Path(spec.root) for spec in specs))

print("=" * 80)
print(f"[DISCOVERY] Found {len(roots)} unique dataset roots under: {Path(LOAD_DIR).resolve()}")
print("=" * 80)

# -------------------------------------------------------------------------
# Per-dataset summary
# -------------------------------------------------------------------------
# One summary dict per dataset that has both images/ and labels/; the closing
# message below tells the user the results are ready for Cell 2.
dataset_summaries: List[Dict[str, Any]] = []

for ds_root in roots:
    ds_root = Path(ds_root)
    images_root = ds_root / "images"
    labels_root = ds_root / "labels"

    # A dataset without both images/ and labels/ cannot be summarised.
    if not images_root.is_dir() or not labels_root.is_dir():
        print(f"⏭️  Skip (missing images/labels): {ds_root}")
        continue

    split_info = detect_split_dirs(ds_root)
    train_dir = split_info["train_img_dir"]
    val_dir   = split_info["val_img_dir"]
    split_mode = split_info["split_mode"]
    train_tag  = split_info.get("train_tag", "train")
    val_tag    = split_info.get("val_tag", "val")

    # --- Image counts ---
    # SKU-110K has a flat images/ folder, so its counts come from the virtual
    # 80/20 split instead of from per-split directories.
    if split_mode == "sku_virtual_8_2":
        n_train, n_val = compute_sku_virtual_counts(images_root, seed=SEED, ratio=0.8)
    else:
        n_train = len(list_images(train_dir))
        n_val   = len(list_images(val_dir)) if val_dir else 0

    # ---------------------------------------------------------------------
    # (A) Base label cases: labels / labels_uniform_scaling_* /
    #     labels_boundary_jitter_* directly under ds_root
    # ---------------------------------------------------------------------
    base_cases = list_label_cases_under_root(ds_root)

    # Paths/stats of the base "original" labels are also kept under the
    # top-level keys of the summary (backward compatibility with Cell 2).
    train_labels_dir_base, val_labels_dir_base = resolve_split_label_dirs_from_root(ds_root / "labels", train_tag, val_tag)

    # Base "original" statistics.
    # NOTE(review): when the split tag is not a sub-folder of labels/ (e.g. the
    # SKU virtual split), train and val resolve to the same directory, so the
    # train and val numbers below are identical totals, not per-split counts.
    n_train_label_files_base, n_train_boxes_base = count_label_files_and_boxes(
        train_labels_dir_base,
        count_boxes=COUNT_BOX_LINES_FOR_BASE_ORIGINAL
    )
    n_val_label_files_base, n_val_boxes_base = count_label_files_and_boxes(
        val_labels_dir_base,
        count_boxes=COUNT_BOX_LINES_FOR_BASE_ORIGINAL
    )
    n_train_groups_est_base = n_train_boxes_base  # group count estimate (= number of train box lines)

    # ---------------------------------------------------------------------
    # (B) Refine variant label cases: labels_uniform_scaling_* /
    #     labels_boundary_jitter_* (and labels) under ds_root/refine(*)
    # ---------------------------------------------------------------------
    refine_dirs = discover_refine_variant_dirs(ds_root)

    # case_details: labels_root + split dirs + stats for all cases (base + refine variants)
    case_details: Dict[str, Dict[str, Any]] = {}

    def _register_case(case_id: str, labels_root_case: Path, count_boxes: bool):
        """Resolve split label dirs for one case, count its files/boxes, and store the record in ``case_details``."""
        # Uses this iteration's train_tag / val_tag; only called within the same iteration.
        tr_ld, va_ld = resolve_split_label_dirs_from_root(labels_root_case, train_tag, val_tag)
        n_tr_files, n_tr_boxes = count_label_files_and_boxes(tr_ld, count_boxes=count_boxes)
        n_va_files, n_va_boxes = count_label_files_and_boxes(va_ld, count_boxes=count_boxes)
        case_details[case_id] = dict(
            case_id=case_id,
            labels_root=str(labels_root_case),
            train_labels_dir=str(tr_ld) if tr_ld else None,
            val_labels_dir=str(va_ld) if va_ld else None,
            n_train_label_files=n_tr_files,
            n_val_label_files=n_va_files,
            n_train_boxes=n_tr_boxes,
            n_val_boxes=n_va_boxes,
        )

    # 1) Register base cases
    # NOTE(review): registering "original" here scans the same label
    # directories a second time (they were already counted for the *_base
    # values above).
    for case_name, case_labels_root in base_cases:
        # Box count only for base original (default); optional for the rest.
        if case_name == "original":
            _register_case(case_name, case_labels_root, count_boxes=COUNT_BOX_LINES_FOR_BASE_ORIGINAL)
        else:
            _register_case(case_name, case_labels_root, count_boxes=COUNT_BOX_LINES_FOR_ALL_CASES)

    # 2) Register refine variants; case ids are "<variant>::<case_name>".
    refine_case_index: Dict[str, List[str]] = {}  # variant_name -> [case_ids...]
    for rdir in refine_dirs:
        variant = rdir.name  # e.g., refine(sam), refine(10)
        r_cases = list_label_cases_under_root(rdir)
        ids: List[str] = []
        for case_name, case_labels_root in r_cases:
            case_id = f"{variant}::{case_name}"
            ids.append(case_id)
            _register_case(case_id, case_labels_root, count_boxes=COUNT_BOX_LINES_FOR_ALL_CASES)
        # Variants without any label case are left out of the index.
        if ids:
            refine_case_index[variant] = ids

    # Class names are inferred from the base labels/ directory only.
    class_names = infer_class_names_from_labels(labels_root)
    nc = len(class_names)

    info: Dict[str, Any] = {
        "dataset": ds_root.name,
        "root": str(ds_root),
        "images_root": str(images_root),
        "labels_root": str(labels_root),

        "train_dir": str(train_dir) if train_dir else None,
        "val_dir": str(val_dir) if val_dir else None,

        # Based on the base "original" labels; these keys are kept for Cell 2.
        "train_labels_dir": str(train_labels_dir_base) if train_labels_dir_base else None,
        "val_labels_dir": str(val_labels_dir_base) if val_labels_dir_base else None,

        "n_train": n_train,
        "n_val": n_val,

        # Base "original" label stats (existing keys kept).
        "n_train_label_files": n_train_label_files_base,
        "n_val_label_files": n_val_label_files_base,
        "n_train_boxes": n_train_boxes_base,
        "n_val_boxes": n_val_boxes_base,
        "n_train_groups_est": n_train_groups_est_base,

        "split_mode": split_mode,
        "train_tag": train_tag,
        "val_tag": val_tag,

        # Base case names
        "label_cases": [c[0] for c in base_cases],

        # Case index per refine(*) variant
        "refine_variants": [p.name for p in refine_dirs],
        "refine_case_index": refine_case_index,   # variant -> ["refine(sam)::scale_0.6", ...]

        # All case details (base + refine)
        "label_case_details": case_details,       # case_id -> dirs/stats

        "nc_inferred": nc,
        "class_names_inferred": class_names,
    }
    dataset_summaries.append(info)

    # ---------------------------------------------------------------------
    # Print summary
    # ---------------------------------------------------------------------
    print("\n" + "-" * 80)
    print(f"[Dataset] {ds_root.name}")
    print(f" - root        : {ds_root}")
    print(f" - split_mode  : {split_mode}")
    print(f" - train_dir   : {train_dir if train_dir else '(missing)'} | tag={train_tag} | n_train={n_train}")
    print(f" - val_dir     : {val_dir if val_dir else '(missing)'} | tag={val_tag} | n_val={n_val}")

    # The test split is only reported, never stored in the summary dict.
    test_dir = split_info.get("test_img_dir", None)
    if test_dir and test_dir.is_dir():
        n_test = len(list_images(test_dir))
        print(f" - test_dir    : {test_dir} | n_test={n_test}")

    print(f" - [BASE original] train_labels_dir : {train_labels_dir_base}")
    print(f" - [BASE original] val_labels_dir   : {val_labels_dir_base}")
    print(f" - [BASE original] train label files/boxes/groups_est : {n_train_label_files_base} / {n_train_boxes_base} / {n_train_groups_est_base}")
    print(f" - [BASE original] val   label files/boxes           : {n_val_label_files_base} / {n_val_boxes_base}")

    print(f" - base label_cases : {[c[0] for c in base_cases] if base_cases else '(none)'}")

    # Refine variants summary
    if refine_dirs:
        print(f" - refine variants found: {[p.name for p in refine_dirs]}")
        for v in [p.name for p in refine_dirs]:
            ids = refine_case_index.get(v, [])
            if not ids:
                continue
            # Full case ids get long; print only the part after "<variant>::".
            short_names = [cid.split("::", 1)[-1] for cid in ids]
            print(f"   * {v} cases: {short_names}")
    else:
        print(" - refine variants found: (none)")

    # Simple sanity check: train/val label file count per case_id
    # (box counts are optional and therefore not printed here).
    print(" - case sanity (label files):")
    for cid, det in case_details.items():
        print(f"   * {cid:>20s} | train_files={det['n_train_label_files']}, val_files={det['n_val_label_files']}")

    print(f" - inferred classes (multiclass-based): nc={nc}, names={class_names}")
    print("-" * 80)

print("\n✅ Cell 1 done.")
print(f"   -> dataset_summaries length = {len(dataset_summaries)}")
print("   -> roots variable is ready for Cell 2.")


[DISCOVERY] Found 13 unique dataset roots under: /home/ISW/project/datasets

--------------------------------------------------------------------------------
[Dataset] SKU-110K
 - root        : /home/ISW/project/datasets/SKU-110K
 - split_mode  : sku_virtual_8_2
 - train_dir   : /home/ISW/project/datasets/SKU-110K/images | tag=virtual_8_2 | n_train=9394
 - val_dir     : /home/ISW/project/datasets/SKU-110K/images | tag=virtual_8_2 | n_val=2349
 - [BASE original] train_labels_dir : /home/ISW/project/datasets/SKU-110K/labels
 - [BASE original] val_labels_dir   : /home/ISW/project/datasets/SKU-110K/labels
 - [BASE original] train label files/boxes/groups_est : 11743 / 1730996 / 1730996
 - [BASE original] val   label files/boxes           : 11743 / 1730996
 - base label_cases : ['original', 'scale_0.6', 'scale_0.7', 'scale_0.8', 'scale_0.9', 'scale_1.1', 'scale_1.2', 'scale_1.3', 'scale_1.4', 'side_1', 'side_3', 'side_5', 'side_7', 'side_9']
 - refine variants found: (none)
 - case sanity (

In [3]:
# ==========================================
# Visualization Runner (FINAL)
#  - Save bbox overlay visualizations for:
#      (1) original
#      (2) each noise case
#      (3) refine(50) all cases
#      (4) refine(sam) all cases
#  - For each dataset: sample_number=100 images ONCE (deterministic)
#    then reuse SAME images for all cases
#  - Output root: OUT_ROOT (currently ./(visualize)_kitti, see USER CONFIG)
# ==========================================

from __future__ import annotations

import os, sys, random, hashlib, heapq
from pathlib import Path
from typing import List, Tuple, Optional, Dict, Iterable

from PIL import Image, ImageDraw, ImageFont

# -----------------------------
# USER CONFIG
# -----------------------------
# Directory handed to discover_det_datasets() to locate dataset roots.
LOAD_DIR = Path("/home/ISW/project/datasets")
# Seed for random.seed(); NOTE(review): presumably also the `seed` passed to
# deterministic_sample_images — confirm in the main loop.
SEED = 42
# Images sampled per dataset according to the cell header; NOTE(review): its
# use as the sampler's `k` is outside this view — confirm.
SAMPLE_NUMBER = 100

TARGET_SPLITS = ["val"]  # ["train","val"] possible
# NOTE(review): the handling of None ("all variants") is outside this view — confirm.
TARGET_REFINE_VARIANTS = ["refine(50)", "refine(sam)"]  # None means all

# Datasets to visualize (None or empty selects every discovered dataset).
# Names are matched after normalize_name(), so case and "_" vs "-" do not matter.
TARGET_DATASETS: Optional[List[str]] = [
    "kitti",
    # "homeobjects-3K",
    # "african-wildlife",
    # "construction-ppe",
    # "Custom_Blood",
    # "brain-tumor",
    # "BCCD",
    # "signature",
    # "medical-pills",
    # "VOC",
]

# Output root, resolved against the current working directory and created
# immediately when this cell runs.
OUT_ROOT = Path("./(visualize)_kitti").resolve()
OUT_ROOT.mkdir(parents=True, exist_ok=True)

# UI/Style
BBOX_THICKNESS = 4   # outline width passed to ImageDraw.rectangle for each box
TITLE_BAR_H = 18     # height in pixels of the black title bar drawn at the top

# File suffixes (lower-case, with the dot) treated as images by _iter_images().
_IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp"}

# RGB color palette; a box uses entry (class_id % len(_PALETTE)).
_PALETTE = [
    (255, 0, 0), (0, 255, 0), (0, 128, 255), (255, 128, 0), (255, 0, 255),
    (0, 255, 255), (255, 255, 0), (128, 0, 255), (0, 0, 255), (0, 0, 0),
]

# -------------------------------------------------------------------------
# 0) Register PROJECT_MODULE_DIR + ultra_det_loader
# -------------------------------------------------------------------------
PROJECT_MODULE_DIR = Path("/home/ISW/project/Project_Module")
if str(PROJECT_MODULE_DIR) not in sys.path:
    sys.path.insert(0, str(PROJECT_MODULE_DIR))

from ultra_det_loader import discover_det_datasets

# -------------------------------------------------------------------------
# 1) noisy_insection (use only scale/boundary jitter case list)
# -------------------------------------------------------------------------
try:
    from noisy_insection import UNIFORM_SCALING_FACTORS, JITTER_PATTERNS
except Exception:
    UNIFORM_SCALING_FACTORS = [0.6, 0.7, 0.8, 0.9, 1.1, 1.2, 1.3, 1.4]
    JITTER_PATTERNS = [1, 3, 5, 7, 9]


# -------------------------------------------------------------------------
# Helpers
# -------------------------------------------------------------------------
def normalize_name(name: str) -> str:
    """Canonicalise a dataset name: trim, lower-case, then turn ``_`` and spaces into ``-``."""
    cleaned = name.strip().lower()
    for separator in ("_", " "):
        cleaned = cleaned.replace(separator, "-")
    return cleaned

def _fallback_train_dir(images_root: Path) -> Path:
    """Return ``images_root/train`` when that directory exists, else ``images_root`` itself."""
    train_candidate = images_root / "train"
    if train_candidate.is_dir():
        return train_candidate
    return images_root

def _fallback_val_dir(images_root: Path) -> Optional[Path]:
    """Return the first existing directory among ``val`` and ``valid``, or ``None``."""
    # "val" is tried before "valid", so it wins when both exist.
    candidates = (images_root / split_name for split_name in ("val", "valid"))
    return next((cand for cand in candidates if cand.is_dir()), None)

# Datasets whose images live under images/train and images/val (no test split).
# Entries are matched against normalize_name(ds_root.name): lower-case, hyphenated.
_SIMPLE_TRAIN_VAL = {
    "bccd","brain-tumor","custom-blood","homeobjects-3k","medical-pills","signature",
    "kitti",
}
# Datasets that additionally carry an images/test directory.
_TRAIN_TEST_VAL = {
    "construction-ppe","african-wildlife"
}

def detect_split_dirs(ds_root: Path) -> Dict[str, Optional[Path]]:
    """Map a dataset root to its image split directories using name-based rules.

    The dataset name is ``normalize_name(ds_root.name)``. Rules are tried in
    order (VOC, COCO/LVIS, plain train/val, train/val/test, SKU-110K), and an
    unknown name falls back to probing ``images/`` for split folders.

    Returns a dict with keys ``train_img_dir``, ``val_img_dir``,
    ``test_img_dir``, ``split_mode`` ("explicit" | "sku_virtual_8_2" |
    "fallback"), ``train_tag`` and ``val_tag``. Directories returned by the
    explicit rules are not checked for existence.
    """
    images_root = ds_root / "images"
    ds_name = normalize_name(ds_root.name)

    # (matches?, train folder, val folder, test folder or None), first hit wins.
    explicit_rules = (
        (ds_name == "voc", "train2012", "val2012", None),
        ("coco" in ds_name or "lvis" in ds_name, "train2017", "val2017", "test2017"),
        (ds_name in _SIMPLE_TRAIN_VAL, "train", "val", None),
        (ds_name in _TRAIN_TEST_VAL, "train", "val", "test"),
    )
    for matched, train_tag, val_tag, test_tag in explicit_rules:
        if matched:
            return {
                "train_img_dir": images_root / train_tag,
                "val_img_dir": images_root / val_tag,
                "test_img_dir": (images_root / test_tag) if test_tag else None,
                "split_mode": "explicit",
                "train_tag": train_tag,
                "val_tag": val_tag,
            }

    if "sku" in ds_name and "110k" in ds_name:
        # Virtual split: the actual folder is flat, so both splits share it.
        return {
            "train_img_dir": images_root,
            "val_img_dir": images_root,
            "test_img_dir": None,
            "split_mode": "sku_virtual_8_2",
            "train_tag": "virtual_8_2",
            "val_tag": "virtual_8_2",
        }

    tr = _fallback_train_dir(images_root)
    va = _fallback_val_dir(images_root)
    return {
        "train_img_dir": tr,
        "val_img_dir": va,
        "test_img_dir": None,
        "split_mode": "fallback",
        "train_tag": tr.name if tr else "unknown",
        "val_tag": va.name if va else "missing",
    }

def resolve_split_label_dir(label_root: Path, split_tag: str) -> Path:
    """
    Return ``label_root/split_tag`` when that sub-folder exists; otherwise
    return ``label_root`` itself (flat layout).
    """
    split_dir = label_root / split_tag
    if split_dir.is_dir():
        return split_dir
    return label_root

def _iter_images(img_dir: Path) -> Iterable[Path]:
    """
    Lazily yield every image file under ``img_dir`` (recursive ``os.walk``).

    Paths are produced one at a time instead of being collected into a list.
    A file qualifies when its lower-cased suffix is in ``_IMG_EXTS``; a
    missing ``img_dir`` yields nothing.
    """
    base = Path(img_dir)
    if base.exists():
        for folder, _subdirs, filenames in os.walk(base):
            folder_path = Path(folder)
            for filename in filenames:
                candidate = folder_path / filename
                if candidate.suffix.lower() in _IMG_EXTS:
                    yield candidate

def _stable_int_hash(s: str) -> int:
    """Return the MD5 digest of ``s`` (UTF-8 encoded) as a non-negative integer."""
    # Big-endian interpretation of the raw digest equals int(hexdigest, 16).
    digest = hashlib.md5(s.encode("utf-8")).digest()
    return int.from_bytes(digest, "big")

def deterministic_sample_images(
    img_dir: Path,
    k: int,
    seed: int,
    require_label_dir: Optional[Path] = None,
) -> List[Path]:
    """
    Deterministically pick up to ``k`` images from ``img_dir``.

    Every image found by ``_iter_images`` is scored with
    ``_stable_int_hash(f"{seed}|{relative_posix_path}")`` and the ``k`` lowest
    scores are kept, so the choice depends only on ``seed`` and the relative
    paths, not on directory traversal order.

    Args:
        img_dir: Root directory that is scanned for images.
        k: Maximum number of images to return; ``k <= 0`` returns ``[]``.
        seed: Value mixed into the hash to vary the selection.
        require_label_dir: When given, an image is a candidate only if
            ``require_label_dir/<relative path>`` with a ``.txt`` suffix exists.

    Returns:
        The selected image paths ordered by ascending score.
    """
    img_dir = Path(img_dir)
    require_label_dir = Path(require_label_dir) if require_label_dir else None

    # Guard: nothing to select (the bounded-heap form would fail on k == 0).
    if k <= 0:
        return []

    def _scored_candidates():
        # Stream (score, path) pairs so the full image list is never held in memory.
        for img_path in _iter_images(img_dir):
            rel = img_path.relative_to(img_dir).as_posix()
            if require_label_dir is not None:
                label_file = (require_label_dir / Path(rel)).with_suffix(".txt")
                if not label_file.exists():
                    continue
            yield _stable_int_hash(f"{seed}|{rel}"), img_path

    # nsmallest keeps at most k items while consuming the stream and returns
    # them already sorted by ascending score.
    return [img_path for _, img_path in heapq.nsmallest(k, _scored_candidates())]

def parse_yolo_labels(label_path: Path, img_w: int, img_h: int) -> List[Tuple[int, Tuple[int,int,int,int]]]:
    """
    Read a YOLO-format label file and convert each row to pixel corners.

    Each usable row is ``cls cx cy w h`` with the box values scaled by the
    image size. Corners are rounded, clamped to ``[0, size - 1]`` and the box
    is kept only if it still has positive width and height.

    Returns: [(cls_id, (x1,y1,x2,y2)), ...]; empty when the file is missing
    or unreadable. Rows with fewer than five fields or unparsable numbers are
    skipped.
    """
    boxes = []
    if label_path is None or not label_path.exists():
        return boxes

    try:
        rows = label_path.read_text(encoding="utf-8").strip().splitlines()
    except Exception:
        return boxes

    def _to_pixel(value: float, size: int) -> int:
        # Round to the nearest pixel, then clamp into the image.
        return max(0, min(size - 1, int(round(value))))

    for row in rows:
        fields = row.strip().split()
        if len(fields) < 5:
            continue
        try:
            class_id = int(float(fields[0]))
            cx, cy, box_w, box_h = (float(tok) for tok in fields[1:5])
            half_w = box_w / 2.0
            half_h = box_h / 2.0
            left = _to_pixel((cx - half_w) * img_w, img_w)
            top = _to_pixel((cy - half_h) * img_h, img_h)
            right = _to_pixel((cx + half_w) * img_w, img_w)
            bottom = _to_pixel((cy + half_h) * img_h, img_h)
        except Exception:
            # Best-effort: any row that cannot be converted is ignored.
            continue
        if right > left and bottom > top:
            boxes.append((class_id, (left, top, right, bottom)))
    return boxes

def draw_overlay(img_path: Path, label_path: Optional[Path], title: str = "") -> Image.Image:
    """Return an RGB copy of the image with its label boxes drawn on top.

    Args:
        img_path: Image file to load.
        label_path: YOLO label file for the image; may be ``None`` or missing,
            in which case a red "LABEL MISSING" note is drawn instead of boxes.
        title: Optional caption (first 120 characters) rendered on a black bar
            of height ``TITLE_BAR_H`` at the top of the image.

    Returns:
        A new in-memory ``PIL.Image.Image``; nothing is written to disk here.
    """
    # Open inside a context manager so the file handle is released right
    # away; convert() returns an independent in-memory copy to draw on.
    with Image.open(img_path) as src:
        img = src.convert("RGB")
    W, H = img.size
    draw = ImageDraw.Draw(img)
    font = ImageFont.load_default()

    boxes = parse_yolo_labels(label_path, W, H)
    for cid, (x1, y1, x2, y2) in boxes:
        # One palette color per class id, wrapping around the palette.
        color = _PALETTE[cid % len(_PALETTE)]
        # bbox only (no class id/name text output)
        draw.rectangle([x1, y1, x2, y2], outline=color, width=BBOX_THICKNESS)

    if title:
        draw.rectangle([0, 0, W, TITLE_BAR_H], fill=(0, 0, 0))
        draw.text((3, 2), title[:120], fill=(255, 255, 255), font=font)

    if label_path is None or not label_path.exists():
        draw.text((3, TITLE_BAR_H + 2), "LABEL MISSING", fill=(255, 0, 0), font=font)

    return img

def case_to_dir_candidates(case_name: str) -> List[str]:
    """Return the directory names to try, in priority order, for a label case.

    ``scale_<s>`` maps to ``labels_uniform_scaling_<s>``, ``side_<k>`` to
    ``labels_boundary_jitter_<k>`` and ``original`` to ``labels``; in each of
    those cases the case name itself is the second candidate. Any other name
    is its own only candidate.
    """
    prefixed_layouts = (
        ("scale_", "labels_uniform_scaling_"),
        ("side_", "labels_boundary_jitter_"),
    )
    for case_prefix, dir_prefix in prefixed_layouts:
        if case_name.startswith(case_prefix):
            suffix = case_name[len(case_prefix):]
            return [dir_prefix + suffix, case_name]

    if case_name == "original":
        return ["labels", "original"]
    return [case_name]

def find_case_root(base_root: Path, case_name: str) -> Optional[Path]:
    """Return the first existing candidate directory for ``case_name`` under ``base_root``, else ``None``."""
    candidates = (base_root / dir_name for dir_name in case_to_dir_candidates(case_name))
    return next((cand for cand in candidates if cand.is_dir()), None)

def list_noise_cases(ds_root: Path) -> List[str]:
    """List the noise cases that have a label directory under ``ds_root``.

    Case names are built from ``UNIFORM_SCALING_FACTORS`` (``scale_<S>``)
    followed by ``JITTER_PATTERNS`` (``side_<K>``); that order is preserved
    in the result, and cases without a directory are dropped.
    """
    scale_cases = [f"scale_{s}" for s in UNIFORM_SCALING_FACTORS]
    jitter_cases = [f"side_{k}" for k in JITTER_PATTERNS]
    return [
        case
        for case in scale_cases + jitter_cases
        if find_case_root(ds_root, case) is not None
    ]

def list_refine_cases(ds_root: Path, refine_variant: str, noise_cases: List[str]) -> List[str]:
    """List the given noise cases that also exist under a refine variant.

    Args:
        ds_root: Dataset root directory.
        refine_variant: Name of the variant sub-directory of ``ds_root``.
        noise_cases: Case names to probe, in the order they should be returned.

    Returns:
        The subset of ``noise_cases`` for which ``find_case_root`` finds a
        directory under ``ds_root / refine_variant``; an empty list when the
        variant directory itself does not exist.
    """
    variant_root = ds_root / refine_variant
    if variant_root.is_dir():
        return [
            case
            for case in noise_cases
            if find_case_root(variant_root, case) is not None
        ]
    return []

def label_path_for_image(img_path: Path, img_split_dir: Path, label_split_dir: Path) -> Path:
    """Map an image path to its label path in a parallel directory tree.

    The image's path relative to ``img_split_dir`` is re-rooted under
    ``label_split_dir`` and its final suffix is replaced by ``.txt``.
    ``Path.relative_to`` raises ``ValueError`` if ``img_path`` is not located
    under ``img_split_dir``.
    """
    relative_image = img_path.relative_to(img_split_dir)
    mirrored = label_split_dir.joinpath(relative_image)
    return mirrored.with_suffix(".txt")

def _dataset_selected(ds_root: Path, target: Optional[List[str]]) -> bool:
    """Tell whether the dataset at ``ds_root`` passes the name filter.

    An empty or ``None`` ``target`` selects every dataset. Otherwise the
    directory name of ``ds_root`` and every entry of ``target`` are passed
    through ``normalize_name`` and the dataset is selected when its
    normalized name is among the normalized targets.
    """
    if target:
        dataset_key = normalize_name(ds_root.name)
        wanted_keys = {normalize_name(entry) for entry in target}
        return dataset_key in wanted_keys
    return True


# -------------------------------------------------------------------------
# MAIN
# -------------------------------------------------------------------------
# Seed Python's global random module with the configured SEED.
random.seed(SEED)

specs = discover_det_datasets(str(LOAD_DIR))

# Several specs may point at the same root; dict.fromkeys drops the repeats
# while keeping the first-seen order.
roots: List[Path] = list(dict.fromkeys(Path(spec.root) for spec in specs))

# ✅ Filter by TARGET_DATASETS
roots = [root for root in roots if _dataset_selected(root, TARGET_DATASETS)]

print("=" * 80)
print(f"[DISCOVERY] Found {len(roots)} selected dataset roots under: {LOAD_DIR}")
if TARGET_DATASETS:
    print(f"[FILTER] TARGET_DATASETS = {TARGET_DATASETS}")
print("=" * 80)

# output category roots (one sub-directory of OUT_ROOT per label category)
OUT_ORIGINAL, OUT_NOISE, OUT_REF50, OUT_REFSAM = (
    OUT_ROOT / category
    for category in ("original", "noise", "refine(50)", "refine(sam)")
)
for out_dir in (OUT_ORIGINAL, OUT_NOISE, OUT_REF50, OUT_REFSAM):
    out_dir.mkdir(parents=True, exist_ok=True)

def _save_case_overlays(
    sample_imgs: List[Path],
    img_split_dir: Path,
    label_split_dir: Path,
    out_dir: Path,
    title: str,
) -> None:
    """Render each sampled image with one label case and save it as JPEG.

    Args:
        sample_imgs: Image paths located under ``img_split_dir``.
        img_split_dir: Image directory of the split being visualized.
        label_split_dir: Label directory of the same split for the case.
        out_dir: Output directory; each image keeps its sub-path relative
            to ``img_split_dir`` and gets a ``.jpg`` suffix.
        title: Caption drawn on every overlay.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    for img_path in sample_imgs:
        lab_path = label_path_for_image(img_path, img_split_dir, label_split_dir)
        vis = draw_overlay(img_path, lab_path, title=title)

        rel = img_path.relative_to(img_split_dir)
        out_path = (out_dir / rel).with_suffix(".jpg")
        # Images may live in nested sub-folders of the split directory.
        out_path.parent.mkdir(parents=True, exist_ok=True)
        vis.save(out_path, quality=95)


for ds_root in roots:
    ds_root = Path(ds_root)
    images_root = ds_root / "images"
    labels_root = ds_root / "labels"
    # Only datasets with both images/ and labels/ directly under the root are visualized.
    if not images_root.is_dir() or not labels_root.is_dir():
        continue

    split_info = detect_split_dirs(ds_root)
    train_img_dir = split_info["train_img_dir"]
    val_img_dir   = split_info["val_img_dir"]
    train_tag = split_info.get("train_tag", "train")
    val_tag   = split_info.get("val_tag", "val")

    # noise/refine case list
    noise_cases = list_noise_cases(ds_root)

    # refine cases per requested variant (restricted to the noise cases found above)
    refine_cases_map: Dict[str, List[str]] = {
        rv: list_refine_cases(ds_root, rv, noise_cases)
        for rv in TARGET_REFINE_VARIANTS
    }

    # Per-dataset sampling seed. `%` binds tighter than `+`, so this is
    # SEED + (hash % 100000); it does not depend on the split.
    sample_seed = (SEED + _stable_int_hash(ds_root.name) % 100000)

    for split in TARGET_SPLITS:
        if split == "val":
            # Fall back to the train images/tag when no val directory exists.
            has_val_dir = bool(val_img_dir) and Path(val_img_dir).is_dir()
            img_split_dir = val_img_dir if has_val_dir else train_img_dir
            split_tag = val_tag if has_val_dir else train_tag
        else:
            img_split_dir = train_img_dir
            split_tag = train_tag

        if img_split_dir is None or (not Path(img_split_dir).is_dir()):
            continue
        img_split_dir = Path(img_split_dir)

        base_label_split_dir = resolve_split_label_dir(labels_root, split_tag)

        # sample images ONCE (deterministic), prefer images that have original label file
        sample_imgs = deterministic_sample_images(
            img_split_dir, k=SAMPLE_NUMBER, seed=sample_seed,
            require_label_dir=base_label_split_dir
        )
        if len(sample_imgs) < SAMPLE_NUMBER:
            # Too few labelled images: resample without the label requirement.
            sample_imgs = deterministic_sample_images(
                img_split_dir, k=SAMPLE_NUMBER, seed=sample_seed,
                require_label_dir=None
            )

        if not sample_imgs:
            continue

        print(f"\n[VIS] dataset={ds_root.name} split={split}(tag={split_tag}) n_sample={len(sample_imgs)}")

        # -------------------------------------------------------------
        # (A) ORIGINAL
        # -------------------------------------------------------------
        _save_case_overlays(
            sample_imgs,
            img_split_dir,
            base_label_split_dir,
            OUT_ORIGINAL / ds_root.name / split_tag,
            f"{ds_root.name} | original | {split_tag}",
        )

        # -------------------------------------------------------------
        # (B) NOISE cases
        # -------------------------------------------------------------
        for case in noise_cases:
            case_root = find_case_root(ds_root, case)
            if case_root is None:
                continue

            _save_case_overlays(
                sample_imgs,
                img_split_dir,
                resolve_split_label_dir(case_root, split_tag),
                OUT_NOISE / ds_root.name / case / split_tag,
                f"{ds_root.name} | noise:{case} | {split_tag}",
            )

        # -------------------------------------------------------------
        # (C) REFINE variants (e.g. refine(50), refine(sam))
        # -------------------------------------------------------------
        for rv in TARGET_REFINE_VARIANTS:
            rv_root = ds_root / rv
            if not rv_root.is_dir():
                continue

            rv_cases = refine_cases_map.get(rv, [])
            if not rv_cases:
                continue

            # Each variant gets its own output tree named after the variant.
            # For "refine(50)" and "refine(sam)" this equals OUT_REF50 and
            # OUT_REFSAM; any other variant no longer lands in (and
            # overwrites) the refine(sam) results.
            out_rv_root = OUT_ROOT / rv / ds_root.name

            for case in rv_cases:
                case_root = find_case_root(rv_root, case)
                if case_root is None:
                    continue

                _save_case_overlays(
                    sample_imgs,
                    img_split_dir,
                    resolve_split_label_dir(case_root, split_tag),
                    out_rv_root / case / split_tag,
                    f"{ds_root.name} | {rv}:{case} | {split_tag}",
                )

print("\n✅ Visualization done.")
print(f"   -> saved to: {OUT_ROOT}")


[DISCOVERY] Found 1 selected dataset roots under: /home/ISW/project/datasets
[FILTER] TARGET_DATASETS = ['kitti']

[VIS] dataset=kitti split=val(tag=val) n_sample=100

✅ Visualization done.
   -> saved to: /home/ISW/project/(visualize)_kitti
