1. Import Library

In [21]:
import os
import re
import json
import math
import random
import shutil
import hashlib

from pathlib import Path
from collections import defaultdict, Counter

import numpy as np
import pandas as pd

import cv2
from PIL import Image

import matplotlib.pyplot as plt

from tqdm import tqdm

2. Dataset Freeze

All preprocessing will be performed on dataset_working, while dataset_raw remained unchanged as a frozen reference.

In [22]:
SRC_DATASET = Path("dataset")
RAW_DATASET = Path("dataset_raw")
WORK_DATASET = Path("dataset_working")


if RAW_DATASET.exists() or WORK_DATASET.exists():
    raise RuntimeError(
        "dataset_raw or dataset_working already exists.\n"
        "Delete them manually if you want to re-freeze."
    )


print("üì¶ Creating dataset_raw (frozen copy)...")
shutil.copytree(SRC_DATASET, RAW_DATASET)

print("üì¶ Creating dataset_working (preprocessing copy)...")
shutil.copytree(SRC_DATASET, WORK_DATASET)

print("‚Ä¢ dataset_raw      ‚Üí DO NOT TOUCH")
print("‚Ä¢ dataset_working  ‚Üí use for preprocessing")

üì¶ Creating dataset_raw (frozen copy)...
üì¶ Creating dataset_working (preprocessing copy)...
‚Ä¢ dataset_raw      ‚Üí DO NOT TOUCH
‚Ä¢ dataset_working  ‚Üí use for preprocessing


3. Structural Consistency Fix

A directory normalization step was applied to enforce a consistent YOLO-compatible structure, ensuring all images and annotation files were organized into standardized images/ and labels/ subdirectories for each class.

In [23]:
DATASET_ROOT = Path("dataset_working")
IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}


for cls in sorted(DATASET_ROOT.iterdir()):
    if not cls.is_dir():
        continue

    print(f"‚ñ∂ Checking: {cls.name}")

    img_dir = cls / "images"
    lbl_dir = cls / "labels"

    img_dir.mkdir(exist_ok=True)
    lbl_dir.mkdir(exist_ok=True)

    for p in cls.iterdir():
        if p.is_file() and p.suffix.lower() in IMG_EXTS:
            dest = img_dir / p.name
            if not dest.exists():
                shutil.move(str(p), str(dest))
                print(f"  ‚úî Moved image ‚Üí images/{p.name}")

    for p in cls.iterdir():
        if p.is_file() and p.suffix.lower() == ".txt":
            dest = lbl_dir / p.name
            if not dest.exists():
                shutil.move(str(p), str(dest))
                print(f"  ‚úî Moved label ‚Üí labels/{p.name}")

print("\n‚úÖ Dataset structure normalization completed successfully.")
print("All image and label files are now organized into 'images/' and 'labels/' folders.")

‚ñ∂ Checking: 0 (tempe goreng)
‚ñ∂ Checking: 1 (tahu goreng)
‚ñ∂ Checking: 10 (nasi goreng)
‚ñ∂ Checking: 11 (bubur ayam)
‚ñ∂ Checking: 12 (cakwe)
‚ñ∂ Checking: 13 (mie ayam)
‚ñ∂ Checking: 14 (nasi padang)
‚ñ∂ Checking: 15 (babi guling)
‚ñ∂ Checking: 16 (nasi uduk)
‚ñ∂ Checking: 17 (nasi babi campur)
‚ñ∂ Checking: 18 (ayam pop)
‚ñ∂ Checking: 19 (telur balado)
‚ñ∂ Checking: 2 (rendang)
‚ñ∂ Checking: 20 (telur dadar)
‚ñ∂ Checking: 21 (telur ceplok)
‚ñ∂ Checking: 22 (nasi putih)
‚ñ∂ Checking: 23 (dadar gulung)
‚ñ∂ Checking: 24 (putu ayu)
‚ñ∂ Checking: 25 (kue cubit)
‚ñ∂ Checking: 26 (pepes ikan)
‚ñ∂ Checking: 27 (bika ambon)
‚ñ∂ Checking: 28 (soto)
‚ñ∂ Checking: 29 (lumpia)
‚ñ∂ Checking: 3 (kangkung)
‚ñ∂ Checking: 30 (bihun goreng)
‚ñ∂ Checking: 31 (pempek)
‚ñ∂ Checking: 32 (batagor)
‚ñ∂ Checking: 33 (ikan goreng)
‚ñ∂ Checking: 34 (telur rebus)
‚ñ∂ Checking: 35 (martabak manis)
‚ñ∂ Checking: 36 (gulai ikan)
‚ñ∂ Checking: 37 (tempe bacem)
‚ñ∂ Checking: 38 (terong balado)
‚ñ∂ Checking: 39 (

4. Label Cleaning & Validation - Remove Empty Label Files (and Corresponding Images)

Label files containing no valid YOLO bounding box annotations were identified and removed. Corresponding images were also deleted to maintain one-to-one consistency between images and labels and to ensure dataset integrity.


In [24]:
from pathlib import Path

DATASET_ROOT = Path("dataset_working")
LABEL_DIR_NAME = "labels"
IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}

deleted_images = 0
deleted_labels = 0


for cls in sorted(DATASET_ROOT.iterdir()):
    if not cls.is_dir():
        continue

    img_dir = cls / "images"
    lbl_dir = cls / LABEL_DIR_NAME
    if not img_dir.exists() or not lbl_dir.exists():
        continue

    images = {}
    for p in img_dir.iterdir():
        if p.suffix.lower() in IMG_EXTS:
            images[p.stem] = p

    labels = {p.stem: p for p in lbl_dir.glob("*.txt")}


    # Handle empty labels FIRST
    # -------------------------
    for stem, lbl_path in list(labels.items()):
        content = lbl_path.read_text().strip()

        if content == "":
            print(f"‚ùå Empty label: {lbl_path}")
            lbl_path.unlink()
            deleted_labels += 1

            img_path = images.get(stem)
            if img_path and img_path.exists():
                print(f"   ‚îî‚îÄ deleting image: {img_path}")
                img_path.unlink()
                deleted_images += 1

            labels.pop(stem, None)
            images.pop(stem, None)


    # Images without labels
    # -------------------------
    for stem, img_path in list(images.items()):
        if stem not in labels:
            print(f"‚ùå Image without label: {img_path}")
            img_path.unlink()
            deleted_images += 1


    # Labels without images
    # -------------------------
    for stem, lbl_path in list(labels.items()):
        if stem not in images:
            print(f"‚ùå Label without image: {lbl_path}")
            lbl_path.unlink()
            deleted_labels += 1


print(f"Images deleted : {deleted_images}")
print(f"Labels deleted : {deleted_labels}")


‚ùå Image without label: dataset_working\0 (tempe goreng)\images\tempe-goreng-krispy-foto-resep-utama_jpg.rf.ceb0b9fa7925f4c7b2d25f19f0b8a71f.jpg
‚ùå Empty label: dataset_working\35 (martabak manis)\labels\apam_balik_173_jpg.rf.6b3e7991e29e6ab05f5e85dcc54f7ace.txt
   ‚îî‚îÄ deleting image: dataset_working\35 (martabak manis)\images\apam_balik_173_jpg.rf.6b3e7991e29e6ab05f5e85dcc54f7ace.jpg
Images deleted : 2
Labels deleted : 1


In [37]:
from pathlib import Path

IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}

for cls in Path("dataset_working").iterdir():
    if not cls.is_dir():
        continue

    img_dir = cls / "images"
    lbl_dir = cls / "labels"
    if not img_dir.exists() or not lbl_dir.exists():
        continue

    img_stems = {
        p.stem for p in img_dir.iterdir()
        if p.is_file() and p.suffix.lower() in IMG_EXTS
    }
    lbl_stems = {p.stem for p in lbl_dir.glob("*.txt") if p.is_file()}

    missing_lbl = sorted(img_stems - lbl_stems)
    missing_img = sorted(lbl_stems - img_stems)

    if missing_lbl or missing_img:
        print(f"\n‚ùå {cls.name}")
        if missing_lbl:
            print(f"  Images without labels: {len(missing_lbl)}")
            print(f"  e.g. {missing_lbl[:5]}")
        if missing_img:
            print(f"  Labels without images: {len(missing_img)}")
            print(f"  e.g. {missing_img[:5]}")

5. Remove Duplicate Images

Duplicate images were identified using content-based hashing to detect exact binary duplicates within each class. An initial inspection phase was performed to verify duplicate groups without modifying the dataset, followed by a controlled removal step to eliminate redundant images.


In [39]:
DATASET_ROOT = Path("dataset_working")
LABEL_DIR_NAME = "labels"
IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}


def file_hash(path: Path, chunk_size=8192):
    h = hashlib.md5()
    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):
            h.update(chunk)
    return h.hexdigest()


total_dup_groups = 0
total_dup_images = 0

for cls in sorted(DATASET_ROOT.iterdir()):
    if not cls.is_dir():
        continue

    img_dir = cls / "images"
    lbl_dir = cls / LABEL_DIR_NAME
    if not img_dir.exists() or not lbl_dir.exists():
        continue

    hash_map = defaultdict(list)

    for img in img_dir.iterdir():
        if img.suffix.lower() in IMG_EXTS:
            h = file_hash(img)
            hash_map[h].append(img)

    dup_groups = [imgs for imgs in hash_map.values() if len(imgs) > 1]

    if dup_groups:
        print(f"\nClass: {cls.name}")
        for group in dup_groups:
            total_dup_groups += 1
            print("  Duplicate group:")
            for img in group:
                print("   ", img.name)
            total_dup_images += len(group) - 1

print("\nüîé CHECK SUMMARY - NO DELETION")
print(f"Duplicate groups found : {total_dup_groups}")
print(f"Images to be removed   : {total_dup_images}")
print("\n‚ö†Ô∏è No files were deleted.")



Class: 10 (nasi goreng)
  Duplicate group:
    292731.jpg
    292769.jpg

Class: 11 (bubur ayam)
  Duplicate group:
    249097.jpg
    249119.jpg

Class: 13 (mie ayam)
  Duplicate group:
    319405.jpg
    319409.jpg
    319412.jpg
    319639.jpg

Class: 14 (nasi padang)
  Duplicate group:
    247871.jpg
    269533.jpg

Class: 22 (nasi putih)
  Duplicate group:
    10645.jpg
    9117.jpg
  Duplicate group:
    11156.jpg
    11161.jpg

Class: 33 (ikan goreng)
  Duplicate group:
    Ikan-Goreng_419_jpg.rf.322abb1764ba684524eb45af5de518bb.jpg
    ikan_train-10-_jpg.rf.8b247a7eae1531418b6d430ca8c72057.jpg

Class: 9 (mie goreng)
  Duplicate group:
    143_233273.jpg
    143_271148.jpg

üîé CHECK SUMMARY - NO DELETION
Duplicate groups found : 8
Images to be removed   : 10

‚ö†Ô∏è No files were deleted.


After the duplicate inspection step, all exact duplicate image groups were reported without removing any files. This allowed manual verification of detected duplicates and ensured that only true redundancies were targeted in the subsequent deletion stage.


In [40]:
DATASET_ROOT = Path("dataset_working")
LABEL_DIR_NAME = "labels"
IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}


def file_hash(path: Path, chunk_size=8192):
    h = hashlib.md5()
    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):
            h.update(chunk)
    return h.hexdigest()


deleted_images = 0
deleted_labels = 0

for cls in sorted(DATASET_ROOT.iterdir()):
    if not cls.is_dir():
        continue

    img_dir = cls / "images"
    lbl_dir = cls / LABEL_DIR_NAME
    if not img_dir.exists() or not lbl_dir.exists():
        continue

    hash_map = defaultdict(list)

    for img in img_dir.iterdir():
        if img.suffix.lower() in IMG_EXTS:
            h = file_hash(img)
            hash_map[h].append(img)

    for imgs in hash_map.values():
        if len(imgs) > 1:
            keep = imgs[0]

            for dup_img in imgs[1:]:
                lbl_path = lbl_dir / f"{dup_img.stem}.txt"

                print(f"‚ùå Deleting duplicate image: {dup_img}")
                dup_img.unlink()
                deleted_images += 1

                if lbl_path.exists():
                    print(f"   ‚îî‚îÄ deleting label: {lbl_path}")
                    lbl_path.unlink()
                    deleted_labels += 1

print(f"Images deleted : {deleted_images}")
print(f"Labels deleted : {deleted_labels}")

‚ùå Deleting duplicate image: dataset_working\10 (nasi goreng)\images\292769.jpg
   ‚îî‚îÄ deleting label: dataset_working\10 (nasi goreng)\labels\292769.txt
‚ùå Deleting duplicate image: dataset_working\11 (bubur ayam)\images\249119.jpg
   ‚îî‚îÄ deleting label: dataset_working\11 (bubur ayam)\labels\249119.txt
‚ùå Deleting duplicate image: dataset_working\13 (mie ayam)\images\319409.jpg
   ‚îî‚îÄ deleting label: dataset_working\13 (mie ayam)\labels\319409.txt
‚ùå Deleting duplicate image: dataset_working\13 (mie ayam)\images\319412.jpg
   ‚îî‚îÄ deleting label: dataset_working\13 (mie ayam)\labels\319412.txt
‚ùå Deleting duplicate image: dataset_working\13 (mie ayam)\images\319639.jpg
   ‚îî‚îÄ deleting label: dataset_working\13 (mie ayam)\labels\319639.txt
‚ùå Deleting duplicate image: dataset_working\14 (nasi padang)\images\269533.jpg
   ‚îî‚îÄ deleting label: dataset_working\14 (nasi padang)\labels\269533.txt
‚ùå Deleting duplicate image: dataset_working\22 (nasi putih)\images\911

Following verification, confirmed duplicate images were removed while retaining a single representative copy per duplicate group. This reduced dataset redundancy without altering class semantics or introducing annotation inconsistencies.


In [50]:
DATASET_ROOT = Path(r"dataset_working")
IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}


# MD5 hash ‚Üí detects exact binary duplicates (content-based)
# ------------------------------
def file_hash(path: Path, chunk_size=8192):
    h = hashlib.md5()
    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):
            h.update(chunk)
    return h.hexdigest()

def read_yolo_lines(label_path: Path):
    with open(label_path, "r", encoding="utf-8") as f:
        return [ln.strip() for ln in f if ln.strip()]


missing_label = []
missing_image = []
empty_labels = []
invalid_yolo = []
corrupted_images = []

# per-class image hashes
image_hashes = defaultdict(lambda: defaultdict(list))

total_images = 0
total_labels = 0


for cls in sorted(
    [p for p in DATASET_ROOT.iterdir() if p.is_dir() and not p.name.startswith("_")],
    key=lambda p: p.name.lower()
):
    class_name = cls.name

    img_dir = cls / "images"
    lbl_dir = cls / "labels"
    if not img_dir.exists() or not lbl_dir.exists():
        continue

    images = [p for p in img_dir.iterdir() if p.suffix.lower() in IMG_EXTS]
    labels = list(lbl_dir.glob("*.txt"))

    total_images += len(images)
    total_labels += len(labels)

    img_map = {p.stem: p for p in images}
    lbl_map = {p.stem: p for p in labels}


    # Missing pairs
    # -------------------------
    for stem, img_path in img_map.items():
        if stem not in lbl_map:
            missing_label.append(str(img_path.resolve()))

    for stem, lbl_path in lbl_map.items():
        if stem not in img_map:
            missing_image.append(str(lbl_path.resolve()))


    # Image checks + per-class duplicates
    # -------------------------
    for img_path in images:
        try:
            img = cv2.imread(str(img_path))
            if img is None or img.size == 0:
                corrupted_images.append(str(img_path.resolve()))
                continue
        except Exception:
            corrupted_images.append(str(img_path.resolve()))
            continue

        h = file_hash(img_path)
        image_hashes[class_name][h].append(str(img_path.resolve()))


    # Label checks (NO duplicate detection)
    # -------------------------
    for lbl_path in labels:
        lines = read_yolo_lines(lbl_path)

        if len(lines) == 0:
            empty_labels.append(str(lbl_path.resolve()))
            continue

        for ln in lines:
            parts = ln.split()
            if len(parts) != 5:
                invalid_yolo.append((str(lbl_path.resolve()), ln))
                continue

            try:
                _, x, y, w, h = parts
                x, y, w, h = map(float, (x, y, w, h))
            except Exception:
                invalid_yolo.append((str(lbl_path.resolve()), ln))
                continue

            if not (0 < x <= 1 and 0 < y <= 1 and 0 < w <= 1 and 0 < h <= 1):
                invalid_yolo.append((str(lbl_path.resolve()), ln))


# PER-CLASS DUPLICATES
# =========================
dup_images = {}

for cls_name, hashes in image_hashes.items():
    for h, files in hashes.items():
        if len(files) > 1:
            dup_images.setdefault(cls_name, []).append(files)


# FULL REPORT
# =========================

print(f"Total images scanned : {total_images}")
print(f"Total labels scanned : {total_labels}")
print("-" * 100)

print(f"‚ùå Missing labels ({len(missing_label)}):")
for p in missing_label:
    print(p)
print("-" * 100)

print(f"‚ùå Missing images ({len(missing_image)}):")
for p in missing_image:
    print(p)
print("-" * 100)

print(f"‚ö† Empty label files ({len(empty_labels)}):")
for p in empty_labels:
    print(p)
print("-" * 100)

print(f"‚ùå Invalid YOLO entries ({len(invalid_yolo)}):")
for p, ln in invalid_yolo:
    print(f"{p}  |  {ln}")
print("-" * 100)

print(f"‚ùå Corrupted images ({len(corrupted_images)}):")
for p in corrupted_images:
    print(p)
print("-" * 100)

print("‚ö† Duplicate image groups (PER CLASS):")
for cls_name, groups in dup_images.items():
    print(f"\nClass: {cls_name}")
    for group in groups:
        for f in group:
            print(" ", f)
        print("-" * 50)

print("=" * 100)

Total images scanned : 6578
Total labels scanned : 6578
----------------------------------------------------------------------------------------------------
‚ùå Missing labels (0):
----------------------------------------------------------------------------------------------------
‚ùå Missing images (0):
----------------------------------------------------------------------------------------------------
‚ö† Empty label files (0):
----------------------------------------------------------------------------------------------------
‚ùå Invalid YOLO entries (0):
----------------------------------------------------------------------------------------------------
‚ùå Corrupted images (0):
----------------------------------------------------------------------------------------------------
‚ö† Duplicate image groups (PER CLASS):


6. Dataset Integrity Check (Re-EDA)

A second exploratory analysis was conducted after preprocessing to assess the effective dataset distribution. No additional dataset balancing or filtering was applied, as class frequencies, image resolutions, and bounding box scales were within acceptable ranges. Images were retained at their original resolutions to preserve visual detail.

In [44]:
total_images = 0
total_boxes = 0
class_names = []

for cls in DATASET_ROOT.iterdir():
    if not cls.is_dir():
        continue

    img_dir = cls / "images"
    lbl_dir = cls / "labels"
    if not img_dir.exists() or not lbl_dir.exists():
        continue

    images = [p for p in img_dir.iterdir() if p.suffix.lower() in IMG_EXTS]
    labels = list(lbl_dir.glob("*.txt"))

    total_images += len(images)
    class_names.append(cls.name)

    for lbl in labels:
        total_boxes += len(lbl.read_text().strip().splitlines())

print("=== DATASET SUMMARY ===")
print("Classes       :", len(class_names))
print("Images        :", total_images)
print("Bounding boxes:", total_boxes)

=== DATASET SUMMARY ===
Classes       : 40
Images        : 6578
Bounding boxes: 11100


6.1 Image per Class

In [45]:
class_image_counts = []

for cls in DATASET_ROOT.iterdir():
    if not cls.is_dir():
        continue

    img_dir = cls / "images"
    if not img_dir.exists():
        continue

    count = sum(1 for p in img_dir.iterdir() if p.suffix.lower() in IMG_EXTS)
    class_image_counts.append({
        "class": cls.name,
        "images": count
    })

df_images_per_class = (
    pd.DataFrame(class_image_counts)
    .sort_values("images", ascending=False)
)

display(df_images_per_class.style.hide(axis="index"))


class,images
10 (nasi goreng),326
4 (sate),321
2 (rendang),299
31 (pempek),298
5 (bakso),293
9 (mie goreng),240
22 (nasi putih),230
34 (telur rebus),200
21 (telur ceplok),199
24 (putu ayu),190


6.2 Bounding Box per Class

In [46]:
class_box_counts = []

for cls in DATASET_ROOT.iterdir():
    if not cls.is_dir():
        continue

    lbl_dir = cls / "labels"
    if not lbl_dir.exists():
        continue

    box_count = 0
    for lbl in lbl_dir.glob("*.txt"):
        box_count += len(lbl.read_text().strip().splitlines())

    class_box_counts.append({
        "class": cls.name,
        "boxes": box_count
    })

df_boxes_per_class = (
    pd.DataFrame(class_box_counts)
    .sort_values("boxes", ascending=False)
)

display(df_boxes_per_class.style.hide(axis="index"))


class,boxes
25 (kue cubit),1193
24 (putu ayu),1169
31 (pempek),641
37 (tempe bacem),568
1 (tahu goreng),426
0 (tempe goreng),422
34 (telur rebus),422
5 (bakso),363
4 (sate),344
23 (dadar gulung),341


6.3 Image Resolution Distribution

In [47]:
resolution_counter = Counter()

for cls in DATASET_ROOT.iterdir():
    if not cls.is_dir():
        continue

    img_dir = cls / "images"
    if not img_dir.exists():
        continue

    for img_path in img_dir.iterdir():
        if img_path.suffix.lower() not in IMG_EXTS:
            continue

        img = cv2.imread(str(img_path))
        if img is None:
            continue

        h, w = img.shape[:2]
        resolution_counter[(w, h)] += 1

df_resolutions = (
    pd.DataFrame(
        [{"width": w, "height": h, "count": c}
         for (w, h), c in resolution_counter.items()]
    )
    .sort_values("count", ascending=False)
)

display(df_resolutions.head(20).style.hide(axis="index"))


width,height,count
500,500,485
640,640,424
577,433,301
751,532,253
800,600,198
512,512,159
500,375,156
680,482,114
1280,720,108
1200,630,99


6.4 Box Size Distribution (relative area)

In [48]:
box_areas = []

for cls in DATASET_ROOT.iterdir():
    if not cls.is_dir():
        continue

    lbl_dir = cls / "labels"
    if not lbl_dir.exists():
        continue

    for lbl in lbl_dir.glob("*.txt"):
        for ln in lbl.read_text().splitlines():
            parts = ln.split()
            if len(parts) != 5:
                continue
            _, _, _, w, h = map(float, parts)
            box_areas.append(w * h)

df_box_area = pd.DataFrame({"relative_area": box_areas})

print(df_box_area.describe())

       relative_area
count   11100.000000
mean        0.356062
std         0.304503
min         0.000006
25%         0.089999
50%         0.228731
75%         0.636353
max         1.000000


Following preprocessing and label normalization, exploratory analysis was repeated on the cleaned dataset to verify class distributions, annotation consistency, and overall data integrity. The re-analysis confirmed that all labels conform to the YOLO format with zero-based class indexing and that no structural or annotation anomalies remain.

7. Splitting Train/Val/Test

The dataset was split into training, validation, and test sets using a 70/15/15 ratio. Stratified sampling was applied at the class level to ensure that each food category was represented across all subsets. A fixed random seed (42) was used to ensure reproducibility.

In [49]:
SRC_ROOT = Path("dataset_working")
DST_ROOT = Path("dataset_final")

IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}
SEED = 42

TRAIN_RATIO = 0.70
VAL_RATIO   = 0.15
TEST_RATIO  = 0.15

random.seed(SEED)

# CREATE YOLO DIR STRUCTURE
# =========================
for split in ["train", "val", "test"]:
    (DST_ROOT / split / "images").mkdir(parents=True, exist_ok=True)
    (DST_ROOT / split / "labels").mkdir(parents=True, exist_ok=True)

print("‚úÇÔ∏è DATASET SPLIT ‚Äî YOLO FORMAT (70/15/15, Stratified per class)\n")

total_counts = {"train": 0, "val": 0, "test": 0}

# SPLIT PER CLASS (STRATIFIED)
# =========================
for cls in sorted(SRC_ROOT.iterdir()):
    if not cls.is_dir():
        continue

    img_dir = cls / "images"
    lbl_dir = cls / "labels"

    if not img_dir.exists() or not lbl_dir.exists():
        continue

    # Collect valid image‚Äìlabel pairs
    pairs = []
    for img_path in img_dir.iterdir():
        if img_path.suffix.lower() not in IMG_EXTS:
            continue

        lbl_path = lbl_dir / f"{img_path.stem}.txt"
        if lbl_path.exists():
            pairs.append((img_path, lbl_path))

    if not pairs:
        continue

    random.shuffle(pairs)

    n = len(pairs)
    n_train = int(n * TRAIN_RATIO)
    n_val   = int(n * VAL_RATIO)

    splits = {
        "train": pairs[:n_train],
        "val":   pairs[n_train:n_train + n_val],
        "test":  pairs[n_train + n_val:]
    }

    print(
        f"Class {cls.name}: {n} ‚Üí "
        f"train={len(splits['train'])}, "
        f"val={len(splits['val'])}, "
        f"test={len(splits['test'])}"
    )

    for split_name, items in splits.items():
        for img_path, lbl_path in items:
            shutil.copy2(
                img_path,
                DST_ROOT / split_name / "images" / img_path.name
            )
            shutil.copy2(
                lbl_path,
                DST_ROOT / split_name / "labels" / lbl_path.name
            )
            total_counts[split_name] += 1


print("\n‚úÖ SPLIT COMPLETE (YOLO READY)")
print(f"Train images : {total_counts['train']}")
print(f"Val images   : {total_counts['val']}")
print(f"Test images  : {total_counts['test']}")
print(f"Random seed  : {SEED}")


‚úÇÔ∏è DATASET SPLIT ‚Äî YOLO FORMAT (70/15/15, Stratified per class)

Class 0 (tempe goreng): 90 ‚Üí train=62, val=13, test=15
Class 1 (tahu goreng): 106 ‚Üí train=74, val=15, test=17
Class 10 (nasi goreng): 326 ‚Üí train=228, val=48, test=50
Class 11 (bubur ayam): 107 ‚Üí train=74, val=16, test=17
Class 12 (cakwe): 135 ‚Üí train=94, val=20, test=21
Class 13 (mie ayam): 104 ‚Üí train=72, val=15, test=17
Class 14 (nasi padang): 147 ‚Üí train=102, val=22, test=23
Class 15 (babi guling): 108 ‚Üí train=75, val=16, test=17
Class 16 (nasi uduk): 106 ‚Üí train=74, val=15, test=17
Class 17 (nasi babi campur): 117 ‚Üí train=81, val=17, test=19
Class 18 (ayam pop): 103 ‚Üí train=72, val=15, test=16
Class 19 (telur balado): 186 ‚Üí train=130, val=27, test=29
Class 2 (rendang): 299 ‚Üí train=209, val=44, test=46
Class 20 (telur dadar): 114 ‚Üí train=79, val=17, test=18
Class 21 (telur ceplok): 199 ‚Üí train=139, val=29, test=31
Class 22 (nasi putih): 230 ‚Üí train=161, val=34, test=35
Class 23 (d