1. Import Library

In [1]:
import os
import re
import json
import math
import random
import shutil
import hashlib

from pathlib import Path
from collections import defaultdict, Counter

import numpy as np
import pandas as pd

import cv2
from PIL import Image

import matplotlib.pyplot as plt

from tqdm import tqdm

2. Dataset Freeze

All preprocessing is performed on `dataset_working`, while `dataset_raw` remains unchanged as a frozen reference.

In [2]:
# Source tree and the two copies derived from it.
SRC_DATASET = Path("dataset")
RAW_DATASET = Path("dataset_raw")
WORK_DATASET = Path("dataset_working")


# Refuse to run when either copy is already on disk; both have to be removed
# by hand before the freeze can be repeated.
if any(target.exists() for target in (RAW_DATASET, WORK_DATASET)):
    raise RuntimeError(
        "dataset_raw or dataset_working already exists.\n"
        "Delete them manually if you want to re-freeze."
    )


# Announce and create each copy in turn: the frozen one first, then the
# working one.
for banner, target in (
    ("üì¶ Creating dataset_raw (frozen copy)...", RAW_DATASET),
    ("üì¶ Creating dataset_working (preprocessing copy)...", WORK_DATASET),
):
    print(banner)
    shutil.copytree(SRC_DATASET, target)

print("‚Ä¢ dataset_raw      ‚Üí DO NOT TOUCH")
print("‚Ä¢ dataset_working  ‚Üí use for preprocessing")

üì¶ Creating dataset_raw (frozen copy)...
üì¶ Creating dataset_working (preprocessing copy)...
‚Ä¢ dataset_raw      ‚Üí DO NOT TOUCH
‚Ä¢ dataset_working  ‚Üí use for preprocessing


3. Structural Consistency Fix

A directory normalization step was applied to enforce a consistent YOLO-compatible structure, ensuring all images and annotation files were organized into standardized images/ and labels/ subdirectories for each class.

In [3]:
# Working copy whose class folders are reorganised, and the suffixes
# (compared lower-cased) that are treated as images.
DATASET_ROOT = Path("dataset_working")
IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}

# Files that could not be moved because the target name was already taken.
skipped_files = 0

for cls in sorted(DATASET_ROOT.iterdir()):
    if not cls.is_dir():
        continue

    print(f"‚ñ∂ Checking: {cls.name}")

    img_dir = cls / "images"
    lbl_dir = cls / "labels"

    img_dir.mkdir(exist_ok=True)
    lbl_dir.mkdir(exist_ok=True)

    # Take one snapshot of the folder before moving anything: which entries a
    # live iterdir() yields while files are being moved out is unspecified.
    entries = sorted(cls.iterdir())

    # Images first, then labels, so the log keeps the same grouping per class.
    for p in entries:
        if p.is_file() and p.suffix.lower() in IMG_EXTS:
            dest = img_dir / p.name
            if dest.exists():
                # Never overwrite; report the clash instead of leaving it silent.
                print(f"  ! Skipped image (already present in images/): {p.name}")
                skipped_files += 1
                continue
            shutil.move(str(p), str(dest))
            print(f"  ‚úî Moved image ‚Üí images/{p.name}")

    for p in entries:
        if p.is_file() and p.suffix.lower() == ".txt":
            dest = lbl_dir / p.name
            if dest.exists():
                print(f"  ! Skipped label (already present in labels/): {p.name}")
                skipped_files += 1
                continue
            shutil.move(str(p), str(dest))
            print(f"  ‚úî Moved label ‚Üí labels/{p.name}")

if skipped_files == 0:
    print("\n‚úÖ Dataset structure normalization completed successfully.")
    print("All image and label files are now organized into 'images/' and 'labels/' folders.")
else:
    print(f"\nDataset structure normalization finished with {skipped_files} file(s) left in place; see the skip messages above.")

‚ñ∂ Checking: 1 (tempe goreng)
‚ñ∂ Checking: 10 (gado gado)
‚ñ∂ Checking: 11 (mie goreng)
‚ñ∂ Checking: 12 (opor ayam)
‚ñ∂ Checking: 13 (nasi goreng)
‚ñ∂ Checking: 14 (bubur ayam)
‚ñ∂ Checking: 15 (cakwe)
‚ñ∂ Checking: 16 (mie ayam)
‚ñ∂ Checking: 17 (nasi padang)
‚ñ∂ Checking: 18 (babi guling)
‚ñ∂ Checking: 19 (nasi uduk)
‚ñ∂ Checking: 2 (tahu goreng)
‚ñ∂ Checking: 20 (nasi babi campur)
‚ñ∂ Checking: 21 (ayam pop)
‚ñ∂ Checking: 22 (telur balado)
‚ñ∂ Checking: 23 (telur dadar)
‚ñ∂ Checking: 24 (telur ceplok)
‚ñ∂ Checking: 25 (nasi putih)
‚ñ∂ Checking: 26 (dadar gulung)
‚ñ∂ Checking: 27 (putu ayu)
‚ñ∂ Checking: 28 (kue cubit)
‚ñ∂ Checking: 29 (pepes ikan)
‚ñ∂ Checking: 3 (rendang)
‚ñ∂ Checking: 30 (bika ambon)
‚ñ∂ Checking: 31 (soto)
‚ñ∂ Checking: 32 (lumpia)
‚ñ∂ Checking: 33 (bihun goreng)
‚ñ∂ Checking: 34 (pempek)
‚ñ∂ Checking: 35 (batagor)
‚ñ∂ Checking: 36 (ikan goreng)
‚ñ∂ Checking: 37 (telur rebus)
‚ñ∂ Checking: 38 (martabak manis)
‚ñ∂ Checking: 39 (gulai ikan)
‚ñ∂ Checking: 4 (kang

4. Label Cleaning & Validation - Remove Empty Label Files (and Corresponding Images)

Label files containing no valid YOLO bounding box annotations were identified and removed. Corresponding images were also deleted to maintain one-to-one consistency between images and labels and to ensure dataset integrity.


In [4]:
from pathlib import Path

# Working copy to clean, the per-class label folder name, and the suffixes
# (compared lower-cased) that are treated as images.
DATASET_ROOT = Path("dataset_working")
LABEL_DIR_NAME = "labels"
IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}

# Running totals of files removed across all class folders.
deleted_images = 0
deleted_labels = 0


for cls in sorted(DATASET_ROOT.iterdir()):
    if not cls.is_dir():
        continue

    # Only class folders that have both sub-folders are processed.
    img_dir = cls / "images"
    lbl_dir = cls / LABEL_DIR_NAME
    if not img_dir.exists() or not lbl_dir.exists():
        continue

    # Index images by stem. Two images that share a stem but differ in
    # extension map to the same key, so only the last one listed is tracked.
    images = {}
    for p in img_dir.iterdir():
        if p.suffix.lower() in IMG_EXTS:
            images[p.stem] = p

    # Index label files by stem.
    labels = {p.stem: p for p in lbl_dir.glob("*.txt")}


    # Pass 1: label files whose content is empty after stripping whitespace.
    # The label is deleted, then the image with the same stem (if any), and
    # both stems are dropped from the indexes so the later passes ignore them.
    # list(...) is iterated because `labels` is modified inside the loop.
    for stem, lbl_path in list(labels.items()):
        content = lbl_path.read_text().strip()

        if content == "":
            print(f"‚ùå Empty label: {lbl_path}")
            lbl_path.unlink()
            deleted_labels += 1

            img_path = images.get(stem)
            if img_path and img_path.exists():
                print(f"   ‚îî‚îÄ deleting image: {img_path}")
                img_path.unlink()
                deleted_images += 1

            labels.pop(stem, None)
            images.pop(stem, None)


    # Pass 2: images whose stem has no label file are deleted.
    # The stem stays in `images`; pass 3 only looks at stems present in
    # `labels`, so the stale entry is never consulted.
    for stem, img_path in list(images.items()):
        if stem not in labels:
            print(f"‚ùå Image without label: {img_path}")
            img_path.unlink()
            deleted_images += 1


    # Pass 3: label files whose stem has no image are deleted.
    for stem, lbl_path in list(labels.items()):
        if stem not in images:
            print(f"‚ùå Label without image: {lbl_path}")
            lbl_path.unlink()
            deleted_labels += 1


# Totals over every class folder.
print(f"Images deleted : {deleted_images}")
print(f"Labels deleted : {deleted_labels}")


‚ùå Image without label: dataset_working\1 (tempe goreng)\images\tempe-goreng-krispy-foto-resep-utama_jpg.rf.ceb0b9fa7925f4c7b2d25f19f0b8a71f.jpg
‚ùå Empty label: dataset_working\38 (martabak manis)\labels\apam_balik_173_jpg.rf.6b3e7991e29e6ab05f5e85dcc54f7ace.txt
   ‚îî‚îÄ deleting image: dataset_working\38 (martabak manis)\images\apam_balik_173_jpg.rf.6b3e7991e29e6ab05f5e85dcc54f7ace.jpg
Images deleted : 2
Labels deleted : 1


After execution, all empty label files were removed along with their corresponding images, and any image or label file without a counterpart was deleted as well. This step ensures that every remaining image in the dataset is paired with a non-empty label file; the format of the individual annotation lines is checked in the subsequent steps.


5. Convert Segmentation -> Bounding Box (YOLO FORMAT)

Segmentation-based annotations were converted into YOLO bounding box format to ensure compatibility with object detection models. Polygon annotations were transformed into enclosing bounding boxes while preserving class labels and normalized coordinate conventions.

In [5]:
# Root of the working copy, plus the per-class folder names this cell reads
# annotations from (SRC) and writes bounding-box annotations to (DST).
DATASET_ROOT = Path("dataset_working")
SRC_LABEL_DIR = "labels"
DST_LABEL_DIR = "labels_bbox"

def polygon_to_bbox(coords):
    """Return the axis-aligned box that encloses a flat polygon coordinate list.

    Args:
        coords: Flat sequence ``[x0, y0, x1, y1, ...]``; even positions are
            read as x values and odd positions as y values.

    Returns:
        Tuple ``(xc, yc, w, h)``: the box centre followed by its width and
        height, in the same units as ``coords``.
    """
    x_values, y_values = coords[0::2], coords[1::2]

    left, right = min(x_values), max(x_values)
    top, bottom = min(y_values), max(y_values)

    return (left + right) / 2, (top + bottom) / 2, right - left, bottom - top


# Counters reported in the summary at the end of the cell.
converted_files = 0        # files in which at least one polygon was converted
converted_boxes = 0        # polygons turned into bounding boxes
copied_bbox_files = 0      # files written without any polygon conversion
skipped_lines = 0          # lines that are neither a bbox nor a valid polygon
files_without_output = []  # label files for which nothing could be written

for cls in sorted(DATASET_ROOT.iterdir()):
    if not cls.is_dir():
        continue

    src_lbl_dir = cls / SRC_LABEL_DIR
    if not src_lbl_dir.exists():
        continue

    dst_lbl_dir = cls / DST_LABEL_DIR
    dst_lbl_dir.mkdir(exist_ok=True)

    # Sorted so that any diagnostics below appear in a stable order.
    for lbl_path in sorted(src_lbl_dir.glob("*.txt")):
        with open(lbl_path, "r", encoding="utf-8") as f:
            lines = [ln.strip() for ln in f if ln.strip()]

        new_lines = []
        did_convert = False

        for ln in lines:
            parts = ln.split()
            class_id = parts[0]
            values = list(map(float, parts[1:]))

            if len(values) == 4:
                # Case 1: already "class xc yc w h" -> keep the line verbatim.
                new_lines.append(ln)
            elif len(values) >= 6 and len(values) % 2 == 0:
                # Case 2: polygon with at least three (x, y) points.
                xc, yc, w, h = polygon_to_bbox(values)
                new_lines.append(
                    f"{class_id} {xc:.6f} {yc:.6f} {w:.6f} {h:.6f}"
                )
                converted_boxes += 1
                did_convert = True
            else:
                # Neither shape: the line is dropped, but visibly, and it no
                # longer makes the file count as "converted".
                skipped_lines += 1
                print(f"Skipped malformed line ({len(values)} values) in: {lbl_path}")

        if not new_lines:
            # No file is written, so the matching image ends up without a
            # label in the output folder; remember it for the summary.
            files_without_output.append(lbl_path)
            continue

        out_path = dst_lbl_dir / lbl_path.name
        with open(out_path, "w", encoding="utf-8") as f:
            f.write("\n".join(new_lines) + "\n")

        if did_convert:
            converted_files += 1
        else:
            copied_bbox_files += 1


print(f"Files converted (seg‚Üíbbox): {converted_files}")
print(f"Bounding boxes created     : {converted_boxes}")
print(f"Files copied (already bbox): {copied_bbox_files}")
print(f"Output folder              : {DST_LABEL_DIR}")

# Extra diagnostics, shown only when something was dropped.
if skipped_lines:
    print(f"Malformed lines skipped    : {skipped_lines}")
if files_without_output:
    print(f"Files with no usable line  : {len(files_without_output)}")
    for lbl_path in files_without_output:
        print(f"   {lbl_path}")


Files converted (seg‚Üíbbox): 210
Bounding boxes created     : 370
Files copied (already bbox): 6952
Output folder              : labels_bbox


After execution, all annotation files are written to a new `labels_bbox/` directory for each class. Annotation files that were already in YOLO bounding box format are copied without modification, while segmentation-based annotations are converted into bounding boxes by computing the minimum enclosing rectangle of each polygon. The original annotation files in `labels/` remain untouched, ensuring a non-destructive preprocessing workflow and allowing safe rollback if needed.

In [6]:
# Every line of every generated label file must consist of exactly five
# whitespace-separated fields. The files are read as UTF-8, matching how the
# conversion cell writes them, and a failing assertion names the file and line.
for p in sorted(Path("dataset_working").rglob("labels_bbox/*.txt")):
    with open(p, encoding="utf-8") as f:
        for line_no, ln in enumerate(f, start=1):
            n_fields = len(ln.split())
            assert n_fields == 5, (
                f"{p}:{line_no}: expected 5 fields, found {n_fields}"
            )
print("\n‚úÖ Sanity check passed: All label files in 'labels_bbox' are in bounding box format.")


‚úÖ Sanity check passed: All label files in 'labels_bbox' are in bounding box format.


In [7]:
# Collect the label files that contain nothing but whitespace. The result is
# actually asserted on, so the success message only prints when the check holds.
empty_label_files = [
    p
    for p in Path("dataset_working").rglob("labels_bbox/*.txt")
    if p.read_text().strip() == ""
]
assert not empty_label_files, (
    f"{len(empty_label_files)} empty label file(s) in 'labels_bbox', "
    f"e.g. {empty_label_files[0]}"
)
print("\n‚úÖ Sanity check passed: No empty label files remain in 'labels_bbox'.")


‚úÖ Sanity check passed: No empty label files remain in 'labels_bbox'.


In [8]:
# Suffixes (compared lower-cased) that count as images; other files that may
# sit in images/ are ignored so they cannot cause a false mismatch.
IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}

# Number of class folders whose image stems and label stems differ.
errors = 0

for cls in sorted(Path("dataset_working").iterdir()):
    if not cls.is_dir():
        continue

    img_dir = cls / "images"
    lbl_dir = cls / "labels_bbox"
    if not img_dir.exists() or not lbl_dir.exists():
        continue

    img_stems = {
        p.stem
        for p in img_dir.iterdir()
        if p.is_file() and p.suffix.lower() in IMG_EXTS
    }
    lbl_stems = {p.stem for p in lbl_dir.glob("*.txt")}

    if img_stems != lbl_stems:
        print("‚ùå Mismatch in:", cls.name)
        # List the unpaired stems so the mismatch can be acted on.
        for stem in sorted(img_stems - lbl_stems):
            print("   image without label:", stem)
        for stem in sorted(lbl_stems - img_stems):
            print("   label without image:", stem)
        errors += 1

print("Errors found:", errors)
if errors == 0:
    print("\n‚úÖ Final sanity check passed: Images and labels are perfectly paired.")

‚ùå Mismatch in: 1 (tempe goreng)
Errors found: 1


7. Remove Duplicate Images

Duplicate images were identified using content-based hashing to detect exact binary duplicates within each class. An initial inspection phase was performed to verify duplicate groups without modifying the dataset, followed by a controlled removal step to eliminate redundant images.


In [9]:
# Working-copy root, the per-class label folder that must exist for a class
# to be inspected, and the suffixes (compared lower-cased) treated as images.
DATASET_ROOT = Path("dataset_working")
LABEL_DIR_NAME = "labels_bbox"
IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}


def file_hash(path: Path, chunk_size=8192):
    """Return the hexadecimal MD5 digest of a file's bytes.

    Args:
        path: File to read (opened in binary mode).
        chunk_size: Number of bytes requested per read.

    Returns:
        The digest as a hexadecimal string.
    """
    digest = hashlib.md5()
    with open(path, "rb") as stream:
        # Keep reading until read() hands back an empty bytes object.
        for block in iter(lambda: stream.read(chunk_size), b""):
            digest.update(block)
    return digest.hexdigest()


# Totals for the summary: groups of identical images, and how many images
# would go if one copy per group were kept.
total_dup_groups = 0
total_dup_images = 0

for cls in sorted(DATASET_ROOT.iterdir()):
    if not cls.is_dir():
        continue

    img_dir, lbl_dir = cls / "images", cls / LABEL_DIR_NAME
    if not (img_dir.exists() and lbl_dir.exists()):
        continue

    # Bucket the image files of this class by content digest.
    hash_map = defaultdict(list)
    for img in img_dir.iterdir():
        if img.suffix.lower() not in IMG_EXTS:
            continue
        hash_map[file_hash(img)].append(img)

    # A bucket holding more than one file is a duplicate group.
    dup_groups = [imgs for imgs in hash_map.values() if len(imgs) > 1]
    if not dup_groups:
        continue

    print(f"\nClass: {cls.name}")
    for group in dup_groups:
        print("  Duplicate group:")
        for img in group:
            print("   ", img.name)

    total_dup_groups += len(dup_groups)
    total_dup_images += sum(len(group) - 1 for group in dup_groups)

# Report only; this cell removes nothing.
print("\nüîé CHECK SUMMARY - NO DELETION")
print(f"Duplicate groups found : {total_dup_groups}")
print(f"Images to be removed   : {total_dup_images}")
print("\n‚ö†Ô∏è No files were deleted.")



Class: 11 (mie goreng)
  Duplicate group:
    143_233273.jpg
    143_271148.jpg

Class: 12 (opor ayam)
  Duplicate group:
    Oporayamdantelur193_jpeg.rf.b83d1981aae7af70b425434c091a7b0b.jpg
    Oporayamdantelur193_jpeg_jpg.rf.ff79fd1613dbaa8df4c6ae0bf16c8919.jpg

Class: 13 (nasi goreng)
  Duplicate group:
    292731.jpg
    292769.jpg

Class: 14 (bubur ayam)
  Duplicate group:
    249097.jpg
    249119.jpg

Class: 16 (mie ayam)
  Duplicate group:
    319405.jpg
    319409.jpg
    319412.jpg
    319639.jpg

Class: 17 (nasi padang)
  Duplicate group:
    247871.jpg
    269533.jpg

Class: 25 (nasi putih)
  Duplicate group:
    10645.jpg
    9117.jpg
  Duplicate group:
    11156.jpg
    11161.jpg

Class: 36 (ikan goreng)
  Duplicate group:
    Ikan-Goreng_419_jpg.rf.322abb1764ba684524eb45af5de518bb.jpg
    ikan_train-10-_jpg.rf.8b247a7eae1531418b6d430ca8c72057.jpg

üîé CHECK SUMMARY - NO DELETION
Duplicate groups found : 9
Images to be removed   : 11

‚ö†Ô∏è No files were deleted.


After the duplicate inspection step, all exact duplicate image groups were reported without removing any files. This allowed manual verification of detected duplicates and ensured that only true redundancies were targeted in the subsequent deletion stage.


In [10]:
# Working-copy root, the per-class label folder from which labels of removed
# images are deleted, and the suffixes (compared lower-cased) treated as images.
DATASET_ROOT = Path("dataset_working")
LABEL_DIR_NAME = "labels_bbox"
IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}


def file_hash(path: Path, chunk_size=8192):
    """Compute the MD5 hex digest of the file at ``path``.

    The file is opened in binary mode and consumed ``chunk_size`` bytes at a
    time, so its size does not determine how much is held in memory at once.
    """
    hasher = hashlib.md5()
    with open(path, "rb") as handle:
        while True:
            piece = handle.read(chunk_size)
            if not piece:
                # An empty read marks the end of the file.
                break
            hasher.update(piece)
    return hasher.hexdigest()


# Running totals of files removed across all class folders.
deleted_images = 0
deleted_labels = 0

for cls in sorted(DATASET_ROOT.iterdir()):
    if not cls.is_dir():
        continue

    img_dir = cls / "images"
    lbl_dir = cls / LABEL_DIR_NAME
    if not img_dir.exists() or not lbl_dir.exists():
        continue

    # Bucket the images of this class by content digest. The listing is
    # sorted so that the first entry of every bucket - the copy that is kept -
    # does not depend on the order in which the filesystem lists files.
    hash_map = defaultdict(list)
    for img in sorted(img_dir.iterdir()):
        if img.is_file() and img.suffix.lower() in IMG_EXTS:
            hash_map[file_hash(img)].append(img)

    for imgs in hash_map.values():
        # imgs[0] survives; every further file with the same digest is
        # removed together with its label (when one exists).
        for dup_img in imgs[1:]:
            lbl_path = lbl_dir / f"{dup_img.stem}.txt"

            print(f"‚ùå Deleting duplicate image: {dup_img}")
            dup_img.unlink()
            deleted_images += 1

            if lbl_path.exists():
                print(f"   ‚îî‚îÄ deleting label: {lbl_path}")
                lbl_path.unlink()
                deleted_labels += 1

print(f"Images deleted : {deleted_images}")
print(f"Labels deleted : {deleted_labels}")

‚ùå Deleting duplicate image: dataset_working\11 (mie goreng)\images\143_271148.jpg
   ‚îî‚îÄ deleting label: dataset_working\11 (mie goreng)\labels_bbox\143_271148.txt
‚ùå Deleting duplicate image: dataset_working\12 (opor ayam)\images\Oporayamdantelur193_jpeg_jpg.rf.ff79fd1613dbaa8df4c6ae0bf16c8919.jpg
   ‚îî‚îÄ deleting label: dataset_working\12 (opor ayam)\labels_bbox\Oporayamdantelur193_jpeg_jpg.rf.ff79fd1613dbaa8df4c6ae0bf16c8919.txt
‚ùå Deleting duplicate image: dataset_working\13 (nasi goreng)\images\292769.jpg
   ‚îî‚îÄ deleting label: dataset_working\13 (nasi goreng)\labels_bbox\292769.txt
‚ùå Deleting duplicate image: dataset_working\14 (bubur ayam)\images\249119.jpg
   ‚îî‚îÄ deleting label: dataset_working\14 (bubur ayam)\labels_bbox\249119.txt
‚ùå Deleting duplicate image: dataset_working\16 (mie ayam)\images\319409.jpg
   ‚îî‚îÄ deleting label: dataset_working\16 (mie ayam)\labels_bbox\319409.txt
‚ùå Deleting duplicate image: dataset_working\16 (mie ayam)\images\319412.j

Following verification, confirmed duplicate images were removed while retaining a single representative copy per duplicate group. This reduced dataset redundancy without altering class semantics or introducing annotation inconsistencies.


In [11]:
# Number of groups of byte-identical files still present after the removal step.
remaining_duplicates = 0

for cls in sorted(Path("dataset_working").iterdir()):
    if not cls.is_dir():
        continue

    img_dir = cls / "images"
    if not img_dir.exists():
        # A class folder without images/ has nothing to check.
        continue

    hashes = defaultdict(list)

    for img in img_dir.iterdir():
        if not img.is_file():
            # Sub-directories cannot be read as bytes.
            continue
        h = hashlib.md5(img.read_bytes()).hexdigest()
        hashes[h].append(img)

    for h, imgs in hashes.items():
        if len(imgs) > 1:
            print("‚ùå Duplicate remains in:", cls.name)
            remaining_duplicates += 1

# The success message is only printed when no duplicate group was found.
if remaining_duplicates == 0:
    print("\n‚úÖ Final sanity check passed: No duplicate images remain.")
else:
    print(f"\nDuplicate groups still present: {remaining_duplicates}")


‚úÖ Final sanity check passed: No duplicate images remain.


8.  Multi-Class Filtering

Some images contained annotations from multiple food categories. To maintain a consistent single-class detection setting aligned with the dataset organization, a filtering strategy was applied whereby only the annotation corresponding to the folder's class label was retained, while other annotations were removed. This approach simplifies the detection task and ensures cleaner evaluation, while preserving image diversity. Potential bias introduced by removing contextual co-occurrence information is acknowledged.

In [12]:
# Working-copy root, the per-class label folder that is audited, and the
# image suffixes tried when looking up the image that belongs to a label file.
DATASET_ROOT = Path("dataset_working")
LABEL_DIR_NAME = "labels_bbox"
IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}


def get_folder_class_id(folder_name: str) -> int:
    """Return the integer formed by the digits that open ``folder_name``.

    Leading whitespace is allowed before the digits.

    Raises:
        ValueError: If the name does not start with (optional whitespace and)
            at least one digit.
    """
    if (leading_digits := re.match(r"\s*(\d+)", folder_name)) is None:
        raise ValueError(f"Cannot extract class ID from folder name: {folder_name}")
    return int(leading_digits[1])

def find_image(img_dir: Path, stem: str, exts=None):
    """Locate the image file in ``img_dir`` whose stem equals ``stem``.

    The suffix is compared case-insensitively, in line with the
    ``suffix.lower() in IMG_EXTS`` test used elsewhere in this notebook, so a
    file such as ``photo.JPG`` is found as well. When several files qualify,
    the one that sorts first is returned, which keeps the result stable
    between runs.

    Args:
        img_dir: Directory to search (not recursive).
        stem: File name without its final suffix.
        exts: Optional iterable of accepted suffixes including the dot;
            defaults to the module-level ``IMG_EXTS``.

    Returns:
        The path of the matching image, or ``None`` when the directory does
        not exist or holds no match.
    """
    allowed = {ext.lower() for ext in (IMG_EXTS if exts is None else exts)}
    if not img_dir.is_dir():
        return None

    candidates = sorted(
        p
        for p in img_dir.iterdir()
        if p.is_file() and p.stem == stem and p.suffix.lower() in allowed
    )
    return candidates[0] if candidates else None


print("üîç MULTI-CLASS LABEL AUDIT (READ-ONLY)\n")

# Number of label files that mention more than one class ID.
count = 0

for cls in sorted(DATASET_ROOT.iterdir()):
    if not cls.is_dir():
        continue

    # Folders whose name does not open with a number are reported and skipped.
    try:
        folder_class_id = get_folder_class_id(cls.name)
    except ValueError:
        print(f"‚ö† Skipping folder (cannot parse class ID): {cls.name}")
        continue

    lbl_dir, img_dir = cls / LABEL_DIR_NAME, cls / "images"
    if not (lbl_dir.exists() and img_dir.exists()):
        continue

    for lbl_path in lbl_dir.glob("*.txt"):
        with open(lbl_path, "r", encoding="utf-8") as f:
            lines = [ln.strip() for ln in f if ln.strip()]

        # Distinct class IDs among the five-field lines of this file.
        class_ids = {
            int(parts[0])
            for parts in (ln.split() for ln in lines)
            if len(parts) == 5
        }
        if len(class_ids) <= 1:
            continue

        img_path = find_image(img_dir, lbl_path.stem)

        print("=" * 80)
        print(f"Label file      : {lbl_path}")
        print(f"Image file      : {img_path}")
        print(f"Folder class ID : {folder_class_id}")
        print(f"Label class IDs : {sorted(class_ids)}")

        count += 1

print("\n==============================================")
print(f"Total multi-class samples found: {count}")
print("==============================================")


üîç MULTI-CLASS LABEL AUDIT (READ-ONLY)

Label file      : dataset_working\1 (tempe goreng)\labels_bbox\bf895418c287c87b3612fa457cece82e-tempe-menu_jpg.rf.da443c66058e2e911d8ffb4f5dcd8a88.txt
Image file      : dataset_working\1 (tempe goreng)\images\bf895418c287c87b3612fa457cece82e-tempe-menu_jpg.rf.da443c66058e2e911d8ffb4f5dcd8a88.jpg
Folder class ID : 1
Label class IDs : [7, 9]
Label file      : dataset_working\1 (tempe goreng)\labels_bbox\jadi_jpg.rf.a8bc486d4eec32fb5332a85bd7c69043.txt
Image file      : dataset_working\1 (tempe goreng)\images\jadi_jpg.rf.a8bc486d4eec32fb5332a85bd7c69043.jpg
Folder class ID : 1
Label class IDs : [7, 9]
Label file      : dataset_working\1 (tempe goreng)\labels_bbox\maxresdefault-3-_jpg.rf.dd168d6ee786ac6b2107cfbe208be17d.txt
Image file      : dataset_working\1 (tempe goreng)\images\maxresdefault-3-_jpg.rf.dd168d6ee786ac6b2107cfbe208be17d.jpg
Folder class ID : 1
Label class IDs : [7, 9]
Label file      : dataset_working\1 (tempe goreng)\labels_bbox\t

In [13]:
# Working-copy root and the per-class label folder whose files are filtered.
DATASET_ROOT = Path("dataset_working")
LABEL_DIR_NAME = "labels_bbox"

# Label file name -> class ID to keep in that file. Consulted first, before
# any folder-level rule.
FILE_LEVEL_OVERRIDES = {
    "43_jpg.rf.7dac2bcf4e71f0deec92607955321cef.txt": 16,
    "2ea631c5d5fce7360cc6d23d0c031d9e_jpg.rf.6d85978f418c408e8a747c18542195ec.txt": 9,
    "Ikan-Mujair-Goreng_jpeg_jpg.rf.be143d519be02f3f25c1a482cf1f8e3f.txt": 8,
}

# Numeric folder prefix -> class ID to keep for multi-class files in that
# folder. Folders not listed here fall back to their own numeric prefix.
# NOTE(review): the values look like class IDs from the original annotation
# export rather than folder numbers - confirm against the audit output.
FOLDER_CLASS_OVERRIDE = {
    1: 9,
    12: 25,
    17: 7,
    2: 8,
    21: 9,
    23: 8,
    31: 22,
    34: 17,
    36: 4,
    39: 2,
    42: 29,
    5: 10,
}


def get_folder_class_id(folder_name: str) -> int:
    """Parse the number at the start of a class folder name.

    Whitespace in front of the digits is ignored.

    Raises:
        ValueError: When no digits are found at the start of the name.
    """
    found = re.match(r"\s*(\d+)", folder_name)
    if found is not None:
        return int(found.group(1))
    raise ValueError(f"Cannot parse class ID from folder name: {folder_name}")


# Number of label files rewritten by this cell.
modified_files = 0

for cls in sorted(DATASET_ROOT.iterdir()):
    if not cls.is_dir():
        continue

    # Folders whose name does not start with a number are skipped silently.
    try:
        folder_class_id = get_folder_class_id(cls.name)
    except ValueError:
        continue

    lbl_dir = cls / LABEL_DIR_NAME
    if not lbl_dir.exists():
        continue

    for lbl_path in lbl_dir.glob("*.txt"):
        with open(lbl_path, "r", encoding="utf-8") as f:
            lines = [ln.strip() for ln in f if ln.strip()]

        # (class_id, original line) for every five-field line, plus the set
        # of distinct class IDs seen. Lines with another field count are
        # ignored here and are therefore absent from a rewritten file.
        parsed = []
        class_ids = set()

        for ln in lines:
            parts = ln.split()
            if len(parts) != 5:
                continue
            cid = int(parts[0])
            parsed.append((cid, ln))
            class_ids.add(cid)

        # Files that mention at most one class are left untouched.
        if len(class_ids) <= 1:
            continue

        # Pick the class to keep: file-level override first, then the
        # folder-level override, otherwise the folder's own numeric prefix.
        if lbl_path.name in FILE_LEVEL_OVERRIDES:
            keep_id = FILE_LEVEL_OVERRIDES[lbl_path.name]
            reason = "FILE override"
        elif folder_class_id in FOLDER_CLASS_OVERRIDE:
            keep_id = FOLDER_CLASS_OVERRIDE[folder_class_id]
            reason = "FOLDER override"
        else:
            keep_id = folder_class_id
            reason = "FOLDER default"

        kept_lines = [ln for cid, ln in parsed if cid == keep_id]

        # When no line carries the chosen class the file is only reported;
        # it is not rewritten and keeps all of its class IDs.
        if not kept_lines:
            print(f"‚ö† No matching class kept for: {lbl_path}")
            continue

        # Overwrite the label file in place with the kept lines only.
        with open(lbl_path, "w", encoding="utf-8") as f:
            f.write("\n".join(kept_lines) + "\n")

        print(f"‚úî Filtered {lbl_path}")
        print(f"  ‚îî kept class {keep_id} ({reason}), removed {class_ids - {keep_id}}")

        modified_files += 1

print(f"Label files modified: {modified_files}")


‚úî Filtered dataset_working\1 (tempe goreng)\labels_bbox\bf895418c287c87b3612fa457cece82e-tempe-menu_jpg.rf.da443c66058e2e911d8ffb4f5dcd8a88.txt
  ‚îî kept class 9 (FOLDER override), removed {7}
‚úî Filtered dataset_working\1 (tempe goreng)\labels_bbox\jadi_jpg.rf.a8bc486d4eec32fb5332a85bd7c69043.txt
  ‚îî kept class 9 (FOLDER override), removed {7}
‚úî Filtered dataset_working\1 (tempe goreng)\labels_bbox\maxresdefault-3-_jpg.rf.dd168d6ee786ac6b2107cfbe208be17d.txt
  ‚îî kept class 9 (FOLDER override), removed {7}
‚úî Filtered dataset_working\1 (tempe goreng)\labels_bbox\tumblr-inline-nmh2mvoii41qac5yq-1280-09d797eae8926332657eb07fde9a014a_600x400_jpg.rf.9c46fd37aff93609b39d9b77c17dc630.txt
  ‚îî kept class 9 (FOLDER override), removed {7}
‚úî Filtered dataset_working\12 (opor ayam)\labels_bbox\10_jpg.rf.27d932f8b5d7b8f019d154a743890dc1.txt
  ‚îî kept class 25 (FOLDER override), removed {16}
‚úî Filtered dataset_working\12 (opor ayam)\labels_bbox\12_jpg.rf.17d78f930e3df316243c57b9435

9. Remap Label class IDs to Folder ID

Class identifiers in all YOLO label files were normalized to zero-based indexing to comply with the YOLO annotation specification. The class ID for each annotation was derived from the numeric prefix of its corresponding class folder, ensuring consistency between directory structure and label contents. 

During this process, a canonical class mapping file (`classes.txt`) was generated to record the correspondence between YOLO class IDs and their semantic class names. This mapping provides a persistent reference for training configuration, evaluation, and reproducibility.

In [14]:
# Working-copy root, the per-class label folder whose files are remapped,
# and the path the class-ID-to-name mapping is written to.
DATASET_ROOT = Path("dataset_working")
LABEL_DIR_NAME = "labels_bbox"
CLASS_MAP_FILE = Path("classes.txt")


def parse_folder_class(folder_name: str):
    """Split a class folder name into a zero-based class ID and a class name.

    The name is expected to open with a number, optionally followed by a
    label in parentheses, e.g. ``"1 (tempe goreng)"``.

    Returns:
        Tuple ``(yolo_id, class_name)`` where ``yolo_id`` is the leading
        number minus one and ``class_name`` is the stripped parenthesised
        text, or ``class_<number>`` when no parenthesised text is present.

    Raises:
        ValueError: If the name does not start with a number, or if the
            number is 0 (which would give a negative class ID).
    """
    parsed = re.match(r"\s*(\d+)\s*(?:\((.+)\))?", folder_name)
    if parsed is None:
        raise ValueError(f"Cannot parse class from folder name: {folder_name}")

    number_text, label_text = parsed.groups()
    folder_id = int(number_text)

    # Folder numbers start at 1; class IDs in the label files start at 0.
    yolo_id = folder_id - 1
    if yolo_id < 0:
        raise ValueError(f"Invalid class ID after conversion: {folder_name}")

    class_name = label_text or f"class_{folder_id}"
    return yolo_id, class_name.strip()


print("üîß Remapping labels to 0-based YOLO class IDs\n")

# yolo_id -> class name, collected from the folder names.
class_id_to_name = {}
# Files rewritten, and annotation lines whose class ID actually changed.
changed_files = 0
changed_lines = 0

for cls in sorted(DATASET_ROOT.iterdir()):
    if not cls.is_dir():
        continue

    try:
        yolo_id, class_name = parse_folder_class(cls.name)
    except ValueError as e:
        print(f"‚ö† Skipped: {e}")
        continue

    # Recorded even when the folder turns out to have no label directory.
    class_id_to_name[yolo_id] = class_name

    lbl_dir = cls / LABEL_DIR_NAME
    if not lbl_dir.exists():
        continue

    for lbl_path in lbl_dir.glob("*.txt"):
        new_lines = []
        modified = False

        # Every five-field line receives this folder's class ID, whatever ID
        # it carried before; lines with another field count are left out.
        for ln in lbl_path.read_text(encoding="utf-8").splitlines():
            parts = ln.split()
            if len(parts) == 5:
                if int(parts[0]) != yolo_id:
                    modified = True
                    changed_lines += 1
                new_lines.append(" ".join([str(yolo_id)] + parts[1:]))

        # The file is rewritten only when at least one class ID changed.
        if not modified:
            continue

        lbl_path.write_text("\n".join(new_lines) + "\n", encoding="utf-8")
        changed_files += 1
        print(f"‚úî Remapped: {lbl_path}")


# Class map: one "<id> <name>" line per class, spaces in names replaced by
# underscores, ordered by class ID.
print("\nüìù Writing class mapping file:", CLASS_MAP_FILE)

with open(CLASS_MAP_FILE, "w", encoding="utf-8") as f:
    for cid, raw_name in sorted(class_id_to_name.items()):
        name = raw_name.replace(" ", "_")
        f.write(f"{cid} {name}\n")


# Summary
print("\n‚úÖ REMAPPING COMPLETE")
print(f"Files changed : {changed_files}")
print(f"Labels fixed  : {changed_lines}")
print(f"Classes saved : {len(class_id_to_name)}")
print(f"Class map     : {CLASS_MAP_FILE}")

üîß Remapping labels to 0-based YOLO class IDs

‚úî Remapped: dataset_working\1 (tempe goreng)\labels_bbox\044589900_1546677752-resep-tempe-goreng-sederhana-tanpa-tepung-gurih-nan-lezat_jpg.rf.d7c7a4527be4d6fe774a6230210fbce0.txt
‚úî Remapped: dataset_working\1 (tempe goreng)\labels_bbox\12_jpg.rf.7cc8a6c05fc2ebbb5d16f35f77d14dc6.txt
‚úî Remapped: dataset_working\1 (tempe goreng)\labels_bbox\192-tempe-goreng-selimut-tempe-goreng-tepung-gorengan-tempe-foto-resep-utama_jpg.rf.2e892737748b0c1e782ff9e0970cc1b9.txt
‚úî Remapped: dataset_working\1 (tempe goreng)\labels_bbox\20200716155723-4227-resep-tempe-goreng-gurihhh_jpg.rf.a53d0e1f7fc2ea35760ed88d1705c8aa.txt
‚úî Remapped: dataset_working\1 (tempe goreng)\labels_bbox\3d79eb1d39942565c0960cc68c13d80a_jpg.rf.2f57d9922df418dc8220a4715ebcb430.txt
‚úî Remapped: dataset_working\1 (tempe goreng)\labels_bbox\42104_tempe-goreng-bumbu-kunyit_jpg.rf.82b92d4b47b36f99b5003bd545a55356.txt
‚úî Remapped: dataset_working\1 (tempe goreng)\labels_bbox\724

10. Dataset Integrity Check (Re-EDA)

A second exploratory analysis was conducted after preprocessing to assess the effective dataset distribution. No additional dataset balancing or filtering was applied, as class frequencies, image resolutions, and bounding box scales were within acceptable ranges. Images were retained at their original resolutions to preserve visual detail.

In [15]:
# Overall totals plus the names of the class folders that were counted.
total_images = 0
total_boxes = 0
class_names = []

for cls in DATASET_ROOT.iterdir():
    img_dir, lbl_dir = cls / "images", cls / "labels_bbox"
    # Count only class directories that contain both sub-folders.
    if not (cls.is_dir() and img_dir.exists() and lbl_dir.exists()):
        continue

    images = [p for p in img_dir.iterdir() if p.suffix.lower() in IMG_EXTS]
    labels = list(lbl_dir.glob("*.txt"))

    class_names.append(cls.name)
    total_images += len(images)
    # One bounding box per line of the stripped label text.
    total_boxes += sum(
        len(lbl.read_text().strip().splitlines()) for lbl in labels
    )

print("=== DATASET SUMMARY ===")
print("Classes       :", len(class_names))
print("Images        :", total_images)
print("Bounding boxes:", total_boxes)

=== DATASET SUMMARY ===
Classes       : 42
Images        : 7151
Bounding boxes: 11672


10.1 Image per Class

In [16]:
# One record per class directory that has an images/ folder: the folder name
# and the number of files in it with an image suffix.
class_image_counts = [
    {
        "class": cls.name,
        "images": sum(
            1
            for p in (cls / "images").iterdir()
            if p.suffix.lower() in IMG_EXTS
        ),
    }
    for cls in DATASET_ROOT.iterdir()
    if cls.is_dir() and (cls / "images").exists()
]

# Largest classes first.
df_images_per_class = pd.DataFrame(class_image_counts).sort_values(
    "images", ascending=False
)

display(df_images_per_class.style.hide(axis="index"))


class,images
6 (bakso),409
5 (sate),361
38 (martabak manis),343
13 (nasi goreng),326
3 (rendang),299
11 (mie goreng),240
12 (opor ayam),233
25 (nasi putih),230
37 (telur rebus),200
24 (telur ceplok),199


10.2 Bounding Box per Class

In [17]:
# One record per class directory that has a labels_bbox/ folder: the folder
# name and the total number of lines across its stripped label files.
class_box_counts = [
    {
        "class": cls.name,
        "boxes": sum(
            len(lbl.read_text().strip().splitlines())
            for lbl in (cls / "labels_bbox").glob("*.txt")
        ),
    }
    for cls in DATASET_ROOT.iterdir()
    if cls.is_dir() and (cls / "labels_bbox").exists()
]

# Classes with the most boxes first.
df_boxes_per_class = pd.DataFrame(class_box_counts).sort_values(
    "boxes", ascending=False
)

display(df_boxes_per_class.style.hide(axis="index"))


class,boxes
28 (kue cubit),1193
27 (putu ayu),1169
6 (bakso),614
40 (tempe bacem),568
38 (martabak manis),526
2 (tahu goreng),423
37 (telur rebus),422
1 (tempe goreng),418
5 (sate),387
26 (dadar gulung),341


10.3 Image Resolution Distribution

In [18]:
# (width, height) -> number of images with that resolution.
resolution_counter = Counter()

for cls in DATASET_ROOT.iterdir():
    img_dir = cls / "images"
    if not (cls.is_dir() and img_dir.exists()):
        continue

    for img_path in img_dir.iterdir():
        if img_path.suffix.lower() in IMG_EXTS:
            img = cv2.imread(str(img_path))
            # Files that cv2.imread cannot load (it returns None) are left out.
            if img is not None:
                height, width = img.shape[:2]
                resolution_counter[(width, height)] += 1

# One row per distinct resolution, most frequent first.
resolution_rows = [
    {"width": width, "height": height, "count": occurrences}
    for (width, height), occurrences in resolution_counter.items()
]
df_resolutions = pd.DataFrame(resolution_rows).sort_values(
    "count", ascending=False
)

display(df_resolutions.head(20).style.hide(axis="index"))


width,height,count
640,640,480
500,500,480
800,600,345
751,532,322
577,433,301
512,512,161
500,375,155
680,482,148
1280,720,106
1200,630,93


10.4 Box Size Distribution (relative area)

In [19]:
# width * height of every five-field annotation line (the last two fields).
box_areas = []

for cls in DATASET_ROOT.iterdir():
    lbl_dir = cls / "labels_bbox"
    if not (cls.is_dir() and lbl_dir.exists()):
        continue

    for lbl in lbl_dir.glob("*.txt"):
        for ln in lbl.read_text().splitlines():
            parts = ln.split()
            if len(parts) == 5:
                # All five fields are converted; only the last two are used.
                fields = [float(token) for token in parts]
                box_areas.append(fields[3] * fields[4])

df_box_area = pd.DataFrame({"relative_area": box_areas})

print(df_box_area.describe())

       relative_area
count   11672.000000
mean        0.365508
std         0.306115
min         0.000192
25%         0.092736
50%         0.249123
75%         0.647175
max         1.000000


Following preprocessing and label normalization, exploratory analysis was repeated on the cleaned dataset to verify class distributions, annotation consistency, and overall data integrity. The re-analysis confirmed that all labels conform to the YOLO format with zero-based class indexing and that no structural or annotation anomalies remain.

11. Splitting Train/Val/Test

The dataset was split into training, validation, and test sets using a 70/15/15 ratio. Stratified sampling was applied at the class level to ensure that each food category was represented across all subsets. A fixed random seed (42) was used to ensure reproducibility.

In [20]:
# Per-class source tree and the flat train/val/test tree that is produced.
SRC_ROOT = Path("dataset_working")
DST_ROOT = Path("dataset_final")

IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}
SEED = 42

TRAIN_RATIO = 0.70
VAL_RATIO   = 0.15
TEST_RATIO  = 0.15  # informational: the test split receives whatever train and val leave over

# The three ratios are meant to describe a full partition.
assert abs(TRAIN_RATIO + VAL_RATIO + TEST_RATIO - 1.0) < 1e-9, "split ratios must sum to 1"

random.seed(SEED)

# CREATE YOLO DIR STRUCTURE
# =========================
for split in ["train", "val", "test"]:
    (DST_ROOT / split / "images").mkdir(parents=True, exist_ok=True)
    (DST_ROOT / split / "labels").mkdir(parents=True, exist_ok=True)

print("‚úÇÔ∏è DATASET SPLIT ‚Äî YOLO FORMAT (70/15/15, Stratified per class)\n")

total_counts = {"train": 0, "val": 0, "test": 0}

# All classes are flattened into the same images/ and labels/ folders, so two
# files sharing a stem would overwrite each other. Remember which class
# supplied each stem and refuse the second one instead.
seen_stems = {}
skipped_collisions = 0

# SPLIT PER CLASS (STRATIFIED)
# =========================
for cls in sorted(SRC_ROOT.iterdir()):
    if not cls.is_dir():
        continue

    img_dir = cls / "images"
    lbl_dir = cls / "labels_bbox"

    if not img_dir.exists() or not lbl_dir.exists():
        continue

    # Collect image/label pairs in a fixed (name-sorted) order. A seeded
    # shuffle is only reproducible if its input order is; the order in which
    # iterdir() lists files depends on the filesystem.
    pairs = []
    for img_path in sorted(img_dir.iterdir(), key=lambda p: p.name):
        if img_path.suffix.lower() not in IMG_EXTS:
            continue

        lbl_path = lbl_dir / f"{img_path.stem}.txt"
        if lbl_path.exists():
            pairs.append((img_path, lbl_path))

    if not pairs:
        continue

    random.shuffle(pairs)

    # Train and val sizes are rounded down; the remainder goes to test.
    n = len(pairs)
    n_train = int(n * TRAIN_RATIO)
    n_val   = int(n * VAL_RATIO)

    splits = {
        "train": pairs[:n_train],
        "val":   pairs[n_train:n_train + n_val],
        "test":  pairs[n_train + n_val:]
    }

    print(
        f"Class {cls.name}: {n} ‚Üí "
        f"train={len(splits['train'])}, "
        f"val={len(splits['val'])}, "
        f"test={len(splits['test'])}"
    )

    for split_name, items in splits.items():
        for img_path, lbl_path in items:
            previous_owner = seen_stems.get(img_path.stem)
            if previous_owner is not None:
                print(f"  ! Skipped {img_path}: stem already used by {previous_owner}")
                skipped_collisions += 1
                continue
            seen_stems[img_path.stem] = cls.name

            shutil.copy2(
                img_path,
                DST_ROOT / split_name / "images" / img_path.name
            )
            shutil.copy2(
                lbl_path,
                DST_ROOT / split_name / "labels" / lbl_path.name
            )
            total_counts[split_name] += 1


print("\n‚úÖ SPLIT COMPLETE (YOLO READY)")
print(f"Train images : {total_counts['train']}")
print(f"Val images   : {total_counts['val']}")
print(f"Test images  : {total_counts['test']}")
print(f"Random seed  : {SEED}")
if skipped_collisions:
    print(f"Skipped (name collisions): {skipped_collisions}")


‚úÇÔ∏è DATASET SPLIT ‚Äî YOLO FORMAT (70/15/15, Stratified per class)

Class 1 (tempe goreng): 90 ‚Üí train=62, val=13, test=15
Class 10 (gado gado): 105 ‚Üí train=73, val=15, test=17
Class 11 (mie goreng): 240 ‚Üí train=168, val=36, test=36
Class 12 (opor ayam): 233 ‚Üí train=163, val=34, test=36
Class 13 (nasi goreng): 326 ‚Üí train=228, val=48, test=50
Class 14 (bubur ayam): 107 ‚Üí train=74, val=16, test=17
Class 15 (cakwe): 135 ‚Üí train=94, val=20, test=21
Class 16 (mie ayam): 104 ‚Üí train=72, val=15, test=17
Class 17 (nasi padang): 147 ‚Üí train=102, val=22, test=23
Class 18 (babi guling): 108 ‚Üí train=75, val=16, test=17
Class 19 (nasi uduk): 106 ‚Üí train=74, val=15, test=17
Class 2 (tahu goreng): 106 ‚Üí train=74, val=15, test=17
Class 20 (nasi babi campur): 117 ‚Üí train=81, val=17, test=19
Class 21 (ayam pop): 111 ‚Üí train=77, val=16, test=18
Class 22 (telur balado): 186 ‚Üí train=130, val=27, test=29
Class 23 (telur dadar): 114 ‚Üí train=79, val=17, test=18
Class 24 (te