# Split the Food-101 Dataset into Train and Test Sets Following the README Instructions

In [1]:
import os
import shutil

# === Path settings ===
# Folder of the Food-101 dataset; the "images" and "meta" sub-folders are read from it.
root = r"C:/Users/User/PycharmProjects/PythonProject/FYP/FoodWasteEstimator/food-101"   # change this if needed
images_dir, meta_dir = (os.path.join(root, sub) for sub in ("images", "meta"))

# The converted dataset is written inside the dataset root, one folder per split.
output_base = os.path.join(root, "food101_yolo")
train_dir, val_dir = (os.path.join(output_base, sub) for sub in ("train", "val"))

# Make sure both split folders exist before anything is copied into them.
for split_folder in (train_dir, val_dir):
    os.makedirs(split_folder, exist_ok=True)

# helper to read split list
def read_split(file_path):
    """Read a split manifest and return its entries.

    Each line is returned with surrounding whitespace removed. Blank lines
    (for example an empty line at the end of the file) are dropped so that
    callers never receive empty entries.

    Args:
        file_path: Path of a text file that lists one entry per line.

    Returns:
        A list of the stripped, non-empty lines in file order.
    """
    # An explicit encoding keeps the result independent of the platform's
    # default text codec.
    with open(file_path, "r", encoding="utf-8") as f:
        stripped = (line.strip() for line in f)
        return [entry for entry in stripped if entry]

# Load the official split manifests from the meta folder.
train_manifest = os.path.join(meta_dir, "train.txt")
test_manifest = os.path.join(meta_dir, "test.txt")
train_list = read_split(train_manifest)
test_list = read_split(test_manifest)  # the test split is treated as validation data

def copy_images(split_list, split_name, out_dir, images_root=None):
    """Copy the images named in a split manifest into per-class folders.

    Every entry is expected to look like ``"<class>/<image_id>"``. The file
    ``<images_root>/<class>/<image_id>.jpg`` is copied into
    ``<out_dir>/<class>/``. Entries that are blank or malformed, and entries
    whose source image does not exist, are skipped and counted; the count is
    printed at the end instead of aborting the run or passing unnoticed.

    Args:
        split_list: Sequence of manifest entries (its length is used for the
            progress messages).
        split_name: Label shown as a prefix in progress messages.
        out_dir: Destination root; one sub-folder per class is created in it.
        images_root: Folder holding the per-class source images. When omitted,
            the module-level ``images_dir`` is used.
    """
    if images_root is None:
        images_root = images_dir

    total = len(split_list)
    ready_dirs = set()  # class folders already created during this call
    skipped = 0

    for i, item in enumerate(split_list):
        parts = item.strip().split('/')
        if len(parts) == 2 and all(parts):
            cls, img_id = parts
            dst_dir = os.path.join(out_dir, cls)
            # Create each class folder once instead of once per image.
            if dst_dir not in ready_dirs:
                os.makedirs(dst_dir, exist_ok=True)
                ready_dirs.add(dst_dir)
            src = os.path.join(images_root, cls, f"{img_id}.jpg")
            if os.path.exists(src):
                shutil.copy(src, dst_dir)
            else:
                skipped += 1
        else:
            # Blank or malformed line: nothing to copy.
            skipped += 1
        if (i + 1) % 500 == 0:
            print(f"[{split_name}] Copied {i+1}/{total} images...")

    if skipped:
        print(f"[{split_name}] Skipped {skipped} entries (malformed or missing source image).")

# Copy the training split first, then the validation split.
print("Copying training images...")
copy_images(train_list, "train", train_dir)

# The official test list is reused as the validation split.
print("Copying validation images (from official test.txt)...")
copy_images(test_list, "val", val_dir)

# The check-mark emoji had been stored as mis-decoded bytes; restored here.
print("✅ Done! YOLO dataset created at:", output_base)


📦 Copying training images...
[train] Copied 500/75750 images...
[train] Copied 1000/75750 images...
[train] Copied 1500/75750 images...
[train] Copied 2000/75750 images...
[train] Copied 2500/75750 images...
[train] Copied 3000/75750 images...
[train] Copied 3500/75750 images...
[train] Copied 4000/75750 images...
[train] Copied 4500/75750 images...
[train] Copied 5000/75750 images...
[train] Copied 5500/75750 images...
[train] Copied 6000/75750 images...
[train] Copied 6500/75750 images...
[train] Copied 7000/75750 images...
[train] Copied 7500/75750 images...
[train] Copied 8000/75750 images...
[train] Copied 8500/75750 images...
[train] Copied 9000/75750 images...
[train] Copied 9500/75750 images...
[train] Copied 10000/75750 images...
[train] Copied 10500/75750 images...
[train] Copied 11000/75750 images...
[train] Copied 11500/75750 images...
[train] Copied 12000/75750 images...
[train] Copied 12500/75750 images...
[train] Copied 13000/75750 images...
[train] Copied 13500/75750

# Convert UEC-FOOD-100 into YOLO Format and Split the Dataset

In [None]:
from pathlib import Path
import os, cv2, random, shutil
from collections import defaultdict

# === EDIT THESE ===
RAW_ROOT  = Path(r"C:/Users/User/PycharmProjects/PythonProject/FYP/FoodWasteEstimator/UECFOOD100")
OUT_ROOT  = Path(r"C:/Users/User/PycharmProjects/PythonProject/FYP/FoodWasteEstimator/UECFOOD100_yolo")
ONE_CLASS = True     # True => single class 'food'; False => keep 100 classes (0..99)
SPLIT     = 0.70     # fraction of the images that goes to the training split
random.seed(42)      # seed the RNG so the shuffle is repeatable

# Create the images/labels folders for both splits up front.
output_dirs = ("images/train", "images/val", "labels/train", "labels/val")
for rel_dir in output_dirs:
    (OUT_ROOT / rel_dir).mkdir(parents=True, exist_ok=True)

def yolo_line(x1,y1,x2,y2,w,h,cls):
    """Format one bounding box as a single label line.

    The corner coordinates (x1, y1, x2, y2) are converted to a centre point
    and a box size; horizontal values are divided by ``w`` and vertical
    values by ``h``. The result is the class id followed by the four values
    with six decimals each, separated by spaces and ending with a newline.
    """
    center_x = ((x1 + x2) / 2) / w
    center_y = ((y1 + y2) / 2) / h
    box_w = (x2 - x1) / w
    box_h = (y2 - y1) / h
    fields = (f"{value:.6f}" for value in (center_x, center_y, box_w, box_h))
    return f"{cls} " + " ".join(fields) + "\n"

# gather boxes per absolute image path
records = defaultdict(list)  # abs_img_path -> list[(cls_id,(x1,y1,x2,y2))]
# NOTE(review): boxes are grouped per file path, so if the same photo is stored in
# several class folders, each copy only receives that folder's boxes - confirm
# against the dataset layout.

# Class folders are the all-digit sub-directories of RAW_ROOT, in numeric order.
class_dirs = sorted([d for d in RAW_ROOT.iterdir() if d.is_dir() and d.name.isdigit()],
                    key=lambda p:int(p.name))

# (height, width) of every image looked up so far, or None when it could not be
# decoded; an image that appears on several rows is therefore decoded only once.
image_sizes = {}

for idx, cdir in enumerate(class_dirs, start=1):
    cls_id = 0 if ONE_CLASS else (idx-1)
    bb = cdir / "bb_info.txt"
    if not bb.exists():
        print(f"⚠️  missing {bb}")
        continue

    with bb.open("r", encoding="utf-8", errors="ignore") as f:
        next(f, None)  # skip the header row ("img x1 y1 x2 y2")
        for line in f:
            toks = line.split()
            if len(toks) < 5:
                # blank or incomplete row
                continue
            # parse: img_id x1 y1 x2 y2
            img_id = toks[0]
            try:
                x1, y1, x2, y2 = map(float, toks[1:5])
            except ValueError:
                # non-numeric coordinates: ignore just this row
                continue
            # map image id -> filename (try several extensions)
            candidates = [cdir / f"{img_id}.jpg",
                          cdir / f"{img_id}.JPG",
                          cdir / f"{img_id}.png",
                          cdir / f"{img_id}.jpeg"]
            img_path = next((p for p in candidates if p.exists()), None)
            if img_path is None:
                continue

            if img_path not in image_sizes:
                img = cv2.imread(str(img_path))
                image_sizes[img_path] = None if img is None else img.shape[:2]
            size = image_sizes[img_path]
            if size is None:
                continue
            h, w = size
            # clip the box to the image and drop it if nothing is left
            x1 = max(0, min(x1, w-1)); y1 = max(0, min(y1, h-1))
            x2 = max(0, min(x2, w-1)); y2 = max(0, min(y2, h-1))
            if x2 <= x1 or y2 <= y1:
                continue

            records[str(img_path.resolve())].append((cls_id, (x1,y1,x2,y2)))

print("Images with boxes:", len(records))

# Split at image level: shuffle all image paths, then cut the list once at the
# SPLIT fraction.
keys = [*records]
random.shuffle(keys)
cut = int(SPLIT * len(keys))
train_keys, val_keys = keys[:cut], keys[cut:]
splits = {"train": train_keys, "val": val_keys}

def save_example(abs_path_str, split):
    """Copy one image into the output tree and write its label file.

    The image is copied to ``OUT_ROOT/images/<split>/`` and a text file with
    one ``yolo_line`` per recorded box is written to
    ``OUT_ROOT/labels/<split>/`` under the same base name. Nothing is written
    when the image cannot be decoded.

    Args:
        abs_path_str: Key into ``records``; the absolute path of the image.
        split: Name of the sub-folder to write into (e.g. ``"train"``).
    """
    import hashlib  # only needed here

    path = Path(abs_path_str)
    img = cv2.imread(str(path))
    if img is None:
        return
    h, w = img.shape[:2]

    # The built-in hash() of a str is salted per interpreter process, so it would
    # give the same image a different output name on every run. A digest of the
    # path is stable across runs and keeps images with equal stems apart.
    digest = hashlib.sha1(abs_path_str.encode("utf-8")).hexdigest()[:8]
    uniq = f"{path.stem}_{digest}"
    # Keep PNG data under a .png name; every other accepted spelling becomes .jpg.
    ext = ".png" if path.suffix.lower() == ".png" else ".jpg"
    dst_img = OUT_ROOT / f"images/{split}/{uniq}{ext}"
    dst_lbl = OUT_ROOT / f"labels/{split}/{uniq}.txt"

    shutil.copyfile(path, dst_img)

    lines = [
        yolo_line(x1, y1, x2, y2, w, h, 0 if ONE_CLASS else cls_id)
        for cls_id, (x1, y1, x2, y2) in records[abs_path_str]
    ]
    with dst_lbl.open("w", encoding="utf-8") as f:
        f.writelines(lines)

# Write every image and its label file into the split it was assigned to.
for split_name, image_keys in splits.items():
    for image_key in image_keys:
        save_example(image_key, split_name)

# Report how many entries each images folder contains.
print("Done.")
for caption, folder in ((" train imgs:", 'images/train'), (" val imgs  :", 'images/val')):
    print(caption, len(os.listdir(OUT_ROOT/folder)))


# Quick diagnosis: does UEC-FOOD-100 have bounding boxes?

In [2]:
from pathlib import Path

RAW_ROOT = Path(r"C:/Users/User/PycharmProjects/PythonProject/FYP/FoodWasteEstimator/UECFOOD100")

# 1) Do we see class folders and bb_info.txt files?
cls_dirs = [d for d in RAW_ROOT.iterdir() if d.is_dir() and d.name.isdigit()]
print("Class dirs found:", len(cls_dirs))
missing = []  # class folders that have no bb_info.txt
present = []  # paths of the bb_info.txt files that exist
for d in sorted(cls_dirs, key=lambda p: int(p.name)):
    f = d / "bb_info.txt"
    if f.exists():
        present.append(f)
    else:
        missing.append(d)

print("bb_info.txt present:", len(present))
print("bb_info.txt missing:", len(missing))
if present:
    print("\nSample of first bb_info.txt:", present[0])
    print("\n--- first 10 lines ---")
    try:
        # A context manager closes the file even though the loop stops early.
        with open(present[0], "r", encoding="utf-8", errors="ignore") as fh:
            for i, line in enumerate(fh):
                print(line.rstrip())
                if i >= 9:
                    break
    except Exception as e:
        # Best-effort diagnostic: report the problem instead of stopping the cell.
        print("Read error:", e)


Class dirs found: 100
bb_info.txt present: 100
bb_info.txt missing: 0

Sample of first bb_info.txt: C:\Users\User\PycharmProjects\PythonProject\FYP\FoodWasteEstimator\UECFOOD100\1\bb_info.txt

--- first 10 lines ---
img x1 y1 x2 y2
1 0 143 370 486
2 20 208 582 559
3 2 110 243 410
4 0 237 286 536
5 8 28 761 585
6 0 38 369 310
7 0 162 383 450
8 80 31 776 454
9 2 226 270 470
