In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob
import random
import shutil
from tqdm import tqdm

In [None]:
# --- Configuration -----------------------------------------------------------
# Paths are built with os.path.join so the notebook also runs on non-Windows
# systems (a literal backslash is only a separator on Windows).
SOURCE_DIR = os.path.join("food-101", "images")
TARGET_DIR = "food-101-splitpilihan"
SELECTED_CLASSES = ["cup_cakes","donuts","french_fries","fried_rice","hamburger","omelette","pizza","steak","sushi","takoyaki"]

SAMPLES_PER_CLASS = 1000
TRAIN_RATIO = 0.7
VAL_RATIO = 0.15
TEST_RATIO = 0.15
RANDOM_SEED = 42

# TEST_RATIO is not used in the count arithmetic below (test receives the
# remainder), so verify here that the three ratios describe a full partition.
if abs(TRAIN_RATIO + VAL_RATIO + TEST_RATIO - 1.0) > 1e-9:
    raise ValueError("TRAIN_RATIO + VAL_RATIO + TEST_RATIO harus berjumlah 1.0")

# Seed the stdlib RNG that the sampling step uses for shuffling.
random.seed(RANDOM_SEED)

# int() truncates; whatever is left after train and val goes to test, so the
# three counts always add up to SAMPLES_PER_CLASS.
train_count = int(SAMPLES_PER_CLASS * TRAIN_RATIO)
val_count   = int(SAMPLES_PER_CLASS * VAL_RATIO)
test_count  = SAMPLES_PER_CLASS - train_count - val_count

print("=" * 70)
print(f"DATASET SAMPLING & SPLITTING ({len(SELECTED_CLASSES)} KELAS)")
print("=" * 70)
print(f"Train: {train_count} | Val: {val_count} | Test: {test_count}")
print("=" * 70)

# --- Sample SAMPLES_PER_CLASS images per class and copy them into splits ------
# Class folders that actually exist in the source dataset.
all_classes = sorted(
    d for d in os.listdir(SOURCE_DIR)
    if os.path.isdir(os.path.join(SOURCE_DIR, d))
)

# Fail before copying anything if a requested class has no folder; otherwise a
# typo would only surface later as a misleading "0 images" error.
missing_classes = [cls for cls in SELECTED_CLASSES if cls not in all_classes]
if missing_classes:
    raise ValueError(f"Kelas tidak ditemukan di {SOURCE_DIR}: {missing_classes}")

print(f"Jumlah kelas diproses: {len(SELECTED_CLASSES)}")
print("Kelas terpilih:")
for cls in SELECTED_CLASSES:
    print(f"- {cls}")

for cls in tqdm(SELECTED_CLASSES, desc="Processing classes"):
    src_class_path = os.path.join(SOURCE_DIR, cls)

    # glob returns matches in arbitrary, filesystem-dependent order; sorting
    # first makes the seeded shuffle pick the same images on every machine.
    images = sorted(glob.glob(os.path.join(src_class_path, "*.jpg")))

    if len(images) < SAMPLES_PER_CLASS:
        raise ValueError(f"Kelas {cls} hanya punya {len(images)} gambar")

    random.shuffle(images)
    selected = images[:SAMPLES_PER_CLASS]

    # Consecutive, non-overlapping slices of the shuffled sample.
    splits = {
        "train": selected[:train_count],
        "val":   selected[train_count:train_count+val_count],
        "test":  selected[train_count+val_count:]}

    for split, files in splits.items():
        dst = os.path.join(TARGET_DIR, split, cls)
        # Start from an empty folder so a re-run with a different seed or
        # sample size cannot leave stale images behind (which could put the
        # same image in two splits).
        if os.path.isdir(dst):
            shutil.rmtree(dst)
        os.makedirs(dst, exist_ok=True)
        for f in files:
            shutil.copy(f, dst)
print(f"\nDataset sampling & splitting selesai ({len(SELECTED_CLASSES)} kelas)")

In [None]:
def count_images(base_dir):
    """Return the total number of entries found one level below ``base_dir``.

    Every immediate subdirectory of ``base_dir`` is listed and the sizes of
    those listings are summed. Non-directory entries directly inside
    ``base_dir`` are ignored. Note that every entry in a subdirectory is
    counted, whatever its name or type; no image-extension filter is applied.

    Args:
        base_dir: Path of the directory whose subdirectories are inspected.

    Returns:
        int: Sum of the entry counts of all immediate subdirectories
        (0 when there are none).

    Raises:
        OSError: Propagated from ``os.listdir`` (e.g. ``base_dir`` is missing).
    """
    candidate_paths = (
        os.path.join(base_dir, entry) for entry in os.listdir(base_dir)
    )
    return sum(
        len(os.listdir(path)) for path in candidate_paths if os.path.isdir(path)
    )

# Report totals for the split written to TARGET_DIR by the sampling cell.
# Paths are joined with os.path.join so they resolve on any OS.
print("Train total :", count_images(os.path.join(TARGET_DIR, "train")))
print("Val total   :", count_images(os.path.join(TARGET_DIR, "val")))
print("Test total  :", count_images(os.path.join(TARGET_DIR, "test")))


In [None]:
# Tabulate how many .jpg files each class folder holds in every split.
BASE_DIR = r"food-101-splitpilihan"
splits = ["train", "val", "test"]
data = []

for split in splits:
    split_dir = os.path.join(BASE_DIR, split)
    for cls in sorted(os.listdir(split_dir)):
        cls_dir = os.path.join(split_dir, cls)
        # Only directories are treated as class folders.
        if not os.path.isdir(cls_dir):
            continue
        # Extension match is case-insensitive.
        count = sum(
            1 for f in os.listdir(cls_dir) if f.lower().endswith(".jpg")
        )
        data.append([cls, split, count])

# Long-format table: one row per (class, split) pair.
df = pd.DataFrame(data, columns=["Kelas", "Split", "Jumlah"])

In [None]:
# Wide table: one row per class, one column per split. pivot() orders the
# columns alphabetically, so reindex to the pipeline order train -> val -> test.
pivot_df = (
    df.pivot(index="Kelas", columns="Split", values="Jumlah")
    .reindex(columns=["train", "val", "test"])
)

# DataFrame.plot opens its own figure unless it is handed an Axes, which would
# leave a separately created plt.figure(...) blank and its figsize unused.
# Create the figure/axes once and draw everything on that Axes.
fig, ax = plt.subplots(figsize=(12, 6))
pivot_df.plot(kind="bar", width=0.8, ax=ax)

ax.set_title("Distribusi Jumlah Gambar per Kelas")
ax.set_xlabel("Kelas")
ax.set_ylabel("Jumlah Gambar")
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
ax.grid(axis="y", linestyle="--", alpha=0.6)
ax.legend(title="Dataset Split")
fig.tight_layout()
plt.show()