In [1]:
import os
import shutil
from sklearn.model_selection import train_test_split

In [2]:
# Source folder (scanned for one sub-folder per class) and root of the split copies.
INPUT_DATASET = "../dataset"
OUTPUT_BASE = "../data_split"

# One output folder per split, each directly under OUTPUT_BASE.
TRAIN_PATH, VAL_PATH, TEST_PATH = (
    os.path.join(OUTPUT_BASE, split_name)
    for split_name in ("train", "val", "test")
)

In [3]:
# Seed passed to both train_test_split calls so the shuffling is repeatable.
RANDOM_STATE = 42

# Fraction of each class held out for validation and for testing;
# whatever remains (1 - VAL_SIZE - TEST_SIZE) becomes the training set.
VAL_SIZE = 0.15
TEST_SIZE = 0.15

In [4]:
# Make sure the root folder of every split exists before any file is copied.
# exist_ok=True keeps the cell safe to re-run when the folders are already there.
for split_dir in (TRAIN_PATH, TEST_PATH, VAL_PATH):
    os.makedirs(split_dir, exist_ok=True)

In [5]:
# Split every class folder of INPUT_DATASET into train / val / test subsets and
# copy the image files to <split folder>/<class name>/.
#
# NOTE: files copied by an earlier run are never removed from the output folders.
# If the split may have changed since then, clear OUTPUT_BASE before re-running,
# otherwise the same image can end up in more than one split.

# Per-class (train, val, test) image counts, used for the summary at the end.
split_counts = {}

# os.listdir() returns entries in arbitrary order, so class and file names are
# sorted; without that, the fixed RANDOM_STATE would not yield the same split on
# every machine / run.
for cls in sorted(os.listdir(INPUT_DATASET)):
    cls_path = os.path.join(INPUT_DATASET, cls)
    if not os.path.isdir(cls_path):
        continue

    images = sorted(
        img for img in os.listdir(cls_path)
        if img.lower().endswith((".jpg", ".png", ".jpeg"))
    )
    if not images:
        # A folder without images (for example a stray hidden directory) would
        # make train_test_split raise and abort the whole run.
        print(f"Skipping '{cls}': no .jpg/.png/.jpeg files found")
        continue

    # First split: training images vs. the held-out part (val + test together).
    train_imgs, temp_imgs = train_test_split(
        images,
        test_size=(VAL_SIZE + TEST_SIZE),
        random_state=RANDOM_STATE,
    )
    # Second split: divide the held-out part into val vs. test, keeping the
    # VAL_SIZE : TEST_SIZE proportion.
    val_imgs, test_imgs = train_test_split(
        temp_imgs,
        test_size=TEST_SIZE / (VAL_SIZE + TEST_SIZE),
        random_state=RANDOM_STATE,
    )

    for split, imgs in zip(
        [TRAIN_PATH, VAL_PATH, TEST_PATH],
        [train_imgs, val_imgs, test_imgs],
    ):
        cls_dir = os.path.join(split, cls)
        os.makedirs(cls_dir, exist_ok=True)
        for img in imgs:
            shutil.copy(
                os.path.join(cls_path, img),
                os.path.join(cls_dir, img)
            )

    split_counts[cls] = (len(train_imgs), len(val_imgs), len(test_imgs))

# Summary. "per class" is only printed when every class really has the same
# counts; otherwise each class is listed separately.
if not split_counts:
    print(f"No class folders with images found in '{INPUT_DATASET}'")
elif len(set(split_counts.values())) == 1:
    n_train, n_val, n_test = next(iter(split_counts.values()))
    print(f"   Train: {n_train} images per class")
    print(f"   Val:   {n_val} images per class")
    print(f"   Test:  {n_test} images per class")
else:
    for cls, (n_train, n_val, n_test) in split_counts.items():
        print(f"   {cls}: train={n_train}, val={n_val}, test={n_test}")

   Train: 77 images per class
   Val:   16 images per class
   Test:  17 images per class
