In [1]:
import os
import random
import shutil

In [2]:
# --- Configuration ---
SOURCE_DIR = "../data/classifier_data/"
VAL_SPLIT_RATIO = 0.2

def split_dataset(source_dir, val_ratio):
    """
    Splits a dataset of class folders into train and val sets.
    """
    print(f"Splitting dataset in '{source_dir}' with a {val_ratio*100}% validation set.")

    # --- FIX: Get the list of class names FIRST ---
    # This prevents the script from trying to process 'train' and 'val' as classes.
    try:
        class_names = [d for d in os.listdir(source_dir) if os.path.isdir(os.path.join(source_dir, d))]
    except FileNotFoundError:
        print(f"Error: The source directory '{source_dir}' was not found.")
        return

    # Filter out 'train' or 'val' folders if they already exist from a partial run
    class_names = [name for name in class_names if name not in ['train', 'val']]
    
    if not class_names:
        print("No class folders (like 'printed', 'handwritten') found in the source directory.")
        return

    print(f"Found classes to process: {class_names}")

    # Create the new directory structure
    train_dir = os.path.join(source_dir, 'train')
    val_dir = os.path.join(source_dir, 'val')

    if os.path.exists(train_dir) or os.path.exists(val_dir):
        print("Train/Val folders already exist. Aborting to prevent data duplication.")
        print("Please remove the existing 'train' and 'val' folders and re-run.")
        return

    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(val_dir, exist_ok=True)

    # Process each class
    for class_name in class_names:
        print(f"\nProcessing class: {class_name}")
        
        # Create corresponding subdirectories in train and val
        os.makedirs(os.path.join(train_dir, class_name), exist_ok=True)
        os.makedirs(os.path.join(val_dir, class_name), exist_ok=True)

        # Get a list of all images for the current class
        class_source_dir = os.path.join(source_dir, class_name)
        all_images = [f for f in os.listdir(class_source_dir) if os.path.isfile(os.path.join(class_source_dir, f))]
        
        random.shuffle(all_images)
        split_point = int(len(all_images) * val_ratio)
        val_images = all_images[:split_point]
        train_images = all_images[split_point:]

        print(f"  Total images: {len(all_images)}")
        print(f"  Moving {len(val_images)} to validation set.")
        print(f"  Moving {len(train_images)} to training set.")

        # Move the files
        for img in val_images:
            shutil.move(os.path.join(class_source_dir, img), os.path.join(val_dir, class_name, img))
        
        for img in train_images:
            shutil.move(os.path.join(class_source_dir, img), os.path.join(train_dir, class_name, img))

        # Remove the now-empty original class folder
        os.rmdir(class_source_dir)

    print("\nDataset split successfully!")

if __name__ == "__main__":
    split_dataset(SOURCE_DIR, VAL_SPLIT_RATIO)

Splitting dataset in '../data/classifier_data/' with a 20.0% validation set.
Found classes to process: ['printed', 'handwritten']

Processing class: printed
  Total images: 956
  Moving 191 to validation set.
  Moving 765 to training set.

Processing class: handwritten
  Total images: 721
  Moving 144 to validation set.
  Moving 577 to training set.

Dataset split successfully!
