This script splits the original dataset (which I used to store in `../data`) into three parts:
training, validation, and test sets, following a 70-20-10 distribution ratio.
Each class in the dataset has its own folder within `../data`, and the script
creates new directories (`train`, `val`, and `test`) under `../split_data`, each containing one sub-folder per class. The file was later uploaded to Google Drive because of its large size.

For each class:
    - 70% of the images are copied to the training set (`train` directory)
    - 20% of the images are copied to the validation set (`val` directory)
    - 10% of the images are copied to the test set (`test` directory)

The script shuffles the data for each class before splitting to ensure randomness and copies the files into the respective directories.

In [None]:
import os
import shutil
import random

# Input/output locations.
# data_dir holds one sub-folder per class; output_dir receives the split copy.
data_dir = '../data'
output_dir = '../split_data'

# One destination directory per split, all rooted at output_dir.
train_dir, val_dir, test_dir = (
    os.path.join(output_dir, split_name)
    for split_name in ('train', 'val', 'test')
)



In [None]:

# Ratios for split; the three fractions must describe the whole dataset.
train_ratio = 0.7
val_ratio = 0.2
test_ratio = 0.1
if abs(train_ratio + val_ratio + test_ratio - 1.0) > 1e-9:
    raise ValueError("train_ratio, val_ratio and test_ratio must sum to 1.")

# Fixed seed so that re-running the script reproduces the same split.
# Files are copied (never removed from the output), so an unseeded re-run
# would lay a second, different random split over the first one and the same
# image could end up in more than one of train/val/test.
split_seed = 42
rng = random.Random(split_seed)

# Ensure output directories exist
for directory in [train_dir, val_dir, test_dir]:
    os.makedirs(directory, exist_ok=True)

# Split dataset: every sub-directory of data_dir is treated as one class.
# os.listdir returns entries in arbitrary order, so sort to keep the seeded
# shuffle reproducible across runs and machines.
for class_name in sorted(os.listdir(data_dir)):
    class_path = os.path.join(data_dir, class_name)
    if not os.path.isdir(class_path):
        continue

    # Create class folders in train, val, and test directories
    os.makedirs(os.path.join(train_dir, class_name), exist_ok=True)
    os.makedirs(os.path.join(val_dir, class_name), exist_ok=True)
    os.makedirs(os.path.join(test_dir, class_name), exist_ok=True)

    # Collect only regular files (shutil.copy cannot copy a nested directory),
    # in sorted order, then shuffle them with the seeded generator.
    files = sorted(
        name for name in os.listdir(class_path)
        if os.path.isfile(os.path.join(class_path, name))
    )
    rng.shuffle(files)

    # Calculate split indices. int() truncates, so any remainder left over
    # from rounding goes to the test set, which takes everything past val_end.
    total_files = len(files)
    train_end = int(total_files * train_ratio)
    val_end = train_end + int(total_files * val_ratio)

    # Split files into train, val, and test
    train_files = files[:train_end]
    val_files = files[train_end:val_end]
    test_files = files[val_end:]

    # Copy files to the corresponding directories; originals stay in data_dir.
    for split_dir, split_files in (
        (train_dir, train_files),
        (val_dir, val_files),
        (test_dir, test_files),
    ):
        for file in split_files:
            shutil.copy(
                os.path.join(class_path, file),
                os.path.join(split_dir, class_name, file),
            )

    # Short per-class summary instead of dumping every file name.
    print("{}: {} train, {} val, {} test".format(
        class_name, len(train_files), len(val_files), len(test_files)))

print("Dataset split completed.")