In [2]:
import os
import json
import random
import shutil
from tqdm import tqdm

# Set a random seed for reproducibility
random.seed(42)

# Specify the paths to the source and destination directories
src_data_dir = r'D:/Repositories/Vision-Dataset/VISION/dataset'
dest_data_dir = r'../datasets/Vision_data'

# The 'split' directory within the destination directory stores the JSON files
split_dir = os.path.join(dest_data_dir, 'split')

# Define the split ratios (train, test, validation); they are expected to sum to 1
train_split_ratio = 0.7
test_split_ratio = 0.2
validation_split_ratio = 0.1

# Define the minimum number of samples per class in each split
min_samples_per_class = 10


def compute_split_sizes(num_images, ratios, min_samples):
    """Return how many of ``num_images`` images each split receives.

    The sizes always add up to exactly ``num_images``, so the splits form a
    true partition of a class.

    Args:
        num_images: Number of images available for one class.
        ratios: Mapping of split name to its share of the images. The shares
            are expected to sum to 1.
        min_samples: Desired minimum number of images per split. When the
            class is too small to honour it in every split, the largest
            achievable minimum (``num_images // len(ratios)``) is used.

    Returns:
        A dict mapping each split name to its image count.
    """
    sizes = {name: int(ratio * num_images) for name, ratio in ratios.items()}

    # int() truncates, so a few images can be left over; hand them to the
    # split with the largest ratio instead of silently dropping them.
    largest = max(ratios, key=ratios.get)
    sizes[largest] += num_images - sum(sizes.values())

    # Top up splits that are below the minimum by taking images from the
    # currently largest split. This terminates because the achievable
    # minimum never exceeds the average split size.
    achievable_min = min(min_samples, num_images // len(sizes))
    for name in sizes:
        while sizes[name] < achievable_min:
            donor = max(sizes, key=sizes.get)
            sizes[donor] -= 1
            sizes[name] += 1
    return sizes


def build_class_splits(class_names, src_dir, ratios, min_samples):
    """Shuffle every class's images and partition them into the splits.

    Args:
        class_names: Iterable of class folder names (may be wrapped in a
            progress bar).
        src_dir: Directory holding one ``<class>/images`` folder per class.
        ratios: Mapping of split name to ratio; its key order is the order
            in which the shuffled images are sliced.
        min_samples: Classes with fewer ``.jpg`` images than this are skipped;
            it is also the per-split minimum passed to ``compute_split_sizes``.

    Returns:
        A dict ``{split_name: {class_name: [file_name, ...]}}``.
    """
    class_splits = {split_name: {} for split_name in ratios}
    for class_name in class_names:
        class_dir = os.path.join(src_dir, class_name, "images")
        if not os.path.isdir(class_dir):
            continue

        # os.listdir() returns entries in arbitrary order; sorting first makes
        # the seeded shuffle reproducible across machines and file systems.
        images = sorted(
            name for name in os.listdir(class_dir)
            if name.lower().endswith(".jpg")
        )
        if len(images) < min_samples:
            continue

        random.shuffle(images)
        sizes = compute_split_sizes(len(images), ratios, min_samples)
        start = 0
        for split_name in ratios:
            end = start + sizes[split_name]
            class_splits[split_name][class_name] = images[start:end]
            start = end
    return class_splits


def copy_split_images(class_splits, src_dir, dest_dir):
    """Copy each split's images into ``dest_dir/<split>/<class>/``.

    Only the files assigned to a split are copied into that split's folder.
    Files that already exist at the destination are left alone, so the call
    can be repeated safely. Stale files from an earlier run are NOT removed;
    delete the old split folders first if the split has changed.
    """
    for split_name, per_class in class_splits.items():
        os.makedirs(os.path.join(dest_dir, split_name), exist_ok=True)
        for class_name, file_names in per_class.items():
            class_src_dir = os.path.join(src_dir, class_name, "images")
            class_dest_dir = os.path.join(dest_dir, split_name, class_name)
            os.makedirs(class_dest_dir, exist_ok=True)
            for file_name in file_names:
                dest_path = os.path.join(class_dest_dir, file_name)
                if not os.path.exists(dest_path):
                    # copy2 keeps file metadata, as shutil.copytree does by default
                    shutil.copy2(os.path.join(class_src_dir, file_name), dest_path)


def save_split_lists(image_splits, out_dir):
    """Write each split's image list to ``out_dir/vision_<split>.json``."""
    os.makedirs(out_dir, exist_ok=True)
    for split_name, image_list in image_splits.items():
        split_json_path = os.path.join(out_dir, f'vision_{split_name}.json')
        with open(split_json_path, 'w') as json_file:
            json.dump(image_list, json_file)


def split_dataset(class_names, src_dir, dest_dir, ratios, min_samples, json_dir=None):
    """Split a class-per-folder image dataset and materialise it on disk.

    Args:
        class_names: Iterable of class folder names found in ``src_dir``.
        src_dir: Source dataset directory (``<class>/images/*.jpg``).
        dest_dir: Destination directory that receives one folder per split.
        ratios: Mapping of split name to ratio, e.g.
            ``{'train': 0.7, 'test': 0.2, 'validation': 0.1}``.
        min_samples: Minimum images per class (see ``build_class_splits``).
        json_dir: Directory for the JSON lists; defaults to ``dest_dir/split``.

    Returns:
        A dict mapping each split name to the flat list of image file names
        assigned to it (the same content that is written to the JSON files).
    """
    class_splits = build_class_splits(class_names, src_dir, ratios, min_samples)
    copy_split_images(class_splits, src_dir, dest_dir)

    # The JSON files keep the flat list-of-file-names format.
    flat_splits = {
        split_name: [name for names in per_class.values() for name in names]
        for split_name, per_class in class_splits.items()
    }
    if json_dir is None:
        json_dir = os.path.join(dest_dir, 'split')
    save_split_lists(flat_splits, json_dir)
    return flat_splits


# In a notebook cell or when run as a script __name__ is "__main__", so the
# split is executed; importing the helpers elsewhere has no side effects.
if __name__ == "__main__":
    # Get a list of class names (class folders), in a stable order
    class_names = sorted(os.listdir(src_data_dir))

    image_splits = split_dataset(
        tqdm(class_names, desc="Processing Classes"),
        src_data_dir,
        dest_data_dir,
        {
            'train': train_split_ratio,
            'test': test_split_ratio,
            'validation': validation_split_ratio,
        },
        min_samples_per_class,
        json_dir=split_dir,
    )

Processing Classes: 100%|██████████| 44/44 [00:04<00:00, 10.80it/s]
