The purpose of this script is to split the images into 3 sets of data: Train, Test and Validation. Each set will contain random pictures from each category (keeping the same folder structure). Based on the size of our dataset (7,838 images), we decided on the following ratios: 70% Train, 15% Test, 15% Validation.

70% Training: This is the portion of the dataset that our model will learn from. It's important to have a sufficiently large training set to allow the model to generalize well and learn the underlying patterns in the data.

15% Testing: The testing set is used to evaluate the model's performance on data it hasn't seen during training. This helps us understand how well our model is likely to perform on new, unseen data.

15% Validation: The validation set is used to fine-tune hyperparameters and monitor the model's performance during training. It helps prevent overfitting by allowing us to make adjustments based on validation performance.



In [None]:
#Connect to Google Drive: mount it at /content/drive so its files can be reached through regular paths
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Import necessary libraries
import os
import cv2
import shutil
import random
import numpy as np

In [None]:
#Set the working directory to the 'Colab Notebooks' folder on the mounted Drive
os.chdir('/content/drive/My Drive/Colab Notebooks')

In [None]:
#Function for data split
def split_data_into_train_val_test(image_folder, train_ratio=0.7, val_ratio=0.15, seed=None):
    """Copy every class folder of *image_folder* into train/val/test subsets.

    The split is done per class (stratified): the files of each class are
    shuffled and then cut into three consecutive slices, so every class keeps
    roughly the same proportion in each subset. Files are copied, not moved;
    the original class folders are left untouched.

    The resulting layout, created inside *image_folder*, is::

        train/<class>/...   val/<class>/...   test/<class>/...

    Args:
        image_folder: Directory that contains one sub-directory per class.
        train_ratio: Fraction of each class copied to ``train``.
        val_ratio: Fraction of each class copied to ``val``. Whatever is left
            after the train and validation slices goes to ``test``.
        seed: Optional seed for the shuffle. With a seed the same input always
            yields the same split; with ``None`` the global ``random`` state
            is used, as before.

    Raises:
        ValueError: If a ratio is negative or the two ratios add up to more
            than 1.

    Note:
        Re-running without a seed draws a new split, so an image may be
        copied into a different subset than on the previous run. Clear the
        train/val/test folders first, or pass the same seed, to avoid the
        same image ending up in two subsets.
    """
    # Small tolerance so float sums such as 0.85 + 0.15 are not rejected.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1"
        )

    split_names = ('train', 'val', 'test')

    # Only real sub-directories are classes. The output folders are excluded so
    # a second run does not split 'train', 'val' and 'test' themselves, and
    # stray files in the root (which cannot be listed) are ignored.
    # Sorted so that a seeded run does not depend on the listing order.
    class_folders = sorted(
        name for name in os.listdir(image_folder)
        if name not in split_names
        and os.path.isdir(os.path.join(image_folder, name))
    )

    #Create train, validation and test directories, inside the input directory, if they don't exist
    split_folders = {name: os.path.join(image_folder, name) for name in split_names}
    for split_folder in split_folders.values():
        os.makedirs(split_folder, exist_ok=True)

    # A private generator keeps a seeded run from disturbing the global state.
    rng = random if seed is None else random.Random(seed)

    for class_folder in class_folders:
        class_path = os.path.join(image_folder, class_folder)

        # Keep regular files only: shutil.copy cannot copy a directory.
        class_images = sorted(
            name for name in os.listdir(class_path)
            if os.path.isfile(os.path.join(class_path, name))
        )
        # Shuffling removes any bias coming from the order of the files.
        rng.shuffle(class_images)

        train_size = int(train_ratio * len(class_images))
        val_size = int(val_ratio * len(class_images))
        images_per_split = {
            'train': class_images[:train_size],
            'val': class_images[train_size:train_size + val_size],
            'test': class_images[train_size + val_size:],
        }

        #Create the class subdirectory in each split folder and copy its share of images
        for split_name, image_names in images_per_split.items():
            destination = os.path.join(split_folders[split_name], class_folder)
            os.makedirs(destination, exist_ok=True)
            _copy_images(image_names, class_path, destination)


def _copy_images(image_names, source_folder, destination_folder):
    """Copy each file named in *image_names* from source to destination folder."""
    for image_name in image_names:
        shutil.copy(
            os.path.join(source_folder, image_name),
            os.path.join(destination_folder, image_name),
        )


In [None]:
#Path of the folder passed to the split function below; its entries are treated as the class folders
preprocessed_folder_path = '/content/drive/My Drive/Colab Notebooks/Preprocessed_Images'

In [None]:
#Split the data into train, validation and test sets using the default ratios (70% train, 15% validation, remainder test)
split_data_into_train_val_test(preprocessed_folder_path)