In [1]:
import os
import shutil
import random

import math

In [2]:
# Define the base data directory (this is the parent of 'train', and where 'validation' and 'test' will be created)
base_data_dir = "/media/tairo/Storages/AIProject/AboutModel/TraAI/Data"

# Define the source directory for images (your current "train" directory)
source_train_dir = os.path.join(base_data_dir, "train")

# Define split ratios
# train_ratio will be implicit (what's left after val and test are moved)
validation_ratio = 0.20
test_ratio = 0.10
# train_ratio will be 1.0 - validation_ratio - test_ratio = 0.70

# Define common image extensions to process
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff')

In [None]:
def split_data_in_place(current_train_dir, parent_data_dir, val_r, test_r):
    print(f"Source 'train' direc
# Define the source directory for images (your current "train" directory)tory (will be modified): {current_train_dir}")
    print(f"Parent directory for creating 'validation' and 'test': {parent_data_dir}")
    print(f"Ratios: Validation={val_r*100}%, Test={test_r*100}%, Train (remaining)={(1-val_r-test_r)*100}%")
    print("WARNING: This script will MOVE files from your existing 'train' directory.")
    print("-" * 50)

    if not os.path.exists(current_train_dir):
        print(f"ERROR: Source 'train' directory '{current_train_dir}' does not exist.")
        return

    # Define paths for new validation and test directories
    validation_path = os.path.join(parent_data_dir, "validation")
    test_path = os.path.join(parent_data_dir, "test")

    os.makedirs(validation_path, exist_ok=True)
    os.makedirs(test_path, exist_ok=True)
    print(f"Ensured/Created directory: {validation_path}")
    print(f"Ensured/Created directory: {test_path}")
    print("-" * 50)

    try:
        class_names = sorted([d for d in os.listdir(current_train_dir) if os.path.isdir(os.path.join(current_train_dir, d))])
    except FileNotFoundError:
        print(f"ERROR: Could not list directories in '{current_train_dir}'. Check path and permissions.")
        return
    
    if not class_names:
        print(f"ERROR: No class subdirectories found in '{current_train_dir}'.")
        return

    print(f"Found class folders in '{current_train_dir}': {', '.join(class_names)}")
    print("-" * 50)

    total_files_in_original_train = 0
    total_files_moved_to_val = 0
    total_files_moved_to_test = 0
    total_files_remaining_in_train = 0

    for class_name in class_names:
        print(f"Processing class: {class_name}")
        source_class_path = os.path.join(current_train_dir, class_name) # This is where files will be moved FROM

        # Create corresponding class subdirectories in validation and test
        dest_val_class_path = os.path.join(validation_path, class_name)
        dest_test_class_path = os.path.join(test_path, class_name)

        os.makedirs(dest_val_class_path, exist_ok=True)
        os.makedirs(dest_test_class_path, exist_ok=True)

        try:
            all_files = [
                f for f in os.listdir(source_class_path)
                if os.path.isfile(os.path.join(source_class_path, f)) and f.lower().endswith(IMAGE_EXTENSIONS)
            ]
        except FileNotFoundError:
            print(f"  ERROR: Class directory '{source_class_path}' not found during file listing.")
            continue

        if not all_files:
            print(f"  No image files found in {source_class_path}. Skipping this class for splitting.")
            continue

        random.shuffle(all_files)
        num_files_class_original = len(all_files)
        total_files_in_original_train += num_files_class_original

        # Calculate number of files for validation and test
        num_val_files = math.ceil(num_files_class_original * val_r)
        num_test_files = math.ceil(num_files_class_original * test_r)

        # Ensure we don't try to move more files than exist, especially if sum of val_r and test_r is high
        if num_val_files + num_test_files > num_files_class_original:
            # Prioritize validation, then test, if proportions are too large for the available files
            num_val_files = min(num_val_files, num_files_class_original)
            num_test_files = min(num_test_files, num_files_class_original - num_val_files)


        val_files = all_files[:num_val_files]
        test_files = all_files[num_val_files : num_val_files + num_test_files]
        # Remaining files will stay in the original train folder for this class

        num_train_files_remaining = num_files_class_original - len(val_files) - len(test_files)

        print(f"  Original files in {class_name}: {num_files_class_original}")
        print(f"  Moving to validation: {len(val_files)}")
        print(f"  Moving to test: {len(test_files)}")
        print(f"  Remaining in train: {num_train_files_remaining}")


        # Function to MOVE files
        def move_files(files_to_move, source_folder, dest_folder):
            moved_count = 0
            for f_name in files_to_move:
                src_file_path = os.path.join(source_folder, f_name) # Original location in current_train_dir/class_name
                dest_file_path = os.path.join(dest_folder, f_name) # New location in validation/class_name or test/class_name
                try:
                    shutil.move(src_file_path, dest_file_path)
                    moved_count += 1
                except Exception as e:
                    print(f"    Error moving {src_file_path} to {dest_file_path}: {e}")
            return moved_count

        # Move files to their new validation and test directories
        print(f"  Moving validation files for {class_name}...")
        count_v = move_files(val_files, source_class_path, dest_val_class_path)
        total_files_moved_to_val += count_v

        print(f"  Moving test files for {class_name}...")
        count_te = move_files(test_files, source_class_path, dest_test_class_path)
        total_files_moved_to_test += count_te

        total_files_remaining_in_train += num_train_files_remaining # Update based on calculation
        print("-" * 30)

    print("=" * 50)
    print("Data splitting (in-place modification of 'train' and creation of 'validation'/'test') complete.")
    print(f"Total files originally in 'train' directory: {total_files_in_original_train}")
    print(f"Total files MOVED to new 'validation' set: {total_files_moved_to_val}")
    print(f"Total files MOVED to new 'test' set: {total_files_moved_to_test}")
    print(f"Total files REMAINING in 'train' set: {total_files_remaining_in_train}") # Recalculate for sanity
    print(f"Final counts: Train={total_files_in_original_train - total_files_moved_to_val - total_files_moved_to_test}, Validation={total_files_moved_to_val}, Test={total_files_moved_to_test}")
    print(f"Validation data is in: {validation_path}")
    print(f"Test data is in: {test_path}")
    print(f"The 'train' directory at {source_train_dir} has been modified (reduced).")
    print("=" * 50)

# Run the splitting function
if __name__ == "__main__":
    # This is the PARENT directory of 'train', 'validation', 'test'
    base_data_dir = "/media/tairo/Storages/AIProject/AboutModel/TraAI/Data"
    
    # This is your existing 'train' directory that will be modified
    source_train_dir = os.path.join(base_data_dir, "train")

    # Ratios for new folders (train will be the remainder)
    val_ratio = 0.20
    test_ratio = 0.10

    print("Starting data splitting process (IN-PLACE modification)...")
    print("*"*10 + " WARNING: THIS WILL MODIFY YOUR EXISTING 'train' FOLDER. " + "*"*10)
    confirm = input(f"Files will be MOVED from '{source_train_dir}' to new 'validation' and 'test' folders within '{base_data_dir}'. Are you absolutely sure you want to proceed? (yes/no): ")
    if confirm.lower() != 'yes':
        print("Operation cancelled by user.")
        exit()

    split_data_in_place(source_train_dir, base_data_dir, val_ratio, test_ratio)

Starting data splitting process (IN-PLACE modification)...


Operation cancelled by user.
Source 'train' directory (will be modified): /media/tairo/Storages/AIProject/AboutModel/TraAI/Data/train
Parent directory for creating 'validation' and 'test': /media/tairo/Storages/AIProject/AboutModel/TraAI/Data
Ratios: Validation=20.0%, Test=10.0%, Train (remaining)=70.0%
--------------------------------------------------
Ensured/Created directory: /media/tairo/Storages/AIProject/AboutModel/TraAI/Data/validation
Ensured/Created directory: /media/tairo/Storages/AIProject/AboutModel/TraAI/Data/test
--------------------------------------------------
Found class folders in '/media/tairo/Storages/AIProject/AboutModel/TraAI/Data/train': Bacterial Spot, Early Blight, Yellow Leaf Curl
--------------------------------------------------
Processing class: Bacterial Spot
  Original files in Bacterial Spot: 3558
  Moving to validation: 712
  Moving to test: 356
  Remaining in train: 2490
  Moving validation files for Bacterial Spot...
  Moving test files for Bacteria

: 