Split the Dataset into 70% Training, 20% Vaidation and 10% Testing

In [2]:
import splitfolders  # install via: pip install split-folders

input_folder = "../dataset"   # your dataset with disease folders
output_folder = "../data"     # new folder for split dataset

# Split into 70% train, 20% val, 10% test
splitfolders.ratio(input_folder, output=output_folder, seed=42, ratio=(0.7, 0.2, 0.1))

In [19]:
import os
import shutil
import random

# Source dataset (your current folder with Anthracnose, Healthy, OtherDisease)
source_dir = "../../Database/Mango Dataset A Comprehensive Resource for Agricultural Research and Disease Detection/Augmented Mango dataset"

# Destination dataset (new balanced split dataset)
dest_dir = "../data"

# Splits (70/20/10)
splits = {"train": 0.7, "valid": 0.2, "test": 0.1}

# Final number of images per class
N_PER_CLASS = 1000

# 🔥 Clean old dataset if it exists
if os.path.exists(dest_dir):
    shutil.rmtree(dest_dir)

# Create destination folders again
for split in splits.keys():
    for cls in ["Anthracnose", "Healthy", "OtherDisease"]:
        os.makedirs(os.path.join(dest_dir, split, cls), exist_ok=True)

# Helper to copy images
def copy_images(images, cls, split):
    split_dir = os.path.join(dest_dir, split, cls)
    for img in images:
        shutil.copy(img, split_dir)

# Process Anthracnose and Healthy
for cls in ["Anthracnose", "Healthy"]:
    cls_path = os.path.join(source_dir, cls)
    images = [os.path.join(cls_path, img) for img in os.listdir(cls_path)]
    random.shuffle(images)
    images = images[:N_PER_CLASS]  # pick exactly 500
    
    # Split
    n_train = int(N_PER_CLASS * splits["train"])
    n_valid = int(N_PER_CLASS * splits["valid"])
    
    copy_images(images[:n_train], cls, "train")
    copy_images(images[n_train:n_train+n_valid], cls, "valid")
    copy_images(images[n_train+n_valid:], cls, "test")
    
    print(f"{cls}: {len(images)} -> {n_train} train, {n_valid} valid, {N_PER_CLASS - n_train - n_valid} test")

# Process OtherDisease (merge equally from subfolders)
other_path = os.path.join(source_dir, "OtherDisease")
subfolders = [os.path.join(other_path, d) for d in os.listdir(other_path) if os.path.isdir(os.path.join(other_path, d))]

per_disease = N_PER_CLASS // len(subfolders)  # equal number per sub-disease
other_images = []

for sub in subfolders:
    imgs = os.listdir(sub)
    random.shuffle(imgs)
    imgs = [os.path.join(sub, img) for img in imgs[:per_disease]]
    other_images.extend(imgs)

random.shuffle(other_images)

# Split OtherDisease
n_train = int(N_PER_CLASS * splits["train"])
n_valid = int(N_PER_CLASS * splits["valid"])

copy_images(other_images[:n_train], "OtherDisease", "train")
copy_images(other_images[n_train:n_train+n_valid], "OtherDisease", "valid")
copy_images(other_images[n_train+n_valid:], "OtherDisease", "test")

print(f"OtherDisease: {len(other_images)} -> {n_train} train, {n_valid} valid, {N_PER_CLASS - n_train - n_valid} test")

print("✅ Finished creating fresh balanced dataset with 3 classes (500 each)!")


Anthracnose: 1000 -> 700 train, 200 valid, 100 test
Healthy: 1000 -> 700 train, 200 valid, 100 test
OtherDisease: 999 -> 700 train, 200 valid, 100 test
✅ Finished creating fresh balanced dataset with 3 classes (500 each)!


In [15]:
for cls in ["Anthracnose", "Healthy", "OtherDisease"]:
    files = os.listdir(f"../../Database/dataset/{cls}")
    print(cls, "->", len([f for f in files if f.lower().endswith(('.png','.jpg','.jpeg'))]))

Anthracnose -> 500
Healthy -> 500
OtherDisease -> 0
