In [1]:
import os, shutil
import random

In [2]:
# Source folders with the cropped images, one per class:
# the 'no' folder is used as the healthy class and the 'yes' folder as the tumor class.
original_dataset_dir_healthy = '/Users/marsc/ASUS/Desktop/FYP/Datasets/cropped/no'
original_dataset_dir_tumor = '/Users/marsc/ASUS/Desktop/FYP/Datasets/cropped/yes'

In [3]:
# Root folder that will hold the train/validation/test copy of the dataset.
# os.mkdir raises FileExistsError if the folder already exists, so this cell
# only succeeds when the folder has not been created yet.
base_dir = '/Users/marsc/ASUS/Desktop/FYP/Datasets/healthy_and_tumor'
os.mkdir(base_dir)

In [4]:
# One sub-folder per dataset split, directly under base_dir.
train_dir = os.path.join(base_dir, 'train')
validation_dir = os.path.join(base_dir, 'validation')
test_dir = os.path.join(base_dir, 'test')

# Create them in the order train -> validation -> test.
# os.mkdir raises FileExistsError if a folder is already present.
for split_dir in (train_dir, validation_dir, test_dir):
    os.mkdir(split_dir)

In [5]:
# Inside every split folder there is one sub-folder per class.
train_healthy_dir = os.path.join(train_dir, 'healthy')
train_tumor_dir = os.path.join(train_dir, 'tumor')

validation_healthy_dir = os.path.join(validation_dir, 'healthy')
validation_tumor_dir = os.path.join(validation_dir, 'tumor')

test_healthy_dir = os.path.join(test_dir, 'healthy')
test_tumor_dir = os.path.join(test_dir, 'tumor')

# Create the six class folders (healthy before tumor within each split).
# os.mkdir raises FileExistsError if a folder is already present.
for class_dir in (
    train_healthy_dir,
    train_tumor_dir,
    validation_healthy_dir,
    validation_tumor_dir,
    test_healthy_dir,
    test_tumor_dir,
):
    os.mkdir(class_dir)

In [6]:
# Function to copy images
def copy_images(src_dir, dst_dir, image_list):
    """Copy each file named in ``image_list`` from ``src_dir`` into ``dst_dir``.

    Args:
        src_dir: Directory that currently contains the files.
        dst_dir: Directory that receives the copies (file names are kept).
        image_list: Iterable of file names relative to ``src_dir``.
    """
    for image_name in image_list:
        shutil.copyfile(
            os.path.join(src_dir, image_name),
            os.path.join(dst_dir, image_name),
        )

In [7]:
# Fixed seed so the train/validation/test partition is the same on every run
SPLIT_SEED = 42

# Get list of all images.
# os.listdir returns names in arbitrary order, so sort them first; otherwise the
# seeded shuffle below would still depend on the filesystem's listing order.
healthy_images = sorted(os.listdir(original_dataset_dir_healthy))
tumor_images = sorted(os.listdir(original_dataset_dir_tumor))

# Ensure each class has exactly the number of images the split sizes below assume
assert len(healthy_images) == 1500, "There should be exactly 1500 healthy images."
assert len(tumor_images) == 1500, "There should be exactly 1500 tumor images."

# Shuffle with a dedicated seeded generator: the order is random but repeatable
split_rng = random.Random(SPLIT_SEED)
split_rng.shuffle(healthy_images)
split_rng.shuffle(tumor_images)

# Define the split sizes
train_size = 1050  # 70% of 1500
validation_size = 225  # 15% of 1500
test_size = 225  # 15% of 1500

# Slice boundaries shared by both classes
validation_end = train_size + validation_size
test_end = validation_end + test_size

# Split the images (consecutive, non-overlapping slices of the shuffled lists)
train_healthy_images = healthy_images[:train_size]
validation_healthy_images = healthy_images[train_size:validation_end]
test_healthy_images = healthy_images[validation_end:test_end]

train_tumor_images = tumor_images[:train_size]
validation_tumor_images = tumor_images[train_size:validation_end]
test_tumor_images = tumor_images[validation_end:test_end]

# Copy the images to their respective directories
copy_images(original_dataset_dir_healthy, train_healthy_dir, train_healthy_images)
copy_images(original_dataset_dir_healthy, validation_healthy_dir, validation_healthy_images)
copy_images(original_dataset_dir_healthy, test_healthy_dir, test_healthy_images)

copy_images(original_dataset_dir_tumor, train_tumor_dir, train_tumor_images)
copy_images(original_dataset_dir_tumor, validation_tumor_dir, validation_tumor_images)
copy_images(original_dataset_dir_tumor, test_tumor_dir, test_tumor_images)

In [8]:
# Function to verify the number of images in each directory
def verify_image_counts():
    """Check that each split/class directory holds the expected number of entries.

    The count is taken with ``os.listdir``, so every entry in a directory is
    counted. An ``AssertionError`` naming the first mismatching directory is
    raised by the ``assert``; otherwise a confirmation line is printed.
    """
    split_dirs = (
        train_healthy_dir,
        validation_healthy_dir,
        test_healthy_dir,
        train_tumor_dir,
        validation_tumor_dir,
        test_tumor_dir,
    )
    # Expected counts, in the same order as split_dirs
    expected_counts = (1050, 225, 225, 1050, 225, 225)

    for split_dir, expected in zip(split_dirs, expected_counts):
        found = len(os.listdir(split_dir))
        assert found == expected, f"Directory {split_dir} has {found} images; expected {expected}."

    print("All directories have the correct number of images.")

# Verify the image counts; the assert inside verify_image_counts stops the cell
# at the first directory whose count is wrong, so the message below is only
# printed when every directory matched.
verify_image_counts()
print("Images copied successfully and verified!")

All directories have the correct number of images.
Images copied successfully and verified!


In [9]:
# List everything directly under the dataset root
base_dir = '/Users/marsc/ASUS/Desktop/FYP/Datasets/healthy_and_tumor'
folder_list = os.listdir(base_dir)

# Print each top-level entry, then the entry count of every sub-directory in it
for folder in folder_list:
    print(folder)
    subfolder_path = os.path.join(base_dir, folder)
    if not os.path.isdir(subfolder_path):
        # Plain files at the top level are only named, not descended into
        continue
    for class_name in os.listdir(subfolder_path):
        class_path = os.path.join(subfolder_path, class_name)
        if os.path.isdir(class_path):
            print(f'{class_name}: {len(os.listdir(class_path))} images')

test
healthy: 225 images
tumor: 225 images
train
healthy: 1050 images
tumor: 1050 images
validation
healthy: 225 images
tumor: 225 images
