# In [1]:
import os
import shutil
import random
from PIL import Image
from pathlib import Path

# --- Dataset locations and preprocessing settings ---
benign_dir = '/Users/gbm746/Documents/CNN_Images/stratified_benign'
malignant_dir = '/Users/gbm746/Documents/CNN_Images/Malignant'
output_root = '/Users/gbm746/Documents/processed_dataset'
image_size = (512, 512)  # target size for every resized image

# Class label -> folder holding that class's source images
categories = dict(benign=benign_dir, malignant=malignant_dir)

# Fix the RNG seed so the shuffle performed during splitting is repeatable
random.seed(42)

# Start from an empty output tree: drop anything left by a previous run
if os.path.exists(output_root):
    shutil.rmtree(output_root)

# Lay out output_root/<split>/<category> for every split and class
splits = ['train', 'val', 'test']
for split in splits:
    for category in categories:
        os.makedirs(os.path.join(output_root, split, category), exist_ok=True)

def process_and_split_images(category, path):
    """Resize one class's images and distribute them into train/val/test.

    Files directly inside ``path`` whose names end in .jpg, .jpeg or .png
    (case-insensitive) are shuffled with the module-level ``random``
    generator and split 80% / 10% / 10%. Each image is converted to RGB,
    resized to ``image_size`` with Lanczos resampling and saved under
    ``output_root/<split>/<category>/`` with its original file name.

    Args:
        category: Class label; used as the sub-folder name inside each split.
        path: Directory containing the source images for this class.

    Returns:
        None. An image that cannot be processed is reported on stdout and
        skipped; the remaining images are still processed.
    """
    # os.listdir() returns entries in arbitrary (filesystem-dependent) order.
    # Sorting before the seeded shuffle is what makes the split reproducible
    # across machines and runs.
    all_images = sorted(
        name for name in os.listdir(path)
        if name.lower().endswith(('.jpg', '.jpeg', '.png'))  # Exclude .tif
    )

    random.shuffle(all_images)

    n = len(all_images)
    train_cutoff = int(0.8 * n)
    val_cutoff = int(0.9 * n)

    splits_indices = {
        'train': all_images[:train_cutoff],
        'val': all_images[train_cutoff:val_cutoff],
        'test': all_images[val_cutoff:]
    }

    for split, images in splits_indices.items():
        for image_name in images:
            input_path = os.path.join(path, image_name)
            output_path = os.path.join(output_root, split, category, image_name)

            try:
                with Image.open(input_path) as img:
                    # Keep the opened image bound to `img` so the context
                    # manager and the derived copies stay clearly separate.
                    rgb = img.convert('RGB')
                    img_resized = rgb.resize(image_size, resample=Image.LANCZOS)
                    img_resized.save(output_path)
            except Exception as e:
                # Deliberately best-effort: one unreadable or corrupt file
                # must not abort the whole dataset build.
                print(f"Failed to process {input_path}: {e}")

# Process every class in turn: resize its images and write them into the splits
for label in categories:
    folder_path = categories[label]
    process_and_split_images(category=label, path=folder_path)

print("Processing complete.")


# Output: Processing complete.
