# CNN to detect skin cancer

## Preprocessing to trim white padding and resize

In [1]:
from PIL import Image
import os
import random
import shutil

# Build a resized train/val/test copy of the benign/malignant image folders.
#
# For each category, the images found in "<src_base_dir>/3000_<category>" are
# shuffled with a fixed seed, partitioned according to `split_ratios`, resized
# to `target_size` and written to
# "<output_base_dir>/<split>/<category>/<original filename>".

# Set random seed for reproducibility (the file list is also sorted below,
# because the seed alone does not fix the order returned by os.listdir).
random.seed(42)

# Define categories and source base directory
categories = ["benign", "malignant"]
src_base_dir = "/Volumes/SanDisk-64G"
output_base_dir = os.path.join(src_base_dir, "processed_dataset_6000")

# Desired image size (width, height). Images are resized to exactly this size,
# so the original aspect ratio is not preserved.
target_size = (512, 512)

# Split ratios
split_ratios = {
    "train": 0.8,
    "val": 0.1,
    "test": 0.1
}

# File extensions (lower-case) that are treated as images
image_extensions = ('.png', '.jpg', '.jpeg')

# Create the output folders if they do not exist yet.
# NOTE: existing folders are NOT emptied; files left over from a previous run
# stay in place unless they are overwritten by a file of the same name.
for split in split_ratios:
    for category in categories:
        split_dir = os.path.join(output_base_dir, split, category)
        os.makedirs(split_dir, exist_ok=True)

# Images that could not be read or written, as (source path, exception) pairs
failed_images = []

# Process each category
for category in categories:
    src_dir = os.path.join(src_base_dir, f"3000_{category}")

    # Skip hidden entries (e.g. macOS "._name.jpg" sidecar files, which match
    # the extension filter but are not images) and sort the names so that the
    # seeded shuffle starts from the same order on every run and filesystem.
    images = sorted(
        f for f in os.listdir(src_dir)
        if not f.startswith('.') and f.lower().endswith(image_extensions)
    )
    random.shuffle(images)

    # Train and val sizes are truncated to whole images; test receives
    # whatever remains.
    total = len(images)
    train_end = int(split_ratios["train"] * total)
    val_end = train_end + int(split_ratios["val"] * total)

    splits = {
        "train": images[:train_end],
        "val": images[train_end:val_end],
        "test": images[val_end:]
    }

    for split, filenames in splits.items():
        for filename in filenames:
            src_path = os.path.join(src_dir, filename)
            dst_path = os.path.join(output_base_dir, split, category, filename)
            try:
                with Image.open(src_path) as img:
                    img_resized = img.resize(target_size, Image.LANCZOS)
                    img_resized.save(dst_path)
            except Exception as e:
                # Best effort: one bad file must not abort the whole run, but
                # it is recorded and reported instead of silently dropped.
                failed_images.append((src_path, e))
                print(f"Skipping {src_path}: {e!r}")

if failed_images:
    print(f"{len(failed_images)} image(s) could not be processed.")
