In [1]:
# Mount Google Drive; every path used later in this notebook lives under it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [None]:
# === Imports needed by this cell ===
# This cell sits before the notebook's later import cell, so on a fresh kernel
# `os` and `shutil` would otherwise be undefined (NameError).
import os
import shutil

# === Paths ===
image_folders = [
    "/content/drive/MyDrive/EECE 490 Project/EyePac/Cleaned_images",
    "/content/drive/MyDrive/EECE 490 Project/DDR dataset/Cleaned_DDR",
    "/content/drive/MyDrive/EECE 490 Project/Aptos/train_images_cleaned",
    "/content/drive/MyDrive/EECE 490 Project/IDRD/Cleaned_IDRiD"
]

combined_image_folder = "/content/drive/MyDrive/EECE 490 Project/Classification_Set/All_Images"
os.makedirs(combined_image_folder, exist_ok=True)

# === Combine Images ===
# All source folders are flattened into a single directory, so two folders
# that contain the same filename would silently overwrite each other. Track
# which folder supplied each name during this run; on a clash keep the first
# copy and report it instead of clobbering it.
copied_from = {}   # filename -> source folder that supplied it in this run
collisions = []    # (filename, folder whose copy was kept, folder skipped)

for folder in image_folders:
    # sorted() makes the copy order independent of the filesystem's listing order.
    for file in sorted(os.listdir(folder)):
        src = os.path.join(folder, file)
        if not os.path.isfile(src):
            continue  # sub-directories are not copied
        if file in copied_from:
            collisions.append((file, copied_from[file], folder))
            continue
        dst = os.path.join(combined_image_folder, file)
        shutil.copy2(src, dst)
        copied_from[file] = folder

if collisions:
    print(f"⚠️ {len(collisions)} filename collision(s) across source folders (first copy kept):")
    for file, kept_folder, skipped_folder in collisions:
        print(f"   {file}: kept from {kept_folder}, skipped from {skipped_folder}")

print("✅ All images copied to:", combined_image_folder)

In [3]:
import os
import shutil
import pandas as pd


# === Excel Files to Combine ===
excel_paths = [
    "/content/drive/MyDrive/EECE 490 Project/EyePac/trainLabels_filtered.xlsx",
    "/content/drive/MyDrive/EECE 490 Project/DDR dataset/DR_grading_cleaned.xlsx",
    "/content/drive/MyDrive/EECE 490 Project/Aptos/train_cleaned.xlsx",
    "/content/drive/MyDrive/EECE 490 Project/IDRD/cleaned_labels.xlsx",
]

# Only these two columns are carried into the combined sheet.
required_columns = ["filename", "diagnosis"]

dfs = []

for path in excel_paths:
    df = pd.read_excel(path)
    if all(col in df.columns for col in required_columns):
        dfs.append(df[required_columns])
    else:
        print(f"⚠️ Columns missing in: {path}")

# pd.concat([]) fails with an uninformative "No objects to concatenate";
# raise the same exception type with a message that names the real cause.
if not dfs:
    raise ValueError(
        "None of the label files contain both 'filename' and 'diagnosis' columns."
    )

# === Merge and Save ===
combined_df = pd.concat(dfs, ignore_index=True)

# Later cells look images up by filename, so a name that appears more than
# once in the merged labels is ambiguous; surface it here.
n_duplicate_names = int(combined_df["filename"].duplicated().sum())
if n_duplicate_names:
    print(f"⚠️ {n_duplicate_names} duplicate filename(s) in the combined labels")

combined_excel_path = "/content/drive/MyDrive/EECE 490 Project/Classification_Set/combined_labels.xlsx"
# Do not rely on another cell having created the destination folder.
os.makedirs(os.path.dirname(combined_excel_path), exist_ok=True)
combined_df.to_excel(combined_excel_path, index=False)

print("✅ Combined Excel saved at:", combined_excel_path)


✅ Combined Excel saved at: /content/drive/MyDrive/EECE 490 Project/Classification_Set/combined_labels.xlsx


In [4]:
# === Locations of the combined image folder and the combined label sheet ===
all_images_dir = "/content/drive/MyDrive/EECE 490 Project/Classification_Set/All_Images"
combined_labels_xlsx = "/content/drive/MyDrive/EECE 490 Project/Classification_Set/combined_labels.xlsx"

# === Number of directory entries in the image folder ===
num_images = len(os.listdir(all_images_dir))

# === Number of rows in the label sheet ===
df = pd.read_excel(combined_labels_xlsx)
num_labels = df.shape[0]

# === Report both totals side by side ===
print(f"🖼️ Total images in folder: {num_images}")
print(f"📄 Total labels in Excel:  {num_labels}")


🖼️ Total images in folder: 37480
📄 Total labels in Excel:  37479


In [8]:
import os
import pandas as pd
from PIL import Image
from tqdm import tqdm  # <--- progress bar

# === Paths ===
combined_excel_path = "/content/drive/MyDrive/EECE 490 Project/Classification_Set/combined_labels.xlsx"
image_dir = "/content/drive/MyDrive/EECE 490 Project/Classification_Set/All_Images"
output_dir = "/content/drive/MyDrive/EECE 490 Project/Classification_Set/renamed_images"
os.makedirs(output_dir, exist_ok=True)

# === Load Excel
df = pd.read_excel(combined_excel_path)

# Extensions tried, in this priority order, for every labelled image stem.
candidate_extensions = ['.jpeg', '.jpg', '.JPG', '.JPEG']

# === Prepare new list
new_filenames = []
preserved_diagnoses = []
image_counter = 1
not_found = []   # stems with no matching file in image_dir
failed = []      # (stem, error message) for files that could not be converted

# === Loop through Excel with tqdm
for row in tqdm(df.itertuples(index=False), total=len(df), desc="Processing"):
    # Drop whatever extension the sheet recorded; the file on disk may differ.
    original_name = os.path.splitext(str(row.filename))[0]
    label = row.diagnosis

    image_path = None
    for ext in candidate_extensions:
        candidate = os.path.join(image_dir, original_name + ext)
        if os.path.exists(candidate):
            image_path = candidate
            break

    if image_path is None:
        not_found.append(original_name)
        continue

    new_filename = f"img_{image_counter:05d}.png"
    try:
        # The context manager releases the source file handle deterministically.
        with Image.open(image_path) as img:
            img.convert("RGB").save(os.path.join(output_dir, new_filename))
    except OSError as e:
        # An unreadable or truncated image must not abort the whole (long) run
        # before the label sheet is written; record it and move on. The counter
        # is not advanced, so the next success reuses this filename.
        failed.append((original_name, str(e)))
        continue

    new_filenames.append(new_filename)
    preserved_diagnoses.append(label)
    image_counter += 1

# === Save updated Excel
renamed_df = pd.DataFrame({
    "filename": new_filenames,
    "diagnosis": preserved_diagnoses
})
renamed_excel_path = "/content/drive/MyDrive/EECE 490 Project/Classification_Set/renamed_labels.xlsx"
renamed_df.to_excel(renamed_excel_path, index=False)

# === Done
print(f"\n✅ Renamed {len(new_filenames)} images to .png and updated Excel.")
print(f"⚠️ {len(not_found)} images not found.")
for missing_name in not_found:
    print(f"   not found: {missing_name}")
if failed:
    print(f"⚠️ {len(failed)} images could not be converted.")
    for failed_name, reason in failed:
        print(f"   failed: {failed_name} ({reason})")
print(f"📁 Renamed images are in: {output_dir}")
print(f"📝 New Excel is saved at: {renamed_excel_path}")


Processing: 100%|██████████| 37479/37479 [1:31:45<00:00,  6.81it/s]



✅ Renamed 37475 images to .png and updated Excel.
⚠️ 4 images not found.
📁 Renamed images are in: /content/drive/MyDrive/EECE 490 Project/Classification_Set/renamed_images
📝 New Excel is saved at: /content/drive/MyDrive/EECE 490 Project/Classification_Set/renamed_labels.xlsx


In [9]:
import os
import cv2
import numpy as np
from tqdm import tqdm

# === Paths ===
input_dir = "/content/drive/MyDrive/EECE 490 Project/Classification_Set/renamed_images"
output_dir = "/content/drive/MyDrive/EECE 490 Project/Classification_Set/resized_denoised_images"
os.makedirs(output_dir, exist_ok=True)

# === Parameters
target_size = (200, 200)   # (width, height) passed to cv2.resize
kernel_size = (3, 3)       # Gaussian blur kernel

# === Process images
image_files = [f for f in os.listdir(input_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
skipped = []

for fname in tqdm(image_files, desc="Processing images"):
    in_path = os.path.join(input_dir, fname)
    out_path = os.path.join(output_dir, fname)

    img = cv2.imread(in_path)
    # imread returns None for unreadable files; also reject anything that is
    # not a 3-channel array (the ndim check avoids an IndexError on 2-D input).
    if img is None or img.ndim != 3 or img.shape[2] != 3:
        skipped.append(fname)
        continue

    try:
        # === Apply Gaussian blur
        img = cv2.GaussianBlur(img, kernel_size, 0)

        # === Resize to 200x200
        img = cv2.resize(img, target_size, interpolation=cv2.INTER_AREA)

        # === Save as [0, 255] uint8
        # The previous version scaled to [-1, 1] in float32 and straight back to
        # uint8. The file on disk is 8-bit either way, so that round trip added
        # nothing, and because astype(np.uint8) truncates, float rounding could
        # lower a pixel value by one. Write the uint8 image directly; scaling to
        # [-1, 1] has to happen when the images are loaded for training.
        if not cv2.imwrite(out_path, img):
            # imwrite signals failure through its return value, not an exception.
            raise OSError(f"cv2.imwrite returned False for {out_path}")

    except Exception as e:
        print(f"⚠️ Failed to process {fname}: {e}")
        skipped.append(fname)

# === Done
print(f"\n✅ Done! Processed: {len(image_files) - len(skipped)} images")
print(f"⚠️ Skipped: {len(skipped)} images")
print("📁 Output folder:", output_dir)


Processing images: 100%|██████████| 37475/37475 [36:02<00:00, 17.33it/s]


✅ Done! Processed: 37475 images
⚠️ Skipped: 0 images
📁 Output folder: /content/drive/MyDrive/EECE 490 Project/Classification_Set/resized_denoised_images





In [10]:
import os
import cv2

# === Directory whose PNG files are expected to be 200x200 ===
folder_path = "/content/drive/MyDrive/EECE 490 Project/Classification_Set/resized_denoised_images"

# === Report unreadable files and any image whose size is not 200x200 ===
# Nothing is printed for images that pass the check.
for filename in os.listdir(folder_path):
    if not filename.lower().endswith('.png'):
        continue

    img_path = os.path.join(folder_path, filename)
    img = cv2.imread(img_path)

    if img is None:
        print(f"❌ Could not read {filename}")
        continue

    h, w = img.shape[0], img.shape[1]
    if (h, w) != (200, 200):
        print(f"⚠️ {filename} has size {w}x{h}")


In [11]:
import os
import pandas as pd
import shutil
from tqdm import tqdm

# === Paths ===
excel_path = "/content/drive/MyDrive/EECE 490 Project/Classification_Set/renamed_labels.xlsx"
image_dir = "/content/drive/MyDrive/EECE 490 Project/Classification_Set/resized_denoised_images"
output_dir = "/content/drive/MyDrive/EECE 490 Project/Classification_Set/resized_denoised_split"
os.makedirs(output_dir, exist_ok=True)

# === Load cleaned Excel
df = pd.read_excel(excel_path)

# === Create subfolders for labels 0 to 4
label_counts = {i: 0 for i in range(5)}
for i in label_counts:
    os.makedirs(os.path.join(output_dir, str(i)), exist_ok=True)

# Rows whose diagnosis is missing, non-numeric or outside 0-4. Previously such
# a row crashed the loop part-way (int() on NaN, or a copy into a folder that
# was never created); now it is skipped and reported.
invalid_rows = []

# === Split images by label (keep original filenames)
# total= lets tqdm draw a real progress bar; itertuples() has no length.
for row in tqdm(df.itertuples(index=False), total=len(df), desc="Splitting images"):
    fname = row.filename
    try:
        label = int(row.diagnosis)
    except (TypeError, ValueError):
        label = None

    if label not in label_counts:
        invalid_rows.append((fname, row.diagnosis))
        continue

    src = os.path.join(image_dir, fname)
    dst = os.path.join(output_dir, str(label), fname)

    if os.path.exists(src):
        shutil.copy2(src, dst)
        label_counts[label] += 1
    else:
        print(f"⚠️ Image not found: {fname}")

# === Show result
print("\n✅ Split complete!")
for label in sorted(label_counts):
    print(f"📁 Class {label}: {label_counts[label]} images")
if invalid_rows:
    print(f"⚠️ {len(invalid_rows)} rows skipped because of an invalid diagnosis:")
    for bad_name, bad_value in invalid_rows:
        print(f"   {bad_name}: {bad_value!r}")
print(f"\n🔍 All images saved in: {output_dir}")


Splitting images: 37475it [11:26, 54.57it/s]


✅ Split complete!
📁 Class 0: 25140 images
📁 Class 1: 2453 images
📁 Class 2: 7817 images
📁 Class 3: 913 images
📁 Class 4: 1152 images

🔍 All images saved in: /content/drive/MyDrive/EECE 490 Project/Classification_Set/resized_denoised_split





In [12]:
import os
import shutil
import random
from tqdm import tqdm

# === Paths ===
input_root = "/content/drive/MyDrive/EECE 490 Project/Classification_Set/resized_denoised_split"
output_root = "/content/drive/MyDrive/EECE 490 Project/Classification_Set/final_split"
splits = ["train", "val", "test"]
ratios = {"train": 0.7, "val": 0.1, "test": 0.2}

# Fixed seed: with the sorted listing below, the same files land in the same
# split on every run. Without it, re-running this cell copies a different
# shuffle on top of the existing folders and the same image ends up in more
# than one split.
split_seed = 42
rng = random.Random(split_seed)

# === Set up split folders
for split in splits:
    for label in range(5):
        split_path = os.path.join(output_root, split, str(label))
        os.makedirs(split_path, exist_ok=True)

# === Split each class folder
for label in range(5):
    class_folder = os.path.join(input_root, str(label))
    files = sorted(os.listdir(class_folder))
    rng.shuffle(files)

    n_total = len(files)
    n_train = int(ratios["train"] * n_total)
    n_val = int(ratios["val"] * n_total)

    # test takes the remainder, so the rounding of the two int() calls above
    # never drops a file.
    split_ranges = {
        "train": files[:n_train],
        "val": files[n_train:n_train + n_val],
        "test": files[n_train + n_val:]
    }

    for split in splits:
        for fname in tqdm(split_ranges[split], desc=f"Copying {split}/{label}"):
            src = os.path.join(class_folder, fname)
            dst = os.path.join(output_root, split, str(label), fname)
            shutil.copy2(src, dst)

# === Done
print("\n✅ Final 70/10/20 split complete!")
on_disk = {
    (split, label): set(os.listdir(os.path.join(output_root, split, str(label))))
    for split in splits
    for label in range(5)
}
for split in splits:
    for label in range(5):
        count = len(on_disk[(split, label)])
        print(f"{split}/{label}: {count} images")

# === Leakage check
# The same filename in two splits of one class means a train/val/test leak,
# e.g. files left behind by an earlier run that used a different shuffle.
for label in range(5):
    train_names = on_disk[("train", label)]
    val_names = on_disk[("val", label)]
    test_names = on_disk[("test", label)]
    shared = (train_names & val_names) | (train_names & test_names) | (val_names & test_names)
    if shared:
        print(f"⚠️ Class {label}: {len(shared)} file(s) appear in more than one split")


Copying train/0: 100%|██████████| 17598/17598 [13:14<00:00, 22.15it/s]
Copying val/0: 100%|██████████| 2514/2514 [00:47<00:00, 53.34it/s]
Copying test/0: 100%|██████████| 5028/5028 [01:36<00:00, 52.16it/s]
Copying train/1: 100%|██████████| 1717/1717 [01:12<00:00, 23.54it/s]
Copying val/1: 100%|██████████| 245/245 [00:04<00:00, 56.12it/s]
Copying test/1: 100%|██████████| 491/491 [00:08<00:00, 55.58it/s]
Copying train/2: 100%|██████████| 5471/5471 [04:20<00:00, 21.00it/s]
Copying val/2: 100%|██████████| 781/781 [00:14<00:00, 55.65it/s]
Copying test/2: 100%|██████████| 1565/1565 [00:29<00:00, 53.54it/s]
Copying train/3: 100%|██████████| 639/639 [00:17<00:00, 36.29it/s]
Copying val/3: 100%|██████████| 91/91 [00:02<00:00, 40.65it/s]
Copying test/3: 100%|██████████| 183/183 [00:02<00:00, 61.35it/s]
Copying train/4: 100%|██████████| 806/806 [00:24<00:00, 32.98it/s]
Copying val/4: 100%|██████████| 115/115 [00:02<00:00, 51.43it/s]
Copying test/4: 100%|██████████| 231/231 [00:07<00:00, 30.15it/s]


✅ Final 70/10/20 split complete!
train/0: 17598 images
train/1: 1717 images
train/2: 5471 images
train/3: 639 images
train/4: 806 images
val/0: 2514 images
val/1: 245 images
val/2: 781 images
val/3: 91 images
val/4: 115 images
test/0: 5028 images
test/1: 491 images
test/2: 1565 images
test/3: 183 images
test/4: 231 images


In [13]:
import os
import shutil
import random
from PIL import Image, ImageEnhance, ImageOps
from tqdm import tqdm

# === ORIGINAL SPLIT ===
# original_dir is read from; target_dir is created here and written to later.
original_dir = "/content/drive/MyDrive/EECE 490 Project/Classification_Set/final_split"
target_dir = "/content/drive/MyDrive/EECE 490 Project/Classification_Set/augmented_final_split_fully_balanced"
os.makedirs(target_dir, exist_ok=True)

# === Target per class per split
target_split = dict(train=7000, val=1000, test=2000)


def _enhance_factor():
    """Draw the random factor used by the brightness, color and contrast ops."""
    return random.uniform(0.5, 1.5)


def _zoom_in(img):
    """Crop 20 pixels off every side, then resize back to the original size."""
    width, height = img.width, img.height
    return img.crop((20, 20, width - 20, height - 20)).resize((width, height))


# === Augmentation methods
# Each value takes a PIL image and returns a transformed image.
AUGMENTATIONS = {
    'rotate': lambda img: img.rotate(random.uniform(-10, 10)),
    'flip_h': ImageOps.mirror,
    'flip_v': ImageOps.flip,
    'zoom': _zoom_in,
    'brightness': lambda img: ImageEnhance.Brightness(img).enhance(_enhance_factor()),
    'color': lambda img: ImageEnhance.Color(img).enhance(_enhance_factor()),
    'contrast': lambda img: ImageEnhance.Contrast(img).enhance(_enhance_factor()),
}

def augment_image(img):
    """Return a copy of ``img`` with 3 to 5 distinct random augmentations applied.

    The operations are drawn without replacement from ``AUGMENTATIONS`` and
    applied one after another in the sampled order. ``img`` is not modified.
    """
    how_many = random.randint(3, 5)
    chosen_ops = random.sample(list(AUGMENTATIONS.values()), k=how_many)

    result = img.copy()
    for transform in chosen_ops:
        result = transform(result)
    return result

# === Main augmentation loop
# Seed the module-level RNG (augment_image and the choices below draw from it)
# so a re-run picks the same base images and writes the same aug_* filenames.
# Unseeded, every re-run added differently named files and pushed the folders
# past their targets.
random.seed(42)

# NOTE(review): val and test are padded with augmented copies as well, so
# metrics on those splits are partly measured on synthetic images — confirm
# this is intended.

oversized = []   # (split, label, count) for classes already above their target

for split in ["train", "val", "test"]:
    for label in range(5):
        src = os.path.join(original_dir, split, str(label))
        dst = os.path.join(target_dir, split, str(label))
        os.makedirs(dst, exist_ok=True)

        images = sorted(os.listdir(src))
        current_count = len(images)
        target_count = target_split[split]

        # 1. Copy originals
        for fname in images:
            shutil.copy2(os.path.join(src, fname), os.path.join(dst, fname))

        # Classes above the target are copied in full; nothing is trimmed here.
        if current_count > target_count:
            oversized.append((split, label, current_count))

        # 2. Augment as needed
        if current_count < target_count:
            if not images:
                # random.choice would raise IndexError on an empty folder.
                print(f"⚠️ No source images in {split}/{label}; nothing to augment from.")
                continue
            for i in tqdm(range(target_count - current_count), desc=f"Augmenting {split}/{label}"):
                base_img = random.choice(images)
                # Context manager closes the source file once it is converted.
                with Image.open(os.path.join(src, base_img)) as opened:
                    img = opened.convert("RGB")
                aug_img = augment_image(img)
                aug_name = f"aug_{i:05d}_{base_img}"
                aug_img.save(os.path.join(dst, aug_name))

print("✅ YOU'RE DONEEEE. ALL CLASSES NOW 7k/1k/2k — FULLY BALANCED 💪")
for over_split, over_label, over_count in oversized:
    print(f"⚠️ {over_split}/{over_label} still holds {over_count} images "
          f"(target {target_split[over_split]}); it has not been trimmed.")

# === COUNTING IMAGES IN EACH FOLDER ===
# One line per split/class pair under target_dir, in split-major order.
print("\n📊 FINAL IMAGE COUNTS PER FOLDER:\n")
split_label_pairs = [(s, c) for s in ("train", "val", "test") for c in range(5)]
for split, label in split_label_pairs:
    folder = os.path.join(target_dir, split, str(label))
    n_entries = len(os.listdir(folder))
    print(f"{split}/{label}: {n_entries} images")


Augmenting train/1: 100%|██████████| 5283/5283 [03:49<00:00, 23.04it/s]
Augmenting train/2: 100%|██████████| 1529/1529 [01:09<00:00, 21.91it/s]
Augmenting train/3: 100%|██████████| 6361/6361 [04:47<00:00, 22.15it/s]
Augmenting train/4: 100%|██████████| 6194/6194 [04:43<00:00, 21.81it/s]
Augmenting val/1: 100%|██████████| 755/755 [00:31<00:00, 23.65it/s]
Augmenting val/2: 100%|██████████| 219/219 [00:09<00:00, 22.09it/s]
Augmenting val/3: 100%|██████████| 909/909 [00:37<00:00, 24.05it/s]
Augmenting val/4: 100%|██████████| 885/885 [00:37<00:00, 23.49it/s]
Augmenting test/1: 100%|██████████| 1509/1509 [01:04<00:00, 23.39it/s]
Augmenting test/2: 100%|██████████| 435/435 [00:19<00:00, 22.55it/s]
Augmenting test/3: 100%|██████████| 1817/1817 [01:18<00:00, 23.29it/s]
Augmenting test/4: 100%|██████████| 1769/1769 [01:19<00:00, 22.38it/s]


✅ YOU'RE DONEEEE. ALL CLASSES NOW 7k/1k/2k — FULLY BALANCED 💪

📊 FINAL IMAGE COUNTS PER FOLDER:

train/0: 17598 images
train/1: 7000 images
train/2: 7000 images
train/3: 7000 images
train/4: 7000 images
val/0: 2514 images
val/1: 1000 images
val/2: 1000 images
val/3: 1000 images
val/4: 1000 images
test/0: 5028 images
test/1: 2000 images
test/2: 2000 images
test/3: 2000 images
test/4: 2000 images


In [14]:
import os
import random
from tqdm import tqdm

# === Folder containing oversized class 0 folders
class0_paths = {
    "train": "/content/drive/MyDrive/EECE 490 Project/Classification_Set/augmented_final_split_fully_balanced/train/0",
    "val":   "/content/drive/MyDrive/EECE 490 Project/Classification_Set/augmented_final_split_fully_balanced/val/0",
    "test":  "/content/drive/MyDrive/EECE 490 Project/Classification_Set/augmented_final_split_fully_balanced/test/0"
}

# === Desired count per split
target_counts = {
    "train": 7000,
    "val": 1000,
    "test": 2000
}

# Deletion is irreversible, so the choice of files must be reproducible: a
# dedicated seeded generator over the sorted listing always selects the same
# files for the same folder contents.
trim_rng = random.Random(42)

# === Delete excess images
for split, path in class0_paths.items():
    all_files = sorted(os.listdir(path))
    target = target_counts[split]
    n_excess = len(all_files) - target

    # Folders already at or below the target are left untouched.
    if n_excess > 0:
        to_delete = trim_rng.sample(all_files, n_excess)
        for fname in tqdm(to_delete, desc=f"Deleting from {split}/0"):
            os.remove(os.path.join(path, fname))

print("\n🗑️ DONE! Class 0 is now cleanly trimmed to 7k/1k/2k. BALANCE ACHIEVED ⚖️")


Deleting from train/0: 100%|██████████| 10598/10598 [00:36<00:00, 294.30it/s]
Deleting from val/0: 100%|██████████| 1514/1514 [00:05<00:00, 297.06it/s]
Deleting from test/0: 100%|██████████| 3028/3028 [00:09<00:00, 312.59it/s]


🗑️ DONE! Class 0 is now cleanly trimmed to 7k/1k/2k. BALANCE ACHIEVED ⚖️





In [16]:
# === COUNTING IMAGES IN EACH FOLDER ===
# Re-list every split/class folder under target_dir and print its entry count.
print("\n📊 FINAL IMAGE COUNTS PER FOLDER:\n")
for split in ("train", "val", "test"):
    split_root = os.path.join(target_dir, split)
    for label in range(5):
        folder = os.path.join(split_root, str(label))
        entries = os.listdir(folder)
        print(f"{split}/{label}: {len(entries)} images")



📊 FINAL IMAGE COUNTS PER FOLDER:

train/0: 7000 images
train/1: 7000 images
train/2: 7000 images
train/3: 7000 images
train/4: 7000 images
val/0: 1000 images
val/1: 1000 images
val/2: 1000 images
val/3: 1000 images
val/4: 1000 images
test/0: 2000 images
test/1: 2000 images
test/2: 2000 images
test/3: 2000 images
test/4: 2000 images
