In [1]:
!pip install kagglehub torch torchvision transformers diffusers accelerate datasets xformers pytorch-fid pandas
import os
import gc
import torch
import numpy as np
import random
from PIL import Image
import pandas as pd
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset, TensorDataset
from transformers import ViTModel, CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, UNet2DConditionModel, DDPMScheduler, StableDiffusionPipeline
from accelerate import Accelerator
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import torch.nn as nn
import torch.nn.functional as F
import kagglehub
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Reproducibility: seed Python's, NumPy's and PyTorch's (CPU and all CUDA devices)
# random number generators with the same value.
seed = 123
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Compute device: use the GPU when PyTorch can see one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Memory cleanup function
def clear_memory():
    """Free memory between experiments.

    Runs a Python garbage-collection pass and then asks PyTorch to release the
    unused memory held by its CUDA caching allocator. Returns None.
    """
    # Collect first so tensors that only became unreachable now are dropped
    # before the CUDA cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()



# 0. Prepare training and test images

In [3]:
def _resize_folder(in_dir, out_dir, size=(512, 512)):
    """Write an RGB, LANCZOS-resized copy of every image in in_dir to out_dir.

    Args:
        in_dir: Folder containing the source images.
        out_dir: Destination folder; created if it does not exist.
        size: Target (width, height) of the saved images.

    Returns:
        The number of images written.
    """
    os.makedirs(out_dir, exist_ok=True)
    written = 0
    for fn in os.listdir(in_dir):
        # Skip anything that is not one of the image types ClassifierDataset
        # reads, so a stray non-image entry does not abort the whole run.
        if not fn.lower().endswith((".png", ".jpg", ".jpeg")):
            continue
        # The context manager closes the source file handle promptly.
        with Image.open(os.path.join(in_dir, fn)) as src:
            img = src.convert("RGB").resize(size, resample=Image.LANCZOS)
        img.save(os.path.join(out_dir, fn))
        written += 1
    return written


def prepare_images():
    """Download the Kermany 2018 OCT dataset and prepare the CNV/NORMAL splits.

    The train images are written to /content/processed/{CNV,Normal}/ and the
    test images to /content/testset/{CNV,Normal}/, each converted to RGB and
    resized to 512x512.

    Returns:
        None
    """
    # Kaggle dataset download
    path = kagglehub.dataset_download("paultimothymooney/kermany2018")
    print(f"Dataset downloaded to: {path}")

    # (source sub-folder, destination folder, label used in the log line)
    # NOTE(review): the space after "OCT2017" is kept exactly as in the
    # original paths; presumably it matches the folder name inside the Kaggle
    # archive. Verify before "fixing" it.
    jobs = [
        ("OCT2017 /train/CNV", "/content/processed/CNV/", "CNV"),
        ("OCT2017 /train/NORMAL", "/content/processed/Normal/", "Normal"),
        ("OCT2017 /test/CNV", "/content/testset/CNV/", "CNV test"),
        ("OCT2017 /test/NORMAL", "/content/testset/Normal/", "Normal test"),
    ]

    for sub_dir, out_dir, label in jobs:
        _resize_folder(os.path.join(path, sub_dir), out_dir)
        print(f"Processed {label} images saved to {out_dir}")


# Populate /content/processed/ and /content/testset/, the folders the training
# and test cells below read their images from.
prepare_images()

Dataset downloaded to: /kaggle/input/kermany2018
Processed CNV images saved to /content/processed/CNV/
Processed CNV images saved to /content/processed/Normal/
Processed CNV test images saved to /content/testset/CNV/
Processed CNV images saved to /content/testset/Normal/


# 1. Classifier and Dataset

In [2]:
class ClassifierDataset(Dataset):
    """Binary image dataset: Normal images get label 0, CNV images label 1.

    Optionally mixes in synthetic CNV images (also label 1) so that they make
    up roughly `ratio` of the combined real + synthetic dataset.

    Args:
        normal_root_dir: Folder with the Normal images (label 0).
        cnv_root_dir: Folder with the real CNV images (label 1).
        synthetic_root_dir: Folder with synthetic CNV images, or None to use
            real images only.
        ratio: Target fraction of synthetic images in the final dataset. Only
            used when synthetic_root_dir is given. The count is capped at the
            number of synthetic images available; ratio >= 1 uses all of them.
        resolution: Side length every image is resized to.
        random_state: Seed for choosing which synthetic images are kept.

    Raises:
        ValueError: If synthetic images are requested with a negative ratio.
    """

    IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg")

    def __init__(self, normal_root_dir:str, cnv_root_dir:str, synthetic_root_dir:str = None, ratio:float = 0.0, resolution=512, random_state = 42) -> None:

        super().__init__()

        # Resize -> tensor in [0, 1] -> shift/scale the single channel to [-1, 1].
        self.transform = transforms.Compose([
            transforms.Resize((resolution, resolution), transforms.InterpolationMode.LANCZOS),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5]),
        ])

        normal_imgs = self._list_images(normal_root_dir)
        cnv_imgs = self._list_images(cnv_root_dir)

        self.imgs = normal_imgs + cnv_imgs
        self.labels = [0] * len(normal_imgs) + [1] * len(cnv_imgs)

        if synthetic_root_dir is not None:
            real_img_size = len(self.imgs)
            synthetic_imgs = shuffle(self._list_images(synthetic_root_dir), random_state=random_state)
            keep = self._synthetic_count(real_img_size, len(synthetic_imgs), ratio)
            synthetic_imgs = synthetic_imgs[:keep]

            self.imgs = self.imgs + synthetic_imgs
            self.labels = self.labels + [1] * len(synthetic_imgs)

    @classmethod
    def _list_images(cls, root_dir: str) -> list:
        """Return the sorted paths of the image files directly inside root_dir."""
        # Sorted so that the seeded shuffle (and any index-based split made by
        # the caller) does not depend on the filesystem's listing order.
        return sorted(
            os.path.join(root_dir, f)
            for f in os.listdir(root_dir)
            if f.lower().endswith(cls.IMAGE_EXTENSIONS)
        )

    @staticmethod
    def _synthetic_count(real_count: int, available: int, ratio: float) -> int:
        """Number of synthetic images needed so they form `ratio` of the total.

        Solves n / (real_count + n) = ratio for n, truncated to an int and
        capped at `available`.
        """
        if ratio < 0:
            raise ValueError(f"ratio must be non-negative, got {ratio}")
        if ratio >= 1:
            # The formula diverges at ratio == 1 (division by zero); the
            # closest realisable dataset uses every synthetic image available.
            return available
        return min(available, int(real_count * ratio / (1 - ratio)))

    def __len__(self) -> int:
        """Total number of images (real plus selected synthetic)."""
        return len(self.labels)

    def __getitem__(self, idx:int) -> tuple:
        """Load image `idx` as single-channel and return (transformed image, label)."""
        img = Image.open(self.imgs[idx]).convert("L")
        img = self.transform(img)
        lbl = self.labels[idx]

        return img, lbl

class Classifier(nn.Module):
    """Small CNN for binary classification of single-channel images.

    Two conv + max-pool stages are followed by an adaptive average pool to a
    64x64 grid and two fully connected layers. forward() returns one
    probability per sample (sigmoid output), suitable for nn.BCELoss.
    """

    def __init__(self) -> None:

        super().__init__()

        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=1, padding="same")
        self.maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding="same")
        self.maxpool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        # fc1 expects a 32 x 64 x 64 feature map, which the two 2x max-pools
        # only produce for 256x256 inputs. A 512x512 input leaves 128x128 and
        # would fail with a shape mismatch in fc1. Pooling to a fixed 64x64
        # grid makes the head independent of the input resolution; it is the
        # identity for 256x256 inputs and adds no parameters, so state_dict
        # keys are unchanged.
        self.adaptive_pool = nn.AdaptiveAvgPool2d((64, 64))

        self.fc1 = nn.Linear(64 * 64 * 32, 512)
        self.fc2 = nn.Linear(512, 1)

    def forward(self, x:torch.Tensor) -> torch.Tensor:
        """Map a (batch, 1, H, W) tensor to a (batch,) tensor of probabilities."""

        x = F.relu(self.conv1(x))
        x = self.maxpool1(x)

        x = F.relu(self.conv2(x))
        x = self.maxpool2(x)

        x = self.adaptive_pool(x)

        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        # torch.sigmoid replaces the deprecated F.sigmoid alias.
        x = torch.sigmoid(x)

        return x.squeeze(1)

# 2. Classifier Training and Validation based on Synthetic Ratio

In [None]:
def train_classifier_with_ratios():
    """Train one classifier per synthetic-data ratio and report validation metrics.

    For every ratio in np.arange(0, 1.1, 0.1) a fresh Classifier is trained
    for 100 epochs on 80% of a ClassifierDataset built with that ratio, then
    evaluated once on the remaining 20%. Each model's weights are saved to
    classifiers/model_ratio_{ratio}.pth. A summary is printed and written to
    val_classifier_results.csv.

    Returns:
        None
    """

    # Define batch parameters
    batch_size = 128
    normal_root_dir = "/content/processed/Normal/"
    cnv_root_dir = "/content/processed/CNV/"
    synthetic_root_dir = "/content/synthetic_cnv/"
    resolution = 512
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_epochs = 100
    # Pinned host memory only helps host-to-GPU copies; skip it on CPU.
    pin_memory = device.type == "cuda"

    ratios = np.arange(0, 1.1, 0.1)
    results = []

    os.makedirs("classifiers", exist_ok=True)

    for ratio in ratios:

        # Train and evaluate model
        model = Classifier()
        dataset = ClassifierDataset(normal_root_dir, cnv_root_dir, synthetic_root_dir, ratio, resolution, random_state=seed)

        # A dedicated, seeded generator makes the 80/20 split reproducible
        # regardless of how much of the global RNG stream was already used.
        n_train = int(len(dataset) * 0.8)
        split_rng = torch.Generator().manual_seed(seed)
        train_dataset, val_dataset = torch.utils.data.random_split(
            dataset, [n_train, len(dataset) - n_train], generator=split_rng
        )
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=pin_memory, num_workers=4)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, pin_memory=pin_memory, num_workers=4)

        optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
        # The model already applies a sigmoid, hence BCELoss on probabilities.
        criterion = nn.BCELoss()

        model.to(device)

        for epoch in range(num_epochs):

            model.train()

            for images, labels in train_loader:

                images = images.to(device)
                labels = labels.float().to(device)

                optimizer.zero_grad()
                outputs = model(images)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

        # Single validation pass after the final epoch.
        model.eval()

        y_true = []
        y_prob = []

        with torch.no_grad():
            for images, labels in val_loader:
                outputs = model(images.to(device))
                y_prob.extend(outputs.cpu().numpy())
                y_true.extend(labels.numpy())

        y_true = np.array(y_true)
        # Threshold the predicted probabilities at 0.5 to get class labels.
        y_pred = (np.array(y_prob) > 0.5).astype(int)

        results.append({
            'ratio': ratio,
            'accuracy': accuracy_score(y_true, y_pred),
            'f1_score': f1_score(y_true, y_pred, average='macro'),
            'precision': precision_score(y_true, y_pred, average='macro'),
            'recall': recall_score(y_true, y_pred, average='macro')
        })

        # The file name embeds the raw float repr of `ratio`; whatever loads
        # these checkpoints must build the name from the same np.arange values.
        torch.save(model.state_dict(), f"classifiers/model_ratio_{ratio}.pth")

        # Drop the references before cleaning up, otherwise the model and the
        # optimizer state are still alive and nothing can be released.
        del model, optimizer
        clear_memory()

    # Print results
    print("\nValidation Results Summary:")
    for res in results:
        print(f"Ratio: {res['ratio']*100:.0f}% | Accuracy: {res['accuracy']:.4f} | F1 Score: {res['f1_score']:.4f} | Precision: {res['precision']:.4f} | Recall: {res['recall']:.4f}")

    best_result = max(results, key=lambda x: x['accuracy'])
    print(f"\nBest Ratio: {best_result['ratio']*100:.0f}%")
    print(f"Accuracy: {best_result['accuracy']:.4f}")
    print(f"F1 Score: {best_result['f1_score']:.4f}")
    print(f"Precision: {best_result['precision']:.4f}")
    print(f"Recall: {best_result['recall']:.4f}")

    # Save results to CSV
    pd.DataFrame(results).to_csv('val_classifier_results.csv', index=False)
    print("Results saved to val_classifier_results.csv")

# Run the ratio sweep: trains, validates and saves one classifier per synthetic-data ratio
train_classifier_with_ratios()

# 3. Apply Models to Test Set

In [None]:
def test_classifier_with_ratios():
    """Evaluate every saved per-ratio classifier on the real test set.

    For every ratio in np.arange(0, 1.1, 0.1) the weights stored in
    classifiers/model_ratio_{ratio}.pth are loaded into a fresh Classifier and
    scored on the images in /content/testset/ (no synthetic images). A summary
    is printed and written to test_classifier_results.csv.

    Returns:
        None
    """

    # Define batch parameters
    batch_size = 128
    normal_root_dir = "/content/testset/Normal/"
    cnv_root_dir = "/content/testset/CNV/"

    resolution = 512
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Must generate the same values as the training sweep, because the
    # checkpoint file names embed the raw float repr of each ratio.
    ratios = np.arange(0, 1.1, 0.1)
    results = []

    # The test data is identical for every ratio, so build it once.
    dataset = ClassifierDataset(normal_root_dir, cnv_root_dir, synthetic_root_dir = None, ratio = 0.0, resolution = resolution, random_state=seed)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, pin_memory=(device.type == "cuda"), num_workers=4)

    for ratio in ratios:

        # Load trained model. load_state_dict() updates the module in place and
        # returns a report of missing/unexpected keys, not the module, so its
        # result must not be assigned back to `model`. map_location lets
        # GPU-trained weights load on a CPU-only machine.
        model = Classifier()
        state_dict = torch.load(f"classifiers/model_ratio_{ratio}.pth", map_location=device)
        model.load_state_dict(state_dict)

        model.to(device)
        model.eval()

        y_true = []
        y_prob = []

        with torch.no_grad():
            for images, labels in loader:
                outputs = model(images.to(device))
                y_prob.extend(outputs.cpu().numpy())
                y_true.extend(labels.numpy())

        y_true = np.array(y_true)
        # Threshold the predicted probabilities at 0.5 to get class labels.
        y_pred = (np.array(y_prob) > 0.5).astype(int)

        results.append({
            'ratio': ratio,
            'accuracy': accuracy_score(y_true, y_pred),
            'f1_score': f1_score(y_true, y_pred, average='macro'),
            'precision': precision_score(y_true, y_pred, average='macro'),
            'recall': recall_score(y_true, y_pred, average='macro')
        })

        # Drop the references first so the cleanup can actually free the model.
        del model, state_dict
        clear_memory()

    # Print results
    print("\nTest Results Summary:")
    for res in results:
        print(f"Ratio: {res['ratio']*100:.0f}% | Accuracy: {res['accuracy']:.4f} | F1 Score: {res['f1_score']:.4f} | Precision: {res['precision']:.4f} | Recall: {res['recall']:.4f}")

    best_result = max(results, key=lambda x: x['accuracy'])
    print(f"\nBest Ratio: {best_result['ratio']*100:.0f}%")
    print(f"Accuracy: {best_result['accuracy']:.4f}")
    print(f"F1 Score: {best_result['f1_score']:.4f}")
    print(f"Precision: {best_result['precision']:.4f}")
    print(f"Recall: {best_result['recall']:.4f}")

    # Save results to CSV
    pd.DataFrame(results).to_csv('test_classifier_results.csv', index=False)
    print("Results saved to test_classifier_results.csv")

# Run the function (the original cell re-ran the training sweep here instead)
test_classifier_with_ratios()