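We start with a baseline CNN inspired by LeNet-5 and then vary its architecture (activation, pooling, dropout, depth). For this initial investigation, I want to visualize the improvement each change brings without considering any domain shift, so all experiments use a single domain: the canine lymphoma / 3D Histech / VMU Vienna subset, chosen because it has the largest set of annotations.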

In [1]:
# Code adapted from Machine Learning Engineering (Cornell Tech 2025)
import torch
import numpy as np
import random

def seed_everything(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's legacy global generator, and PyTorch's
    CPU and CUDA generators with the same value, then configures cuDNN for
    deterministic behaviour.

    Args:
        seed: Integer seed shared by all generators (default 42).
    """
    # Same seed for each library so a run can be reproduced from one number.
    for seed_fn in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)

    # Disable the cuDNN autotuner and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


seed_everything(42)

In [2]:
from google.colab import drive
import sys
import os

# Make Google Drive available under /content/drive
drive.mount('/content/drive')

# --- Path Definition ---
# Drive folder that holds loaders.py and the annotation CSV
PROJECT_DIR = '/content/drive/MyDrive/GoogleColab/dataformidogcnn'

# Put the project folder on sys.path (only once) so `import loaders` resolves
if PROJECT_DIR in sys.path:
    print("Project directory already in system path.")
else:
    sys.path.append(PROJECT_DIR)
    print(f"✅ Added {PROJECT_DIR} to Python system path.")

Mounted at /content/drive
✅ Added /content/drive/MyDrive/GoogleColab/dataformidogcnn to Python system path.


In [3]:
import pandas as pd
from loaders import create_loaders # This should now work

# Location of the annotation table inside the project folder
CSV_PATH = os.path.join(PROJECT_DIR, 'processed_annotations_with_patch_id.csv')

# Read the annotations into df_master.
# NOTE(review): despite the wording of the error message, a missing file does
# not stop execution here; df_master is set to None and the loader cell checks
# for that before building the DataLoaders.
try:
    df_master = pd.read_csv(CSV_PATH)
except FileNotFoundError:
    print(f"❌ CRITICAL ERROR: File not found at {CSV_PATH}. Execution stopping here.")
    df_master = None  # keeps the name defined so the next cell does not hit a NameError
else:
    print(f"✅ df_master loaded successfully. Total rows: {len(df_master)}")

✅ df_master loaded successfully. Total rows: 26286


In [4]:
# The image patches are expected under the same Drive folder as loaders.py and
# the CSV. Note this re-binds PROJECT_DIR with a trailing slash.
PROJECT_DIR = '/content/drive/MyDrive/GoogleColab/dataformidogcnn/'


if 'df_master' not in locals() or df_master is None:
    # The CSV cell failed (or was never run): nothing to build loaders from.
    print("❌ Cannot create DataLoaders: df_master was not loaded. Rerun previous cells.")
else:
    # Restrict the experiment to a single tumor type / scanner / origin.
    custom_filter = dict(
        Tumor='canine lymphoma',
        Scanner='3D Histech',
        Origin='VMU Vienna',
    )

    # Build the train / validation / test loaders from the master DataFrame.
    train_loader, val_loader, test_loader = create_loaders(
        df_master,
        patch_dir=PROJECT_DIR,
        filters=custom_filter,
    )

    print("✅ DataLoaders created successfully.")


Applying filters: {'Tumor': 'canine lymphoma', 'Scanner': '3D Histech', 'Origin': 'VMU Vienna'}
Original size: 26286. Filtered size: 8216.

--- Data Split Summary ---
Original Total Annotations: 8216
Train Annotations: 5750 (70.0%)
Validation Annotations: 1233 (15.0%)
Test Annotations: 1233 (15.0%)

Train/Validation/Test DataLoaders created successfully.
Train Loader batch size: 32
Train Loader output shape: [Batch_Size, 3, 50, 50] (Verification)

Verification Batch 1:
  Image Batch Shape: torch.Size([32, 3, 60, 60])
  Labels in Batch (first 5): tensor([0, 0, 1, 0, 0])
✅ DataLoaders created successfully.


In [5]:
# Pull a single batch from the training loader to inspect what it yields.
batch = next(iter(train_loader))
# Display the shape of the tensor stored under the batch's 'image' key.
batch['image'].shape

torch.Size([32, 3, 60, 60])

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from typing import Tuple, List

# Helper function to compute accuracy for a single data loader
def _compute_accuracy(model: nn.Module, dataloader: torch.utils.data.DataLoader) -> Tuple[int, int]:
    """Computes total correct predictions and total samples for a given dataloader."""
    total_correct = 0
    total_samples = 0

    for batch in dataloader:
        # Robust Batch Unpacking: Try to handle common formats
        if isinstance(batch, dict):
            # Assuming dictionary with 'image' and 'label' keys
            X_batch = batch.get('image')
            y_batch = batch.get('label')
            if X_batch is None or y_batch is None:
                 raise ValueError("DataLoader yields a dict, but 'image' or 'label' keys are missing.")
        elif isinstance(batch, (list, tuple)):
            # Assuming standard tuple (input, target).
            # We enforce X_batch to be the first element, regardless of its content type for now.
            X_batch, y_batch = batch[0], batch[1]
        else:
             raise TypeError(f"Unexpected batch type: {type(batch)}. Expected tuple, list, or dict.")

        # CRITICAL CHECK: Ensure the input to the model is a Tensor
        if not isinstance(X_batch, torch.Tensor):
             print(f"--- DEBUG: Batch input type is {type(X_batch)}. Expected torch.Tensor. ---")
             # If X_batch is a string (e.g., a path), the model will crash.
             # This confirms the source of the TypeError.
             raise TypeError(f"Input to model is not a Tensor. Type received: {type(X_batch)}.")

        outputs = model(X_batch)
        _, predicted = torch.max(outputs, axis=1)

        total_correct += (predicted == y_batch).sum().item()
        total_samples += y_batch.shape[0]

    return total_correct, total_samples

def evaluate(model: nn.Module, trainloader: torch.utils.data.DataLoader, testloader: torch.utils.data.DataLoader) -> Tuple[float, float]:
    """Return the model's accuracy on two loaders.

    Switches ``model`` to eval mode (it is left in eval mode afterwards) and
    runs both passes with gradient tracking disabled.

    Args:
        model: Classifier to score.
        trainloader: Loader whose accuracy is returned first.
        testloader: Loader whose accuracy is returned second.

    Returns:
        ``(train_accuracy, test_accuracy)`` as fractions in [0, 1]; a loader
        that yields no samples scores 0.0.
    """
    def _fraction(n_correct: int, n_samples: int) -> float:
        # Guard against an empty loader instead of dividing by zero.
        return n_correct / n_samples if n_samples > 0 else 0.0

    model.eval()

    with torch.no_grad():
        train_counts = _compute_accuracy(model, trainloader)
        test_counts = _compute_accuracy(model, testloader)

    return _fraction(*train_counts), _fraction(*test_counts)

#

def plot_learning_curves(epoch_points: List[int], train_acc_points: List[float], test_acc_points: List[float]):
    """Draw training and test accuracy against epoch on one figure and show it.

    Args:
        epoch_points: Epoch numbers at which accuracy was measured (x axis).
        train_acc_points: Training accuracy at each of those epochs.
        test_acc_points: Test accuracy at each of those epochs.
    """
    _, ax = plt.subplots(figsize=(10, 6))

    # One line per split, with distinct markers so the curves stay
    # distinguishable where they overlap.
    ax.plot(epoch_points, train_acc_points, label='Training Accuracy', marker='o')
    ax.plot(epoch_points, test_acc_points, label='Test Accuracy', marker='x')

    ax.set_title('Learning Curves: Accuracy Over Epochs')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Accuracy')
    ax.legend()
    ax.grid(True)
    plt.show()

def train_CNN(model: nn.Module, trainloader: torch.utils.data.DataLoader, testloader: torch.utils.data.DataLoader, num_epochs: int = 50, plot_interval: int = 10, lr: float = 0.01) -> Tuple[float, float]:
    """Train ``model`` with SGD (momentum 0.9) on cross-entropy loss and track accuracy.

    Accuracy on both loaders is measured before training (epoch 0), every
    ``plot_interval`` epochs, and after the last epoch. When
    ``plot_interval > 0`` each measurement is printed and a learning-curve plot
    is shown at the end.

    Args:
        model: Classifier producing per-class scores; may contain lazy layers.
        trainloader: Batches used for optimisation and for the "Train Acc" figure.
            Each batch is a dict with 'image'/'label' entries or an
            (inputs, targets) list/tuple.
        testloader: Held-out batches used for the "Test Acc" figure.
        num_epochs: Number of passes over ``trainloader``.
        plot_interval: Evaluate every this many epochs; 0 evaluates only at the
            start and the end; values <= 0 also silence the per-evaluation
            prints and the plot.
        lr: SGD learning rate.

    Returns:
        ``(train_accuracy, test_accuracy)`` from the last evaluation.

    Raises:
        ValueError: A dict batch lacks the 'image' or 'label' entry.
        TypeError: A batch has an unsupported type, or its inputs are not a tensor.
    """
    criterion = nn.CrossEntropyLoss()

    epoch_points = []
    train_acc_points = []
    test_acc_points = []

    # ----------------- Initial Evaluation (Epoch 0) -----------------
    # evaluate() switches the model to eval mode itself. This first pass also
    # acts as the "dry run" that materialises any lazy layers.
    train_acc, test_acc = evaluate(model, trainloader, testloader)

    epoch_points.append(0)
    train_acc_points.append(train_acc)
    test_acc_points.append(test_acc)

    if plot_interval > 0:
        print(f"Epoch 0 (Initial): Train Acc = {train_acc:.4f}, Test Acc = {test_acc:.4f}")

    # Attach the optimizer only after the dry run above, as the PyTorch
    # lazy-module documentation recommends, so it is built over materialised
    # parameters rather than uninitialised placeholders.
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)

    # ----------------- TRAINING LOOP -----------------
    for epoch in range(1, num_epochs + 1):

        model.train()

        for batch in trainloader:
            # Accept the same batch layouts, with the same errors, as the
            # accuracy helper so training and evaluation fail consistently.
            if isinstance(batch, dict):
                X_batch = batch.get('image')
                y_batch = batch.get('label')
                if X_batch is None or y_batch is None:
                    raise ValueError("DataLoader yields a dict, but 'image' or 'label' keys are missing.")
            elif isinstance(batch, (list, tuple)):
                X_batch, y_batch = batch[0], batch[1]
            else:
                raise TypeError(f"Unexpected batch type yielded by trainloader: {type(batch)}.")

            # Input must be a tensor (e.g. not a file path left un-decoded).
            if not isinstance(X_batch, torch.Tensor):
                raise TypeError(f"Input image data is not a Tensor. Type received: {type(X_batch)}. "
                                "Check your DataLoader's collate_fn or Dataset's __getitem__ method.")

            # --- Standard Training Steps ---
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # ----------------- EVALUATION & STORAGE -----------------
        # Always evaluate after the final epoch so the returned numbers
        # describe the fully trained model.
        if (plot_interval != 0 and epoch % plot_interval == 0) or epoch == num_epochs:
            train_acc, test_acc = evaluate(model, trainloader, testloader)

            epoch_points.append(epoch)
            train_acc_points.append(train_acc)
            test_acc_points.append(test_acc)

            if plot_interval > 0:
                print(f"Epoch {epoch}: Train Acc = {train_acc:.4f}, Test Acc = {test_acc:.4f}")

    # ----------------- FINAL PLOT -----------------
    if plot_interval > 0:
        plot_learning_curves(epoch_points, train_acc_points, test_acc_points)

    print(f"\nTraining Complete. Final Train Acc: {train_acc:.4f}, Final Test Acc: {test_acc:.4f}")

    return train_acc, test_acc

In [7]:
def init_weights(module):
    """Xavier-uniform initialise the weight of a linear or 2-D convolution layer.

    Intended for ``nn.Module.apply``. Modules of any other type are left
    untouched; biases are never modified.

    Lazy layers (``nn.LazyLinear`` / ``nn.LazyConv2d``) have no weight tensor
    until their first forward pass, and they are not of exact type
    ``nn.Linear`` / ``nn.Conv2d`` until then, so a plain type check skips them
    and they keep PyTorch's default init. For those layers the Xavier init is
    deferred to a one-shot forward pre-hook that fires on the first call, after
    the layer has materialised its parameters.

    Args:
        module: Any ``nn.Module``.
    """
    if type(module) in (nn.Linear, nn.Conv2d):
        nn.init.xavier_uniform_(module.weight)
    elif isinstance(module, (nn.LazyLinear, nn.LazyConv2d)):
        def _xavier_after_materialization(lazy_module, _inputs):
            # Pre-hooks run in registration order, so the layer's own
            # shape-inference hook has already created the weight here.
            nn.init.xavier_uniform_(lazy_module.weight)
            # Run once only: later forward passes must not reset the weights.
            hook_handle.remove()

        hook_handle = module.register_forward_pre_hook(_xavier_after_materialization)

In [8]:
from torch import nn

class lenet5(nn.Module):
    """LeNet-5-style baseline: sigmoid activations and average pooling.

    Takes 3-channel image batches and returns 2 class scores per sample. The
    fully connected layers are lazy, so their input width is inferred from the
    first batch passed through the network.
    """

    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            # Block 1: 5x5 conv (padding keeps the spatial size) -> sigmoid -> 2x2 average pool
            nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5, padding=2),
            nn.Sigmoid(),
            nn.AvgPool2d(kernel_size=2, stride=2),
            # Block 2: 5x5 conv -> sigmoid -> 2x2 average pool
            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
            # Fix: this activation was missing. Without it, conv -> average pool
            # -> flatten -> linear is a chain of affine maps, so the second
            # convolution added no expressive power.
            nn.Sigmoid(),
            nn.AvgPool2d(kernel_size=2, stride=2),
            nn.Flatten(),
            # Classifier head: 120 -> 84 -> 2
            nn.LazyLinear(120),
            nn.Sigmoid(),
            nn.LazyLinear(84),
            nn.Sigmoid(),
            nn.LazyLinear(2),
        )
        # Run the notebook's weight initialiser over every submodule.
        self.model.apply(init_weights)

    def forward(self, x):
        """Return the class scores for the image batch ``x``."""
        return self.model(x)

In [9]:
# Results registry filled in by the training cells below: one
# (train accuracy, validation accuracy) tuple per trained model.
accuracies = {}

In [None]:
# Train the LeNet-5 baseline and record its final accuracies.
# val_loader is passed as train_CNN's `testloader`, so the "Test Acc" that
# train_CNN prints and plots is validation accuracy here.
model = lenet5()
trainacc, valacc = train_CNN(model, train_loader, val_loader)
accuracies["lenet5"] = (trainacc, valacc)

In [None]:
class alexnet(nn.Module):
    """AlexNet-style CNN with ReLU, max pooling and dropout; returns 2 class scores.

    All convolution and linear layers are lazy, so channel counts and the
    classifier input width are inferred from the first batch.

    The stride-4 stem followed by three 3x3/stride-2 pools needs inputs of at
    least 65x65 pixels; anything smaller (for example 60x60 patches) reaches
    the last pool as a 2x2 map and PyTorch raises "Output size is too small".
    To keep such inputs usable, ``forward`` bilinearly upsamples them to
    224x224 first. Inputs that are already large enough pass through unchanged.
    """

    # Smallest height/width that survives the conv/pool stack (see class docstring).
    _MIN_INPUT_SIDE = 65
    # Resolution small inputs are resized to before entering the network.
    _UPSAMPLED_SIDE = 224

    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.LazyConv2d(96, kernel_size=11, stride=4, padding=1),
            nn.ReLU(), nn.MaxPool2d(kernel_size=3, stride=2),
            nn.LazyConv2d(256, kernel_size=5, padding=2), nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.LazyConv2d(384, kernel_size=3, padding=1), nn.ReLU(),
            nn.LazyConv2d(384, kernel_size=3, padding=1), nn.ReLU(),
            nn.LazyConv2d(256, kernel_size=3, padding=1), nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2), nn.Flatten(),
            nn.LazyLinear(4096), nn.ReLU(), nn.Dropout(p=0.5),
            nn.LazyLinear(4096), nn.ReLU(), nn.Dropout(p=0.5),
            nn.LazyLinear(2)
            )
        # Run the notebook's weight initialiser over every submodule.
        self.model.apply(init_weights)

    def forward(self, x):
        """Return class scores for ``x``, upsampling it first if it is too small."""
        if min(x.shape[-2:]) < self._MIN_INPUT_SIDE:
            x = nn.functional.interpolate(
                x,
                size=(self._UPSAMPLED_SIDE, self._UPSAMPLED_SIDE),
                mode="bilinear",
                align_corners=False,
            )
        return self.model(x)

In [None]:
# Train the AlexNet-style model and record its final accuracies.
# val_loader is passed as train_CNN's `testloader`, so the reported "Test Acc"
# is validation accuracy here.
model = alexnet()
trainacc, valacc = train_CNN(model, train_loader, val_loader)
accuracies["alexnet"] = (trainacc, valacc)

In [None]:
class lenet5_relu_max_dropout(nn.Module):
    """LeNet-5-style network with ReLU, max pooling and dropout (p=0.5) in the head.

    Takes 3-channel image batches and returns 2 class scores per sample. The
    fully connected layers are lazy, so their input width is inferred from the
    first batch passed through the network.
    """

    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            # Block 1: 5x5 conv (padding keeps the spatial size) -> ReLU -> 2x2 max pool
            nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # Block 2: 5x5 conv -> ReLU -> 2x2 max pool
            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
            # Fix: the second conv block was missing the activation that the
            # first block (and LeNet-5) has after each convolution.
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Flatten(),
            # Classifier head: 120 -> 84 -> 2, dropout after each hidden layer
            nn.LazyLinear(120),
            nn.ReLU(), nn.Dropout(p=0.5),
            nn.LazyLinear(84),
            nn.ReLU(), nn.Dropout(p=0.5),
            nn.LazyLinear(2),
        )
        # Run the notebook's weight initialiser over every submodule.
        self.model.apply(init_weights)

    def forward(self, x):
        """Return the class scores for the image batch ``x``."""
        return self.model(x)

In [None]:
# Train the ReLU + max-pool + dropout variant and record its final accuracies.
# val_loader is passed as train_CNN's `testloader`, so the reported "Test Acc"
# is validation accuracy here.
model = lenet5_relu_max_dropout()
trainacc, valacc = train_CNN(model, train_loader, val_loader)
accuracies["lenet5_relu_max_dropout"] = (trainacc, valacc)

In [None]:
class lenet5_relu(nn.Module):
    """LeNet-5-style network with ReLU activations and average pooling.

    Takes 3-channel image batches and returns 2 class scores per sample. The
    fully connected layers are lazy, so their input width is inferred from the
    first batch passed through the network.
    """

    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            # Block 1: 5x5 conv (padding keeps the spatial size) -> ReLU -> 2x2 average pool
            nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.AvgPool2d(kernel_size=2, stride=2),
            # Block 2: 5x5 conv -> ReLU -> 2x2 average pool
            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
            # Fix: this activation was missing. Without it, conv -> average pool
            # -> flatten -> linear is a chain of affine maps, so the second
            # convolution added no expressive power.
            nn.ReLU(),
            nn.AvgPool2d(kernel_size=2, stride=2),
            nn.Flatten(),
            # Classifier head: 120 -> 84 -> 2
            nn.LazyLinear(120),
            nn.ReLU(),
            nn.LazyLinear(84),
            nn.ReLU(),
            nn.LazyLinear(2),
        )
        # Run the notebook's weight initialiser over every submodule.
        self.model.apply(init_weights)

    def forward(self, x):
        """Return the class scores for the image batch ``x``."""
        return self.model(x)

In [None]:
# Train the ReLU variant and record its final accuracies.
# val_loader is passed as train_CNN's `testloader`, so the reported "Test Acc"
# is validation accuracy here.
model = lenet5_relu()
trainacc, valacc = train_CNN(model, train_loader, val_loader)
accuracies["lenet5_relu"] = (trainacc, valacc)

In [None]:
class lenet5_dropout(nn.Module):
    """LeNet-5-style network with sigmoid, average pooling and dropout (p=0.5) in the head.

    Takes 3-channel image batches and returns 2 class scores per sample. The
    fully connected layers are lazy, so their input width is inferred from the
    first batch passed through the network.
    """

    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            # Block 1: 5x5 conv (padding keeps the spatial size) -> sigmoid -> 2x2 average pool
            nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5, padding=2),
            nn.Sigmoid(),
            nn.AvgPool2d(kernel_size=2, stride=2),
            # Block 2: 5x5 conv -> sigmoid -> 2x2 average pool
            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
            # Fix: this activation was missing. Without it, conv -> average pool
            # -> flatten -> linear is a chain of affine maps, so the second
            # convolution added no expressive power.
            nn.Sigmoid(),
            nn.AvgPool2d(kernel_size=2, stride=2),
            nn.Flatten(),
            # Classifier head: 120 -> 84 -> 2, dropout after each hidden layer
            nn.LazyLinear(120),
            nn.Sigmoid(), nn.Dropout(p=0.5),
            nn.LazyLinear(84),
            nn.Sigmoid(), nn.Dropout(p=0.5),
            nn.LazyLinear(2),
        )
        # Run the notebook's weight initialiser over every submodule.
        self.model.apply(init_weights)

    def forward(self, x):
        """Return the class scores for the image batch ``x``."""
        return self.model(x)

In [None]:
# Train the dropout variant and record its final accuracies.
# val_loader is passed as train_CNN's `testloader`, so the reported "Test Acc"
# is validation accuracy here.
model = lenet5_dropout()
trainacc, valacc = train_CNN(model, train_loader, val_loader)
accuracies["lenet5_dropout"] = (trainacc, valacc)

In [None]:
class lenet5_max(nn.Module):
    """LeNet-5-style network with sigmoid activations and max pooling.

    Takes 3-channel image batches and returns 2 class scores per sample. The
    fully connected layers are lazy, so their input width is inferred from the
    first batch passed through the network.
    """

    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            # Block 1: 5x5 conv (padding keeps the spatial size) -> sigmoid -> 2x2 max pool
            nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5, padding=2),
            nn.Sigmoid(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # Block 2: 5x5 conv -> sigmoid -> 2x2 max pool
            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
            # Fix: the second conv block was missing the activation that the
            # first block (and LeNet-5) has after each convolution.
            nn.Sigmoid(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Flatten(),
            # Classifier head: 120 -> 84 -> 2
            nn.LazyLinear(120),
            nn.Sigmoid(),
            nn.LazyLinear(84),
            nn.Sigmoid(),
            nn.LazyLinear(2),
        )
        # Run the notebook's weight initialiser over every submodule.
        self.model.apply(init_weights)

    def forward(self, x):
        """Return the class scores for the image batch ``x``."""
        return self.model(x)

In [None]:
# Train the max-pooling variant and record its final accuracies.
# val_loader is passed as train_CNN's `testloader`, so the reported "Test Acc"
# is validation accuracy here.
model = lenet5_max()
trainacc, valacc = train_CNN(model, train_loader, val_loader)
accuracies["lenet5_max"] = (trainacc, valacc)

In [None]:
class lenet5_deep(nn.Module):
    """Deeper five-conv network with average pooling and a sigmoid head; returns 2 class scores.

    All convolution and linear layers are lazy, so channel counts and the
    classifier input width are inferred from the first batch.

    The stride-4 stem followed by three 3x3/stride-2 pools needs inputs of at
    least 65x65 pixels; anything smaller (for example 60x60 patches) reaches
    the last pool as a 2x2 map and PyTorch raises "Output size is too small".
    To keep such inputs usable, ``forward`` bilinearly upsamples them to
    224x224 first. Inputs that are already large enough pass through unchanged.

    NOTE(review): the first conv is followed by Sigmoid while the remaining
    convs use ReLU; confirm this mix is intended.
    """

    # Smallest height/width that survives the conv/pool stack (see class docstring).
    _MIN_INPUT_SIDE = 65
    # Resolution small inputs are resized to before entering the network.
    _UPSAMPLED_SIDE = 224

    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.LazyConv2d(96, kernel_size=11, stride=4, padding=1),
            nn.Sigmoid(), nn.AvgPool2d(kernel_size=3, stride=2),
            nn.LazyConv2d(256, kernel_size=5, padding=2), nn.ReLU(),
            nn.AvgPool2d(kernel_size=3, stride=2),
            nn.LazyConv2d(384, kernel_size=3, padding=1), nn.ReLU(),
            nn.LazyConv2d(384, kernel_size=3, padding=1), nn.ReLU(),
            nn.LazyConv2d(256, kernel_size=3, padding=1), nn.ReLU(),
            nn.AvgPool2d(kernel_size=3, stride=2), nn.Flatten(),
            nn.LazyLinear(4096), nn.Sigmoid(),
            nn.LazyLinear(4096), nn.Sigmoid(),
            nn.LazyLinear(2)
            )
        # Run the notebook's weight initialiser over every submodule.
        self.model.apply(init_weights)

    def forward(self, x):
        """Return class scores for ``x``, upsampling it first if it is too small."""
        if min(x.shape[-2:]) < self._MIN_INPUT_SIDE:
            x = nn.functional.interpolate(
                x,
                size=(self._UPSAMPLED_SIDE, self._UPSAMPLED_SIDE),
                mode="bilinear",
                align_corners=False,
            )
        return self.model(x)

In [None]:
# Train the deep variant and record its final accuracies.
# val_loader is passed as train_CNN's `testloader`, so the reported "Test Acc"
# is validation accuracy here.
model = lenet5_deep()
trainacc, valacc = train_CNN(model, train_loader, val_loader)
# Fix: key the result by the model's name, like every other entry in
# `accuracies`, instead of by the module object itself.
accuracies["lenet5_deep"] = (trainacc, valacc)