In [None]:
import kagglehub

# Fetch the document-image dataset from Kaggle. The value returned here is
# used further down as the root directory handed to ImageFolder.
dataset_path = kagglehub.dataset_download(
    "shaz13/real-world-documents-collections"
)

In [None]:
import os

# Point dataset_path at the "docs-sm" sub-folder of the download.
# - os.path.join picks the right separator for the current OS (the previous
#   hard-coded backslash only produced a valid path on Windows).
# - The guard makes the cell idempotent: re-running it no longer appends
#   "docs-sm" a second time.
if os.path.basename(os.path.normpath(dataset_path)) != "docs-sm":
    dataset_path = os.path.join(dataset_path, "docs-sm")

In [None]:
import torch

from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split, Subset

# Steps shared by both pipelines: force every image to 320x320, convert it to
# a tensor, and apply the per-channel normalization factors used for ResNet-50.
_resize_step = transforms.Resize((320, 320))
_tensor_steps = [
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
]

# Training-style pipeline: light random augmentation between resize and
# tensor conversion.
transform = transforms.Compose([
    _resize_step,
    transforms.RandomHorizontalFlip(0.25),
    transforms.RandomRotation(5),
    transforms.RandomPerspective(distortion_scale=0.25, p=0.25),
    transforms.ColorJitter(brightness=0.25, contrast=0.25),
    *_tensor_steps,
])

# "Raw" pipeline: no augmentation, only the steps needed to put the data into
# the format the models expect.
raw_transform = transforms.Compose([_resize_step, *_tensor_steps])

# Two views over the same files; they differ only in the transform applied.
full_dataset_with_aug = datasets.ImageFolder(root=dataset_path, transform=transform)
full_dataset_raw = datasets.ImageFolder(root=dataset_path, transform=raw_transform)

# 90/10 train/test split, seeded so the split is reproducible.
seed = 42
_dataset_size = len(full_dataset_with_aug)
train_len = int(0.9 * _dataset_size)
test_len = _dataset_size - train_len

torch.manual_seed(seed)
train_set, test_set_with_aug = random_split(full_dataset_with_aug, [train_len, test_len])

# Re-use the exact test indices on the un-augmented dataset so both test sets
# contain the same files.
test_indices = test_set_with_aug.indices
test_set_raw = Subset(full_dataset_raw, test_indices)

# Create the DataLoaders (all three share the same settings).
_loader_options = dict(batch_size=8, shuffle=True)
train_dataloader = DataLoader(train_set, **_loader_options)
test_dataloader = DataLoader(test_set_with_aug, **_loader_options)
test_raw_dataloader = DataLoader(test_set_raw, **_loader_options)

In [None]:
import matplotlib.pyplot as plt

def unnormalize(img_tensor):
    """Undo the per-channel normalization applied by the transforms.

    Computes ``img_tensor * std + mean`` with the same mean/std constants the
    transforms in this notebook pass to ``transforms.Normalize``.

    Args:
        img_tensor: Tensor whose channel axis (size 3) is third from the end,
            e.g. ``(3, H, W)`` or ``(B, 3, H, W)``; the statistics are shaped
            ``(3, 1, 1)`` and rely on broadcasting.

    Returns:
        A new tensor of the same shape, on the same device as ``img_tensor``.
    """
    # Build the statistics on the input's device so the function also works
    # for tensors that live on the GPU.
    device = img_tensor.device
    mean = torch.tensor([0.485, 0.456, 0.406], device=device).view(3, 1, 1)
    std = torch.tensor([0.229, 0.224, 0.225], device=device).view(3, 1, 1)
    return img_tensor * std + mean

# Get one batch from the un-augmented test loader and undo the normalization
# so the images display with natural colours.
images, labels = next(iter(test_raw_dataloader))
images = unnormalize(images)

# Show at most 9 images. A batch can hold fewer than that (the loaders are
# built with batch_size=8), so cap the count instead of indexing past the end
# of the batch.
num_images = min(9, len(images))

# Smallest near-square grid that fits num_images.
cols = int(num_images ** 0.5)
if cols * cols < num_images:
    cols += 1
rows = -(-num_images // cols)  # ceiling division

# squeeze=False keeps `axes` two-dimensional even for a 1x1 grid.
fig, axes = plt.subplots(rows, cols, figsize=(cols * 4, rows * 4), squeeze=False)  # 4x4 inches per image

# Plot each image; grid cells without an image are simply left blank.
for i, ax in enumerate(axes.flat):
    ax.axis('off')
    if i >= num_images:
        continue
    img = images[i].permute(1, 2, 0).clamp(0, 1).numpy()  # Clamp to [0, 1] for safe display
    ax.imshow(img)
    # .item() renders the class index as a plain number instead of "tensor(k)".
    ax.set_title(f"Label: {labels[i].item()}")

plt.tight_layout()
plt.show()

In [None]:
import torch.nn.functional as F
import torch.nn as nn

class CNNClassifier(nn.Module):
    """Small dilated CNN followed by an MLP classification head.

    The feature extractor is six Conv -> BatchNorm -> ReLU units with one
    max-pool after the second unit, finished by global average pooling:

        unit 1: 3   -> 32,  7x7, stride 2      (halves H and W)
        unit 2: 32  -> 64,  3x3
        max-pool 2x2, stride 2                 (halves H and W again)
        unit 3: 64  -> 128, 3x3
        unit 4: 128 -> 128, 3x3, dilation 2
        unit 5: 128 -> 256, 3x3, dilation 4
        unit 6: 256 -> 384, 3x3, dilation 8
        adaptive average pool to 1x1           -> [B, 384, 1, 1]

    For the 320x320 inputs produced by this notebook's transforms the spatial
    size is 160x160 after unit 1 and 80x80 after the max-pool; the dilated
    units keep that size because their padding equals their dilation.

    Args:
        num_classes: Number of output logits produced by the head.
    """

    def __init__(self, num_classes=1):
        super().__init__()

        unit = self._conv_bn_relu

        # CNN stack. Layers are created in the same order as before, so the
        # indices inside the Sequential (and therefore state-dict keys) match.
        self.convolution_stack = nn.Sequential(
            *unit(3, 32, kernel_size=7, stride=2, padding=3),
            *unit(32, 64),
            nn.MaxPool2d(kernel_size=2, stride=2),
            *unit(64, 128),
            *unit(128, 128, dilation=2),
            *unit(128, 256, dilation=4),
            *unit(256, 384, dilation=8),
            nn.AdaptiveAvgPool2d((1, 1)),
        )

        # Classifier head: 384 pooled features -> 256 hidden -> num_classes.
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(384, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, num_classes),
        )

    @staticmethod
    def _conv_bn_relu(in_channels, out_channels, kernel_size=3, stride=1,
                      padding=None, dilation=1):
        """Return the (Conv2d, BatchNorm2d, ReLU) layers of one unit.

        When ``padding`` is omitted it defaults to ``dilation``, which keeps
        the spatial size unchanged for a 3x3 kernel with stride 1.
        """
        if padding is None:
            padding = dilation
        return (
            nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size,
                      stride=stride, padding=padding, dilation=dilation),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        )

    def forward(self, x):
        """Map an image batch ``[B, 3, H, W]`` to logits ``[B, num_classes]``."""
        features = self.convolution_stack(x)
        return self.fc(features)

In [None]:
import torch.optim as optim
from datasets import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score

def train(model, save_path, epochs=40, lr=1e-4, weight_decay=1e-4):
    """Train ``model`` on ``train_dataloader`` and checkpoint it every epoch.

    After each epoch the model is scored on ``test_dataloader`` with the
    notebook's ``eval`` function, the loss / accuracy / macro precision,
    recall and F1 are printed, and the whole model object is saved to
    ``<save_path>/Classifier-Epoch-<n>.pt``.

    Args:
        model: The ``nn.Module`` to train. It is moved to the CUDA device, so
            a GPU is required.
        save_path: Directory for the per-epoch checkpoints; created if it
            does not exist yet.
        epochs: Number of passes over ``train_dataloader``.
        lr: Adam learning rate.
        weight_decay: Adam weight-decay coefficient.
    """
    import os  # local import so this cell does not depend on another cell's imports

    # torch.save does not create missing directories, so make sure it exists.
    os.makedirs(save_path, exist_ok=True)

    # Training setup
    criterion = nn.CrossEntropyLoss()  # Cross Entropy Loss for classification
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    model.to("cuda")

    for epoch in range(epochs):
        # Put Dropout / BatchNorm into training behaviour at the start of
        # every epoch, regardless of the mode the evaluation pass left behind.
        model.train()

        train_loss_total = 0.0
        for inputs, labels in tqdm(train_dataloader):
            inputs = inputs.to("cuda")
            labels = labels.to("cuda")

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss_total += loss.item()

        # `eval` here is the notebook's evaluation function (it shadows the
        # Python built-in of the same name).
        test_loss_average, correct, total, all_preds, all_labels = eval(model, test_dataloader, criterion)

        precision_macro = precision_score(all_labels, all_preds, average='macro')
        recall_macro = recall_score(all_labels, all_preds, average='macro')
        f1_macro = f1_score(all_labels, all_preds, average='macro')

        # Guard the averages so an empty loader reports 0 instead of crashing.
        num_train_batches = len(train_dataloader)
        train_loss_average = train_loss_total / num_train_batches if num_train_batches else 0.0
        accuracy = correct / total if total else 0.0

        print(f"Epoch {epoch+1}, Train Loss: {train_loss_average:.4f}")
        print(f"Epoch {epoch+1}, Test Loss: {test_loss_average:.4f}")
        print(f"Test Accuracy: {accuracy:.2%}")
        print(f"Precision: {precision_macro:.4f}\t Recall: {recall_macro:.4f}\t F1: {f1_macro:.4f}")

        # os.path.join uses the platform's separator (the previous hard-coded
        # backslash only formed a directory path on Windows).
        # NOTE: this saves the full pickled model object, not just a state_dict.
        torch.save(model, os.path.join(save_path, f"Classifier-Epoch-{epoch + 1}.pt"))

def eval(model, dataloader, criterion):
    """Score ``model`` on every batch of ``dataloader`` without gradients.

    NOTE: the name shadows Python's built-in ``eval``; it is kept because
    other cells in this notebook call it under this name.

    The model is switched to evaluation mode for the duration of the call
    (so Dropout is disabled and BatchNorm uses its running statistics) and
    is put back into whatever mode it was in before returning.

    Args:
        model: The ``nn.Module`` to evaluate.
        dataloader: Iterable of ``(inputs, labels)`` batches.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.

    Returns:
        Tuple ``(average_loss, correct, total, all_preds, all_labels)``:
        mean per-batch loss (``nan`` if there were no batches), number of
        correct predictions, number of samples seen, and flat lists of the
        predicted and true class indices.
    """
    # Run on whichever device the model already lives on, rather than
    # assuming CUDA.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = torch.device("cpu")  # parameter-free model

    test_loss_total = 0.0
    correct = 0
    total = 0
    num_batches = 0

    all_preds = []
    all_labels = []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for inputs, labels in tqdm(dataloader):
                inputs = inputs.to(device)
                labels = labels.to(device)

                outputs = model(inputs)
                loss = criterion(outputs, labels)

                test_loss_total += loss.item()
                num_batches += 1

                preds = outputs.argmax(dim=1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)

                all_preds.extend(preds.cpu().tolist())
                all_labels.extend(labels.cpu().tolist())
    finally:
        # Restore the caller's mode even if a batch raised.
        model.train(was_training)

    average_loss = test_loss_total / num_batches if num_batches else float("nan")
    return average_loss, correct, total, all_preds, all_labels

In [None]:
import torch
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

def evaluate_and_plot(model):
    """Evaluate ``model`` on the un-augmented test set and plot a confusion matrix.

    Runs the notebook's ``eval`` function over ``test_raw_dataloader`` and
    draws the resulting confusion matrix, with one row/column per class of
    ``full_dataset_raw``.

    Args:
        model: The trained ``nn.Module`` to evaluate.
    """
    # Only the predictions and ground-truth labels are needed for the plot.
    _, _, _, all_preds, all_labels = eval(model, test_raw_dataloader, nn.CrossEntropyLoss())

    # Class names in ImageFolder's index order.
    class_names = full_dataset_raw.classes

    # Pass the full label set explicitly so the matrix always has one
    # row/column per class, even when a class is absent from this split;
    # otherwise its size would not match display_labels.
    cm = confusion_matrix(all_labels, all_preds, labels=list(range(len(class_names))))

    # Plot
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    fig, ax = plt.subplots(figsize=(10, 10))
    disp.plot(ax=ax, cmap='Blues', xticks_rotation=45)
    ax.set_title("Confusion Matrix")
    fig.tight_layout()
    plt.show()

In [1]:
import numpy as np

# We want the models to be roughly the same size so that, when we compare them,
# differences in performance reflect the architecture rather than the parameter count.

def model_size(model):
    """Return the number of trainable parameters in ``model``.

    Only parameters with ``requires_grad=True`` are counted; frozen
    parameters are ignored.

    Args:
        model: Any ``nn.Module``.

    Returns:
        The total element count as a plain Python ``int`` (0 for a module
        without trainable parameters).
    """
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [None]:
# Baseline experiment: the hand-written CNN, with one output per dataset class.
model = CNNClassifier(len(full_dataset_raw.classes))
print(model_size(model))

# Checkpoints are written under "CNNClassifier"; then plot the confusion matrix.
train(model, save_path="CNNClassifier")
evaluate_and_plot(model)

In [None]:
import torch.nn as nn
from torchvision.models import resnet50

# NOTE(review): newer torchvision releases prefer the `weights=` argument over
# `pretrained=True`; kept as-is because the installed version is not visible here.
model = resnet50(pretrained=True)

# The pretrained network's final layer (`fc`) is sized for its original
# training task, so replace it with a fresh linear layer that has one output
# per class of this dataset (taken from the dataset instead of hard-coding 16,
# matching how the other models in this notebook are configured).
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, len(full_dataset_raw.classes))
print(model_size(model))

train(model, "Resnet50")
evaluate_and_plot(model)


In [None]:
import torch
from vit_pytorch import ViT

# Vision Transformer configured for the 320x320 inputs used in this notebook.
model = ViT(
    image_size = 320,
    patch_size = 8,
    num_classes = len(full_dataset_raw.classes),
    dim = 128,
    depth = 8,
    heads = 4,
    mlp_dim = 128 * 4,
    dropout = 0.1,
    emb_dropout = 0.1
)
# Keep the original `vit` name bound as well, in case other code refers to it.
vit = model

# Previously this cell built the ViT as `vit` but then measured, trained and
# evaluated `model`, which was still the network left over from the previous
# cell. Everything below now operates on the ViT that was just created.
print(model_size(model))

train(model, "ViT")
evaluate_and_plot(model)

In [2]:
from vit_pytorch.nest import NesT
import torch.nn as nn

# NesT model from vit_pytorch, sized for the 320x320 inputs and with one
# output per dataset class.
model = NesT(
    image_size=320,
    patch_size=8,
    num_classes=len(full_dataset_raw.classes),
    dim=96,
    heads=4,
    num_hierarchies=3,
    block_repeats=(2, 2, 4),  # transformer blocks per hierarchy, bottom level first
    dropout=0.1,
)

print(model_size(model))

# Checkpoints go under "Nest"; afterwards plot the confusion matrix.
train(model, save_path="Nest")
evaluate_and_plot(model)

NameError: name 'full_dataset_raw' is not defined