# Advanced CV

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, Dataset
from torchvision import models
from transformers import (
    Trainer,
    TrainingArguments,
    ViTForImageClassification,
    ViTImageProcessor,
)

# Global notebook setup: silence all warnings and fix PyTorch's RNG seed.
warnings.filterwarnings("ignore")
torch.manual_seed(42)

# Prefer the GPU when PyTorch can see one; otherwise run everything on CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Device: {device}")

In [None]:
# Training pipeline: resize to 64x64 (an upscale of CIFAR-10's native 32x32,
# kept small for speed), random horizontal flip as augmentation, then map each
# RGB channel from [0, 1] to [-1, 1].
transform_train = transforms.Compose(
    [
        transforms.Resize((64, 64)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

# Evaluation pipeline: same resize and normalization, no augmentation.
transform_test = transforms.Compose(
    [
        transforms.Resize((64, 64)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

# CIFAR-10 splits, downloaded into ./data when not already present.
train_dataset = torchvision.datasets.CIFAR10(
    "./data", train=True, transform=transform_train, download=True
)
test_dataset = torchvision.datasets.CIFAR10(
    "./data", train=False, transform=transform_test, download=True
)

# Only the training batches are shuffled; evaluation order stays fixed.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=64)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=64)

# Short display names for the ten classes.
classes = "plane car bird cat deer dog frog horse ship truck".split()

print(f"Dataset: {len(train_dataset)} train, {len(test_dataset)} test")
print(f"Image size: 64x64, Classes: {len(classes)}")

In [3]:
def train_model(model, train_loader, epochs=3):
    """Train ``model`` with Adam and cross-entropy, printing per-epoch stats.

    Args:
        model: ``nn.Module`` mapping a batch of inputs to class logits. It is
            put in training mode and its parameters are updated in place.
        train_loader: Iterable yielding ``(data, target)`` tensor batches. It
            is iterated once per epoch and does not need to support ``len()``.
        epochs: Number of passes over ``train_loader``.

    Raises:
        ValueError: If an epoch yields no samples, because the average loss
            and the accuracy would be undefined.
    """
    model.train()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Move batches to wherever the model actually lives instead of relying on
    # a module-level global that may not match the model's placement.
    batch_device = next(model.parameters()).device

    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        correct = 0
        total = 0

        for data, target in train_loader:
            data, target = data.to(batch_device), target.to(batch_device)

            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1
            total += target.size(0)
            correct += (output.argmax(dim=1) == target).sum().item()

        if total == 0:
            raise ValueError("train_loader yielded no samples; nothing to train on")

        acc = 100.0 * correct / total
        # The reported loss is the mean of the per-batch losses.
        print(
            f"Epoch {epoch + 1}: Loss={total_loss / num_batches:.3f}, Acc={acc:.1f}%"
        )


def test_model(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and return top-1 accuracy.

    Args:
        model: ``nn.Module`` mapping a batch of inputs to class logits. It is
            left in evaluation mode when the function returns.
        test_loader: Iterable yielding ``(data, target)`` tensor batches.

    Returns:
        Accuracy as a percentage in ``[0, 100]``.

    Raises:
        ValueError: If ``test_loader`` yields no samples, because the accuracy
            would be undefined.
    """
    model.eval()

    # Move batches to wherever the model actually lives instead of relying on
    # a module-level global; parameterless modules fall back to the CPU.
    first_param = next(model.parameters(), None)
    batch_device = (
        first_param.device if first_param is not None else torch.device("cpu")
    )

    correct = 0
    total = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(batch_device), target.to(batch_device)
            predicted = model(data).argmax(dim=1)
            total += target.size(0)
            correct += (predicted == target).sum().item()

    if total == 0:
        raise ValueError("test_loader yielded no samples; accuracy is undefined")

    accuracy = 100.0 * correct / total
    print(f"Test Accuracy: {accuracy:.1f}%")
    return accuracy

## Transfer Learning: Frozen Backbone

In [None]:
class SimpleResNet(nn.Module):
    """ResNet-18 with pretrained weights and a new ``num_classes``-way head.

    Args:
        num_classes: Output size of the replacement fully connected layer.
        freeze_backbone: When true, every pretrained parameter has
            ``requires_grad`` disabled so only the new head is trained.
    """

    def __init__(self, num_classes=10, freeze_backbone=True):
        super().__init__()
        # The `pretrained=True` flag is deprecated in torchvision; the weights
        # enum is the supported API, and IMAGENET1K_V1 is the checkpoint that
        # `pretrained=True` used to load.
        self.resnet = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

        if freeze_backbone:
            for param in self.resnet.parameters():
                param.requires_grad = False
            # NOTE: freezing only disables gradients. BatchNorm layers still
            # update their running statistics whenever the module is in
            # training mode.

        # The head is swapped in after freezing, so its freshly created
        # parameters keep the default requires_grad=True and stay trainable.
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, num_classes)

    def forward(self, x):
        """Return the wrapped ResNet's output for the batch ``x``."""
        return self.resnet(x)


# Build the frozen-backbone model and place it on the selected device.
model_frozen = SimpleResNet(freeze_backbone=True).to(device)

# Count parameters in one pass: all of them, and the ones that will be trained.
total = 0
trainable = 0
for p in model_frozen.parameters():
    total += p.numel()
    if p.requires_grad:
        trainable += p.numel()

print(
    f"Trainable parameters: {trainable:,} / {total:,} ({100 * trainable / total:.1f}%)"
)

# Train for three epochs, then measure accuracy on the test loader.
train_model(model_frozen, train_loader, epochs=3)
frozen_acc = test_model(model_frozen, test_loader)

## ViT Fine-Tuning with the Hugging Face Trainer

In [None]:
class ViTDataset(Dataset):
    """Wrap an ``(image, label)`` dataset so items are dicts of model inputs.

    Each item is a dict with two entries:
      * ``"pixel_values"``: the processor's ``"pixel_values"`` output with
        dimension 0 squeezed out.
      * ``"labels"``: the label as a scalar ``torch.long`` tensor.
    """

    def __init__(self, dataset, processor):
        self.processor = processor
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        sample, target = self.dataset[idx]

        # The processor is fed PIL images, so tensor samples are converted.
        if isinstance(sample, torch.Tensor):
            # Undo the normalization first: [-1, 1] -> [0, 1]. This assumes
            # the wrapped dataset normalized with mean=0.5 and std=0.5.
            to_pil = transforms.ToPILImage()
            sample = to_pil((sample + 1) / 2)

        # Bring the image to the 224x224 size used for ViT.
        resized = sample.resize((224, 224))
        encoded = self.processor(images=resized, return_tensors="pt")

        pixel_values = encoded["pixel_values"].squeeze(0)
        labels = torch.tensor(target, dtype=torch.long)
        return {"pixel_values": pixel_values, "labels": labels}


def compute_metrics(eval_pred):
    """Compute accuracy from a ``(predictions, labels)`` evaluation pair.

    The predicted class for each row is the index of its largest score along
    axis 1. Returns a dict with the single key ``"accuracy"``.
    """
    scores, labels = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    accuracy = accuracy_score(labels, predicted_classes)
    return {"accuracy": accuracy}


# One checkpoint name is shared by the image processor and the classifier.
model_name = "google/vit-base-patch16-224-in21k"
processor = ViTImageProcessor.from_pretrained(model_name)
vit_model = ViTForImageClassification.from_pretrained(
    model_name,
    ignore_mismatched_sizes=True,
    num_labels=10,
)

print(f"ViT parameters: {sum(p.numel() for p in vit_model.parameters()):,}")

# Keep the run short: first 2000 training images, first 500 test images.
train_subset = torch.utils.data.Subset(train_dataset, range(2000))
test_subset = torch.utils.data.Subset(test_dataset, range(500))

# Wrap both subsets so they yield the dict samples built by ViTDataset.
train_vit_dataset = ViTDataset(train_subset, processor)
test_vit_dataset = ViTDataset(test_subset, processor)

training_args = TrainingArguments(
    output_dir="./vit_results",
    learning_rate=5e-5,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

trainer = Trainer(
    model=vit_model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_vit_dataset,
    eval_dataset=test_vit_dataset,
)

# Fine-tune, then evaluate on the held-out subset and report its accuracy.
trainer.train()
eval_result = trainer.evaluate()
vit_acc = eval_result["eval_accuracy"]
print(f"ViT Test Accuracy: {vit_acc:.3f}")

## Image Generation with SDXL-Turbo

In [None]:
from diffusers import DiffusionPipeline

# Half precision only works on an accelerator: diffusers pipelines loaded in
# float16 cannot run on the CPU. Fall back to float32 when `device` is not
# CUDA so the notebook's CPU fallback still works.
pipe_dtype = torch.float16 if device.type == "cuda" else torch.float32

# variant="fp16" selects the half-precision weight files to load, while
# torch_dtype sets the dtype the loaded weights are converted to.
pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/sdxl-turbo", torch_dtype=pipe_dtype, variant="fp16"
)
pipe = pipe.to(device)

In [None]:
# Text prompt for the generation below.
prompt = "A cute robot cat, digital art"

# Run the pipeline with 4 inference steps and guidance disabled (scale 0.0),
# then keep the first image of the returned batch.
generation = pipe(prompt=prompt, guidance_scale=0.0, num_inference_steps=4)
image = generation.images[0]

In [None]:
# Bare expression as the last line of the cell, so the notebook renders the
# generated image inline.
image