# Problem 1

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import time

# Run on the CUDA device when PyTorch reports one as available; otherwise use the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# ------------------------------
# Vision Transformer Definition
# ------------------------------

class ViT(nn.Module):
    """Vision Transformer classifier built on ``nn.TransformerEncoder``.

    The input image is tiled into non-overlapping ``patch_size`` x
    ``patch_size`` patches. Each patch is flattened (channel, row, column
    order) and linearly projected to ``embed_dim``. A learnable class token is
    prepended, learnable positional embeddings are added, and the encoder
    output at the class-token position goes through LayerNorm + Linear to
    produce the logits.

    Args:
        img_size: Image side length used to derive ``num_patches``.
        patch_size: Side length of each square patch.
        in_channels: Number of image channels.
        num_classes: Size of the output logit vector.
        embed_dim: Token width used throughout the encoder.
        depth: Number of stacked encoder layers.
        num_heads: Attention heads per encoder layer.
        mlp_dim: Hidden width of each encoder layer's feed-forward network.
        dropout: Dropout probability for the embeddings and encoder layers.
    """

    def __init__(self, img_size=32, patch_size=4, in_channels=3, num_classes=100,
                 embed_dim=256, depth=6, num_heads=8, mlp_dim=512, dropout=0.1):
        super().__init__()

        patches_per_side = img_size // patch_size

        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = patches_per_side * patches_per_side
        self.patch_dim = in_channels * patch_size * patch_size

        # Modules and parameters are created in a fixed order so that seeded
        # initialisation and state_dict keys stay stable.
        self.patch_embed = nn.Linear(self.patch_dim, embed_dim)

        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.randn(1, self.num_patches + 1, embed_dim))
        self.dropout = nn.Dropout(dropout)

        layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=mlp_dim,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(layer, num_layers=depth)

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(embed_dim),
            nn.Linear(embed_dim, num_classes),
        )

    def forward(self, x):
        """Return class logits of shape ``(B, num_classes)`` for a 4-D image batch."""
        # Unpacking four values makes a non-4-D input fail immediately.
        batch, _channels, _height, _width = x.shape
        p = self.patch_size

        # Tile height then width: (B, C, rows, cols, p, p).
        tiles = x.unfold(2, p, p).unfold(3, p, p)
        # Bring the patch grid forward so each patch's (C, p, p) values are
        # adjacent, then flatten to (B, num_patches, patch_dim).
        tiles = tiles.permute(0, 2, 3, 1, 4, 5)
        tokens = tiles.reshape(batch, self.num_patches, -1)

        # Project patches to the embedding width: (B, num_patches, embed_dim).
        tokens = self.patch_embed(tokens)

        # Prepend one class token per sample: (B, num_patches + 1, embed_dim).
        cls = self.cls_token.expand(batch, -1, -1)
        tokens = torch.cat([cls, tokens], dim=1)

        # Learnable positional information, followed by dropout.
        tokens = self.dropout(tokens + self.pos_embed)

        encoded = self.transformer(tokens)

        # Only the class-token position (index 0) feeds the classifier head.
        return self.mlp_head(encoded[:, 0])

# ----------------------
# CIFAR-100 DataLoader
# ----------------------

# Training images are randomly flipped and randomly cropped (32x32 window after
# 4 pixels of padding) before conversion to tensors; test images are only
# converted to tensors.
_train_steps = [
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
]
transform_train = transforms.Compose(_train_steps)
transform_test = transforms.Compose([transforms.ToTensor()])

# Both splits share the same on-disk root and are downloaded if missing.
_cifar_kwargs = dict(root='./data', download=True)
train_dataset = datasets.CIFAR100(train=True, transform=transform_train, **_cifar_kwargs)
test_dataset = datasets.CIFAR100(train=False, transform=transform_test, **_cifar_kwargs)

# Only the training loader shuffles; batch size and worker count are shared.
_loader_kwargs = dict(batch_size=128, num_workers=2)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_kwargs)

# ----------------------
# Training Function
# ----------------------

def train_model(model, train_loader, test_loader, criterion, optimizer, scheduler, num_epochs=20):
    """Train ``model`` and return the best validation accuracy seen in any epoch.

    Each epoch runs one optimisation pass over ``train_loader``, one
    gradient-free evaluation pass over ``test_loader``, then a single
    ``scheduler.step()``, and prints a one-line summary.

    Args:
        model: Module mapping an input batch to per-class logits.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        test_loader: Iterable of ``(inputs, labels)`` evaluation batches.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        scheduler: Learning-rate scheduler, stepped once per epoch.
        num_epochs: Number of epochs to run.

    Returns:
        float: Highest validation accuracy (fraction in ``[0, 1]``) over all
        epochs; ``0.0`` if no epoch ran or ``test_loader`` was always empty.
    """
    # Move batches to wherever the model actually is, so the function does not
    # depend on a module-level ``device`` that might not match the model.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    best_acc = 0.0
    for epoch in range(num_epochs):
        model.train()
        running_loss, running_corrects, total = 0.0, 0, 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(model_device), labels.to(model_device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            preds = outputs.argmax(dim=1)
            # Weight by batch size so the epoch loss is a per-sample average
            # even when the final batch is smaller.
            running_loss += loss.item() * inputs.size(0)
            # Accumulate plain ints: ``loss.item()`` already synchronises each
            # batch, and ints keep the statistics valid when a loader is empty.
            running_corrects += (preds == labels).sum().item()
            total += labels.size(0)

        # An empty loader reports 0.0 instead of dividing by zero.
        train_loss = running_loss / total if total else 0.0
        train_acc = running_corrects / total if total else 0.0

        model.eval()
        val_corrects, val_total = 0, 0
        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(model_device), labels.to(model_device)
                preds = model(inputs).argmax(dim=1)
                val_corrects += (preds == labels).sum().item()
                val_total += labels.size(0)
        val_acc = val_corrects / val_total if val_total else 0.0

        scheduler.step()

        print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss:.4f} Acc: {train_acc:.4f} Val Acc: {val_acc:.4f}")
        best_acc = max(best_acc, val_acc)

    return best_acc

# ----------------------
# Model Init + Training
# ----------------------

# Model configuration for the CIFAR-100 run; ViT arguments not listed here
# keep their defaults.
_vit_kwargs = dict(img_size=32, patch_size=4, embed_dim=256, num_classes=100)
model = ViT(**_vit_kwargs).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.01)
# Multiply the learning rate by 0.5 after every 10 scheduler steps.
scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

# Wall-clock timing around the whole training call.
start_time = time.time()
best_acc = train_model(
    model=model,
    train_loader=train_loader,
    test_loader=test_loader,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    num_epochs=20,
)
end_time = time.time()

print(f"\nBest Validation Accuracy: {best_acc:.4f}")
print(f"Training Time: {(end_time - start_time)/60:.2f} minutes")


Files already downloaded and verified
Files already downloaded and verified
Epoch 1/20 - Train Loss: 4.3646 Acc: 0.0348 Val Acc: 0.0499
Epoch 2/20 - Train Loss: 3.9552 Acc: 0.0845 Val Acc: 0.1067
Epoch 3/20 - Train Loss: 3.7099 Acc: 0.1255 Val Acc: 0.1503
Epoch 4/20 - Train Loss: 3.5415 Acc: 0.1526 Val Acc: 0.1815
Epoch 5/20 - Train Loss: 3.3984 Acc: 0.1798 Val Acc: 0.1942
Epoch 6/20 - Train Loss: 3.2914 Acc: 0.1986 Val Acc: 0.2144
Epoch 7/20 - Train Loss: 3.2064 Acc: 0.2130 Val Acc: 0.2397
Epoch 8/20 - Train Loss: 3.1180 Acc: 0.2305 Val Acc: 0.2411
Epoch 9/20 - Train Loss: 3.0433 Acc: 0.2458 Val Acc: 0.2649
Epoch 10/20 - Train Loss: 2.9754 Acc: 0.2580 Val Acc: 0.2653
Epoch 11/20 - Train Loss: 2.8116 Acc: 0.2905 Val Acc: 0.3106
Epoch 12/20 - Train Loss: 2.7568 Acc: 0.3002 Val Acc: 0.3132
Epoch 13/20 - Train Loss: 2.7118 Acc: 0.3107 Val Acc: 0.3216
Epoch 14/20 - Train Loss: 2.6653 Acc: 0.3192 Val Acc: 0.3216
Epoch 15/20 - Train Loss: 2.6208 Acc: 0.3276 Val Acc: 0.3282
Epoch 16/20 - Trai

# Problem 2

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from transformers import SwinForImageClassification, SwinConfig, AutoImageProcessor
from transformers import TrainingArguments, Trainer
import time

# Run on the CUDA device when PyTorch reports one as available; otherwise use the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Shared preprocessing for both splits: resize to the 224x224 Swin input size,
# convert to a tensor, then normalize each channel.
# NOTE(review): the mean/std values are presumably CIFAR-100 channel
# statistics; confirm they are the intended ones for a pretrained backbone.
_channel_mean = (0.5071, 0.4865, 0.4409)
_channel_std = (0.2673, 0.2564, 0.2761)
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(_channel_mean, _channel_std),
])

# CIFAR-100 train/test splits, downloaded into ./data if missing.
_swin_cifar_kwargs = dict(root='./data', download=True, transform=transform)
train_dataset = torchvision.datasets.CIFAR100(train=True, **_swin_cifar_kwargs)
test_dataset = torchvision.datasets.CIFAR100(train=False, **_swin_cifar_kwargs)

# Batches of 32 with two worker processes; only the training loader shuffles.
_swin_loader_kwargs = dict(batch_size=32, num_workers=2)
train_loader = DataLoader(train_dataset, shuffle=True, **_swin_loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **_swin_loader_kwargs)

# Helper function to fine-tune model
def fine_tune_model(model_name, num_epochs=2, num_labels=100):
    """Fine-tune the classification head of a pretrained Swin model.

    Loads ``model_name`` with a ``num_labels``-way head, freezes the
    ``model.swin`` backbone, trains the remaining parameters on the
    module-level ``train_loader`` and evaluates once on ``test_loader``.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        num_epochs: Number of training epochs to run (default 2).
        num_labels: Number of output classes for the new head (default 100).

    Returns:
        tuple[float, float]: ``(mean seconds per training epoch, test
        accuracy)``. The mean is ``0.0`` when ``num_epochs`` is 0.
    """
    # The head stored in the checkpoint generally has a different number of
    # outputs than ``num_labels`` (the ImageNet Swin checkpoints are
    # 1000-way). Without ``ignore_mismatched_sizes=True`` loading fails with a
    # size-mismatch error; with it, the mismatched head is freshly initialised.
    model = SwinForImageClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        ignore_mismatched_sizes=True,
    )

    # Freeze the backbone so only the classification head is trained.
    for param in model.swin.parameters():
        param.requires_grad = False

    model = model.to(device)

    # Optimise only the parameters left trainable after freezing.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(trainable_params, lr=2e-5)
    criterion = torch.nn.CrossEntropyLoss()

    def train_epoch():
        """Run one training pass; return (mean batch loss, accuracy)."""
        model.train()
        total_loss = 0.0
        correct = 0
        total = 0
        num_batches = 0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images).logits
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
            num_batches += 1

        # Guard against an empty loader instead of dividing by zero.
        mean_loss = total_loss / num_batches if num_batches else 0.0
        accuracy = correct / total if total else 0.0
        return mean_loss, accuracy

    def evaluate():
        """Return accuracy on ``test_loader`` without tracking gradients."""
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for images, labels in test_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images).logits
                preds = outputs.argmax(dim=1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)
        return correct / total if total else 0.0

    # Training loop: only the training pass is timed, not evaluation.
    epoch_times = []
    for epoch in range(num_epochs):
        start = time.time()
        train_loss, train_acc = train_epoch()
        end = time.time()
        epoch_times.append(end - start)
        print(f"Epoch {epoch+1}, Loss: {train_loss:.4f}, Acc: {train_acc:.4f}, Time: {epoch_times[-1]:.2f}s")

    test_acc = evaluate()
    mean_epoch_time = sum(epoch_times) / len(epoch_times) if epoch_times else 0.0
    return mean_epoch_time, test_acc

# Run for both Tiny and Small
# Checkpoint identifiers for the two Swin variants being compared.
SWIN_TINY_CHECKPOINT = "microsoft/swin-tiny-patch4-window7-224"
SWIN_SMALL_CHECKPOINT = "microsoft/swin-small-patch4-window7-224"

# Each call returns (mean seconds per training epoch, test accuracy).
tiny_time, tiny_acc = fine_tune_model(SWIN_TINY_CHECKPOINT)
small_time, small_acc = fine_tune_model(SWIN_SMALL_CHECKPOINT)
