# Assignment: Vision Transformers on CIFAR10

In [42]:
#imports
from __future__ import print_function
import os
import random
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.utils as vutils


In [43]:
#loading the dataset
dataset = dset.CIFAR10(root="./data", download=True,
                           transform=transforms.Compose([
                               transforms.Resize(64),
                               transforms.ToTensor(),
                               transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
                           ]))
nc=3

dataloader = torch.utils.data.DataLoader(dataset, batch_size=128,
                                         shuffle=True, num_workers=2)


In [44]:
#checking the availability of cuda devices
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## Tasks:
* try to get the best test Accuracy on Cifar10 using a transformer model
* pre-trained models allowed - see [here](https://docs.pytorch.org/vision/main/models/vision_transformer.html) for list of models in TorchVision
* **hint**: just like with the CNN in Week 5 - wee need to change the classification layer to fit our 10 class CIFAR-10 problem before we can fine-tune it...
* **hint**: Transformers need a lot of compute + memory - use the A100 GPU



In [45]:
import torchvision.models as models

In [50]:
models.list_models()


# vit_b_16

model = models.vit_b_16(weights='DEFAULT').cuda()
model.image_size = 64

In [51]:
model.heads.head = torch.nn.Linear(in_features=768, out_features=10, bias=True) #this is now new and un-trained!

In [52]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

In [53]:
running_loss = 0.0
for epoch in range(10):  # loop over the dataset multiple times
    print("---epoch:",epoch)
    for i, data in enumerate(dataloader, 0):

        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data[0].cuda(),data[1].cuda()

        print(inputs.shape)
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if i % 1000 == 999:    # every 1000 mini-batches...
            #print(running_loss / 1000)
            running_loss = 0.0
print('Finished Training')

---epoch: 0
torch.Size([128, 3, 64, 64])


RuntimeError: The size of tensor a (17) must match the size of tensor b (197) at non-singleton dimension 1

In [None]:
correct = 0
total = 0

model.cuda()

# since we're not training, we don't need to calculate the gradients for our outputs
model.eval()

for data in dataloader:
        images, labels = data[0].cuda(),data[1].cuda()
        # calculate outputs by running images through the network
        outputs = model(images)
        # the class with the highest energy is what we choose as prediction
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')