In [1]:
from torchvision import models
from torchvision.transforms import transforms
import torch
import CitiesData

In [2]:
# The inference transforms are available at ViT_B_16_Weights.IMAGENET1K_V1.transforms and perform the following preprocessing operations: Accepts PIL.Image, batched (B, C, H, W) and single (C, H, W) image torch.Tensor objects. 
# The images are resized to resize_size=[256] using interpolation=InterpolationMode.BILINEAR, followed by a central crop of crop_size=[224]. 
# Finally the values are first rescaled to [0.0, 1.0] and then normalized using mean=[0.485, 0.456, 0.406] and std=[0.229, 0.224, 0.225].

#models.ViT_B_16_Weights.IMAGENET1K_SWAG_E2E_V1
vit = models.vit_b_16()



In [3]:
for param in vit.parameters():
    param.requires_grad = False

In [4]:
class ViT(torch.nn.Module):
    def __init__(self, visionTransformer: models.VisionTransformer):
        super(ViT, self).__init__()

        ViTLayers = torch.nn.Sequential(*list(visionTransformer.children())[:-1])
        for param in ViTLayers.parameters():
            param.requires_grad = False

        self.ViT = ViTLayers
        self.linear = torch.nn.Linear(768, 10)
        self.softmax = torch.nn.Softmax(dim = 1)

    def forward(self, x):
        extractedFeature = self.ViT(x)
        probabilities = self.linear(extractedFeature)
        softmax = self.softmax(probabilities)

        return softmax

In [5]:
visionTransformer = ViT(vit)
print(*list(visionTransformer.children())[:-1])

Sequential(
  (0): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
  (1): Encoder(
    (dropout): Dropout(p=0.0, inplace=False)
    (layers): Sequential(
      (encoder_layer_0): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (dropout): Dropout(p=0.0, inplace=False)
        (ln_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.0, inplace=False)
          (3): Linear(in_features=3072, out_features=768, bias=True)
          (4): Dropout(p=0.0, inplace=False)
        )
      )
      (encoder_layer_1): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_attention): MultiheadA

In [6]:
# Loss and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(visionTransformer.parameters(), lr=0.001)


In [8]:
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
trainDataLoader, testDataLoader = CitiesData.getCitiesDataLoader("./Data/", transforms = transform)

In [9]:
print(len(trainDataLoader))
print(len(testDataLoader))
for i in trainDataLoader:
    print(len(i))
    break

1565
174
4


In [10]:
num_epochs = 10
for epoch in range(num_epochs):
    for data in trainDataLoader:
        image, cities, _, _ = data

        optimizer.zero_grad()
        outputs = visionTransformer(image)
        loss = criterion(outputs, cities)
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

AssertionError: Expected (batch_size, seq_length, hidden_dim) got torch.Size([128, 768, 40, 40])