## ViT - Vision Transformer

* $\textbf{Vision Transformer (ViT)}$ model pre-trained on ImageNet-21k (14 million images, 21,843 classes) at resolution 224x224, and fine-tuned on ImageNet 2012 (1 million images, 1,000 classes) at resolution 224x224.

* Fine-tuned on a $\textbf{custom data set}$

* To do: Train model from scratch (model config - adjust patch size, dropout, ..)

### Config

In [1]:
# Number of passes over the training set
EPOCHS = 10
# Batch size used by the training loop
BATCH_SIZE = 1
# Learning rate handed to the Adam optimizer
LEARNING_RATE = 1e-5
# Batch size assumed during evaluation
EVAL_BATCH = 1

### Preprocessing

Uncomment below cell to resize the images to 224x224 to match $\textbf{google/vit-base-patch16-224}$.

We have to specify each directory separately and run the cell below once per directory.

In [None]:
# import os
# from PIL import Image

# path = "./train/sit/"
# save_path = "./train_resized/sit/"

# for root, subdirs, files in os.walk(path):
#     for f in files:
#         if f.endswith('jpeg'):
#             #print(f)
#             im = Image.open(path+f)
#             imResize = im.resize((224,224), Image.ANTIALIAS)
#             imResize.save(save_path + f, 'JPEG', quality=90)
             

### Dataset

In [2]:
import torchvision
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
import numpy as np
from tqdm.auto import tqdm
from transformers import ViTModel
from transformers.modeling_outputs import SequenceClassifierOutput
import torch.nn as nn
import torch.nn.functional as F
from transformers import ViTFeatureExtractor
import torch.nn as nn
import torch
import torch.utils.data as data
from torch.autograd import Variable
import numpy as np
import tqdm


# Data is scarce, so there is no separate validation split for monitoring
# training; only a train folder and a test folder are loaded. Each folder is
# read with ImageFolder and every image is converted with ToTensor().
train_ds, test_ds = (
    torchvision.datasets.ImageFolder(root, transform=ToTensor())
    for root in ('./train_resized', './test_resized')
)

# Notebook output: (number of classes, train samples, test samples)
len(train_ds.classes), len(train_ds), len(test_ds)

  from .autonotebook import tqdm as notebook_tqdm


(4, 732, 87)

### Data augmentation

* To do: cropping, flipping, ..

In [None]:
# data augmentation



### Custom model

* To do: Implement ViT from huggingface - use model.config

In [None]:
# from transformers import ViTConfig, ViTModel

# # Initializing a ViT vit-base-patch16-224 style configuration
# configuration = ViTConfig(patch_size=32)

# # Initializing a model (with random weights) from the vit-base-patch16-224 style configuration
# test_model = ViTModel(configuration)

# # Accessing the model configuration
# configuration = test_model.config

### Adjust $\textbf{google/vit-base-patch16-224}$

We add a dropout and a classification layer to the model to predict the classes (4 classes)

In [3]:
class ViTForImageClassification(nn.Module):
    """Pretrained ViT encoder followed by dropout and a linear classifier.

    The encoder is loaded from the ``google/vit-base-patch16-224`` checkpoint.
    The last-layer hidden state at sequence position 0 (the [CLS] token in
    ViT) is passed through dropout and a linear layer that produces
    ``num_labels`` logits.

    Args:
        num_labels: Number of target classes, i.e. the size of the output
            layer.
    """

    def __init__(self, num_labels=4):
        super().__init__()
        self.vit = ViTModel.from_pretrained('google/vit-base-patch16-224')
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.vit.config.hidden_size, num_labels)
        self.num_labels = num_labels

    def forward(self, pixel_values, labels=None):
        """Classify a batch of images and optionally report the loss.

        Args:
            pixel_values: Preprocessed image batch forwarded unchanged to the
                ViT encoder.
            labels: Optional tensor of class indices. When given, the
                cross-entropy loss against the logits is computed.

        Returns:
            A ``(logits, loss)`` tuple. ``loss`` is ``None`` when ``labels``
            is ``None``; otherwise it is a plain Python float obtained with
            ``loss.item()``. That float is detached from the autograd graph,
            so it is suitable for logging only -- callers that need to
            back-propagate must compute the loss from ``logits`` themselves.
        """
        outputs = self.vit(pixel_values=pixel_values)
        # Keep only the hidden state at sequence position 0 for classification.
        pooled = self.dropout(outputs.last_hidden_state[:, 0])
        logits = self.classifier(pooled)

        if labels is None:
            return logits, None

        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        return logits, loss.item()

In [4]:
# Pick the GPU when one is available, otherwise stay on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Classifier with one output per class found in the training folder
model = ViTForImageClassification(len(train_ds.classes))
# Preprocessor that belongs to the pretrained checkpoint
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
# Cross-entropy loss used by the training loop
loss_func = nn.CrossEntropyLoss()
# Adam over all model parameters (encoder and classification head)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Move the model to the chosen device (a no-op on CPU); assigning the result
# keeps the notebook from echoing the model repr as cell output
model = model.to(device)

Some weights of the model checkpoint at google/vit-base-patch16-224 were not used when initializing ViTModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing ViTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.weight', 'vit.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# The import cell ends with a plain `import tqdm`, which rebinds the name
# `tqdm` to the module (not callable). Import the progress-bar class here so
# `tqdm(...)` below works regardless of that ordering.
from tqdm.auto import tqdm

print("Number of train samples: ", len(train_ds))
print("Number of test samples: ", len(test_ds))
print("Detected Classes are: ", train_ds.class_to_idx)

train_loader = data.DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
# Evaluation has its own batch size and gains nothing from shuffling.
test_loader = data.DataLoader(test_ds, batch_size=EVAL_BATCH, shuffle=False, num_workers=4)

# Train the model
for epoch in tqdm(range(EPOCHS)):
    # Training mode (dropout active); set once per epoch instead of per step.
    model.train()
    for x, y in train_loader:
        # One array per image. Iterating over the batch tensor (instead of
        # np.split with a fixed BATCH_SIZE) also works for the last batch of
        # an epoch, which may hold fewer than BATCH_SIZE samples.
        images = [image.numpy() for image in x]
        # NOTE(review): ToTensor() already scales pixels to [0, 1]; confirm
        # that the installed transformers version does not rescale them a
        # second time inside the feature extractor.
        x = torch.tensor(np.stack(feature_extractor(images)['pixel_values'], axis=0))
        # Send to GPU if available
        x, y = x.to(device), y.to(device)

        optimizer.zero_grad()
        # Pass labels=None: the loss the model returns is a detached float,
        # so the differentiable loss is computed here from the logits.
        output, _ = model(x, None)
        loss = loss_func(output, y)
        loss.backward()
        optimizer.step()
  
  # with torch.no_grad():
  #   model.eval()
  #   # Get the next batch for testing purposes
  #   test = next(iter(test_loader))
  #   test_x = test[0]
  #   # Reshape and get feature matrices as needed
  #   test_x = np.split(np.squeeze(np.array(test_x)), BATCH_SIZE)
  #   for index, array in enumerate(test_x):
  #     test_x[index] = np.squeeze(array)
  #   test_x = torch.tensor(np.stack(feature_extractor(test_x)['pixel_values'], axis=0))
  #   # Send to appropirate computing device
  #   test_x = test_x.to(device)
  #   test_y = test[1].to(device)
  #   # Get output (+ respective class) and compare to target
  #   test_output, loss = model(test_x, test_y)
  #   test_output = test_output.argmax(1)
  #   # Calculate Accuracy
  #   accuracy = (test_output == test_y).sum().item() / BATCH_SIZE
  #   print('Epoch: ', epoch, '| train loss: %.4f' % loss, '| test accuracy: %.2f' % accuracy)

In [None]:
# Per-batch accuracies (fraction of correct predictions in each batch)
acc = []

# Inference mode: without this, dropout stays active from training and the
# predictions become noisy.
model.eval()

# Disable grad
with torch.no_grad():
    for inputs, targets in test_loader:
        # One array per image, in the same layout the training loop feeds to
        # the feature extractor; handles batches of any size.
        images = [image.numpy() for image in inputs]
        inputs = torch.tensor(np.stack(feature_extractor(images)['pixel_values'], axis=0))

        # Send to appropriate computing device
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Generate prediction (the returned loss is not needed here)
        prediction, _ = model(inputs, targets)

        # Predicted class index for every sample in the batch
        predicted_class = prediction.argmax(dim=1)
        # Human-readable class names per sample; ImageFolder.classes is
        # ordered by class index, so it can be indexed directly.
        value_predicted = [test_ds.classes[i] for i in predicted_class.tolist()]
        value_target = [test_ds.classes[i] for i in targets.tolist()]
        # Divide by the real batch length so a smaller final batch is scored
        # correctly.
        accuracy = (predicted_class == targets).sum().item() / len(targets)
        acc.append(accuracy)

        # if step % 10 == 0:     
        #     # Show result
        #     plt.imshow(originalInput)
        #     # plt.xlim(224,0)
        #     # plt.ylim(224,0)
        #     plt.title(f'Prediction: {value_predicted} - Actual target: {value_target}')
        #     plt.show()

In [None]:
# Mean of the accuracies stored in `acc`, expressed as a percentage
sum(acc) / len(acc) * 100

### Results for 10 epochs

* Model 1:  93.10 
* Model 2:  91.95
* Model 3:  94.25
* Model 4:  91.95
* Model 5:  91.95
* Model 6:  93.10
* Model 7:  93.10
* Model 8:  91.95
* Model 9:  91.95
* Model 10: 93.10

Average accuracy: 92.64

### Save model

We save and provide the best model, which reaches an accuracy of 94.25 on the test set.

In [None]:
# save model
# torch.save(model, './models/best_model.pt')

### Load model

In [None]:
# MODEL_PATH = './models/best_model.pt'
# model = torch.load(MODEL_PATH)
# model.eval()