<a href="https://colab.research.google.com/github/Jaseelkt007/ML/blob/master/Image%20classification%20using%20Vision%20Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Image classification using a Vision Transformer
### This notebook fine-tunes a pretrained Vision Transformer (ViT) to classify the CIFAR-10 dataset

In [None]:
pip install timm

In [None]:
'''A basic illustration of patch embedding and positional embedding follows. These functionalities are already included in pretrained models such as ViT.
'''
# This illustrative cell sits above the notebook's main import cell, so it
# imports what it needs itself and still works after "Restart Kernel -> Run All".
import math

import torch
import torch.nn as nn


class Patch_embedding(nn.Module):
    """Split an image batch into square patches and project each patch linearly.

    Args:
        img_size: Side length of the (square) input images, in pixels.
        patch_size: Side length of each square patch, in pixels.
        in_channels: Number of image channels.
        embed_size: Size of the embedding vector produced for every patch.

    Attributes:
        num_patches: Patches per image, ``(img_size // patch_size) ** 2``.
        num_pathes: Alias of ``num_patches`` kept for backward compatibility
            with the original (misspelled) attribute name.
    """

    def __init__(self, img_size ,patch_size , in_channels = 3 ,embed_size = 768) -> None:
        super().__init__()
        # Patches form a 2-D grid, so the count is the per-side count squared
        # (the original multiplied by 2 instead).
        self.num_patches = (img_size // patch_size) ** 2
        self.num_pathes = self.num_patches
        self.patch_size = patch_size
        self.proj = nn.Linear(patch_size * patch_size * in_channels, embed_size)

    def forward(self,x):
        """Return patch embeddings of shape (batch, num_patches, embed_size).

        Args:
            x: Tensor of shape (batch, channels, height, width). Rows/columns
               that do not fill a whole patch are dropped by ``unfold``.
        """
        batch, c, h, w = x.shape
        p = self.patch_size
        # Cut non-overlapping p x p windows along height, then width:
        # (batch, c, h // p, w // p, p, p)
        patches = x.unfold(2, p, p).unfold(3, p, p)
        # Bring the patch-grid axes forward so that every flattened vector holds
        # all channels of ONE patch; flattening without this permute would mix
        # pixels from different patches of the same channel.
        patches = patches.permute(0, 2, 3, 1, 4, 5)  # (batch, h // p, w // p, c, p, p)
        patches = patches.reshape(batch, -1, c * p * p)  # (batch, num_patches, c * p * p)
        # Linear projection of every flattened patch to the embedding size
        return self.proj(patches)
# Smoke test: embed a random batch and inspect the resulting shape
patch_embedding = Patch_embedding(224, 16, embed_size=768)
x = torch.randn((64, 3, 224, 224))  # stand-in batch: 64 RGB images of 224 x 224
patch_embeddings = patch_embedding(x)  # one embedding vector per patch
print(patch_embeddings.shape)  # expected layout: (64, num_patches, embedding_dim)

# Positional Encoding
class Positional_encoding(nn.Module):
    """Fixed sinusoidal positional encoding added to a sequence of embeddings.

    For position ``pos`` and even channel index ``2i`` the table holds
    ``sin(pos / 10000^(2i / d))``; the following odd channel holds the matching
    cosine, where ``d`` is ``embedding_size``.

    Args:
        embedding_size: Size of each embedding vector (``d``).
        max_length: Number of positions to precompute.
    """

    def __init__(self, embedding_size , max_length = 5000) -> None:
        super().__init__()

        position = torch.arange(0, max_length, dtype=torch.float).unsqueeze(1)  # (max_length, 1)
        # 1 / 10000^(2i/d) computed as exp(2i * (-ln(10000) / d)).
        # The scaling must happen INSIDE exp(); exponentiating the raw indices
        # first (as the original did) overflows to inf and fills the table with NaN.
        div_term = torch.exp(
            torch.arange(0, embedding_size, 2, dtype=torch.float)
            * (-math.log(10000.0) / embedding_size)
        )  # (ceil(embedding_size / 2),)
        angles = position * div_term  # (max_length, ceil(embedding_size / 2))

        pe = torch.zeros((max_length, embedding_size))
        pe[:, 0::2] = torch.sin(angles)  # even channels
        # For an odd embedding_size there is one fewer odd channel than even ones
        pe[:, 1::2] = torch.cos(angles[:, : embedding_size // 2])  # odd channels

        # Buffer: saved with the module and moved by .to(), but never trained
        self.register_buffer('pe', pe.unsqueeze(0))  # (1, max_length, embedding_size)

    def forward(self, x):
        """Add the encoding of the first ``x.size(1)`` positions to ``x``.

        Args:
            x: Tensor of shape (batch, seq_len, embedding_size).
        """
        return x + self.pe[:, :x.size(1)]

# Add the positional signal to the patch embeddings; the shape should be unchanged
pos = Positional_encoding(768)
pos_embedding = pos(patch_embeddings)
print(pos_embedding.shape)

# The cls token is prepended to the patch embeddings, and then the positional embedding is added

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import math
import timm
from torch.utils.data import Subset
import numpy as np

# Parameters
image_size = 224  # images are resized to 224 x 224 before being fed to the model
patch_size = 16  # the image is divided into 16 x 16 patches
embedding_dim = 768  # 16 * 16 * 3
batch_size = 64
train_fraction = 0.25  # share of the training split used for fine-tuning
seed = 42  # fixes the random subset (NumPy) and PyTorch's global RNG

rng = np.random.default_rng(seed)
torch.manual_seed(seed)

# Resize, convert to tensor, then map every channel from [0, 1] to [-1, 1]
transform = transforms.Compose([
    transforms.Resize((image_size, image_size)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])
train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
test_dataset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

# Only a small subset of the training data is needed for fine-tuning because the domain stays the same
train_size = len(train_dataset)
# Draw the subset indices without replacement from the seeded generator so reruns pick the same images
indices = rng.choice(train_size, size=int(train_size * train_fraction), replace=False)
train_subset = Subset(train_dataset, indices.tolist())

# Batches have shape (batch_size, C, H, W)
train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True, num_workers=2)
# Accuracy does not depend on sample order, so the test split is read in a fixed order
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

# extract the encoder part alone including embedding.
class VIT_Encoder(nn.Module):
    """Wrap the feature-extraction parts of a ViT, leaving out its classification head.

    Args:
        vit_model: Object exposing ``patch_embed``, ``cls_token``, ``pos_embed``,
            ``pos_drop``, ``blocks`` and ``norm`` attributes; they are shared with
            this wrapper, not copied.
    """

    # Pieces borrowed from the wrapped model, in registration order
    _SHARED_PARTS = ('patch_embed', 'cls_token', 'pos_embed', 'pos_drop', 'blocks', 'norm')

    def __init__(self, vit_model ) -> None:
        super().__init__()
        for part in self._SHARED_PARTS:
            setattr(self, part, getattr(vit_model, part))

    def forward(self, x):
        """Return the normalised token sequence with the class token at index 0.

        The original author notes an output of (64, 197, 768) for a batch of 64:
        197 = 14 * 14 patch tokens + 1 class token, 768 = embedding size.
        """
        patch_tokens = self.patch_embed(x)

        # One copy of the class token per sample, placed in front of the patch tokens
        n_samples = patch_tokens.size(0)
        cls_tokens = self.cls_token.expand(n_samples, -1, -1)
        tokens = torch.cat([cls_tokens, patch_tokens], dim=1)

        # Positional embedding followed by its dropout
        tokens = self.pos_drop(tokens + self.pos_embed)

        # Transformer encoder layers, then the final normalisation
        return self.norm(self.blocks(tokens))


# classification head
class MLP_head(nn.Module):
    """Classification head: one linear layer from an embedding to class scores.

    Args:
        embedding_dim: Size of the incoming embedding vector.
        num_classes: Number of output scores.
    """

    def __init__(self,embedding_dim, num_classes):
        super().__init__()
        self.fc_out = nn.Linear(in_features=embedding_dim, out_features=num_classes)

    def forward(self, x):
        """Return the linear layer's output for ``x``."""
        logits = self.fc_out(x)
        return logits



class Final_model(nn.Module):
    """Chain an encoder and a classification head, classifying from the first token.

    Args:
        encoder: Module returning a tensor indexed as (batch, token, feature).
        mlp_head: Module applied to the first token's feature vector.
    """

    def __init__(self, encoder , mlp_head) -> None:
        super().__init__()
        self.encoder = encoder
        self.mlp_head = mlp_head

    def forward(self, x):
        """Encode ``x`` and feed the token at index 0 (the cls token) to the head."""
        token_features = self.encoder(x)  # (batch_size, seq_len, embedding_size)
        return self.mlp_head(token_features[:, 0, :])

# Hyperparameters for fine-tuning
# (batch_size and embedding_dim keep the values set in the parameters section earlier in this cell)
learning_rate = 1e-3
num_epochs = 10


# Load the pre-trained ViT and wrap everything except its classification head
vit_model = timm.create_model('vit_base_patch16_224', pretrained=True)
vit_encoder = VIT_Encoder(vit_model)
# Both names refer to the same wrapper; building a second VIT_Encoder around
# the same vit_model would only duplicate the wrapper object.
encoder = vit_encoder
mlp_head = MLP_head(embedding_dim=embedding_dim, num_classes=10)
model = Final_model(encoder, mlp_head)

# Freeze the encoder weights so that only the MLP head is fine-tuned
for param in model.encoder.parameters():
    param.requires_grad = False

# The optimizer only receives (and therefore only updates) the MLP head's parameters
optimizer = optim.AdamW(model.mlp_head.parameters(), lr=learning_rate, weight_decay=0.01)
criterion = nn.CrossEntropyLoss()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Training: only the MLP head learns; the encoder is frozen
for epoch in range(num_epochs):
    model.train()
    # model.train() also flips the frozen encoder into training mode; switch it
    # back so any train-only layers inside it (e.g. dropout) behave as at inference.
    model.encoder.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Track accuracy and loss
        n_samples = labels.size(0)
        # criterion returns the batch mean; weight it by the batch size so a
        # smaller final batch does not skew the epoch average
        running_loss += loss.item() * n_samples
        predicted = outputs.argmax(dim=1)
        correct += predicted.eq(labels).sum().item()  # number of correct predictions in this batch
        total += n_samples  # samples processed so far

    epoch_loss = running_loss / total  # mean loss per sample
    epoch_acc = 100 * correct / total
    print(f'Epoch : {epoch + 1}/{num_epochs}. , Loss: {epoch_loss:.4f}, Accuracy : {epoch_acc:.2f}%')

Files already downloaded and verified
Files already downloaded and verified


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Epoch : 1/10. , Loss: 0.2079, Accuracy : 94.50%
Epoch : 2/10. , Loss: 0.0642, Accuracy : 98.16%
Epoch : 3/10. , Loss: 0.0446, Accuracy : 98.74%
Epoch : 4/10. , Loss: 0.0341, Accuracy : 99.07%
Epoch : 5/10. , Loss: 0.0268, Accuracy : 99.39%
Epoch : 6/10. , Loss: 0.0219, Accuracy : 99.41%
Epoch : 7/10. , Loss: 0.0181, Accuracy : 99.59%
Epoch : 8/10. , Loss: 0.0156, Accuracy : 99.67%
Epoch : 9/10. , Loss: 0.0128, Accuracy : 99.74%
Epoch : 10/10. , Loss: 0.0112, Accuracy : 99.82%


In [5]:
# Evaluation: accuracy over the whole test loader, without gradient tracking
model.eval()
test_correct, test_total = 0, 0
with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        predicted = outputs.argmax(dim=1)  # highest-scoring class per sample
        test_total += labels.shape[0]
        test_correct += (predicted == labels).sum().item()
test_acc = 100 * test_correct / test_total
print(f'Test Accuracy: {test_acc:.2f}%')

Test Accuracy: 97.09%


In [6]:
import os

from google.colab import drive

# Make Google Drive available under /content/drive (Colab only)
drive.mount('/content/drive')

# Folder on Drive that receives the checkpoint
folder_path = '/content/drive/MyDrive/vit_models'

# Create the folder if needed; exist_ok avoids the separate check-then-create step
os.makedirs(folder_path, exist_ok=True)

# Save only the weights (state_dict), not the pickled model object
model_save_path = os.path.join(folder_path, 'vit_finetuned.pth')
torch.save(model.state_dict(), model_save_path)

print(f"Model saved to {model_save_path}")

Mounted at /content/drive
Model saved to /content/drive/MyDrive/vit_models/vit_finetuned.pth
