<a href="https://colab.research.google.com/github/Lohitasrith01/Image-Classification/blob/main/Vision_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Vision Transformer (ViT) for Image Classification
Use a Vision Transformer to classify images from the Cats and Dogs dataset. You can use a pre-defined ViT model or implement one from scratch.


In [None]:

from zipfile import ZipFile
import os
import torch
from torchvision import datasets, transforms
from torch.utils.data import random_split, DataLoader
from torchvision.datasets import ImageFolder
from PIL import Image
from transformers import ViTForImageClassification, ViTImageProcessor
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as T

# Unpack the uploaded archive into /content; the dataset root used by the
# rest of the notebook is /content/PetImages.
uploaded_path = "/content/PetImages.zip"

with ZipFile(uploaded_path, mode='r') as archive:
    archive.extractall(path="/content")

data_dir = "/content/PetImages"

# Training-time preprocessing: resize to 224x224, convert to a tensor,
# then normalise with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5]),
])

class SafeImageFolder(ImageFolder):
    """ImageFolder variant that substitutes a neighbouring sample when one fails to load.

    If loading the requested sample raises ``OSError`` or ``ValueError``, the
    following indices are tried in turn (wrapping around at the end of the
    dataset) until one loads.
    """

    def __getitem__(self, index):
        """Return the sample at ``index``, or the next one that can be loaded.

        Args:
            index: position of the requested sample.

        Returns:
            Whatever ``ImageFolder.__getitem__`` returns for the first index
            that loads successfully.

        Raises:
            RuntimeError: if every sample in the dataset failed to load.
        """
        total = len(self)
        last_error = None

        # Bounded loop instead of recursion: a long run of unreadable files
        # can no longer exhaust the interpreter's recursion limit, and a
        # dataset with no readable file fails with a clear error.
        for offset in range(total):
            # The first attempt uses the index exactly as given so that an
            # out-of-range index still raises as it does in the parent class.
            candidate = index if offset == 0 else (index + offset) % total
            try:
                return super().__getitem__(candidate)
            except (OSError, ValueError) as err:
                last_error = err

        raise RuntimeError(
            f"No loadable sample found in dataset of size {total}"
        ) from last_error

full_dataset = SafeImageFolder(root=data_dir, transform=transform)

# NOTE: nothing is removed from the dataset here; unreadable files are only
# substituted when they are accessed, so this count includes them.
print(f"Filtered dataset size: {len(full_dataset)} images")

print(f"Loaded {len(full_dataset)} images across {len(full_dataset.classes)} classes: {full_dataset.classes}")

# 70 / 15 / 15 split; the test set takes the remainder so the three sizes
# always sum to the dataset length.
train_size = int(0.7 * len(full_dataset))
val_size = int(0.15 * len(full_dataset))
test_size = len(full_dataset) - train_size - val_size

# Seed the split so train/val/test membership is identical across runs;
# otherwise reported metrics are not comparable between sessions.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = random_split(
    full_dataset,
    [train_size, val_size, test_size],
    generator=split_generator,
)

batch_size = 32
# Only the training loader is shuffled.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)



Filtered dataset size: 25000 images
Loaded 25000 images across 2 classes: ['Cat', 'Dog']


2. Choose to use a pre-defined ViT model or implement it from scratch. You can use a built-in, predefined model for this part.

In [None]:
# Install the Hugging Face `transformers` package (and `timm`).
# NOTE(review): `transformers` is already imported in the first code cell,
# so on a fresh runtime this install must run before that cell. `timm` is
# not referenced in the cells shown here; confirm it is still needed.
!pip install transformers timm



In [None]:
# Preprocessing configuration published with the checkpoint (saved later
# alongside the fine-tuned weights).
image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")

# Index -> class-name mapping taken from the dataset so the model config
# agrees with the label indices used during training.
id2label = dict(enumerate(full_dataset.classes))
label2id = {name: idx for idx, name in id2label.items()}

# Let from_pretrained build the new classification head instead of replacing
# `model.classifier` by hand: that keeps the model config (label count and
# label maps) consistent with the actual head. `ignore_mismatched_sizes` is
# needed because the checkpoint's own head has a different output size.
model = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermed

In [None]:

# Fine-tuning setup: Adam over every model parameter with a small learning
# rate, and cross-entropy on the classifier logits as the training objective.
optimizer = optim.Adam(params=model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

def compute_accuracy(outputs, labels):
    """Return the fraction of predictions in a batch that match ``labels``.

    Args:
        outputs: tensor of per-class scores; the predicted class of each row
            is the index of its largest value along dimension 1.
        labels: tensor of target class indices, one per row of ``outputs``.

    Returns:
        Number of matching predictions divided by ``len(labels)``, as a float.
    """
    predicted = outputs.argmax(dim=1)
    num_correct = predicted.eq(labels).sum().item()
    return num_correct / len(labels)

def train(model, train_loader, val_loader, epochs=3):
    """Fine-tune ``model`` and report training/validation metrics per epoch.

    Relies on the module-level ``device``, ``optimizer`` and ``criterion``
    objects, and on the ``compute_accuracy`` and ``evaluate`` helpers.

    Args:
        model: model whose forward pass returns an object with a ``logits``
            attribute.
        train_loader: iterable of ``(images, labels)`` training batches.
        val_loader: batches passed to ``evaluate`` after every epoch.
        epochs: number of passes over ``train_loader``.
    """
    for epoch in range(epochs):
        # Set train mode at the start of every epoch rather than once, so the
        # loop does not depend on `evaluate` having restored it.
        model.train()

        loss_sum = 0.0
        correct_sum = 0.0
        samples_seen = 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images).logits
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            # Weight each batch by its size: averaging per-batch values would
            # over-weight a final batch that is smaller than the others.
            # Assumes `criterion` returns the batch mean (the default for
            # nn.CrossEntropyLoss).
            batch_n = labels.size(0)
            loss_sum += loss.item() * batch_n
            correct_sum += compute_accuracy(outputs, labels) * batch_n
            samples_seen += batch_n

        avg_loss = loss_sum / samples_seen
        avg_acc = correct_sum / samples_seen

        print(f"Epoch {epoch+1} | Train Loss: {avg_loss:.4f} | Train Acc: {avg_acc:.4f}")

        evaluate(model, val_loader)

def evaluate(model, data_loader):
    """Compute and print the mean loss and accuracy of ``model`` on a loader.

    Relies on the module-level ``device`` and ``criterion`` objects and on
    the ``compute_accuracy`` helper.

    Args:
        model: model whose forward pass returns an object with a ``logits``
            attribute.
        data_loader: iterable of ``(images, labels)`` batches.

    Returns:
        Tuple ``(avg_loss, avg_acc)`` averaged over all samples seen.
    """
    # Remember the caller's mode so evaluating a model that is already in
    # eval mode does not silently switch it back to train mode.
    was_training = model.training
    model.eval()

    loss_sum, correct_sum, samples_seen = 0.0, 0.0, 0

    try:
        with torch.no_grad():
            for images, labels in data_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images).logits
                loss = criterion(outputs, labels)

                # Weight by batch size so a short final batch does not skew
                # the averages. Assumes `criterion` returns the batch mean
                # (the default for nn.CrossEntropyLoss).
                batch_n = labels.size(0)
                loss_sum += loss.item() * batch_n
                correct_sum += compute_accuracy(outputs, labels) * batch_n
                samples_seen += batch_n
    finally:
        model.train(was_training)

    avg_loss = loss_sum / samples_seen
    avg_acc = correct_sum / samples_seen
    print(f"Validation Loss: {avg_loss:.4f} | Validation Accuracy: {avg_acc:.4f}")

    return avg_loss, avg_acc

# Fine-tune for three epochs; validation metrics are printed after each one.
train(model, train_loader, val_loader, epochs=3)




Epoch 1 | Train Loss: 0.0315 | Train Acc: 0.9890
Validation Loss: 0.0152 | Validation Accuracy: 0.9971
Epoch 2 | Train Loss: 0.0035 | Train Acc: 0.9992
Validation Loss: 0.0150 | Validation Accuracy: 0.9963
Epoch 3 | Train Loss: 0.0006 | Train Acc: 0.9999
Validation Loss: 0.0180 | Validation Accuracy: 0.9968


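The model shows no meaningful overfitting despite the very low training loss.
Validation accuracy stays above 99.6% in every epoch, indicating excellent generalization.
The small rise in validation loss in epoch 3 (0.0150 → 0.0180) is negligible at this scale, though it would be worth monitoring if training continued.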

In [None]:
# Persist the fine-tuned weights and the preprocessing configuration.
model_path = "vit_cat_dog_model"

# Move the model to the CPU and put it in eval mode before serialising.
model.cpu().eval()

weights_file = f"{model_path}.pt"
torch.save(model.state_dict(), weights_file)

# The image-processor config is written into the `model_path` directory.
image_processor.save_pretrained(model_path)


['vit_cat_dog_model/preprocessor_config.json']

In [None]:
# Inference-time preprocessing: same steps as the training transform
# (resize to 224x224, convert to tensor, normalise with mean 0.5 / std 0.5).
inference_transform = T.Compose([
    T.Resize(size=(224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.5], std=[0.5]),
])

def predict_image(image_path):
    """Classify a single image file as "Cat" or "Dog".

    Uses the module-level ``model`` and ``inference_transform``.

    Args:
        image_path: path of the image file to classify.

    Returns:
        The predicted class name, ``"Cat"`` or ``"Dog"``.
    """
    # Open via a context manager so the file handle is released once the
    # image has been decoded into an RGB copy.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    # Add a leading batch dimension of size 1.
    input_tensor = inference_transform(image).unsqueeze(0)

    # Run on whichever device the model currently lives on; without this the
    # call fails with a device mismatch whenever the model is still on GPU.
    model_device = next(model.parameters()).device
    input_tensor = input_tensor.to(model_device)

    model.eval()
    with torch.no_grad():
        logits = model(input_tensor).logits
    predicted = logits.argmax(dim=1)

    label_map = {0: "Cat", 1: "Dog"}
    return label_map[predicted.item()]


5. Record a short video (~5 mins) demonstrating how your deployed ViT model works. The video should showcase the model taking image inputs and providing predictions. Explain the key aspects of your implementation and deployment process in the video.
   a. Upload the video to UBbox and create a shared link
   b. Add the link at the end of your ipynb file.

In [None]:
# Smoke test: classify one image from the extracted dataset's Dog folder.
print(predict_image("/content/PetImages/Dog/10000.jpg"))


Dog
