In [None]:
# Mount Google Drive into the Colab runtime; the dataset and checkpoint paths
# used by the later cells all live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


Install Required Libraries

In [None]:
!pip install transformers datasets
!pip install torch torchvision


Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

Import Necessary Libraries

In [None]:
import os

import torch
from sklearn.metrics import accuracy_score
from torch import nn, optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from transformers import ViTFeatureExtractor, ViTForImageClassification, ViTImageProcessor


Load the Pre-trained ViT Model

In [None]:
# Hub id of the pre-trained ViT checkpoint that gets fine-tuned.
PRETRAINED_CHECKPOINT = 'google/vit-base-patch16-224'
# Number of output classes for the new classification head.
NUM_CLASSES = 251

# The checkpoint's classifier has a different output size, so size mismatches
# are tolerated and the classifier weights are freshly initialised.
vit_kwargs = dict(num_labels=NUM_CLASSES, ignore_mismatched_sizes=True)
model = ViTForImageClassification.from_pretrained(PRETRAINED_CHECKPOINT, **vit_kwargs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([251]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([251, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Set Up the Feature Extractor

In [None]:
# Load the preprocessing configuration (normalisation mean/std) that belongs to
# the pre-trained checkpoint. ViTImageProcessor replaces the deprecated
# ViTFeatureExtractor class; the variable keeps its name because later cells
# refer to `feature_extractor`.
feature_extractor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')

# torchvision pipeline applied to every dataset sample: resize to the 224x224
# input, convert to a tensor, then normalise with the checkpoint's own
# statistics.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=feature_extractor.image_mean,
        std=feature_extractor.image_std,
    ),
])


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



Load Your Dataset

In [None]:
# Dataset roots on Google Drive, laid out with one sub-folder per class.
train_dir = '/content/drive/MyDrive/Designproject/Implementation/Dataset/organized_train_set/'
val_dir = '/content/drive/MyDrive/Designproject/Implementation/Dataset/organized_val_set/'

# Load datasets (indexing the Drive folders is the slow part of this cell).
train_dataset = datasets.ImageFolder(train_dir, transform=transform)
val_dataset = datasets.ImageFolder(val_dir, transform=transform)

# ImageFolder derives label indices from the folder names of each root
# independently; if the two roots disagree, validation labels would silently
# point at different classes than the training labels.
if train_dataset.classes != val_dataset.classes:
    raise ValueError(
        "Train and validation folders do not contain the same classes: "
        f"{len(train_dataset.classes)} vs {len(val_dataset.classes)} class folders."
    )

BATCH_SIZE = 128
# Decode and resize images in background worker processes instead of the main
# process, so data loading overlaps with the forward/backward pass.
NUM_WORKERS = min(4, os.cpu_count() or 1)

loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=torch.cuda.is_available(),  # faster host-to-GPU copies when a GPU is present
)

# Create DataLoaders: shuffle only the training data.
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)


Training Setup

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to its device *before* building the optimizer, so the
# optimizer is constructed over the parameters that are actually trained.
# Assigning the result also avoids echoing the whole module tree as cell output.
model = model.to(device)

# Define loss function and optimizer.
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)


ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fe

Fine-tune the Model

In [None]:
import time

import torch
from tqdm.auto import tqdm


def train_one_epoch(model, loader, criterion, optimizer, scaler, device, use_amp):
    """Run one optimisation pass over ``loader``.

    Args:
        model: Classifier whose forward output exposes ``.logits``.
        loader: Iterable yielding ``(images, labels)`` batches.
        criterion: Loss taking ``(logits, labels)``.
        optimizer: Optimizer over the model's parameters.
        scaler: ``torch.amp.GradScaler`` used for the backward pass and step.
        device: Device the batches are moved to.
        use_amp: Whether to run the forward pass under autocast.

    Returns:
        ``(mean_loss, accuracy)`` over all samples seen in the epoch.
    """
    model.train()
    loss_sum = 0.0
    correct = 0
    total = 0

    for images, labels in tqdm(loader, total=len(loader), desc="train"):
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()  # Reset gradients

        # Forward pass, in mixed precision when enabled.
        with torch.amp.autocast(device_type=device.type, enabled=use_amp):
            logits = model(images).logits
            loss = criterion(logits, labels)

        # Scale the loss, backpropagate, then step through the scaler.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        batch_size = labels.size(0)
        # Weight the batch-mean loss by batch size so a smaller final batch
        # does not skew the epoch average.
        loss_sum += loss.item() * batch_size
        correct += (logits.argmax(dim=1) == labels).sum().item()
        total += batch_size

    # max(..., 1) guards against an empty loader.
    return loss_sum / max(total, 1), correct / max(total, 1)


def evaluate(model, loader, device):
    """Return the top-1 accuracy of ``model`` over ``loader`` (no gradients)."""
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            logits = model(images).logits
            correct += (logits.argmax(dim=1) == labels).sum().item()
            total += labels.size(0)
    return correct / max(total, 1)


# Mixed precision is only enabled on CUDA; on CPU the scaler and autocast are
# created disabled and act as pass-throughs. torch.amp replaces the deprecated
# torch.cuda.amp entry points.
use_amp = device.type == "cuda"
scaler = torch.amp.GradScaler("cuda", enabled=use_amp)

# Training loop
epochs = 5
for epoch in range(epochs):
    start_time = time.time()  # Start timer for the epoch
    epoch_loss, epoch_acc = train_one_epoch(
        model, train_loader, criterion, optimizer, scaler, device, use_amp
    )
    epoch_time = time.time() - start_time  # Training time only, validation excluded

    print(f'Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.4f}, Time: {epoch_time:.2f}s')

    # Validation after every epoch
    val_acc = evaluate(model, val_loader, device)
    print(f'Validation Accuracy: {val_acc:.4f}')

# Save the fine-tuned model
model.save_pretrained('/content/drive/MyDrive/Designproject/Implementation/ViT_fine_tuned')


  scaler = GradScaler()
  with torch.cuda.amp.autocast():
  1%|          | 6/926 [04:56<12:38:02, 49.44s/it]


KeyboardInterrupt: 

Save the Model

In [None]:
# Save the fine-tuned model together with everything needed to reload it
# without the training session.
SAVE_DIR = '/content/drive/MyDrive/Designproject/Implementation/ViT_fine_tuned_food_recognition'

# Store the class names in the model config so a reloaded model can translate
# a predicted index into a food name without access to the dataset.
model.config.id2label = dict(enumerate(train_dataset.classes))
model.config.label2id = {name: idx for idx, name in enumerate(train_dataset.classes)}

model.save_pretrained(SAVE_DIR)
# model.save_pretrained() writes only the config and weights; the preprocessing
# config has to be saved separately so it can be loaded from the same folder.
feature_extractor.save_pretrained(SAVE_DIR)


Inference on New Images

In [None]:
from PIL import Image

# A fixed sample taken from the training set, so this is a quick sanity check
# rather than an evaluation on unseen data.
image_path = '/content/drive/MyDrive/Designproject/Implementation/Dataset/organized_train_set/apple_pie/train_097374.jpg'

# Open inside a context manager so the file handle is released, and force RGB:
# the training images are loaded as RGB by ImageFolder, so grayscale or RGBA
# files must be converted to match what the model was trained on.
with Image.open(image_path) as img:
    image = img.convert("RGB")

# Preprocess the image into a batch of model inputs on the model's device.
inputs = feature_extractor(images=image, return_tensors="pt").to(device)

# Predict without tracking gradients.
model.eval()
with torch.no_grad():
    logits = model(**inputs).logits
    predicted_class = logits.argmax(-1).item()

# Map the predicted index back to its class folder name.
predicted_class_name = train_dataset.classes[predicted_class]
print(f"Predicted food class: {predicted_class_name}")


Predicted food class: savarin


In [None]:
import os

import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForImageClassification

# Folder written by the "Save the Model" cell.
model_path = '/content/drive/MyDrive/Designproject/Implementation/ViT_fine_tuned_food_recognition'
# Checkpoint the model was fine-tuned from; used as the preprocessing fallback.
BASE_CHECKPOINT = 'google/vit-base-patch16-224'

# Fail early with an actionable message instead of the generic
# "Incorrect path_or_model_id" error raised by from_pretrained().
if not os.path.isdir(model_path):
    raise FileNotFoundError(
        f"No saved model directory at {model_path!r}. "
        "Run the save cell first and make sure Google Drive is mounted."
    )

# model.save_pretrained() alone does not write preprocessor_config.json, so
# fall back to the base checkpoint's preprocessing when the folder lacks it.
has_processor_config = os.path.isfile(os.path.join(model_path, 'preprocessor_config.json'))
processor_source = model_path if has_processor_config else BASE_CHECKPOINT
feature_extractor = AutoImageProcessor.from_pretrained(processor_source)
model = AutoModelForImageClassification.from_pretrained(model_path)

# Move model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Fixed sample image taken from the training set (sanity check only).
image_path = '/content/drive/MyDrive/Designproject/Implementation/Dataset/organized_train_set/apple_pie/train_097374.jpg'
# Close the file after decoding and force RGB to match the training inputs.
with Image.open(image_path) as img:
    image = img.convert("RGB")

# Preprocessing the image
inputs = feature_extractor(images=image, return_tensors="pt").to(device)

# Make predictions
model.eval()
with torch.no_grad():
    logits = model(**inputs).logits
    predicted_class = logits.argmax(-1).item()

# The index -> name mapping lives in the model config, not on the image
# processor (and label2id maps in the opposite direction). If no class names
# were stored at save time this yields the generic "LABEL_<n>" entries.
predicted_class_name = model.config.id2label.get(predicted_class, str(predicted_class))
print(f"Predicted food class: {predicted_class_name}")


OSError: Incorrect path_or_model_id: '/content/drive/MyDrive/Designproject/Implementation/ViT_fine_tuned_food_recognition'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [None]:
# List the files stored in a saved-model folder on Drive.
# NOTE(review): this path contains a Dataset/ segment, while the loading cell's
# model_path points at .../Implementation/ViT_fine_tuned_food_recognition
# (no Dataset/) — confirm which location actually holds the checkpoint.
!ls /content/drive/MyDrive/Designproject/Implementation/Dataset/ViT_fine_tuned_food_recognition

config.json  model.safetensors
