In [None]:
import os
import torch
from torchvision import datasets, transforms
from transformers import ViTForImageClassification, ViTFeatureExtractor
from torch.utils.data import DataLoader
from torchvision.transforms import InterpolationMode
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
from torch.ao.quantization import quantize_dynamic

Collecting numpy<2
  Downloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m103.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.2.0
    Uninstalling numpy-2.2.0:
      Successfully uninstalled numpy-2.2.0
Successfully installed numpy-1.26.4


In [None]:
# Set device to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
# Load the Pretrained Model for Baseline Training
model = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224",
    num_labels=101,
    ignore_mismatched_sizes=True,
).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([101]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([101, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Optimized Data Transform Pipeline
transform = transforms.Compose([
    transforms.Resize((224, 224), interpolation=InterpolationMode.BICUBIC),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

In [None]:
# Optimized Data Transform Pipeline
transform = transforms.Compose([
    transforms.Resize((224, 224), interpolation=InterpolationMode.BICUBIC),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# Load Food101 Dataset
train_dataset = datasets.Food101(root='./data', split='train', transform=transform, download=True)
val_dataset = datasets.Food101(root='./data', split='test', transform=transform, download=True)
test_dataset = datasets.Food101(root='./data', split='test', transform=transform, download=True)

train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=torch.multiprocessing.cpu_count(),
    pin_memory=True
)

val_loader = DataLoader(
    val_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=torch.multiprocessing.cpu_count(),
    pin_memory=True
)

test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=torch.multiprocessing.cpu_count(),
    pin_memory=True
)

In [None]:
# Fine-tune the model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()

In [None]:
# Training Loop with Checkpoint Saving
def train_model(model, dataloader, optimizer, criterion, device, epochs=5, save_dir="model_checkpoints"):
    model.train()
    os.makedirs(save_dir, exist_ok=True)
    for epoch in range(epochs):
        total_loss, correct, total = 0, 0, 0
        with tqdm(dataloader, unit="batch") as tepoch:
            for images, labels in tepoch:
                images, labels = images.to(device, non_blocking=True), labels.to(device, non_blocking=True)
                optimizer.zero_grad()
                outputs = model(images).logits
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                preds = torch.argmax(outputs, dim=1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)

                tepoch.set_postfix(loss=total_loss/total, accuracy=100 * correct/total)

        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataloader):.4f}, Accuracy: {100 * correct/total:.2f}%")

        # Save the model checkpoint after each epoch
        model.save_pretrained(os.path.join(save_dir, f"checkpoint_epoch_{epoch+1}"))


In [None]:
# Train the Baseline Model for 5 (or more) Epochs
train_model(model, train_loader, optimizer, criterion, device)

100%|██████████| 2368/2368 [08:44<00:00,  4.52batch/s, accuracy=65, loss=0.06]


Epoch 1/5, Loss: 1.9197, Accuracy: 64.97%


100%|██████████| 2368/2368 [08:42<00:00,  4.53batch/s, accuracy=85.4, loss=0.0201]


Epoch 2/5, Loss: 0.6414, Accuracy: 85.37%


100%|██████████| 2368/2368 [08:42<00:00,  4.53batch/s, accuracy=91.2, loss=0.0119]


Epoch 3/5, Loss: 0.3805, Accuracy: 91.19%


100%|██████████| 2368/2368 [08:42<00:00,  4.53batch/s, accuracy=95.1, loss=0.00706]


Epoch 4/5, Loss: 0.2258, Accuracy: 95.07%


100%|██████████| 2368/2368 [08:42<00:00,  4.53batch/s, accuracy=97.5, loss=0.004]


Epoch 5/5, Loss: 0.1278, Accuracy: 97.52%


In [None]:
# Save Model Size Function
def save_model_size(model, path):
    torch.save(model.state_dict(), path)
    size_in_mb = os.path.getsize(path) / (1024 * 1024)
    print(f"Model Size: {size_in_mb:.2f} MB")
    return size_in_mb

In [None]:
model = ViTForImageClassification.from_pretrained("model_checkpoints/checkpoint_epoch_5")
model.to(device)

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fe

In [None]:
# Evaluation Function
def evaluate_model(model, dataloader, device):
    correct, total = 0, 0
    all_preds, all_labels = [], []

    model.eval()
    with torch.no_grad():
        for images, labels in tqdm(dataloader):
            images, labels = images.to(device, non_blocking=True), labels.to(device, non_blocking=True)
            outputs = model(images).logits
            preds = torch.argmax(outputs, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')

    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")

    return accuracy, precision, recall, f1

In [None]:
# save the accuray and other results above in a list
baseline_results = evaluate_model(model, test_loader, device)

100%|██████████| 790/790 [00:57<00:00, 13.72it/s]

Accuracy:  0.8764
Precision: 0.8779
Recall:    0.8764
F1 Score:  0.8764





In [None]:
print(baseline_results)

(0.8763960396039604, 0.8778630341015732, 0.8763960396039604, 0.8764282967683963)


# Post Training Quantization

In [None]:
!pip uninstall bitsandbytes -y
!pip install -U bitsandbytes

Found existing installation: bitsandbytes 0.45.0
Uninstalling bitsandbytes-0.45.0:
  Successfully uninstalled bitsandbytes-0.45.0
Collecting bitsandbytes
  Using cached bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Using cached bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.0


In [None]:
from transformers import BitsAndBytesConfig, ViTForImageClassification
import bitsandbytes as bnb

In [None]:
import bitsandbytes as bnb
print(bnb.__version__)

0.45.0


In [None]:
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,  # Enable 8-bit quantization
    llm_int8_threshold=6.0,  # Ensure proper layer quantization
    llm_int8_enable_fp32_cpu_offload=True,  # Offload FP32 layers to CPU
    llm_int8_skip_modules=["LayerNorm", "Embeddings"]  # Keep critical layers in FP32
)

In [None]:
# Load Fine-Tuned Model and Apply PTQ
ptq_model = ViTForImageClassification.from_pretrained(
    "model_checkpoints/checkpoint_epoch_5",
    quantization_config=quantization_config,
    device_map="auto",
    local_files_only=True,
)


# Quantized Layers (INT8) PTQ

    Fully Connected Layers:
        torch.nn.Linear layers within:
            Vision Transformer Encoder Layers
            Feed-Forward Network (FFN) Layers
        These layers are the most memory-intensive, making them ideal for INT8 quantization.

    Attention Mechanism Layers:
        Self-Attention Projections: Query, Key, and Value projections are quantized.
        Attention Output Layers: Quantization of attention heads reduces memory use significantly.

In [None]:
# Save Model Size
original_model_size = save_model_size(model, "original_model_size")
mixed_precision_size = save_model_size(ptq_model, "mixed_precision_model_size")

# Evaluate the Quantized Model
original_metrics = evaluate_model(model, test_loader, device)
mixed_precision_metrics = evaluate_model(ptq_model, test_loader, device)

Model Size: 327.67 MB
Model Size: 83.17 MB


100%|██████████| 790/790 [00:57<00:00, 13.78it/s]


Accuracy:  0.8764
Precision: 0.8779
Recall:    0.8764
F1 Score:  0.8764


100%|██████████| 790/790 [00:42<00:00, 18.74it/s]

Accuracy:  0.7997
Precision: 0.8174
Recall:    0.7997
F1 Score:  0.7994





In [None]:
# Summary Comparison
print("\n=== Model Summary ===")
print(f"Post-Training Quantized Metrics: Accuracy={mixed_precision_metrics[0]:.4f}, Precision={mixed_precision_metrics[1]:.4f}, Recall={mixed_precision_metrics[2]:.4f}, F1-Score={mixed_precision_metrics[3]:.4f}")



=== Model Summary ===
Post-Training Quantized Metrics: Accuracy=0.7997, Precision=0.8174, Recall=0.7997, F1-Score=0.7994
