In [1]:
import sys
import os
sys.path.append('..')

from Utils.Accuracy_measures import topk_accuracy
from Utils.TinyImageNet_loader import get_tinyimagenet_dataloaders
from Utils.Num_parameter import count_parameters
import torchvision.transforms as transforms

import time
import torch


import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, datasets, transforms
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm


In [2]:
# Side length (pixels) every image is resized to, and the mini-batch size
# handed to the data loaders.
image_size, batch_size = 224, 64

# Shared pieces of the pipelines below. The mean/std triples look like the
# usual ImageNet channel statistics -- TODO confirm they suit this dataset.
_target_hw = (image_size, image_size)
_norm_mean = (0.485, 0.456, 0.406)
_norm_std = (0.229, 0.224, 0.225)


def _deterministic_pipeline():
    """Return a fresh resize -> tensor -> normalize pipeline (no augmentation)."""
    return transforms.Compose([
        transforms.Resize(_target_hw),
        transforms.ToTensor(),
        transforms.Normalize(_norm_mean, _norm_std),
    ])


# Training pipeline: random flip, resize, padded random crop and a small random
# rotation are applied before tensor conversion and normalization.
tiny_transform_train = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.Resize(_target_hw),
    transforms.RandomCrop(image_size, padding=5),
    transforms.RandomRotation(10),
    transforms.ToTensor(),
    transforms.Normalize(_norm_mean, _norm_std),
])

# Validation and test images only get the deterministic steps; each split
# receives its own Compose instance.
tiny_transform_val = _deterministic_pipeline()
tiny_transform_test = _deterministic_pipeline()


# Build the three loaders from the project helper using the pipelines above.
_loader_kwargs = dict(
    data_dir='../datasets',
    transform_train=tiny_transform_train,
    transform_val=tiny_transform_val,
    transform_test=tiny_transform_test,
    batch_size=batch_size,
    image_size=image_size,
)
train_loader, val_loader, test_loader = get_tinyimagenet_dataloaders(**_loader_kwargs)




In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')


# Build a ViT-B/16 with randomly initialised weights -- no pre-trained
# checkpoint is loaded. `weights=None` is the current spelling of the
# deprecated `pretrained=False` flag.
model = models.vit_b_16(weights=None)

# Replace the classifier head so it emits one logit per Tiny ImageNet class (200 classes)
num_classes = 200
model.heads.head = nn.Linear(model.heads.head.in_features, num_classes)

# Move the model to the selected device
model = model.to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()

optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.01)

# Learning rate scheduler (stepped once per epoch by train_model).
# NOTE(review): T_max=10 but the run further down uses num_epochs = 20, so the
# cosine curve would bottom out at epoch 10 and then climb back up -- confirm
# this is intended, or set T_max to the number of epochs.
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)

# Define accuracy calculation function
def calculate_accuracy(output, target, topk=(1,)):
    """Compute top-k accuracy (in percent) for each k in ``topk``.

    Args:
        output: 2-D tensor of scores, one row per sample and one column per
            class (top-k is taken along dim 1).
        target: 1-D tensor of ground-truth class indices, one per sample.
        topk: iterable of k values to evaluate.

    Returns:
        A list of Python floats, one per entry of ``topk``, each the percentage
        of samples whose target is among the k highest-scoring classes.
        A k larger than the number of classes is treated as "all classes".
        An empty batch yields 0.0 for every k instead of NaN.
    """
    # Never ask topk() for more entries than there are classes; it would raise.
    maxk = min(max(topk), output.size(1))
    batch_size = target.size(0)

    # Nothing to score: avoid the 0/0 division that would produce NaN.
    if batch_size == 0:
        return [0.0 for _ in topk]

    # Indices of the maxk highest scores, transposed to (maxk, batch) so that
    # row i holds every sample's (i+1)-th best guess.
    _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
    pred = pred.t()

    # Boolean (maxk, batch) matrix: True where a guess equals the target.
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        # A sample is a top-k hit if any of its first k guesses is correct;
        # slicing past maxk simply keeps all available rows.
        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
        res.append((correct_k / batch_size).item() * 100)
    return res  # Returns a list of accuracies

# Training and validation loops
def _run_epoch(model, criterion, loader, desc, optimizer=None):
    """Run one pass over ``loader``: train if ``optimizer`` is given, else evaluate.

    Returns ``(mean_loss, top1, top3, top5)`` with accuracies in percent. All
    four values are averaged over the samples actually yielded by the loader.
    """
    is_train = optimizer is not None
    model.train(is_train)  # train(False) is equivalent to eval()

    loss_sum = 0.0
    acc_sums = [0.0, 0.0, 0.0]
    n_seen = 0

    # Gradients are only tracked while training.
    with torch.set_grad_enabled(is_train):
        for inputs, labels in tqdm(loader, desc=desc):
            inputs = inputs.to(device)
            labels = labels.to(device)

            if is_train:
                optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            if is_train:
                loss.backward()
                optimizer.step()

            # Weight every batch statistic by the batch size so a smaller
            # final batch does not skew the epoch averages.
            n = inputs.size(0)
            loss_sum += loss.item() * n
            batch_accs = calculate_accuracy(outputs, labels, topk=(1, 3, 5))
            for i, acc in enumerate(batch_accs):
                acc_sums[i] += acc * n
            n_seen += n

    # Divide by what was really processed (not len(loader.dataset)), so loaders
    # using drop_last or a sampler are reported correctly; guard the empty case.
    denom = max(n_seen, 1)
    return (loss_sum / denom, acc_sums[0] / denom, acc_sums[1] / denom, acc_sums[2] / denom)


def train_model(model, criterion, optimizer, scheduler, num_epochs, train_loader, val_loader,
                checkpoint_path='best_vit_model.pth'):
    """Train ``model`` for ``num_epochs`` epochs, validating after each one.

    Each epoch runs a training pass, a validation pass, then one
    ``scheduler.step()``. Loss and top-1/3/5 accuracy are printed for both
    passes. Whenever validation top-1 accuracy improves, the model's
    ``state_dict`` is written to ``checkpoint_path``.

    Batches are moved to the module-level ``device``; the model is expected to
    live there already.

    Args:
        model: network to optimise (modified in place).
        criterion: loss callable taking ``(outputs, labels)``.
        optimizer: optimizer stepped once per training batch.
        scheduler: learning-rate scheduler stepped once per epoch.
        num_epochs: number of epochs to run.
        train_loader: iterable of ``(inputs, labels)`` training batches.
        val_loader: iterable of ``(inputs, labels)`` validation batches.
        checkpoint_path: where the best weights are saved.

    Returns:
        None.
    """
    # None means "nothing saved yet": the first epoch always writes a checkpoint,
    # even when its accuracy is 0, so a later load of the file cannot fail.
    best_acc = None

    for epoch in range(num_epochs):
        print(f'\nEpoch {epoch+1}/{num_epochs}')
        print('-' * 30)

        # Training phase
        train_loss, train_acc1, train_acc3, train_acc5 = _run_epoch(
            model, criterion, train_loader, 'Training', optimizer=optimizer)
        print(f'Train Loss: {train_loss:.4f} | Top-1 Acc: {train_acc1:.2f}% | Top-3 Acc: {train_acc3:.2f}% | Top-5 Acc: {train_acc5:.2f}%')

        # Validation phase
        val_loss, val_acc1, val_acc3, val_acc5 = _run_epoch(
            model, criterion, val_loader, 'Validation')
        print(f'Val Loss: {val_loss:.4f} | Top-1 Acc: {val_acc1:.2f}% | Top-3 Acc: {val_acc3:.2f}% | Top-5 Acc: {val_acc5:.2f}%')

        # Adjust learning rate
        scheduler.step()

        # Save the model if it has the best accuracy so far
        if best_acc is None or val_acc1 > best_acc:
            best_acc = val_acc1
            torch.save(model.state_dict(), checkpoint_path)

    reported_best = 0.0 if best_acc is None else best_acc
    print(f'\nBest Validation Top-1 Accuracy: {reported_best:.2f}%')

# Calculate number of parameters
def count_parameters(model):
    """Return the total number of elements across ``model``'s trainable parameters.

    Only parameters with ``requires_grad`` set are counted.

    Note: this definition shadows the ``count_parameters`` imported from
    ``Utils.Num_parameter`` at the top of the notebook.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Run the training process
# Number of passes over the training set; adjust as needed.
num_epochs = 20

# Report the model size before the (long) training run starts.
model_parameters = count_parameters(model)
print(f'\nTotal number of trainable parameters: {model_parameters}')

# Start training with the objects configured above.
train_model(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    num_epochs=num_epochs,
    train_loader=train_loader,
    val_loader=val_loader,
)

# Evaluate on the test set
def evaluate_model(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print top-1/3/5 accuracy.

    The model is switched to eval mode and run without gradient tracking.
    Batches are moved to the module-level ``device``. Accuracies are in
    percent, averaged over the samples the loader actually yields.

    Args:
        model: network to evaluate.
        test_loader: iterable of ``(inputs, labels)`` batches.

    Returns:
        None; results are printed.
    """
    model.eval()
    acc_sums = [0.0, 0.0, 0.0]  # running sums for top-1, top-3, top-5
    n_seen = 0

    with torch.no_grad():
        for inputs, labels in tqdm(test_loader, desc='Testing'):
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)

            # Weight each batch by its size so a short last batch is handled correctly.
            n = inputs.size(0)
            batch_accs = calculate_accuracy(outputs, labels, topk=(1, 3, 5))
            for i, acc in enumerate(batch_accs):
                acc_sums[i] += acc * n
            n_seen += n

    # Normalise by the samples really processed rather than len(dataset): this
    # stays correct with drop_last/samplers and avoids dividing by zero.
    denom = max(n_seen, 1)
    epoch_acc1 = acc_sums[0] / denom
    epoch_acc3 = acc_sums[1] / denom
    epoch_acc5 = acc_sums[2] / denom

    print(f'\nTest Top-1 Accuracy: {epoch_acc1:.2f}%')
    print(f'Test Top-3 Accuracy: {epoch_acc3:.2f}%')
    print(f'Test Top-5 Accuracy: {epoch_acc5:.2f}%')

# Load the best model and evaluate
# Restore the best checkpoint written during training. map_location remaps the
# saved tensors onto the current device, so a checkpoint saved on a GPU can
# still be loaded in a CPU-only session.
model.load_state_dict(torch.load('best_vit_model.pth', map_location=device))
model = model.to(device)
evaluate_model(model, test_loader)


Using device: cuda





Total number of trainable parameters: 85952456

Epoch 1/20
------------------------------


  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
Training:   0%|          | 1/1563 [01:31<39:41:05, 91.46s/it]


KeyboardInterrupt: 