In [1]:
import torch
import torch.nn as nn

import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10

import torch.nn.functional as F
from tqdm import tqdm

import torchvision.models as models

from transformers import ViTForImageClassification, ViTFeatureExtractor

2024-06-11 22:53:54.274182: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-11 22:53:54.274291: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-11 22:53:54.399752: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [63]:
class MultiheadAttentionEinsum(nn.Module):
    def __init__(self, embedding_dim, num_heads):
        super(MultiheadAttentionEinsum, self).__init__()
        self.num_heads = num_heads
        self.head_dim = embedding_dim // num_heads

        self.q_linear = nn.Linear(embedding_dim, embedding_dim)
        self.k_linear = nn.Linear(embedding_dim, embedding_dim)
        self.v_linear = nn.Linear(embedding_dim, embedding_dim)
        self.fc_out = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, query, key, value):
        batch_size = query.size(0)

        # Linear projections
        Q = self.q_linear(query)
        K = self.k_linear(key)
        V = self.v_linear(value)

        # Reshape and split by heads
        Q = Q.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        K = K.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        V = V.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)

        # Scaled dot-product attention
        attention_scores = torch.einsum("bnqd,bnkd->bnqk", [Q, K]) / (self.head_dim ** 0.5)
        attention_probs = F.softmax(attention_scores, dim=-1)

        attended_values = torch.einsum("bnqk,bnvd->bnqd", [attention_probs, V])
        attended_values = attended_values.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.head_dim)

        # Final linear projection
        out = self.fc_out(attended_values)
        return out

In [64]:
class TransformerEncoderLayer(nn.Module):
    def __init__(self, embedding_dim, num_heads):
        super(TransformerEncoderLayer, self).__init__()
        self.multihead_attention = MultiheadAttentionEinsum(embedding_dim=embedding_dim, num_heads=num_heads)
        self.feed_forward = nn.Sequential(
            nn.Linear(embedding_dim, 2048),
            nn.ReLU(),
            nn.Linear(2048, embedding_dim)
        )
        self.layer_norm1 = nn.LayerNorm(embedding_dim)
        self.layer_norm2 = nn.LayerNorm(embedding_dim)

    def forward(self, x):
        residual = x
        x = self.layer_norm1(x)
        x = x.permute(1, 0, 2)  # (seq_len, batch_size, embedding_dim)
        attn_output = self.multihead_attention(x, x, x)[0]  # self-attention
        x = attn_output + residual
        x = x.permute(1, 0, 2)  # (batch_size, seq_len, embedding_dim)

        residual = x
        x = self.layer_norm2(x)
        x = self.feed_forward(x)
        x = x + residual

        return x


In [65]:
class VisionTransformer(nn.Module):
    def __init__(self, num_classes, patch_size, embedding_dim, num_heads, num_layers):
        super(VisionTransformer, self).__init__()
        self.patch_embedding = nn.Conv2d(3, embedding_dim, kernel_size=patch_size, stride=patch_size)
        self.positional_encoding = nn.Parameter(torch.randn(1, 14 * 14 + 1, embedding_dim))
        self.transformer_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads) for _ in range(num_layers)
        ])
        self.fc = nn.Linear(embedding_dim, num_classes)

    def forward(self, x):
        batch_size = x.size(0)
        x = self.patch_embedding(x)
        x = x.flatten(2).transpose(1, 2)
        x = torch.cat((x, self.positional_encoding.repeat(batch_size, 1, 1)), dim=1)
        for layer in self.transformer_layers:
            x = layer(x)
        x = x.mean(dim=1)
        x = self.fc(x)
        return x

In [5]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters
num_epochs = 10
batch_size = 64
learning_rate = 0.001
num_classes = 10
patch_size = 16
embedding_dim = 128
num_heads = 8
num_layers = 3

# CIFAR-10 dataset preprocessing
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Load CIFAR-10 dataset
train_dataset = CIFAR10(root='./data', train=True, download=True, transform=transform)
test_dataset = CIFAR10(root='./data', train=False, download=True, transform=transform)

# Data loaders
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:04<00:00, 34750619.96it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [66]:
# Initialize the model
model = VisionTransformer(num_classes, patch_size, embedding_dim, num_heads, num_layers).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [67]:
# Training loop
total_steps = len(train_loader)
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    train_loader_tqdm = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")
    
    for i, (images, labels) in enumerate(train_loader_tqdm):
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        train_loader_tqdm.set_postfix(loss=loss.item())

    avg_loss = epoch_loss / total_steps
    tqdm.write(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

    # Evaluate the model on the test dataset
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        accuracy = 100 * correct / total
        tqdm.write(f"Accuracy of the model on the test images: {accuracy:.2f}%")

Epoch 1/10: 100%|██████████| 782/782 [02:12<00:00,  5.91batch/s, loss=1.82]


Epoch [1/10], Loss: 2.0522
Accuracy of the model on the test images: 25.05%


Epoch 2/10: 100%|██████████| 782/782 [02:12<00:00,  5.91batch/s, loss=2.09]


Epoch [2/10], Loss: 1.9763
Accuracy of the model on the test images: 28.05%


Epoch 3/10: 100%|██████████| 782/782 [02:12<00:00,  5.89batch/s, loss=1.61]


Epoch [3/10], Loss: 1.8319
Accuracy of the model on the test images: 35.57%


Epoch 4/10: 100%|██████████| 782/782 [02:12<00:00,  5.90batch/s, loss=1.6] 


Epoch [4/10], Loss: 1.6945
Accuracy of the model on the test images: 38.66%


Epoch 5/10: 100%|██████████| 782/782 [02:12<00:00,  5.89batch/s, loss=1.3] 


Epoch [5/10], Loss: 1.6117
Accuracy of the model on the test images: 43.94%


Epoch 6/10: 100%|██████████| 782/782 [02:12<00:00,  5.89batch/s, loss=1.62]


Epoch [6/10], Loss: 1.5398
Accuracy of the model on the test images: 43.03%


Epoch 7/10: 100%|██████████| 782/782 [02:12<00:00,  5.90batch/s, loss=1.32]


Epoch [7/10], Loss: 1.4701
Accuracy of the model on the test images: 45.62%


Epoch 8/10: 100%|██████████| 782/782 [02:12<00:00,  5.90batch/s, loss=2.05]


Epoch [8/10], Loss: 1.4237
Accuracy of the model on the test images: 48.14%


Epoch 9/10: 100%|██████████| 782/782 [02:12<00:00,  5.88batch/s, loss=1.39]


Epoch [9/10], Loss: 1.3854
Accuracy of the model on the test images: 51.47%


Epoch 10/10: 100%|██████████| 782/782 [02:12<00:00,  5.90batch/s, loss=1.58] 


Epoch [10/10], Loss: 1.3446
Accuracy of the model on the test images: 51.09%


In [17]:
def train_model(num_classes, patch_size, embedding_dim, num_heads, num_layers, num_epochs, learning_rate):
    model = VisionTransformer(num_classes, patch_size, embedding_dim, num_heads, num_layers).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        train_loader_tqdm = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")
        
        for i, (images, labels) in enumerate(train_loader_tqdm):
            images = images.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            train_loader_tqdm.set_postfix(loss=loss.item())

        avg_loss = epoch_loss / len(train_loader)
        tqdm.write(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

    # Evaluate the model on the test dataset
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        accuracy = 100 * correct / total
        tqdm.write(f"Accuracy of the model on the test images: {accuracy:.2f}%")
    
    return accuracy

In [18]:
def hyperparameter_tuning():
    num_classes = 10
    num_epochs = 1

    # Define hyperparameters to search
    patch_sizes = [16, 32]
    embedding_dims = [128, 256]
    num_heads_list = [8, 16]
    num_layers_list = [3, 6]
    learning_rates = [0.001, 0.0001]

    best_accuracy = 0
    best_params = {}

    for patch_size in patch_sizes:
        for embedding_dim in embedding_dims:
            for num_heads in num_heads_list:
                for num_layers in num_layers_list:
                    for learning_rate in learning_rates:
                        print(f"Training with patch_size={patch_size}, embedding_dim={embedding_dim}, num_heads={num_heads}, num_layers={num_layers}, learning_rate={learning_rate}")
                        accuracy = train_model(num_classes, patch_size, embedding_dim, num_heads, num_layers, num_epochs, learning_rate)
                        if accuracy > best_accuracy:
                            best_accuracy = accuracy
                            best_params = {
                                'patch_size': patch_size,
                                'embedding_dim': embedding_dim,
                                'num_heads': num_heads,
                                'num_layers': num_layers,
                                'learning_rate': learning_rate
                            }

    print(f"Best accuracy: {best_accuracy:.2f}%")
    print("Best hyperparameters:", best_params)

In [None]:
# Call the hyperparameter tuning function
hyperparameter_tuning()

Training with patch_size=16, embedding_dim=128, num_heads=8, num_layers=3, learning_rate=0.001


Epoch 1/1: 100%|██████████| 782/782 [02:20<00:00,  5.55batch/s, loss=2.09]


Epoch [1/1], Loss: 2.0298
Accuracy of the model on the test images: 27.06%
Training with patch_size=16, embedding_dim=128, num_heads=8, num_layers=3, learning_rate=0.0001


Epoch 1/1: 100%|██████████| 782/782 [02:20<00:00,  5.57batch/s, loss=1.83]


Epoch [1/1], Loss: 1.9831
Accuracy of the model on the test images: 29.37%
Training with patch_size=16, embedding_dim=128, num_heads=8, num_layers=6, learning_rate=0.001


Epoch 1/1: 100%|██████████| 782/782 [03:19<00:00,  3.93batch/s, loss=2.15]


Epoch [1/1], Loss: 2.2929
Accuracy of the model on the test images: 18.00%
Training with patch_size=16, embedding_dim=128, num_heads=8, num_layers=6, learning_rate=0.0001


Epoch 1/1: 100%|██████████| 782/782 [03:20<00:00,  3.91batch/s, loss=1.9] 


Epoch [1/1], Loss: 1.9581
Accuracy of the model on the test images: 35.47%
Training with patch_size=16, embedding_dim=128, num_heads=16, num_layers=3, learning_rate=0.001


Epoch 1/1: 100%|██████████| 782/782 [02:26<00:00,  5.32batch/s, loss=1.97]


Epoch [1/1], Loss: 1.9832
Accuracy of the model on the test images: 29.44%
Training with patch_size=16, embedding_dim=128, num_heads=16, num_layers=3, learning_rate=0.0001


Epoch 1/1: 100%|██████████| 782/782 [02:26<00:00,  5.33batch/s, loss=1.81]


Epoch [1/1], Loss: 1.9752
Accuracy of the model on the test images: 30.89%
Training with patch_size=16, embedding_dim=128, num_heads=16, num_layers=6, learning_rate=0.001


Epoch 1/1: 100%|██████████| 782/782 [03:32<00:00,  3.68batch/s, loss=2.31]


Epoch [1/1], Loss: 2.3218
Accuracy of the model on the test images: 10.00%
Training with patch_size=16, embedding_dim=128, num_heads=16, num_layers=6, learning_rate=0.0001


Epoch 1/1: 100%|██████████| 782/782 [03:33<00:00,  3.67batch/s, loss=1.89]


Epoch [1/1], Loss: 1.9678
Accuracy of the model on the test images: 31.35%
Training with patch_size=16, embedding_dim=256, num_heads=8, num_layers=3, learning_rate=0.001


Epoch 1/1: 100%|██████████| 782/782 [02:59<00:00,  4.37batch/s, loss=2.15]


Epoch [1/1], Loss: 2.2053
Accuracy of the model on the test images: 16.69%
Training with patch_size=16, embedding_dim=256, num_heads=8, num_layers=3, learning_rate=0.0001


Epoch 1/1: 100%|██████████| 782/782 [02:58<00:00,  4.37batch/s, loss=1.93]


Epoch [1/1], Loss: 1.9303
Accuracy of the model on the test images: 34.41%
Training with patch_size=16, embedding_dim=256, num_heads=8, num_layers=6, learning_rate=0.001


Epoch 1/1: 100%|██████████| 782/782 [04:39<00:00,  2.80batch/s, loss=2.33]


Epoch [1/1], Loss: 2.3317
Accuracy of the model on the test images: 10.00%
Training with patch_size=16, embedding_dim=256, num_heads=8, num_layers=6, learning_rate=0.0001


Epoch 1/1: 100%|██████████| 782/782 [04:38<00:00,  2.81batch/s, loss=1.95]


Epoch [1/1], Loss: 1.9000
Accuracy of the model on the test images: 37.98%
Training with patch_size=16, embedding_dim=256, num_heads=16, num_layers=3, learning_rate=0.001


Epoch 1/1: 100%|██████████| 782/782 [03:05<00:00,  4.21batch/s, loss=2.13]


Epoch [1/1], Loss: 2.1561
Accuracy of the model on the test images: 17.67%
Training with patch_size=16, embedding_dim=256, num_heads=16, num_layers=3, learning_rate=0.0001


Epoch 1/1: 100%|██████████| 782/782 [03:06<00:00,  4.20batch/s, loss=1.87]


Epoch [1/1], Loss: 1.9298
Accuracy of the model on the test images: 34.66%
Training with patch_size=16, embedding_dim=256, num_heads=16, num_layers=6, learning_rate=0.001


Epoch 1/1: 100%|██████████| 782/782 [04:51<00:00,  2.68batch/s, loss=2.17]


Epoch [1/1], Loss: 2.2524
Accuracy of the model on the test images: 15.30%
Training with patch_size=16, embedding_dim=256, num_heads=16, num_layers=6, learning_rate=0.0001


Epoch 1/1: 100%|██████████| 782/782 [04:51<00:00,  2.68batch/s, loss=2.01]


Epoch [1/1], Loss: 1.8958
Accuracy of the model on the test images: 35.11%
Training with patch_size=32, embedding_dim=128, num_heads=8, num_layers=3, learning_rate=0.001


Epoch 1/1: 100%|██████████| 782/782 [01:56<00:00,  6.70batch/s, loss=2.16]


Epoch [1/1], Loss: 2.1815
Accuracy of the model on the test images: 17.73%
Training with patch_size=32, embedding_dim=128, num_heads=8, num_layers=3, learning_rate=0.0001


Epoch 1/1: 100%|██████████| 782/782 [01:57<00:00,  6.68batch/s, loss=1.96]


Epoch [1/1], Loss: 2.0865
Accuracy of the model on the test images: 27.84%
Training with patch_size=32, embedding_dim=128, num_heads=8, num_layers=6, learning_rate=0.001


Epoch 1/1: 100%|██████████| 782/782 [02:35<00:00,  5.04batch/s, loss=2.25]


Epoch [1/1], Loss: 2.3211
Accuracy of the model on the test images: 10.00%
Training with patch_size=32, embedding_dim=128, num_heads=8, num_layers=6, learning_rate=0.0001


Epoch 1/1: 100%|██████████| 782/782 [02:35<00:00,  5.03batch/s, loss=1.98]


Epoch [1/1], Loss: 2.0973
Accuracy of the model on the test images: 27.99%
Training with patch_size=32, embedding_dim=128, num_heads=16, num_layers=3, learning_rate=0.001


Epoch 1/1:  79%|███████▉  | 618/782 [01:36<00:25,  6.35batch/s, loss=2.24]

In [2]:
train_transform = transforms.Compose([
    transforms.Resize(224),
#     transforms.CenterCrop(224),
    transforms.RandomRotation(20),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [3]:
test_transform = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [4]:
# Load CIFAR-10 dataset
train_dataset = CIFAR10(root='./data', train=True, download=True, transform=test_transform)
test_dataset = CIFAR10(root='./data', train=False, download=True, transform=test_transform)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:10<00:00, 16139000.11it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [5]:
# Data loaders
batch_size = 64
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [6]:
# Best hyperparameters from tuning
best_params = {
    'patch_size': 16,
    'embedding_dim': 256,
    'num_heads': 8,
    'num_layers': 6,
    'learning_rate': 0.0001
}

In [73]:
def train_model(num_classes, patch_size, embedding_dim, num_heads, num_layers, num_epochs, learning_rate):
    model = VisionTransformer(num_classes, patch_size, embedding_dim, num_heads, num_layers).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        correct_train = 0
        total_train = 0
        train_loader_tqdm = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")
        
        for i, (images, labels) in enumerate(train_loader_tqdm):
            images = images.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total_train += labels.size(0)
            correct_train += (predicted == labels).sum().item()
            train_accuracy = 100 * correct_train / total_train

            train_loader_tqdm.set_postfix(loss=loss.item(), train_accuracy=train_accuracy)
            train_loader_tqdm.refresh()  # Refresh tqdm display

        avg_loss = epoch_loss / len(train_loader)
        train_accuracy = 100 * correct_train / total_train
        tqdm.write(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%")

        # Evaluate the model on the test dataset after each epoch
        model.eval()
        with torch.no_grad():
            correct = 0
            total = 0
            for images, labels in test_loader:
                images = images.to(device)
                labels = labels.to(device)
                outputs = model(images)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

            test_accuracy = 100 * correct / total
            tqdm.write(f"Accuracy of the model on the test images: {test_accuracy:.2f}%")
    
    return test_accuracy

In [None]:
# Train the model with the best hyperparameters for 20 epochs
best_accuracy = train_model(num_classes=10, 
                            patch_size=best_params['patch_size'], 
                            embedding_dim=best_params['embedding_dim'], 
                            num_heads=best_params['num_heads'], 
                            num_layers=best_params['num_layers'], 
                            num_epochs=10, 
                            learning_rate=best_params['learning_rate'])

print(f"Best accuracy with best hyperparameters: {best_accuracy:.2f}%")

Epoch 1/10: 100%|██████████| 782/782 [04:33<00:00,  2.86batch/s, loss=2, train_accuracy=28.8]   


Epoch [1/10], Loss: 1.9179, Train Accuracy: 28.76%
Accuracy of the model on the test images: 36.23%


Epoch 2/10: 100%|██████████| 782/782 [04:34<00:00,  2.84batch/s, loss=1.58, train_accuracy=40.3]


Epoch [2/10], Loss: 1.6610, Train Accuracy: 40.33%
Accuracy of the model on the test images: 46.13%


Epoch 3/10: 100%|██████████| 782/782 [04:33<00:00,  2.86batch/s, loss=1.51, train_accuracy=46.2]


Epoch [3/10], Loss: 1.4999, Train Accuracy: 46.19%
Accuracy of the model on the test images: 47.15%


Epoch 4/10: 100%|██████████| 782/782 [04:34<00:00,  2.85batch/s, loss=1.37, train_accuracy=50.6]


Epoch [4/10], Loss: 1.3850, Train Accuracy: 50.59%
Accuracy of the model on the test images: 50.30%


Epoch 5/10: 100%|██████████| 782/782 [04:33<00:00,  2.86batch/s, loss=1.37, train_accuracy=53.8] 


Epoch [5/10], Loss: 1.2990, Train Accuracy: 53.84%
Accuracy of the model on the test images: 54.67%


Epoch 6/10: 100%|██████████| 782/782 [04:33<00:00,  2.86batch/s, loss=1.43, train_accuracy=56.6] 


Epoch [6/10], Loss: 1.2244, Train Accuracy: 56.64%
Accuracy of the model on the test images: 58.04%


Epoch 7/10: 100%|██████████| 782/782 [04:33<00:00,  2.86batch/s, loss=1.26, train_accuracy=58.8] 


Epoch [7/10], Loss: 1.1742, Train Accuracy: 58.79%
Accuracy of the model on the test images: 59.20%


Epoch 8/10: 100%|██████████| 782/782 [04:33<00:00,  2.86batch/s, loss=0.889, train_accuracy=60.6]


Epoch [8/10], Loss: 1.1208, Train Accuracy: 60.64%
Accuracy of the model on the test images: 60.67%


Epoch 9/10: 100%|██████████| 782/782 [04:33<00:00,  2.86batch/s, loss=0.957, train_accuracy=62.1]


Epoch [9/10], Loss: 1.0827, Train Accuracy: 62.11%
Accuracy of the model on the test images: 61.45%


Epoch 10/10:  57%|█████▋    | 442/782 [02:34<01:59,  2.84batch/s, loss=1.02, train_accuracy=62.8] 

In [8]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [88]:
# Define VGG model
vgg = models.vgg16(pretrained=True)
vgg.classifier[6] = nn.Linear(4096, 10)  # Modify the last layer to match the number of classes
vgg = vgg.to(device)

In [89]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(vgg.parameters(), lr=3e-4)

In [None]:
# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    vgg.train()
    running_loss = 0.0
    correct = 0
    total = 0
    train_loader_tqdm = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")
    
    for i, (inputs, labels) in enumerate(train_loader_tqdm):
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        
        outputs = vgg(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

        train_loader_tqdm.set_postfix(loss=running_loss/(i+1), train_accuracy=(100.0*correct/total))
        train_loader_tqdm.refresh()  # Refresh tqdm display
    # Evaluation on test set
    vgg.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = vgg(inputs)
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

    print(f'Accuracy of the network on the test images: {(100.0*correct/total):.2f}%')

Epoch 1/5: 100%|██████████| 782/782 [07:23<00:00,  1.76batch/s, loss=0.725, train_accuracy=75.1]


Accuracy of the network on the test images: 83.69%


Epoch 2/5: 100%|██████████| 782/782 [07:24<00:00,  1.76batch/s, loss=0.387, train_accuracy=87]  


Accuracy of the network on the test images: 85.16%


Epoch 3/5:  46%|████▌     | 358/782 [03:23<03:59,  1.77batch/s, loss=0.271, train_accuracy=90.9]

In [9]:
# Define ResNet model
resnet = models.resnet18(pretrained=True)
resnet.fc = nn.Linear(resnet.fc.in_features, 10)  # Modify the last layer to match the number of classes
resnet = resnet.to(device)

In [10]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(resnet.parameters(), lr=3e-4)

In [13]:
# Training loop
for epoch in range(5):
    resnet.train()
    running_loss = 0.0
    correct = 0
    total = 0
    train_loader_tqdm = tqdm(train_loader, desc=f"Epoch {epoch+1}/{5}", unit="batch")
    
    for i, (inputs, labels) in enumerate(train_loader_tqdm):
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        
        outputs = resnet(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

        train_loader_tqdm.set_postfix(loss=running_loss/(i+1), train_accuracy=(100.0*correct/total))
        train_loader_tqdm.refresh()  # Refresh tqdm display
        
    # Evaluation on test set
    resnet.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = resnet(inputs)
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

    print(f'Accuracy of the network on the test images: {(100.0*correct/total):.2f}%')

Epoch 1/5: 100%|██████████| 782/782 [02:36<00:00,  5.01batch/s, loss=0.354, train_accuracy=88]  


Accuracy of the network on the test images: 90.71%


Epoch 2/5: 100%|██████████| 782/782 [02:35<00:00,  5.04batch/s, loss=0.162, train_accuracy=94.6]


Accuracy of the network on the test images: 90.74%


Epoch 3/5: 100%|██████████| 782/782 [02:35<00:00,  5.03batch/s, loss=0.109, train_accuracy=96.2] 


Accuracy of the network on the test images: 92.09%


Epoch 4/5: 100%|██████████| 782/782 [02:34<00:00,  5.05batch/s, loss=0.0847, train_accuracy=97.1]


Accuracy of the network on the test images: 91.59%


Epoch 5/5: 100%|██████████| 782/782 [02:35<00:00,  5.03batch/s, loss=0.0631, train_accuracy=97.9]


Accuracy of the network on the test images: 92.07%


In [15]:
# Define ViT model with ignore_mismatched_sizes
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
vit = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224', num_labels=10, ignore_mismatched_sizes=True).to(device)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(vit.parameters(), lr=0.0001)

In [19]:
# Training loop
for epoch in range(5):
    vit.train()
    running_loss = 0.0
    correct = 0
    total = 0
    train_loader_tqdm = tqdm(train_loader, desc=f"Epoch {epoch+1}/{5}", unit="batch")
    
    for i, (inputs, labels) in enumerate(train_loader_tqdm):
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        
        outputs = vit(inputs).logits
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

        train_loader_tqdm.set_postfix(loss=running_loss/(i+1), train_accuracy=(100.0*correct/total))
        train_loader_tqdm.refresh()  # Refresh tqdm display

    # Evaluation on test set
    vit.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = vit(inputs).logits
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

    print(f'Accuracy of the ViT model on the test images: {(100.0*correct/total):.2f}%')

Epoch 1/5: 100%|██████████| 782/782 [16:11<00:00,  1.24s/batch, loss=0.116, train_accuracy=96.6]


Accuracy of the ViT model on the test images: 96.78%


Epoch 2/5:  21%|██▏       | 168/782 [03:30<12:48,  1.25s/batch, loss=0.0295, train_accuracy=99.1]


KeyboardInterrupt: 