In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10  # Example dataset, replace with your dataset
import matplotlib.pyplot as plt
import numpy as np
import cv2



In [2]:
class VisionTransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, mlp_ratio=4.0, dropout=0.1):
        super(VisionTransformerBlock, self).__init__()

        # Multi-Head Self Attention
        self.self_attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout)

        # Layer Normalization and Dropout
        self.norm1 = nn.LayerNorm(embed_dim)
        self.dropout1 = nn.Dropout(dropout)

        # MLP (Multi-Layer Perceptron)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, int(embed_dim * mlp_ratio)),
            nn.GELU(),
            nn.Linear(int(embed_dim * mlp_ratio), embed_dim),
            nn.Dropout(dropout)
        )

        # Layer Normalization and Dropout
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x):
        # Multi-Head Self Attention
        attn_output, _ = self.self_attn(x, x, x)

        # Residual Connection and Layer Normalization
        x = x + self.dropout1(attn_output)
        x = self.norm1(x)

        # MLP
        mlp_output = self.mlp(x)

        # Residual Connection and Layer Normalization
        x = x + self.dropout2(mlp_output)
        x = self.norm2(x)

        return x

class VisionTransformer(nn.Module):
    def __init__(self, num_classes, img_size, patch_size, embed_dim, num_heads, num_layers, mlp_ratio=4.0, dropout=0.1):
        super(VisionTransformer, self).__init__()
        self.num_classes = num_classes
        self.patch_embed = nn.Conv2d(3, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.pos_embedding = nn.Parameter(torch.randn(1, (img_size // patch_size) ** 2 + 1, embed_dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
        self.transformer_blocks = nn.ModuleList([
            VisionTransformerBlock(embed_dim, num_heads, mlp_ratio, dropout) for _ in range(num_layers)
        ])
        self.pooling = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        B, C, H, W = x.shape
        x = self.patch_embed(x)
        x = x.flatten(2).transpose(1, 2)
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x = x + self.pos_embedding
        for block in self.transformer_blocks:
            x = block(x)
        x = self.pooling(x).squeeze(2)
        x = self.fc(x)
        return x

In [3]:
# Instantiate the ViT-based CNN classifier model
# Example usage: 
model = VisionTransformer(num_classes=5, img_size=224, patch_size=16, embed_dim=256, num_heads=8, num_layers=12)
# Adjust the parameters based on your requirements

# Print the model architecture
print(model)

VisionTransformer(
  (patch_embed): Conv2d(3, 256, kernel_size=(16, 16), stride=(16, 16))
  (transformer_blocks): ModuleList(
    (0-11): 12 x VisionTransformerBlock(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
      )
      (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (mlp): Sequential(
        (0): Linear(in_features=256, out_features=1024, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=1024, out_features=256, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
  )
  (pooling): AdaptiveAvgPool2d(output_size=1)
  (fc): Linear(in_features=256, out_features=5, bias=True)
)


In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision import datasets
from torch.utils.data import DataLoader
from efficientnet_pytorch import EfficientNet
from torch.utils.tensorboard import SummaryWriter

In [9]:
# Assuming you have a dataset in the 'data' directory with train and val subdirectories.
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ]),
}

train_dataset = datasets.ImageFolder('flower/Train', transform=data_transforms['train'])
val_dataset = datasets.ImageFolder('flower/Test', transform=data_transforms['val'])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [10]:
# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Example training loop (adjust as needed)
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    for batch_images, batch_labels in train_loader:
        outputs = model(batch_images)
        loss = criterion(outputs, batch_labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x1 and 256x5)