In [8]:
import torch
import torch.utils.data as data
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, datasets
from sklearn.metrics import precision_score, recall_score, confusion_matrix

In [9]:
class PatchEmbedding(nn.Module):
    def __init__(self, img_size, patch_size, in_channels, embed_dim):
        super(PatchEmbedding, self).__init__()
        self.n_patches = (img_size // patch_size) ** 2
        self.patch_size = patch_size
        self.proj = nn.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        x = self.proj(x)  # (B, embed_dim, n_patches^(1/2), n_patches^(1/2))
        x = x.flatten(2)  # (B, embed_dim, n_patches)
        x = x.transpose(1, 2)  # (B, n_patches, embed_dim)
        return x


In [None]:
class PositionalEmbedding(nn.Module):
    def __init__(self, n_patches, embed_dim):
        super(PositionalEmbedding, self).__init__()
        self.pos_embed = nn.Parameter(torch.randn(1, n_patches, embed_dim))

    def forward(self, x):
        return x + self.pos_embed

In [10]:
class TransformerEncoderLayer(nn.Module):
    def __init__(self, embed_dim, num_heads, mlp_dim, dropout=0.1):
        super().__init__()
        self.layernorm1 = nn.LayerNorm(embed_dim)
        self.attention = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout)
        self.layernorm2 = nn.LayerNorm(embed_dim)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, mlp_dim),
            nn.GELU(),
            nn.Linear(mlp_dim, embed_dim),
            nn.Dropout(dropout)
        )

    def forward(self, src):
        src2 = self.layernorm1(src)
        src2, _ = self.attention(src2, src2, src2)
        src = src + src2
        src2 = self.layernorm2(src)
        src2 = self.mlp(src2)
        src = src + src2
        return src

In [11]:
class VisionTransformer(nn.Module):
    def __init__(self, img_size=224, patch_size=16, in_channels=3, num_classes=1000, embed_dim=768, depth=12, num_heads=12, mlp_dim=3072, dropout=0.1):
        super().__init__()
        self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, embed_dim)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, self.patch_embed.num_patches + 1, embed_dim))
        self.dropout = nn.Dropout(dropout)
        self.encoder_layers = nn.ModuleList([
            TransformerEncoderLayer(embed_dim, num_heads, mlp_dim, dropout)
            for _ in range(depth)
        ])
        self.layernorm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        B = x.size(0)
        x = self.patch_embed(x)
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x = x + self.pos_embed
        x = self.dropout(x)

        for layer in self.encoder_layers:
            x = layer(x)

        x = self.layernorm(x)
        cls_token_final = x[:, 0]
        x = self.head(cls_token_final)
        return x
    

In [12]:
def train_vit(model, train_loader, val_loader, epochs=10, lr=0.001):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(train_loader)}")

        model.eval()
        correct = 0
        total = 0
        all_labels = []
        all_preds = []
        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
                all_labels.extend(labels.cpu().numpy())
                all_preds.extend(predicted.cpu().numpy())

        accuracy = 100 * correct / total
        precision = precision_score(all_labels, all_preds, average='weighted')
        recall = recall_score(all_labels, all_preds, average='weighted')
        conf_matrix = confusion_matrix(all_labels, all_preds)

        print(f"Validation Accuracy: {accuracy}%")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print("Confusion Matrix:")
        print(conf_matrix)

        print(f"Total number of samples: {total}")
        print(f"Number of correct predictions: {correct}")
        print(f"First 10 predictions: {all_preds[:10]}")
        print(f"First 10 actual labels: {all_labels[:10]}")

transform = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

In [13]:
epoch = 10
batch_size = 16
lr = 5e-5

train_dataset = datasets.ImageFolder(root="../Dataset/train", transform=transform)
test_dataset = datasets.ImageFolder(root="../Dataset/test", transform=transform)

# Create data loaders
train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model = VisionTransformer(img_size=224, patch_size=16, in_channels=3, num_classes=len(train_dataset.classes))
train_vit(model, train_loader, val_loader, epochs=epoch, lr=lr)

Epoch [1/10], Loss: 1.8598459859689076


TypeError: ToTensor.__init__() takes 1 positional argument but 2 were given

In [18]:
#Test code
#Parameters
batch_size = 8
lr = 5e-5
seed = 12345

transform = transforms.Compose([
    #transforms.RandomResizedCrop(224),
    transforms.Resize((224, 224)),  # Resize images to 224x224 (for example)
    transforms.ToTensor(),          # Convert PIL image to tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize image
])

train_dataset = datasets.ImageFolder(root="../Dataset/train", transform=transform)
test_dataset = datasets.ImageFolder(root="../Dataset/test", transform=transform)

# Create data loaders
train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

class VisionTransformer(nn.Module):
    def __init__(self, num_classes, image_size=224, patch_size=16, num_channels=3, num_layers=12, hidden_size=768, num_heads=12):
        super(VisionTransformer, self).__init__()
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_patches = (image_size // patch_size) ** 2
        self.patch_embedding = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
        self.positional_embedding = nn.Parameter(torch.randn(1, self.num_patches + 1, hidden_size))
        
        self.transformer = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=hidden_size, nhead=num_heads), num_layers)
        self.fc = nn.Linear(hidden_size, num_classes)
        
    def forward(self, x):
        x = self.patch_embedding(x)
        x = x.flatten(2).transpose(1, 2)
        x = torch.cat((self.positional_embedding.expand(x.size(0), -1, -1), x), dim=1)
        x = self.transformer(x)
        x = x[:, 0]
        x = self.fc(x)
        return x
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = VisionTransformer(num_classes=4)  
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    for batch_idx, (data, targets) in enumerate(train_loader):
        data, targets = data.to(device), targets.to(device)
        
        # Forward pass
        outputs = model(data)
        loss = criterion(outputs, targets)
        
        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if batch_idx % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for data, targets in val_loader:
        data, targets = data.to(device), targets.to(device)
        outputs = model(data)
        _, predicted = torch.max(outputs, 1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()
    
    accuracy = correct / total * 100
    print(f'Accuracy on test set: {accuracy:.2f}%')




Epoch [1/5], Loss: 1.6054
Epoch [2/5], Loss: 1.7272
Epoch [3/5], Loss: 1.3159
Epoch [4/5], Loss: 1.4845
Epoch [5/5], Loss: 1.3597
Accuracy on test set: 25.00%
