<a href="https://colab.research.google.com/github/NIRMALT04/DND/blob/main/q1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi
import math, time, random, os
from dataclasses import dataclass
from typing import Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Device:', device)

# Fixed seeds for PyTorch's and Python's global RNGs.
torch.manual_seed(42)
random.seed(42)

Thu Oct  2 17:19:11 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   44C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
# CIFAR-10: 50k train, 10k test; images are 32x32x3
IMG_SIZE = 32
NUM_CLASSES = 10

# Per-channel mean / std handed to Normalize in both the train and test pipelines.
NORM_MEAN = (0.4914, 0.4822, 0.4465)
NORM_STD = (0.2470, 0.2435, 0.2616)

# Stronger aug helps ViT. Start simple; then try RandAugment/AutoAugment/CutMix/MixUp later.
train_tf = transforms.Compose([
    transforms.RandomCrop(IMG_SIZE, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
])

# Evaluation pipeline: no augmentation, same normalization as training.
test_tf = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
])

train_ds = datasets.CIFAR10(root='./data', train=True, transform=train_tf, download=True)
test_ds = datasets.CIFAR10(root='./data', train=False, transform=test_tf, download=True)

BATCH_SIZE = 128
# Options common to both loaders.
loader_kwargs = dict(num_workers=2, pin_memory=True)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_ds, batch_size=256, shuffle=False, **loader_kwargs)


100%|██████████| 170M/170M [00:04<00:00, 41.3MB/s]


In [3]:
@dataclass
class ViTConfig:
    """Hyperparameters consumed by the ViT model defined in this notebook."""
    image_size: int = 32       # input side length in pixels; PatchEmbed asserts it is divisible by patch_size
    patch_size: int = 4        # kernel size and stride of the patch-projection convolution
    in_chans: int = 3          # input channels of the patch-projection convolution
    num_classes: int = 10      # output width of the classification head
    dim: int = 256             # token embedding width; the attention module asserts it is divisible by heads
    depth: int = 6             # number of TransformerBlock layers
    heads: int = 8             # attention heads per block
    mlp_ratio: int = 4         # MLP hidden width is dim * mlp_ratio
    attn_dropout: float = 0.0  # dropout applied to the attention weights after softmax
    dropout: float = 0.1       # dropout inside each MLP and after adding the positional embedding

class PatchEmbed(nn.Module):
    """Split an image into non-overlapping square patches and embed each one.

    A convolution whose kernel size and stride both equal ``patch_size``
    projects every patch to a ``dim``-wide vector.
    """

    def __init__(self, image_size, patch_size, in_chans, dim):
        super().__init__()
        assert image_size % patch_size == 0
        patches_per_side = image_size // patch_size
        self.num_patches = patches_per_side * patches_per_side
        self.proj = nn.Conv2d(
            in_channels=in_chans,
            out_channels=dim,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, x):
        """Map ``[B, C, H, W]`` images to ``[B, N, D]`` patch tokens."""
        # conv -> [B, D, H/P, W/P]; flatten spatial dims -> [B, D, N]; swap -> [B, N, D]
        return self.proj(x).flatten(2).transpose(1, 2)

class PreNorm(nn.Module):
    """Apply LayerNorm over the last dimension, then pass the result to ``fn``."""

    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(normalized_shape=dim)
        self.fn = fn

    def forward(self, x):
        normed = self.norm(x)
        return self.fn(normed)

class MLP(nn.Module):
    """Feed-forward block: Linear -> GELU -> Dropout -> Linear -> Dropout."""

    def __init__(self, dim, hidden_dim, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(in_features=dim, out_features=hidden_dim)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=dim)
        # A single Dropout module is reused after both linear stages.
        self.drop = nn.Dropout(p=dropout)

    def forward(self, x):
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))

class MultiHeadSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention over a token sequence.

    One bias-free linear layer produces queries, keys and values together;
    the per-head outputs are concatenated and passed through ``proj``.
    """

    def __init__(self, dim, heads=8, dropout=0.0):
        super().__init__()
        assert dim % heads == 0
        head_dim = dim // heads
        self.heads = heads
        self.scale = head_dim ** -0.5
        self.to_qkv = nn.Linear(dim, 3 * dim, bias=False)
        self.drop = nn.Dropout(dropout)
        self.proj = nn.Linear(dim, dim)

    def forward(self, x):
        """Attend over ``x`` of shape ``[B, N, D]`` and return ``[B, N, D]``."""
        batch, tokens, width = x.shape
        head_dim = width // self.heads

        # [B, N, 3*D] -> [B, N, 3, H, Dh] -> [3, B, H, N, Dh], then split into q / k / v.
        qkv = self.to_qkv(x).reshape(batch, tokens, 3, self.heads, head_dim)
        q, k, v = qkv.permute(2, 0, 3, 1, 4).unbind(dim=0)

        scores = (q @ k.transpose(-2, -1)) * self.scale  # [B, H, N, N]
        weights = self.drop(scores.softmax(dim=-1))
        context = weights @ v                            # [B, H, N, Dh]

        # Put heads back next to their feature slice and merge them: [B, N, D].
        merged = context.transpose(1, 2).reshape(batch, tokens, width)
        return self.proj(merged)

class TransformerBlock(nn.Module):
    """Encoder layer: a residual pre-norm self-attention step, then a residual pre-norm MLP step."""

    def __init__(self, dim, heads, mlp_ratio, attn_dropout=0.0, dropout=0.1):
        super().__init__()
        attention = MultiHeadSelfAttention(dim, heads, attn_dropout)
        self.attn = PreNorm(dim, attention)
        # Hidden width of the feed-forward part is dim * mlp_ratio.
        feed_forward = MLP(dim, dim * mlp_ratio, dropout)
        self.mlp = PreNorm(dim, feed_forward)

    def forward(self, x):
        attended = x + self.attn(x)
        return attended + self.mlp(attended)

class ViT(nn.Module):
    """Vision Transformer classifier.

    Images are embedded as patch tokens, a learnable CLS token is prepended,
    learnable positional embeddings are added, and the sequence runs through
    ``cfg.depth`` transformer blocks. The final-norm CLS token feeds a linear
    classification head.
    """

    def __init__(self, cfg: ViTConfig):
        super().__init__()
        self.cfg = cfg
        self.patch_embed = PatchEmbed(cfg.image_size, cfg.patch_size, cfg.in_chans, cfg.dim)

        # Sequence length is the patch count plus one slot for the CLS token.
        seq_len = 1 + self.patch_embed.num_patches
        self.cls_token = nn.Parameter(torch.zeros(1, 1, cfg.dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, seq_len, cfg.dim))
        self.pos_drop = nn.Dropout(cfg.dropout)

        layers = []
        for _ in range(cfg.depth):
            layers.append(
                TransformerBlock(cfg.dim, cfg.heads, cfg.mlp_ratio, cfg.attn_dropout, cfg.dropout)
            )
        self.blocks = nn.ModuleList(layers)
        self.norm = nn.LayerNorm(cfg.dim)
        self.head = nn.Linear(cfg.dim, cfg.num_classes)

        self._init_weights()

    def _init_weights(self):
        """Re-initialize the embeddings, every Linear and every LayerNorm.

        Only ``nn.Linear`` and ``nn.LayerNorm`` modules are matched below, so
        the patch-projection convolution is not touched here.
        """
        for param in (self.pos_embed, self.cls_token):
            nn.init.trunc_normal_(param, std=0.02)

        for module in self.modules():
            if isinstance(module, nn.LayerNorm):
                nn.init.ones_(module.weight)
                nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Linear):
                nn.init.trunc_normal_(module.weight, std=0.02)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)

    def forward(self, x):
        """Return class logits ``[B, num_classes]`` for a batch of images ``[B, C, H, W]``."""
        batch = x.size(0)
        tokens = self.patch_embed(x)                    # [B, N, D]
        cls = self.cls_token.expand(batch, -1, -1)      # [B, 1, D]
        h = torch.cat((cls, tokens), dim=1)             # [B, 1+N, D]
        h = self.pos_drop(h + self.pos_embed)
        for block in self.blocks:
            h = block(h)
        # Normalize the whole sequence, then keep only the CLS position.
        return self.head(self.norm(h)[:, 0])


In [4]:
def accuracy(logits, y):
    """Return the fraction of rows whose arg-max over dim 1 equals the label, as a Python float."""
    hits = logits.argmax(dim=1).eq(y)
    return hits.float().mean().item()

def evaluate(model, loader):
    """Score ``model`` on every batch of ``loader`` with gradients disabled.

    Puts the model in eval mode (it is not switched back afterwards) and
    returns ``(accuracy, mean_loss)``, both averaged over all samples seen.
    """
    model.eval()
    n_seen = 0
    n_correct = 0
    loss_total = 0.0
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)
            logits = model(x)
            batch_n = x.size(0)
            # cross_entropy returns the batch mean; weight it by batch size so the
            # final division yields a per-sample average even if the last batch is short.
            loss_total += F.cross_entropy(logits, y).item() * batch_n
            n_correct += (logits.argmax(1) == y).sum().item()
            n_seen += batch_n
    return n_correct / n_seen, loss_total / n_seen

def save_ckpt(model, optimizer, epoch, best_acc, path='vit_cifar10.pt'):
    """Write model and optimizer state dicts plus ``epoch`` and ``best_acc`` to ``path``."""
    state = {
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'epoch': epoch,
        'best_acc': best_acc,
    }
    torch.save(state, path)

def count_params(model):
    """Return the total element count over the model's parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


In [5]:
EPOCHS = 60

# Model hyperparameters for this run.
cfg = ViTConfig(
    image_size=IMG_SIZE, patch_size=4, num_classes=NUM_CLASSES,
    dim=256, depth=6, heads=8, mlp_ratio=4,
    attn_dropout=0.0, dropout=0.1,
)

model = ViT(cfg)
model = model.to(device)
print('Params (M):', count_params(model) / 1e6)

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=3e-4,
    betas=(0.9, 0.999),
    weight_decay=0.05,
)

# Cosine LR with warmup: the schedule is per optimizer step, and the first 5% of steps are warmup.
steps_per_epoch = len(train_loader)
total_steps = EPOCHS * steps_per_epoch
warmup_steps = int(0.05 * total_steps)

def lr_schedule(step):
    """Return the LR multiplier for optimizer step ``step``.

    Rises linearly from 0 to 1 over ``warmup_steps``, then follows a half
    cosine from 1 down to 0 at ``total_steps``.
    """
    if step >= warmup_steps:
        decay_steps = max(1, total_steps - warmup_steps)
        progress = (step - warmup_steps) / decay_steps
        return 0.5 * (1 + math.cos(math.pi * progress))
    # max(1, ...) guards the division when there are no warmup steps.
    return step / max(1, warmup_steps)

# Mixed precision is used only on CUDA; elsewhere autocast and the scaler are both disabled.
use_amp = device.type == 'cuda'
# torch.cuda.amp.GradScaler / torch.cuda.amp.autocast are deprecated in favor of
# the device-agnostic torch.amp equivalents used here.
scaler = torch.amp.GradScaler('cuda', enabled=use_amp)
# Read the peak LR from the optimizer so the schedule cannot drift from the configured value.
base_lr = optimizer.defaults['lr']
best_acc = 0.0
global_step = 0

for epoch in range(1, EPOCHS+1):
    model.train()  # evaluate() leaves the model in eval mode, so re-enable training each epoch
    epoch_loss, epoch_acc, n = 0.0, 0.0, 0

    for x, y in train_loader:
        x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)

        # Per-step schedule: set the LR on every param group before the update.
        lr = base_lr * lr_schedule(global_step)
        for pg in optimizer.param_groups:
            pg['lr'] = lr

        optimizer.zero_grad(set_to_none=True)
        with torch.amp.autocast(device_type=device.type, enabled=use_amp):
            logits = model(x)
            loss = F.cross_entropy(logits, y, label_smoothing=0.1)

        # Scale the loss before backward; the scaler handles the optimizer step
        # and then adjusts its scale factor.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Sample-weighted running sums so the epoch averages are per-sample.
        acc = accuracy(logits, y)
        bs = x.size(0)
        epoch_loss += loss.item() * bs
        epoch_acc += acc * bs
        n += bs
        global_step += 1

    tr_loss = epoch_loss / n
    tr_acc  = epoch_acc / n
    te_acc, te_loss = evaluate(model, test_loader)

    # NOTE(review): the checkpoint is selected on the test split, so "Best" is an
    # optimistic estimate; a held-out validation split would avoid that.
    if te_acc > best_acc:
        best_acc = te_acc
        save_ckpt(model, optimizer, epoch, best_acc)

    # Train loss includes label smoothing; test loss (from evaluate) does not.
    print(f'Epoch {epoch:03d} | LR {lr:.6f} | Train Loss {tr_loss:.4f} Acc {tr_acc*100:.2f}% | '
          f'Test Loss {te_loss:.4f} Acc {te_acc*100:.2f}% | Best {best_acc*100:.2f}%')

print('Best Test Acc:', best_acc)


Params (M): 4.766474


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast(enabled=device.type=='cuda'):


Epoch 001 | LR 0.000100 | Train Loss 2.0267 Acc 26.92% | Test Loss 1.7669 Acc 34.82% | Best 34.82%
Epoch 002 | LR 0.000200 | Train Loss 1.8102 Acc 37.75% | Test Loss 1.6180 Acc 41.92% | Best 41.92%
Epoch 003 | LR 0.000300 | Train Loss 1.6218 Acc 47.78% | Test Loss 1.3846 Acc 50.18% | Best 50.18%
Epoch 004 | LR 0.000300 | Train Loss 1.5071 Acc 53.16% | Test Loss 1.2462 Acc 55.98% | Best 55.98%
Epoch 005 | LR 0.000299 | Train Loss 1.4365 Acc 56.64% | Test Loss 1.1785 Acc 59.28% | Best 59.28%
Epoch 006 | LR 0.000298 | Train Loss 1.3866 Acc 59.33% | Test Loss 1.0848 Acc 63.32% | Best 63.32%
Epoch 007 | LR 0.000296 | Train Loss 1.3428 Acc 61.40% | Test Loss 1.1353 Acc 60.77% | Best 63.32%
Epoch 008 | LR 0.000294 | Train Loss 1.3012 Acc 63.19% | Test Loss 1.0248 Acc 64.33% | Best 64.33%
Epoch 009 | LR 0.000292 | Train Loss 1.2712 Acc 64.64% | Test Loss 1.0173 Acc 64.89% | Best 64.89%
Epoch 010 | LR 0.000289 | Train Loss 1.2330 Acc 66.60% | Test Loss 0.9670 Acc 67.10% | Best 67.10%
Epoch 011 