<a href="https://colab.research.google.com/github/JingchenYan1/Real-Time-ML/blob/main/Homework_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models
import time

class PatchEmbed(nn.Module):
    def __init__(self,img_size,patch_size,emb_dim):
        super().__init__()
        self.num_patches=(img_size//patch_size)**2
        self.proj=nn.Conv2d(3,emb_dim,patch_size,patch_size)
    def forward(self,x):
        x=self.proj(x)
        x=x.flatten(2)
        return x.transpose(1,2)

class TransformerEncoderBlock(nn.Module):
    def __init__(self,emb_dim,heads,mlp_dim,drop_rate=0.1):
        super().__init__()
        self.norm1=nn.LayerNorm(emb_dim)
        self.attn=nn.MultiheadAttention(emb_dim,heads)
        self.norm2=nn.LayerNorm(emb_dim)
        self.mlp=nn.Sequential(
            nn.Linear(emb_dim,mlp_dim),
            nn.GELU(),
            nn.Dropout(drop_rate),
            nn.Linear(mlp_dim,emb_dim),
            nn.Dropout(drop_rate),
        )
    def forward(self,x):
        x2=x
        x=x.transpose(0,1)
        attn_out,_=self.attn(x,x,x)
        x=(attn_out+x).transpose(0,1)
        x=self.norm1(x)
        x2=x
        x=self.mlp(x)
        x=self.norm2(x+x2)
        return x

class ViT(nn.Module):
    def __init__(self,img_size,patch_size,emb_dim,depth,heads,mlp_dim,n_classes,drop_rate=0.1):
        super().__init__()
        self.patch=PatchEmbed(img_size,patch_size,emb_dim)
        self.cls_token=nn.Parameter(torch.zeros(1,1,emb_dim))
        self.pos_embed=nn.Parameter(torch.zeros(1,self.patch.num_patches+1,emb_dim))
        self.blocks=nn.ModuleList([
            TransformerEncoderBlock(emb_dim,heads,mlp_dim,drop_rate)
            for _ in range(depth)
        ])
        self.norm=nn.LayerNorm(emb_dim)
        self.head=nn.Linear(emb_dim,n_classes)
    def forward(self,x):
        x=self.patch(x)
        b=x.shape[0]
        cls=self.cls_token.expand(b,-1,-1)
        x=torch.cat((cls,x),1)
        x+=self.pos_embed
        for blk in self.blocks:
            x=blk(x)
        x=self.norm(x[:,0])
        return self.head(x)

def train_epoch(model,loader,optim,device):
    model.train()
    total_loss=0
    for xb,yb in loader:
        xb,yb=xb.to(device),yb.to(device)
        out=model(xb)
        loss=nn.CrossEntropyLoss()(out,yb)
        optim.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(),max_norm=1.0)
        optim.step()
        total_loss+=loss.item()*xb.size(0)
    return total_loss/len(loader.dataset)

def test(model,loader,device):
    model.eval()
    correct=0
    with torch.no_grad():
        for xb,yb in loader:
            xb,yb=xb.to(device),yb.to(device)
            out=model(xb)
            correct+=(out.argmax(1)==yb).sum().item()
    return correct/len(loader.dataset)

transform=transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5071,0.4865,0.4409),(0.2673,0.2564,0.2761))
])
trainset=datasets.CIFAR100(root='./data',train=True,download=True,transform=transform)
testset=datasets.CIFAR100(root='./data',train=False,download=True,transform=transform)
trainloader=DataLoader(trainset,batch_size=64,shuffle=True,num_workers=2)
testloader=DataLoader(testset,batch_size=64,shuffle=False,num_workers=2)

device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
configs=[
    {'patch':4,'emb':256,'depth':4,'heads':2,'mlp':512},
    {'patch':4,'emb':512,'depth':8,'heads':4,'mlp':2048},
    {'patch':8,'emb':256,'depth':8,'heads':4,'mlp':1024},
    {'patch':8,'emb':512,'depth':4,'heads':2,'mlp':1024}
]

for cfg in configs:
    model=ViT(32,cfg['patch'],cfg['emb'],cfg['depth'],cfg['heads'],cfg['mlp'],100,drop_rate=0.1).to(device)
    optimizer=optim.Adam(model.parameters(),lr=1e-4,weight_decay=1e-4)
    scheduler=torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,T_max=10)
    for epoch in range(10):
        t0=time.time()
        loss=train_epoch(model,trainloader,optimizer,device)
        t1=time.time()
        scheduler.step()
        print(f"ViT patch={cfg['patch']} emb={cfg['emb']} depth={cfg['depth']} heads={cfg['heads']} Epoch {epoch+1} loss={loss:.4f} time={(t1-t0):.2f}s lr={scheduler.get_last_lr()[0]:.1e}")
        acc=test(model,testloader,device)
        print(f"         val_acc={acc*100:.2f}%")
    params=sum(p.numel() for p in model.parameters())
    print(f"Config {cfg} params: {params}")

resnet=models.resnet18(num_classes=100).to(device)
optimizer=optim.Adam(resnet.parameters(),lr=1e-4,weight_decay=1e-4)
scheduler=torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,T_max=10)
for epoch in range(10):
    t0=time.time()
    loss=train_epoch(resnet,trainloader,optimizer,device)
    t1=time.time()
    scheduler.step()
    print(f"ResNet18 Epoch {epoch+1} loss={loss:.4f} time={(t1-t0):.2f}s lr={scheduler.get_last_lr()[0]:.1e}")
    acc=test(resnet,testloader,device)
    print(f"          val_acc={acc*100:.2f}%")
params=sum(p.numel() for p in resnet.parameters())
print(f"ResNet18 params: {params}")


ViT patch=4 emb=256 depth=4 heads=2 Epoch 1 loss=3.9715 time=24.99s lr=9.8e-05
         val_acc=14.54%
ViT patch=4 emb=256 depth=4 heads=2 Epoch 2 loss=3.4429 time=23.99s lr=9.0e-05
         val_acc=20.32%
ViT patch=4 emb=256 depth=4 heads=2 Epoch 3 loss=3.1359 time=23.94s lr=7.9e-05
         val_acc=25.08%
ViT patch=4 emb=256 depth=4 heads=2 Epoch 4 loss=2.9220 time=23.62s lr=6.5e-05
         val_acc=27.84%
ViT patch=4 emb=256 depth=4 heads=2 Epoch 5 loss=2.7647 time=23.81s lr=5.0e-05
         val_acc=29.63%
ViT patch=4 emb=256 depth=4 heads=2 Epoch 6 loss=2.6469 time=23.88s lr=3.5e-05
         val_acc=31.35%
ViT patch=4 emb=256 depth=4 heads=2 Epoch 7 loss=2.5592 time=24.44s lr=2.1e-05
         val_acc=32.51%
ViT patch=4 emb=256 depth=4 heads=2 Epoch 8 loss=2.4931 time=23.84s lr=9.5e-06
         val_acc=33.30%
ViT patch=4 emb=256 depth=4 heads=2 Epoch 9 loss=2.4502 time=23.85s lr=2.4e-06
         val_acc=33.87%
ViT patch=4 emb=256 depth=4 heads=2 Epoch 10 loss=2.4245 time=23.91s lr=0

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from transformers import SwinForImageClassification, SwinConfig, AutoFeatureExtractor
import time

feature_extractor = AutoFeatureExtractor.from_pretrained('microsoft/swin-tiny-patch4-window7-224')
mean = feature_extractor.image_mean
std = feature_extractor.image_std

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])
trainset = datasets.CIFAR100(root='./data', train=True, download=True, transform=transform)
testset = datasets.CIFAR100(root='./data', train=False, download=True, transform=transform)
trainloader = DataLoader(trainset, batch_size=32, shuffle=True, num_workers=2)
testloader = DataLoader(testset, batch_size=32, shuffle=False, num_workers=2)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
epochs = 5
lr = 2e-5
results = []

# Load pretrained Tiny and Small, replace head, freeze backbone
tiny_ft = SwinForImageClassification.from_pretrained('microsoft/swin-tiny-patch4-window7-224')
tiny_ft.classifier = nn.Linear(tiny_ft.config.hidden_size, 100)
for name, param in tiny_ft.named_parameters():
    if 'classifier' not in name:
        param.requires_grad = False

small_ft = SwinForImageClassification.from_pretrained('microsoft/swin-small-patch4-window7-224')
small_ft.classifier = nn.Linear(small_ft.config.hidden_size, 100)
for name, param in small_ft.named_parameters():
    if 'classifier' not in name:
        param.requires_grad = False

config = SwinConfig.from_pretrained('microsoft/swin-tiny-patch4-window7-224')
config.num_labels = 100
town_scratch = SwinForImageClassification(config)

models = {
    'swin-tiny-ft': tiny_ft,
    'swin-small-ft': small_ft,
    'swin-tiny-scratch': town_scratch
}

for key, model in models.items():
    model.to(device)
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
    print(f"\nStarting fine-tune/train for {key}")
    for epoch in range(epochs):
        t0 = time.time()
        model.train()
        total_loss = 0
        for xb, yb in trainloader:
            xb, yb = xb.to(device), yb.to(device)
            logits = model(xb).logits
            loss = nn.CrossEntropyLoss()(logits, yb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * xb.size(0)
        train_loss = total_loss / len(trainset)
        model.eval()
        correct = 0
        with torch.no_grad():
            for xb, yb in testloader:
                xb, yb = xb.to(device), yb.to(device)
                preds = model(xb).logits.argmax(dim=1)
                correct += (preds == yb).sum().item()
        val_acc = correct / len(testset)
        t1 = time.time()
        print(f"{key} Epoch {epoch+1}/{epochs} train_loss={train_loss:.4f} val_acc={val_acc*100:.2f}% time={(t1-t0):.2f}s")
    # Final metrics
    params = sum(p.numel() for p in model.parameters())
    time_per_epoch = (t1 - t0)
    results.append((key, params, time_per_epoch, val_acc))

print('\nSummary Results:')
for r in results:
    print(r)



Starting fine-tune/train for swin-tiny-ft
swin-tiny-ft Epoch 1/5 train_loss=4.0374 val_acc=46.20% time=280.79s
swin-tiny-ft Epoch 2/5 train_loss=3.0465 val_acc=57.91% time=280.92s
swin-tiny-ft Epoch 3/5 train_loss=2.3707 val_acc=62.31% time=280.72s
swin-tiny-ft Epoch 4/5 train_loss=1.9424 val_acc=64.77% time=281.02s
swin-tiny-ft Epoch 5/5 train_loss=1.6727 val_acc=66.27% time=280.65s

Starting fine-tune/train for swin-small-ft
swin-small-ft Epoch 1/5 train_loss=3.9802 val_acc=51.48% time=489.01s
swin-small-ft Epoch 2/5 train_loss=2.8934 val_acc=62.52% time=489.19s
swin-small-ft Epoch 3/5 train_loss=2.1735 val_acc=66.31% time=488.56s
swin-small-ft Epoch 4/5 train_loss=1.7375 val_acc=68.60% time=488.54s
swin-small-ft Epoch 5/5 train_loss=1.4785 val_acc=70.07% time=488.52s

Starting fine-tune/train for swin-tiny-scratch
swin-tiny-scratch Epoch 1/5 train_loss=3.9567 val_acc=15.88% time=676.44s
swin-tiny-scratch Epoch 2/5 train_loss=3.3520 val_acc=23.39% time=672.35s
swin-tiny-scratch Epoc