# Q3 Solution

In [1]:
# Set the project root directory
import os
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

In [8]:
import os
import torch
import random
import numpy as np
import yaml

def set_seeds(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

def get_configs(project_root, config_filename):
    with open(os.path.join(project_root, "config", config_filename), 'r') as f:
        config = yaml.safe_load(f)
    return config


import os
import torch
from torch.utils.data import DataLoader
import torchvision.transforms as T
from torchvision.datasets import ImageFolder
import numpy as np
from sklearn.model_selection import train_test_split

def _build_transforms(img_size=224):
    train_tfms = T.Compose([
        T.RandomResizedCrop(img_size, scale=(0.6, 1.0), ratio=(0.75, 1.33)),
        T.RandomHorizontalFlip(),
        T.ColorJitter(0.2, 0.2, 0.2, 0.1),
        T.RandomErasing(p=0.1, scale=(0.02, 0.08)),
        T.ToTensor(),
        T.Normalize([0.485,0.456,0.406], [0.229,0.224,0.225]),
    ])
    val_tfms = T.Compose([
        T.Resize(int(img_size * 1.15)),
        T.CenterCrop(img_size),
        T.ToTensor(),
        T.Normalize([0.485,0.456,0.406], [0.229,0.224,0.225]),
    ])
    return train_tfms, val_tfms

def get_train_val_data_loaders(data_root, img_size, batch_size, val_ratio=0.2, seed=42):
    train_tfms, val_tfms = _build_transforms(img_size)

    full_ds = ImageFolder(root=os.path.join(data_root, "train"),
                          transform=train_tfms)
    labels  = full_ds.targets
    indices = list(range(len(full_ds)))

    train_idx, val_idx = train_test_split(
        indices, test_size=val_ratio, stratify=labels, random_state=seed
    )

    train_ds = torch.utils.data.Subset(full_ds, train_idx)
    val_ds   = torch.utils.data.Subset(full_ds, val_idx)
    # override the transform on the val subset
    val_ds.dataset.transform = val_tfms  # deterministic pipeline for val

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,
                              num_workers=4, pin_memory=True)
    val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False,
                              num_workers=4, pin_memory=True)

    return train_loader, val_loader, full_ds.classes


def get_test_data_loader(data_root, img_size, batch_size):
    _, val_tfms = _build_transforms(img_size)
    test_ds = ImageFolder(root=os.path.join(data_root, "test"),
                          transform=val_tfms)
    return DataLoader(test_ds, batch_size=batch_size,
                      shuffle=False, num_workers=4, pin_memory=True)


import torch

def train_one_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for images, labels in dataloader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * images.size(0)
        _, preds = torch.max(outputs, 1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    epoch_loss = running_loss / total
    epoch_acc = correct / total
    return epoch_loss, epoch_acc

def validate_one_epoch(model, dataloader, criterion, device):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)

            running_loss += loss.item() * images.size(0)
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    epoch_loss = running_loss / total
    epoch_acc = correct / total
    return epoch_loss, epoch_acc

def evaluate_resnet_on_test(model, test_loader, device):
    # compute test accuracy + confusion data
    y_true, y_pred = [], []
    model.eval()
    with torch.no_grad():
        for x, y in test_loader:
            x, y = x.to(device), y.to(device)
            logits = model(x)
            preds  = logits.argmax(dim=1)
            y_true.extend(y.cpu().tolist())
            y_pred.extend(preds.cpu().tolist())

    test_acc = sum(int(t==p) for t,p in zip(y_true, y_pred)) / len(y_true)
    return y_true, y_pred, test_acc

In [10]:
import os
import sys
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import wandb
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

os.environ["CUDA_VISIBLE_DEVICES"] = "3"   # select GPU

# ------------------------------------------------------------------
# 2) MODEL BUILDING (freeze/unfreeze)
# ----------------------------------------------------------------
def build_finetune_resnet50(num_classes, freeze_until_layer=3):
    model = torchvision.models.resnet50(weights="IMAGENET1K_V2")
    # freeze all
    for p in model.parameters():
        p.requires_grad = False
    # unfreeze from layer4 upwards
    layer_names = ["layer1", "layer2", "layer3", "layer4"]
    for ln in layer_names[freeze_until_layer:]:
        for p in getattr(model, ln).parameters():
            p.requires_grad = True
    # replace classifier
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    return model


# ------------------------------------------------------------------
# 4) MAIN TRAIN LOOP
# ------------------------------------------------------------------
def main(static_config):
    model_config   = static_config["model_config"]
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Data loading
    set_seeds(model_config["seed"])
    train_dl, val_dl, class_names = get_train_val_data_loaders(
        data_root = os.path.join(project_root, static_config["data_root"]),
        img_size  = model_config["resize_dim"],
        batch_size= model_config["batch_size"],
        val_ratio = model_config["val_ratio"],
        seed      = model_config["seed"]
    )
    num_classes = len(class_names)
    print(f"Loaded data: {num_classes} classes")

    # W&B initialization
    wandb.init(
        project   = static_config["wandb_project"],
        name      = static_config["wandb_run_name"],
        config    = {**model_config, "num_classes": num_classes}
    )
    wandb.watch_called = False
    # watch model once it's on device:
    model = build_finetune_resnet50(num_classes,
                                    freeze_until_layer=model_config["freeze_until_layer"])
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model = model.to(device)
    print(f"Model loaded: {model.__class__.__name__} ({num_classes} classes)")
    wandb.watch(model, log="all", log_freq=100)


    # --- optimizer with discriminative LRs for head & layer4 only ---
    base_model = model.module if isinstance(model, nn.DataParallel) else model
    head_params   = list(base_model.fc.parameters())
    layer4_params = list(base_model.layer4.parameters())

    optimizer = optim.AdamW([
        {"params": head_params,   "lr": 1e-3},
        {"params": layer4_params, "lr": 1e-4},
    ], weight_decay=1e-4)

    scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=5, T_mult=2)
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

    patience      = model_config["patience"]
    best_val_acc  = -float("inf")
    patience_cnt  = 0
    # os.makedirs(project_root, static_config["output_dir"], exist_ok=True)
    ckpt_path     = os.path.join(project_root, static_config["output_dir"], "dummy_partB_Q3_best_resnet50.pth")

    UNFREEZE_EPOCH = 4
    epochs         = model_config["epochs"]

    # Training loop
    for epoch in range(1, epochs+1):
        # staged unfreeze of layer3
        if epoch == UNFREEZE_EPOCH:
            print(f">> Unfreezing layer3 at epoch {epoch}")
            layer3 = base_model.layer3
            for p in layer3.parameters():
                p.requires_grad = True
            optimizer.add_param_group({
                "params": list(layer3.parameters()),
                "lr": 1e-4
            })

        train_loss, train_acc = train_one_epoch(model, train_dl, optimizer, criterion, device)
        val_loss,   val_acc   = validate_one_epoch(model, val_dl,   criterion, device)
        scheduler.step()

        print(f"Epoch: {epoch:02d}/{epochs}]  Train_Loss: {train_loss:.3f} Train_Accuracy: {train_acc:.3f} | "
              f"Val_Loss: {val_loss:.3f} Val_Accuracy: {val_acc:.3f}")
        
        #  log to W&B
        wandb.log({
            "PartB_Q3_epoch":         epoch,
            "PartB_Q3_train_loss":    train_loss,
            "PartB_Q3_train_acc":     train_acc,
            "PartB_Q3_val_loss":      val_loss,
            "PartB_Q3_val_acc":       val_acc,
            "PartB_Q3_lr_head":       optimizer.param_groups[0]["lr"],
            "PartB_Q3_lr_layer4":     optimizer.param_groups[1]["lr"],
        })

        # early stopping
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            patience_cnt = 0
            torch.save(model.state_dict(), ckpt_path)
            print(f"  >> New best val‑acc {best_val_acc:.3f} (checkpoint saved)")
        else:
            patience_cnt += 1
            print(f"  >> No improvement ({patience_cnt}/{patience})")
            if patience_cnt >= patience:
                print("Early stopping.")
                break

    # load best & final test eval
    print(f"Training done. Best val‑acc = {best_val_acc:.3f}")

    print("Loading best model for final evaluation...")
    model.load_state_dict(torch.load(ckpt_path))

    print("Getting test data loader...")
    test_loader = get_test_data_loader(os.path.join(project_root, static_config["data_root"]), model_config["resize_dim"], model_config["batch_size"])

    print("Evaluating on test set...")
    y_true, y_pred, test_acc = evaluate_resnet_on_test(model, test_loader, device)
    print(f"Test accuracy = {test_acc*100:.2f}%")
    
    # log final metrics and plot
    wandb.log({"PartB_Q3_test_acc": test_acc})
    cm = wandb.plot.confusion_matrix(
        probs    = None,
        y_true   = y_true,
        preds    = y_pred,
        class_names = class_names
    )
    wandb.log({"PartB_Q3_confusion_matrix": cm})

if __name__ == "__main__":
    config = get_configs(project_root, "configs.yaml")["part_b_configs"]["solution_3_configs"]
    main(config)


Loaded data: 10 classes


Model loaded: ResNet (10 classes)
Epoch: 01/20]  Train_Loss: 1.164 Train_Accuracy: 0.752 | Val_Loss: 0.848 Val_Accuracy: 0.871
  >> New best val‑acc 0.871 (checkpoint saved)
Epoch: 02/20]  Train_Loss: 0.695 Train_Accuracy: 0.941 | Val_Loss: 0.788 Val_Accuracy: 0.897
  >> New best val‑acc 0.897 (checkpoint saved)
Epoch: 03/20]  Train_Loss: 0.585 Train_Accuracy: 0.985 | Val_Loss: 0.771 Val_Accuracy: 0.905
  >> New best val‑acc 0.905 (checkpoint saved)
>> Unfreezing layer3 at epoch 4
Epoch: 04/20]  Train_Loss: 0.552 Train_Accuracy: 0.994 | Val_Loss: 0.765 Val_Accuracy: 0.907
  >> New best val‑acc 0.907 (checkpoint saved)
Epoch: 05/20]  Train_Loss: 0.533 Train_Accuracy: 0.999 | Val_Loss: 0.761 Val_Accuracy: 0.900
  >> No improvement (1/3)
Epoch: 06/20]  Train_Loss: 0.528 Train_Accuracy: 0.999 | Val_Loss: 0.769 Val_Accuracy: 0.900
  >> No improvement (2/3)
Epoch: 07/20]  Train_Loss: 0.525 Train_Accuracy: 1.000 | Val_Loss: 0.754 Val_Accuracy: 0.909
  >> New best val‑acc 0.909 (checkpoint sav

  model.load_state_dict(torch.load(ckpt_path))


Test accuracy = 90.90%
