In [1]:
# Mohammadmilad Sayyad
# Problem 2.b

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

import torchvision
import torchvision.transforms as transforms

import time

# Reproducibility: seed PyTorch's global RNG so that weight initialisation and
# the shuffled batch order repeat across "Restart Kernel -> Run All".
# (Some GPU kernels may still be nondeterministic.)
seed = 42
torch.manual_seed(seed)

# Device: use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Hyperparameters
batch_size    = 128
learning_rate = 0.01
num_epochs    = 100   # <<< 100 epochs for Problem 2.b

# CIFAR-10 normalization (per-channel mean / std passed to Normalize below)
mean = (0.4914, 0.4822, 0.4465)
std  = (0.2023, 0.1994, 0.2010)

# No data augmentation is applied: the train and test pipelines are the same
# (tensor conversion followed by per-channel normalization).
transform_train = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])

# CIFAR-10 datasets and loaders (downloaded into ./data when missing)
train_dataset = torchvision.datasets.CIFAR10(
    root="./data", train=True, download=True, transform=transform_train
)

test_dataset = torchvision.datasets.CIFAR10(
    root="./data", train=False, download=True, transform=transform_test
)

# Only the training loader shuffles; the test loader keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
test_loader  = DataLoader(test_dataset,  batch_size=batch_size, shuffle=False, num_workers=2)

print("Train samples:", len(train_dataset))
print("Test samples:", len(test_dataset))


Using device: cuda


100%|██████████| 170M/170M [00:18<00:00, 9.16MB/s]


Train samples: 50000
Test samples: 10000


In [2]:
class ResidualBlock(nn.Module):
    """Residual block with two 3x3 convolutions and no normalization.

    Main path: conv3x3 (given stride) -> ReLU -> conv3x3 (stride 1).
    The block input is added to the main-path output before the final ReLU.
    When `stride != 1` or the channel count changes, the input is first passed
    through a bias-free 1x1 convolution with the same stride; otherwise it is
    added unchanged and `self.shortcut` is None.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()

        needs_projection = (stride != 1) or (in_channels != out_channels)

        self.conv1 = nn.Conv2d(
            in_channels, out_channels,
            kernel_size=3, stride=stride, padding=1, bias=False,
        )
        self.conv2 = nn.Conv2d(
            out_channels, out_channels,
            kernel_size=3, stride=1, padding=1, bias=False,
        )
        self.shortcut = (
            nn.Conv2d(in_channels, out_channels,
                      kernel_size=1, stride=stride, bias=False)
            if needs_projection
            else None
        )

    def forward(self, x):
        # Main (residual) branch.
        residual = self.conv2(F.relu(self.conv1(x)))

        # Skip branch: raw input, or its 1x1 projection when one was built.
        if self.shortcut is None:
            skip = x
        else:
            skip = self.shortcut(x)

        return F.relu(residual + skip)


class ResNet10(nn.Module):
    """Plain residual network: no BatchNorm and no dropout.

    Layout: 3x3 stem convolution (3 -> 16 channels) + ReLU, three stages of
    `ResidualBlock` (3, 3 and 4 blocks at 16, 32 and 64 channels; stages two
    and three open with a stride-2 block), global average pooling, and a
    linear layer producing `num_classes` logits.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        # Channel count fed to the next block; `_make_layer` keeps it current.
        self.in_channels = 16

        # Stem convolution
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False)

        # 10 residual blocks in total (3 + 3 + 4).
        # Spatial sizes in the trailing comments hold for 32x32 inputs.
        self.layer1 = self._make_layer(16, num_blocks=3, stride=1)  # 32x32
        self.layer2 = self._make_layer(32, num_blocks=3, stride=2)  # 16x16
        self.layer3 = self._make_layer(64, num_blocks=4, stride=2)  # 8x8

        # Classifier applied after global average pooling
        self.fc = nn.Linear(64, num_classes)

    def _make_layer(self, out_channels, num_blocks, stride):
        """Build one stage: a first block with `stride`, then stride-1 blocks.

        Updates `self.in_channels` to `out_channels` for the following stage.
        """
        opening = ResidualBlock(self.in_channels, out_channels, stride=stride)
        followers = [
            ResidualBlock(out_channels, out_channels, stride=1)
            for _ in range(num_blocks - 1)
        ]
        self.in_channels = out_channels
        return nn.Sequential(opening, *followers)

    def forward(self, x):
        feats = F.relu(self.conv1(x))
        for stage in (self.layer1, self.layer2, self.layer3):
            feats = stage(feats)

        pooled = F.adaptive_avg_pool2d(feats, (1, 1))
        flat = pooled.view(pooled.size(0), -1)  # (N, 64)
        return self.fc(flat)


In [3]:
class ResidualBlockBN(nn.Module):
    """Residual block with BatchNorm after every convolution.

    Main path: conv3x3 (given stride) -> BN -> ReLU -> conv3x3 -> BN.
    The block input is added before the final ReLU. When `stride != 1` or the
    channel count changes, the input first goes through a bias-free 1x1
    convolution (same stride) followed by BatchNorm; otherwise it is added
    unchanged and `self.shortcut` is None.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()

        needs_projection = (stride != 1) or (in_channels != out_channels)

        self.conv1 = nn.Conv2d(
            in_channels, out_channels,
            kernel_size=3, stride=stride, padding=1, bias=False,
        )
        self.bn1 = nn.BatchNorm2d(out_channels)

        self.conv2 = nn.Conv2d(
            out_channels, out_channels,
            kernel_size=3, stride=1, padding=1, bias=False,
        )
        self.bn2 = nn.BatchNorm2d(out_channels)

        if needs_projection:
            projection = nn.Conv2d(in_channels, out_channels,
                                   kernel_size=1, stride=stride, bias=False)
            self.shortcut = nn.Sequential(projection, nn.BatchNorm2d(out_channels))
        else:
            self.shortcut = None

    def forward(self, x):
        # Main (residual) branch.
        hidden = F.relu(self.bn1(self.conv1(x)))
        residual = self.bn2(self.conv2(hidden))

        # Skip branch: raw input, or its projected + normalized version.
        if self.shortcut is None:
            skip = x
        else:
            skip = self.shortcut(x)

        return F.relu(residual + skip)


class ResNet10_BN(nn.Module):
    """Residual network with BatchNorm after every convolution.

    Layout: 3x3 stem convolution (3 -> 16 channels) + BN + ReLU, three stages
    of `ResidualBlockBN` (3, 3 and 4 blocks at 16, 32 and 64 channels; stages
    two and three open with a stride-2 block), global average pooling, and a
    linear layer producing `num_classes` logits.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        # Channel count fed to the next block; `_make_layer` keeps it current.
        self.in_channels = 16

        # Stem: convolution followed by its BatchNorm
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(16)

        # Three residual stages (3 + 3 + 4 blocks)
        self.layer1 = self._make_layer(16, num_blocks=3, stride=1)
        self.layer2 = self._make_layer(32, num_blocks=3, stride=2)
        self.layer3 = self._make_layer(64, num_blocks=4, stride=2)

        # Classifier applied after global average pooling
        self.fc = nn.Linear(64, num_classes)

    def _make_layer(self, out_channels, num_blocks, stride):
        """Build one stage: a first block with `stride`, then stride-1 blocks.

        Updates `self.in_channels` to `out_channels` for the following stage.
        """
        opening = ResidualBlockBN(self.in_channels, out_channels, stride=stride)
        followers = [
            ResidualBlockBN(out_channels, out_channels, stride=1)
            for _ in range(num_blocks - 1)
        ]
        self.in_channels = out_channels
        return nn.Sequential(opening, *followers)

    def forward(self, x):
        feats = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3):
            feats = stage(feats)

        pooled = F.adaptive_avg_pool2d(feats, (1, 1))
        flat = pooled.view(pooled.size(0), -1)
        return self.fc(flat)


In [4]:
class ResNet10_Dropout(nn.Module):
    """Plain residual network with dropout in front of the classifier.

    Same layout as the unnormalized network (3x3 stem + ReLU, then 3, 3 and 4
    `ResidualBlock`s at 16, 32 and 64 channels, global average pooling), with
    `nn.Dropout(p)` applied to the pooled feature vector before the final
    linear layer.
    """

    def __init__(self, num_classes=10, p=0.3):
        super().__init__()

        # Channel count fed to the next block; `_make_layer` keeps it current.
        self.in_channels = 16

        # Stem convolution
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False)

        # Three residual stages (3 + 3 + 4 blocks)
        self.layer1 = self._make_layer(16, num_blocks=3, stride=1)
        self.layer2 = self._make_layer(32, num_blocks=3, stride=2)
        self.layer3 = self._make_layer(64, num_blocks=4, stride=2)

        # Dropout on the pooled features, then the classifier
        self.dropout = nn.Dropout(p=p)
        self.fc = nn.Linear(64, num_classes)

    def _make_layer(self, out_channels, num_blocks, stride):
        """Build one stage: a first block with `stride`, then stride-1 blocks.

        Updates `self.in_channels` to `out_channels` for the following stage.
        """
        opening = ResidualBlock(self.in_channels, out_channels, stride=stride)
        followers = [
            ResidualBlock(out_channels, out_channels, stride=1)
            for _ in range(num_blocks - 1)
        ]
        self.in_channels = out_channels
        return nn.Sequential(opening, *followers)

    def forward(self, x):
        feats = F.relu(self.conv1(x))
        for stage in (self.layer1, self.layer2, self.layer3):
            feats = stage(feats)

        pooled = F.adaptive_avg_pool2d(feats, (1, 1))
        flat = pooled.view(pooled.size(0), -1)
        return self.fc(self.dropout(flat))


In [5]:
def count_parameters(model):
    """Return the total element count of `model`'s parameters that require gradients."""
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total


def train_one_epoch(model, optimizer, criterion, dataloader, device):
    """Run one optimisation pass over `dataloader` and return the mean loss.

    Args:
        model: module to train; switched to training mode here.
        optimizer: optimizer stepping `model`'s parameters.
        criterion: loss callable `criterion(outputs, labels)`. Its value is
            weighted by the batch size, so it is assumed to return the batch
            mean (the default for `nn.CrossEntropyLoss`).
        dataloader: iterable of `(images, labels)` batches.
        device: device the batches are moved to before the forward pass.

    Returns:
        Mean per-sample loss over the samples actually processed.

    Raises:
        ValueError: if the dataloader yields no samples.
    """
    model.train()
    running_loss = 0.0
    samples_seen = 0

    for images, labels in dataloader:
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_count = images.size(0)
        running_loss += loss.item() * batch_count
        samples_seen += batch_count

    if samples_seen == 0:
        raise ValueError("dataloader yielded no samples")

    # Divide by the number of samples that were really iterated, not by
    # len(dataloader.dataset): the two differ with drop_last=True or when a
    # sampler only visits part of the dataset.
    return running_loss / samples_seen


def evaluate(model, dataloader, device):
    """Compute mean cross-entropy loss and top-1 accuracy over `dataloader`.

    The model is switched to eval mode (and left in it) and no gradients are
    tracked.

    Args:
        model: module mapping a batch of images to class logits.
        dataloader: iterable of `(images, labels)` batches.
        device: device the batches are moved to before the forward pass.

    Returns:
        `(avg_loss, accuracy)`: mean per-sample cross-entropy and accuracy in
        percent, both over the samples actually processed.

    Raises:
        ValueError: if the dataloader yields no samples.
    """
    model.eval()
    criterion = nn.CrossEntropyLoss()
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in dataloader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)
            # criterion returns the batch mean; re-weight by the batch size.
            running_loss += loss.item() * images.size(0)

            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        raise ValueError("dataloader yielded no samples")

    # Loss and accuracy share the same denominator: the number of samples
    # really iterated. len(dataloader.dataset) would be wrong with
    # drop_last=True or a sampler that visits only part of the dataset.
    avg_loss = running_loss / total
    accuracy = 100.0 * correct / total
    return avg_loss, accuracy


def run_experiment(model, optimizer, num_epochs, label):
    """Train `model` for `num_epochs`, evaluating on the test set every epoch.

    Uses the module-level `device`, `train_loader` and `test_loader`.
    Progress and a final summary are printed under the heading `label`.

    Args:
        model: network to train; it is moved to `device` first.
        optimizer: optimizer already built over `model`'s parameters.
        num_epochs: number of passes over `train_loader`; must be at least 1.
        label: experiment name used in the printed output and the result.

    Returns:
        dict with keys "label", "time", "train_loss", "test_loss", "test_acc"
        and "params". The loss/accuracy entries are the values from the last
        epoch. "time" is the wall-clock duration of the whole epoch loop in
        seconds, so it includes the per-epoch test evaluation.

    Raises:
        ValueError: if `num_epochs` is smaller than 1.
    """
    # Fail before doing any work: with zero epochs there are no final metrics
    # to report and indexing the empty histories below would raise IndexError.
    if num_epochs < 1:
        raise ValueError(f"num_epochs must be >= 1, got {num_epochs}")

    model = model.to(device)
    criterion = nn.CrossEntropyLoss()

    num_params = count_parameters(model)
    print(f"\n===== {label} =====")
    print(f"Model parameters: {num_params}")

    train_losses = []
    test_losses = []
    test_accuracies = []

    start_time = time.time()
    print(f"Training for {num_epochs} epochs...")

    for epoch in range(num_epochs):
        train_loss = train_one_epoch(model, optimizer, criterion, train_loader, device)
        test_loss, test_acc = evaluate(model, test_loader, device)

        train_losses.append(train_loss)
        test_losses.append(test_loss)
        test_accuracies.append(test_acc)

        print(f"[{label}] Epoch [{epoch+1}/{num_epochs}] "
              f"Train Loss: {train_loss:.4f} | "
              f"Test Loss: {test_loss:.4f} | "
              f"Test Acc: {test_acc:.2f}%")

    end_time = time.time()
    training_time = end_time - start_time

    print(f"\n=== Final Results: {label} ===")
    print(f"Training time: {training_time:.2f} seconds")
    print(f"Final training loss: {train_losses[-1]:.4f}")
    print(f"Final test loss: {test_losses[-1]:.4f}")
    print(f"Final test accuracy: {test_accuracies[-1]:.2f}%")
    print(f"Model size (parameters): {num_params}")

    return {
        "label": label,
        "time": training_time,
        "train_loss": train_losses[-1],
        "test_loss": test_losses[-1],
        "test_acc": test_accuracies[-1],
        "params": num_params,
    }


In [6]:
results = []


def _build_sgd(model, weight_decay):
    """SGD over all of `model`'s parameters with the notebook's learning rate and momentum 0.9."""
    return optim.SGD(
        model.parameters(),
        lr=learning_rate,
        momentum=0.9,
        weight_decay=weight_decay,
    )


# Experiment 1: L2 weight decay (lambda = 0.001) on the plain network
# (no BatchNorm, no dropout).
model_wd = ResNet10(num_classes=10)
optimizer_wd = _build_sgd(model_wd, weight_decay=0.001)
res_wd = run_experiment(
    model_wd, optimizer_wd, num_epochs,
    label="ResNet-10 + Weight Decay (λ=0.001)",
)
results.append(res_wd)


# Experiment 2: dropout (p = 0.3) before the classifier
# (no BatchNorm, no weight decay).
model_do = ResNet10_Dropout(num_classes=10, p=0.3)
optimizer_do = _build_sgd(model_do, weight_decay=0.0)
res_do = run_experiment(
    model_do, optimizer_do, num_epochs,
    label="ResNet-10 + Dropout (p=0.3)",
)
results.append(res_do)


# Experiment 3: BatchNorm after every convolution
# (no dropout, no weight decay).
model_bn = ResNet10_BN(num_classes=10)
optimizer_bn = _build_sgd(model_bn, weight_decay=0.0)
res_bn = run_experiment(
    model_bn, optimizer_bn, num_epochs,
    label="ResNet-10 + BatchNorm",
)
results.append(res_bn)

# One summary line per experiment, in the order they were run.
print("\n===== Summary of All 3 Regularization Experiments =====")
for r in results:
    summary_line = (
        f"{r['label']}: time={r['time']:.2f}s, "
        f"train_loss={r['train_loss']:.4f}, test_acc={r['test_acc']:.2f}%, "
        f"params={r['params']}"
    )
    print(summary_line)



===== ResNet-10 + Weight Decay (λ=0.001) =====
Model parameters: 344634
Training for 100 epochs...
[ResNet-10 + Weight Decay (λ=0.001)] Epoch [1/100] Train Loss: 2.0554 | Test Loss: 1.8224 | Test Acc: 32.14%
[ResNet-10 + Weight Decay (λ=0.001)] Epoch [2/100] Train Loss: 1.7126 | Test Loss: 1.5542 | Test Acc: 43.54%
[ResNet-10 + Weight Decay (λ=0.001)] Epoch [3/100] Train Loss: 1.5323 | Test Loss: 1.4446 | Test Acc: 46.06%
[ResNet-10 + Weight Decay (λ=0.001)] Epoch [4/100] Train Loss: 1.4092 | Test Loss: 1.4337 | Test Acc: 48.31%
[ResNet-10 + Weight Decay (λ=0.001)] Epoch [5/100] Train Loss: 1.2690 | Test Loss: 1.1980 | Test Acc: 55.69%
[ResNet-10 + Weight Decay (λ=0.001)] Epoch [6/100] Train Loss: 1.1460 | Test Loss: 1.1526 | Test Acc: 58.68%
[ResNet-10 + Weight Decay (λ=0.001)] Epoch [7/100] Train Loss: 1.0618 | Test Loss: 1.0664 | Test Acc: 62.73%
[ResNet-10 + Weight Decay (λ=0.001)] Epoch [8/100] Train Loss: 0.9869 | Test Loss: 0.9777 | Test Acc: 65.87%
[ResNet-10 + Weight Decay (λ