# ResNet-34 From Scratch

This notebook trains ResNet-34 on Tiny ImageNet, whose images are 64×64 instead of ImageNet’s standard 224×224.  
To suit the smaller input, the architecture is modified: the first 7×7 convolution is replaced by a 3×3 convolution with stride 1 and padding 1, so the height and width remain the same, and the first MaxPool layer is removed.

For augmentation I use random cropping with padding and random horizontal flipping. The images are then normalized with the dataset-specific mean and std values computed in mean_std.py.


In [None]:
import torch
import torchvision
from torch import nn, optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, random_split
from torchvision import datasets

# Training-time pipeline: light augmentation followed by normalization.
transform = transforms.Compose([
    transforms.RandomCrop(size=(64, 64), padding=4),        # slightly pad and then crop back to 64x64
    transforms.RandomHorizontalFlip(),                      # randomly flip images left and right
    transforms.ToTensor(),                                  # convert image to tensor
    transforms.Normalize(mean=[0.4802, 0.4481, 0.3975],     # normalize using mean & std
                         std=[0.2296, 0.2263, 0.2255]),
])

# Evaluation-time pipeline: identical normalization but no random
# augmentation, so validation metrics are deterministic and comparable
# from one epoch to the next.
eval_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4802, 0.4481, 0.3975],
                         std=[0.2296, 0.2263, 0.2255]),
])

# Train location on my laptop. The official validation set is scrambled and
# annoying to fix (especially in Kaggle), so a validation split is carved off
# the train set instead. The saved model is run on the test data later.
data_dir = 'tiny-imagenet-200/train'

# Two views of the same folder that differ only in their transform.
full_dataset = datasets.ImageFolder(root=data_dir, transform=transform)
eval_dataset = datasets.ImageFolder(root=data_dir, transform=eval_transform)

# The split indices are shared between the two views, so they must list the
# same files in the same order.
if eval_dataset.samples != full_dataset.samples:
    raise RuntimeError("train/eval views of the dataset are not index-aligned")

train_size = int(0.9 * len(full_dataset))
val_size = len(full_dataset) - train_size

# Seed the split so the same images are held out on every run; otherwise a
# re-run would move training images into the validation set and vice versa.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_split = random_split(
    full_dataset, [train_size, val_size], generator=split_generator
)
# Same held-out indices, but read through the non-augmented view.
val_dataset = torch.utils.data.Subset(eval_dataset, val_split.indices)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=4, pin_memory=True)


ResNet Block

![Res_block](figures/resnet-block.png)

***d2l.ai***

Padding is fixed to 1 for both 3×3 convolutions in every ResNet block; the optional 1×1 convolution on the skip path uses no padding.

In [2]:
class ResNetBlock(nn.Module):
    """Residual block built from two 3x3 convolutions.

    Main path: conv3x3 -> BatchNorm -> ReLU -> conv3x3 -> BatchNorm.
    The result is added to the shortcut and passed through a final ReLU.

    Args:
        in_ch: Number of channels of the incoming feature map.
        out_ch: Number of channels produced by the block.
        stride: Stride of the first 3x3 convolution (and of the shortcut
            projection, when one is used).
    """

    def __init__(self, in_ch: int, out_ch: int, stride: int):
        super().__init__()

        # Only the first convolution may change resolution / channel count.
        first_conv = nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=stride, padding=1)
        first_norm = nn.BatchNorm2d(out_ch)
        second_conv = nn.Conv2d(out_ch, out_ch, kernel_size=3, padding=1)
        second_norm = nn.BatchNorm2d(out_ch)
        self.sequence = nn.Sequential(
            first_conv, first_norm, nn.ReLU(), second_conv, second_norm
        )

        # The input can be added to the main-path output as-is only if both
        # have the same shape; otherwise a 1x1 convolution reshapes it.
        needs_projection = stride != 1 or in_ch != out_ch
        if needs_projection:
            self.skip = nn.Conv2d(in_ch, out_ch, kernel_size=1, stride=stride)
        else:
            self.skip = nn.Identity()

        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``relu(skip(x) + sequence(x))``."""
        residual = self.sequence(x)
        shortcut = self.skip(x)
        return self.relu(shortcut + residual)

In [3]:
class ResNetStack(nn.Module):
    """A stage of ``blocks`` consecutive ``ResNetBlock`` modules.

    Args:
        in_ch: Channels entering the stage.
        out_ch: Channels produced by every block in the stage.
        stride: Stride applied by the first block only.
        blocks: Total number of blocks; the first block is always created.
    """

    def __init__(self, in_ch: int, out_ch: int, stride: int, blocks: int):
        super().__init__()
        # The leading block absorbs the channel change and the stride.
        head = ResNetBlock(in_ch, out_ch, stride)
        # Every following block keeps both channels and resolution.
        tail = [ResNetBlock(out_ch, out_ch, 1) for _ in range(1, blocks)]
        self.net = nn.Sequential(head, *tail)

    def forward(self, x):
        """Apply the blocks in order."""
        return self.net(x)

| Layer         | Details                                                             | Output shape |
| ------------- | ------------------------------------------------------------------- | ------------ |
| **Input**     | 3×64×64                                                             | 64×64×3      |
| **conv1**     | 3×3, stride=1, pad=1, 64 filters                                    | 64×64×64     |
| **bn + relu** |                                                                     | 64×64×64     |
| **conv2\_x**  | 3 blocks: each <br> \[3×3,64,s=1,p=1] + \[3×3,64,s=1,p=1] + skip    | 64×64×64     |
| **conv3\_x**  | 4 blocks: first block has stride=2 (downsamples to 32×32), then s=1 | 32×32×128    |
| **conv4\_x**  | 6 blocks: first block has stride=2 (16×16), then s=1                | 16×16×256    |
| **conv5\_x**  | 3 blocks: first block has stride=2 (8×8), then s=1                  | 8×8×512      |
| **avgpool**   | global avg pool                                                     | 1×1×512      |
| **fc**        | linear 512→200                                                      | 200          |

| Block    | k | s | p | out channels | repeats | downsample  |
| -------- | - | - | - | ------------ | ------- | ------------|
| conv2\_x | 3 | 1 | 1 | 64           | 3       | No          |
| conv3\_x | 3 | 2 | 1 | 128          | 4       | Yes         |
| conv4\_x | 3 | 2 | 1 | 256          | 6       | Yes         |
| conv5\_x | 3 | 2 | 1 | 512          | 3       | Yes         |


In [None]:
class ResNet34(nn.Module):
    """ResNet-34 variant with a 3x3, stride-1 stem and no initial max-pool.

    The network is a stem (conv3x3 -> BatchNorm -> ReLU) followed by four
    residual stages of 3, 4, 6 and 3 blocks with 64, 128, 256 and 512
    channels. Stages two to four each start with a stride-2 block. Global
    average pooling, dropout and one linear layer form the classifier.

    Args:
        num_classes: Size of the output layer. Defaults to 200.
        dropout: Drop probability applied to the pooled features before the
            linear layer. Defaults to 0.4.
    """

    def __init__(self, num_classes: int = 200, dropout: float = 0.4):
        super().__init__()
        # Stem: stride 1 with padding 1 keeps the input resolution.
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU()
        )

        # Residual stages; only the first block of a stage uses the stride.
        self.conv2_x = ResNetStack(in_ch=64, out_ch=64, stride=1, blocks=3)
        self.conv3_x = ResNetStack(in_ch=64, out_ch=128, stride=2, blocks=4)
        self.conv4_x = ResNetStack(in_ch=128, out_ch=256, stride=2, blocks=6)
        self.conv5_x = ResNetStack(in_ch=256, out_ch=512, stride=2, blocks=3)

        # Classifier head.
        self.pool = nn.AdaptiveAvgPool2d((1, 1))
        self.flatten = nn.Flatten(1)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(in_features=512, out_features=num_classes)

    def forward(self, x):
        """Map a batch of 3-channel images to ``num_classes`` logits each."""
        x = self.conv1(x)
        x = self.conv2_x(x)
        x = self.conv3_x(x)
        x = self.conv4_x(x)
        x = self.conv5_x(x)
        x = self.pool(x)       # -> (batch, 512, 1, 1)
        x = self.flatten(x)    # -> (batch, 512)
        x = self.dropout(x)
        x = self.linear(x)     # -> (batch, num_classes)
        return x


In [8]:
# Smoke test: push a single random 3x64x64 input through a freshly built
# network and inspect the shape of the returned logits.
model = ResNet34()

X = torch.randn((1, 3, 64, 64))
out = model(X)

out.shape  # expected: torch.Size([1, 200])

torch.Size([1, 200])

| Parameter    | Value                                  |
| ------------ | -------------------------------------- |
| optimizer    | SGD + momentum=0.9                     |
| lr           | 0.1 (step decay or cosine)             |
| weight decay | 1e-4                                   |
| batch size   | 128                                    |
| epochs       | 80-100                                 |
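| augmentation | RandomCrop(64,4), RandomHorizontalFlip |

Note: the training cell below does not follow this table exactly. It uses lr=0.01, weight decay=4e-4, a batch size of 64 (set in the data loaders) and cosine annealing over 100 epochs.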


In [None]:
from torch.optim.lr_scheduler import StepLR
import json

def _run_epoch(model, loader, loss_fn, device, optimizer=None):
    """Run one full pass over ``loader`` and return ``(mean_loss, accuracy)``.

    When ``optimizer`` is given the model is put in training mode and updated
    after every batch. Otherwise the model is put in eval mode and gradients
    are disabled.

    Args:
        model: Network to train or evaluate.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        loss_fn: Callable mapping ``(logits, targets)`` to a scalar loss.
        device: Device the batches are moved to.
        optimizer: Optimizer used for the update step, or ``None`` to evaluate.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    training = optimizer is not None
    model.train(training)

    total_loss = 0.0
    correct = 0
    total = 0

    with torch.set_grad_enabled(training):
        for xb, yb in loader:
            # non_blocking lets the copy overlap with compute when the
            # loader hands out pinned memory.
            xb = xb.to(device, non_blocking=True)
            yb = yb.to(device, non_blocking=True)

            if training:
                optimizer.zero_grad()
            yhat = model(xb)
            loss = loss_fn(yhat, yb)
            if training:
                loss.backward()
                optimizer.step()

            # Weight by batch size so a smaller final batch does not skew
            # the per-sample mean.
            batch_size = xb.size(0)
            total_loss += loss.item() * batch_size
            preds = torch.argmax(yhat, dim=1)
            correct += (preds == yb).sum().item()
            total += batch_size

    if total == 0:
        raise ValueError("loader produced no samples")
    return total_loss / total, correct / total


best_val_acc = 0.0
# Per-epoch metrics; rewritten to disk after every epoch.
history = {
    "train_loss": [],
    "train_acc": [],
    "val_loss": [],
    "val_acc": [],
    "lr": []
}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

model = ResNet34().to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=0.0004)

epochs = 100
# Tie the cosine period to the epoch count so the schedule always ends
# exactly when training ends, even if `epochs` is changed.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

for epoch in range(epochs):
    avg_loss, accuracy = _run_epoch(model, train_loader, loss_fn, device, optimizer)
    avg_val_loss, val_accuracy = _run_epoch(model, val_loader, loss_fn, device)

    # Read before scheduler.step(): this is the rate used during this epoch.
    current_lr = scheduler.get_last_lr()[0]

    history["train_loss"].append(avg_loss)
    history["train_acc"].append(accuracy)
    history["val_loss"].append(avg_val_loss)
    history["val_acc"].append(val_accuracy)
    history["lr"].append(current_lr)

    # Rewrite the whole file each epoch so an interrupted run keeps its log.
    with open("training_history.json", "w") as f:
        json.dump(history, f)

    # Keep only the weights with the best validation accuracy so far.
    if val_accuracy > best_val_acc:
        best_val_acc = val_accuracy
        torch.save(model.state_dict(), "ResNet34_best.pth")
        print("Saved new best model.")

    scheduler.step()

    print(f"Epoch {epoch+1}: "
          f"Train Loss: {avg_loss:.4f}, Train Acc: {accuracy:.4f}, "
          f"Val Loss: {avg_val_loss:.4f}, Val Acc: {val_accuracy:.4f}")