[**Residual Network (ResNet) (He et al., 2016)**](https://arxiv.org/abs/1512.03385) is a widely adopted architecture that remains one of the most popular off-the-shelf backbones in computer vision. It uses residual (skip) connections to make very deep networks trainable.

![ResNet architectures (right), residual block (top left), and bottleneck layer (bottom left)](./imgs/Resnet-Architectures-Right-And-Residual-Block-Top-Left-Bottleneck-Layer-Bottom.ppm)
> Image Source: [ResearchGate](https://www.researchgate.net/figure/Resnet-Architectures-Right-And-Residual-Block-Top-Left-Bottleneck-Layer-Bottom_fig1_350524328)

ResNet introduces residual connections to combat the vanishing gradient problem in deep neural networks. The key idea is to add shortcut connections that skip one or more layers, so that each block only has to learn a residual function $F(x)$ and outputs $F(x) + x$, rather than learning the full transformation directly. The architecture consists of:

1. Stem: An initial convolutional layer followed by batch normalization, ReLU, and max pooling.
2. Four Stages: Each stage contains multiple residual blocks, with the first block in stages 2–4 typically downsampling the feature maps and increasing the channel count.
3. Global Average Pooling: Reduces spatial dimensions to 1x1.
4. Fully Connected Layer: Outputs class predictions.

The two main block types are:
1. BasicBlock: Contains two 3x3 convolutional layers; used in ResNet-18 and ResNet-34.
2. BottleneckBlock: Contains three convolutional layers (1x1, 3x3, 1x1); used in deeper models like ResNet-50 to reduce computational cost.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import utils

In [3]:
class BasicBlock(nn.Module):
    """Basic residual block (used by ResNet-18 and ResNet-34).

    The main path is ``conv3x3 -> BN -> ReLU -> conv3x3 -> BN``. Its output is
    summed with the shortcut branch and the sum goes through a final ReLU.

    Args:
        in_channels: Channel count of the incoming feature map.
        out_channels: Channel count produced by the block.
        stride: Stride of the first 3x3 convolution and, when a projection
            is needed, of the 1x1 shortcut convolution. Defaults to 1.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()

        # The input can be added back untouched only if the main path keeps
        # both the spatial size and the channel count.
        keeps_shape = stride == 1 and in_channels == out_channels

        # Main path, first half: the strided 3x3 convolution.
        self.conv1 = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=3,
            stride=stride,
            padding=1,
        )
        self.bn1 = nn.BatchNorm2d(out_channels)
        # One in-place ReLU module is reused for both activations.
        self.relu = nn.ReLU(inplace=True)

        # Main path, second half: a stride-1 3x3 convolution.
        self.conv2 = nn.Conv2d(
            out_channels,
            out_channels,
            kernel_size=3,
            stride=1,
            padding=1,
        )
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Shortcut connection: identity when shapes match, otherwise a
        # 1x1 convolution + BN projection onto the output shape.
        if keeps_shape:
            self.shortcut = nn.Identity()
        else:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride),
                nn.BatchNorm2d(out_channels),
            )

    def forward(self, x):
        """Apply the block to ``x`` and return the activated sum."""
        # Main path
        features = self.conv1(x)
        features = self.bn1(features)
        features = self.relu(features)
        features = self.conv2(features)
        features = self.bn2(features)

        # Residual addition followed by the final activation
        features += self.shortcut(x)
        return self.relu(features)

In [4]:
class BottleneckBlock(nn.Module):
    """Bottleneck residual block: 1x1 reduce -> 3x3 -> 1x1 restore.

    Each convolution is followed by batch normalization. ReLU is applied
    after the first two and again after the residual addition.

    Args:
        in_channels: Channel count of the incoming feature map.
        bottleneck_channels: Reduced channel count used by the inner
            1x1 and 3x3 convolutions.
        out_channels: Channel count produced by the block.
        stride: Stride of the 3x3 convolution and, when a projection is
            needed, of the 1x1 shortcut convolution. Defaults to 1.
    """

    def __init__(self, in_channels, bottleneck_channels, out_channels, stride=1):
        super().__init__()
        width = bottleneck_channels

        # 1x1 convolution to reduce channels
        self.conv1 = nn.Conv2d(in_channels, width, kernel_size=1, stride=1)
        self.bn1 = nn.BatchNorm2d(width)
        # One in-place ReLU module is reused for all three activations.
        self.relu = nn.ReLU(inplace=True)

        # 3x3 convolution; this is the layer that carries the stride
        self.conv2 = nn.Conv2d(width, width, kernel_size=3, stride=stride, padding=1)
        self.bn2 = nn.BatchNorm2d(width)

        # 1x1 convolution to restore channels
        self.conv3 = nn.Conv2d(width, out_channels, kernel_size=1, stride=1)
        self.bn3 = nn.BatchNorm2d(out_channels)

        # Shortcut connection: identity when the block preserves the input
        # shape, otherwise a 1x1 convolution + BN projection.
        if stride == 1 and in_channels == out_channels:
            self.shortcut = nn.Identity()
        else:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride),
                nn.BatchNorm2d(out_channels),
            )

    def forward(self, x):
        """Apply the block to ``x`` and return the activated sum."""
        # Main path
        reduced = self.relu(self.bn1(self.conv1(x)))
        mixed = self.relu(self.bn2(self.conv2(reduced)))
        restored = self.bn3(self.conv3(mixed))

        # Residual addition followed by the final activation
        restored += self.shortcut(x)
        return self.relu(restored)

In [None]:
class ResNet(nn.Module):
    """ResNet classifier: stem, four residual stages, pooling, linear head.

    The stem (7x7 stride-2 convolution + stride-2 max pooling) and the
    stride-2 first block of stages 2-4 each halve the spatial resolution.
    Global average pooling then collapses the feature map to 1x1 before the
    fully connected layer.

    Args:
        block: Residual block class. Must be ``BasicBlock``,
            ``BottleneckBlock``, or a subclass of one of them that keeps the
            parent's constructor signature.
        layers: Sequence of four integers giving the number of blocks in
            each stage.
        num_classes: Size of the output layer. Defaults to 1000.

    Raises:
        ValueError: If ``block`` is not a supported block class.
    """

    def __init__(self, block, layers, num_classes=1000):
        super().__init__()

        # Set channel configurations based on block type. Subclasses of the
        # two supported blocks are accepted, not only the exact classes.
        if self._is_block_type(block, BasicBlock):
            self.channels = [64, 128, 256, 512]
        elif self._is_block_type(block, BottleneckBlock):
            self.channels = [256, 512, 1024, 2048]
        else:
            raise ValueError("Invalid block type")

        # Stem
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Four stages; only stages 2-4 downsample (stride 2 in the first block).
        self.layer1 = self.make_stage(block, 64, self.channels[0], layers[0], stride=1)
        self.layer2 = self.make_stage(block, self.channels[0], self.channels[1], layers[1], stride=2)
        self.layer3 = self.make_stage(block, self.channels[1], self.channels[2], layers[2], stride=2)
        self.layer4 = self.make_stage(block, self.channels[2], self.channels[3], layers[3], stride=2)

        # Global average pooling and fully connected layer
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(self.channels[3], num_classes)

    @staticmethod
    def _is_block_type(block, base):
        """Return True if ``block`` is the class ``base`` or a subclass of it."""
        # The isinstance guard keeps issubclass from raising TypeError when a
        # non-class object is passed, so callers still get a ValueError.
        return isinstance(block, type) and issubclass(block, base)

    def make_stage(self, block, in_channels, out_channels, num_blocks, stride):
        """Helper function to create a stage of residual blocks.

        The first block maps ``in_channels`` to ``out_channels`` with the
        given ``stride``. The remaining ``num_blocks - 1`` blocks keep
        ``out_channels`` and use stride 1. Bottleneck blocks use an inner
        width of ``out_channels // 4``.

        Args:
            block: Residual block class (see the class docstring).
            in_channels: Channel count entering the stage.
            out_channels: Channel count leaving every block of the stage.
            num_blocks: Total number of blocks in the stage.
            stride: Stride applied by the first block only.

        Returns:
            An ``nn.Sequential`` holding the blocks in order.

        Raises:
            ValueError: If ``block`` is not a supported block class. The
                previous behaviour was to return an empty stage silently.
        """
        if self._is_block_type(block, BasicBlock):
            # First block with specified stride
            first = block(in_channels, out_channels, stride)
            # Subsequent blocks with stride 1
            rest = [block(out_channels, out_channels, 1) for _ in range(1, num_blocks)]
        elif self._is_block_type(block, BottleneckBlock):
            bottleneck_channels = out_channels // 4
            # First block with specified stride
            first = block(in_channels, bottleneck_channels, out_channels, stride)
            # Subsequent blocks with stride 1
            rest = [
                block(out_channels, bottleneck_channels, out_channels, 1)
                for _ in range(1, num_blocks)
            ]
        else:
            raise ValueError("Invalid block type")
        return nn.Sequential(first, *rest)

    def forward(self, x):
        """Compute class scores for a batch of 3-channel images ``x``."""
        # Stem
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.maxpool(x)

        # Stages
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        # Pooling and classification
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)

        return x

In [12]:
def ResNet18(num_classes=10):
    """Build a ResNet-18: BasicBlock stages of depth 2, 2, 2, 2."""
    stage_depths = [2, 2, 2, 2]
    return ResNet(block=BasicBlock, layers=stage_depths, num_classes=num_classes)

def ResNet34(num_classes=10):
    """Build a ResNet-34: BasicBlock stages of depth 3, 4, 6, 3."""
    stage_depths = [3, 4, 6, 3]
    return ResNet(block=BasicBlock, layers=stage_depths, num_classes=num_classes)

def ResNet50(num_classes=10):
    """Build a ResNet-50: BottleneckBlock stages of depth 3, 4, 6, 3."""
    stage_depths = [3, 4, 6, 3]
    return ResNet(block=BottleneckBlock, layers=stage_depths, num_classes=num_classes)

def ResNet101(num_classes=10):
    """Build a ResNet-101: BottleneckBlock stages of depth 3, 4, 23, 3."""
    stage_depths = [3, 4, 23, 3]
    return ResNet(block=BottleneckBlock, layers=stage_depths, num_classes=num_classes)

In [13]:
# Print a per-layer table (name, type, parameter count, output shape) for a
# 10-class ResNet-18. `utils` is project-local; the tuple is presumably the
# input shape (batch, channels, height, width) -- confirm against utils.
utils.layer_summary(ResNet18(num_classes=10), (1, 3, 224, 224))

Layer Name                     Layer Type              Param #         Output Shape
conv1                          Conv2d                     9408    (1, 64, 112, 112)
bn1                            BatchNorm2d                 128    (1, 64, 112, 112)
relu                           ReLU                          0    (1, 64, 112, 112)
maxpool                        MaxPool2d                     0      (1, 64, 56, 56)
layer1.0.conv1                 Conv2d                    36928      (1, 64, 56, 56)
layer1.0.bn1                   BatchNorm2d                 128      (1, 64, 56, 56)
layer1.0.relu                  ReLU                          0      (1, 64, 56, 56)
layer1.0.conv2                 Conv2d                    36928      (1, 64, 56, 56)
layer1.0.bn2                   BatchNorm2d                 128      (1, 64, 56, 56)
layer1.0.shortcut              Identity                      0      (1, 64, 56, 56)
layer1.1.conv1                 Conv2d                    36928      (1, 64, 

In [14]:
# Same per-layer table for a 10-class ResNet-101 (BottleneckBlock variant).
# The tuple is presumably the input shape (batch, channels, height, width)
# -- confirm against the project-local `utils` module.
utils.layer_summary(ResNet101(num_classes=10), (1, 3, 224, 224))

Layer Name                     Layer Type              Param #         Output Shape
conv1                          Conv2d                     9408    (1, 64, 112, 112)
bn1                            BatchNorm2d                 128    (1, 64, 112, 112)
relu                           ReLU                          0    (1, 64, 112, 112)
maxpool                        MaxPool2d                     0      (1, 64, 56, 56)
layer1.0.conv1                 Conv2d                     4160      (1, 64, 56, 56)
layer1.0.bn1                   BatchNorm2d                 128      (1, 64, 56, 56)
layer1.0.relu                  ReLU                          0      (1, 64, 56, 56)
layer1.0.conv2                 Conv2d                    36928      (1, 64, 56, 56)
layer1.0.bn2                   BatchNorm2d                 128      (1, 64, 56, 56)
layer1.0.conv3                 Conv2d                    16640     (1, 256, 56, 56)
layer1.0.bn3                   BatchNorm2d                 512     (1, 256, 

In [8]:
# Build the train/test loaders through the project-local CIFAR10DataLoader
# helper, requesting batches of 64.
# NOTE(review): resize=(224, 224) presumably rescales each image to the
# 224x224 input size used in the layer summaries above -- confirm in utils.
data = utils.CIFAR10DataLoader(batch_size=64, resize=(224, 224))
train_loader = data.get_train_loader()
test_loader = data.get_test_loader()

In [15]:
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ResNet18(num_classes=10).to(device)
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters with a fixed learning rate of 0.001.
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 10
for epoch in range(epochs):
    # `utils.train_step` / `utils.eval_step` are project-local helpers;
    # presumably each runs one full pass over its loader -- confirm in utils.
    train_loss, train_acc = utils.train_step(train_loader, model, criterion, optimizer, device)
    test_loss, test_acc = utils.eval_step(test_loader, model, criterion, device)
    # train_acc is returned by train_step but is not part of the printed line.
    print(f"Epoch {epoch + 1}/{epochs}: Train Loss={train_loss}, Test Loss={test_loss}, Test Accuracy={test_acc}")

Epoch 1/10: Train Loss=1.471568859827793, Test Loss=1.5601825858377347, Test Accuracy=0.4766
Epoch 2/10: Train Loss=0.9035725659497863, Test Loss=0.8243136933654737, Test Accuracy=0.713
Epoch 3/10: Train Loss=0.6738652632288311, Test Loss=1.0093901016909606, Test Accuracy=0.6734
Epoch 4/10: Train Loss=0.537425598582191, Test Loss=0.6318783232360888, Test Accuracy=0.7801
Epoch 5/10: Train Loss=0.43620732106516125, Test Loss=0.5323172899757981, Test Accuracy=0.8213
Epoch 6/10: Train Loss=0.3490293579047446, Test Loss=0.5121434251214289, Test Accuracy=0.8281
Epoch 7/10: Train Loss=0.2705655218866628, Test Loss=0.6442782903552815, Test Accuracy=0.7998
Epoch 8/10: Train Loss=0.19743331465060296, Test Loss=0.6032314670693343, Test Accuracy=0.8273
Epoch 9/10: Train Loss=0.14161263011834202, Test Loss=0.7539974400761781, Test Accuracy=0.8063
Epoch 10/10: Train Loss=0.10415992798055033, Test Loss=0.6644984247388354, Test Accuracy=0.8266
