In [None]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

In [None]:
# Device configuration: use the GPU when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters
num_epochs = 25
batch_size = 100
learning_rate = 0.001

# Directory where CIFAR-10 is stored / downloaded. Kept relative to the
# notebook so it works on any machine; the previous value was a malformed
# literal ('root="C:\\...') that embedded a stray `root="` prefix in the path.
data_root = './data'

# Training-time preprocessing: pad by 4 pixels, random horizontal flip and a
# random 32x32 crop for augmentation, then conversion to a tensor.
transform = transforms.Compose([
    transforms.Pad(4),
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32),
    transforms.ToTensor()])

# CIFAR-10 dataset. The training split triggers the download if needed; the
# test split reads the same directory and is not augmented.
train_dataset = torchvision.datasets.CIFAR10(root=data_root,
                                             train=True,
                                             transform=transform,
                                             download=True)

test_dataset = torchvision.datasets.CIFAR10(root=data_root,
                                            train=False,
                                            transform=transforms.ToTensor())

# Data loaders: shuffle only the training data.
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)

Files already downloaded and verified


In [None]:
# Sanity check that the training DataLoader exists; this prints only the
# object's default repr, not any of the batches it yields.
print(train_loader)

<torch.utils.data.dataloader.DataLoader object at 0x79d303c69bd0>


In [None]:
def conv3x3(in_channles , out_channles , stride = 1):
  """Build a 3x3 ``nn.Conv2d`` with padding 1 and the requested stride.

  Args:
    in_channles: number of input channels.
    out_channles: number of output channels.
    stride: convolution stride (default 1).

  Returns:
    The newly constructed ``nn.Conv2d`` layer.
  """
  conv_options = dict(kernel_size=3, stride=stride, padding=1)
  return nn.Conv2d(in_channles, out_channles, **conv_options)


class ResidualBlock(nn.Module):
  """Residual block: two 3x3 conv + batch-norm stages plus a skip connection.

  Args:
    in_channles: number of channels of the input feature map.
    out_channles: number of channels produced by both convolutions.
    stride: stride of the first convolution; the second always uses stride 1.
    downsampling: optional module applied to the block input to build the
      shortcut. Required whenever the main branch changes the channel count
      or spatial size, so that the two tensors can be added.
  """
  def __init__(self, in_channles , out_channles , stride = 1 , downsampling = None):
    super(ResidualBlock , self).__init__()
    self.conv1 = conv3x3(in_channles , out_channles , stride)
    self.bn1 = nn.BatchNorm2d(out_channles)
    self.relu = nn.ReLU(inplace = True)
    self.conv2 = conv3x3(out_channles , out_channles)
    self.bn2 = nn.BatchNorm2d(out_channles)
    self.downsampling = downsampling

  def forward(self , x):
    """Return ``relu(bn2(conv2(relu(bn1(conv1(x))))) + shortcut(x))``."""
    # No copy is needed for the shortcut: the convolution below returns a new
    # tensor, so the in-place ReLU and `+=` never write into x.
    residual = x
    out = self.conv1(x)
    out = self.bn1(out)
    out = self.relu(out)
    out = self.conv2(out)
    # Second batch norm was created in __init__ but previously never applied.
    out = self.bn2(out)
    # Compare against None explicitly; truth-testing a module container would
    # go through its __len__ instead of checking whether a shortcut was given.
    if self.downsampling is not None:
      residual = self.downsampling(x)
    out += residual
    out = self.relu(out)
    return out

In [None]:
class ResNet(nn.Module):
  """Small three-stage ResNet (16, 32 and 64 channels) for 3-channel images.

  Args:
    block: residual block class, called as
      ``block(in_channels, out_channels, stride=..., downsampling=...)``.
    layers: sequence of three ints giving the number of residual blocks in
      each stage, e.g. ``[2, 2, 2]``.
    num_classes: number of output logits.
  """
  def __init__(self,block , layers , num_classes = 10):
    super(ResNet , self).__init__()
    # Channel count of the tensor entering the next stage; make_layer updates it.
    self.in_channles = 16
    self.conv = conv3x3(3 , 16)
    self.bn = nn.BatchNorm2d(16)
    self.relu = nn.ReLU(inplace = True)
    self.layer1 = self.make_layer(block , 16 , layers[0] , stride = 1)
    self.layer2 = self.make_layer(block , 32 , layers[1] , stride = 2)
    self.layer3 = self.make_layer(block , 64 , layers[2] , stride = 2)
    # Global average pooling reduces the last stage to (batch, 64, 1, 1), so
    # the classifier needs 64 inputs regardless of the input resolution.
    self.avg_pool = nn.AdaptiveAvgPool2d(1)
    self.fc = nn.Linear(64 , num_classes)

  def make_layer(self , block , out_channles ,blocks , stride = 1):
    """Build one stage made of ``blocks`` residual blocks.

    Only the first block receives ``stride`` and the shortcut module; the
    remaining blocks keep the shape unchanged. At least one block is always
    created, even if ``blocks`` is smaller than 1.

    Args:
      block: residual block class (see the class docstring).
      out_channles: number of channels produced by the stage.
      blocks: number of residual blocks in the stage.
      stride: stride of the first block.

    Returns:
      An ``nn.Sequential`` holding the stage's blocks.
    """
    downsampling = None
    # The shortcut must be reshaped whenever the main branch changes the
    # channel count or the spatial size.
    if (self.in_channles != out_channles) or (stride != 1):
      downsampling = nn.Sequential(conv3x3(self.in_channles , out_channles , stride = stride) , nn.BatchNorm2d(out_channles))

    residual_blocks = [block(self.in_channles , out_channles , stride = stride , downsampling = downsampling)]
    self.in_channles = out_channles
    # Honour the requested depth instead of always appending exactly one block.
    for _ in range(1 , blocks):
      residual_blocks.append(block(out_channles , out_channles))
    return nn.Sequential(*residual_blocks)

  def forward(self , x):
    """Return logits of shape (batch, num_classes) for a batch of images."""
    out = self.conv(x)
    out = self.bn(out)
    out = self.relu(out)
    out = self.layer1(out)
    out = self.layer2(out)
    out = self.layer3(out)
    out = self.avg_pool(out)
    out = torch.flatten(out, 1)
    out = self.fc(out)
    return out






In [None]:
# Network with two residual blocks in each of its three stages.
model = ResNet(block=ResidualBlock, layers=[2, 2, 2])
# Cross-entropy loss computed on the model outputs.
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters at the configured learning rate.
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)


In [None]:
# Number of learning-rate halvings applied so far.
decay = 0
model.train()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
  for i , (images , labels) in enumerate(train_loader):
    images = images.to(device)
    labels = labels.to(device)

    # Forward pass
    outputs = model(images)
    loss = criterion(outputs, labels)

    # Backward pass and parameter update
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (i+1) % 100 == 0:
      print ("Epoch [{}/{}], Step [{}/{}] Loss: {:.4f}"
             .format(epoch+1, num_epochs, i+1, len(train_loader), loss.item()))

  # Decay the learning rate by a factor of 0.5 every 20 epochs. This runs at
  # the END of the epoch so that a full 20 epochs train at each rate; doing it
  # before the inner loop would cut the first phase to 19 epochs.
  if (epoch + 1) % 20 == 0:
    decay += 1
    new_lr = learning_rate * 0.5**decay
    # Update every parameter group, not only the first one.
    for param_group in optimizer.param_groups:
      param_group['lr'] = new_lr
    print("The new learning rate is {}".format(new_lr))





Epoch [1/25], Step [100/500] Loss: 1.5393
Epoch [1/25], Step [200/500] Loss: 1.4437
Epoch [1/25], Step [300/500] Loss: 1.3520
Epoch [1/25], Step [400/500] Loss: 1.4245
Epoch [1/25], Step [500/500] Loss: 1.2883
Epoch [2/25], Step [100/500] Loss: 1.0088
Epoch [2/25], Step [200/500] Loss: 1.0584
Epoch [2/25], Step [300/500] Loss: 0.7238
Epoch [2/25], Step [400/500] Loss: 0.9748
Epoch [2/25], Step [500/500] Loss: 0.8777
Epoch [3/25], Step [100/500] Loss: 1.1098
Epoch [3/25], Step [200/500] Loss: 1.0552
Epoch [3/25], Step [300/500] Loss: 0.7856
Epoch [3/25], Step [400/500] Loss: 0.7611
Epoch [3/25], Step [500/500] Loss: 0.8873
Epoch [4/25], Step [100/500] Loss: 0.7621
Epoch [4/25], Step [200/500] Loss: 0.9317
Epoch [4/25], Step [300/500] Loss: 0.7054
Epoch [4/25], Step [400/500] Loss: 0.8434
Epoch [4/25], Step [500/500] Loss: 0.5813
Epoch [5/25], Step [100/500] Loss: 0.6892
Epoch [5/25], Step [200/500] Loss: 0.7049
Epoch [5/25], Step [300/500] Loss: 0.7672
Epoch [5/25], Step [400/500] Loss:

In [None]:
# Evaluate the trained model on the test set.
model.eval()
correct = 0
total = 0
# Gradients are not needed for evaluation.
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        # Predicted class = index of the largest output along dimension 1.
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print('Accuracy of the model on the test images: {} %'.format(accuracy))

Accuracy of the model on the test images: 86.7 %


Summary of the Discussion on Residual Networks (ResNets)
Skip Connections in ResNets
The final output of each residual block is:
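$H(x) = F(x) + x$, where $F(x)$ is the output of the block's convolutional layers and $x$ is the block's input.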
This helps in avoiding vanishing gradients and makes training deep networks easier.
Why Do We Add $x$ (Skip Connection)?

It ensures better gradient flow: the identity path lets gradients reach earlier layers without shrinking toward zero.
It helps retain low-level features while still learning new ones.
It allows the network to easily skip layers if they are not needed, making optimization easier.
Why Do We Add $x$ Multiple Times?

Since ResNet is made of multiple residual blocks, each block has its own skip connection.
Each residual block builds on the previous one by adding its own transformation to the accumulated result.
This ensures better gradient flow, easier optimization, and improved feature preservation at every stage of the network.
Downsampling in Residual Blocks

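If the shape of the main branch's output differs from the input (e.g., due to a stride > 1 or a change in the number of channels), downsampling is applied to the input so that the dimensions match before the residual connection is added.
This is typically done using a 1x1 convolution; the implementation in this notebook uses a strided 3x3 convolution followed by batch normalization instead.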
Key Takeaways
✅ ResNets prevent vanishing gradients and improve deep network performance.
✅ Skip connections help with optimization and feature retention.
✅ Adding $x$ multiple times allows gradients to flow through deep networks effectively.

Would you like a code improvement suggestion for your implementation? 🚀