This is the ResNet model that we are going to build.

![resnet](mini-resnet-image.png)

In [None]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

In [None]:
# ---- Runtime device --------------------------------------------------------
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# ---- Hyper-parameters ------------------------------------------------------
num_epochs = 10        # passes over the training set; lower it for a faster run
batch_size = 100       # images per mini-batch (used by both loaders)
learning_rate = 0.001  # initial learning rate handed to the optimizer

# ---- Training-time preprocessing -------------------------------------------
# Pad by 4 pixels, flip horizontally at random, crop a random 32x32 window and
# finally convert the image to a tensor.
transform = transforms.Compose([
    transforms.Pad(4),
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32),
    transforms.ToTensor(),
])

# ---- CIFAR-10 datasets -----------------------------------------------------
# The training split is downloaded into './' when missing and is augmented with
# `transform`; the test split is only converted to tensors.
train_dataset = torchvision.datasets.CIFAR10(
    root='./', train=True, transform=transform, download=True)
test_dataset = torchvision.datasets.CIFAR10(
    root='./', train=False, transform=transforms.ToTensor())

# ---- Data loaders ----------------------------------------------------------
# Only the training data is shuffled; the test data keeps its stored order.
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True)

test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False)

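**transforms.Pad(4):** This transformation pads the image with a border of 4 pixels on all sides, so a 32x32 CIFAR-10 image becomes 40x40. Padding is often used to ensure that the spatial dimensions of the input images are consistent, which can be important for certain types of neural network architectures. Here it also works together with the `RandomCrop(32)` described below: cropping a random 32x32 window out of the padded image shifts the content by up to 4 pixels in any direction, which acts as a translation augmentation.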

**transforms.RandomHorizontalFlip():** This transformation randomly flips the image horizontally with a probability of 0.5. Horizontal flipping is a common data augmentation technique used to increase the variability of the training data and improve the robustness of the model to variations in the orientation of objects within the images.

**transforms.RandomCrop(32):** This transformation randomly crops the image to a size of 32x32 pixels. Random cropping is another data augmentation technique that helps the model learn to focus on different parts of the image and improves its ability to generalize to unseen data.

**transforms.ToTensor():** This transformation converts the image from a PIL Image object (or numpy array) to a PyTorch tensor. PyTorch tensors are the primary data structure used for representing images and other types of data in PyTorch, and they are compatible with the operations and functions provided by PyTorch's tensor library.

**download=True:** This parameter indicates whether to download the dataset if it's not already present in the specified root directory. If set to True, it will download the CIFAR-10 dataset from the internet and save it in the specified root directory.

In [None]:
# So, the first thing you might notice is that throughout the whole network we're using 3*3 convolutional layers.
# So, let's go ahead and define a function to perform 3*3 convolution and then we can use this function all over again.

def conv3x3(in_channels, out_channels, stride=1):
    """Build the 3x3 convolution layer that is reused all over the network.

    Args:
        in_channels: number of channels of the incoming feature map.
        out_channels: number of feature maps the layer produces.
        stride: step of the kernel; defaults to 1 when not given.

    Returns:
        An ``nn.Conv2d`` with a 3x3 kernel, a padding of 1 and no bias term.
    """
    # With a 3x3 kernel, padding=1 preserves the spatial size when stride is 1.
    layer_options = dict(kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Conv2d(in_channels, out_channels, **layer_options)
   

# let's define our residual block. This consists of two convolutional layers,
# and each of these is followed by a batch normalization and a ReLU activation function 
class ResidualBlock(nn.Module):
    """Residual block made of two 3x3 conv + batch-norm stages and a skip connection.

    The block computes ``relu(bn2(conv2(relu(bn1(conv1(x))))) + shortcut(x))``,
    where ``shortcut`` is ``downsampling`` when one is given and the identity
    otherwise.

    Args:
        in_channels: number of channels of the input feature map.
        out_channels: number of channels produced by both convolutions.
        stride: stride of the first convolution; the second one always uses 1.
        downsampling: optional module applied to the input so that it can be
            added to the output of the main branch. ``None`` (the default)
            means the input is added unchanged.
    """

    def __init__(self, in_channels, out_channels, stride=1, downsampling=None):
        super().__init__()

        # First stage: the only place where the stride (and channel change) is applied.
        self.conv1 = conv3x3(in_channels, out_channels, stride)
        self.bn1 = nn.BatchNorm2d(out_channels)  # one set of statistics per feature map
        self.relu = nn.ReLU(inplace=True)

        # Second stage: conv3x3 defaults to stride 1, so the size is kept.
        self.conv2 = conv3x3(out_channels, out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Optional projection of the skip connection.
        self.downsampling = downsampling

    def forward(self, x):
        """Run the block on ``x`` (the image-derived or previous feature maps).

        Returns:
            The activated sum of the main branch and the skip connection.
        """
        # Main branch.
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        # Skip connection. `x` is only read above (the in-place ReLU acts on the
        # batch-norm output, not on `x`), so it can be reused without a copy.
        # Compare against None explicitly: container modules such as
        # nn.Sequential define __len__, so a truthiness test could skip them.
        if self.downsampling is not None:
            residual = self.downsampling(x)
        else:
            residual = x

        # Add first, then apply the final ReLU.
        out = out + residual
        out = self.relu(out)

        return out

In the previous cell, we coded a residual block, which consists of two convolutional layers, and each of them has batch normalization and ReLU. Now, we're going to take this residual block and replicate it again to construct our full residual network. Now here, remember, besides the 6 residual blocks, we have the very first convolutional layer, which processes the image. And at the end, we have average pooling rather than max pooling. So the last layer consists of average pooling. And then, we take this result of average pooling
and we run it through a fully connected layer, which is the classification layer to classify 10 classes. Here we're using CIFAR-10, so we have 10 classes.

Recap about average pooling:
![avgpool](avg-pooling.png)

## In ResNet, the average-pooling kernel size = the size of the feature map
In our case, the size of the feature map after the 3rd layer is 8x8. Why? The size of the input image is 32x32. At the beginning of the 2nd layer we downsample it by 2, so the size of the feature map becomes 16x16. At the beginning of the 3rd layer we downsample it again by 2, so the size of the feature map becomes 8x8. So during average pooling we have feature maps of size 8x8, and a pooling kernel of size 8 reduces each of them to a single value.

In [None]:
# let's define the class of this residual network
class ResNet(nn.Module):
    """Small residual network for 32x32 RGB images such as CIFAR-10.

    Structure: a 3x3 convolution (3 -> 16 channels) with batch norm and ReLU,
    three layers of residual blocks with 16, 32 and 64 channels (the 2nd and
    3rd layer start with a stride of 2), an 8x8 average pooling and a fully
    connected classification layer.

    Args:
        block: residual block class. It is called as
            ``block(in_channels, out_channels, stride=..., downsampling=...)``
            for the first block of a layer and as
            ``block(in_channels, out_channels)`` for the following ones.
        layers: list with one entry per layer giving the number of residual
            blocks in that layer, e.g. ``[2, 2, 2]``. Three entries are read.
        num_classes: number of output classes; defaults to 10 (CIFAR-10).
    """

    def __init__(self, block, layers, num_classes=10):
        super().__init__()

        # Number of channels entering the next residual block; make_layer
        # updates it as the layers are built.
        self.in_channels = 16

        # Stem: RGB image (3 channels) -> 16 feature maps, then BN and ReLU.
        self.conv = conv3x3(3, 16)
        self.bn = nn.BatchNorm2d(16)
        self.relu = nn.ReLU(inplace=True)

        # Three layers with a growing number of channels. A stride of 2 in
        # the 2nd and 3rd layer halves the spatial size each time.
        self.layer1 = self.make_layer(block, 16, layers[0], stride=1)
        self.layer2 = self.make_layer(block, 32, layers[1], stride=2)
        self.layer3 = self.make_layer(block, 64, layers[2], stride=2)

        # For a 32x32 input the feature map is 8x8 at this point, so a kernel
        # of 8 reduces every feature map to a single value.
        self.avg_pool = nn.AvgPool2d(8)

        # 64 pooled values (one per channel) -> class scores.
        self.fc = nn.Linear(64, num_classes)

    def make_layer(self, block, out_channels, num_blocks, stride=1):
        """Build one layer, i.e. a sequence of ``num_blocks`` residual blocks.

        Only the first block receives ``stride`` and, when needed, a
        downsampling module for its skip connection; all following blocks keep
        the shape (stride 1, ``out_channels`` in and out).

        Args:
            block: residual block class (see the class docstring).
            out_channels: number of channels produced by the layer.
            num_blocks: number of residual blocks in the layer; must be >= 1.
            stride: stride of the first block.

        Returns:
            An ``nn.Sequential`` holding the blocks in order.

        Raises:
            ValueError: if ``num_blocks`` is smaller than 1.
        """
        if num_blocks < 1:
            raise ValueError(
                "num_blocks must be at least 1, got {}".format(num_blocks))

        # The skip connection needs a projection whenever the first block
        # changes the number of channels or the spatial size (stride != 1).
        downsampling = None
        if (self.in_channels != out_channels) or (stride != 1):
            downsampling = nn.Sequential(
                conv3x3(self.in_channels, out_channels, stride=stride),
                nn.BatchNorm2d(out_channels),
            )

        # First block: applies the stride / channel change.
        residual_blocks = [
            block(self.in_channels, out_channels,
                  stride=stride, downsampling=downsampling)
        ]

        # Everything after the first block sees `out_channels` channels.
        self.in_channels = out_channels

        # Remaining blocks: exactly num_blocks - 1 of them, so that
        # num_blocks == 1 really yields a single block.
        for _ in range(1, num_blocks):
            residual_blocks.append(block(out_channels, out_channels))

        # The star unpacks the list into separate arguments.
        return nn.Sequential(*residual_blocks)

    def forward(self, x):
        """Compute the class scores for a batch of images ``x``."""
        # Stem.
        out = self.conv(x)
        out = self.bn(out)
        out = self.relu(out)

        # Residual layers.
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)

        # Average pooling, then flatten to (batch size, features).
        out = self.avg_pool(out)
        out = out.view(out.size(0), -1)

        # Classification layer.
        out = self.fc(out)

        return out
            

Initially, we set `downsampling` to `None` as the default. Then we determine whether downsampling is necessary based on two conditions (check the theoretical lectures to understand when downsampling is needed).

**1. Mismatched Input and Output Channels:**
    If the number of input channels doesn't match the number of output channels, then downsampling is needed.

**2. Mismatched in Input and Output Feature Map Sizes:**
    When the size of the input feature map differs from the size of the output feature map, downsampling is needed as well. We detect this by examining the stride parameter: a stride of one indicates that no downsampling has occurred, while a stride of two implies that downsampling has occurred.


If either of these conditions occurs, we need to downsample. To combine the two conditions, we use a logical "or" operation.

The downsampling layer is just a convolutional layer whose stride equals the stride of the first block of the layer. So if that block uses a stride of two, we also use a stride of two for the downsampling. Its number of output channels equals the number of output channels of the layer.

We then follow this with a batch normalization layer. To run these two operations one after another, we put them together in nn.Sequential().

In Python, when you use a star (*) before an iterable (like a list), it's called unpacking. It essentially expands the iterable into individual elements. This is particularly useful when you want to pass multiple arguments to a function or, in this case, multiple layers to the nn.Sequential constructor.

Here's an example to illustrate:
![starr](star.png)

In this example, nn.Sequential(\*residual_blocks) is equivalent to nn.Sequential(residual_block1, residual_block2, residual_block3). The star (*) operator allows us to avoid explicitly listing each layer separately and makes the code more concise.

In [None]:
# Instantiate the network: ResidualBlock is the building block and [2, 2, 2]
# asks for two blocks in each of the three layers. The number of classes is
# left at its default. The model is then moved to the selected device.
model = ResNet(block=ResidualBlock, layers=[2, 2, 2])
model = model.to(device)

# Cross-entropy loss, minimised with the Adam optimizer.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)


In [None]:
# Train the network.
decay = 0      # how many times the learning rate has been halved so far
model.train()  # batch normalization behaves differently in train and eval mode
for epoch in range(num_epochs):

    # Halve the learning rate after every 20 completed epochs. `epoch` equals
    # the number of epochs already finished, so the first decay takes effect
    # for the 21st epoch (testing `epoch + 1` here would decay one epoch early).
    if epoch > 0 and epoch % 20 == 0:
        decay += 1
        new_lr = learning_rate * (0.5 ** decay)
        # Apply the new rate to every parameter group, not only the first one.
        for group in optimizer.param_groups:
            group['lr'] = new_lr
        print("The new learning rate is {}".format(new_lr))

    for step, (images, labels) in enumerate(train_loader, start=1):
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass.
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update; gradients are cleared first
        # because PyTorch accumulates them across backward() calls.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report the loss every 100 mini-batches.
        if step % 100 == 0:
            print("Epoch [{}/{}], step [{}/{}] Loss: {:.4f}"
                  .format(epoch + 1, num_epochs, step, len(train_loader), loss.item()))
        
        

param_groups: a List containing all parameter groups where each parameter group is a Dict. Each parameter group contains metadata specific to the optimizer, such as learning rate and weight decay, as well as a List of parameter IDs of the parameters in the group.

*type(optimizer.param_groups)* ==> list

*type(optimizer.param_groups[0])* ==> dict

In [None]:
# Evaluate the trained model on the test set.
model.eval()  # switch batch normalization to evaluation mode
correct = 0   # number of correctly classified images
total = 0     # number of images seen
with torch.no_grad():  # no gradients are needed, which makes this faster
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        # The predicted class is the index of the largest score in each row.
        predicted = torch.max(outputs, dim=1)[1]
        total += labels.size(0)  # labels.size(0) is the batch size
        correct += (predicted == labels).sum().item()

print('Accuracy of the model on the test images: {} %'.format(100 * correct / total))