Name: Maurya Vijayaramachandran 

Project: An attempt at implementing the YOLOv3 architecture

email: maurya.mvr@gmail.com

Import Libraries. 

In [None]:
import torch 
import torch.nn as nn
import numpy as np
import os
import pandas as pd
from PIL import Image, ImageFile
from torch.utils.data import Dataset, DataLoader 

Define the config list, derived from the YOLOv3 architecture, for reference.



[Link for the config file of yolo v3](https://github.com/pjreddie/darknet/blob/master/cfg/yolov3.cfg)



In [None]:
# Architecture description consumed by the model builder, one entry per stage:
#   * tuple  -> a convolutional block given as (out_channels, kernel_size, stride)
#   * list   -> ["Residual_Block", number_of_repeats]
#   * string -> either "Scale_Prediction_Layer" or "Upsampling_layer"
# Padding is not stored here; the builder derives it from the kernel size.
config = [
    (32, 3, 1),
    (64, 3, 2),
    ["Residual_Block", 1],
    (128, 3, 2),
    ["Residual_Block", 2],
    (256, 3, 2),
    ["Residual_Block", 8],
    (512, 3, 2),
    ["Residual_Block", 8],
    (1024, 3, 2),
    ["Residual_Block", 4],
    (512, 1, 1),
    (1024, 3, 1),
    "Scale_Prediction_Layer",
    (256, 1, 1),
    "Upsampling_layer",
    (256, 1, 1),
    (512, 3, 1),
    "Scale_Prediction_Layer",
    (128, 1, 1),
    "Upsampling_layer",
    (128, 1, 1),
    (256, 3, 1),
    "Scale_Prediction_Layer",
]
print("Length of config list is ", len(config))

Length of config list is  24


This block is a reusable building unit for the network: a convolutional layer, optionally followed by batch normalization and a leaky ReLU activation. Setting `bn_act=True` (the default) applies batch normalization and the activation after the convolution; setting `bn_act=False` returns the raw convolution output. Blocks like this extract the image features that the detector uses to make its predictions.





In [None]:
class ConvolutionalNeuralNetworkBlock(nn.Module):
    """Conv2d building block with optional BatchNorm2d + LeakyReLU(0.1).

    Args:
        in_channels (int): number of input channels.
        out_channels (int): number of output channels.
        bn_act (bool): when True, the convolution output is passed through
            batch normalization and the leaky ReLU; when False, the raw
            convolution output is returned.
        **kwargs: forwarded unchanged to ``nn.Conv2d`` (for example
            ``kernel_size``, ``stride``, ``padding``).

    Returns:
        The transformed tensor; it has ``out_channels`` channels.
    """

    def __init__(self, in_channels, out_channels, bn_act=True, **kwargs):
        super().__init__()
        self.use_bn_act = bn_act
        # The convolution carries its own bias only when batch norm is skipped.
        needs_bias = not bn_act
        self.conv = nn.Conv2d(in_channels, out_channels, bias=needs_bias, **kwargs)
        # NOTE: these two submodules are created even when bn_act is False;
        # they are simply not used by forward() in that case.
        self.bn = nn.BatchNorm2d(out_channels)
        self.leaky = nn.LeakyReLU(0.1)

    def forward(self, x):
        features = self.conv(x)
        if not self.use_bn_act:
            return features
        return self.leaky(self.bn(features))

Create a skeletal structure for the residual block

In [None]:
class ResidualBlock(nn.Module):
    """Stack of repeated (1x1 conv -> 3x3 conv) stages with optional skip connections.

    Each stage halves the channel count with a 1x1 convolution and restores
    it with a 3x3 convolution (padding 1), so the channel count going in and
    coming out of every stage is ``channels``.

    Args:
        channels (int): number of input and output channels.
        use_residual (bool): when True, each stage's output is added to its
            input; when False, the stage output replaces the input.
        num_repeats (int): number of stages to stack.

    Returns:
        The tensor produced after running every stage in order.
    """

    def __init__(self, channels, use_residual=True, num_repeats=1):
        super().__init__()
        bottleneck = channels // 2
        self.layers = nn.ModuleList(
            [
                nn.Sequential(
                    ConvolutionalNeuralNetworkBlock(channels, bottleneck, kernel_size=1),
                    ConvolutionalNeuralNetworkBlock(bottleneck, channels, kernel_size=3, padding=1),
                )
                for _ in range(num_repeats)
            ]
        )
        self.use_residual = use_residual
        self.num_repeats = num_repeats

    def forward(self, x):
        for stage in self.layers:
            transformed = stage(x)
            x = x + transformed if self.use_residual else transformed
        return x

Create a skeletal structure for the scale prediction layer

In [None]:
class ScalePrediction(nn.Module):
    """Detection head that produces per-anchor predictions for one scale.

    A 3x3 convolutional block doubles the channel count, then a 1x1
    convolution (no batch norm / activation) emits
    ``num_anchors * (num_classes + 5)`` channels, which are split into a
    separate anchor axis and moved so the per-anchor values come last.

    Args:
        in_channels (int): number of input channels.
        num_classes (int): number of object classes.
        num_anchors (int): number of anchors predicted per grid cell.
            Defaults to 3, the value that was previously hard-coded.

    Returns:
        Tensor of shape
        ``(batch_size, num_anchors, height, width, num_classes + 5)``.
    """

    def __init__(self, in_channels, num_classes, num_anchors=3):
        super().__init__()
        # 5 extra values per anchor on top of the class scores (in YOLOv3
        # these are presumably 4 box coordinates plus objectness - confirm
        # against the loss function).
        self.pred = nn.Sequential(
            ConvolutionalNeuralNetworkBlock(in_channels, 2 * in_channels, kernel_size=3, padding=1),
            ConvolutionalNeuralNetworkBlock(
                2 * in_channels, (num_classes + 5) * num_anchors, bn_act=False, kernel_size=1
            ),
        )
        self.num_classes = num_classes
        self.num_anchors = num_anchors

    def forward(self, x):
        raw = self.pred(x)
        # Read the grid size from the head's own output so the reshape stays
        # valid regardless of how the head treats spatial dimensions.
        batch_size, _, height, width = raw.shape
        per_anchor = raw.reshape(
            batch_size, self.num_anchors, self.num_classes + 5, height, width
        )
        # (N, A, C+5, H, W) -> (N, A, H, W, C+5)
        return per_anchor.permute(0, 1, 3, 4, 2)

The driver code for generating the model

In [None]:
class My_Yolo_Implementation(nn.Module):
    """YOLOv3-style detector assembled from the module-level ``config`` list.

    Args:
        in_channels (int): number of channels of the input image tensor.
        num_classes (int): number of object classes passed to every
            ``ScalePrediction`` head.

    The forward pass returns a list with one tensor per
    ``ScalePrediction`` layer, in the order the heads appear in the network.
    """

    def __init__(self, in_channels=3, num_classes=80):
        super().__init__()
        self.num_classes = num_classes
        self.in_channels = in_channels
        self.layers = self._create_conv_layers()

    def forward(self, x):
        """Run ``x`` through every layer and collect the prediction-head outputs."""
        predictions = []
        skips = []  # feature maps saved for concatenation after upsampling
        for block in self.layers:
            # Prediction heads branch off: their output is collected and the
            # main feature map ``x`` continues unchanged.
            if isinstance(block, ScalePrediction):
                predictions.append(block(x))
                continue

            x = block(x)

            if isinstance(block, nn.Upsample):
                # Join the upsampled map with the most recently saved route.
                skip = skips.pop()
                x = torch.cat([x, skip], dim=1)
            elif isinstance(block, ResidualBlock) and block.num_repeats == 8:
                # Outputs of the 8-repeat residual blocks serve as routes.
                skips.append(x)

        return predictions

    def _create_conv_layers(self):
        """Translate ``config`` into an ``nn.ModuleList``, tracking channel counts."""
        modules = nn.ModuleList()
        channels = self.in_channels

        for spec in config:
            if isinstance(spec, tuple):
                out_channels, kernel_size, stride = spec
                padding = 1 if kernel_size == 3 else 0
                modules.append(
                    ConvolutionalNeuralNetworkBlock(
                        channels,
                        out_channels,
                        kernel_size=kernel_size,
                        stride=stride,
                        padding=padding,
                    )
                )
                channels = out_channels
                continue

            if isinstance(spec, list):
                modules.append(ResidualBlock(channels, num_repeats=spec[1]))
                continue

            if not isinstance(spec, str):
                continue

            if spec == "Scale_Prediction_Layer":
                half = channels // 2
                modules.append(ResidualBlock(channels, use_residual=False, num_repeats=1))
                modules.append(ConvolutionalNeuralNetworkBlock(channels, half, kernel_size=1))
                modules.append(ScalePrediction(half, num_classes=self.num_classes))
                channels = half
            elif spec == "Upsampling_layer":
                modules.append(nn.Upsample(scale_factor=2))
                # forward() concatenates the saved route after upsampling; in
                # this config the route has twice the channels of the
                # upsampled map, hence three times the channels overall.
                channels = channels * 3

        return modules

Check if the output size matches

In [None]:
if __name__ == "__main__":
  # Sanity check: a 416x416 input must yield three prediction tensors whose
  # grids are the image size divided by 32, 16 and 8 respectively.
  num_classes = 80
  IMAGE_SIZE = 416
  model = My_Yolo_Implementation(num_classes=num_classes)
  x = torch.randn((2, 3, IMAGE_SIZE, IMAGE_SIZE))
  # Run the forward pass once and validate that single result, instead of
  # re-running the whole network for every assertion.
  out = model(x)
  expected_strides = (32, 16, 8)
  assert len(out) == len(expected_strides)
  for scale_output, stride in zip(out, expected_strides):
    grid = IMAGE_SIZE // stride
    assert scale_output.shape == (2, 3, grid, grid, num_classes + 5)
  print("Success")

Success


In [None]:
# NOTE: `state_dict` is referenced here without being called, so this prints
# the repr of the bound method (which embeds the module's layer structure),
# not the parameter dictionary that `model.state_dict()` would return.
print(model.state_dict)

<bound method Module.state_dict of My_Yolo_Implementation(
  (layers): ModuleList(
    (0): ConvolutionalNeuralNetworkBlock(
      (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (leaky): LeakyReLU(negative_slope=0.1)
    )
    (1): ConvolutionalNeuralNetworkBlock(
      (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (leaky): LeakyReLU(negative_slope=0.1)
    )
    (2): ResidualBlock(
      (layers): ModuleList(
        (0): Sequential(
          (0): ConvolutionalNeuralNetworkBlock(
            (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (leaky): LeakyReLU(negative_slope=0.1)
        