# YOLO v1 implementation

In [1]:
import torch
import torch.nn as nn

## 기본 방식

In [2]:
class YOLOv1(nn.Module):
    """YOLOv1 detector with every layer spelled out explicitly.

    The backbone is six convolutional stages (``conv1`` .. ``conv6``), each
    convolution followed by ``LeakyReLU(0.1)``, and the head is two fully
    connected layers. The network output is a flat tensor of shape
    ``(batch, 7 * 7 * (5 * num_boxes + num_classes))``.

    Args:
        input_shape: ``(channels, height, width)`` of one input image. The
            channel count sets the first convolution's ``in_channels`` and
            the spatial size sets the width of the first fully connected
            layer, so the model is no longer silently tied to 3x448x448.
        num_classes: number of class scores predicted per grid cell.
        num_boxes: number of boxes predicted per grid cell (5 values each).

    Raises:
        ValueError: if ``input_shape`` is so small that the backbone would
            reduce it to an empty feature map.
    """

    def __init__(self, input_shape, num_classes, num_boxes):
        super().__init__()

        in_channels, height, width = input_shape
        feat_h = self._reduced_size(height)
        feat_w = self._reduced_size(width)
        if feat_h < 1 or feat_w < 1:
            raise ValueError(
                f"input_shape {tuple(input_shape)} is too small for the backbone"
            )

        self.num_classes = num_classes
        self.num_boxes = num_boxes

        # First
        self.conv1 = nn.Sequential(
            *self._conv(in_channels, 64, 7, stride=2),
            self._pool(),
        )

        # Second
        self.conv2 = nn.Sequential(
            *self._conv(64, 192, 3),
            self._pool(),
        )

        # Third
        self.conv3 = nn.Sequential(
            *self._conv(192, 128, 1),
            *self._conv(128, 256, 3),
            *self._conv(256, 256, 1),
            *self._conv(256, 512, 3),
            self._pool(),
        )

        # Fourth: four (1x1 reduce, 3x3 expand) pairs, then a wider pair.
        stage4 = []
        for _ in range(4):
            stage4 += self._conv(512, 256, 1)
            stage4 += self._conv(256, 512, 3)
        stage4 += self._conv(512, 512, 1)
        stage4 += self._conv(512, 1024, 3)
        self.conv4 = nn.Sequential(*stage4, self._pool())

        # Fifth: ends with a stride-2 convolution instead of a max-pool.
        self.conv5 = nn.Sequential(
            *self._conv(1024, 512, 1),
            *self._conv(512, 1024, 3),
            *self._conv(1024, 512, 1),
            *self._conv(512, 1024, 3),
            *self._conv(1024, 1024, 3),
            *self._conv(1024, 1024, 3, stride=2),
        )

        # Sixth
        self.conv6 = nn.Sequential(
            *self._conv(1024, 1024, 3),
            *self._conv(1024, 1024, 3),
        )

        # FC layers
        self.flatten = nn.Flatten()
        self.fc = nn.Sequential(
            nn.Linear(1024 * feat_h * feat_w, 4096),
            nn.LeakyReLU(0.1),
            nn.Linear(4096, 7 * 7 * (5 * self.num_boxes + self.num_classes)),
        )

    @staticmethod
    def _conv(in_channels, out_channels, kernel_size, stride=1):
        """Return ``[Conv2d, LeakyReLU(0.1)]`` with padding ``kernel_size // 2``."""
        return [
            nn.Conv2d(
                in_channels,
                out_channels,
                kernel_size=kernel_size,
                stride=stride,
                padding=kernel_size // 2,
            ),
            nn.LeakyReLU(0.1),
        ]

    @staticmethod
    def _pool():
        """Return a fresh 2x2, stride-2 max-pool layer."""
        return nn.MaxPool2d(kernel_size=2, stride=2)

    @staticmethod
    def _reduced_size(size):
        """Return the length of one spatial dimension after ``conv6``."""
        size = (size - 1) // 2 + 1  # conv1: 7x7 kernel, stride 2, padding 3
        size //= 16                 # four 2x2 max-pools (conv1 .. conv4)
        size = (size - 1) // 2 + 1  # conv5: last 3x3 conv, stride 2, padding 1
        return size

    def forward(self, x):
        """Run the backbone and the head; returns a ``(batch, features)`` tensor."""
        for stage in (
            self.conv1,
            self.conv2,
            self.conv3,
            self.conv4,
            self.conv5,
            self.conv6,
        ):
            x = stage(x)
        return self.fc(self.flatten(x))

모델 생성

In [4]:
# Instantiate the network for 3x448x448 inputs with num_classes=20 and num_boxes=2.
model = YOLOv1(input_shape=(3, 448, 448), num_classes=20, num_boxes=2)

In [5]:
# Inspect the model structure.
print(model)

YOLOv1(
  (conv1): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
    (1): LeakyReLU(negative_slope=0.1)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv2d(64, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): LeakyReLU(negative_slope=0.1)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv2d(192, 128, kernel_size=(1, 1), stride=(1, 1))
    (1): LeakyReLU(negative_slope=0.1)
    (2): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): LeakyReLU(negative_slope=0.1)
    (4): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
    (5): LeakyReLU(negative_slope=0.1)
    (6): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): LeakyReLU(negative_slope=0.1)
    (8): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )

임의의 입력으로 테스트

In [6]:
# Smoke test: feed one random 3x448x448 tensor through the model and report
# the shape of what comes out.
x = torch.randn(1, 3, 448, 448)
output = model(x)
print(f"Output shape: {output.shape}")

Output shape: torch.Size([1, 1470])


## config 파일을 활용한 간소화 방식

YOLOv1 Architecture configuration

In [7]:
# Layer-by-layer description of the convolutional part of the network.
# Only convolution and pooling layers are listed here.
#
# Entry formats:
#   (kernel_size, filters, stride)  -> a single conv layer
#   "M"                             -> a max-pool layer
#   [conv_a, conv_b, n_repeats]     -> the conv pair repeated n_repeats times
architecture_config = [
    (7, 64, 2),
    "M",
    (3, 192, 1),
    "M",
    [(1, 128, 1), (3, 256, 1), 1],
    (1, 256, 1),
    (3, 512, 1),
    "M",
    [(1, 256, 1), (3, 512, 1), 4],
    (1, 512, 1),
    (3, 1024, 1),
    "M",
    [(1, 512, 1), (3, 1024, 1), 2],
    (3, 1024, 1),
    (3, 1024, 2),
    (3, 1024, 1),
    (3, 1024, 1),
]

In [8]:
class YOLOv1(nn.Module):
    """YOLOv1 network assembled from a compact architecture description.

    Each entry of ``architecture_config`` is one of:

    * a tuple ``(kernel_size, filters, stride)``: one convolution with
      padding ``kernel_size // 2`` followed by ``LeakyReLU(0.1)``;
    * a string (e.g. ``"M"``): a 2x2 max-pool with stride 2;
    * a list ``[conv_a, conv_b, n_repeats]``: the two convolutions (each in
      the tuple format above) repeated ``n_repeats`` times.

    Args:
        architecture_config: sequence of layer descriptions as listed above.
        input_shape: ``(channels, height, width)`` of one input image.
        n_grids: number of grid cells along each side of the output grid.
        n_boxes_per_grid: boxes predicted per grid cell (5 values each).
        n_classes: class scores predicted per grid cell.

    Raises:
        TypeError: if a config entry is not a tuple, string or list.

    The forward pass returns a flat tensor of shape
    ``(batch, n_grids * n_grids * (n_boxes_per_grid * 5 + n_classes))``.
    """

    def __init__(self, architecture_config, input_shape, n_grids, n_boxes_per_grid, n_classes):
        super().__init__()

        self.layers = nn.ModuleList()

        # Track the running channel count explicitly: the last module in
        # self.layers is usually an activation or a pool, which has no
        # ``out_channels`` attribute to read back.
        channels = input_shape[0]

        for spec in architecture_config:
            if isinstance(spec, tuple):  # single conv layer
                channels = self._append_conv(channels, spec)
            elif isinstance(spec, str):  # pooling layer
                self.layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
            elif isinstance(spec, list):  # repeated pair of conv layers
                first, second, n_repeats = spec
                for _ in range(n_repeats):
                    channels = self._append_conv(channels, first)
                    channels = self._append_conv(channels, second)
            else:
                raise TypeError(
                    f"unsupported architecture entry of type {type(spec).__name__}: {spec!r}"
                )

        self.flatten = nn.Flatten()

        # Push a dummy image through the conv stack to learn how many
        # features reach the first fully connected layer. No gradients are
        # needed for this probe.
        with torch.no_grad():
            probe = torch.zeros(1, *input_shape)
            for layer in self.layers:
                probe = layer(probe)
        flattened_size = probe[0].numel()

        self.fc1 = nn.Sequential(
            nn.Linear(flattened_size, 4096),
            nn.LeakyReLU(0.1),
        )

        self.fc2 = nn.Linear(4096, n_grids * n_grids * (n_boxes_per_grid * 5 + n_classes))

    def _append_conv(self, in_channels, spec):
        """Append Conv2d + LeakyReLU(0.1) for ``spec`` and return its filter count."""
        kernel_size, filters, stride = spec
        self.layers.append(
            nn.Conv2d(
                in_channels=in_channels,
                out_channels=filters,
                kernel_size=kernel_size,
                stride=stride,
                padding=kernel_size // 2,  # 3 -> 1, 5 -> 2, 7 -> 3
            )
        )
        self.layers.append(nn.LeakyReLU(0.1))
        return filters

    def forward(self, x):
        """Apply the conv stack, flatten, then the two fully connected layers."""
        for layer in self.layers:
            x = layer(x)
        x = self.flatten(x)
        x = self.fc1(x)
        return self.fc2(x)