https://csm-kr.tistory.com/11
<br>
https://dev.to/afrozchakure/all-you-need-to-know-about-yolo-v3-you-only-look-once-e4m
<br>

In [25]:
import torch
import torch.nn as nn

In [2]:
# tuple : (out_channels, kernel_size, stride)
# 여기서 B가 의미하는 의미는 한 블럭이 반복하는 횟수를 쓴 것이다.

""" 
Information about architecture config:
Tuple is structured by (filters, kernel_size, stride) 
Every conv is a same convolution. 
List is structured by "B" indicating a residual block followed by the number of repeats
"S" is for scale prediction bl2ock and computing the yolo loss
"U" is for upsampling the feature map and concatenating with a previous layer
"""

config = [
    # input size = 416, 416
    # batch size의 경우 darknet53 구조에서 이와 같이 지정되어있음.
    
    (32, 3, 1),  # convolution
    (64, 3, 2),  # downsampling -> 이미지가 절반으로 줄어듬. = 208
    
    ["B", 1],    # Residual block : 입출력 사이의 차이 학습. 출력 형태는 입력값과 동일. = 208
    (128, 3, 2), # downsampling ->  이미지가 절반으로 줄어듬. = 104
    
    ["B", 2],    # Residual block : 입출력 사이의 차이 학습. 출력 형태는 입력값과 동일. = 104
    (256, 3, 2), # downsampling ->  이미지가 절반으로 줄어듬. = 52
    
    ["B", 8],    # Residual block : 입출력 사이의 차이 학습. 출력 형태는 입력값과 동일. = 52
    (512, 3, 2), # downsampling ->  이미지가 절반으로 줄어듬. = 26
    
    ["B", 8],     # Residual block : 입출력 사이의 차이 학습. 출력 형태는 입력값과 동일. = 26
    (1024, 3, 2), # downsampling ->  이미지가 절반으로 줄어듬. = 13
    
    ["B", 4],     # Residual block : 입출력 사이의 차이 학습. 출력 형태는 입력값과 동일. = 13
    (512, 1, 1),
    (1024, 3, 1),
    # ------------------------------> downsampling
    
    # S : scale 예측
    "S",           # 첫번째 출력이 나옴. -> 13x13x1024
    (256, 1, 1),
    
    "U",           # upsampling
    (256, 1, 1),
    (512, 3, 1),
    
    "S",           # 두번째 출력이 나옴. -> 26x26x1024
    (128, 1, 1),
    
    "U",
    (128, 1, 1),
    (256, 3, 1),
    "S",           # 세번째 출력이 나옴. -> 52x52x1024
]

## 입력 형태 : (batch size, img_size, img_size, channel)

conv layer는 이미지에서 여러 filter를 convolution하는데 사용하며, 여러 feature map 생성.<br>
pooling은 사용되지 않고, downsampling 시 stride=2로 지정한다.
<br><br><br>
yolov3은 물체의 scale을 고려해 총 3종류의 크기의 output이 나오도록 하며, FPN(feature pyramid network)을 이용해서 network 설계<br>
- conv 1x1 = s:1, p:0  /  conv 3x3 = s:1, p:1 이다.

#### network 과정
1. input을 320, 416, 609? 총 3가지의 input size만 들어갈 수 있다.
2. darknet53을 지난다.
3. 

In [65]:
'''
yolo에서는 총 53개의 layer(darknet53)가 존재하며,각 conv layer 뒤에는 batch norm과 leakyReLU가 있다.
순서 : convolution layer -> batch normalization -> leaky relu
계산
convolution : (input size - kerner_size + 2*padding) / stride + 1
batch normalization : gradient descent(경사하강법)를 빠르게 수렴시키기 위해서. accuracy 높이고, loss를 줄여준다.
leadkReLU : activation function. x가 음수일 대 gradient가 0.01이 된다.
            마지막 conv에서는 activation function이 ReLU이다.

convolution 이후 최종 filter(= kernel) 값.
-> 3 * (class + 5) -> 3 * (4 + 5) = 27
'''
class CNNBlock(nn.Module):
    def __init__(self, in_channels, out_channels, bn_act=True, **kwargs):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, bias=not bn_act, **kwargs)
        self.bn = nn.BatchNorm2d(out_channels)
        self.leaky = nn.LeakyReLU(0.1)
        self.use_bn_act = bn_act

    def forward(self, x):
        if self.use_bn_act:
            return self.leaky(self.bn(self.conv(x)))
        else:
            return self.conv(x)

'''
residual block : 
'''
class ResidualBlock(nn.Module):
    def __init__(self, channels, use_residual=True, num_repeats=1):
        super().__init__()
        self.layers = nn.ModuleList()
        for repeat in range(num_repeats):
            self.layers += [
                nn.Sequential(
                    CNNBlock(channels, channels // 2, kernel_size=1),
                    CNNBlock(channels // 2, channels, kernel_size=3, padding=1),
                )
            ]

        self.use_residual = use_residual
        self.num_repeats = num_repeats

    def forward(self, x):
        for layer in self.layers:
            if self.use_residual:
                x = x + layer(x)
            else:
                x = layer(x)
        return x


class ScalePrediction(nn.Module):
    def __init__(self, in_channels, num_classes):
        super().__init__()
        self.pred = nn.Sequential(
            CNNBlock(in_channels, 2 * in_channels, kernel_size=3, padding=1),
            CNNBlock(
                2 * in_channels, (num_classes + 5) * 3, bn_act=False, kernel_size=1
            ),
        )
        self.num_classes = num_classes

    def forward(self, x):
        return (
            self.pred(x)
            .reshape(x.shape[0], 3, self.num_classes + 5, x.shape[2], x.shape[3])
            .permute(0, 1, 3, 4, 2)
        )

In [82]:
class YOLOv3(nn.Module):
    def __init__(self, in_channels=3, num_classes=4):
        super().__init__()
        self.num_classes = num_classes
        self.in_channels = in_channels
        self.layers = self._create_conv_layers()

    def forward(self, x):
        outputs = []  # for each scale
        route_connections = []
        
        for layer in self.layers:
            if isinstance(layer, ScalePrediction):
                outputs.append(layer(x))
                continue

            x = layer(x)

            if isinstance(layer, ResidualBlock) and layer.num_repeats == 8:
                route_connections.append(x)

            elif isinstance(layer, nn.Upsample):
                # cat : tensor를 합치는 함수. dim=1인 경우 열 방향으로 합친다.
                x = torch.cat([x, route_connections[-1]], dim=1)
                route_connections.pop()

        return outputs

    def _create_conv_layers(self):
        layers = nn.ModuleList()
        in_channels = self.in_channels

        for module in config:
            
            # config list에서 값이 튜플인 경우
            #  예시 : (128, 3, 2)
            if isinstance(module, tuple):
                print('tuple : ', module)
                out_channels, kernel_size, stride = module
                layers.append(
                    CNNBlock(
                        in_channels,
                        out_channels,
                        kernel_size=kernel_size,
                        stride=stride,
                        padding=1 if kernel_size == 3 else 0,
                    )
                )
                in_channels = out_channels
                print('tuple in channel: ', in_channels)
                
            # config list에서 값이 list인 경우
            #  예시 : ["B", 1]
            
            # 이 코드는 network에서 residual block을 담당하고 있음.
            # residual block이란?  ->  docs에 정리.
            elif isinstance(module, list):
                num_repeats = module[1]
                print('list num : ', num_repeats)
                layers.append(ResidualBlock(in_channels, num_repeats=num_repeats,))
                print('residual layer : ', layers)

            # config list에서 값이 string인 경우
            #  예시 : "S", "U"
            elif isinstance(module, str):
                if module == "S":
                    print('S : ', module)
                    layers += [
                        ResidualBlock(in_channels, use_residual=False, num_repeats=1),
                        CNNBlock(in_channels, in_channels // 2, kernel_size=1),
                        ScalePrediction(in_channels // 2, num_classes=self.num_classes),
                    ]
                    in_channels = in_channels // 2
                    print('S in channel : ', in_channels)

                elif module == "U":
                    print('U : ', module)
                    layers.append(nn.Upsample(scale_factor=2),)
                    in_channels = in_channels * 3
                    print('U in channel : ', in_channels)

        return layers

In [83]:
if __name__ == "__main__":
    num_classes = 4
    IMAGE_SIZE = 416
    model = YOLOv3(num_classes=num_classes)
    x = torch.randn((2, 3, IMAGE_SIZE, IMAGE_SIZE))
    out = model(x)
    
    
    assert model(x)[0].shape == (2, 3, IMAGE_SIZE//32, IMAGE_SIZE//32, num_classes + 5)  # torch.Size([2, 3, 13, 13, 9])
    assert model(x)[1].shape == (2, 3, IMAGE_SIZE//16, IMAGE_SIZE//16, num_classes + 5)  # torch.Size([2, 3, 26, 26, 9])
    assert model(x)[2].shape == (2, 3, IMAGE_SIZE//8, IMAGE_SIZE//8, num_classes + 5)    # torch.Size([2, 3, 52, 52, 9])
    print("Success!")

tuple :  (32, 3, 1)
tuple in channel:  32
tuple :  (64, 3, 2)
tuple in channel:  64
list num :  1
residual layer :  ModuleList(
  (0): CNNBlock(
    (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (leaky): LeakyReLU(negative_slope=0.1)
  )
  (1): CNNBlock(
    (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (leaky): LeakyReLU(negative_slope=0.1)
  )
  (2): ResidualBlock(
    (layers): ModuleList(
      (0): Sequential(
        (0): CNNBlock(
          (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (leaky): LeakyReLU(negative_slope=0.1)
        )
        (1): CNNBlock(
          (conv): Conv2d(32, 64, 

residual layer :  ModuleList(
  (0): CNNBlock(
    (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (leaky): LeakyReLU(negative_slope=0.1)
  )
  (1): CNNBlock(
    (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (leaky): LeakyReLU(negative_slope=0.1)
  )
  (2): ResidualBlock(
    (layers): ModuleList(
      (0): Sequential(
        (0): CNNBlock(
          (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (leaky): LeakyReLU(negative_slope=0.1)
        )
        (1): CNNBlock(
          (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn): BatchNorm2d(64, eps

Success!
