In [81]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
import torchvision.transforms.functional as FT

import os
import numpy as np
import pandas as pd
from tqdm import tqdm

# 이미지 불러오기 및 그리기
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches

from collections import Counter
from sklearn.model_selection import train_test_split

## 변수 선언

In [64]:
ori_dir = './'
img_dir = ori_dir + 'image/'
label_dir = ori_dir + 'label_txt/'

train_csv = ori_dir + 'png_txt.csv'

img_size = 416
S = 7   # grid cell w,h크기
B = 5
C = 4

classes = [ "AC", "FL", "HC", "HUM" ]

In [65]:
'''
에러 내용 : TypeError: __call__() takes 2 positional arguments but 3 were given. self.
https://stackoverflow.com/questions/62341052/typeerror-call-takes-2-positional-arguments-but-3-were-given-to-train-ra

해결
아래 있는 Compose 함수를 호출해서 transform을 하니 해결이 됨.
'''
class Compose(object):
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, img, bboxes):
        for t in self.transforms:
            img, bboxes = t(img), bboxes

        return img, bboxes

transform = Compose([transforms.Resize((448, 448)), transforms.ToTensor(),])

In [66]:
seed = 123
torch.manual_seed(seed)

# Hyperparameters etc. 
DEVICE = "cuda" if torch.cuda.is_available else "cpu"   # torch.device('cpu')
BATCH_SIZE = 4 # 64 in original paper but I don't have that much vram, grad accum?
EPOCHS = 3
NUM_WORKERS = 2
PIN_MEMORY = True
LOAD_MODEL = False
LOAD_MODEL_FILE = "overfit.pth.tar"

In [5]:
DEVICE

'cuda'

In [6]:
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(torch.cuda.current_device()))

True
1
NVIDIA GeForce GTX 750 Ti


# bbox가 이미지에서 잘 위치한지 확인

In [10]:
'''
def draw_bbox(img_file, boxes):
    # 이미지를 로드합니다.
    image = cv2.imread(img_file)
    
    # 바운딩 박스 좌표를 추출합니다.
    c, x, y, w, h = boxes
    x1, y1 = int((x - w/2) * 1024) , int((y - h/2) * 512)
    x2, y2 = int((x + w/2) * 1024) , int((y + h/2) * 512)

    # 바운딩 박스를 시각화합니다.
    cv2.rectangle(image, (x1, y1), (x2, y2), (0, 0, 255), 2)
    # center 좌표 보기.
    #cv2.line(image, (x, y), (x, y), (0, 0, 255), 3)
    # 해당 이미지 class 확인
    cv2.putText(image, str(c), (x2, y1), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 2)
    
    
    # 시각화된 이미지를 보여줍니다.
    cv2.imshow('Bounding Boxes', image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
'''

"\ndef draw_bbox(img_file, boxes):\n    # 이미지를 로드합니다.\n    image = cv2.imread(img_file)\n    \n    # 바운딩 박스 좌표를 추출합니다.\n    c, x, y, w, h = boxes\n    x1, y1 = int((x - w/2) * 1024) , int((y - h/2) * 512)\n    x2, y2 = int((x + w/2) * 1024) , int((y + h/2) * 512)\n\n    # 바운딩 박스를 시각화합니다.\n    cv2.rectangle(image, (x1, y1), (x2, y2), (0, 0, 255), 2)\n    # center 좌표 보기.\n    #cv2.line(image, (x, y), (x, y), (0, 0, 255), 3)\n    # 해당 이미지 class 확인\n    cv2.putText(image, str(c), (x2, y1), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 2)\n    \n    \n    # 시각화된 이미지를 보여줍니다.\n    cv2.imshow('Bounding Boxes', image)\n    cv2.waitKey(0)\n    cv2.destroyAllWindows()\n"

In [11]:
'''
img_file = img_dir + df['img_png'][0]
boxes = info['class'][0] , info['center_x'][0] , info['center_y'][0] , info['w'][0] , info['h'][0]

draw_bbox(img_file , boxes)
'''

"\nimg_file = img_dir + df['img_png'][0]\nboxes = info['class'][0] , info['center_x'][0] , info['center_y'][0] , info['w'][0] , info['h'][0]\n\ndraw_bbox(img_file , boxes)\n"

# Dataset 불러오기

In [67]:
class YoloDataset(torch.utils.data.Dataset):
    
    def __init__(
        self, csv_file, img_dir, label_dir, S=7, B=2, C=4, transform=None,
    ):
        self.annotations = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.transform = transform
        self.S = S
        self.B = B
        self.C = C

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        # 이미지 객체 정보 가져오기.
        label_path = os.path.join(self.label_dir, self.annotations.iloc[index, 1])
        boxes = []
        
        with open(label_path) as f:
            for label in f.readlines():
                # 이 순서대로 txt파일에 저장되어있음.
                class_label, x, y, width, height = [
                    float(x) if float(x) != int(float(x)) else int(x)
                    for x in label.replace("\n", "").split()
                ]
                
                # boxes 변수에 이미지 정보 한번에 배열로 저장하기.
                boxes.append([class_label, x, y, width, height])
                
                
        # 이미지 파일 불러오기.
        img_path = os.path.join(self.img_dir, self.annotations.iloc[index, 0])
        image = Image.open(img_path)
        
        # 이미지 정보를 tensor로 변환해주기.
        boxes = torch.tensor(boxes)

        if self.transform:
            image, boxes = self.transform(image, boxes)

        # ====================================================== 
        # ENCODING
        # 이미지를 SxS그리드로 나누고, feature map의 tensor는 S x S x (B*5 + C). 5 : x, y, w, h, confidence
        # label_matrix : ground truth box 중심좌표 계산 후 confidence score, bbox좌표 저장.
        label_matrix = torch.zeros((self.S, self.S, self.C + 5 * self.B))
        for box in boxes:
            class_label, x, y, width, height = box.tolist()
            class_label = int(class_label)

            # 객체가 속한 grid cell의 행(=i)과 열(=j)을 계산한다.
            i, j = int(self.S * y), int(self.S * x)
            # cell 내에서 객체의 상대적인 좌표 계산.
            x_cell, y_cell = self.S * x - j, self.S * y - i
            
            """
            cell 기준으로 bbox cell의 너비와 높이 계산.
            
            width_pixels = (width*self.image_width)
            cell_pixels = (self.image_width)
            
            cell의 상대적인 너비를 찾는 방법 : width_pixels/cell_pixels
            """
            width_cell, height_cell = (
                width * self.S,
                height * self.S,
            )

            # label_matrix가 현재 모든 값이 0인 상태이다.
            # 이때 위에서 구한 grid cell에서의 객체 좌표에 대한 정보를 넣을 것이다.
            # -> ground truth box 중심이 특정 cell에 존재할 경우 해당 cell의 C번째에 값을 1로 지정한다.
            # C번째 : 객체가 존재하는지를 나타내는 index
            if label_matrix[i, j, self.C] == 0:
                # 객체가 있다면 1을 넣어준다.
                label_matrix[i, j, self.C] = 1

                # box 좌표
                # cell 내에서의 bbox 좌표 및 너비를 텐서로 전환 후 저장.
                box_coordinates = torch.tensor([
                    x_cell, y_cell,
                    width_cell, height_cell
                    #min(width_cell, self.S - 1),
                    #min(height_cell, self.S - 1)
                ])
                # C+1 ~ 23번째 index에 값 저장.
                label_matrix[i, j, (self.C+1):(self.C+5)] = box_coordinates

                # class_label에 대해 one-hot encoding 해주기.
                # 값이 있는 cell을 1로 저장.
                label_matrix[i, j, class_label] = 1
                
        return image, label_matrix

### dataset 확인

In [68]:
dataset = YoloDataset(
    train_csv,
    transform=transform,
    img_dir=img_dir,
    label_dir=label_dir
)

In [69]:
image, label_matrix = dataset[0]

14

# yolov1 network architecture

In [70]:
""" 
Information about architecture config:
Tuple is structured : (kernel_size, filters, stride, padding) 
"M" is simply maxpooling with stride 2x2 and kernel 2x2
List is structured by tuples and lastly int with number of repeats
"""

architecture_config = [
    (7, 64, 2, 3),
    "M",
    
    (3, 192, 1, 1),
    "M",
    
    (1, 128, 1, 0),
    (3, 256, 1, 1),
    (1, 256, 1, 0),
    (3, 512, 1, 1),
    "M",
    
    [(1, 256, 1, 0), (3, 512, 1, 1), 4],   # -> conv 4번 반복
    (1, 512, 1, 0),
    (3, 1024, 1, 1),
    "M",
    
    [(1, 512, 1, 0), (3, 1024, 1, 1), 2],  # -> conv 2번 반복
    (3, 1024, 1, 1),
    (3, 1024, 2, 1),
    
    (3, 1024, 1, 1),
    (3, 1024, 1, 1),
]

### CNN block

In [71]:
class CNNBlock(nn.Module):
    def __init__(self, in_channels, out_channels, **kwargs):
        super(CNNBlock, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
        self.batchnorm = nn.BatchNorm2d(out_channels)
        self.leakyrelu = nn.LeakyReLU(0.1)

    def forward(self, x):
        return self.leakyrelu(self.batchnorm(self.conv(x)))

# Yolov1 모델

In [72]:
class Yolov1(nn.Module):
    
    def __init__(self, in_channels=3, **kwargs):
        super(Yolov1, self).__init__()
        self.architecture = architecture_config
        self.in_channels = in_channels
        self.darknet = self._create_conv_layers(self.architecture)
        # 해당 top구조를 darknet구조라 한다.
        self.fcs = self._create_fcs(**kwargs)

    def forward(self, x):
        x = self.darknet(x)
        return self.fcs(torch.flatten(x, start_dim=1))

    def _create_conv_layers(self, architecture):
        layers = []
        in_channels = self.in_channels

        for x in architecture:
            # arch_config에서 값이 tuple인 경우 : (7, 64, 2, 3)
            if type(x) == tuple:
                layers += [
                    CNNBlock(
                        in_channels, x[1], kernel_size=x[0], stride=x[2], padding=x[3],
                    )
                ]
                in_channels = x[1]
                
            # arch_config에서 값이 string인 경우 : 'M'
            # -> maxpooling
            elif type(x) == str:
                layers += [nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))]
                
            # arch_config에서 값이 list인 경우 : [(1, 256, 1, 0), (3, 512, 1, 1), 4]
            elif type(x) == list:
                conv1 = x[0]
                conv2 = x[1]
                num_repeats = x[2]
                
                # 2번 혹은 4번 돌아감.
                for _ in range(num_repeats):
                    
                    # 1x1 conv인 경우
                    layers += [
                        CNNBlock(
                            in_channels,
                            conv1[1],
                            kernel_size=conv1[0],
                            stride=conv1[2],
                            padding=conv1[3],
                        )
                    ]
                    
                    # 3x3 conv인 경우
                    layers += [
                        CNNBlock(
                            conv1[1],
                            conv2[1],
                            kernel_size=conv2[0],
                            stride=conv2[2],
                            padding=conv2[3],
                        )
                    ]
                    in_channels = conv2[1]

        return nn.Sequential(*layers)

    def _create_fcs(self, split_size, num_boxes, num_classes):
        S, B, C = split_size, num_boxes, num_classes

        # In original paper this should be
        # nn.Linear(1024*S*S, 4096),
        # nn.LeakyReLU(0.1),
        # nn.Linear(4096, S*S*(B*5+C))

        return nn.Sequential(
            nn.Flatten(),
            nn.Linear(1024 * S * S, 496),
            nn.Dropout(0.0),
            nn.LeakyReLU(0.1),
            nn.Linear(496, S * S * (C + B * 5)),
        )

In [73]:
model = Yolov1(split_size=7, num_boxes=2, num_classes=4)

# LOSS

In [86]:
class YoloLoss(nn.Module):
    
    def __init__(self, S=7, B=2, C=4):
        super(YoloLoss, self).__init__()
        self.mse = nn.MSELoss(reduction="sum")

        """
        S is split size of image (in paper 7),
        B is number of boxes (in paper 2),
        C is number of classes (in paper and VOC dataset is 4),
        """
        self.S = S
        self.B = B
        self.C = C

        # 가중치 파라미터
        # pay loss for no object (noobj) and the box coordinates (coord)
        self.lambda_noobj = 0.5
        self.lambda_coord = 5   # 상자 좌표
        
    # 손실함수 계산 시작
    # train 시킬 때 image(=x)를 model에 넣어 나온 예측값이 out이다.
    # predictions = out, target = y(data loader) = label_matrix(YoloDataset)
    def forward(self, predictions, target):
        '''
        각 grid cell마다 2개의 bbox를 예측하고,
        그 중 confidence score가 높은 1개의 bbox를 학습에 적용.
        '''
        
        # grid cell 형태로 예측값들을 다시 배치하기 위해서 pred를 재구조화한다.
        '''
        predictions 입력 시 shape : (BATCH_SIZE, S, S, (C+B*5)) => 7x7x14
        
        [..., :C] = 각 클래스에 대한 확률  /  [..., C:C+4] : 첫 번째 bbox에 대한 좌표 및 너비(x, y, w, h)
        [..., C+4:C+5] : 첫 번째 bbox에 대한 confidence score
        
        [..., C+6:C+9] : 두 번째 bbox에 대한 좌표 및 너비(x, y, w, h)
        [..., C+9:C+10] : 두 번째 bbox에 대한 confidence score
        
        target 입력 시 shape : (BATCH_SIZE, S, S, (C+B*5))  => 7x7x14
        [..., :4] : 각 target bbox 실제 좌표 및 너비(x, y, w, h)
        [..., 4:5] : 각 target bbox에 대한 존재 여부 (1=객체있음 , 0=객체없음)
        [..., 5:C+5] : 각 target bbox에 대한 class 정보로 one-hot encoding된 벡터.
                       ex) 만약 class=2 -> [0, 1, 0, 0]으로 표현
        [..., C+5:(C+B*5)] : 
        '''
        
        # predictions : 모델의 출력으로 받은 예측 값(SxSx(C+B*5)) 크기의 feature map을 flatten한 결과.
        # target : 실제 target 값. 모델이 예측하려는 bbox와 클래스 정보 포함.
        predictions = predictions.reshape(-1, self.S, self.S, self.C + self.B * 5)
        
        # intersection_over_union -> utils.ipybn 파일에 있는 함수.
        # 해당 함수를 사용해 target bbox로 예측된 두 개의 bbox 좌표에 대한 IoU를 계산한다.
        # prediction에서 첫번째 bbox 좌표
        iou_b1 = intersection_over_union(predictions[..., (self.C+1):(self.C+5)], target[..., (self.C+1):(self.C+5)])
        # prediction에서 두번째 bbox 좌표
        iou_b2 = intersection_over_union(predictions[..., (self.C+6):(self.C+self.B * 5)], target[..., (self.C+1):(self.C+5)])
        ious = torch.cat([iou_b1, iou_b2], dim=0)
        
        # ious에서 두 예측 중에서 IoU가 가장 높은 상자를 선택
        # 해당 값은 변수 bestbox에 저장한다.
        # 이때 bestbox는 IoU가 더 높은 box의 index가 저장이 된다.
        iou_maxes, bestbox = torch.max(ious, dim=0)
        # exists_box : target의 마지막 차원에서 bbox 존재 여부 나타내는 값 저장.
        # grid cell에 ground truth box 중심이 존재하는지 여부 확인.
        # 1 = 중심이 있음  ,  0 = 중심이 없음.
        exists_box = target[..., self.C]#.unsqueeze(-1)  # in paper this is Iobj_i

        
        # ======================== #
        #     Localizaton loss     #
        # ======================== #

        # object 없는 상자를 0으로 설정.
        # 두 prediction 중 이전에 계산 된 IoU에서 가장 높은 예측 중 하나만 꺼낸다.
        box_predictions = exists_box * (
            (
                bestbox * predictions[..., (self.C + 6):(self.C + self.B * 5)]
                + (1 - bestbox) * predictions[..., (self.C + 1):(self.C + 5)]
            )
        )

        box_targets = exists_box * target[..., (self.C+1):(self.C+5)]
        
        # Take sqrt of width, height of boxes to ensure that
        box_predictions[..., 2:4] = torch.sign(box_predictions[..., 2:4]) * torch.sqrt(
            torch.abs(box_predictions[..., 2:4] + 1e-6)
        )
        box_targets[..., 2:4] = torch.sqrt(box_targets[..., 2:4])
        
        # IoU기준으로 선택된 bbox 사용해 pred_bbox와 실제 bbox 간의 좌표 손실 계산.
        box_loss = self.mse(
            torch.flatten(box_predictions, end_dim=-2),
            torch.flatten(box_targets, end_dim=-2),
        )

        
        # ==================== #
        #    Confidence loss   #
        #   FOR OBJECT LOSS    #
        # ==================== #

        # pred_box : IoU가 가장 높은 bbox의 신뢰도 점수
        pred_box = (
            bestbox * predictions[..., (self.C+5):(self.C+6)] + (1 - bestbox) * predictions[..., (self.C):(self.C+1)]
        )
        
        # IoU기준으로 선택된 bbox에 해당하는 신뢰도 점수 사용하여 계산.
        object_loss = self.mse(
            torch.flatten(exists_box * pred_box),
            torch.flatten(exists_box * target[..., (self.C):(self.C+1)]), # --> ?
        )

        
        # ======================= #
        #   FOR NO OBJECT LOSS    #
        # ======================= #

        # max_no_obj = torch.max(predictions[..., 20:21], predictions[..., 25:26])
        # no_object_loss = self.mse(
        #    torch.flatten((1 - exists_box) * max_no_obj, start_dim=1),
        #    torch.flatten((1 - exists_box) * target[..., 20:21], start_dim=1),
        #)

        # 비객체 손실 계산 : 객체가 없는 grid cell에 대한 손실 계산.
        # -> 신뢰도 점수에 대한 loss이고, 모델이 객체가 없는 위치에 대한 신뢰도를 낮출 수 있게 도와준다.
        # 객체가 없을 경우 두 bbox를 모두 학습에 참여한다.
        no_object_loss = self.mse(
            torch.flatten((1 - exists_box) * predictions[..., (self.C):(self.C+1)], start_dim=1),
            torch.flatten((1 - exists_box) * target[..., (self.C):(self.C+1)], start_dim=1)  # --> ?
        )
        
        no_object_loss += self.mse(
            torch.flatten((1 - exists_box) * predictions[..., (self.C+5):(self.C+6)], start_dim=1),
            torch.flatten((1 - exists_box) * target[..., (self.C+5):(self.C+6)], start_dim=1)  # --> ?
        )
        
        # ================== #
        #     CLASS LOSS     #
        # ================== #
        # 클래스 손실 게산 : 예측된 클래스 확률과 실제 클래스 정보 사이의 loss를 계산한다.
        # -> C개의 class score를 target과 비교해 mse loss를 구한다.
        class_loss = self.mse(
            torch.flatten(exists_box * predictions[..., :self.C], end_dim=-2,),
            torch.flatten(exists_box * target[..., :self.C], end_dim=-2,),
        )
        
        # box_loss , object_loss , no_object_loss , class_loss를 전부 더하기.
        loss = (
            self.lambda_coord * box_loss  # first two rows in paper
            + object_loss  # third row in paper
            + self.lambda_noobj * no_object_loss  # forth row
            + class_loss  # fifth row
        )

        return loss

In [None]:
mse = nn.MSELoss(reduction="sum")

S=7, B=2, C=4

# 가중치 파라미터
# pay loss for no object (noobj) and the box coordinates (coord)
lambda_noobj = 0.5
lambda_coord = 5   # 상자 좌표

# 손실함수 계산 시작
predictions = out
target = y  # 값이 모두 0인 7x7x14 tensor넣어주기.
'''
각 grid cell마다 2개의 bbox를 예측하고,
그 중 confidence score가 높은 1개의 bbox를 학습에 적용.
'''

# predictions : 모델의 출력으로 받은 예측 값(SxSx(C+B*5)) 크기의 feature map을 flatten한 결과.
# target : 실제 target 값. 모델이 예측하려는 bbox와 클래스 정보 포함.
predictions = predictions.reshape(-1, self.S, self.S, self.C + self.B * 5)

iou_b1 = intersection_over_union(predictions[..., (self.C+1):(self.C+5)], target[..., (self.C+1):(self.C+5)])
# prediction에서 두번째 bbox 좌표
iou_b2 = intersection_over_union(predictions[..., (self.C+6):(self.C+self.B * 5)], target[..., (self.C+1):(self.C+5)])
ious = torch.cat([iou_b1.unsqueeze(0), iou_b2.unsqueeze(0)], dim=0)

iou_maxes, bestbox = torch.max(ious, dim=0)
# exists_box : target의 마지막 차원에서 bbox 존재 여부 나타내는 값 저장.
# grid cell에 ground truth box 중심이 존재하는지 여부 확인.
# 1 = 중심이 있음  ,  0 = 중심이 없음.
exists_box = target[..., self.C].unsqueeze(-1)  # in paper this is Iobj_i

In [46]:
y.shape

torch.Size([4, 7, 7, 14])

# train

In [87]:
dataset = YoloDataset(
    train_csv,
    transform=transform,
    img_dir=img_dir,
    label_dir=label_dir
)

train, vali = train_test_split(dataset, test_size=0.8, random_state=123)  # 80

train_loader = DataLoader(
    dataset=train,
    batch_size=BATCH_SIZE,
    num_workers=0,
    pin_memory=PIN_MEMORY,
    shuffle=True,
    drop_last=True
)

vali_loader = DataLoader(
    dataset=vali,
    batch_size=BATCH_SIZE,
    num_workers=0,
    pin_memory=PIN_MEMORY,
    shuffle=True,
    drop_last=True
)

In [106]:
y[3, 3]

tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 1.0000, 0.0000, 1.0000, 0.3397, 0.5686, 1.6478, 2.7615,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000]])

### train loader에 값이 제대로 있는지 확인

In [76]:
len(train_loader)

10

In [77]:
dataiter = iter(train_loader)
images, labels = dataiter.next()
images.size()

torch.Size([4, 3, 448, 448])

# model 불러오기

In [147]:
model = Yolov1(split_size=7, num_boxes=2, num_classes=4).to(DEVICE)
optimizer = optim.Adam(
    model.parameters(), lr=0.1, weight_decay=0
)
loss_fn = YoloLoss()

if LOAD_MODEL:
    load_checkpoint(torch.load(LOAD_MODEL_FILE), model, optimizer)

# 훈련 시작

In [151]:
def train_fn(train_loader, model, optimizer, loss_fn):
    loop = tqdm(train_loader, leave=True)
    mean_loss = []
    
    with torch.no_grad():
        for batch_idx, (x, y) in enumerate(loop):
            # dataset에서 return한 image, label_matrix값.
            x, y = x.to(DEVICE), y.to(DEVICE)
            out = model(x)

            loss = loss_fn(out, y)
            print('Loss : ', loss)
            mean_loss.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # update progress bar
            loop.set_postfix(loss=loss.item())

        print(f"Mean loss was {sum(mean_loss)/len(mean_loss)}")

In [156]:
for epoch in tqdm(range(1)):

    print('epoch : ', epoch + 1)

    pred_boxes, target_boxes = get_bboxes(
        train_loader, model, iou_threshold=0.3, threshold=0.3
    )

    mean_avg_prec = mean_average_precision(
        pred_boxes, target_boxes, iou_threshold=0.5, box_format="midpoint"
    )
    print(f"Train mAP: {mean_avg_prec}")

    train_fn(train_loader, model, optimizer, loss_fn)

  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

epoch :  1
Train mAP: 0.0



  0%|                                                                                           | 0/10 [00:00<?, ?it/s][A
  0%|                                                                                            | 0/1 [00:02<?, ?it/s]


RuntimeError: The size of tensor a (7) must match the size of tensor b (4) at non-singleton dimension 3

### 에러내용 : RuntimeError: shape '[16, 7, 7, 30]' is invalid for input of size 10976
<br>
발생이유<br><br>
input shape이 [16, 7, 7, 30]이어야 하는데, 내가 사용한 input size는 [16, 7, 7, 14]이다.<br>
yolo architecture에서 마지막 부분은 7x7x30이기 때문에 이에 맞추기..<br><br>

## B:5, C:5로 변경해서 하기!

# utils

In [143]:
# IoU를 계산하는 함수.
# 계산 결과 : 겹치는 부분이 마이너스라서 모든 값이 다 0이 나옴.
# => 예측과 실제가 겹쳐지는 부분이 매우 미세하거나 없음.
def intersection_over_union(boxes_preds, boxes_labels, box_format="midpoint"):
    """
    Calculates intersection over union

    Parameters:
        boxes_preds (tensor): 예측된 bbox 좌표 정보 가지고 있음. (BATCH_SIZE, 4)
        boxes_labels (tensor): 실제 label bbox와 좌표 정보 가지고 있음. (BATCH_SIZE, 4)
        box_format (str): midpoint(중심점, w, h) / corners(좌상단 , 우하단), if boxes (x,y,w,h) or (x1,y1,x2,y2)

    Returns:
        tensor: Intersection over union for all examples
    """

    if box_format == "midpoint":
        # box_1
        box1_x1 = boxes_preds[..., 0:1] - boxes_preds[..., 2:3] / 2
        box1_y1 = boxes_preds[..., 1:2] - boxes_preds[..., 3:4] / 2
        box1_x2 = boxes_preds[..., 0:1] + boxes_preds[..., 2:3] / 2
        box1_y2 = boxes_preds[..., 1:2] + boxes_preds[..., 3:4] / 2
        # box_2
        box2_x1 = boxes_labels[..., 0:1] - boxes_labels[..., 2:3] / 2
        box2_y1 = boxes_labels[..., 1:2] - boxes_labels[..., 3:4] / 2
        box2_x2 = boxes_labels[..., 0:1] + boxes_labels[..., 2:3] / 2
        box2_y2 = boxes_labels[..., 1:2] + boxes_labels[..., 3:4] / 2

    if box_format == "corners":
        # box_1
        box1_x1 = boxes_preds[..., 0:1]
        box1_y1 = boxes_preds[..., 1:2]
        box1_x2 = boxes_preds[..., 2:3]
        box1_y2 = boxes_preds[..., 3:4]  # (N, 1)
        # box_2
        box2_x1 = boxes_labels[..., 0:1]
        box2_y1 = boxes_labels[..., 1:2]
        box2_x2 = boxes_labels[..., 2:3]
        box2_y2 = boxes_labels[..., 3:4]

    x1 = torch.max(box1_x1, box2_x1)
    y1 = torch.max(box1_y1, box2_y1)
    x2 = torch.min(box1_x2, box2_x2)
    y2 = torch.min(box1_y2, box2_y2)

    # 박스의 좌표를 비교해 겹치는 영역을 구해야한다.
    # clamp(0) : 영역이 음수가 되지 않도록 해 영역이 겹치치 않는 경우를 처리.
    intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)
    box1_area = abs((box1_x2 - box1_x1) * (box1_y2 - box1_y1))
    box2_area = abs((box2_x2 - box2_x1) * (box2_y2 - box2_y1))

    return intersection / (box1_area + box2_area - intersection + 1e-6)

In [148]:
with torch.no_grad():
    for batch_idx, (x, y) in enumerate(train_loader):
        # dataset에서 return한 image, label_matrix값.
        x, y = x.to(DEVICE), y.to(DEVICE)
        out = model(x)

In [149]:
out = out.reshape(-1, 7, 7, 4 + 2 * 5)
a = intersection_over_union(out, y, box_format="midpoint")

In [150]:
out

tensor([[[[-3.7014e-02, -2.5416e-02,  7.0422e-02,  ...,  2.5490e-01,
            9.9237e-02,  1.2393e-01],
          [ 2.7664e-01,  7.9222e-02,  7.4289e-02,  ..., -2.0881e-01,
           -1.0846e-01,  3.3345e-01],
          [ 1.2530e-01,  7.7614e-02, -2.0971e-02,  ...,  2.6625e-01,
            4.3843e-02, -2.8554e-01],
          ...,
          [ 4.2469e-01,  2.6156e-01, -1.6536e-01,  ...,  1.7936e-01,
            5.7187e-02,  5.7697e-02],
          [-1.4052e-01, -1.1648e-01,  3.3048e-01,  ..., -2.4145e-01,
           -4.4412e-01,  3.0446e-01],
          [ 2.5341e-02, -1.2086e-02, -2.8258e-02,  ..., -1.5466e-01,
           -3.2301e-01,  1.1946e-01]],

         [[ 1.6862e-01, -1.6228e-01,  3.8125e-01,  ..., -3.1161e-01,
           -1.2866e-01,  3.6836e-03],
          [-9.6899e-02,  2.6513e-01,  1.3100e-01,  ..., -3.9063e-02,
            4.4585e-01,  1.0860e-02],
          [-3.4279e-01, -3.6383e-01, -1.0719e-01,  ..., -6.5599e-01,
            1.6501e-01,  4.4188e-01],
          ...,
     

In [155]:
# NMS : 객체 탐지 결과에서 겹치는 예측 박스를 제거해 정확한 예측 결과 얻는데 사용한다.
def non_max_suppression(bboxes, iou_threshold, threshold, box_format="corners"):
    """
    Does Non Max Suppression given bboxes

    Parameters:
        bboxes (list) : 객체 탐지 결과. 예측된 bbox 정보를 포함한 리스트이다.
        shape = (클래스 예측, 확률점수, x1, y1, x2, y2)

        iou_threshold (float) : IoU 임계값. 예측된 bbox들이 겹치는 정도를 평가하는데 사용된다.
        IoU를 넘는 경우, 두 bbox 중 하나는 삭제
        
        threshold (float) : bbox 확률 점수가 이 임계값보다 작은 경우, 해당 bbox는 삭제.
        
        box_format (str): bbox 포맷을 나타내는 문자열.
        midpoint = 중심 좌표와 width, height -> bbox 나타내는 것을 의미.
        corners = 좌측 상단과 우측 하단의 좌표로 bbox 나타냄.

    Returns:
        list: bboxes after performing NMS given a specific IoU threshold
    """

    assert type(bboxes) == list
    
    # 확률 점수가 threshold보다 미만이면 제외.
    bboxes = [box for box in bboxes if box[1] > threshold]
    # 내림차순을 정렬
    bboxes = sorted(bboxes, key=lambda x: x[1], reverse=True)
    bboxes_after_nms = []
    
    while bboxes:
        chosen_box = bboxes.pop(0)
        
        # 남아 있는 bbox 중에서 가장 확률이 높은 box선택.
        # 이 box와 IoU가 iou_threshold를 초과하는 다른 box들 제거하고 결과를 반환한다.
        bboxes = [
            box
            for box in bboxes
            if box[0] != chosen_box[0]
            or intersection_over_union(
                torch.tensor(chosen_box[2:]),
                torch.tensor(box[2:]),
                box_format=box_format,
            )
            < iou_threshold
        ]

        bboxes_after_nms.append(chosen_box)

    return bboxes_after_nms

In [23]:
def mean_average_precision(
    pred_boxes, true_boxes, iou_threshold=0.5, box_format="midpoint", num_classes=20
):
    """
    Calculates mean average precision 

    Parameters:
        pred_boxes (list): list of lists containing all bboxes with each bboxes
        specified as [train_idx, class_prediction, prob_score, x1, y1, x2, y2]
        true_boxes (list): Similar as pred_boxes except all the correct ones 
        iou_threshold (float): threshold where predicted bboxes is correct
        box_format (str): "midpoint" or "corners" used to specify bboxes
        num_classes (int): number of classes

    Returns:
        float: mAP value across all classes given a specific IoU threshold 
    """

    # list storing all AP for respective classes
    average_precisions = []

    # used for numerical stability later on
    epsilon = 1e-6

    for c in range(num_classes):
        detections = []
        ground_truths = []

        # Go through all predictions and targets,
        # and only add the ones that belong to the
        # current class c
        for detection in pred_boxes:
            if detection[1] == c:
                detections.append(detection)

        for true_box in true_boxes:
            if true_box[1] == c:
                ground_truths.append(true_box)

        # find the amount of bboxes for each training example
        # Counter here finds how many ground truth bboxes we get
        # for each training example, so let's say img 0 has 3,
        # img 1 has 5 then we will obtain a dictionary with:
        # amount_bboxes = {0:3, 1:5}
        amount_bboxes = Counter([gt[0] for gt in ground_truths])

        # We then go through each key, val in this dictionary
        # and convert to the following (w.r.t same example):
        # ammount_bboxes = {0:torch.tensor[0,0,0], 1:torch.tensor[0,0,0,0,0]}
        for key, val in amount_bboxes.items():
            amount_bboxes[key] = torch.zeros(val)

        # sort by box probabilities which is index 2
        detections.sort(key=lambda x: x[2], reverse=True)
        TP = torch.zeros((len(detections)))
        FP = torch.zeros((len(detections)))
        total_true_bboxes = len(ground_truths)
        
        # If none exists for this class then we can safely skip
        if total_true_bboxes == 0:
            continue

        for detection_idx, detection in enumerate(detections):
            # Only take out the ground_truths that have the same
            # training idx as detection
            ground_truth_img = [
                bbox for bbox in ground_truths if bbox[0] == detection[0]
            ]

            num_gts = len(ground_truth_img)
            best_iou = 0

            for idx, gt in enumerate(ground_truth_img):
                iou = intersection_over_union(
                    torch.tensor(detection[3:]),
                    torch.tensor(gt[3:]),
                    box_format=box_format,
                )

                if iou > best_iou:
                    best_iou = iou
                    best_gt_idx = idx

            if best_iou > iou_threshold:
                # only detect ground truth detection once
                if amount_bboxes[detection[0]][best_gt_idx] == 0:
                    # true positive and add this bounding box to seen
                    TP[detection_idx] = 1
                    amount_bboxes[detection[0]][best_gt_idx] = 1
                else:
                    FP[detection_idx] = 1

            # if IOU is lower then the detection is a false positive
            else:
                FP[detection_idx] = 1

        TP_cumsum = torch.cumsum(TP, dim=0)
        FP_cumsum = torch.cumsum(FP, dim=0)
        recalls = TP_cumsum / (total_true_bboxes + epsilon)
        precisions = torch.divide(TP_cumsum, (TP_cumsum + FP_cumsum + epsilon))
        precisions = torch.cat((torch.tensor([1]), precisions))
        recalls = torch.cat((torch.tensor([0]), recalls))
        # torch.trapz for numerical integration
        average_precisions.append(torch.trapz(precisions, recalls))

    return sum(average_precisions) / len(average_precisions)

In [24]:
def plot_image(image, boxes):
    """Plots predicted bounding boxes on the image"""
    im = np.array(image)
    height, width, _ = im.shape

    # Create figure and axes
    fig, ax = plt.subplots(1)
    # Display the image
    ax.imshow(im)

    # box[0] is x midpoint, box[2] is width
    # box[1] is y midpoint, box[3] is height

    # Create a Rectangle potch
    for box in boxes:
        box = box[2:]
        assert len(box) == 4, "Got more values than in x, y, w, h, in a box!"
        upper_left_x = box[0] - box[2] / 2
        upper_left_y = box[1] - box[3] / 2
        rect = patches.Rectangle(
            (upper_left_x * width, upper_left_y * height),
            box[2] * width,
            box[3] * height,
            linewidth=1,
            edgecolor="r",
            facecolor="none",
        )
        # Add the patch to the Axes
        ax.add_patch(rect)

    plt.show()

In [25]:
def get_bboxes(
    loader,
    model,
    iou_threshold,
    threshold,
    
    pred_format="cells",
    box_format="midpoint",
    device="cuda",
):
    all_pred_boxes = []
    all_true_boxes = []

    # make sure model is in eval before get bboxes
    model.eval()
    train_idx = 0
    
    for batch_idx, (x, labels) in enumerate(loader):
        x = x.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            predictions = model(x)

        batch_size = x.shape[0]
        true_bboxes = cellboxes_to_boxes(labels)
        bboxes = cellboxes_to_boxes(predictions)

        for idx in range(batch_size):
            nms_boxes = non_max_suppression(
                bboxes[idx],
                iou_threshold=iou_threshold,
                threshold=threshold,
                box_format=box_format,
            )

            #if batch_idx == 0 and idx == 0:
            #    plot_image(x[idx].permute(1,2,0).to("cpu"), nms_boxes)
            #    print(nms_boxes)
            for nms_box in nms_boxes:
                all_pred_boxes.append([train_idx] + nms_box)
                print('all_pred_boxes : ', all_pred_boxes)
            for box in true_bboxes[idx]:
                # many will get converted to 0 pred
                if box[1] > threshold:
                    all_true_boxes.append([train_idx] + box)
                    
            train_idx += 1

    model.train()
    return all_pred_boxes, all_true_boxes

In [153]:
def convert_cellboxes(predictions, S=7):
    """
    Converts bounding boxes output from Yolo with
    an image split size of S into entire image ratios
    rather than relative to cell ratios. Tried to do this
    vectorized, but this resulted in quite difficult to read
    code... Use as a black box? Or implement a more intuitive,
    using 2 for loops iterating range(S) and convert them one
    by one, resulting in a slower but more readable implementation.
    """
    B=2
    C=4
    
    predictions = predictions.to("cpu")
    batch_size = predictions.shape[0]
    predictions = predictions.reshape(batch_size, 7, 7, 14)
    
    bboxes1 = predictions[..., (C+1):(C+5)]
    bboxes2 = predictions[..., (C+6):(C+10)]
    scores = torch.cat(
        (predictions[..., C].unsqueeze(0), predictions[..., (C+5)].unsqueeze(0)), dim=0
    )
    
    best_box = scores.argmax(0).unsqueeze(-1)
    best_boxes = bboxes1 * (1 - best_box) + best_box * bboxes2
    cell_indices = torch.arange(7).repeat(batch_size, 7, 1).unsqueeze(-1)
    
    x = 1 / S * (best_boxes[..., :1] + cell_indices)
    y = 1 / S * (best_boxes[..., 1:2] + cell_indices.permute(0, 2, 1, 3))
    w_y = 1 / S * best_boxes[..., 2:4]
    
    converted_bboxes = torch.cat((x, y, w_y), dim=-1)
    predicted_class = predictions[..., :C].argmax(-1).unsqueeze(-1)
    best_confidence = torch.max(predictions[..., C], predictions[..., (C+5)]).unsqueeze(
        -1
    )
    converted_preds = torch.cat(
        (predicted_class, best_confidence, converted_bboxes), dim=-1
    )

    return converted_preds


def cellboxes_to_boxes(out, S=7):
    converted_pred = convert_cellboxes(out).reshape(out.shape[0], S * S, -1)
    converted_pred[..., 0] = converted_pred[..., 0].long()
    all_bboxes = []
    
    for ex_idx in range(out.shape[0]):
        bboxes = []

        for bbox_idx in range(S * S):
            bboxes.append([x.item() for x in converted_pred[ex_idx, bbox_idx, :]])
        all_bboxes.append(bboxes)

    return all_bboxes

In [27]:
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    print("=> Saving checkpoint")
    torch.save(state, filename)


def load_checkpoint(checkpoint, model, optimizer):
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])