# 모델 레이어 만들기

In [None]:
from tensorflow.keras import datasets, layers, models, activations, losses, optimizers, metrics

def create_yolo():
    """Assemble the YOLO-style detection network as a Keras ``Sequential`` model.

    The backbone is a series of ``Convolution2D`` layers (all with
    ``padding='same'``), each followed by ``LeakyReLU(alpha=0.1)``, with
    2x2 / stride-2 max pooling after blocks 1-4.  The head flattens the
    features and applies ``Dense(4096)``, ``Dropout(0.5)``,
    ``Dense(7 * 7 * 30)`` and a reshape to ``(7, 7, 30)``.

    Returns:
        The un-compiled ``Sequential`` model; its first layer declares an
        input shape of ``(448, 448, 3)``.
    """
    # NOTE(review): no activation is applied after Dense(4096) -- confirm this
    # is intended before relying on the head's behavior.
    POOL = None  # marker for a max-pooling step in the plan below

    # Each convolution entry is (filters, kernel_size[, extra keyword arguments]).
    backbone = [
        # Block 1 (this convolution also declares the model input shape)
        (64, 7, {'strides': (2, 2), 'input_shape': (448, 448, 3)}),
        POOL,
        # Block 2
        (192, 3),
        POOL,
        # Block 3
        (128, 1), (256, 3), (256, 1), (512, 3),
        POOL,
        # Block 4
        (256, 1), (512, 3),
        (256, 1), (512, 3),
        (256, 1), (512, 3),
        (256, 1), (512, 3),
        (512, 1), (1024, 3),
        POOL,
        # Block 5 (ends with a stride-2 convolution instead of pooling)
        (512, 1), (1024, 3),
        (512, 1), (1024, 3),
        (1024, 3),
        (1024, 3, {'strides': (2, 2)}),
        # Block 6
        (1024, 3), (1024, 3),
    ]

    net = models.Sequential()

    for entry in backbone:
        if entry is POOL:
            net.add(layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same'))
            continue

        filters, kernel, *extra = entry
        options = extra[0] if extra else {}
        net.add(layers.Convolution2D(filters, (kernel, kernel), padding='same', **options))
        net.add(layers.LeakyReLU(alpha=0.1))

    # Fully connected head producing one 30-value vector per 7x7 grid cell.
    net.add(layers.Flatten())
    net.add(layers.Dense(4096))
    net.add(layers.Dropout(0.5))
    net.add(layers.Dense(7 * 7 * 30))
    net.add(layers.Reshape(target_shape=(7, 7, 30)))

    return net

# Pascal VOC 데이터셋 로드

In [None]:
import os
import glob
import cv2
import numpy
import xml.etree.ElementTree as ET
import tqdm

# Pascal VOC category names mapped to integer class ids; ids follow the
# listing order below (0 .. 19).
classes_num = dict(zip(
    (
        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
        'bus', 'car', 'cat', 'chair', 'cow',
        'diningtable', 'dog', 'horse', 'motorbike', 'person',
        'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
    ),
    range(20),
))

# Category names as a list; a name's position equals its class id.
classes = [*classes_num]


def voc_load_data(img_dir_path, annotation_path, batch=10):
    """Load up to ``batch`` Pascal VOC images together with their YOLO labels.

    Only the first ``batch`` ``*.jpg`` files returned by ``glob`` are read
    (the original nested loop returned after its first batch, so this is the
    established behavior).  Every image is converted to RGB, resized to
    448x448 and scaled to ``[0, 1]``.  Its annotation, looked up as
    ``<annotation_path>/<image stem>.xml``, is encoded as a ``(7, 7, 25)``
    grid.

    Args:
        img_dir_path: Directory containing the ``*.jpg`` images.
        annotation_path: Directory containing the matching ``.xml`` files.
        batch: Maximum number of images to load.

    Returns:
        ``(images, labels)`` as NumPy arrays of shape ``(n, 448, 448, 3)`` and
        ``(n, 7, 7, 25)``.  When no image is selected, both arrays are empty
        (``n == 0``) instead of the function returning ``None``.

    Raises:
        OSError: If OpenCV cannot decode one of the image files.
    """
    img_file_list = glob.glob((img_dir_path + "/*.jpg"))[:batch]

    # Nothing to load: hand back correctly shaped empty arrays so callers can
    # always unpack the result.
    if not img_file_list:
        return numpy.zeros((0, 448, 448, 3)), numpy.zeros((0, 7, 7, 25))

    images, labels = [], []
    for img_path in tqdm.tqdm(img_file_list):
        # Read image (cv2.imread signals failure by returning None).
        image = cv2.imread(img_path)
        if image is None:
            raise OSError(f"could not read image file: {img_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image_h, image_w = image.shape[0:2]

        # Resize & normalization
        image = cv2.resize(image, (448, 448))
        images.append(image / 255.0)

        # splitext keeps file names that contain extra dots intact.
        stem = os.path.splitext(os.path.basename(img_path))[0]
        xml_path = annotation_path + f"/{stem}.xml"

        root = ET.parse(xml_path).getroot()
        labels.append(_voc_label_matrix(root, image_w, image_h))

    return numpy.array(images), numpy.array(labels)


def _voc_label_matrix(root, image_w, image_h):
    """Encode the ``object`` elements under ``root`` as a (7, 7, 25) grid."""
    label_matrix = numpy.zeros((7, 7, 25))

    for obj in root.iter('object'):
        # A missing <difficult> tag is treated as "not difficult".
        difficult = obj.findtext('difficult', default='0')
        class_name = obj.find('name').text
        if class_name not in classes or difficult == "1":
            continue

        cls_id = classes.index(class_name)
        xmlbox = obj.find('bndbox')
        # float() first so coordinates written as "12.0" are accepted too.
        tlx, tly = float(xmlbox.find('xmin').text), float(xmlbox.find('ymin').text)
        brx, bry = float(xmlbox.find('xmax').text), float(xmlbox.find('ymax').text)

        # Box center and size as fractions of the original image size.
        center_x = (tlx + brx) / 2 / image_w
        center_y = (tly + bry) / 2 / image_h
        w = (brx - tlx) / image_w
        h = (bry - tly) / image_h

        # Grid cell containing the center; clamped so a center on the image
        # border cannot index outside the 7x7 grid.
        loc_i = min(max(int(7 * center_y), 0), 6)
        loc_j = min(max(int(7 * center_x), 0), 6)
        # Center offset measured from the cell's top-left corner.
        y = 7 * center_y - loc_i
        x = 7 * center_x - loc_j

        # Only the first object assigned to a cell is kept.
        if label_matrix[loc_i, loc_j, 24] == 0:
            # [<----------20---------->|x|y|w|h|pc]
            label_matrix[loc_i, loc_j, cls_id] = 1
            label_matrix[loc_i, loc_j, 20:24] = [x, y, w, h]
            label_matrix[loc_i, loc_j, 24] = 1  # response

    return label_matrix

# 학습코드 구현 - Generator

In [None]:
import os
import glob
import cv2
import numpy
import xml.etree.ElementTree as ET
import tqdm

# Pascal VOC category names mapped to integer class ids; ids follow the
# listing order below (0 .. 19).
classes_num = dict(zip(
    (
        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
        'bus', 'car', 'cat', 'chair', 'cow',
        'diningtable', 'dog', 'horse', 'motorbike', 'person',
        'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
    ),
    range(20),
))

# Category names as a list; a name's position equals its class id.
classes = [*classes_num]


def voc_load_data(img_dir_path, annotation_path, batch=10):
    """Load up to ``batch`` Pascal VOC images together with their YOLO labels.

    Only the first ``batch`` ``*.jpg`` files returned by ``glob`` are read
    (the original nested loop returned after its first batch, so this is the
    established behavior).  Every image is converted to RGB, resized to
    448x448 and scaled to ``[0, 1]``.  Its annotation, looked up as
    ``<annotation_path>/<image stem>.xml``, is encoded as a ``(7, 7, 25)``
    grid.

    Args:
        img_dir_path: Directory containing the ``*.jpg`` images.
        annotation_path: Directory containing the matching ``.xml`` files.
        batch: Maximum number of images to load.

    Returns:
        ``(images, labels)`` as NumPy arrays of shape ``(n, 448, 448, 3)`` and
        ``(n, 7, 7, 25)``.  When no image is selected, both arrays are empty
        (``n == 0``) instead of the function returning ``None``.

    Raises:
        OSError: If OpenCV cannot decode one of the image files.
    """
    img_file_list = glob.glob((img_dir_path + "/*.jpg"))[:batch]

    # Nothing to load: hand back correctly shaped empty arrays so callers can
    # always unpack the result.
    if not img_file_list:
        return numpy.zeros((0, 448, 448, 3)), numpy.zeros((0, 7, 7, 25))

    images, labels = [], []
    for img_path in tqdm.tqdm(img_file_list):
        # Read image (cv2.imread signals failure by returning None).
        image = cv2.imread(img_path)
        if image is None:
            raise OSError(f"could not read image file: {img_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image_h, image_w = image.shape[0:2]

        # Resize & normalization
        image = cv2.resize(image, (448, 448))
        images.append(image / 255.0)

        # splitext keeps file names that contain extra dots intact.
        stem = os.path.splitext(os.path.basename(img_path))[0]
        xml_path = annotation_path + f"/{stem}.xml"

        root = ET.parse(xml_path).getroot()
        labels.append(_voc_label_matrix(root, image_w, image_h))

    return numpy.array(images), numpy.array(labels)


def _voc_label_matrix(root, image_w, image_h):
    """Encode the ``object`` elements under ``root`` as a (7, 7, 25) grid."""
    label_matrix = numpy.zeros([7, 7, 25])

    for obj in root.iter('object'):
        # A missing <difficult> tag is treated as "not difficult".
        difficult = obj.findtext('difficult', default='0')
        class_name = obj.find('name').text
        if class_name not in classes or difficult == "1":
            continue

        cls_id = classes.index(class_name)
        xmlbox = obj.find('bndbox')
        # float() first so coordinates written as "12.0" are accepted too.
        tlx, tly = float(xmlbox.find('xmin').text), float(xmlbox.find('ymin').text)
        brx, bry = float(xmlbox.find('xmax').text), float(xmlbox.find('ymax').text)

        # Box center and size as fractions of the original image size.
        center_x = (tlx + brx) / 2 / image_w
        center_y = (tly + bry) / 2 / image_h
        w = (brx - tlx) / image_w
        h = (bry - tly) / image_h

        # Grid cell containing the center; clamped so a center on the image
        # border cannot index outside the 7x7 grid.
        loc_i = min(max(int(7 * center_y), 0), 6)
        loc_j = min(max(int(7 * center_x), 0), 6)
        # Center offset measured from the cell's top-left corner.
        y = 7 * center_y - loc_i
        x = 7 * center_x - loc_j

        # Only the first object assigned to a cell is kept.
        if label_matrix[loc_i, loc_j, 24] == 0:
            # [<----------20---------->|x|y|w|h|pc]
            label_matrix[loc_i, loc_j, cls_id] = 1
            label_matrix[loc_i, loc_j, 20:24] = [x, y, w, h]
            label_matrix[loc_i, loc_j, 24] = 1  # response

    return label_matrix

# 학습코드 구현 - Callback

In [None]:
from tensorflow import keras
import math
import cv2 as cv
import numpy as np
import numpy
import xml.etree.ElementTree as ET
import glob
import os


# Pascal VOC category names mapped to integer class ids; ids follow the
# listing order below (0 .. 19).
classes_num = dict(zip(
    (
        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
        'bus', 'car', 'cat', 'chair', 'cow',
        'diningtable', 'dog', 'horse', 'motorbike', 'person',
        'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
    ),
    range(20),
))

# Category names as a list; a name's position equals its class id.
classes = [*classes_num]


class SequenceData(keras.utils.Sequence):
    """Keras ``Sequence`` yielding batches of VOC images and YOLO grid labels.

    Each item is a tuple ``(X, y)``: ``X`` stacks the resized RGB images of
    one batch scaled to ``[0, 1]``, and ``y`` stacks their ``(7, 7, 25)``
    label grids (20 one-hot class slots, then ``x, y, w, h`` and a flag that
    is 1 for cells containing an object).
    """

    def __init__(self, model, img_dir_path, annotation_path, target_size, batch_size, shuffle=True, use_only=10):
        """Collect the image files and remember the batching settings.

        Args:
            model: Stored on the instance; no method of this class uses it.
            img_dir_path: Directory searched for ``*.jpg`` files.
            annotation_path: Directory holding ``<image stem>.xml`` files.
            target_size: Sequence whose first two entries are handed to
                ``cv.resize`` as the output size.
            batch_size: Number of samples per batch.
            shuffle: Whether the sample order is re-shuffled after each epoch.
            use_only: Only the first ``use_only`` files found are kept.
        """
        # The Keras base class may keep its own state, so initialise it first.
        super().__init__()
        self.model = model
        self.datasets = glob.glob((img_dir_path + "/*.jpg"))[:use_only]
        self.image_path = img_dir_path
        self.label_path = annotation_path
        # NOTE(review): cv.resize interprets this pair as (width, height);
        # for non-square targets confirm the order callers pass in.
        self.image_size = tuple(target_size[0:2])
        self.batch_size = batch_size
        self.indexes = np.arange(len(self.datasets))
        self.shuffle = shuffle

    def __len__(self):
        """Return the number of batches per epoch (the last may be partial)."""
        num_imgs = len(self.datasets)
        return math.ceil(num_imgs / float(self.batch_size))

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``(X, y)``.

        The batch covers ``self.indexes[idx * batch_size:(idx + 1) * batch_size]``,
        e.g. with ``batch_size == 2`` the slices are ``[0:2]``, ``[2:4]``, ...
        """
        batch_indexs = self.indexes[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch = [self.datasets[k] for k in batch_indexs]
        X, y = self.data_generation(batch)
        return X, y

    def on_epoch_end(self):
        """Shuffle the sample order in place when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def read(self, image_path):
        """Load one image and build its label grid.

        Args:
            image_path: Path of the image file; the annotation is looked up
                as ``<label_path>/<image stem>.xml``.

        Returns:
            ``(image, label_matrix)``: the resized RGB image scaled to
            ``[0, 1]`` and a ``(7, 7, 25)`` array.

        Raises:
            OSError: If OpenCV cannot decode the image file.
        """
        # splitext keeps file names that contain extra dots intact.
        stem = os.path.splitext(os.path.basename(image_path))[0]
        xml_path = os.path.join(os.path.abspath(self.label_path), stem + ".xml")

        # cv.imread signals failure by returning None.
        image = cv.imread(image_path)
        if image is None:
            raise OSError(f"could not read image file: {image_path}")
        image = cv.cvtColor(image, cv.COLOR_BGR2RGB)
        image_h, image_w = image.shape[0:2]
        image = cv.resize(image, self.image_size)
        image = image / 255.

        root = ET.parse(xml_path).getroot()
        label_matrix = np.zeros([7, 7, 25])

        for obj in root.iter('object'):
            # A missing <difficult> tag is treated as "not difficult".
            difficult = obj.findtext('difficult', default='0')
            class_name = obj.find('name').text
            if class_name not in classes or difficult == "1":
                continue

            cls_id = classes.index(class_name)
            xmlbox = obj.find('bndbox')
            # float() first so coordinates written as "12.0" are accepted too.
            tlx, tly = float(xmlbox.find('xmin').text), float(xmlbox.find('ymin').text)
            brx, bry = float(xmlbox.find('xmax').text), float(xmlbox.find('ymax').text)

            # Box center and size as fractions of the original image size.
            center_x = (tlx + brx) / 2 / image_w
            center_y = (tly + bry) / 2 / image_h
            w = (brx - tlx) / image_w
            h = (bry - tly) / image_h

            # Grid cell containing the center; clamped so a center on the
            # image border cannot index outside the 7x7 grid.
            loc_i = min(max(int(7 * center_y), 0), 6)
            loc_j = min(max(int(7 * center_x), 0), 6)
            # Center offset measured from the cell's top-left corner.
            y = 7 * center_y - loc_i
            x = 7 * center_x - loc_j

            # Only the first object assigned to a cell is kept.
            if label_matrix[loc_i, loc_j, 24] == 0:
                label_matrix[loc_i, loc_j, cls_id] = 1
                label_matrix[loc_i, loc_j, 20:24] = [x, y, w, h]
                label_matrix[loc_i, loc_j, 24] = 1  # response

        return image, label_matrix

    def data_generation(self, batch_datasets):
        """Read every path in ``batch_datasets`` and stack images and labels.

        Returns:
            ``(X, y)`` NumPy arrays built from the per-sample results of
            :meth:`read`.
        """
        images = []
        labels = []

        for dataset in batch_datasets:
            image, label = self.read(dataset)
            images.append(image)
            labels.append(label)

        X = np.array(images)
        y = np.array(labels)

        return X, y

# 출력 텐서 디코딩1

In [None]:
import numpy


def decode(y, image_w, image_h):
    """Turn the network output into pixel-space bounding boxes.

    Args:
        y: Batched output; only ``y[0]`` is used and it is indexed as
            ``[row, col, channel]`` over a 7x7 grid.  Channels ``20:25`` hold
            the first box and ``25:`` the second, each starting with
            ``x, y, w, h`` where ``x, y`` is the center offset inside the cell.
        image_w: Image width the boxes are scaled to.
        image_h: Image height the boxes are scaled to.

    Returns:
        A list with two entries per grid cell (row-major, first box then
        second box).  Each entry is a copy of the cell's box slice whose first
        four values were rewritten to ``top-left x, top-left y, width,
        height``; any remaining values are left untouched.
    """
    cells = y[0]

    def to_pixels(raw_box, row, col):
        # Work on a copy so the caller's tensor is never modified.
        box = raw_box.copy()

        # Cell-relative center -> absolute center in image coordinates.
        box[0] = (col + box[0]) / 7 * image_w
        box[1] = (row + box[1]) / 7 * image_h
        # Relative size -> size in image coordinates.
        box[2] = box[2] * image_w
        box[3] = box[3] * image_h

        # Center -> top-left corner.
        box[0] -= (box[2] / 2)
        box[1] -= (box[3] / 2)
        return box

    boxes = []
    for row in range(7):
        for col in range(7):
            boxes.append(to_pixels(cells[row, col, 20:25], row, col))
            boxes.append(to_pixels(cells[row, col, 25:], row, col))

    return boxes

# 출력 텐서 디코딩2

In [None]:
import numpy


# Pascal VOC category names mapped to integer class ids; ids follow the
# listing order below (0 .. 19).
classes_num = dict(zip(
    (
        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus',
        'car', 'cat', 'chair', 'cow', 'diningtable', 'dog',
        'horse', 'motorbike', 'person', 'pottedplant', 'sheep',
        'sofa', 'train', 'tvmonitor',
    ),
    range(20),
))

# Category names as a list; a name's position equals its class id.
classes = [*classes_num]



def intersection_over_union(box1, box2):
    """Return the intersection-over-union of two ``[x, y, w, h]`` boxes.

    ``x, y`` is the top-left corner and ``w, h`` the size.  The components may
    be scalars or NumPy arrays that broadcast against each other.

    Args:
        box1: First box, indexable as ``[x, y, w, h]``.
        box2: Second box, indexable as ``[x, y, w, h]``.

    Returns:
        The IoU as a float (or an array when array components were given).
        When the union area is not positive, e.g. for two zero-sized boxes,
        the result is ``0.0`` rather than ``nan``.
    """
    # Top-left and bottom-right corners of the overlap region.
    x1 = numpy.maximum(box1[0], box2[0])
    y1 = numpy.maximum(box1[1], box2[1])
    x2 = numpy.minimum(box1[0] + box1[2], box2[0] + box2[2])
    y2 = numpy.minimum(box1[1] + box1[3], box2[1] + box2[3])

    # Overlap area; negative extents mean the boxes do not overlap.
    intersection = numpy.maximum(x2 - x1, 0) * numpy.maximum(y2 - y1, 0)

    box1_area = box1[2] * box1[3]
    box2_area = box2[2] * box2[3]

    # Union = sum of both areas minus the part counted twice.
    union = numpy.asarray(box1_area + box2_area - intersection, dtype=float)
    intersection = numpy.asarray(intersection, dtype=float)

    # Divide only where the union is positive; everything else stays 0 so
    # degenerate boxes produce neither nan nor a RuntimeWarning.
    iou = numpy.divide(intersection, union,
                       out=numpy.zeros_like(union), where=union > 0)

    # Unwrap 0-d results to a scalar; arrays are returned unchanged.
    return iou[()]


def decode(y, image_shape, class_confidence_threshold=0.4, iou_threshold=0.1, verbose=False):
    """Decode the network output into boxes and class names using NMS.

    Args:
        y: Batched output; only ``y[0]`` is used and it is indexed as
            ``[row, col, channel]`` over a 7x7 grid with 30 channels:
            20 class scores, then ``x, y, w, h, confidence`` for box A
            (channels 20-24) and for box B (channels 25-29).
        image_shape: ``(height, width, ...)`` of the image the boxes are
            scaled to.
        class_confidence_threshold: Class scores below this value are zeroed.
        iou_threshold: For each class, a box whose IoU with a higher-scoring
            kept box exceeds this value is suppressed.
        verbose: When true, print every class score of every box before
            thresholding (1960 lines); off by default.

    Returns:
        ``(boxes, names)``: ``boxes`` is a list of
        ``[top-left x, top-left y, width, height]`` arrays and ``names`` the
        matching class names, one pair for every box whose best class score
        is still positive after suppression.
    """
    boxes, names = [], []

    grid_vector = numpy.asarray(y[0])

    # One column per candidate box (7 * 7 cells * 2 boxes = 98).
    # Rows 0-19: class score * box confidence; rows 20-24: x, y, w, h, confidence.
    nms = numpy.zeros((25, 98))
    for i in range(7):
        for j in range(7):
            # Every cell owns two consecutive columns, so neighbouring cells
            # never overwrite each other.
            first_column = (i * 7 + j) * 2

            # (column offset, first channel) for box A and box B.
            for offset, start in ((0, 20), (1, 25)):
                box_num = first_column + offset
                raw = grid_vector[i, j, start:start + 5]

                # Weight the class scores by the box confidence so that both
                # localisation and classification count.
                nms[:20, box_num] = grid_vector[i, j, 0:20]
                nms[:20, box_num] *= raw[4]
                nms[20:, box_num] = raw

                # Cell-relative center / relative size -> image coordinates.
                nms[20, box_num] = (j + raw[0]) / 7 * image_shape[1]
                nms[21, box_num] = (i + raw[1]) / 7 * image_shape[0]
                nms[22, box_num] = raw[2] * image_shape[1]
                nms[23, box_num] = raw[3] * image_shape[0]

                # Center -> top-left corner.
                nms[20, box_num] -= (nms[22, box_num] / 2)
                nms[21, box_num] -= (nms[23, box_num] / 2)

    if verbose:
        for c in range(20):
            for k in range(0, 98):
                print(f"{classes[c]}에 대한 {k} 번째 박스 신뢰도는 {nms[c, k]}")

    # Per-class non-maximum suppression.
    for class_id in range(nms.shape[0] - 5):
        scores = nms[class_id]  # view: edits below write through to nms

        # Drop everything under the class-confidence threshold.
        scores[scores < class_confidence_threshold] = 0

        # Visit boxes from highest to lowest score.
        candidates = scores.argsort()[::-1]
        for rank, keeper in enumerate(candidates):
            # A box that was thresholded or already suppressed must not
            # suppress other boxes.
            if scores[keeper] <= 0:
                continue

            for other in candidates[rank + 1:]:
                if scores[other] <= 0:
                    continue

                iou = intersection_over_union(nms[20:24, keeper], nms[20:24, other])
                if iou > iou_threshold:
                    scores[other] = 0

    # Keep every box whose best class score survived.
    for box_num in range(nms.shape[1]):
        class_id = numpy.argmax(nms[:20, box_num])
        confidence = nms[class_id, box_num]

        if confidence > 0:
            box = nms[20:24, box_num].copy()
            boxes.append(box)
            names.append(classes[class_id])

    return boxes, names