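YOLO 的網路輸出是一個 7x7x30 的 tensor：影像被切成 7x7 個網格，每個網格輸出 30 個值，依序為 20 個類別機率、2 個 Bounding Box 的信心度，以及 2 個 Bounding Box 的 (x, y, w, h) 座標。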

下面將藉由程式碼實現損失函數的計算，透過損失函數衡量經由 YOLO 模型辨識後的結果和標記檔的差距有多遠。
今天的範例，採用的是 YOLO V1 tiny 的網絡

資料來源: https://github.com/solaris33/dl_cv_tensorflow_10weeks/blob/master/week10/tensorflow-yolo/yolo/net/yolo_net.py

In [None]:
# 指定 Google Drive 雲端硬碟的根目錄 drive
from google.colab import drive
drive.mount('/content/drive')

import os
import sys
import subprocess

# 此處為 Google drive 中的文件路徑
path = "/content/drive/.../
os.chdir(path)

!ls

# We'll need to update our path to import from Drive.
sys.path.append(os.path.join(path, "yolo/net"))

if not os.path.exists("models/pretrain/yolo_tiny.ckpt"):
  # 下載 YOLO Tiny 的網路權重
  print("Model doesn't exist, downloading...")
  os.system("wget https://drive.google.com/file/d/0B-yiAeTLLamRekxqVE01Yi1RRlk/view?usp=sharing")

else:
  print("Model exist")

# Now we can import the library and use the function.
from yolo_tiny_net import YoloTinyNet

# 確保 colob 中使用的 tensorflow 是 1.x 版本而不是 tensorflow 2
%tensorflow_version 1.x

import tensorflow as tf 
import cv2
import numpy as np
import matplotlib.pyplot as plt
import keras

In [None]:
# Names of the 20 object categories; a class index is an index into this list
classes_name = [
    "aeroplane", "bicycle", "bird", "boat", "bottle",
    "bus", "car", "cat", "chair", "cow",
    "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor",
]

# Start from an empty default graph so that re-running the notebook does not
# keep nodes left over from an earlier run
tf.reset_default_graph()

讀入資料集

In [None]:
# 下載圖片範例
# 如果已經下載過，可以註解掉
!wget https://github.com/pjreddie/darknet/blob/master/data/dog.jpg?raw=true -O dog.jpg
img = cv2.imread("image/dog.jpg")

print(img.shape)
h, w, _ = img.shape

def show(img):
    """Display an OpenCV image with matplotlib.

    The image is converted from BGR to RGB channel order before it is handed
    to matplotlib.
    """
    rgb_image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    plt.imshow(rgb_image)
    plt.show()

In [None]:
# Annotated boxes as (x1, y1, x2, y2) corner coordinates in pixels of the original image
boxes = np.array(
    [
        [128, 224, 314, 537],
        [475, 85, 689, 170],
        [162, 119, 565, 441],
    ],
    dtype=float,
)

# Normalise by the image resolution: x columns by the width, y columns by the height
boxes[:, 0::2] /= img.shape[1]
boxes[:, 1::2] /= img.shape[0]

# Draw the boxes on a copy so the loaded image itself stays untouched
img_show = img.copy()
for left, top, right, bottom in boxes:
    top_left = (int(left * w), int(top * h))
    bottom_right = (int(right * w), int(bottom * h))
    cv2.rectangle(img_show, top_left, bottom_right, (0, 255, 0), 2)

show(img_show)

In [None]:
# Simulated annotation for the picture.
# The network input is 448 x 448, so the picture is resized to that size and
# the annotated boxes are rescaled by the same factors.
# The factors come from the loaded image instead of a hard-coded 576 x 768,
# so the boxes stay aligned with whatever resolution was actually read.
net_size = 448
src_h, src_w = img.shape[:2]
dy = net_size / src_h
dx = net_size / src_w

# (x1, y1, x2, y2) in pixels of the original image, scaled column-wise
boxes = np.array(
    [
        [128, 224, 314, 537],
        [475, 85, 689, 170],
        [162, 119, 565, 441],
    ],
    dtype=float,
) * [dx, dy, dx, dy]

resized_img = cv2.resize(img, (net_size, net_size))
img_show = resized_img.copy()
for x1, y1, x2, y2 in boxes:
    cv2.rectangle(img_show, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2)

show(img_show)

# Print the rescaled boxes so they can be checked against the drawn picture
for box in boxes:
    print(box)

In [None]:
# Convert the annotation into the layout the loss code reads:
# (xmin, ymin, xmax, ymax) --> (x_center, y_center, w, h, class), in 448 x 448 pixels
labels = np.zeros((3, 5))

top_left = boxes[:, 0:2]
bottom_right = boxes[:, 2:4]

# Columns 0-1: box centre; columns 2-3: box width and height
labels[:, 0:2] = (top_left + bottom_right) / 2
labels[:, 2:4] = bottom_right - top_left

# Column 4: class index into classes_name (6 = car, 1 = bicycle, 11 = dog).
# NOTE(review): confirm that this order matches the order of the rows in `boxes`.
labels[:, 4] = [6, 1, 11]

print(labels)

In [None]:
# Turn the label table into a float32 tensor of shape (3, 5)
labels = tf.cast(tf.reshape(labels, (3, 5)), tf.float32)

print(labels)

In [None]:
# Decode the model output
def process_predicts(predicts):
    """Pick the highest-scoring detection out of one network output.

    Args:
        predicts: array indexed as [image, row, col, channel]; only image 0 is
            read. Channels 0-19 are class scores, 20-21 the confidences of
            the two boxes of a cell, and 22 onwards their (x, y, w, h) values.

    Returns:
        (x_min, y_min, x_max, y_max, class_num): corners of the best box in
        pixels of the 448 x 448 input, and the index of its class.
    """
    output = predicts[0]

    class_scores = np.reshape(output[:, :, 0:20], (7, 7, 1, 20))
    box_confidences = np.reshape(output[:, :, 20:22], (7, 7, 2, 1))
    box_values = np.reshape(output[:, :, 22:], (7, 7, 2, 4))

    # Broadcast product p(class_j | object) * P(object): one score per
    # (row, col, box, class) combination
    scores = box_confidences * class_scores

    # Position of the single largest score
    row, col, box, class_num = np.unravel_index(np.argmax(scores), scores.shape)

    x_center, y_center, w, h = box_values[row, col, box, :]

    # x and y are offsets inside the cell: add the cell's column / row index
    # and multiply by the cell size in pixels
    x_center = (col + x_center) * (448/7.0)
    y_center = (row + y_center) * (448/7.0)

    # Width and height are scaled by the full image size
    w = w * 448
    h = h * 448

    x_min = x_center - w / 2
    y_min = y_center - h / 2

    return x_min, y_min, x_min + w, y_min + h, class_num

設定 YoloTinyNet，並產生一個預測結果

In [None]:
# Settings handed to the network constructor
common_params = {
    'image_size': 448,
    'num_classes': 20,
    'batch_size': 1,
}
net_params = {
    'cell_size': 7,
    'boxes_per_cell': 2,
    'weight_decay': 0.0005,
    'class_scale': 2,
    'object_scale': 1,
    'noobject_scale': 0.5,
    'coord_scale': 5,
}

net = YoloTinyNet(common_params, net_params, test=False)

# Values are supplied when sess.run() is called: each placeholder is an input
# of the graph, and feed_dict maps it to the data to use for that run.
image = tf.placeholder(tf.float32, (1, 448, 448, 3))
predicts = net.inference(image)

In [None]:
# A Session is the TensorFlow object that executes the graph and returns results
sess = tf.Session()

# Prepare the network input: BGR -> RGB, float32, pixel values 0..255 mapped
# to -1..1, and a leading batch dimension of 1
img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB)
img = img.astype(np.float32)

img = img / 255 * 2 - 1
img = np.reshape(img, (1, 448, 448, 3))

saver = tf.train.Saver(net.trainable_collection)

# Load the pretrained weights into the session
saver.restore(sess, 'models/pretrain/yolo_tiny.ckpt')

# Run the prediction. The result is stored as np_predict because the cells
# that decode the prediction and compute the loss read it under that name.
np_predict = sess.run(predicts, feed_dict={image: img})

print(np_predict.shape)
print(np_predict)

預測出來產生的結果檔

In [None]:
# Decode the strongest detection from the raw network output
xmin, ymin, xmax, ymax, class_num = process_predicts(np_predict)

print("output：")
for value in (xmin, ymin, xmax, ymax, class_num):
    print(value)

class_name = classes_name[class_num]
print(class_name)

# Draw the box and its class name on the resized picture (modified in place)
top_left = (int(xmin), int(ymin))
bottom_right = (int(xmax), int(ymax))
box_color = (0, 0, 255)
cv2.rectangle(resized_img, top_left, bottom_right, box_color)
# cv2.FONT_HERSHEY_DUPLEX is the named constant for font id 2
cv2.putText(resized_img, class_name, top_left, cv2.FONT_HERSHEY_DUPLEX, 1.5, box_color)

# Save the annotated picture
cv2.imwrite('dog_out.jpg', resized_img)



接下來我們透過三個函數來計算一張圖片經由 YOLO 模型辨識後的結果和標記檔的差距有多遠。

1. iou --> 計算兩個 Bounding Box 的 IoU 值
2. body1 --> 轉換過後的標記框，然後和預測框計算損失函數
3. loss --> 計算每一個標記框和所有預測框的損失

損失函數（loss function）

In [None]:
# Parameter settings

# Image and grid geometry
image_size = 448
cell_size = 7
boxes_per_cell = 2
num_classes = 20
batch_size = 1

weight_decay = 0.0005

# Weights applied to the individual loss terms
coord_scale = 5.0
class_scale = 2.0
object_scale = 1.0
noobject_scale = 0.5

In [None]:
def iou(boxes1, boxes2):
    """Compute the IoU between every predicted box and one ground-truth box.

    Args:
        boxes1: predicted boxes, 4-D tensor
            [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL, 4] holding
            (x_center, y_center, w, h) on the last axis.
        boxes2: ground-truth box, 1-D tensor (x_center, y_center, w, h).

    Returns:
        3-D tensor [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL] of IoU values.
    """
    # Centre/size -> corner form (left, top, right, bottom), corners on the last axis
    pred_cx = boxes1[:, :, :, 0]
    pred_cy = boxes1[:, :, :, 1]
    pred_half_w = boxes1[:, :, :, 2] / 2
    pred_half_h = boxes1[:, :, :, 3] / 2
    pred = tf.stack(
        [pred_cx - pred_half_w, pred_cy - pred_half_h,
         pred_cx + pred_half_w, pred_cy + pred_half_h],
        axis=-1,
    )

    truth = tf.stack([
        boxes2[0] - boxes2[2] / 2,
        boxes2[1] - boxes2[3] / 2,
        boxes2[0] + boxes2[2] / 2,
        boxes2[1] + boxes2[3] / 2,
    ])

    # Corners of the overlap rectangle
    overlap_top_left = tf.maximum(pred[:, :, :, 0:2], truth[0:2])
    overlap_bottom_right = tf.minimum(pred[:, :, :, 2:], truth[2:])

    overlap = overlap_bottom_right - overlap_top_left
    overlap_w = overlap[:, :, :, 0]
    overlap_h = overlap[:, :, :, 1]

    # The boxes only intersect where both the overlap width and height are positive
    has_overlap = tf.cast(overlap_w > 0, tf.float32) * tf.cast(overlap_h > 0, tf.float32)
    inter_area = has_overlap * (overlap_w * overlap_h)

    # Union = area of both boxes minus the part counted twice
    pred_area = (pred[:, :, :, 2] - pred[:, :, :, 0]) * (pred[:, :, :, 3] - pred[:, :, :, 1])
    truth_area = (truth[2] - truth[0]) * (truth[3] - truth[1])

    # The small constant keeps the division defined when the union is zero
    return inter_area / (pred_area + truth_area - inter_area + 1e-6)

In [None]:
def cond1(num, object_num, loss, predict, label, nilboy):
    """Return whether index ``num`` is still below ``object_num``.

    Only ``num`` and ``object_num`` are read; the remaining parameters are
    accepted but unused.
    """
    still_has_objects = num < object_num
    return still_has_objects

In [None]:
def body1(num, object_num, loss, predict, labels, nilboy):
    """Add the loss terms of one ground-truth box to the running totals.

    Builds the class, object-confidence, no-object-confidence and coordinate
    loss tensors between label row ``num`` and every predicted box, prints
    their values using a temporary session, and returns the updated state.

    Args:
        num: index of the label row to process; it is evaluated with
            ``sess1.run`` below, so it has to be a fetchable tensor.
        object_num: returned unchanged; not read here.
        loss: sequence of four running totals in the order
            (class, object, no-object, coordinate).
        predict: network output for one image, sliced below as
            [cell_size, cell_size, num_classes + 5 * boxes_per_cell]:
            class scores, then box confidences, then box coordinates.
        labels: [max_objects, 5] rows of (x_center, y_center, w, h, class).
            The coordinates are divided by image_size / cell_size to obtain
            grid positions, so they are presumably pixels of the
            image_size x image_size input -- TODO confirm with the caller.
        nilboy: the incoming value is not read; it is replaced by the
            responsibility mask ``I`` in the returned tuple.

    Returns:
        Tuple (num + 1, object_num, list of the four updated totals, predict,
        labels, I).
    """
    # Row `num` of the label table, flattened to 1-D
    label = labels[num:num + 1, :]
    label = tf.reshape(label, [-1])

    # objects: [cell_size, cell_size] mask of every grid cell the label box
    # covers (not only the cell holding its centre).
    # Box edges expressed in grid-cell units:
    min_x = (label[0] - label[2] / 2) / (image_size / cell_size)
    max_x = (label[0] + label[2] / 2) / (image_size / cell_size)

    min_y = (label[1] - label[3] / 2) / (image_size / cell_size)
    max_y = (label[1] + label[3] / 2) / (image_size / cell_size)
    
    # Round the lower edges down to whole cells ...
    min_x = tf.floor(min_x)
    min_y = tf.floor(min_y)
    
    # ... and the upper edges up to whole cells
    max_x = tf.ceil(max_x)
    max_y = tf.ceil(max_y)

    # Block of ones as large as the covered cell range (rows, cols) ...
    temp = tf.cast(tf.stack([max_y - min_y, max_x - min_x]), dtype=tf.int32)
    objects = tf.ones(temp, tf.float32)

    # ... zero-padded on each side so the result spans the whole grid
    temp = tf.cast(tf.stack([min_y, cell_size - max_y, min_x, cell_size - max_x]), tf.int32)
    temp = tf.reshape(temp, (2, 2))
    objects = tf.pad(objects, temp, "CONSTANT")

    # response: [cell_size, cell_size] mask that is 1 only in the grid cell
    # containing the centre of the label box.
    center_x = label[0] / (image_size / cell_size)
    center_x = tf.floor(center_x)

    center_y = label[1] / (image_size / cell_size)
    center_y = tf.floor(center_y)

    response = tf.ones([1, 1], tf.float32)

    # Pad the single 1 out to the full grid at (center_y, center_x)
    temp = tf.cast(tf.stack([center_y, cell_size - center_y - 1, center_x, cell_size -center_x - 1]), tf.int32)
    temp = tf.reshape(temp, (2, 2))
    response = tf.pad(response, temp, "CONSTANT")
    
    # Predicted boxes: everything after the class scores and box confidences
    predict_boxes = predict[:, :, num_classes + boxes_per_cell:]
    
    # One (x, y, w, h) quadruple per box of every cell
    predict_boxes = tf.reshape(predict_boxes, [cell_size, cell_size, boxes_per_cell, 4])

    # Scale x, y by the cell size and w, h by the image size
    predict_boxes = predict_boxes * [image_size / cell_size, image_size / cell_size, image_size, image_size]

    # Pixel position of the top-left corner of every cell (zero for w, h)
    base_boxes = np.zeros([cell_size, cell_size, 4])

    for y in range(cell_size):
      for x in range(cell_size):
        base_boxes[y, x, :] = [image_size / cell_size * x, image_size / cell_size * y, 0, 0]
    # Repeat the cell origins for each box of a cell
    base_boxes = np.tile(np.resize(base_boxes, [cell_size, cell_size, 1, 4]), [1, 1, boxes_per_cell, 1])

    # Adding the cell origin turns the in-cell x, y into image coordinates
    predict_boxes = base_boxes + predict_boxes

    # IoU of every predicted box with the label box
    iou_predict_truth = iou(predict_boxes, label[0:4])
    # C: confidence target, the IoU kept only in the responsible cell
    # [cell_size, cell_size, boxes_per_cell]
    C = iou_predict_truth * tf.reshape(response, [cell_size, cell_size, 1])

    # I: [cell_size, cell_size, boxes_per_cell] mask selecting, inside the
    # responsible cell, the box(es) whose IoU equals the cell maximum.
    I = iou_predict_truth * tf.reshape(response, (cell_size, cell_size, 1))
    
    max_I = tf.reduce_max(I, 2, keepdims=True)

    # Cells outside `response` are all zero and would pass the >= test,
    # hence the second multiplication by `response`
    I = tf.cast((I >= max_I), tf.float32) * tf.reshape(response, (cell_size, cell_size, 1))

    # no_I: complement of I
    no_I = tf.ones_like(I, dtype=tf.float32) - I 


    # Predicted box confidences
    p_C = predict[:, :, num_classes:num_classes + boxes_per_cell]

    # Ground-truth x, y and square roots of w, h (scalars)
    x = label[0]
    y = label[1]

    sqrt_w = tf.sqrt(tf.abs(label[2]))
    sqrt_h = tf.sqrt(tf.abs(label[3]))

    # Predicted x, y and square roots of w, h
    # [cell_size, cell_size, boxes_per_cell]
    p_x = predict_boxes[:, :, :, 0]
    p_y = predict_boxes[:, :, :, 1]

    # w and h are clamped to [0, image_size] before the square root
    p_sqrt_w = tf.sqrt(tf.minimum(image_size * 1.0, tf.maximum(0.0, predict_boxes[:, :, :, 2])))
    p_sqrt_h = tf.sqrt(tf.minimum(image_size * 1.0, tf.maximum(0.0, predict_boxes[:, :, :, 3])))
    
    # One-hot ground-truth class vector [num_classes]
    P = tf.one_hot(tf.cast(label[4], tf.int32), num_classes, dtype=tf.float32)

    # Predicted class scores [cell_size, cell_size, num_classes]
    p_P = predict[:, :, 0:num_classes]

    # Class loss over every cell covered by the box (masked with `objects`,
    # not with `response`)
    class_loss = tf.nn.l2_loss(tf.reshape(objects, (cell_size, cell_size, 1)) * (p_P - P)) * class_scale

    # Confidence loss of the responsible box
    object_loss = tf.nn.l2_loss(I * (p_C - C)) * object_scale

    # Confidence loss of all other boxes
    noobject_loss = tf.nn.l2_loss(no_I * (p_C)) * noobject_scale

    # Coordinate loss of the responsible box: the x, y errors are divided by
    # the cell size inside l2_loss, while the sqrt(w), sqrt(h) terms are
    # divided by image_size after l2_loss
    coord_loss = (tf.nn.l2_loss(I * (p_x - x)/(image_size/cell_size)) +
                 tf.nn.l2_loss(I * (p_y - y)/(image_size/cell_size)) +
                 tf.nn.l2_loss(I * (p_sqrt_w - sqrt_w))/ image_size +
                 tf.nn.l2_loss(I * (p_sqrt_h - sqrt_h))/image_size) * coord_scale

    nilboy = I

    # Evaluate the tensors in a temporary session purely to print them;
    # each value is fetched with its own run call
    with tf.Session() as sess1:
        print("第幾個標記框",sess1.run(num))
        print(sess1.run(label)) 
        print("class_loss =",sess1.run(class_loss))
        print("object_loss=",sess1.run(object_loss))
        print("noobject_loss=",sess1.run(noobject_loss))
        print("coord_loss=",sess1.run(coord_loss))
    return num + 1, object_num, [loss[0] + class_loss, loss[1] + object_loss, loss[2] + noobject_loss, loss[3] + coord_loss], predict, labels, nilboy

In [None]:
def loss(predicts, labels, objects_num):
    """Sum the four loss terms over every label row of every image.

    Each label row is handed to ``body1`` in turn. The number of rows is read
    from the static shape of ``labels`` instead of being fixed at three.

    Args:
        predicts: [batch_size, cell_size, cell_size,
            num_classes + 5 * boxes_per_cell] network output
            (class scores, box confidences, box coordinates).
        labels: [batch_size, max_objects, 5] rows of
            (x_center, y_center, w, h, class). The second dimension must be
            statically known. Every row is processed, so the table is
            expected to contain real boxes only -- padded rows would be
            counted as boxes.
        objects_num: 1-D tensor [batch_size]; its entries are forwarded to
            ``body1``, which does not read them.

    Returns:
        List of four scalar tensors:
        [class_loss, object_loss, noobject_loss, coord_loss].
    """
    # Running totals in the order body1 expects:
    # class / object confidence / no-object confidence / coordinates
    totals = [tf.constant(0, tf.float32) for _ in range(4)]

    rows_per_image = int(labels.shape[1])

    # The loss is accumulated image by image
    for i in range(batch_size):
        predict = predicts[i, :, :, :]
        label = labels[i, :, :]
        object_num = objects_num[i]
        nilboy = tf.ones([cell_size, cell_size, boxes_per_cell])

        # body1 returns the totals with this row's terms added (element 2)
        for row in range(rows_per_image):
            results = body1(tf.constant(row), object_num, totals, predict, label, nilboy)
            totals = results[2]

    return totals

### 計算標記框和預測框的loss function

In [None]:
# Shape the prediction and the labels the way loss() indexes them:
# one image in the batch, three label rows
predicts = tf.reshape(np_predict, (1, 7, 7, 30))
labels = tf.reshape(labels, (1, 3, 5))
output_loss = loss(predicts, labels, tf.constant(3, shape=[1]))

print("預測結果和標記框的損失量")

# The with-block closes its own session on exit. It uses a separate name so
# that `sess` keeps referring to the inference session opened earlier.
with tf.Session() as loss_sess:
    print(loss_sess.run(output_loss))

# Release the inference session
sess.close()