# YOLO object detection using OpenCV

Based on code by Chieko Natori: https://github.com/ChiekoN/yolov3_opencv

In [None]:
import numpy as np

from matplotlib import pyplot as plt
# use larger fonts for the axis labels and the tick labels of all figures
plt.rcParams.update({
    'axes.labelsize': 18,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
})
%matplotlib inline

import time

In [None]:
# the code below needs at least this library version:
# OpenCV 3.4.2
import cv2

# show the installed version so it can be compared with the requirement above
opencv_version_message = "OpenCV version {}".format(cv2.__version__)
print(opencv_version_message)

In [None]:
# get the category names of the coco dataset used for training

# path of the text file that holds one category name per line
coco_names_file = "yolo/coco.names"

# read the coco object names; the context manager closes the file handle
# right after reading instead of leaving it open until garbage collection
with open(coco_names_file) as names_file:
    LABELS = names_file.read().strip().split("\n")

print('The COCO dataset contains images of the following items: \n', LABELS)

In [None]:
# configure YOLOv3

# locations of the network description and of the weights
yolov3_config_file = "yolo/yolov3.cfg"
yolov3_weight_file = "yolo/yolov3.weights"

# build the YOLO network from the two Darknet files
net = cv2.dnn.readNetFromDarknet(
    yolov3_config_file,
    yolov3_weight_file,
)

In [None]:
# print information about all layers in the YOLO v3 model

# get all layer names in the network
ln = net.getLayerNames()
print("YOLO v3 has {} layers:".format(len(ln)))
print(ln)

# the output layers are those with unconnected output
# getUnconnectedOutLayers() reports 1-based layer ids. Depending on the OpenCV
# release they come as a flat array or as an (N, 1) array, so they are
# flattened first; `ln` is then indexed with plain scalars in both cases.
out_layer_ids = np.asarray(net.getUnconnectedOutLayers()).flatten()
ln_out = [ln[i - 1] for i in out_layer_ids]
print("\nNames of YOLO v3 output layers: \n{}".format(ln_out))

In [None]:
# read image file
from skimage.io import imread

# choose the input image by (un)commenting exactly one of these lines
#image = imread("images/lunch.jpg")
image = imread("images/perth.jpg")
#image = imread("images/crossing.jpg")
#image = imread("images/coffee.jpg")
#image = imread("images/wiener_kaffeehaus_2.jpg")
#image = imread("images/black_and_white.jpg")

# remember the image size -> needed later to rescale the bounding boxes
h, w = image.shape[:2]

# preprocess the image data: pixel values are multiplied by 1/255 and the image
# is resized to 416 x 416 without cropping; RGB is used, which is unusual for opencv
network_input_size = (416, 416)
blob = cv2.dnn.blobFromImage(image, 1 / 255.0, network_input_size, crop=False)
print('Format of blob: {}'.format(blob.shape))

# to show the blob, take its first entry and move the leading axis to the end
blob_to_show = blob[0].transpose(1, 2, 0)
print('Format of blob_to_show: {}'.format(blob_to_show.shape))

# visualize the result of the preprocessing
plt.figure(figsize=(8, 8))
plt.imshow(blob_to_show)
plt.title('image resized', size=16)
plt.show()

In [None]:
##### the actual inference #########
# hand the preprocessed image to the network as its new input
net.setInput(blob)

# measure the time needed for the object detection
start = time.time()
layerOutputs = net.forward(ln_out)
end = time.time()

# report the time spent in the forward pass
elapsed_seconds = end - start
print('YOLO v3 took {:.4f} seconds'.format(elapsed_seconds))

In [None]:
# let's have a look at the output
# Check point
print('The output of YOLO v3 is a', type(layerOutputs), 'consisting of', len(layerOutputs), type(layerOutputs[0]))

print('\nThe size of the numpy arrays is:')
# pair every output layer name with its result array instead of hard-coding
# the indices 0..2, so a network with a different number of output layers
# does not raise an IndexError here
for layer_name, layer_output in zip(ln_out, layerOutputs):
    print(layer_name, ':', layer_output.shape)


#### Why 85 features per bounding box?
Each bounding box is described by 5 parameters (x, y, width, height, object confidence) followed by one score for each of the 80 classes in the COCO training set: 5 + 80 = 85.


##### Why this number of bounding boxes?

The number of (possible) bounding boxes is the number of grid positions in that layer times the number of (anchor) bounding boxes per grid position (three here): 

 - **yolo_82**:  grid shape = 13 x 13 ---> 13 x 13 x 3 =  **507**, (detects *BIGGER OBJECTS*)
 - **yolo_94**:  grid shape = 26 x 26 ---> 26 x 26 x 3 = **2028**
 - **yolo_106**: grid shape = 52 x 52 ---> 52 x 52 x 3 = **8112**, (detects *SMALLER OBJECTS*) 

*Reference:* https://towardsdatascience.com/yolo-v3-object-detection-53fb7d3bfe6b

In [None]:
# Let's look at one of the bounding box predictions:
# Note: In YOLO v3, each class is separately predicted using *logistic regression* instead of *softmax*.
# This allows the model to predict, for example, the image of a woman as 'person' and as 'woman' at the same time.

# we take the first bounding box of the first output layer
first_detection = layerOutputs[0][0]

# entries 0-3 hold the box, entry 4 the object confidence, the rest the class scores
print('x, y, width, height of the bounding box:\n', first_detection[:4])
print('object confidence:\n', first_detection[4])
print('predictions for the 80 class probabilities:\n', first_detection[5:])

In [None]:
# this cell:
# - merges the outputs of all output layers
# - removes all empty bounding boxes (i.e. class_probability < threshold),
# - rescales the bounding boxes

# threshold the class probability has to exceed for a bounding box to be kept
probability_minimum = 0.5

# collected for every kept bounding box: corner coordinates, class probability, class number
boxes, scores, classes = [], [], []

# go through the outputs one after another ...
for result in layerOutputs:
    # ... and through every bounding box of the current output layer
    for detection in result:

        # object confidence of the current bounding box
        object_confidence = detection[4]

        # most probable class of the current bounding box and its probability
        scores_current = detection[5:]
        class_current = np.argmax(scores_current)
        probability_current = scores_current[class_current]

        # skip weak predictions
        # (alternative criterion: probability_current*object_confidence > probability_minimum)
        if not (probability_current > probability_minimum):
            continue

        # scale the bounding box coordinates to the initial image size
        # by elementwise multiplying them with the width and height of the image
        box_current = np.array(detection[0:4]) * np.array([w, h, w, h])

        # YOLO stores the center of the box together with its width and height;
        # from these the top left and bottom right corners are reconstructed
        x_center, y_center, box_width, box_height = box_current.astype('int')
        half_width = box_width / 2
        half_height = box_height / 2
        x_min = int(x_center - half_width)
        y_min = int(y_center - half_height)
        x_max = int(x_center + half_width)
        y_max = int(y_center + half_height)

        # store the results
        boxes.append([x_min, y_min, x_max, y_max])
        scores.append(float(probability_current))
        classes.append(class_current)

# turn the collected lists into numpy arrays
boxes = np.array(boxes)
scores = np.array(scores)
classes = np.array(classes)

In [None]:
# list every detection: class name, class probability and box corners
for class_id, score, box_corners in zip(classes, scores, boxes):
    print(LABELS[class_id], 'with probability {:.3} at position{}'.format(score, box_corners))


In [None]:
# Draw bounding boxes on a copy of the image, the original stays untouched
image_copy = np.copy(image)

# one random colour per class label, used to draw the bounding boxes
# (the seed is fixed before the colours are drawn)
np.random.seed(42)
COLORS = np.random.randint(0, 255, size=(len(LABELS), 3), dtype="uint8")
border_thickness = 2

# parameters of the label text
font = cv2.FONT_HERSHEY_SIMPLEX
font_scale = 0.7
text_thickness = 2
text_offset_x = 7
text_offset_y = 7

for idx in range(len(scores)):
    # colour of the class as a tuple of plain ints
    class_id = classes[idx]
    color = tuple(int(c) for c in COLORS[class_id])

    # top left and bottom right corner of the bounding box
    pt1_x, pt1_y, pt2_x, pt2_y = (int(v) for v in boxes[idx, :4])
    cv2.rectangle(image_copy, (pt1_x, pt1_y), (pt2_x, pt2_y), color, border_thickness)

    # label: class name and probability on a filled rectangle above the box
    text = "{}: {:.4f}".format(LABELS[class_id], scores[idx])
    (t_w, t_h), _ = cv2.getTextSize(text, font, fontScale=font_scale, thickness=text_thickness)
    label_top_left = (pt1_x, pt1_y - (t_h + text_offset_y))
    label_bottom_right = (pt1_x + t_w + text_offset_x, pt1_y)

    cv2.rectangle(image_copy, label_top_left, label_bottom_right, color, cv2.FILLED)
    cv2.putText(image_copy, text, (pt1_x + text_offset_x, pt1_y - 5), font, font_scale,
                (255, 255, 255), text_thickness)

# show the annotated copy, figure size proportional to the image size
plt.figure(figsize=(w*0.05, h*0.05))
plt.imshow(image_copy)
plt.axis('off')
plt.show()


### Non Max Suppression

 1. Select the box that has the highest score.

 2. Compute its overlap with all other boxes (IoU), and remove boxes that overlap it more than iou_threshold.
 
 3. Go back to step 1 and iterate until there are no more boxes with a lower score than the currently selected box.

In [None]:
def iou(box1, box2):
    """Return the intersection over union (IoU) of two boxes.

    Args:
        box1: Box given as (x1, y1, x2, y2), where x1 and y1 are the
            coordinates of the upper left corner and x2 and y2 those of the
            lower right corner.
        box2: Second box in the same format.

    Returns:
        Area of the intersection divided by the area of the union. 0.0 is
        returned when the union has no positive area (e.g. both boxes are
        degenerate), because the ratio is undefined in that case.
    """
    # top left corner of the intersection
    xi1 = max(box1[0], box2[0])
    yi1 = max(box1[1], box2[1])
    # bottom right corner of the intersection
    xi2 = min(box1[2], box2[2])
    yi2 = min(box1[3], box2[3])

    # clamping at 0 covers the case that the boxes do not intersect at all
    inter_area = max(xi2 - xi1, 0) * max(yi2 - yi1, 0)

    # area of the union = both areas minus the part that was counted twice
    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union_area = box1_area + box2_area - inter_area

    # avoid a division by zero for boxes without area
    if union_area <= 0:
        return 0.0

    return inter_area / union_area

In [None]:
def yolo_non_max_supression(boxes, scores, score_threshold=0.5, iou_threshold=0.5):
    """Apply greedy non-max suppression to scored boxes.

    The boxes are visited from the highest to the lowest score. A box whose
    score is below score_threshold is dropped. Every box that is kept drops
    all lower ranked boxes whose IoU with it exceeds iou_threshold.

    Args:
        boxes: Array of box coordinates (x1, y1, x2, y2), one row per box.
        scores: Array of confidence scores, one per box.
        score_threshold: Boxes with a score below this value are dropped.
        iou_threshold: If the IoU of two boxes is not above this value,
            both boxes are kept.

    Returns:
        Sorted list of the indices of the boxes and scores to be kept.
    """
    # box indices ordered by descending score
    order = np.argsort(scores)[::-1]
    n_boxes = len(order)

    # keep[r] tells whether the box at rank r is still a candidate; using
    # flags avoids searching a list of removed ranks again and again
    keep = [not (scores[idx] < score_threshold) for idx in order]

    for i in range(n_boxes):
        if not keep[i]:   # below threshold or suppressed by a better box
            continue

        winner = boxes[order[i]]
        for j in range(i + 1, n_boxes):   # remaining boxes, all with a lower rank
            # boxes that are already out need no further IoU computation
            if keep[j] and iou(winner, boxes[order[j]]) > iou_threshold:
                keep[j] = False

    return sorted(idx for idx, kept in zip(order, keep) if kept)

In [None]:
# Run non-max suppression on all detections collected above
n_candidates = len(scores)
print('we start with ', n_candidates, 'indices')

nms_idx = yolo_non_max_supression(
    boxes,
    scores,
    score_threshold=0.5,
    iou_threshold=0.5,
)

n_kept = len(nms_idx)
print(n_kept, 'indices are kept:', nms_idx)


In [None]:
# Draw the bounding boxes selected by non-max suppression
# (note: this draws directly into `image`, not into a copy)

for idx in nms_idx:
    # colour of the class as a tuple of plain ints
    class_id = classes[idx]
    color = tuple(int(c) for c in COLORS[class_id])

    # top left and bottom right corner of the bounding box
    pt1_x, pt1_y, pt2_x, pt2_y = (int(v) for v in boxes[idx, :4])
    cv2.rectangle(image, (pt1_x, pt1_y), (pt2_x, pt2_y), color, border_thickness)

    # label: class name and probability on a filled rectangle above the box
    text = "{}: {:.4f}".format(LABELS[class_id], scores[idx])
    (t_w, t_h), _ = cv2.getTextSize(text, font, fontScale=font_scale, thickness=text_thickness)
    label_top_left = (pt1_x, pt1_y - (t_h + text_offset_y))
    label_bottom_right = (pt1_x + t_w + text_offset_x, pt1_y)

    cv2.rectangle(image, label_top_left, label_bottom_right, color, cv2.FILLED)
    cv2.putText(image, text, (pt1_x + text_offset_x, pt1_y - 5), font, font_scale,
                (255, 255, 255), text_thickness)

# show the annotated image, figure size proportional to the image size
plt.figure(figsize=(w*0.05, h*0.05))
plt.imshow(image)
plt.axis('off')
plt.show()


### Putting everything together

In [None]:
# choose the input image by (un)commenting exactly one of these lines
#image = imread("images/lunch.jpg")
#image = imread("images/perth.jpg")
#image = imread("images/coffee.jpg")
#image = imread("images/wiener_kaffeehaus_2.jpg")
image = imread("images/diverse_group_of_women.jpeg")

# remember the image size -> needed later to rescale the bounding boxes
h, w = image.shape[:2]

# preprocess the image data: pixel values are multiplied by 1/255 and the
# image is resized to 416 x 416 without cropping
network_input_size = (416, 416)
blob = cv2.dnn.blobFromImage(image, 1 / 255.0, network_input_size, crop=False)

# hand the preprocessed image to the network as its new input
net.setInput(blob)

# measure the time needed for the object detection
start = time.time()
layerOutputs = net.forward(ln_out)
end = time.time()

# report the time spent in the forward pass
elapsed_seconds = end - start
print('YOLO v3 took {:.4f} seconds'.format(elapsed_seconds))

# threshold the class probability has to exceed for a bounding box to be kept
probability_minimum = 0.5

# collected for every kept bounding box: corner coordinates, class probability, class number
boxes, scores, classes = [], [], []

# go through the outputs one after another ...
for result in layerOutputs:
    # ... and through every bounding box of the current output layer
    for detection in result:

        # object confidence of the current bounding box
        object_confidence = detection[4]

        # most probable class of the current bounding box and its probability
        scores_current = detection[5:]
        class_current = np.argmax(scores_current)
        probability_current = scores_current[class_current]

        # skip weak predictions
        if not (probability_current > probability_minimum):
            continue

        # scale the bounding box coordinates to the initial image size and
        # turn center / width / height into top left and bottom right corner
        box_current = np.array(detection[0:4]) * np.array([w, h, w, h])
        x_center, y_center, box_width, box_height = box_current.astype('int')
        half_width = box_width / 2
        half_height = box_height / 2
        x_min = int(x_center - half_width)
        y_min = int(y_center - half_height)
        x_max = int(x_center + half_width)
        y_max = int(y_center + half_height)

        # store the results
        boxes.append([x_min, y_min, x_max, y_max])
        scores.append(float(probability_current))
        classes.append(class_current)

# turn the collected lists into numpy arrays
boxes = np.array(boxes)
scores = np.array(scores)
classes = np.array(classes)

# keep only the best box of each group of strongly overlapping boxes
nms_idx = yolo_non_max_supression(
    boxes,
    scores,
    score_threshold=0.5,
    iou_threshold=0.5,
)

# draw the boxes selected by non-max suppression directly into `image`
for idx in nms_idx:
    # colour of the class as a tuple of plain ints
    class_id = classes[idx]
    color = tuple(int(c) for c in COLORS[class_id])

    # top left and bottom right corner of the bounding box
    pt1_x, pt1_y, pt2_x, pt2_y = (int(v) for v in boxes[idx, :4])
    cv2.rectangle(image, (pt1_x, pt1_y), (pt2_x, pt2_y), color, border_thickness)

    # label: class name and probability on a filled rectangle above the box
    text = "{}: {:.4f}".format(LABELS[class_id], scores[idx])
    (t_w, t_h), _ = cv2.getTextSize(text, font, fontScale=font_scale, thickness=text_thickness)
    label_top_left = (pt1_x, pt1_y - (t_h + text_offset_y))
    label_bottom_right = (pt1_x + t_w + text_offset_x, pt1_y)

    cv2.rectangle(image, label_top_left, label_bottom_right, color, cv2.FILLED)
    cv2.putText(image, text, (pt1_x + text_offset_x, pt1_y - 5), font, font_scale,
                (255, 255, 255), text_thickness)

# show the annotated image, figure size proportional to the image size
plt.figure(figsize=(w*0.05, h*0.05))
plt.imshow(image)
plt.axis('off')
plt.show()

# list the kept detections: class name, class probability and box corners
for idx in nms_idx:
    print(LABELS[classes[idx]], 'with probability {:.3} at position{}'.format(scores[idx], boxes[idx, :]))
