In [None]:
import numpy as np
import cv2
import sys
import os.path
import matplotlib.pyplot as plt
# matplotlib inline


In [2]:
import matplotlib

# Notebook-wide plotting defaults: a large (15, 15) figure size and a
# grayscale colormap for images.
matplotlib.rcParams.update({
    'figure.figsize': (15.0, 15.0),
    'image.cmap': 'gray',
})

Step 1: Initialize the parameters

The YOLOv3 algorithm generates bounding boxes as the predicted detection outputs. Every predicted box is associated with a confidence score. In the first stage, all the boxes below the confidence threshold parameter are ignored for further processing.

The rest of the boxes undergo non-maximum suppression, which removes redundant overlapping bounding boxes. Non-maximum suppression is controlled by the parameter nmsThreshold. You can try changing these values and see how the number of output predicted boxes changes.
 
Next, the default values for the input width and height of the network's input image are set. We set each of them to 416 so that we can compare our runs with the reference Darknet implementation of YOLOv3.

In [3]:
# Detection settings used by the rest of the notebook

# Size (in pixels) of the image fed to the network
inpWidth, inpHeight = 416, 416

# Minimum objectness score and minimum class confidence for a detection
objectnessThreshold, confThreshold = 0.5, 0.5

# Non-maximum suppression threshold
nmsThreshold = 0.4

Step 2: Load the model and classes 

The file coco.names contains all the objects for which the model was trained. We read the class names from it.

Next, we load the network, which has two parts:

- the YOLO weights (yolov3.weights) and the YOLO configuration (yolov3.cfg)

We set the DNN backend to OpenCV here and the target to CPU. You could try setting the preferable target to cv2.dnn.DNN_TARGET_OPENCL to run it on a GPU. But keep in mind that the current OpenCV version is tested only with Intel's GPUs; it would automatically switch to CPU if you do not have an Intel GPU.

In [6]:
# Load names of classes (one name per line; the line index is used as the
# class id when labelling detections)
classesFile = "coco.names"
classes = None

# Read with an explicit encoding so the result does not depend on the
# platform's default locale.
with open(classesFile, 'rt', encoding='utf-8') as f:
    # splitlines() drops the line terminators and yields [] for an empty file
    classes = f.read().splitlines()

# Give the configuration and weights files for the model and load the network using them
modelConfiguration = "yolov3.cfg"
modelWeights = "yolov3.weights"

net = cv2.dnn.readNetFromDarknet(modelConfiguration, modelWeights)

# Use OpenCV's own DNN backend and run inference on the CPU.
# Switch the target to cv2.dnn.DNN_TARGET_OPENCL to try a GPU instead.
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)

Step 3: Process each frame

The input image is passed through the network, and the output is decoded and displayed using a few utility functions. Let us go over the utility functions.



Step 3a: Getting the names of output layers

The forward function in OpenCV's Net class needs the ending layer up to which it should run in the network. Since we want to run through the whole network, we need to identify the last layer of the network. We do that by using the function getUnconnectedOutLayers().

In [None]:
def getOutputsNames(net):
    """Return the names of the network's unconnected (output) layers.

    Args:
        net: Object exposing ``getLayerNames()`` and
            ``getUnconnectedOutLayers()``, e.g. a ``cv2.dnn`` network.

    Returns:
        A list with one layer name per unconnected output layer.
    """
    # Get the names of all the layers in the network
    layersNames = net.getLayerNames()
    # Depending on the OpenCV version the ids come back either as an Nx1
    # array or as a flat array; flattening makes both shapes work.
    outLayerIds = np.asarray(net.getUnconnectedOutLayers()).flatten()
    # The ids are 1-based while layersNames is indexed from 0
    return [layersNames[int(layerId) - 1] for layerId in outLayerIds]

In [8]:
# Draw the predicted bounding box
def drawPred(classId, conf, left, top, right, bottom):
    """Draw one detection (box and text label) on the module-level ``frame``.

    Args:
        classId: Index into the module-level ``classes`` list; only used
            when ``classes`` is non-empty.
        conf: Confidence value, printed with two decimals in the label.
        left, top, right, bottom: Corner coordinates of the box in pixels.
    """
    # Draw a bounding box
    cv2.rectangle(frame, (left, top), (right, bottom), (255, 178, 50), 3)
    label = '%.2f' % conf

    # Get the label for the class name and its confidence
    if classes:
        assert(classId < len(classes))
        label = '%s:%s' % (classes[classId], label)

    # Display the label at the top of the bounding box
    labelSize, baseLine = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
    # Keep the label inside the image when the box touches the top edge
    top = max(top, labelSize[1])
    # The text is measured at font scale 0.5 but drawn at 0.75, so the white
    # background is enlarged by 1.5 (= 0.75 / 0.5) to fit it.
    cv2.rectangle(frame, (left, top - round(1.5*labelSize[1])), (left + round(1.5*labelSize[0]), top + baseLine), (255, 255, 255), cv2.FILLED)
    cv2.putText(frame, label, (left, top), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (0,0,0), 1)

Step 3c: Post-processing the network's output 
Each bounding box output by the network is represented by a vector of (number of classes + 5) elements.

The first 4 elements represent the center_x, center_y, width and height. The fifth element represents the confidence that the bounding box encloses an object.

The rest of the elements are the confidences associated with each class (i.e. object type). The box is assigned to the class corresponding to the highest score for the box.

The highest score for a box is also called its confidence. If the confidence of a box is less than the given threshold, the bounding box is dropped and not considered for further processing.

The boxes with their confidence equal to or greater than the threshold are then subjected to Non Maximum Suppression. This would reduce the number of overlapping boxes

In [None]:
# Remove the bounding boxes with low confidence using non-maxima suppression

def postprocess(frame, outs):
    """Filter the raw network outputs and draw the surviving detections.

    Detections must pass the module-level ``objectnessThreshold`` and
    ``confThreshold``; the remaining boxes go through non-maximum
    suppression (``nmsThreshold``) and are drawn with ``drawPred``.

    Args:
        frame: Image the detections refer to; only ``frame.shape[0]``
            (height) and ``frame.shape[1]`` (width) are read here.
        outs: Iterable of network outputs. Each detection row is read as
            ``[center_x, center_y, width, height, objectness, scores...]``,
            with the four box values multiplied by the frame size.
    """
    frameHeight = frame.shape[0]
    frameWidth = frame.shape[1]

    classIds = []
    confidences = []
    boxes = []
    # Scan through all the bounding boxes output from the network and keep only
    # the ones with high confidence. Each kept box is assigned the label of its
    # highest-scoring class.
    for out in outs:
        for detection in out:
            if detection[4] > objectnessThreshold:
                scores = detection[5:]
                classId = int(np.argmax(scores))
                confidence = float(scores[classId])
                if confidence > confThreshold:
                    center_x = int(detection[0] * frameWidth)
                    center_y = int(detection[1] * frameHeight)
                    width = int(detection[2] * frameWidth)
                    height = int(detection[3] * frameHeight)
                    # Convert the box centre to its top-left corner
                    left = int(center_x - width/2)
                    top = int(center_y - height/2)
                    classIds.append(classId)
                    # Store the scalar score of this detection (not the list)
                    confidences.append(confidence)
                    boxes.append([left, top, width, height])

    # Perform non maximum suppression to eliminate redundant overlapping boxes with
    # lower confidences
    indices = cv2.dnn.NMSBoxes(boxes, confidences, confThreshold, nmsThreshold)
    # Depending on the OpenCV version the indices come back as an Nx1 array,
    # a flat array, or an empty tuple; flattening handles all of them.
    for i in np.asarray(indices).flatten():
        i = int(i)
        left, top, width, height = boxes[i]
        # Use the confidence that belongs to this box, not the last one seen
        # in the scanning loop above.
        drawPred(classIds[i], confidences[i], left, top, left + width, top + height)

Non-maximum suppression is controlled by the nmsThreshold parameter. If nmsThreshold is set too low, like 0.1, we might not detect overlapping objects of the same or different classes. But if it is set too high, like 1, we might get multiple boxes for the same object. So we used an intermediate value of 0.4 in our code above.

The main loop
The input image to a neural network needs to be in a certain format called a blob.

After a frame is read from the input image or video stream, it is passed through the blobFromImage function to convert it to an input blob for the neural network. In this process, it scales the image pixel values to a target range of 0 to 1 using a scale factor of 1/255. It also resizes the image to the given size of (416, 416) without cropping. Note that we do not perform any mean subtraction here, hence we pass [0,0,0] to the mean parameter of the function and keep the swapRB parameter at its default value of 1.

The output blob is then passed into the network as its input, and a forward pass is run to get a list of predicted bounding boxes as the network's output. These boxes go through a post-processing step in order to filter out the ones with low confidence scores. We will go through the post-processing step in more detail in the next section. We print out the inference time for each frame at the top left. The image with the final bounding boxes is then saved to the disk, either as an image for an image input or using a video writer for the input video stream.

In [None]:
# Process inputs
imagePath = D