# Detecting objects in videos with YOLOv3 trained on COCO

Here, the code used to run detection with the pre-trained YOLOv3 model is the same as the one used for image detection. We present a way to run detection on a webcam stream (if you have a webcam, enjoy it), and also a simple way to read a video file, run detection on it, and then record a new video with the detections drawn on it. For this example, we use a video of traffic in Brasilia, with an amazing sunset as the background.

## Importing Libraries

In [1]:
import numpy as np
import time
import cv2
import os
import imageio
import imutils

## Setting the model

In [2]:
# Load the COCO class labels that our YOLO model was trained on.
# `yolo` is the directory holding the label, weight and config files.
yolo = "yolo-coco"
labelsPath = os.path.join(yolo, "coco.names")
# Read through a context manager so the file handle is closed right away
# instead of lingering for the rest of the session.
with open(labelsPath) as labels_file:
    # One label per line; strip() drops the trailing newline so the last
    # entry is not an empty string.
    LABELS = labels_file.read().strip().split("\n")

In [3]:
# Build one 3-channel colour per class label. Seeding the generator first
# keeps the label-to-colour assignment identical from run to run.
np.random.seed(42)
COLORS = np.random.randint(low=0, high=255, size=(len(LABELS), 3), dtype="uint8")

In [4]:
# Locate the YOLOv3 weights file and the network configuration file inside
# the model directory.
weightsPath, configPath = (
    os.path.sep.join([yolo, filename])
    for filename in ("yolov3.weights", "yolov3.cfg")
)

In [5]:
# Build the detection network from the Darknet configuration file and the
# pre-trained weights file; `net` is reused by every detection loop below.
net = cv2.dnn.readNetFromDarknet(configPath, weightsPath)

In [None]:
## Video stream from webcam

In [6]:
# Open a capture on device index 0 (presumably the default webcam -- confirm
# on machines with more than one camera).
cap = cv2.VideoCapture(0)

In [8]:
# Run YOLOv3 on the webcam stream and display the annotated frames until the
# stream ends or the user presses 'q' in the output window.
CONFIDENCE_THRESHOLD = 0.5  # minimum class score for a detection to be kept
NMS_THRESHOLD = 0.3  # overlap threshold handed to non-maximum suppression

# The output-layer names never change between frames, so resolve them once.
# getUnconnectedOutLayers() returns an (N, 1) array on older OpenCV builds and
# a flat array on newer ones; flattening makes the lookup work with both.
layer_names = net.getLayerNames()
ln = [layer_names[i - 1] for i in np.array(net.getUnconnectedOutLayers()).flatten()]

try:
    while cap.isOpened():
        ret, frame = cap.read()
        # A failed grab (camera unplugged, stream ended) yields no frame;
        # stop instead of handing None to the resize call.
        if not ret:
            break

        # NOTE(review): imutils.resize presumably keeps the aspect ratio and
        # uses only `width` when both are given -- confirm.
        frame = imutils.resize(frame, width=640, height=640)
        (H, W) = frame.shape[:2]

        # OpenCV captures are BGR, hence swapRB=True before feeding the network.
        blob = cv2.dnn.blobFromImage(frame, 1 / 255.0, (416, 416), swapRB=True, crop=False)
        net.setInput(blob)
        layerOutputs = net.forward(ln)

        boxes = []
        confidences = []
        classIDs = []

        # Loop over each one of the layer outputs
        for output in layerOutputs:
            # Loop over each one of the detections
            for detection in output:
                # Extract the class ID and confidence (i.e. probability) of
                # the current object detection
                scores = detection[5:]
                classID = np.argmax(scores)
                confidence = scores[classID]
                # Filter out weak predictions by ensuring the detected
                # probability is greater than the minimum probability
                if confidence > CONFIDENCE_THRESHOLD:
                    # Scale the bounding box back to the frame size. YOLO
                    # returns the box centre (x, y) followed by the box
                    # width and height, all relative to the image size.
                    box = detection[0:4] * np.array([W, H, W, H])
                    (centerX, centerY, width, height) = box.astype("int")
                    # Derive the top-left corner from the centre coordinates
                    x = int(centerX - (width / 2))
                    y = int(centerY - (height / 2))
                    # Record the box, its confidence and its class ID
                    boxes.append([x, y, int(width), int(height)])
                    confidences.append(float(confidence))
                    classIDs.append(int(classID))

        # Suppress overlapping boxes. The result is an empty tuple when nothing
        # survives, so normalise it to a flat integer array before iterating.
        idxs = cv2.dnn.NMSBoxes(boxes, confidences, CONFIDENCE_THRESHOLD, NMS_THRESHOLD)
        for i in np.array(idxs, dtype=int).flatten():
            # Extract the bounding box coordinates
            (x, y) = (boxes[i][0], boxes[i][1])
            (w, h) = (boxes[i][2], boxes[i][3])
            # Draw a bounding box rectangle and label on the frame
            color = [int(c) for c in COLORS[classIDs[i]]]
            cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
            text = "{}: {:.4f}".format(LABELS[classIDs[i]], confidences[i])
            cv2.putText(frame, text, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

        cv2.imshow("Output", frame)
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the window, even if a frame fails
    # midway through processing.
    cap.release()
    cv2.destroyAllWindows()

## Video stream from a file, and a simple way to record it

In [9]:
# Open the source clip for frame-by-frame reading and look up its frame rate
# in the container metadata.
reader = imageio.get_reader("../dataset/videos/cars.mkv")
video_meta = reader.get_meta_data()
fps = video_meta['fps']
# Create the output video; frames appended to it are encoded at the source
# frame rate rounded to the nearest integer.
writer = imageio.get_writer('test_yolos.mp4', fps=round(fps))
print(round(fps))

25


In [10]:
# Run YOLOv3 on every frame of the input video and append the annotated
# frames to the output video.
CONFIDENCE_THRESHOLD = 0.5  # minimum class score for a detection to be kept
NMS_THRESHOLD = 0.3  # overlap threshold handed to non-maximum suppression

# The output-layer names never change between frames, so resolve them once.
# getUnconnectedOutLayers() returns an (N, 1) array on older OpenCV builds and
# a flat array on newer ones; flattening makes the lookup work with both.
layer_names = net.getLayerNames()
ln = [layer_names[i - 1] for i in np.array(net.getUnconnectedOutLayers()).flatten()]

try:
    for frame in reader:
        # NOTE(review): imutils.resize presumably keeps the aspect ratio and
        # uses only `width` when both are given -- confirm.
        frame = imutils.resize(frame, width=640, height=640)
        (H, W) = frame.shape[:2]

        # imageio yields frames in RGB order, which is already what the
        # network expects, so the channels must NOT be swapped here (unlike
        # BGR frames coming from cv2.VideoCapture).
        blob = cv2.dnn.blobFromImage(frame, 1 / 255.0, (416, 416), swapRB=False, crop=False)
        net.setInput(blob)
        layerOutputs = net.forward(ln)

        boxes = []
        confidences = []
        classIDs = []

        # Loop over each one of the layer outputs
        for output in layerOutputs:
            # Loop over each one of the detections
            for detection in output:
                # Extract the class ID and confidence (i.e. probability) of
                # the current object detection
                scores = detection[5:]
                classID = np.argmax(scores)
                confidence = scores[classID]
                # Filter out weak predictions by ensuring the detected
                # probability is greater than the minimum probability
                if confidence > CONFIDENCE_THRESHOLD:
                    # Scale the bounding box back to the frame size. YOLO
                    # returns the box centre (x, y) followed by the box
                    # width and height, all relative to the image size.
                    box = detection[0:4] * np.array([W, H, W, H])
                    (centerX, centerY, width, height) = box.astype("int")
                    # Derive the top-left corner from the centre coordinates
                    x = int(centerX - (width / 2))
                    y = int(centerY - (height / 2))
                    # Record the box, its confidence and its class ID
                    boxes.append([x, y, int(width), int(height)])
                    confidences.append(float(confidence))
                    classIDs.append(int(classID))

        # Suppress overlapping boxes. The result is an empty tuple when nothing
        # survives, so normalise it to a flat integer array before iterating.
        idxs = cv2.dnn.NMSBoxes(boxes, confidences, CONFIDENCE_THRESHOLD, NMS_THRESHOLD)
        for i in np.array(idxs, dtype=int).flatten():
            # Extract the bounding box coordinates
            (x, y) = (boxes[i][0], boxes[i][1])
            (w, h) = (boxes[i][2], boxes[i][3])
            # Draw a bounding box rectangle and label on the frame
            color = [int(c) for c in COLORS[classIDs[i]]]
            cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
            text = "{}: {:.4f}".format(LABELS[classIDs[i]], confidences[i])
            cv2.putText(frame, text, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

        writer.append_data(frame)
finally:
    # Finalise the output file and release the input file even when a frame
    # fails midway, so a partial video is still flushed to disk.
    writer.close()
    reader.close()