## Imports

In [1]:
import numpy as np
import cv2

## Load YOLOv3
Set up YOLO and store the names of its output layers so that their outputs can be collected later in the code.

In [2]:
# Load the YOLOv3 network from the Darknet weights and config files.
net = cv2.dnn.readNet('yolov3.weights', 'yolov3.cfg')
layer_names = net.getLayerNames()
# getUnconnectedOutLayers() yields 1-based layer indices. Depending on the
# OpenCV version they arrive either as an (N, 1) array or as a flat (N,)
# array, so flatten first instead of indexing each entry with i[0].
output_layer = [layer_names[i - 1]
                for i in np.array(net.getUnconnectedOutLayers()).flatten()]

## Store the object names
Store the object names in a list and check what it contains.

In [3]:
# Read the class names file line by line, stripping surrounding whitespace
# (including the trailing newline) from every entry.
with open('coco_names.txt', 'r') as f:
    classes = list(map(str.strip, f))

In [4]:
# Echo the list as the cell's last expression so the notebook displays the
# names that were loaded.
classes

['person',
 'bicycle',
 'car',
 'motorbike',
 'aeroplane',
 'bus',
 'train',
 'truck',
 'boat',
 'traffic light',
 'fire hydrant',
 'stop sign',
 'parking meter',
 'bench',
 'bird',
 'cat',
 'dog',
 'horse',
 'sheep',
 'cow',
 'elephant',
 'bear',
 'zebra',
 'giraffe',
 'backpack',
 'umbrella',
 'handbag',
 'tie',
 'suitcase',
 'frisbee',
 'skis',
 'snowboard',
 'sports ball',
 'kite',
 'baseball bat',
 'baseball glove',
 'skateboard',
 'surfboard',
 'tennis racket',
 'bottle',
 'wine glass',
 'cup',
 'fork',
 'knife',
 'spoon',
 'bowl',
 'banana',
 'apple',
 'sandwich',
 'orange',
 'broccoli',
 'carrot',
 'hot dog',
 'pizza',
 'donut',
 'cake',
 'chair',
 'sofa',
 'pottedplant',
 'bed',
 'diningtable',
 'toilet',
 'tvmonitor',
 'laptop',
 'mouse',
 'remote',
 'keyboard',
 'cell phone',
 'microwave',
 'oven',
 'toaster',
 'sink',
 'refrigerator',
 'book',
 'clock',
 'vase',
 'scissors',
 'teddy bear',
 'hair drier',
 'toothbrush']

## Load the video 

In [5]:
cap = cv2.VideoCapture('vtest.avi')
# VideoCapture does not raise when the file is missing or unreadable; fail
# fast here instead of silently producing an empty output video later.
if not cap.isOpened():
    raise IOError("Could not open input video 'vtest.avi'")

## Set the output formats for the video to be stored
Because the code runs on the CPU, processing is too slow to watch the output while it is being generated, so we store the result in a video file instead.

In [6]:
# XVID-encoded output. Frame size and frame rate are taken from the input so
# that the saved video has the same dimensions and plays at the same speed.
fourcc = cv2.VideoWriter_fourcc(*'XVID')
frame_size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
              int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
# Some sources report a frame rate of 0; fall back to the former fixed 30 FPS.
fps = cap.get(cv2.CAP_PROP_FPS) or 30
vid_out = cv2.VideoWriter('yolo_output.mkv', fourcc, fps, frame_size)

It takes almost 20 minutes to process this 1-minute video :)

Have Patience!!

In [7]:
# Darknet YOLO models expect pixel values scaled to the [0, 1] range.
BLOB_SCALE = 1 / 255.0
INPUT_SIZE = (416, 416)
CONF_THRESHOLD = 0.5        # minimum class score for a detection to be kept
NMS_SCORE_THRESHOLD = 0.4   # score threshold passed to non-maximum suppression
NMS_IOU_THRESHOLD = 0.5     # overlap threshold passed to non-maximum suppression
BOX_COLOR = (0, 255, 0)


def detect_people(frame):
    """Run the network on one frame and collect 'person' detections.

    Parameters
    ----------
    frame : numpy.ndarray
        Image as returned by ``cap.read()``.

    Returns
    -------
    (boxes, confidences)
        ``boxes`` is a list of ``[x, y, w, h]`` in pixel coordinates of
        ``frame`` (top-left corner plus size); ``confidences`` holds the
        matching class scores as Python floats.
    """
    height, width = frame.shape[:2]
    blob = cv2.dnn.blobFromImage(frame, BLOB_SCALE, INPUT_SIZE, (0, 0, 0),
                                 swapRB=True)
    net.setInput(blob)
    outs = net.forward(output_layer)

    boxes, confidences = [], []
    for out in outs:
        for detection in out:
            # Entries 0-3 are the box (centre x, centre y, width, height)
            # relative to the frame size; entries from 5 on are class scores.
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if confidence > CONF_THRESHOLD and str(classes[class_id]) == 'person':
                center_x = int(detection[0] * width)
                center_y = int(detection[1] * height)
                w = int(detection[2] * width)
                h = int(detection[3] * height)
                # Convert the centre-based box to a top-left-based one.
                x = int(center_x - w / 2)
                y = int(center_y - h / 2)
                boxes.append([x, y, w, h])
                confidences.append(float(confidence))
    return boxes, confidences


try:
    while True:
        ret, frame = cap.read()
        if not ret:
            # End of the video (or the capture could not deliver a frame).
            break

        boxes, confidences = detect_people(frame)

        # Non-maximum suppression drops overlapping boxes for the same person.
        if boxes:
            indexes = cv2.dnn.NMSBoxes(boxes, confidences,
                                       score_threshold=NMS_SCORE_THRESHOLD,
                                       nms_threshold=NMS_IOU_THRESHOLD)
        else:
            indexes = []

        # NMSBoxes may return a flat or an (N, 1) index array depending on the
        # OpenCV version; flattening handles both and avoids scanning the
        # index list once per box.
        for i in np.array(indexes).flatten():
            x, y, w, h = boxes[i]
            cv2.rectangle(frame, (x, y), (x + w, y + h), BOX_COLOR, 2)

        vid_out.write(frame)
finally:
    # Release both handles even if the long run is interrupted; releasing the
    # writer is what finalizes the output file on disk.
    cap.release()
    vid_out.release()
    cv2.destroyAllWindows()

## Check Output

In [9]:
# Play back the stored result in a window named 'output'; pressing 'q' stops
# the playback early.
cap = cv2.VideoCapture('yolo_output.mkv')
while cap.isOpened():
    ret, op_frame = cap.read()
    if not ret:
        # No further frame could be read, so stop.
        break
    cv2.imshow('output', op_frame)
    pressed = cv2.waitKey(5) & 0xFF
    if pressed == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()