### IMPORTING LIBRARIES

In [24]:
import cv2
import numpy as np

### Setting the input filename, frame size and preprocessing constants

In [25]:
# --- Input video ---
filename = 'edmonton_canada.mp4'
file_size = (1920, 1080)  # (width, height); assumes a 1920x1080 mp4

# --- Network preprocessing ---
RESIZED_DIMENSIONS = (300, 300)  # input dimensions the SSD was trained on
# Scale factor handed to cv2.dnn.blobFromImage. 0.007843 is roughly 1/127.5,
# so together with the mean of 127.5 subtracted there, 0..255 pixel values
# end up in approximately the -1..1 range.
IMG_NORM_RATIO = 0.007843

### Loading the pre-trained neural network

In [26]:
# Build the network from its Caffe files: the first argument is the text
# network definition, the second the trained weights.
model = cv2.dnn.readNetFromCaffe(
    'MobileNetSSD_deploy.prototxt.txt',
    'MobileNetSSD_deploy.caffemodel',
)

In [27]:
# Read the class names, one per line, dropping the trailing newline(s) so the
# list does not end with an empty entry.
file_name = 'Labels.txt'
with open(file_name, 'rt') as labels_file:
    labels_text = labels_file.read()
classLabels = labels_text.rstrip('\n').split('\n')

In [28]:
# Sanity check: show the labels that were read from Labels.txt.
print(classLabels)

['person', 'bicycle', 'car', 'motorbike', 'aeroplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'sofa', 'pottedplant', 'bed', 'diningtable', 'toilet', 'tvmonitor', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']


### Creating the bounding box colors

In [29]:
# One random 3-channel colour per class label, each channel drawn from [0, 255).
# The bounds are passed as (low, high); NumPy documents the result as undefined
# when high < low. A dedicated seeded generator keeps the colours identical
# across kernel restarts without touching NumPy's global random state.
bbox_colors = np.random.default_rng(seed=42).uniform(0, 255, size=(len(classLabels), 3))

### Video setup and output

In [31]:
# Run the detector over every frame of the input video and draw a labelled box
# around each detection whose confidence is above MIN_CONFIDENCE.

# Detections scoring at or below this confidence (0-1 scale) are ignored.
MIN_CONFIDENCE = 0.30

video = cv2.VideoCapture(filename)
if not video.isOpened():
    # Fail loudly: otherwise the loop below is skipped and the cell appears to
    # have succeeded without processing a single frame.
    video.release()
    raise IOError("Could not open video file: {}".format(filename))

try:
    while video.isOpened():
        success, frame = video.read()
        if not success:
            # No frame returned: end of the stream (or a read failure).
            break

        # Frame height and width, used to scale the relative box coordinates.
        (h, w) = frame.shape[:2]

        # Build the network input ("blob"): the frame is resized to the size
        # the SSD expects, the mean 127.5 is subtracted and the result is
        # multiplied by IMG_NORM_RATIO.
        frame_blob = cv2.dnn.blobFromImage(cv2.resize(frame, RESIZED_DIMENSIONS),
                                           IMG_NORM_RATIO, RESIZED_DIMENSIONS, 127.5)
        model.setInput(frame_blob)      # Set the input for the neural network
        model_output = model.forward()  # Predict the objects in the image

        # Each detection row is read as: [.., class id, confidence, x1, y1, x2, y2],
        # with the box corners expressed as fractions of the frame size.
        for i in range(model_output.shape[2]):
            confidence = model_output[0, 0, i, 2]
            if confidence <= MIN_CONFIDENCE:
                continue

            idx = int(model_output[0, 0, i, 1])

            # Scale the relative corners to pixels; plain Python ints are used
            # because not every OpenCV build accepts NumPy scalars as points.
            bounding_box = model_output[0, 0, i, 3:7] * np.array([w, h, w, h])
            startX, startY, endX, endY = (int(v) for v in bounding_box)

            # Use the same per-class colour for the box and its caption
            # (the box previously used bbox_colors[idx % 3]).
            color = bbox_colors[idx].tolist()

            # NOTE(review): idx is the class id emitted by the network, so
            # classLabels must be in the order the model was trained with.
            # The widely distributed MobileNet-SSD Caffe model uses 21 PASCAL
            # VOC classes with 'background' at index 0 -- confirm Labels.txt
            # matches, otherwise detections are captioned with the wrong name.
            label = "{}: {:.2f}%".format(classLabels[idx], confidence*100)

            cv2.rectangle(frame, (startX, startY), (endX, endY), color, 2)
            # Put the caption above the box unless that would leave the frame.
            y = startY - 15 if startY - 15 > 15 else startY + 15
            cv2.putText(frame, label, (startX, y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

        frame = cv2.resize(frame, file_size, interpolation=cv2.INTER_NEAREST)
        # TODO(review): the annotated frame is neither displayed nor written to
        # a file here, so the drawing above has no visible result. Add a
        # cv2.VideoWriter / display step if output is intended.
finally:
    # Always free the capture handle, even if a frame fails mid-processing.
    video.release()

finished
