In [1]:
import numpy as np
import cv2
import time

<h3>Algorithm:</h3>
   
    
 Reading input video --> Loading YOLO v3 Network -->
    
 --> Reading frames in the loop --> Getting blob from the frame -->
    
 --> Implementing Forward Pass --> Getting Bounding Boxes -->
    
 --> Non-maximum Suppression --> Drawing Bounding Boxes with Labels -->
    
 --> Writing processed frames

 Result:
 New video file with Detected Objects, Bounding Boxes and Labels

In [2]:
##############################################
######   Reading input video    #############
############################################

# Opening the input video for frame-by-frame reading
# cv2.VideoCapture does not raise when the file is missing or unreadable,
# so we check explicitly and fail early with a clear message
video = cv2.VideoCapture('vid.mp4')
if not video.isOpened():
    raise IOError("Could not open input video 'vid.mp4'")

# Preparing variable for writer
# that we will use to write processed frames
# It stays None until the first frame has been processed
writer = None

# Preparing variables for spatial dimensions of the frames
# They are filled in from the first frame that is read
h, w = None, None


##############################################
#####    Loading YOLO v3 Network    #########
############################################

# Getting class labels: one label per line of the names file
# The file handle gets its own name so it does not collide with
# the frame counter `f` that is defined further below
with open('yolo-coco-data/coco.names') as names_file:
    labels = [line.strip() for line in names_file]

# Loading the network from the Darknet configuration and trained weights
network = cv2.dnn.readNetFromDarknet('yolo-coco-data/yolov3.cfg',
                                     'yolo-coco-data/yolov3.weights')

# Getting list with names of all layers from YOLO v3 network
layers_names_all = network.getLayerNames()

# Getting only the names of the output layers
# getUnconnectedOutLayers() returns 1-based indexes, hence the "- 1"
# Depending on the OpenCV version the indexes come either as an Nx1 array
# (each index wrapped in its own array) or as a flat 1-D array;
# flattening first makes the lookup work with both layouts
layers_names_output = [
    layers_names_all[int(i) - 1]
    for i in np.array(network.getUnconnectedOutLayers()).flatten()
]

# Setting minimum probability to eliminate weak predictions
probability_minimum = 0.5

# Setting threshold for filtering weak bounding boxes
# with non-maximum suppression
threshold = 0.3

# Generating one colour (3 channel values) for every class label
# A seeded generator keeps the colours identical between runs
# The upper bound of randint is exclusive, so 256 is needed to allow 255
colours = np.random.RandomState(42).randint(0, 256,
                                            size=(len(labels), 3),
                                            dtype='uint8')


###############################################
#######     Reading frames in the loop   #####
#############################################

# Defining variable for counting frames
# At the end we will show total amount of processed frames
f = 0

# Defining variable for counting total time
# At the end we will show time spent for processing all frames
t = 0

# Frame rate for the output file
# We take it from the input video so that playback speed is preserved,
# and fall back to 30 when the value is missing or not a positive number
fps_output = video.get(cv2.CAP_PROP_FPS)
if not fps_output > 0:
    fps_output = 30

# Reading, processing and writing frames one by one
# If anything goes wrong (including a manual interrupt), the reader and
# the writer are released before the error is re-raised, so the part of
# the result file written so far gets finalised
try:
    while True:
        # Capturing frame-by-frame
        ret, frame = video.read()

        # If the frame was not retrieved
        # e.g.: at the end of the video,
        # then we break the loop
        if not ret:
            break

        # Getting spatial dimensions of the frame
        # we do it only once from the very beginning
        # all other frames have the same dimension
        if w is None or h is None:
            # Slicing from tuple only first two elements
            h, w = frame.shape[:2]

        ###############################################
        ######     Getting blob from the frame   #####
        #############################################

        # cv2.dnn.blobFromImage() prepares the frame for the network:
        # scales pixel values by 1/255, resizes to 416x416
        # and swaps the R and B channels
        blob = cv2.dnn.blobFromImage(frame, 1 / 255.0, (416, 416),
                                     swapRB=True, crop=False)

        ##############################################
        #######    Implementing Forward Pass  #######
        ############################################

        # Implementing forward pass with our blob and only through output layers
        # Calculating at the same time, needed time for forward pass
        # perf_counter() is used because it is meant for measuring intervals
        network.setInput(blob)  # setting blob as input to the network
        start = time.perf_counter()
        output_from_network = network.forward(layers_names_output)
        end = time.perf_counter()

        # Increasing counters for frames and total time
        f += 1
        t += end - start

        # Showing spent time for single current frame
        print('Frame number {0} took {1:.5f} seconds'.format(f, end - start))

        #####################################################
        ######      Getting Bounding Boxes    ##############
        ####################################################

        bounding_boxes = []
        confidences = []
        class_numbers = []

        # Going through all output layers after feed forward pass
        for result in output_from_network:
            # Going through all detections from current output layer
            for detected_objects in result:
                # Elements from index 5 onwards are the class scores
                scores = detected_objects[5:]
                # Getting index of the class with the maximum score
                class_current = np.argmax(scores)
                # Getting value of the score for that class
                confidence_current = scores[class_current]

                # Eliminating weak predictions
                if confidence_current > probability_minimum:
                    # The first four elements are multiplied elementwise
                    # by the frame width and height to get the centre of
                    # the bounding box, its width and its height
                    # in pixels of the original frame
                    box_current = detected_objects[0:4] * np.array([w, h, w, h])

                    # From the centre and the size we get the top left
                    # corner coordinates, that are x_min and y_min
                    x_center, y_center, box_width, box_height = box_current
                    x_min = int(x_center - (box_width / 2))
                    y_min = int(y_center - (box_height / 2))

                    # Adding results into prepared lists
                    bounding_boxes.append([x_min, y_min,
                                           int(box_width), int(box_height)])
                    confidences.append(float(confidence_current))
                    class_numbers.append(class_current)

        ##########################################################
        ########    Non-maximum suppression  ###################
        ######################################################

        # Implementing non-maximum suppression of given bounding boxes
        # With this technique we exclude some of bounding boxes if their
        # corresponding confidences are low or there is another
        # bounding box for this region with higher confidence
        results = cv2.dnn.NMSBoxes(bounding_boxes, confidences,
                                   probability_minimum, threshold)

        ######################################################
        ########    Drawing bounding boxes and labels  #####
        ######################################################

        if len(results) > 0:
            # Going through indexes of the boxes that were kept
            # np.array(...) makes flatten() available whatever
            # container type NMSBoxes returned
            for i in np.array(results).flatten():
                # Getting current bounding box coordinates,
                # its width and height
                x_min, y_min = bounding_boxes[i][0], bounding_boxes[i][1]
                box_width, box_height = bounding_boxes[i][2], bounding_boxes[i][3]

                # Preparing colour for current bounding box
                # and converting from numpy array to list
                colour_box_current = colours[class_numbers[i]].tolist()

                # Drawing bounding box on the original current frame
                cv2.rectangle(frame, (x_min, y_min),
                              (x_min + box_width, y_min + box_height),
                              colour_box_current, 2)

                # Preparing text with label and confidence for current bounding box
                text_box_current = '{}: {:.4f}'.format(labels[int(class_numbers[i])],
                                                       confidences[i])

                # The label normally goes just above the box
                # When the box touches the top of the frame there is
                # no room above it, so the label goes inside the box
                text_y = y_min - 5
                if text_y < 12:
                    text_y = max(y_min, 0) + 15

                # Putting text with label and confidence on the original image
                cv2.putText(frame, text_box_current, (x_min, text_y),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, colour_box_current, 2)

        ####################################################################
        ##############       Writing processed frame into the file   ######
        ###################################################################

        # Initializing writer
        # we do it only once from the very beginning
        # when we get spatial dimensions of the frames
        if writer is None:
            # Constructing code of the codec
            # to be used in the function VideoWriter
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')

            # Pay attention! If you're using Windows, yours path might looks like:
            # r'videos\result-traffic-cars.mp4'
            # or:
            # 'videos\\result-traffic-cars.mp4'
            writer = cv2.VideoWriter('result-vid.mp4', fourcc, fps_output,
                                     (frame.shape[1], frame.shape[0]), True)

            # VideoWriter does not raise when it cannot be opened
            # (e.g. codec not available), so we check it explicitly
            if not writer.isOpened():
                raise IOError("Could not open 'result-vid.mp4' for writing")

        # Write processed current frame to the file
        writer.write(frame)
except BaseException:
    # Releasing resources before passing the error on
    video.release()
    if writer is not None:
        writer.release()
    raise
    
    
#######################################################
#########   Summary and releasing resources  ##########
######################################################

# Showing totals for the whole run
print()
print('Total number of frames', f)
print('Total amount of time {:.5f} seconds'.format(t))

# Average number of frames per second of forward-pass time
# It cannot be computed when no time was accumulated
# (e.g. no frame could be read from the input video)
if t > 0:
    print('FPS:', round((f / t), 1))
else:
    print('FPS:', 'n/a')


# Releasing video reader and writer
# The writer is still None when no frame was written
video.release()
if writer is not None:
    writer.release()

Frame number 1 took 1.04158 seconds
Frame number 2 took 0.55053 seconds
Frame number 3 took 0.55348 seconds
Frame number 4 took 0.54455 seconds
Frame number 5 took 0.54554 seconds
Frame number 6 took 0.58045 seconds
Frame number 7 took 0.53656 seconds
Frame number 8 took 0.61835 seconds
Frame number 9 took 0.61735 seconds
Frame number 10 took 0.62333 seconds
Frame number 11 took 0.56746 seconds
Frame number 12 took 0.56748 seconds
Frame number 13 took 0.58643 seconds
Frame number 14 took 0.59241 seconds
Frame number 15 took 0.60540 seconds
Frame number 16 took 0.60938 seconds
Frame number 17 took 0.59740 seconds
Frame number 18 took 0.55452 seconds
Frame number 19 took 0.57544 seconds
Frame number 20 took 0.53158 seconds
Frame number 21 took 0.54953 seconds
Frame number 22 took 0.54454 seconds
Frame number 23 took 0.57746 seconds
Frame number 24 took 0.55847 seconds
Frame number 25 took 0.54754 seconds
Frame number 26 took 0.63131 seconds
Frame number 27 took 0.59142 seconds
Frame numb