### OpenCV + YOLOv4-tiny (COCO) for Object Detection in Video ###

In [1]:
import numpy as np
import time
import cv2
import matplotlib.pyplot as plt

### Setup Functions ###

In [2]:
def config_net(net_weights_path, net_config_path):
    """Load a network with OpenCV's dnn module and look up its output layers.

    Args:
        net_weights_path: Path to the trained weights file.
        net_config_path: Path to the network configuration file.

    Returns:
        A ``(net, output_layers)`` tuple: the network returned by
        ``cv2.dnn.readNet`` and the list of names of its unconnected
        output layers.
    """
    # Config net using cv2 (needs weights and config files)
    net = cv2.dnn.readNet(net_config_path, net_weights_path)
    layer_names = net.getLayerNames()

    # The ids are 1-based, hence the "- 1" below. Depending on the OpenCV
    # build they arrive as an (N, 1) or a flat (N,) array; flattening first
    # handles both, whereas indexing every entry with [0] raises on the
    # flat form.
    out_layer_ids = np.asarray(net.getUnconnectedOutLayers()).flatten()
    output_layers = [layer_names[layer_id - 1] for layer_id in out_layer_ids]

    return net, output_layers

In [3]:
def YOLO_processing(img, net, output_layers, input_size=(416, 416)):
    """Run one forward pass of the network on an image and time it.

    Args:
        img: Image/frame to feed to the network.
        net: Network object; it receives the blob through ``setInput`` and
            is evaluated with ``forward``.
        output_layers: Output layer names handed to ``net.forward``.
        input_size: Spatial size passed to ``cv2.dnn.blobFromImage``.
            Defaults to ``(416, 416)``, the value this function always used.

    Returns:
        A ``(layer_outputs, yolo_process_time)`` tuple: whatever
        ``net.forward`` returned, and the elapsed time in seconds for
        building the blob plus the forward pass.
    """
    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()

    # Modify img to input format: scale pixel values by 1/255, resize to
    # input_size without cropping, and swap the first and last channels.
    blob = cv2.dnn.blobFromImage(img, 1 / 255.0, input_size, swapRB=True, crop=False)

    # Pass to net:
    net.setInput(blob)
    layer_outputs = net.forward(output_layers)

    yolo_process_time = time.perf_counter() - start

    return layer_outputs, yolo_process_time

In [4]:
def _collect_candidates(layer_outputs, frame_w, frame_h, threshold):
    """Collect every detection whose best class score exceeds ``threshold``.

    Returns three parallel lists: boxes as ``[x, y, width, height]`` in
    pixels (``x``/``y`` is the top-left corner), the best class score of
    each box, and the index of that class.
    """
    boxes, confidence, class_ids = [], [], []
    scale = np.array([frame_w, frame_h, frame_w, frame_h])

    for output in layer_outputs:
        output = np.asarray(output)
        if output.shape[0] == 0:
            continue

        # Columns 5+ hold the per-class scores; keep the best one per row.
        class_scores = output[:, 5:]
        best_ids = np.argmax(class_scores, axis=1)
        best_scores = class_scores[np.arange(class_scores.shape[0]), best_ids]

        keep = best_scores > threshold
        if not keep.any():
            continue

        # Columns 0-3 are center x, center y, width, height relative to the
        # frame size; rescale to pixels and truncate to integers.
        scaled = (output[keep, 0:4] * scale).astype('int')
        center_x, center_y = scaled[:, 0], scaled[:, 1]
        widths, heights = scaled[:, 2], scaled[:, 3]

        # Convert the box center into the top-left corner.
        lefts = (center_x - widths / 2).astype('int')
        tops = (center_y - heights / 2).astype('int')

        boxes.extend(np.column_stack((lefts, tops, widths, heights)).tolist())
        confidence.extend(best_scores[keep].astype(float).tolist())
        class_ids.extend(best_ids[keep].tolist())

    return boxes, confidence, class_ids


def YOLO_detections(img, net, output_layers, labels, des_det, colors, threshold = 0.5, threshold_NMS = 0.3):
    """Detect objects in ``img`` and draw the boxes of the desired classes.

    Each detection row is read as: columns 0-3 = center x, center y, width,
    height (relative to the frame size), columns 5+ = per-class scores.
    Column 4 is not used by this function.

    Args:
        img: Frame to process, expected to be a 3-channel image (it is drawn
            on with 3-value colors). It is modified in place.
        net: Network forwarded to ``YOLO_processing``.
        output_layers: Output layer names forwarded to ``YOLO_processing``.
        labels: Sequence of class names indexed by class id.
        des_det: Collection of class names that should be drawn.
        colors: Per-class colors, indexed by class id.
        threshold: Minimum class score for a detection to be kept; also
            passed to ``cv2.dnn.NMSBoxes`` as its score threshold.
        threshold_NMS: Overlap threshold passed to ``cv2.dnn.NMSBoxes``.

    Returns:
        ``(img, boxes, confidence, yolo_process_time)``. ``boxes`` and
        ``confidence`` contain every candidate above ``threshold`` (before
        NMS and before the class filter), not only the boxes that were drawn.
    """
    (H, W) = img.shape[:2]

    layer_outputs, yolo_process_time = YOLO_processing(img, net, output_layers)

    boxes, confidence, IDclasses = _collect_candidates(layer_outputs, W, H, threshold)

    # Filtering detected object boxes with NMS (Non-maxima Suppression):
    objs = cv2.dnn.NMSBoxes(boxes, confidence, threshold, threshold_NMS) if boxes else ()

    # Draw Desired Boxes:
    if len(objs) > 0:
        # One single-channel scratch canvas, reused for every label, is
        # enough to measure where the label text lands on the frame.
        scratch = np.zeros((H, W), dtype=np.uint8)

        # np.asarray copes with both array and tuple results from NMSBoxes.
        for i in np.asarray(objs).flatten():
            label = labels[IDclasses[i]]

            # Filter Desired Object Classes to detect:
            if label not in des_det:
                continue

            x, y, w, h = boxes[i]

            # get obj class color:
            color = [int(c) for c in colors[IDclasses[i]]]
            text = "{}: {:.4f}".format(label, confidence[i])

            # Render the label (same font, scale and thickness as the
            # background pass always used) and take the bounding rect of
            # the pixels it touched inside the frame.
            scratch.fill(0)
            cv2.putText(scratch, text, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, 255, 2)
            fx, fy, fw, fh = cv2.boundingRect(scratch)

            # Draw obj bbox:
            cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)

            # When the label falls completely outside the frame the rect is
            # empty; skip the background so no stray blob is painted at (0, 0).
            if fw > 0 and fh > 0:
                cv2.rectangle(img, (fx, fy), (fx + fw, fy + fh), color, -1)
                cv2.rectangle(img, (fx, fy), (fx + fw, fy + fh), color, 3)

            cv2.putText(img, text, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,0), 1)

    return img, boxes, confidence, yolo_process_time

### Object Detection and Classification: ###

In [5]:
## Tiny YOLO info:
## Tiny YOLO (darknet): https://github.com/AlexeyAB/darknet
## https://githubmemory.com/repo/Tossy0423/yolov4-for-darknet_ros/issues/7
## Tiny YOLO is used here for faster processing.

In [8]:
# Get Yolo Net Architecture Weights and Configuration:
# Both paths are relative, so they are resolved against the current working directory.
net_weights_path = 'weights/yolov4-tiny.weights'
net_config_path = 'cfgs/yolov4-tiny.cfg'
# config_net returns the loaded network and the names of its output layers.
net, output_layers = config_net(net_weights_path, net_config_path)

In [9]:
# Sanity check: report how many output layers were found and list their names.
print(f'Total of Layers: {len(output_layers)}')
print(output_layers)

Total of Layers: 2
['yolo_30', 'yolo_37']


In [10]:
# Get coco class labels (one label per line of the names file).
# The context manager closes the file handle as soon as it has been read.
with open('cfgs/coco.names') as names_file:
    coco_labels = names_file.read().strip().split('\n')
print(coco_labels)
print(len(coco_labels))

['person', 'bicycle', 'car', 'motorbike', 'aeroplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'sofa', 'pottedplant', 'bed', 'diningtable', 'toilet', 'tvmonitor', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']
80


In [11]:
# Build one random 3-value color per class label for drawing bboxes.
# Seeding first makes every run assign the same color to the same class.
np.random.seed(0)
colors = np.random.randint(
    low=0,
    high=255,
    size=(len(coco_labels), 3),
    dtype=np.uint8,
)

In [12]:
# Define necessary variables:
# Minimum class score for a detection to be kept (passed to YOLO_detections).
threshold = 0.5
# Overlap threshold handed to non-maxima suppression (passed to YOLO_detections).
threshold_NMS = 0.3
# Only detections whose class label is in this list get drawn on the frame.
desired_detections = ['car', 'motorbike', 'bicycle']

In [14]:
# Load Video:
cap = cv2.VideoCapture('C:/Users/Mafeus/Desktop/Git_Repos/OpenCV/Testing Grounds/ztest_media/street_sample_video.mp4')
video_fps = cap.get(cv2.CAP_PROP_FPS)

if not cap.isOpened():
    # Otherwise the loop below would end on its first failed read without
    # any hint about what went wrong.
    print('Could not open the video source; nothing to process.')

# try/finally guarantees the window and the capture are released even when
# the loop is interrupted or a frame raises.
try:
    while True:

        # perf_counter is monotonic, which is what interval timing needs.
        start = time.perf_counter()

        success, frame = cap.read()

        # A failed read means the video ended (or could not be read): stop.
        if not success:
            break

        frame = cv2.resize(frame, (600,500))

        # Keyboard Controls:
        # Mask to the low byte so the comparison with ord() also works when
        # the backend sets higher bits in the returned key code.
        key = cv2.waitKey(1) & 0xFF

        # Press 'k' to stop playback.
        if key == ord('k'):
            break

        #########################################################################################################################

        frame, boxes, confidence, yolo_process_time = YOLO_detections(frame, net, output_layers, coco_labels, desired_detections,
                                                                        colors, threshold, threshold_NMS)

        #########################################################################################################################

        end = time.perf_counter()

        # The small constant keeps the division below away from zero.
        frame_time = (end - start) + 0.0001
        fps = np.floor(1/frame_time)

        # Processing faster than the source frame rate: wait only for the
        # remainder of the frame period so playback stays at video speed.
        # video_fps can be reported as 0, so guard the division.
        if video_fps > 0 and fps > video_fps:
            time.sleep(max(0.0, 1/video_fps - frame_time))
            fps = video_fps

        cv2.putText(frame, "FPS: {}".format(fps), (30,20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

        cv2.imshow("Output", frame)
finally:
    cv2.destroyAllWindows()
    cap.release()