# Experiment with OpenCV's YOLO on a video
Erik Matovič and Jakub Horvat

Spracovanie videa použitím OpenCV YOLOv3:

In [2]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
from typing import Tuple, Any

cv2.__version__

'4.6.0'

In [4]:
def get_cap_out(video_path:str, out_root:str='..', start_idx:int=15) -> Tuple[cv2.VideoCapture,
                                                                              cv2.VideoWriter]:
    """
    Open a video for reading and create a matching MJPG ``.avi`` writer.

    The output file name is built as
    ``out_root + video_path[start_idx:<extension start>] + '.avi'``, i.e. the
    first ``start_idx`` characters of ``video_path`` and its file extension are
    dropped and the remainder is appended to ``out_root`` verbatim (no path
    separator is inserted).

    :param video_path:  path of the input video
    :param out_root:    prefix (typically the output folder) of the output path
    :param start_idx:   number of leading characters of ``video_path`` to skip
                        when deriving the output name
    returns: cv2.VideoCapture, cv2.VideoWriter
    """
    # Local import keeps this cell runnable without touching the import cell.
    import os

    # load video
    cap = cv2.VideoCapture(video_path)

    # The capture reports its properties as floats; the writer needs ints.
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Strip the real extension instead of assuming it is exactly 4 characters
    # long ('.mp4'); for 3-letter extensions the result is unchanged.
    ext_len = len(os.path.splitext(video_path)[1])
    stem = video_path[start_idx:len(video_path) - ext_len]
    out_path = out_root + stem + '.avi'

    # make video writer (MJPG codec, fixed 10 fps, same frame size as the input)
    fourcc = cv2.VideoWriter_fourcc('M','J','P','G')
    out = cv2.VideoWriter(out_path, fourcc, 10, (frame_width, frame_height))
    return cap, out

In [3]:
# Sanity check of the PyTorch install and its CUDA visibility.
import torch 
# The next two expressions are evaluated, but a notebook cell only echoes its
# last expression, so their values are not shown in the output.
torch.__version__
torch.cuda.is_available()
# Name of CUDA device 0 -- this is the value displayed as the cell output.
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 3060 Laptop GPU'

In [4]:
# Display the installed OpenCV version (cv2 is imported again in this cell).
import cv2
cv2.__version__

'4.6.0'

In [5]:
# Display the integer value of the CUDA DNN backend constant.
cv2.dnn.DNN_BACKEND_CUDA

5

In [6]:
# Display the integer value of the CUDA DNN target constant.
cv2.dnn.DNN_TARGET_CUDA

6

In [3]:
import cv2
import numpy as np

def _decode_detections(layer_outputs, width, height, conf_threshold, target_class_id=0):
    """
    Turn raw YOLO output rows into pixel-space boxes for a single class.

    Each row is read as ``[cx, cy, w, h, <unused>, score_0, score_1, ...]`` where
    the first four values are scaled by the frame ``width``/``height``.  A row
    is kept when its best-scoring class is ``target_class_id`` and that score
    exceeds ``conf_threshold``.

    :param layer_outputs:    iterable of 2-D arrays returned by ``net.forward``
    :param width:            frame width in pixels
    :param height:           frame height in pixels
    :param conf_threshold:   minimum class score for a detection to be kept
    :param target_class_id:  class index to keep
    returns: (boxes, confidences, class_ids) as parallel lists; boxes are
             ``[x, y, w, h]`` with ``(x, y)`` the top-left corner
    """
    boxes, confidences, class_ids = [], [], []
    for output in layer_outputs:
        for detection in output:
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if confidence > conf_threshold and class_id == target_class_id:
                center_x = int(detection[0] * width)
                center_y = int(detection[1] * height)
                w = int(detection[2] * width)
                h = int(detection[3] * height)
                # Convert centre-based coordinates to the top-left corner.
                x = int(center_x - w / 2)
                y = int(center_y - h / 2)
                boxes.append([x, y, w, h])
                confidences.append(float(confidence))
                class_ids.append(class_id)
    return boxes, confidences, class_ids


# Load YOLO model and ask OpenCV to run it on the CUDA backend/target
net = cv2.dnn.readNet("../yolov3.weights", "../yolov3.cfg")
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)
# The output layer names do not change between frames, so query them once.
output_layers = net.getUnconnectedOutLayersNames()

# Define the labels of the classes (one label per line)
with open('../yolov3.txt', 'r') as f:
    classes = [line.strip() for line in f]

# Define the minimum confidence threshold and the non-maximum suppression threshold
conf_threshold = 0.5
nms_threshold = 0.4

# Network input size passed to blobFromImage
inp_size = (416, 416)

# Only detections of this class index are kept
# (presumably "person" in ../yolov3.txt -- TODO confirm).
target_class_id = 0

# Stop after this many processed frames
max_frames = 10

# Per-class random colours (currently unused: boxes and labels are drawn in green)
colors = np.random.uniform(0, 255, size=(len(classes), 3))
box_color = (0, 255, 0)

# Pedestrian tracker; it is created but never initialised or updated in this cell
tracker = cv2.TrackerCSRT_create()

# First detected bounding box (kept for a future tracker initialisation)
bbox = None

# Open the video file
cap = cv2.VideoCapture('../PIE_data/PIE_clips/set01/video_0001.mp4')
# convert the resolutions from float to integer.
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
# NOTE(review): the MJPG fourcc is paired with an '.mp4' file name here, and no
# frame is ever written to this writer below -- confirm the intended container
# and frame size before enabling writing (the displayed frame is 600x400).
out = cv2.VideoWriter('../outputs/video_0001.mp4', cv2.VideoWriter_fourcc('M','J','P','G'), 10, (frame_width,frame_height))

counter = 0
try:
    while True:
        # Read a frame from the video
        ret, frame = cap.read()
        if not ret:
            break

        # Build the network input blob (scaled to [0, 1], BGR -> RGB, resized)
        height, width = frame.shape[:2]
        blob = cv2.dnn.blobFromImage(frame, 1/255, inp_size, swapRB=True, crop=False)

        # Pass the blob through the network
        net.setInput(blob)
        layer_outputs = net.forward(output_layers)

        # Decode the output of the network
        boxes, confidences, class_ids = _decode_detections(
            layer_outputs, width, height, conf_threshold, target_class_id)

        # Apply non-maximum suppression to remove overlapping bounding boxes
        indices = cv2.dnn.NMSBoxes(boxes, confidences, conf_threshold, nms_threshold)
        # Flatten so both flat and Nx1 index layouts (and the empty result) work.
        for i in np.array(indices).flatten():
            i = int(i)
            x, y, w, h = boxes[i]
            label = f"{classes[class_ids[i]]}: {confidences[i]:.2f}"
            cv2.rectangle(frame, (x, y), (x + w, y + h), box_color, 2)
            # The original referenced an undefined name `color` here, which
            # raises NameError on a fresh kernel; use the box colour instead.
            cv2.putText(frame, label, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, box_color, 2)

            # Remember the first surviving box as the tracker's initial bbox
            if bbox is None:
                bbox = (x, y, w, h)

        # Display the resulting frame (all pre-NMS boxes are printed for inspection)
        print(boxes)
        frame = cv2.resize(frame, (600, 400))
        cv2.imshow("Pedestrian detection", frame)
        # Wait for a key press before advancing to the next frame; 'q' quits
        if cv2.waitKey(0) & 0xFF == ord('q'):
            break

        counter += 1
        if counter >= max_frames:
            break
finally:
    # Always free the capture, the writer and the window, even on errors.
    cap.release()
    out.release()
    cv2.destroyAllWindows()

[[1, 694, 44, 176], [1, 720, 48, 154], [207, 734, 48, 143], [207, 745, 47, 137]]
[[0, 703, 44, 154], [3, 715, 44, 161], [206, 739, 49, 133], [1273, 745, 57, 117], [204, 747, 51, 136], [1272, 757, 66, 112]]
[[0, 694, 41, 168], [3, 713, 41, 167], [1275, 743, 47, 117], [205, 750, 48, 134], [1277, 761, 53, 105]]
[[0, 696, 39, 163], [3, 711, 37, 172], [1277, 746, 44, 116], [202, 746, 47, 137], [1279, 754, 47, 117]]
[[1548, 708, 45, 106], [1574, 707, 48, 102], [0, 694, 37, 170], [1, 707, 37, 180], [1272, 743, 48, 116], [201, 748, 49, 134]]
[[0, 693, 35, 171], [1, 712, 36, 169], [1273, 745, 50, 113], [192, 752, 53, 128], [201, 747, 48, 135]]
[[1571, 710, 53, 100], [-1, 699, 33, 159], [1, 713, 34, 168], [1274, 746, 46, 117], [191, 753, 54, 128], [199, 748, 50, 135]]
[[-2, 704, 32, 152], [0, 719, 33, 156], [1264, 746, 51, 120], [1273, 747, 45, 113]]
[[-1, 705, 29, 153], [0, 721, 31, 153], [188, 748, 52, 114], [1262, 744, 54, 123], [1272, 747, 45, 116], [187, 753, 55, 126], [1259, 757, 56, 112]]