In [1]:
# Real-time object detection on a camera video stream using YOLOv3/YOLOv4

import cv2
import numpy as np

# Path to the Darknet network configuration for YOLOv4
# (the file must be downloaded beforehand).
config_file = "path/to/yolov4.cfg"

# Alternative: Darknet network configuration for YOLOv3.
# config_file = "path/to/yolov3.cfg"

# Path to the pre-trained YOLOv4 weights
# (the file must be downloaded beforehand).
frozen_model = "path/to/yolov4.weights"

# Alternative: pre-trained YOLOv3 weights.
# frozen_model = "path/to/yolov3.weights"

# Path to the text file with one class name per line.
labels_file = "path/to/coco.names"

# Index of the capture device handed to cv2.VideoCapture.
CAMERA_INDEX = 1

# Class names; a detection's class id is later used as an index into this list.
with open(labels_file, "r") as f:
    labels = f.read().strip().split("\n")

# Build the detector network from the Darknet configuration and weights.
net = cv2.dnn.readNetFromDarknet(config_file, frozen_model)

# Names of the network's output layers (YOLOv3/4 has 3 of them).
output_layer_names = net.getUnconnectedOutLayersNames()

# Open the capture device and fail fast with a clear message: without this
# check a missing camera only shows up later as a loop that exits silently
# on its first read.
cap = cv2.VideoCapture(CAMERA_INDEX)
if not cap.isOpened():
    cap.release()
    raise RuntimeError(f"Could not open video capture device {CAMERA_INDEX}")


In [2]:
# In essence, the YOLO network returns an array of detections. From each detection we
# compute the top-left corner, width and height of its bounding box in the frame,
# and the probability that it matches one of the classes in the labels array.

In [3]:
# Main loop: grab frames from `cap`, run the YOLO network on each one, and
# show the frame with the surviving detections drawn on it until a read
# fails or the user presses 'q'.

# A detection is kept only when its best class score is strictly above this;
# the same value is passed to NMSBoxes as its score threshold.
CONFIDENCE_THRESHOLD = 0.5
# Overlap threshold passed to cv2.dnn.NMSBoxes.
NMS_THRESHOLD = 0.4
# Spatial size the frame is resized to when building the network input blob.
INPUT_SIZE = (416, 416)
# Colour used for boxes, centre dots and label text (green).
BOX_COLOR = (0, 255, 0)

try:
    while True:
        # ret   - whether a frame was read successfully
        # frame - the captured image
        ret, frame = cap.read()
        if not ret:
            # No frame available (e.g. the camera is not working): stop.
            break

        height, width = frame.shape[:2]

        # Scale pixel values by 1/255, resize to INPUT_SIZE and swap the
        # R and B channels to build the network input.
        blob = cv2.dnn.blobFromImage(
            frame, 1 / 255.0, INPUT_SIZE, swapRB=True, crop=False
        )
        net.setInput(blob)
        layer_outputs = net.forward(output_layer_names)

        boxes = []
        confidences = []
        class_ids = []

        for output in layer_outputs:
            # Columns 5+ of each row hold the per-class scores. Pick the best
            # class of every row at once instead of looping in Python over
            # rows that will be discarded anyway.
            scores = output[:, 5:]
            best_class = np.argmax(scores, axis=1)
            best_score = scores[np.arange(scores.shape[0]), best_class]

            for row in np.flatnonzero(best_score > CONFIDENCE_THRESHOLD):
                detection = output[row]
                # Columns 0-3 are the box centre and size relative to the
                # frame; convert them to pixels and to a top-left corner.
                center_x = int(detection[0] * width)
                center_y = int(detection[1] * height)
                w = int(detection[2] * width)
                h = int(detection[3] * height)
                x = int(center_x - w / 2)
                y = int(center_y - h / 2)
                boxes.append([x, y, w, h])
                confidences.append(float(best_score[row]))
                class_ids.append(int(best_class[row]))

        # Non-maximum suppression: keep the best box among overlapping ones.
        indexes = cv2.dnn.NMSBoxes(
            boxes, confidences, CONFIDENCE_THRESHOLD, NMS_THRESHOLD
        )

        # Depending on the OpenCV version the result may be an empty tuple,
        # an (N, 1) array or a flat array; flatten() normalises all of them.
        for i in np.asarray(indexes).flatten():
            x, y, w, h = boxes[i]
            label = labels[class_ids[i]]
            confidence = confidences[i]

            cv2.rectangle(frame, (x, y), (x + w, y + h), BOX_COLOR, 2)
            text = f"{label}: {confidence:.2f}"
            cv2.circle(
                frame,
                (int(x + w / 2), int(y + h / 2)),
                5,
                BOX_COLOR,
                thickness=cv2.FILLED,
            )
            # Keep the label inside the frame when the box touches the top
            # or left edge (otherwise the text origin would be off-screen).
            text_origin = (max(x, 0), max(y - 5, 15))
            cv2.putText(
                frame, text, text_origin, cv2.FONT_HERSHEY_SIMPLEX, 0.5, BOX_COLOR, 2
            )

        cv2.imshow("Object Detection", frame)

        # Quit when the user presses 'q'.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the window, even if the loop raised
    # or the kernel was interrupted.
    cap.release()
    cv2.destroyAllWindows()