In [1]:
import cv2
from ultralytics import YOLO 

# Open the input video. Frames are read only inside the processing loop below,
# so every decoded frame is processed exactly once.
cap = cv2.VideoCapture("path/to/your/video")
if not cap.isOpened():
    print("Warning: could not open the video source; no frames will be processed.")

# Running number assigned to each drawn object within the current frame
index = 0

# Number of detections above the threshold in the current frame
count = 0

# Total number of objects counted in the current frame
object_count = 0

# Load the YOLO model
model = YOLO("yolov8s.pt")
# Mapping of class id -> class name taken from the model
names = model.model.names
# A detection is counted and drawn only if its score is strictly above this value
threshold = 0.5

# Classes shown in the on-screen summary line, in display order.
# A tuple (rather than a set) keeps the order of the summary stable between runs.
display_classes = ("PERSON", "BICYCLE", "CAR", "MOTORCYCLE", "BUS", "TRUCK")

# Per-class counters keyed by the upper-cased class name; every class starts at 0,
# e.g. {'PERSON': 0, 'BICYCLE': 0, 'CAR': 0, 'MOTORCYCLE': 0, ...}
class_counts = {names[class_id].upper(): 0 for class_id in names.keys()}

try:
    while True:
        # Read exactly one frame per iteration and stop as soon as the stream
        # ends, so that an empty frame never reaches the model or the drawing calls.
        ret, frame = cap.read()
        if not ret:
            break

        # The model returns a list of results; [0] selects the result for this frame.
        results = model(frame)[0]

        # Reset the per-frame counters
        count = 0
        object_count = 0
        index = 0
        for class_name in class_counts:
            class_counts[class_name] = 0

        # results.boxes.data is converted to a list of rows, one per detection:
        # [x1, y1, x2, y2, score, class_id]
        for detection in results.boxes.data.tolist():
            x1, y1, x2, y2, score, class_id = detection

            if score <= threshold:
                continue

            # Keys of class_counts are the upper-cased class names
            class_name = names[int(class_id)].upper()

            count += 1
            index += 1
            object_count += 1
            class_counts[class_name] += 1

            x_center = int((x1 + x2) / 2)
            y_center = int((y1 + y2) / 2)

            # Bounding box
            cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 0, 255), 2)
            # Point at the center of the detected object
            cv2.circle(frame, (x_center, y_center), 3, (0, 0, 255), thickness=cv2.FILLED)
            # Per-frame number and class name of the detected object
            cv2.putText(frame, f"{index}: {class_name}", (int(x1), int(y1 - 10)), cv2.FONT_HERSHEY_SIMPLEX, 1.3, (0, 0, 255), 3, cv2.LINE_AA)

        # Build the summary line from the classes of interest only; class_counts
        # itself keeps the counters for every class the model knows.
        text = "".join(
            f"{class_name}: {class_counts[class_name]} "
            for class_name in display_classes
            if class_name in class_counts
        )

        # Black banner plus per-class counts.
        # NOTE: fixed pixel coordinates; on frames less than about 700 px high
        # this banner lies partly or wholly outside the image.
        cv2.rectangle(frame, (5, 660), (1400, 690), (0, 0, 0), 80)
        cv2.putText(frame, text, (15, 675), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        # Black banner plus the total number of objects in this frame
        cv2.rectangle(frame, (8, 15), (350, 25), (0, 0, 0), 20)
        cv2.putText(frame, f"Total object count: {object_count}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        cv2.imshow("Video", frame)

        # Press 'q' in the video window to stop early
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the capture and close the window, including when the cell is
    # interrupted or an exception is raised inside the loop.
    cap.release()
    cv2.destroyAllWindows()




0: 384x640 8 cars, 1 umbrella, 219.5ms
Speed: 4.0ms preprocess, 219.5ms inference, 6.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 cars, 1 umbrella, 128.7ms
Speed: 17.4ms preprocess, 128.7ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 cars, 1 umbrella, 1 frisbee, 130.0ms
Speed: 1.7ms preprocess, 130.0ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 cars, 1 umbrella, 1 frisbee, 143.1ms
Speed: 4.2ms preprocess, 143.1ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 cars, 1 umbrella, 1 frisbee, 206.4ms
Speed: 1.8ms preprocess, 206.4ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 cars, 1 frisbee, 116.6ms
Speed: 1.6ms preprocess, 116.6ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 cars, 1 frisbee, 142.8ms
Speed: 1.4ms preprocess, 142.8ms inference, 9.5ms postprocess per image at shape (1, 3, 384, 640)

0

KeyboardInterrupt: 