In [None]:
import cv2
import torch
from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort
import numpy as np
from time import time

# Person detection + tracking demo: run YOLO on every frame of a video, feed
# the "person" detections to DeepSORT, draw the confirmed tracks and a
# per-frame processing-rate overlay, then write/show the annotated frames.

# Report which device torch sees. The value is only printed here; it is not
# passed to the model or the tracker below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Đang sử dụng:", device)  # runtime message (Vietnamese for "Using:")

model = YOLO("yolo12s.pt")

# NOTE(review): max_age is presumably the number of consecutive missed frames
# before a track is dropped -- confirm against the deep_sort_realtime docs.
tracker = DeepSort(max_age=5)

video_path = "6387-191695740.mp4"  # Change to your video file path
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Fail loudly instead of silently producing an empty output file.
    raise IOError(f"Cannot open video file: {video_path}")

width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)
if fps <= 0:
    # The source did not report a usable frame rate; fall back to a sane
    # default so the writer is not created with a zero/negative rate.
    fps = 30.0

out = cv2.VideoWriter(
    "output_deepsort.mp4",
    cv2.VideoWriter_fourcc(*'mp4v'),
    fps,
    (width, height),
)

try:
    while cap.isOpened():
        # Start of the per-frame timing window (read + detect + track + draw).
        prev_time = time()
        ret, frame = cap.read()
        if not ret:
            break

        results = model(frame)[0]
        annotated_frame = frame.copy()

        # Build the tracker input: ([left, top, width, height], conf, label).
        detections = []
        for box in results.boxes:
            cls_id = int(box.cls[0])
            if cls_id != 0:
                # Only class id 0 is kept; it is labelled "person" below.
                continue

            conf = float(box.conf[0])
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            detections.append(([x1, y1, x2 - x1, y2 - y1], conf, "person"))

        tracks = tracker.update_tracks(detections, frame=frame)

        for track in tracks:
            if not track.is_confirmed():
                continue
            track_id = track.track_id
            x1, y1, x2, y2 = map(int, track.to_ltrb())

            label = f"ID: {track_id}"
            cv2.rectangle(annotated_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            # Clamp the label baseline so it stays visible for boxes that
            # touch the top edge of the frame.
            cv2.putText(annotated_frame, label, (x1, max(y1 - 10, 15)),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

        curr_time = time()
        elapsed = curr_time - prev_time
        # Guard against a zero interval (coarse clock resolution) which would
        # otherwise raise ZeroDivisionError.
        fps_process = 1 / elapsed if elapsed > 0 else 0

        text = f"FPS: {fps_process:.2f}"
        cv2.putText(annotated_frame, text, (width - 150, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

        out.write(annotated_frame)
        cv2.imshow("YOLOv12 + DeepSORT", annotated_frame)

        # Press 'q' in the preview window to stop early.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always release the capture/writer and close the window, even when the
    # loop is interrupted or an exception is raised mid-stream, so the output
    # file is finalized.
    cap.release()
    out.release()
    cv2.destroyAllWindows()


Đang sử dụng: cuda

0: 384x640 26 persons, 1 frisbee, 75.2ms
Speed: 2.5ms preprocess, 75.2ms inference, 74.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 23 persons, 3 frisbees, 22.9ms
Speed: 2.4ms preprocess, 22.9ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 25 persons, 3 frisbees, 21.0ms
Speed: 1.7ms preprocess, 21.0ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 25 persons, 1 backpack, 3 frisbees, 17.3ms
Speed: 1.9ms preprocess, 17.3ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 24 persons, 2 frisbees, 17.9ms
Speed: 1.5ms preprocess, 17.9ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 27 persons, 1 frisbee, 17.3ms
Speed: 1.7ms preprocess, 17.3ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 26 persons, 1 backpack, 2 frisbees, 18.2ms
Speed: 1.8ms preprocess, 18.2ms inference, 3.4ms postprocess per image at s