In [56]:
import time

from bytetracker import BYTETracker
from bytetracker.basetrack import BaseTrack
import cv2
from ultralytics import YOLO
import numpy as np
import pandas as pd

In [57]:
# Build the detector from the "yolov8n.pt" weights file, configured for the "detect" task.
model = YOLO("yolov8n.pt", task="detect")
# Kept as the last expression so the model summary (layers, parameters,
# gradients, GFLOPs) is shown as this cell's output.
model.info()

YOLOv8n summary: 129 layers, 3,157,200 parameters, 0 gradients, 8.9 GFLOPs


(129, 3157200, 0, 8.8575488)

In [58]:
# Tracker instance built with the library's default arguments.
tracker = BYTETracker()
# Reset the class-level counter on BaseTrack.
# NOTE(review): presumably this makes track ids start over when the cell is
# re-run in the same kernel — confirm against the bytetracker implementation.
BaseTrack._count = 0

In [59]:
def draw_all_bbox_on_image(image, tracking_objects: np.ndarray):
    """Draw one rectangle and one text label per tracked object.

    Args:
        image: The image handed to ``cv2.rectangle`` and ``cv2.putText``.
        tracking_objects: Rows of exactly seven values each:
            ``[x1, y1, x2, y2, track_id, class_id, conf]``.
            The class id is read but not displayed.

    Returns:
        The ``image`` object that was passed in (no copy is made here).
    """
    box_color = (255, 0, 0)
    text_color = (0, 255, 0)

    for left, top, right, bottom, track_id, _class_id, conf in tracking_objects:
        # Coordinates are truncated to ints before being passed to OpenCV.
        top_left = (int(left), int(top))
        bottom_right = (int(right), int(bottom))
        cv2.rectangle(image, top_left, bottom_right, box_color, 2)

        caption = f"{int(track_id)} ({conf:.2f})"
        # The label is anchored 10 px above the box's top-left corner.
        caption_origin = (top_left[0], top_left[1] - 10)
        cv2.putText(image, caption, caption_origin, 0, 1, text_color, 2)

    return image


def yolo_results_to_bytetrack_format(detections):
    """Pack the boxes of one YOLO result into a ByteTrack detection array.

    Args:
        detections: A single result object (not a list) that exposes
            ``numpy()``; the converted object must provide
            ``boxes.xyxyn``, ``boxes.conf`` and ``boxes.cls``.

    Returns:
        A 2-D ``np.ndarray`` with one row per box and the columns
        ``[x1, y1, x2, y2, conf, cls]``. The coordinates are taken from
        ``boxes.xyxyn`` unchanged, so callers must rescale them to pixels
        themselves if they need pixel coordinates.
    """
    # Convert the result object once and reuse it; the previous version
    # repeated the full conversion for every attribute it read.
    boxes = detections.numpy().boxes
    xyxyn = boxes.xyxyn

    return np.stack(
        [
            xyxyn[:, 0],
            xyxyn[:, 1],
            xyxyn[:, 2],
            xyxyn[:, 3],
            boxes.conf,
            boxes.cls,
        ],
        axis=1,
    )


def scale_bbox_as_xyxy(bbox: np.ndarray, target_img_size: tuple):
    """Multiply a box's x coordinates by the width and y coordinates by the height.

    Args:
        bbox: Exactly four values in the order ``[x1, y1, x2, y2]``.
        target_img_size: Two values in the order ``(height, width)``.

    Returns:
        A new ``np.ndarray`` ``[x1 * width, y1 * height, x2 * width, y2 * height]``.
    """
    left, top, right, bottom = bbox
    height, width = target_img_size
    return np.array(
        [
            left * width,
            top * height,
            right * width,
            bottom * height,
        ]
    )

In [60]:
# Live demo: read frames from capture device 0, detect class 0 with the YOLO
# model, feed the detections to the tracker once per frame, and draw the
# tracked boxes until "q" is pressed in the preview window.
TRACK_COLUMNS = ["frame_id", "x1", "y1", "x2", "y2", "track_id", "class", "confidence"]
BOX_COLUMNS = ["x1", "y1", "x2", "y2"]

cap = cv2.VideoCapture(0)
frame_id = 0

try:
    if not cap.isOpened():
        # Without this check a missing camera makes the read loop spin forever.
        raise RuntimeError("Could not open video capture device 0")

    while True:
        ret, frame = cap.read()
        if not ret:
            # No frame was delivered; back off briefly and retry.
            time.sleep(0.01)
            continue

        frame_id += 1

        # predict() is given a single frame, so [0] is the result for that frame.
        detections = model.predict(frame, classes=0, conf=0.25, verbose=False)[0]
        formatted = yolo_results_to_bytetrack_format(detections)

        # Update the tracker exactly once per frame. The earlier version looped
        # over the individual detections and passed the *full* detection set to
        # tracker.update on every iteration, advancing the tracker several
        # times per frame and drawing each track repeatedly.
        # NOTE(review): as before, frames without detections skip the update;
        # confirm whether the tracker accepts an empty array so lost tracks
        # can age on empty frames too.
        if len(formatted) > 0:
            tracked = tracker.update(formatted, frame_id)

            if len(tracked) > 0:
                # Prefix every track row with the current frame number.
                tracked_objects = np.insert(tracked, 0, frame_id, axis=1)
                df_tracked = pd.DataFrame(tracked_objects, columns=TRACK_COLUMNS)

                # The tracker was fed xyxyn boxes, so scale them to the
                # frame's (height, width) before drawing.
                df_tracked[BOX_COLUMNS] = df_tracked[BOX_COLUMNS].apply(
                    lambda row: scale_bbox_as_xyxy(row.to_numpy(), detections.orig_shape),
                    axis=1,
                    result_type="expand",
                )

                for _, row in df_tracked.iterrows():
                    x1, y1, x2, y2 = (int(row[col]) for col in BOX_COLUMNS)
                    # The id is stored as a float in the array; show it as an integer.
                    label = f"ID: {int(row['track_id'])} Conf: {row['confidence']:.2f}"
                    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                    cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        cv2.imshow("Recognition", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the window, including when the kernel
    # is interrupted or an exception is raised inside the loop.
    cap.release()
    cv2.destroyAllWindows()