In [1]:
from ultralytics import YOLO
from boxmot import DeepOCSORT
from pathlib import Path
from deep_sort_realtime.deepsort_tracker import DeepSort
import cv2
import numpy as np

In [4]:
model = YOLO("best_100.pt")

In [2]:
# Initialize SORT tracker
deepsort = DeepSort(max_age=30, nn_budget=70, nms_max_overlap=1.0)

### deepsort tracker

In [31]:
def draw_boxes(img, bbox, offset=(0, 0)):
    x1, y1, x2, y2 = [int(i) for i in bbox]
    x1 += offset[0]
    y1 += offset[1]
    x2 += offset[0]
    y2 += offset[1]
    color = (0, 255, 0)
    label = f'{1}'
    cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)
    cv2.putText(img, label, (x1, y1-10), cv2.FONT_HERSHEY_SIMPLEX, 0.75, color, 2)
    return img

# Open video file or capture device
cap = cv2.VideoCapture('kickvideo.mp4')  # or cap = cv2.VideoCapture(0) for webcam

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    
    # YOLO detection
    result = model(frame)
    
    detections = []
    for idx in range(len(result[0].boxes.cls)):
        # convert boxes to deepsort ltwh
        boxes = result[0].boxes.xyxy[idx].cpu().numpy()
        boxes[2] = boxes[2] - boxes[0]
        boxes[3] = boxes[3] - boxes[1]
        # add directions
        detections.append(
            [boxes.tolist(), result[0].boxes.conf[idx].cpu().item(), result[0].boxes.cls[idx].cpu().item()]
        )
    detections = np.array(detections)
    if len(detections) != 0:
        # Tracker update

        tracks = deepsort.update_tracks(detections, frame=frame)
        
        # Visualization
        for track in tracks:
            if not track.is_confirmed():
                continue
            else:
                frame = draw_boxes(frame, track.to_ltrb())
    
    cv2.imshow('Tracking', frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()



0: 256x448 (no detections), 15.5ms
Speed: 1.7ms preprocess, 15.5ms inference, 1.4ms postprocess per image at shape (1, 3, 256, 448)

0: 256x448 (no detections), 28.4ms
Speed: 1.7ms preprocess, 28.4ms inference, 1.5ms postprocess per image at shape (1, 3, 256, 448)

0: 256x448 (no detections), 30.8ms
Speed: 1.7ms preprocess, 30.8ms inference, 2.5ms postprocess per image at shape (1, 3, 256, 448)

0: 256x448 (no detections), 37.0ms
Speed: 2.6ms preprocess, 37.0ms inference, 1.7ms postprocess per image at shape (1, 3, 256, 448)

0: 256x448 (no detections), 18.6ms
Speed: 1.5ms preprocess, 18.6ms inference, 1.2ms postprocess per image at shape (1, 3, 256, 448)

0: 256x448 (no detections), 25.4ms
Speed: 1.4ms preprocess, 25.4ms inference, 2.8ms postprocess per image at shape (1, 3, 256, 448)

0: 256x448 (no detections), 16.2ms
Speed: 2.8ms preprocess, 16.2ms inference, 0.9ms postprocess per image at shape (1, 3, 256, 448)

0: 256x448 (no detections), 8.6ms
Speed: 1.2ms preprocess, 8.6ms inf

  detections = np.array(detections)


0: 256x448 1 0, 8.7ms
Speed: 2.0ms preprocess, 8.7ms inference, 3.3ms postprocess per image at shape (1, 3, 256, 448)

0: 256x448 1 0, 7.1ms
Speed: 1.3ms preprocess, 7.1ms inference, 3.5ms postprocess per image at shape (1, 3, 256, 448)

0: 256x448 1 0, 6.6ms
Speed: 1.4ms preprocess, 6.6ms inference, 1.4ms postprocess per image at shape (1, 3, 256, 448)

0: 256x448 1 0, 7.2ms
Speed: 1.6ms preprocess, 7.2ms inference, 1.1ms postprocess per image at shape (1, 3, 256, 448)

0: 256x448 1 0, 8.0ms
Speed: 1.7ms preprocess, 8.0ms inference, 3.1ms postprocess per image at shape (1, 3, 256, 448)

0: 256x448 1 0, 9.3ms
Speed: 1.4ms preprocess, 9.3ms inference, 1.2ms postprocess per image at shape (1, 3, 256, 448)

0: 256x448 1 0, 7.9ms
Speed: 1.4ms preprocess, 7.9ms inference, 1.4ms postprocess per image at shape (1, 3, 256, 448)

0: 256x448 1 0, 7.4ms
Speed: 1.3ms preprocess, 7.4ms inference, 1.8ms postprocess per image at shape (1, 3, 256, 448)

0: 256x448 1 0, 6.4ms
Speed: 1.2ms preprocess, 6

### boxmot trackers

In [5]:
# tracking with boxmot

tracker = DeepOCSORT(
    model_weights=Path('osnet_x0_25_msmt17.pt'), # which ReID model to use
    device='cuda:0',
    fp16=False,
)

# Open video file or capture device
cap = cv2.VideoCapture('kickvideo.mp4')  # or cap = cv2.VideoCapture(0) for webcam

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    
    # YOLO detection
    result = model(frame)
    
    detections = []
    for idx in range(len(result[0].boxes.cls)):
        boxes = result[0].boxes.xyxy[idx].cpu().numpy()
        detections.append(
            [*boxes.tolist(), result[0].boxes.conf[idx].cpu().item(), result[0].boxes.cls[idx].cpu().item()]
        )
    detections = np.array(detections)
    if len(detections) != 0:
        # Tracker update
        tracks = tracker.update(detections, frame)
        tracker.plot_results(frame, show_trajectories=False)
    
    cv2.imshow('Tracking', frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


[32m2024-06-09 22:42:57.593[0m | [1mINFO    [0m | [36mboxmot.utils.torch_utils[0m:[36mselect_device[0m:[36m52[0m - [1mYolo Tracking v10.0.71 🚀 Python-3.10.12 torch-2.2.2+cu121
CUDA:0 (NVIDIA GeForce RTX 3080, 10240MiB)[0m
[32m2024-06-09 22:42:57.792[0m | [32m[1mSUCCESS [0m | [36mboxmot.appearance.reid_model_factory[0m:[36mload_pretrained_weights[0m:[36m207[0m - [32m[1mSuccessfully loaded pretrained weights from "osnet_x0_25_msmt17.pt"[0m



0: 256x448 (no detections), 12.6ms
Speed: 2.1ms preprocess, 12.6ms inference, 0.7ms postprocess per image at shape (1, 3, 256, 448)

0: 256x448 (no detections), 30.6ms
Speed: 2.1ms preprocess, 30.6ms inference, 1.3ms postprocess per image at shape (1, 3, 256, 448)

0: 256x448 (no detections), 17.9ms
Speed: 1.5ms preprocess, 17.9ms inference, 1.1ms postprocess per image at shape (1, 3, 256, 448)

0: 256x448 (no detections), 6.2ms
Speed: 0.9ms preprocess, 6.2ms inference, 1.0ms postprocess per image at shape (1, 3, 256, 448)

0: 256x448 (no detections), 6.8ms
Speed: 0.9ms preprocess, 6.8ms inference, 0.6ms postprocess per image at shape (1, 3, 256, 448)

0: 256x448 (no detections), 7.6ms
Speed: 1.1ms preprocess, 7.6ms inference, 0.5ms postprocess per image at shape (1, 3, 256, 448)

0: 256x448 (no detections), 6.8ms
Speed: 1.2ms preprocess, 6.8ms inference, 0.6ms postprocess per image at shape (1, 3, 256, 448)

0: 256x448 (no detections), 7.4ms
Speed: 1.6ms preprocess, 7.4ms inference, 