In [3]:
import random
from time import time

import cv2
import numpy as np
import torch
from ultralytics import YOLO

from main import track_id
from sort import Sort


# Basic multi-object tracking (MOT) implemented with SORT (sort.py)


class ObjectDetection:
    """Real-time YOLOv8 detection on a video source, with SORT track IDs.

    Calling the instance opens ``capture_index`` with OpenCV, runs the model on
    every frame, feeds the detections to ``Sort`` and shows the annotated frames
    in a window until "q" is pressed or the source stops delivering frames.
    """

    def __init__(self, capture_index):
        """Store the capture source and load the detection model.

        Args:
            capture_index: Value handed to ``cv2.VideoCapture`` in ``__call__``.
        """
        self.capture_index = capture_index

        # 'mps' is the PyTorch backend name; the previous value 'msp' was a typo.
        # NOTE(review): this attribute is informational only -- it is not passed
        # to the model anywhere in this class.
        self.device = 'mps' if torch.backends.mps.is_available() else 'cpu'

        self.model = self.load_model()

        # Class names, indexed with int(class id) when labels are drawn.
        self.class_names = list(self.model.model.names.values())

    def load_model(self):
        """Load the ``yolov8m.pt`` weights and fuse the model layers.

        Returns:
            The ``YOLO`` model instance.
        """
        model = YOLO("yolov8m.pt")
        model.fuse()
        return model

    def predict(self, frame):
        """Run the model on one frame and return its results.

        ``verbose=False`` keeps the per-frame log lines out of the notebook
        output (consistent with the Deep SORT cell of this notebook).
        """
        return self.model(frame, verbose=False)

    def get_results(self, results):
        """Convert the first result of a prediction into a detection array.

        Args:
            results: Sequence returned by ``predict``; only ``results[0]`` is read.

        Returns:
            ``np.ndarray`` of shape ``(n, 6)`` with rows
            ``[x1, y1, x2, y2, confidence, class_id]``; ``(0, 6)`` when nothing
            was detected.
        """
        boxes = results[0].boxes
        if boxes is None:
            return np.empty((0, 6), dtype=np.float32)

        # Read all boxes at once instead of slicing the result per detection.
        xyxy = boxes.xyxy.cpu().numpy().reshape(-1, 4)
        confidence = boxes.conf.cpu().numpy().reshape(-1)
        class_id = boxes.cls.cpu().numpy().reshape(-1)

        return np.column_stack((xyxy, confidence, class_id))

    @staticmethod
    def _classes_for_tracks(track_boxes, detections):
        """Pick a class id for every tracked box by best IoU with the detections.

        The tracker output and the detection list are not row-aligned, so the
        class of a track is taken from the detection it overlaps most.

        Args:
            track_boxes: Array-like of ``[x1, y1, x2, y2]`` rows.
            detections: Array with ``[x1, y1, x2, y2, confidence, class_id]`` rows.

        Returns:
            Integer array with one class id per track; ``-1`` where no
            detection overlaps the track.
        """
        track_boxes = np.asarray(track_boxes, dtype=float).reshape(-1, 4)
        detections = np.asarray(detections, dtype=float)

        if len(track_boxes) == 0:
            return np.empty(0, dtype=int)
        if detections.ndim != 2 or len(detections) == 0 or detections.shape[1] < 6:
            return np.full(len(track_boxes), -1, dtype=int)

        det_boxes = detections[:, :4]

        # Pairwise intersection rectangles, shape (tracks, detections, 2).
        top_left = np.maximum(track_boxes[:, None, :2], det_boxes[None, :, :2])
        bottom_right = np.minimum(track_boxes[:, None, 2:4], det_boxes[None, :, 2:4])
        inter = np.clip(bottom_right - top_left, 0, None).prod(axis=2)

        track_area = (track_boxes[:, 2] - track_boxes[:, 0]) * (track_boxes[:, 3] - track_boxes[:, 1])
        det_area = (det_boxes[:, 2] - det_boxes[:, 0]) * (det_boxes[:, 3] - det_boxes[:, 1])
        union = track_area[:, None] + det_area[None, :] - inter

        iou = np.divide(inter, union, out=np.zeros_like(inter), where=union > 0)

        best = iou.argmax(axis=1)
        classes = detections[best, 5].astype(int)
        classes[iou[np.arange(len(best)), best] <= 0] = -1
        return classes

    @staticmethod
    def _fps(elapsed):
        """Return whole frames per second for ``elapsed`` seconds (0 if no time passed)."""
        return int(1 / elapsed) if elapsed > 0 else 0

    def _class_label(self, cls):
        """Return the class name for ``cls``, or "unknown" when it is out of range."""
        index = int(cls)
        if 0 <= index < len(self.class_names):
            return self.class_names[index]
        return "unknown"

    def draw_bounding_boxes_with_id(self, img, bboxes, ids, class_id):
        """Draw one rectangle and an "ID / Class" label per tracked object.

        Args:
            img: Frame to draw on (modified in place).
            bboxes: Box coordinates, ``[x1, y1, x2, y2]`` per row.
            ids: Unique track index of each box.
            class_id: Class id of each box.

        Returns:
            The same ``img`` object.
        """
        for bbox, id_, cls in zip(bboxes, ids, class_id):
            label = f"ID: {id_} Class: {self._class_label(cls)}"
            top_left = (int(bbox[0]), int(bbox[1]))
            bottom_right = (int(bbox[2]), int(bbox[3]))

            cv2.rectangle(img, top_left, bottom_right, (0, 0, 255), 2)
            cv2.putText(img, label, (int(bbox[0]), int(bbox[1] - 10)),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

        return img

    def __call__(self):
        """Run the capture -> detect -> track -> display loop.

        The loop ends when "q" is pressed or a frame cannot be read. The capture
        device and the OpenCV windows are released even if an error occurs.

        Raises:
            RuntimeError: If the video source cannot be opened.
        """
        cap = cv2.VideoCapture(self.capture_index)
        if not cap.isOpened():
            cap.release()
            raise RuntimeError(f"Could not open video source {self.capture_index!r}")
        cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
        cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

        # SORT settings (meanings as noted by the notebook author):
        # max_age - number of frames during which the algorithm remembers an object
        # min_hits - number of frames used to sort out undecided objects
        # iou_threshold - IoU threshold used for objects that are close to each other
        sort = Sort(max_age=100, min_hits=8, iou_threshold=0.50)

        try:
            while True:
                start_time = time()

                ret, frame = cap.read()
                if not ret:
                    # No frame delivered: leave the loop so cleanup still runs.
                    break

                results = self.predict(frame)
                detections_list = self.get_results(results)

                # The tracker is always updated, with an empty (0, 5) array when
                # nothing was detected.
                tracker_input = detections_list if len(detections_list) else np.empty((0, 5))
                res = sort.update(tracker_input)

                # Last column of the tracker output is the track id, the rest is the box.
                boxes_track = res[:, :-1]
                boxes_ids = res[:, -1].astype(int)
                # Tracker rows are not aligned with detection rows, so match by overlap.
                class_id = self._classes_for_tracks(boxes_track[:, :4], detections_list)

                frame = self.draw_bounding_boxes_with_id(frame, boxes_track, boxes_ids, class_id)

                fps = self._fps(time() - start_time)
                cv2.putText(frame, f'FPS: {fps}', (20, 70), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 255, 0), 2)

                cv2.imshow('YOLOv8 Detection', frame)

                if cv2.waitKey(1) & 0xFF == ord("q"):
                    break
        finally:
            cap.release()
            cv2.destroyAllWindows()
        
    
# Build the SORT-based detector for capture index 1 and start its
# capture/track/display loop; the loop exits when "q" is pressed.
detector = ObjectDetection(capture_index=1)
detector()


0: 288x512 (no detections), 288.3ms
Speed: 4.0ms preprocess, 288.3ms inference, 3.6ms postprocess per image at shape (1, 3, 288, 512)

0: 384x640 (no detections), 410.9ms
Speed: 1.9ms preprocess, 410.9ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 156.2ms
Speed: 2.1ms preprocess, 156.2ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)
Detected face boxes: []
No valid detections.

0: 288x512 1 face, 370.7ms
Speed: 29.4ms preprocess, 370.7ms inference, 3.3ms postprocess per image at shape (1, 3, 288, 512)

0: 384x640 3 persons, 335.8ms
Speed: 2.1ms preprocess, 335.8ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 132.7ms
Speed: 2.6ms preprocess, 132.7ms inference, 2.3ms postprocess per image at shape (1, 3, 384, 640)
Detected face boxes: [[648.5823974609375, 7.533645153045654, 1370.4390869140625, 734.0816040039062]]


TypeError: object of type 'float' has no len()

In [None]:
# Track only one class with SORT: people (class index 0).
# All other classes are only detected and drawn, without track IDs.

import torch
import numpy as np
from sort import Sort


class Detector:
    """YOLOv8 detector that tracks one class with SORT and only draws the rest.

    Detections of ``TRACKED_CLASS_ID`` are passed to ``Sort`` and drawn with
    their track id; every other detected class is drawn with its name only.
    """

    # Class index that is fed to the tracker (0 = humans, per the cell comment).
    TRACKED_CLASS_ID = 0

    def __init__(self, capture_index):
        """Store the capture source and load the detection model.

        Args:
            capture_index: Value handed to ``cv2.VideoCapture`` in ``__call__``.
        """
        self.capture_index = capture_index
        self.model = self.load_model()
        # Class names, indexed with int(class id) when labels are drawn.
        self.names = list(self.model.model.names.values())
        # NOTE(review): informational only -- not passed to the model in this class.
        self.device = 'mps' if torch.backends.mps.is_available() else 'cpu'

    def load_model(self):
        """Load the ``yolov8m.pt`` weights and fuse the model layers."""
        model = YOLO("yolov8m.pt")
        model.fuse()

        return model

    def predict(self, frame):
        """Run the model on one frame and return its results.

        ``verbose=False`` keeps the per-frame log lines out of the notebook
        output (consistent with the Deep SORT cell of this notebook).
        """
        return self.model(frame, verbose=False)

    def get_results(self, results):
        """Return the detections of the tracked class from ``results[0]``.

        Returns:
            ``np.ndarray`` of shape ``(n, 6)`` with rows
            ``[x1, y1, x2, y2, confidence, class_id]`` containing only rows whose
            class is ``TRACKED_CLASS_ID``; ``(0, 6)`` when there are none.
        """
        boxes = results[0].boxes
        if boxes is None:
            return np.empty((0, 6), dtype=np.float32)

        xyxy = boxes.xyxy.cpu().numpy().reshape(-1, 4)
        score = boxes.conf.cpu().numpy().reshape(-1)
        class_id = boxes.cls.cpu().numpy().reshape(-1)

        # Explicit boolean mask instead of relying on the truth value of a
        # one-element array.
        keep = class_id.astype(int) == self.TRACKED_CLASS_ID

        return np.column_stack((xyxy[keep], score[keep], class_id[keep]))

    @staticmethod
    def _fps(elapsed):
        """Return whole frames per second for ``elapsed`` seconds (0 if no time passed)."""
        return int(1 / elapsed) if elapsed > 0 else 0

    def draw(self, frame, bboxes, score, class_id):
        """Draw a rectangle and a "<score>:  <class name>" label for each box.

        Args:
            frame: Image to draw on (modified in place).
            bboxes: Box coordinates, ``[x1, y1, x2, y2]`` per row.
            score: Value printed in front of the class name; ``__call__`` passes
                the SORT track ids here.
            class_id: Class id of each box.

        Returns:
            The same ``frame`` object.
        """
        for bbox, conf, obj_id in zip(bboxes, score, class_id):
            label = f"{conf}:  {self.names[int(obj_id)]}"

            cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (0, 0, 255), 2)
            cv2.putText(frame, label, (int(bbox[0]), int(bbox[1]) - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
        return frame

    def draw_without_id(self, frame, results):
        """Draw every detection that is not of the tracked class, without an id.

        Each class gets a colour derived deterministically from its class id.

        Returns:
            The same ``frame`` object.
        """
        bboxes = results[0].boxes.xyxy.cpu().numpy().astype(int)
        class_id = results[0].boxes.cls.cpu().numpy().astype(int)

        for bbox, obj_id in zip(bboxes, class_id):
            name = self.names[int(obj_id)]

            if obj_id != self.TRACKED_CLASS_ID:
                # Local generator seeded with the class id: same colours as
                # seeding the module-level RNG, without resetting global state.
                rng = random.Random(int(obj_id))
                color = (rng.randint(0, 255), rng.randint(0, 255), rng.randint(0, 255))
                cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), color, 2)
                cv2.putText(frame, name, (int(bbox[0]), int(bbox[1]) - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

        return frame

    def __call__(self):
        """Run the capture -> detect -> track -> display loop.

        The loop ends when "q" is pressed or a frame cannot be read. The capture
        device and the OpenCV windows are released even if an error occurs.

        Raises:
            RuntimeError: If the video source cannot be opened.
        """
        cap = cv2.VideoCapture(self.capture_index)
        if not cap.isOpened():
            cap.release()
            raise RuntimeError(f"Could not open video source {self.capture_index!r}")
        # The resolution is requested before the first frame is read.
        cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
        cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

        sort = Sort(max_age=100, min_hits=8, iou_threshold=0.50)

        try:
            while True:
                start_time = time()
                ret, frame = cap.read()
                if not ret:
                    # No frame delivered: leave the loop so cleanup still runs.
                    break

                results = self.predict(frame)
                detections_list = self.get_results(results)

                # The tracker is always updated, with an empty (0, 5) array when
                # no object of the tracked class was detected.
                tracker_input = detections_list if len(detections_list) else np.empty((0, 5))
                res = sort.update(tracker_input)

                # Last column of the tracker output is the track id, the rest is the box.
                boxes_track = res[:, :-1]
                boxes_ids = res[:, -1].astype(int)
                # Only one class reaches the tracker, so every track has that class.
                class_id = np.full(len(boxes_ids), self.TRACKED_CLASS_ID)

                frame = self.draw(frame, boxes_track, boxes_ids, class_id)
                frame = self.draw_without_id(frame, results)

                fps = self._fps(time() - start_time)
                cv2.putText(frame, f'FPS: {fps}', (20, 70), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 255, 0), 2)

                cv2.imshow('YOLOv8 Detection', frame)
                if cv2.waitKey(1) & 0xFF == ord("q"):
                    break
        finally:
            cap.release()
            cv2.destroyAllWindows()


# Build the person-tracking detector for capture index 1 and start its
# capture/track/display loop; the loop exits when "q" is pressed.
detector = Detector(capture_index=1)
detector()




Deep SORT tracking (YOLOv8 detections + Tracker from deepsort_tracker.py)

In [None]:
import os
import random
import cv2
import numpy as np
from time import time  
from ultralytics import YOLO
from deepsort_tracker import Tracker

# Deep SORT demo: detect objects with YOLOv8 on every frame, pass the confident
# detections to the Tracker wrapper and draw each track with a per-id colour.
model = YOLO("yolov8m.pt")
model.fuse()
# Tracker object imported from the deepsort_tracker.py file
tracker = Tracker()

# Random colours for the bounding boxes; a track uses colors[track_id % len(colors)]
colors = [(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) for _ in range(10)]

# Detections with a confidence at or below this value are ignored
detection_threshold = 0.3

cap = cv2.VideoCapture(1)
if not cap.isOpened():
    cap.release()
    raise RuntimeError("Could not open video source 1")
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

try:
    while True:
        start_time = time()

        ret, frame = cap.read()
        if not ret:
            break  # Exit the loop if there's no frame

        results = model(frame, verbose=False)[0]

        # Collect every confident detection of this frame first.
        detections = []
        for x1, y1, x2, y2, score, _class_id in results.boxes.data.tolist():
            if score > detection_threshold:
                # Keep the real confidence: int(score) would truncate it to 0.
                # NOTE(review): assumes Tracker.update reads rows as
                # [x1, y1, x2, y2, score] -- confirm against deepsort_tracker.py.
                detections.append([int(x1), int(y1), int(x2), int(y2), score])

        # Update the tracker exactly once per frame with the complete list
        # (previously this ran inside the loop above, once per detection, and
        # was skipped entirely on frames without detections).
        tracker.update(frame, detections)

        # Draw the tracked objects
        for track in tracker.tracks:
            bbox = track.bbox
            x1, y1, x2, y2 = bbox
            track_id = track.track_id

            cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), colors[track_id % len(colors)], 3)
            cv2.putText(frame, "ID: " + str(track_id), (int(bbox[0]), int(bbox[1] - 10)), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

        # Guard against a zero elapsed time, which would make the FPS infinite.
        elapsed = time() - start_time
        fps = 1 / elapsed if elapsed > 0 else 0

        cv2.putText(frame, f'FPS: {int(fps)}', (20, 70), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 255, 0), 2)

        cv2.imshow("Tracking", frame)

        # Press 'q' to exit the loop
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the camera and close the window even if a frame failed.
    cap.release()
    cv2.destroyAllWindows()