In [1]:
import cv2
import numpy as np
import torch
import torchvision
from ultralytics import YOLO


In [2]:
import math
from sort import *  #It is used to track the objects

In [3]:
# Load the COCO class names from 'coco-labels.txt', one label per line.
# The list is indexed by the integer class id returned by the detector, so the
# order of the lines in the file must match the model's class order.
class_names = []
with open('coco-labels.txt', 'r') as f:
    # Read at most the first 80 lines. zip() stops at end-of-file, so a file
    # with fewer lines does not pad the list with empty strings.
    for _, line in zip(range(80), f):
        # Strip only the line terminator: a blind [:-1] would cut the last
        # character of a final line that has no trailing newline and would
        # leave a stray '\r' on files with Windows line endings.
        class_names.append(line.rstrip('\r\n'))

In [4]:
# Sanity check: show the first five class labels.
class_names[0:5]

['person', 'bicycle', 'car', 'motorcycle', 'airplane']

In [6]:
# Counting line: objects are meant to be counted once they cross it. The values
# depend on the video / camera view and must be tuned per source.
# NOTE(review): presumably [x1, y1, x2, y2] in pixel coordinates — confirm. The
# people-counter cell assigns its own `line_limits`, replacing this value.
line_limits = [400, 297, 673, 297]


##### Making a tracker

`max_age` sets the tracker's patience: how long (in frames) it waits for a lost object to reappear before its track is dropped.

In [5]:
# SORT tracker shared by the counting cell below.
# NOTE(review): parameter meanings for min_hits / iou_threshold are taken from
# the SORT reference implementation — confirm against the local sort.py.
tracker1 = Sort(
    max_age=20,         # patience before a lost track is dropped
    min_hits=3,         # presumably detections needed before a track is reported
    iou_threshold=0.3,  # presumably minimum overlap to match a detection to a track
)

#### People Counter

In [46]:
# People counter.
#
# For every frame of the video: detect objects with YOLOv8n, keep the 'person'
# detections above the confidence threshold, pass them to the SORT tracker, and
# display the number of distinct track IDs seen so far. Press 'q' in the
# display window to stop early; the loop also stops when the video ends.
#
# NOTE(review): `tracker1` is created in an earlier cell and is reused here, so
# it presumably carries its track state over if this cell is re-run. Re-create
# the tracker first when a fresh count is wanted.

# Using video file
cap = cv2.VideoCapture('People.mp4')
if not cap.isOpened():
    raise RuntimeError("Could not open video file 'People.mp4'")

# Load the weights once, outside the frame loop, so the checkpoint is not
# re-read from disk for every frame.
model = YOLO('../Yolo-weights/yolov8n.pt')

# NOTE(review): `total_count` and `line_limits` are not read anywhere in this
# cell; they are kept in case other cells rely on them.
total_count = []
id_set = set()  # distinct tracker IDs observed across all frames
line_limits = [0, 500, 1280, 500]

try:
    while True:
        success, img = cap.read()
        if not success:
            # End of the video (or a read error): no frame to process.
            break

        img = cv2.resize(img, (1280, 720))
        results = model(img, stream=True)

        # One row per accepted detection: [x1, y1, x2, y2, confidence].
        person_rows = []

        # Bounding Box
        for r in results:
            for box in r.boxes:
                x1, y1, x2, y2 = map(int, box.xyxy[0])

                # Confidence, rounded up to two decimals
                conf = math.ceil(box.conf[0]*100)/100

                # Class Name
                cls = int(box.cls[0])
                current_c = class_names[cls]

                if current_c == 'person' and conf > 0.3:
                    cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 255), 3)
                    person_rows.append([x1, y1, x2, y2, conf])
                    cv2.putText(
                        img,
                        f'Label: {current_c}, Confidence: {conf}',
                        (max(0, x1), max(35, y1-10)),   # Making it 35 so that label won't go outside the window
                        fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                        fontScale=0.6,
                        color=(255, 0, 255),
                        thickness=2
                    )

        # Build the (N, 5) array in one step; the reshape keeps the (0, 5)
        # shape when no person was detected in this frame.
        detections = np.array(person_rows, dtype=float).reshape(-1, 5)
        resultsTracker = tracker1.update(detections)

        # Each tracker row unpacks to a box plus the track ID.
        for result_tr in resultsTracker:
            x1, y1, x2, y2, track_id = map(int, result_tr)
            cv2.rectangle(img, (x1, y1), (x2, y2), (125, 125, 0), 3)
            cv2.putText(img, f'Currently tracking Object ID: {track_id}',
                        (max(0, x1+5), max(20, y1+25)), fontFace=cv2.FONT_ITALIC,
                        fontScale=0.6, color=(255, 255, 255), thickness=1)
            id_set.add(track_id)

        # The count is the number of distinct track IDs seen so far.
        counter = len(id_set)
        cv2.putText(img, f'Count: {counter}', (50, 650), fontFace=cv2.FONT_HERSHEY_COMPLEX,
                    fontScale=1, color=(0, 0, 125), thickness=2)

        cv2.imshow("Image", img)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the capture handle and close the window, even on an error
    # or a kernel interrupt.
    cap.release()
    cv2.destroyAllWindows()



0: 384x640 9 persons, 1 train, 1 truck, 8 handbags, 278.8ms
Speed: 4.0ms preprocess, 278.8ms inference, 6.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 1 train, 7 handbags, 268.6ms
Speed: 4.1ms preprocess, 268.6ms inference, 6.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 1 train, 8 handbags, 243.9ms
Speed: 8.3ms preprocess, 243.9ms inference, 9.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 1 train, 9 handbags, 269.6ms
Speed: 8.9ms preprocess, 269.6ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 1 train, 8 handbags, 1 tv, 106.2ms
Speed: 5.5ms preprocess, 106.2ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 10 handbags, 1 tv, 124.2ms
Speed: 7.5ms preprocess, 124.2ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 9 persons, 8 handbags, 1 tv, 110.5ms
Speed: 2.0ms preprocess, 110.5ms infe

##### Observations:

* Here we are using YOLOv8n to count people in the camera's field of view.
* Although it works well overall, the count is not fully reliable under occlusion and overcrowding.
* One solution is to position the camera at a good vantage point, chosen according to the objective.