In [None]:
 import cv2
from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort
import threading
import json
import time
import os

# Load the YOLO model from the "yolo11n.pt" weights and place it on the CPU.
model = YOLO("yolo11n.pt").to('cpu')

# Initialize DeepSORT tracker with stricter parameters.
# NOTE(review): the per-argument notes below record the author's intent;
# confirm the exact semantics against the deep_sort_realtime documentation.
tracker = DeepSort(
    max_age=50,               # Increase track retention
    n_init=5,                 # More confirmations before creating a new ID
    max_cosine_distance=0.2,  # Stricter similarity threshold
    nn_budget=100,            # Size of the appearance feature cache
    nms_max_overlap=0.5       # Reduce overlapping detection issues
)

# RTSP Camera Address, handed to VideoStream when the stream is started.
# NOTE(review): empty here -- presumably a placeholder that must be replaced
# with the camera's URL before running; confirm.
rtsp_url = ""

# Asynchronous Video Stream Class
class VideoStream:
    """Grab frames on a background thread and expose only the newest one.

    The capture is opened in the constructor, one frame is read
    synchronously, and a worker thread then keeps overwriting ``ret`` and
    ``frame`` with the result of each subsequent ``capture.read()`` until
    ``stop()`` is called.

    Attributes:
        capture: The underlying ``cv2.VideoCapture``.
        ret: Success flag returned by the most recent ``capture.read()``.
        frame: Image returned by the most recent ``capture.read()``.
        stopped: Set to True to ask the worker thread to exit.
    """

    def __init__(self, src=0):
        """Open ``src``, read a first frame, and start the reader thread.

        Args:
            src: Anything ``cv2.VideoCapture`` accepts (device index or URL).
        """
        self.capture = cv2.VideoCapture(src)
        self.capture.set(cv2.CAP_PROP_BUFFERSIZE, 1)
        self.ret, self.frame = self.capture.read()
        self.stopped = False
        # Guards the (ret, frame) pair so read() never observes a new flag
        # combined with the previous frame.
        self._lock = threading.Lock()
        # Daemon thread: a forgotten stop() must not keep the process alive.
        self._thread = threading.Thread(target=self.update, daemon=True)
        self._thread.start()

    def update(self):
        """Reader loop; runs until ``stopped`` is set, then frees the capture."""
        try:
            while not self.stopped:
                ret, frame = self.capture.read()
                with self._lock:
                    self.ret, self.frame = ret, frame
        finally:
            # Released here, on the thread that owns the read loop, so the
            # capture is never closed while a read() call is still in flight.
            self.capture.release()

    def read(self):
        """Return the latest ``(ret, frame)`` pair as one consistent snapshot."""
        with self._lock:
            return self.ret, self.frame

    def stop(self):
        """Ask the reader thread to exit and wait briefly for it to finish."""
        self.stopped = True
        worker = self._thread
        # Bounded wait: a blocking network read must not hang shutdown, and
        # a thread cannot join itself.
        if worker.is_alive() and worker is not threading.current_thread():
            worker.join(timeout=2.0)

# Detection log: a JSON file whose top-level value is an array of records.
json_file = "detections_with_ids.json"

# First run only: seed the log with an empty array. An existing file is
# left untouched.
if not os.path.exists(json_file):
    with open(json_file, 'w') as f:
        f.write(json.dumps([]))

# Append detections to JSON
def append_detection_to_json(detection, path=None):
    """Append one detection record to the JSON array stored on disk.

    The whole array is loaded, extended, and written back. The new content
    is written to a sibling ``.tmp`` file and then moved over the log with
    ``os.replace``, so an interruption mid-write cannot leave a truncated
    log behind.

    Args:
        detection: JSON-serialisable record to append.
        path: Log file to update. Defaults to the module-level ``json_file``.

    A missing, empty, or unparsable log is treated as an empty array.
    """
    target = json_file if path is None else path

    try:
        with open(target, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except (FileNotFoundError, json.JSONDecodeError):
        data = []

    data.append(detection)

    # Write the full array next to the log, then swap it in with one rename.
    scratch = f"{target}.tmp"
    with open(scratch, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)
    os.replace(scratch, target)

# Start the Video Stream
stream = VideoStream(rtsp_url)

# Detection filter: class index 0 is logged as 'person'; boxes at or below
# this confidence are ignored.
PERSON_CLASS_ID = 0
MIN_CONFIDENCE = 0.5

try:
    while True:
        ret, frame = stream.read()
        # Stop when the stream reports failure or hands back no image.
        if not ret or frame is None:
            break

        # Resize for faster processing
        frame = cv2.resize(frame, (1280, 720))

        # Perform detection
        results = model(frame)

        # Convert each kept box from (x1, y1, x2, y2) to the
        # ([left, top, width, height], confidence, class) tuples that
        # DeepSort.update_tracks takes.
        detections = [
            ([x1, y1, x2 - x1, y2 - y1], conf, "person")
            for x1, y1, x2, y2, conf, cls in results[0].boxes.data.cpu().numpy()
            if int(cls) == PERSON_CLASS_ID and conf > MIN_CONFIDENCE
        ]

        # Update DeepSORT tracker with detections
        tracks = tracker.update_tracks(detections, frame=frame)

        # One timestamp per frame so every record from this frame agrees.
        timestamp = time.strftime("%Y-%m-%d %H:%M:%S")

        for track in tracks:
            if not track.is_confirmed():
                continue

            track_id = track.track_id
            x1, y1, x2, y2 = map(int, track.to_ltrb())

            # Log the confidence of the detection matched to THIS track.
            # It is None when the track was carried forward without a
            # matching detection in the current frame.
            det_conf = track.get_det_conf()

            # Append detection to JSON
            detection = {
                "timestamp": timestamp,
                "id": track_id,
                "class": "person",
                "confidence": None if det_conf is None else float(det_conf),
                "bbox": [x1, y1, x2, y2]
            }
            append_detection_to_json(detection)

            # Annotate detection on the frame
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(frame, f'ID: {track_id}', (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

        # Display the frame
        cv2.imshow('Real-Time Human Tracking with Consistent IDs', frame)

        # Exit on pressing 'q'
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release resources even if the loop exits through an exception or
    # a keyboard interrupt.
    stream.stop()
    cv2.destroyAllWindows()



0: 480x640 1 person, 1 chair, 565.1ms
Speed: 9.0ms preprocess, 565.1ms inference, 5.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 chair, 624.1ms
Speed: 10.1ms preprocess, 624.1ms inference, 6.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 chair, 460.2ms
Speed: 7.5ms preprocess, 460.2ms inference, 9.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 traffic light, 1 chair, 595.6ms
Speed: 10.1ms preprocess, 595.6ms inference, 11.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 traffic light, 1 chair, 517.2ms
Speed: 10.7ms preprocess, 517.2ms inference, 7.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 traffic light, 1 chair, 513.5ms
Speed: 10.9ms preprocess, 513.5ms inference, 8.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 chair, 539.7ms
Speed: 11.2ms preprocess, 539.7ms inference, 7.3ms postprocess per image at shape (