In [1]:
import numpy as np
import tensorflow as tf
import cv2
from ultralytics import YOLO
from norfair import Detection, Tracker
import uuid


In [2]:
video_path = "../data/people_walking.mp4" #video path

 #"n" so it is nano model, faster. 
 #It is trained by COCO dataset that includes 80 different labels 
 #I used nano model but you can use small, medium,...
model = YOLO("../yolov8n.pt")

In [3]:
interpreter = tf.lite.Interpreter(model_path="../reid_model.tflite")
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

In [4]:
def get_embedding(image):
    image = cv2.resize(image, (128,256))
    image = image / 255.0
    image = np.expand_dims(image.astype(np.float32), axis=0)
    interpreter.set_tensor((input_details[0]['index']), image)
    interpreter.invoke()
    return interpreter.get_tensor(output_details[0]['index'])[0]

In [5]:
def cosine_similarity(a, b):
    return np.dot(a, b) /(np.linalg.norm(a) * np.linalg.norm(b))

In [6]:
def generate_new_id():
    return str(uuid.uuid4())[:8]

In [7]:
tracker = Tracker(distance_function="euclidean", distance_threshold=25)

In [8]:
cap = cv2.VideoCapture(video_path)

if not cap.isOpened:
    print("Video could not open")
    exit()

unique_ids = set()
reid_db = {} #track_id : embedding


In [9]:
while True:
    ret, frame = cap.read()

    if not ret:
        break

    results = model(frame)[0]
    detections = []
    new_reid_matches = {} # detectionindex : matched_id

    for i, box in enumerate(results.boxes):
        cls_id = int(box.cls[0])

        if model.names[cls_id] == "person":
            conf = float(box.conf[0])

            if conf > 70:
                x1, y1, x2, y2 = map(int, box.xyxy[0])
                cx, cy = int((x1 + x2) / 2), int((y1+ y2) /2)

                person_crop = frame[y1:y2, x1:x2]
                if person_crop.size == 0:
                    continue

                embedding = get_embedding(person_crop)
                matched_id = None

                for known_id, known_embedding in reid_db.items():
                    if cosine_similarity(embedding, known_embedding) > 0.70:
                        matched_id = known_id
                        break

                if matched_id is None:
                    matched_id = generate_new_id()
                    reid_db[matched_id] = embedding
                else: # Burayla sonradan oynayabilirim.
                    reid_db[matched_id] = embedding 
            
                unique_ids.add(matched_id)
                new_reid_matches[i] = matched_id

                detections.append(Detection(points=np.array([[cx, cy]]), scores=np.array([conf])))

                cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 117, 44), 2)
                label = f"{model.names[cls_id]} {conf:.2f}"
                cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 117, 44), 2)

    
    tracked_objects = tracker.update(detections)

    for i, obj in enumerate(tracked_objects):
        if obj.live_points == 0:
            continue
        
        x, y = obj.estimate[0]

        track_id = list(new_reid_matches.values())[i] if i < len(new_reid_matches) else "X"

        cv2.putText(frame, f"ID: {track_id}", (int(x), int(y) - 10), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 117, 44), 2)
        

    (text_width, text_height), _ = cv2.getTextSize(f"Human Count: {len(unique_ids)}", cv2.FONT_HERSHEY_COMPLEX, 1, 2)
    x = frame.shape[1] - text_width - 10
    y = text_height + 10
    cv2.putText(frame, f"Human Count: {len(unique_ids)}", (x, y), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 117, 44), 2)


    cv2.imshow("Test Video", frame)

    if cv2.waitKey(1) & 0xFF == ord("q"):
        break


cap.release()
cv2.destroyAllWindows()
            



0: 384x640 11 persons, 2 backpacks, 1 handbag, 55.8ms
Speed: 1.5ms preprocess, 55.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 2 backpacks, 1 handbag, 45.6ms
Speed: 1.5ms preprocess, 45.6ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 2 backpacks, 1 handbag, 43.0ms
Speed: 1.2ms preprocess, 43.0ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 persons, 1 backpack, 1 handbag, 43.4ms
Speed: 2.0ms preprocess, 43.4ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 1 backpack, 2 handbags, 42.6ms
Speed: 1.3ms preprocess, 42.6ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 2 backpacks, 2 handbags, 44.3ms
Speed: 1.3ms preprocess, 44.3ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 1 backpack, 1 handbag, 44.1ms
Speed: 1.5ms preprocess, 44.1ms in

: 