In [1]:
import cv2
import torch
import numpy as np
import pickle
from ultralytics import YOLO
from torchreid.reid.utils import FeatureExtractor



In [2]:
model = YOLO("yolov8n.pt")
model.info()

YOLOv8n summary: 129 layers, 3,157,200 parameters, 0 gradients, 8.9 GFLOPs


(129, 3157200, 0, 8.8575488)

In [None]:
results = model("media/bus.jpg")
for result in results:
  boxes = result.boxes.xyxy
  confidences = result.boxes.conf
  classes = result.boxes.cls

  for box, conf, cls in zip(boxes, confidences, classes):
        if int(cls) == 0:
            print(f"Found person at: {box} with {conf:.2f} confidence")

# Test


In [5]:
device = "mps" if torch.backends.mps.is_available() else "cpu"
device

'mps'

In [6]:
extractor = FeatureExtractor(model_name="osnet_x1_0", model_path=None, device=device)

Successfully loaded imagenet pretrained weights from "/Users/ooj/.cache/torch/checkpoints/osnet_x1_0_imagenet.pth"
** The following layers are discarded due to unmatched keys or layer size: ['classifier.weight', 'classifier.bias']
Model: osnet_x1_0
- params: 2,193,616
- flops: 978,878,352


In [10]:
with open("test_embeddings.pkl", "rb") as f:
    test_embeddings = pickle.load(f)

with open("test2_embeddings.pkl", "rb") as f:
    test2_embeddings = pickle.load(f)

enrolled = {
    "ooj": test_embeddings,
    "adel": test2_embeddings
}

In [11]:
MATCH_THRESHOLD = 0.7
cap = cv2.VideoCapture(0)

In [14]:
while True:
    ret, frame = cap.read()
    if not ret:
        continue

    
    results = model(frame)
    for result in results:
        boxes = result.boxes.xyxy 
        classes = result.boxes.cls 
        
        for box, cls in zip(boxes, classes):
            if int(cls) != 0:  # only ppl
                continue

            x1, y1, x2, y2 = map(int, box)
            roi = frame[y1:y2, x1:x2]
            
            roi_embedding_tensor = extractor([roi])
            roi_embedding = roi_embedding_tensor.cpu().numpy().flatten()
            norm = np.linalg.norm(roi_embedding)
            roi_embedding = roi_embedding if norm == 0 else roi_embedding / norm
            
            best_score = -1.0
            best_label = "Unknown"
            
            for label, emb_list in enrolled.items():
                for enrolled_emb in emb_list:
                    score = np.dot(roi_embedding, enrolled_emb)
                    if score > best_score:
                        best_score = score
                        best_label = label
            
            if best_score < MATCH_THRESHOLD:
                best_label = "Unknown"
            
            # Draw the bounding box and label on the frame
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0,255,0), 2)
            cv2.putText(frame, f"{best_label} ({best_score:.2f})", 
                        (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0,255,0), 2)
    
    cv2.imshow("Recognition", frame)
    
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break



0: 384x640 1 person, 77.5ms
Speed: 2.9ms preprocess, 77.5ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 47.3ms
Speed: 1.4ms preprocess, 47.3ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 46.3ms
Speed: 1.2ms preprocess, 46.3ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 46.2ms
Speed: 1.2ms preprocess, 46.2ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 57.0ms
Speed: 1.4ms preprocess, 57.0ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 46.9ms
Speed: 1.4ms preprocess, 46.9ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 47.2ms
Speed: 1.3ms preprocess, 47.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 47.7ms
Speed: 1.5ms preprocess, 47.7ms inference, 0.5ms postprocess per image at shape (1, 3, 38

In [None]:
cap.release()
cv2.destroyAllWindows()

# dist


In [3]:
FOCAL_LENGTH_PIXELS = 500
KNOWN_PERSON_HEIGHT = 1.7

In [4]:
cap = cv2.VideoCapture(0)

In [5]:
while True:
    ret, frame = cap.read()
    if not ret:
        continue

    results = model(frame)
    
    for result in results:
        boxes = result.boxes.xyxy
        classes = result.boxes.cls
        for box, cls in zip(boxes, classes):
                x1, y1, x2, y2 = map(int, box)
                bbox_height = y2 - y1
                distance = (FOCAL_LENGTH_PIXELS * KNOWN_PERSON_HEIGHT) / bbox_height
                
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                cv2.putText(frame, f"Dist: {distance:.2f}m", (x1, y1 - 10), 
                            cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
    
    cv2.imshow("Distance Estimation", frame)
    
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break


0: 384x640 1 person, 67.8ms
Speed: 3.0ms preprocess, 67.8ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 49.3ms
Speed: 4.3ms preprocess, 49.3ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 48.8ms
Speed: 1.1ms preprocess, 48.8ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 51.9ms
Speed: 1.1ms preprocess, 51.9ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 47.1ms
Speed: 1.2ms preprocess, 47.1ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 44.3ms
Speed: 1.3ms preprocess, 44.3ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 44.8ms
Speed: 1.0ms preprocess, 44.8ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 45.4ms
Speed: 1.3ms preprocess, 45.4ms inference, 0.9ms postprocess per image at shape (1, 3, 38

In [None]:
cap.release()
cv2.destroyAllWindows()