In [1]:
import cv2
import time
import torch
import numpy as np
import pickle
from ultralytics import YOLO
from torchreid.reid.utils import FeatureExtractor



In [2]:
model = YOLO("yolov8n.pt")
model.info()

YOLOv8n summary: 129 layers, 3,157,200 parameters, 0 gradients, 8.9 GFLOPs


(129, 3157200, 0, 8.8575488)

In [3]:
results = model("media/bus.jpg")
for result in results:
  boxes = result.boxes.xyxy
  confidences = result.boxes.conf
  classes = result.boxes.cls

  for box, conf, cls in zip(boxes, confidences, classes):
        if int(cls) == 0:
            print(f"Found person at: {box} with {conf:.2f} confidence")


image 1/1 /Users/ooj/Dev/HKUST/fyp/FinalYearProject/src/cv/media/bus.jpg: 448x640 4 persons, 1 bus, 62.1ms
Speed: 3.0ms preprocess, 62.1ms inference, 6.7ms postprocess per image at shape (1, 3, 448, 640)
Found person at: tensor([133.3593,  72.5784, 198.6200, 181.3386]) with 0.92 confidence
Found person at: tensor([ 34.2597,  82.9228,  91.5337, 181.7508]) with 0.86 confidence
Found person at: tensor([ 74.2149,  71.5516, 119.7641, 181.7909]) with 0.79 confidence
Found person at: tensor([113.3372,  74.2637, 144.6671, 181.3813]) with 0.66 confidence


# Test


In [4]:
device = "mps" if torch.backends.mps.is_available() else "cpu"
device

'mps'

In [5]:
extractor = FeatureExtractor(model_name="osnet_x1_0", model_path=None, device=device)

Successfully loaded imagenet pretrained weights from "/Users/ooj/.cache/torch/checkpoints/osnet_x1_0_imagenet.pth"
** The following layers are discarded due to unmatched keys or layer size: ['classifier.weight', 'classifier.bias']
Model: osnet_x1_0
- params: 2,193,616
- flops: 978,878,352


In [6]:
with open("data/test_embeddings.pkl", "rb") as f:
    test_embeddings = pickle.load(f)

with open("data/test2_embeddings.pkl", "rb") as f:
    test2_embeddings = pickle.load(f)

with open("data/test3_embeddings.pkl", "rb") as f:
    test3_embeddings = pickle.load(f)

enrolled = {
    "ooj": test_embeddings,
    "adel": test2_embeddings,
    "ooj2": test3_embeddings,
}

In [7]:
MATCH_THRESHOLD = 0.7
cap = cv2.VideoCapture(0)

In [8]:
while True:
    ret, frame = cap.read()

    if not ret:
        time.sleep(0.01)
        continue
    
    results = model(frame)
    for result in results:
        boxes = result.boxes.xyxy 
        classes = result.boxes.cls 
        
        for box, cls in zip(boxes, classes):
            if int(cls) != 0:  # only ppl
                continue

            x1, y1, x2, y2 = map(int, box)
            roi = frame[y1:y2, x1:x2]
            
            roi_embedding_tensor = extractor([roi])
            roi_embedding = roi_embedding_tensor.cpu().numpy().flatten()
            norm = np.linalg.norm(roi_embedding)
            roi_embedding = roi_embedding if norm == 0 else roi_embedding / norm
            
            best_score = -1.0
            best_label = "Unknown"
            
            for label, embedding in enrolled.items():
                for enrolled_emb in embedding:
                    score = np.dot(roi_embedding, enrolled_emb)
                    if score > best_score:
                        best_score = score
                        best_label = label
            
            if best_score < MATCH_THRESHOLD:
                best_label = "Unknown"
            
            # Draw the bounding box and label on the frame
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0,255,0), 2)
            cv2.putText(frame, f"{best_label} ({best_score:.2f})", 
                        (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0,255,0), 2)
    
    cv2.imshow("Recognition", frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break


0: 384x640 4 persons, 1 chair, 54.9ms
Speed: 1.4ms preprocess, 54.9ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 2 chairs, 54.2ms
Speed: 1.9ms preprocess, 54.2ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 2 chairs, 73.1ms
Speed: 1.6ms preprocess, 73.1ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 2 chairs, 56.1ms
Speed: 1.4ms preprocess, 56.1ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 2 chairs, 42.8ms
Speed: 1.1ms preprocess, 42.8ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 2 chairs, 41.0ms
Speed: 1.1ms preprocess, 41.0ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 2 chairs, 49.5ms
Speed: 1.3ms preprocess, 49.5ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 2 chairs, 44.3ms
Sp

In [9]:
cap.release()
cv2.destroyAllWindows()

# dist


In [10]:
FOCAL_LENGTH_PIXELS = 500
KNOWN_PERSON_HEIGHT = 1.7

In [11]:
cap = cv2.VideoCapture(0)

In [12]:
while True:
    ret, frame = cap.read()
    if not ret:
        continue

    results = model(frame)
    
    for result in results:
        boxes = result.boxes.xyxy
        classes = result.boxes.cls
        for box, cls in zip(boxes, classes):
                x1, y1, x2, y2 = map(int, box)
                bbox_height = y2 - y1
                distance = (FOCAL_LENGTH_PIXELS * KNOWN_PERSON_HEIGHT) / bbox_height
                
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                cv2.putText(frame, f"Dist: {distance:.2f}m", (x1, y1 - 10), 
                            cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
    
    cv2.imshow("Distance Estimation", frame)
    
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break


0: 384x640 2 persons, 53.6ms
Speed: 1.6ms preprocess, 53.6ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 1 chair, 58.4ms
Speed: 1.5ms preprocess, 58.4ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 1 chair, 54.8ms
Speed: 1.7ms preprocess, 54.8ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 1 chair, 1 refrigerator, 60.1ms
Speed: 1.6ms preprocess, 60.1ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 1 chair, 1 refrigerator, 51.2ms
Speed: 1.3ms preprocess, 51.2ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 chair, 1 refrigerator, 61.7ms
Speed: 1.5ms preprocess, 61.7ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 1 chair, 1 refrigerator, 66.7ms
Speed: 1.3ms preprocess, 66.7ms inference, 1.8ms postprocess per image at shape (1, 3, 384,

In [13]:
cap.release()
cv2.destroyAllWindows()