In [2]:
import numpy as np
import matplotlib.pyplot as plt
import cv2 
import argparse
import sys
from typing import List

import cv2
import pyttsx3
from ultralytics import YOLO

In [38]:
# Camera focal length in pixels; the value matches the "Estimated focal length"
# printed by the calibration cell at the end of this notebook.
FOCAL_LENGTH = 707.94

# Real-world object widths keyed by detector class label. extract_names()
# reports the resulting distances as "meters", so these are presumably meters.
KNOWN_WIDTHS = dict(
    person=0.5,
    cup=0.08,
    chair=0.45,
    laptop=0.30,
    bottle=0.07,
)

def draw_boxes(image_bgr, results, score_threshold=0.5):
    out = image_bgr.copy()
    for r in results:
        for box in r.boxes:
            cls_id = int(box.cls[0])
            conf = float(box.conf[0])
            if conf < score_threshold:
                continue
            x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
            cls = r.names[cls_id]
            cv2.rectangle(out, (x1, y1), (x2, y2), (0, 255, 0), 2)
            text = f"{cls}: {conf:.2f}"
            ((text_w, text_h), _) = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 1)
            cv2.rectangle(out, (x1, y1 - text_h - 6), (x1 + text_w, y1), (0, 255, 0), -1)
            cv2.putText(out, text, (x1, y1 - 4), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 0), 1, cv2.LINE_AA)
    return out

def run_image_mode(input_path: str, score: float = 0.5, output_path: str = None, weights: str = "YOLOv10x.pt"):
    """Run detection on a single image and either save or display the result.

    Args:
        input_path: Path of the image to read.
        score: Minimum confidence for a detection to be drawn / announced.
        output_path: When given, the annotated image is written here and
            nothing is displayed or spoken. When omitted, the detections are
            spoken and the annotated image is shown until a key is pressed.
        weights: Weights file handed to ``YOLO``.

    Returns:
        None. Errors reading or writing the image are reported with ``print``.
    """
    # Validate the input before loading the model so a bad path fails fast.
    img = cv2.imread(input_path)
    if img is None:
        print(f"Error: could not read {input_path}")
        return

    model = YOLO(weights)
    results = model(img)
    drawn = draw_boxes(img, results, score)

    if output_path:
        # cv2.imwrite signals failure through its boolean return value, so
        # only claim success when it actually reports it.
        if cv2.imwrite(output_path, drawn):
            print(f"Wrote annotated image to {output_path}")
        else:
            print(f"Error: could not write {output_path}")
        return

    names = extract_names(results, score)
    tts(names)
    try:
        cv2.imshow('detected', drawn)
        cv2.waitKey(0)
    finally:
        # Close the window even if the wait is interrupted.
        cv2.destroyAllWindows()

def run_webcam_mode(score: float = 0.5, frame_resize: float = 0.6, frame_skip: int = 3, weights: str = "YOLOv10x.pt"):
    """Run live detection on webcam 0 until 'q' is pressed or the stream ends.

    Args:
        score: Minimum confidence for a detection to be drawn / announced.
        frame_resize: Scale factor applied to each processed frame before
            inference; ``1.0`` processes the frame unchanged.
        frame_skip: Run inference on every ``frame_skip``-th frame; the
            frames in between are displayed without annotation. Values
            below 1 are treated as 1.
        weights: Weights file handed to ``YOLO``.

    Returns:
        None.
    """
    # A non-positive frame_skip would make the modulo below raise
    # (0) or never/always match (negative), so clamp it to "every frame".
    frame_skip = max(1, int(frame_skip))

    model = YOLO(weights)
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        print("ERROR: Could not open webcam")
        cap.release()
        return

    # No shared engine is created here, so tts() builds and configures its
    # own engine for each announcement.
    engine = None
    last_spoken = []
    frame_idx = 0
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame_idx += 1

            if frame_idx % frame_skip != 0:
                # Skipped frame: keep the preview live without running inference.
                cv2.imshow('webcam', frame)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
                continue

            if frame_resize != 1.0:
                small = cv2.resize(frame, (0, 0), fx=frame_resize, fy=frame_resize)
            else:
                small = frame

            results = model(small)
            out_small = draw_boxes(small, results, score)
            names = extract_names(results, score)
            # Only speak when the announcement differs from the previous one.
            if names and names != last_spoken:
                tts(names, engine)
                last_spoken = names

            cv2.imshow('webcam', out_small)
            # A single short wait both refreshes the window and polls for 'q'.
            # (A blocking waitKey(0) here would freeze the stream after every
            # processed frame and swallow the 'q' keypress.)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    except KeyboardInterrupt:
        print('Interrupted by user')
    finally:
        cap.release()
        cv2.destroyAllWindows()
        
def tts(names: List[str], engine=None):
    """Speak the given names as a single "I see ..." sentence.

    Duplicate names are announced once, in order of first appearance.

    Args:
        names: Names to announce; nothing happens when this is empty.
        engine: Optional speech engine exposing ``say`` and ``runAndWait``.
            When omitted, a pyttsx3 engine is created with rate 150 and,
            if more than one voice is installed, the second voice.

    Returns:
        None. Any error raised while creating or using the engine is
        printed rather than propagated.
    """
    if not names:
        return

    # dict keys keep insertion order, giving an order-preserving de-dup.
    ordered_unique = dict.fromkeys(names)
    sentence = "I see " + " and ".join(ordered_unique)

    try:
        speaker = engine
        if speaker is None:
            speaker = pyttsx3.init()
            speaker.setProperty('rate', 150)
            available = speaker.getProperty('voices')
            if len(available) > 1:
                speaker.setProperty('voice', available[1].id)
        speaker.say(sentence)
        speaker.runAndWait()
    except Exception as err:
        print("TTS error:", err)
        
def extract_names(results, score_threshold=0.5, known_widths=None, focal_length=None):
    """Describe each confident detection, with an estimated distance when possible.

    The distance is ``real_width * focal_length / box_width_in_pixels`` and is
    only computed for labels that have a known real-world width and a box of
    positive pixel width. Other detections are described by label alone.
    Every description is also printed.

    Args:
        results: Iterable of detection results. Each result must expose
            ``boxes`` (items with ``cls``, ``conf`` and ``xyxy``) and a
            ``names`` lookup from class id to label.
        score_threshold: Detections whose confidence is below this value
            are skipped.
        known_widths: Optional mapping of label to real-world width.
            Defaults to the module-level ``KNOWN_WIDTHS``.
        focal_length: Optional focal length in pixels. Defaults to the
            module-level ``FOCAL_LENGTH``.

    Returns:
        List of description strings, one per kept detection, in detection
        order: ``"<label> at <d.dd> meters"`` or just ``"<label>"``.
    """
    widths = KNOWN_WIDTHS if known_widths is None else known_widths
    focal = FOCAL_LENGTH if focal_length is None else focal_length

    names = []
    for r in results:
        for box in r.boxes:
            # Filter first so low-confidence boxes cost nothing further.
            if float(box.conf[0]) < score_threshold:
                continue

            label = r.names[int(box.cls[0])]
            x1, _, x2, _ = map(int, box.xyxy[0])
            pixel_width = x2 - x1

            description = label
            # A degenerate (zero-width) box would divide by zero, and an
            # unknown label has no reference width, so both fall back to
            # the bare label instead of a dangling "<label> at ".
            if label in widths and pixel_width > 0:
                distance = (widths[label] * focal) / pixel_width
                description = f"{label} at {distance:.2f} meters"

            names.append(description)
            print(description)
    return names

In [39]:
# Run inference on every frame (frame_skip=1) without downscaling (frame_resize=1).
run_webcam_mode(frame_resize=1, frame_skip=1)


0: 480x640 1 person, 775.6ms
Speed: 3.1ms preprocess, 775.6ms inference, 0.4ms postprocess per image at shape (1, 3, 480, 640)
person at 0.69 meters

0: 480x640 1 person, 758.2ms
Speed: 1.8ms preprocess, 758.2ms inference, 0.4ms postprocess per image at shape (1, 3, 480, 640)
person at 1.10 meters
Interrupted by user


In [35]:
# No output_path is given, so the detections are spoken and the annotated image is shown.
run_image_mode(input_path="./images/train/images/00d343d4a829121c.jpg")


0: 448x640 1 person, 5 chairs, 1 dining table, 805.1ms
Speed: 4.1ms preprocess, 805.1ms inference, 0.4ms postprocess per image at shape (1, 3, 448, 640)


In [32]:
import cv2

# Real world width of your calibration object (meters)
KNOWN_WIDTH = 0.0856  # credit card width

# Distance from camera to object during calibration (meters)
KNOWN_DISTANCE = 0.3

# Focal-length calibration: show the live feed, freeze the frame when 'q' is
# pressed, let the user draw a box around the object, then derive the focal
# length from focal_length = pixel_width * KNOWN_DISTANCE / KNOWN_WIDTH.
cap = cv2.VideoCapture(0)

try:
    if not cap.isOpened():
        # Without this check a missing camera ends the cell silently.
        print("ERROR: Could not open webcam")
    else:
        print("Press 'q' when done selecting ROI...")

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            cv2.imshow("Calibration - place object at known distance", frame)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                # Select ROI (draw bounding box around object)
                roi = cv2.selectROI("Calibration", frame, False, False)
                x, y, w, h = roi
                pixel_width = w
                if pixel_width <= 0:
                    # A cancelled or empty selection has zero width; reporting
                    # a 0.00 focal length would be misleading.
                    print("No ROI selected; focal length not estimated")
                else:
                    focal_length = (pixel_width * KNOWN_DISTANCE) / KNOWN_WIDTH
                    print(f"Estimated focal length: {focal_length:.2f} pixels")
                break
finally:
    # Always free the camera and close windows, even on an interrupt.
    cap.release()
    cv2.destroyAllWindows()

Press 'q' when done selecting ROI...
Estimated focal length: 707.94 pixels
