In [1]:
import numpy as np
import matplotlib.pyplot as plt
import cv2 
import argparse
import sys
from typing import List

import cv2
import pyttsx3
from ultralytics import YOLO

Creating new Ultralytics Settings v0.0.6 file  
View Ultralytics Settings with 'yolo settings' or at 'C:\Users\PartZ\AppData\Roaming\Ultralytics\settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [32]:
def draw_boxes(image_bgr, results, score_threshold=0.5):
    """Draw bounding boxes and labels on a BGR image using YOLO results."""
    out = image_bgr.copy()
    for r in results:
        for box in r.boxes:
            cls_id = int(box.cls[0])
            conf = float(box.conf[0])
            if conf < score_threshold:
                continue
            x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
            cls = r.names[cls_id]
            cv2.rectangle(out, (x1, y1), (x2, y2), (0, 255, 0), 2)
            text = f"{cls}: {conf:.2f}"
            ((text_w, text_h), _) = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 1)
            cv2.rectangle(out, (x1, y1 - text_h - 6), (x1 + text_w, y1), (0, 255, 0), -1)
            cv2.putText(out, text, (x1, y1 - 4), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 0), 1, cv2.LINE_AA)
    return out

def run_image_mode(input_path: str, score: float = 0.5, output_path: str = None, weights: str = "yolov8n.pt"):
    model = YOLO(weights)
    img = cv2.imread(input_path)
    if img is None:
        print(f"Error: could not read {input_path}")
        return
    results = model(img)
    drawn = draw_boxes(img, results, score)
    if output_path:
        cv2.imwrite(output_path, drawn)
        print(f"Wrote annotated image to {output_path}")
    else:
        names = extract_names(results, score)
        tts(names)
        cv2.imshow('detected', drawn)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

def run_webcam_mode(score: float = 0.5, frame_resize: float = 0.6, frame_skip: int = 3, weights: str = "yolov8n.pt"):
    model = YOLO(weights)
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        print("ERROR: Could not open webcam")
        return
    engine = None
    last_spoken = []
    frame_idx = 0
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame_idx += 1
            if frame_idx % frame_skip != 0:
                cv2.imshow('webcam', frame)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
                continue
            if frame_resize != 1.0:
                small = cv2.resize(frame, (0, 0), fx=frame_resize, fy=frame_resize)
            else:
                small = frame
            results = model(small)
            out_small = draw_boxes(small, results, score)
            names = extract_names(results, score)
            if names and names != last_spoken:
                if engine is None:
                    try:
                        engine = init_tts_engine()
                    except Exception as e:
                        print("Could not init TTS engine:", e)
                        engine = None
                tts(names, engine)
                last_spoken = names
            cv2.imshow('webcam', out_small)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    except KeyboardInterrupt:
        print('Interrupted by user')
    finally:
        cap.release()
        cv2.destroyAllWindows()

def main():
    args = parse_args()
    print('Loading YOLOv8 model...')
    model = YOLO(args.weights)
    print('Model loaded.')
    if args.mode == 'image':
        if not args.input:
            print('There should be an image in input !')
            sys.exit(1)
        run_image_mode(args.input, model, args.score, args.output)


def tts(names: List[str], engine=None):
    if not names:
        return
    seen = set()
    unique = []
    for n in names:
        if n not in seen:
            seen.add(n)
            unique.append(n)
    sentence = "I see " + (" and ".join(unique))
    try:
        if engine is None:
            engine = pyttsx3.init()
            engine.setProperty('rate', 150)
            voices = engine.getProperty('voices')
            if len(voices) > 1:
                engine.setProperty('voice', voices[1].id)
        engine.say(sentence)
        engine.runAndWait()
    except Exception as e:
        print("TTS error:", e)
        
def extract_names(results, score_threshold=0.5):
    names = []
    for r in results:
        for box in r.boxes:
            conf = float(box.conf[0])
            if conf < score_threshold:
                continue
            cls_id = int(box.cls[0])
            names.append(r.names[cls_id])
    return names



In [33]:
run_webcam_mode()


0: 480x640 1 person, 89.0ms
Speed: 4.8ms preprocess, 89.0ms inference, 1.6ms postprocess per image at shape (1, 3, 480, 640)
Could not init TTS engine: name 'init_tts_engine' is not defined

0: 480x640 1 person, 71.6ms
Speed: 9.5ms preprocess, 71.6ms inference, 1.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 73.3ms
Speed: 4.6ms preprocess, 73.3ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 75.4ms
Speed: 3.1ms preprocess, 75.4ms inference, 1.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 76.3ms
Speed: 3.0ms preprocess, 76.3ms inference, 1.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 79.0ms
Speed: 4.3ms preprocess, 79.0ms inference, 1.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 72.0ms
Speed: 3.3ms preprocess, 72.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 91.4ms
Speed: 3.2ms preprocess,

In [19]:
tts(["Abolfazl" , "Ansari"])