In [131]:
from collections import defaultdict, Counter
import cv2
import numpy as np
from ultralytics import YOLO
import pyttsx3

# Initialize YOLOv8 model
model = YOLO('yolov8m_saved_model/yolov8m_float32.tflite')
classNames = ["person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat",
                  "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
                  "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
                  "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
                  "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
                  "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli",
                  "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa", "pottedplant", "bed",
                  "diningtable", "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard", "cell phone",
                  "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
                  "teddy bear", "hair drier", "toothbrush"
                  ]
# Initialize pyttsx3 engine
engine = pyttsx3.init()



In [132]:
def audio(path):
    
    language = 'en'
    results = model(path)
    x = "There "  # Initialize x
    for result in results:
        boxes =result.boxes.cls
        boxes_int = [int(x) for x in boxes]
        counter = Counter(boxes_int)

        print(counter)  # Check what boxes contains
        for i, (box, count) in enumerate(counter.items()):
            if i != len(counter) - 1 and i != 0:
                if count == 1:
                    x += f"a {classNames[int(box)]} , "
                else:
                    x += f"{count} {classNames[int(box)]}s , "
            elif i == 0:
                if count == 1:
                    x += f"is a {classNames[int(box)]} , "
                else:
                    x += f"are {count} {classNames[int(box)]}s , "
            else:
                if count == 1:
                    x += f"and a {classNames[int(box)]} "
                else:
                    x += f"and {count} {classNames[int(box)]}s "
    
    x+= "in front of you."
    print(x)
    pyttsx3.speak(x)

In [145]:
def detect_objects_and_speak(frame):
    # Convert frame to grayscale
    frame_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # Run YOLOv8 tracking
    results = model.track(frame, persist=True)
    boxes = results[0].boxes.xywh.cpu()
    #probs = results.probs.cpu()
    
    if len(boxes) == 0:
        print("No moving objects detected.")
    else:
        objects_with_direction = []  # List to store objects along with their direction
        objects=[]
        for j,box in enumerate(boxes):
            # Extract box coordinates
            x, y, w, h = box
            # Calculate box center
            center_x =[]
            center_x.append(x)
            for xx in center_x:
            
                print(xx)
                # Determine direction based on box center
                if xx <= 400:
                    direction = "left"
                elif xx >= 900:
                    direction = "right"
                else:
                    direction = "middle"
                print(direction)
                object_name = classNames[int(results[0].boxes.cls[j])]
                print(object_name)
                objects_with_direction.append(f" {direction}")
                objects.append(f" {object_name}")
        text_to_speak = ", ".join(objects_with_direction) 
        text_to_speak += ",".join(objects)
        engine.say(text_to_speak)
        engine.runAndWait()
      

In [146]:
# Main function to capture frames and handle events
def detect():
    cap = cv2.VideoCapture(0)
    cap.set(3, 1920)
    cap.set(4, 1080)
    
    # Main loop
    while True:
        success, frame = cap.read()
        cv2.imshow('Webcam', frame)
        
        key = cv2.waitKey(1)
        
        # Perform object detection and speak the detected objects and their directions when 'u' is pressed
        if key == ord('u'):
            detect_objects_and_speak(frame)  # Call the audio function with the path to the temporary image
        elif key == ord('r'):
            cv2.imwrite("temp_img.jpg", frame)  # Save a temporary image
            audio("temp_img.jpg")  # Call the audio function with the path to the temporary image
        # Break the loop if 'q' is pressed
        elif key == ord('q'):
            break
    
    cap.release()
    cv2.destroyAllWindows()

# Call the detect function
detect()


0: 640x640 1 person, 1139.3ms
Speed: 6.0ms preprocess, 1139.3ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)
tensor(1042.5081)
right
person

0: 640x640 1 person, 1148.1ms
Speed: 3.0ms preprocess, 1148.1ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)
tensor(914.5387)
right
person

0: 640x640 1 person, 1156.6ms
Speed: 6.0ms preprocess, 1156.6ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)
tensor(1182.6370)
right
person

0: 640x640 1 person, 1235.5ms
Speed: 4.0ms preprocess, 1235.5ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 640)
tensor(1028.9409)
right
person

0: 640x640 1 person, 1130.3ms
Speed: 3.0ms preprocess, 1130.3ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)
tensor(1084.4281)
right
person

0: 640x640 2 persons, 2 books, 1 teddy bear, 1121.7ms
Speed: 16.6ms preprocess, 1121.7ms inference, 1.9ms postprocess per image at shape (1, 3, 640, 640)
tensor(619.9271)
middle
person
tensor(141