In [21]:
from collections import defaultdict, Counter
import cv2
import numpy as np
from ultralytics import YOLO
import pyttsx3

# Initialize YOLOv8 model
model = YOLO('yolov8m_saved_model/yolov8m_float32.tflite')
classNames = ["person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat",
                  "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
                  "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
                  "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
                  "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
                  "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli",
                  "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa", "pottedplant", "bed",
                  "diningtable", "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard", "cell phone",
                  "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
                  "teddy bear", "hair drier", "toothbrush"
                  ]
# Initialize pyttsx3 engine
engine = pyttsx3.init()



In [22]:
def audio(path):
    
    language = 'en'
    results = model(path)
    x = "There "  # Initialize x
    for result in results:
        boxes =result.boxes.cls
        boxes_int = [int(x) for x in boxes]
        counter = Counter(boxes_int)

        print(counter)  # Check what boxes contains
        for i, (box, count) in enumerate(counter.items()):
            if i != len(counter) - 1 and i != 0:
                if count == 1:
                    x += f"a {classNames[int(box)]} , "
                else:
                    x += f"{count} {classNames[int(box)]}s , "
            elif i == 0:
                if count == 1:
                    x += f"is a {classNames[int(box)]} , "
                else:
                    x += f"are {count} {classNames[int(box)]}s , "
            else:
                if count == 1:
                    x += f"and a {classNames[int(box)]} "
                else:
                    x += f"and {count} {classNames[int(box)]}s "
    
    x+= "in front of you."
    print(x)
    pyttsx3.speak(x)

In [23]:
# Function to perform object detection and speak the detected objects and their directions
def detect_objects_and_speak(frame):
    # Convert frame to grayscale
    frame_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # Run YOLOv8 tracking
    results = model.track(frame, persist=True)
    boxes = results[0].boxes.xywh.cpu()

    if len(boxes) == 0:
        print("No moving objects detected.")
    else:
               # Calculate the center of the frame
        frame_center_x = frame.shape[1] // 2
        text_to_speak = "There is a"
        
        objects_with_direction = []  # List to store objects along with their direction
        
        for i, obj in enumerate(boxes):
            object_center_x1 = obj[0] + obj[2] / 2
            
            # Calculate the distance between the object's center and the frame center
            distance_from_center = abs(object_center_x1 - frame_center_x)
            
            if distance_from_center > 50:  # Adjust the threshold as needed
                direction = "on the left" if object_center_x1 < frame_center_x else "on the right"
            else:
                direction = "in the middle"
        
            object_name = classNames[int(results[0].boxes.cls[i])]
            
            # Add object along with its direction to the list
            objects_with_direction.append(f" {object_name} {direction}")
        
        # Join the objects with their directions into a single string
        text_to_speak += ", ".join(objects_with_direction[:-1])  # Add objects with direction except the last one
        if len(objects_with_direction) > 1:  # Add 'and' if there are multiple objects
            text_to_speak += ", and " + objects_with_direction[-1]
        else:
            text_to_speak += objects_with_direction[-1]
        
        # Speak out the detected objects and their directions
        engine.say(text_to_speak)
        engine.runAndWait()
        
        # Print the detected objects and their directions
        print(text_to_speak)

In [24]:
# Main function to capture frames and handle events
def detect():
    cap = cv2.VideoCapture(0)
    cap.set(3, 1920)
    cap.set(4, 1080)
    
    # Main loop
    while True:
        success, frame = cap.read()
        cv2.imshow('Webcam', frame)
        
        key = cv2.waitKey(1)
        
        # Perform object detection and speak the detected objects and their directions when 'u' is pressed
        if key == ord('u'):
            detect_objects_and_speak(frame)  # Call the audio function with the path to the temporary image
        elif key == ord('r'):
            cv2.imwrite("temp_img.jpg", frame)  # Save a temporary image
            audio("temp_img.jpg")  # Call the audio function with the path to the temporary image
        # Break the loop if 'q' is pressed
        elif key == ord('q'):
            break
    
    cap.release()
    cv2.destroyAllWindows()

# Call the detect function
detect()

Loading yolov8m_saved_model\yolov8m_float32.tflite for TensorFlow Lite inference...

image 1/1 C:\Users\User\temp_img.jpg: 640x640 1 person, 1 bed, 2 refrigerators, 1080.6ms
Speed: 5.0ms preprocess, 1080.6ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)
Counter({72: 2, 0: 1, 59: 1})
There is a person , 2 refrigerators , and a bed in front of you.

0: 640x640 1 person, 949.6ms
Speed: 4.0ms preprocess, 949.6ms inference, 1.9ms postprocess per image at shape (1, 3, 640, 640)
There is a person on the right

0: 640x640 1 person, 962.1ms
Speed: 3.0ms preprocess, 962.1ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)
There is a person on the right

0: 640x640 1 person, 1 refrigerator, 964.1ms
Speed: 4.0ms preprocess, 964.1ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)
There is a person on the right, and  refrigerator on the left
