In [1]:
from collections import defaultdict, Counter
import cv2
import numpy as np
from ultralytics import YOLO
import pyttsx3
import os
language='en'
# Initialize YOLOv8 model
model = YOLO('yolov8m_saved_model/yolov8m_float32.tflite')
classNames = ["person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat",
                  "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
                  "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
                  "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
                  "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
                  "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli",
                  "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa", "pottedplant", "bed",
                  "diningtable", "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard", "cell phone",
                  "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
                  "teddy bear", "hair drier", "toothbrush"
                  ]
engine = pyttsx3.init()



In [2]:
def audio(path):
    
    language = 'en'
    results = model(path)
    x = "There "  # Initialize x
    for result in results:
        boxes =result.boxes.cls
        boxes_int = [int(x) for x in boxes]
        counter = Counter(boxes_int)

        print(counter)  # Check what boxes contains
        for i, (box, count) in enumerate(counter.items()):
            if i != len(counter) - 1 and i != 0:
                if count == 1:
                    x += f"a {classNames[int(box)]} , "
                else:
                    x += f"{count} {classNames[int(box)]}s , "
            elif i == 0:
                if count == 1:
                    x += f"is a {classNames[int(box)]} , "
                else:
                    x += f"are {count} {classNames[int(box)]}s , "
            else:
                if count == 1:
                    x += f"and a {classNames[int(box)]} "
                else:
                    x += f"and {count} {classNames[int(box)]}s "
    
    x+= "in front of you."
    print(x)
    engine.say(x)
    engine.runAndWait()

    
    

In [3]:
def detect_objects_and_speak(frame):
    # Convert frame to grayscale
    frame_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # Run YOLOv8 tracking
    results = model.track(frame, persist=True)
    boxes = results[0].boxes.xywh.cpu()
    #probs = results.probs.cpu()
    
    if len(boxes) == 0:
        print("No moving objects detected.")
    else:
        objects_with_direction = []  # List to store objects along with their direction
        objects=[]
        for j,box in enumerate(boxes):
            # Extract box coordinates
            x, y, w, h = box
            # Calculate box center
            center_x =[]
            center_x.append(y)
            for xx in center_x:
            
                print(xx)
                # Determine direction based on box center
                if xx <= 120:
                    direction = "left"
                elif xx >= 240:
                    direction = "right"
                else:
                    direction = "middle"
                print(direction)
                object_name = classNames[int(results[0].boxes.cls[j])]
                print(object_name)
                objects_with_direction.append(f" {direction}")
                objects.append(f" {object_name}")
                text_to_speak = f"The {objects} is in your {objects_with_direction}" 
        
        
                engine.say(text_to_speak)
                engine.runAndWait()

In [4]:
def detect_distance(frame):
        # Convert frame to grayscale
        frame_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # Run YOLOv8 tracking
        results = model.track(frame, persist=True)
        boxes = results[0].boxes.xywh.cpu()
        #probs = results.probs.cpu()

        if len(boxes) == 0:
                print("No moving objects detected.")
        else:
            objects_with_direction = []  # List to store objects along with their direction
            objects=[]
            for j,box in enumerate(boxes):
                        # Extract box coordinates
                        x, y, w, h = box
                        # Calculate box center
                        print(box)

                        focal_length = 1.479685039370079  # Focal length of the camera
                        object_height = 0.2  # Height of the object in meters
                        image_height = 1080  # Image height in pixels
                        object_pixel_height=w
                        distance = (focal_length * object_height * image_height) / (object_pixel_height )
                        print( f'Distance: {distance:.2f} meters')
                        dist=f"The Object is in {distance:.2f} meters"
                        engine.say(dist)
                        engine.runAndWait()
                        

In [None]:
import cv2
from gpiozero import Button
button_pin = 2  # Example pin for the button
button = Button(button_pin)
def function1(img):
    cv2.imwrite("temp_img.jpg", img)  # Save a temporary image
    audio("temp_img.jpg")
def function2(frame):
    detect_distance(frame)
def function3(frame):
    detect_objects_and_speak(frame)
functions = [function1, function2, function3]
current_function_index = 0
# Main function to capture frames and handle events
def detect():
    # Replace "http://192.168.29.111:8080" with the URL of your IP webcam stream
    cap = cv2.VideoCapture(0)

    # Set the resolution (optional)
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1080)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 1920)
    
    # Check if the video stream opened successfully
    if not cap.isOpened():
        print("Error: Couldn't open video stream")
        return
    current_function_index = 0
    # Main loop
    while True:
        # Capture frame-by-frame
        ret, frame = cap.read()

        # If frame is read correctly ret is True
        if not ret:
            print("Error: Couldn't read frame")
            break
        frame=cv2.rotate(frame,cv2.ROTATE_90_CLOCKWISE)

        # Display the resulting frame
        cv2.imshow('Webcam', frame)
        
        if button.is_pressed:
            # Call the current function and increment the index
            functions[current_function_index](frame)
            current_function_index = (current_function_index + 1) % len(functions)

        # Check for key events
        key = cv2.waitKey(1)

        # Perform actions based on key pressed
        
        if key == ord('q'):
            break  # Break the loop if 'q' is pressed

    # Release the video stream and close OpenCV windows
    cap.release()
    cv2.destroyAllWindows()

# Call the detect function
detect()


qt.qpa.plugin: Could not find the Qt platform plugin "wayland" in "/home/user/myenv/lib/python3.11/site-packages/cv2/qt/plugins"


Loading yolov8m_saved_model/yolov8m_float32.tflite for TensorFlow Lite inference...



INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


image 1/1 /home/user/object_detection_and_audio_output_using_yolo_v8/temp_img.jpg: 640x640 2 persons, 1 bench, 2716.4ms
Speed: 68.2ms preprocess, 2716.4ms inference, 2275.8ms postprocess per image at shape (1, 3, 640, 640)
Counter({0: 2, 13: 1})
There are 2 persons , and a bench in front of you.

0: 640x640 2 persons, 2747.2ms
Speed: 4.8ms preprocess, 2747.2ms inference, 2.5ms postprocess per image at shape (1, 3, 640, 640)
tensor([264.1624, 383.1411,  97.1039, 301.9626])
Distance: 3.29 meters
