In [15]:
from collections import defaultdict, Counter
import cv2
import numpy as np
from ultralytics import YOLO
import pyttsx3

# Initialize YOLOv8 model
model = YOLO('yolov8m_saved_model/yolov8m_float32.tflite')
classNames = ["person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat",
                  "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
                  "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
                  "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
                  "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
                  "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli",
                  "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa", "pottedplant", "bed",
                  "diningtable", "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard", "cell phone",
                  "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
                  "teddy bear", "hair drier", "toothbrush"
                  ]




In [2]:
def audio(path):
    
    language = 'en'
    results = model(path)
    x = "There "  # Initialize x
    for result in results:
        boxes =result.boxes.cls
        boxes_int = [int(x) for x in boxes]
        counter = Counter(boxes_int)

        print(counter)  # Check what boxes contains
        for i, (box, count) in enumerate(counter.items()):
            if i != len(counter) - 1 and i != 0:
                if count == 1:
                    x += f"a {classNames[int(box)]} , "
                else:
                    x += f"{count} {classNames[int(box)]}s , "
            elif i == 0:
                if count == 1:
                    x += f"is a {classNames[int(box)]} , "
                else:
                    x += f"are {count} {classNames[int(box)]}s , "
            else:
                if count == 1:
                    x += f"and a {classNames[int(box)]} "
                else:
                    x += f"and {count} {classNames[int(box)]}s "
    
    x+= "in front of you."
    print(x)
    pyttsx3.speak(x)

In [3]:
def detect_objects_and_speak(frame):
    # Convert frame to grayscale
    frame_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # Run YOLOv8 tracking
    results = model.track(frame, persist=True)
    boxes = results[0].boxes.xywh.cpu()
    #probs = results.probs.cpu()
    
    if len(boxes) == 0:
        print("No moving objects detected.")
    else:
        objects_with_direction = []  # List to store objects along with their direction
        objects=[]
        for j,box in enumerate(boxes):
            # Extract box coordinates
            x, y, w, h = box
            # Calculate box center
            center_x =[]
            center_x.append(x)
            for xx in center_x:
            
                print(xx)
                # Determine direction based on box center
                if xx <= 400:
                    direction = "left"
                elif xx >= 900:
                    direction = "right"
                else:
                    direction = "middle"
                print(direction)
                object_name = classNames[int(results[0].boxes.cls[j])]
                print(object_name)
                objects_with_direction.append(f" {direction}")
                objects.append(f" {object_name}")
        text_to_speak = ", ".join(objects_with_direction) 
        text_to_speak += ",".join(objects)
        pyttsx3.speak(text_to_speak)

      

In [24]:
# Main function to capture frames and handle events
def detect():
    # Replace "http://192.168.29.111:8080" with the URL of your IP webcam stream
    cap = cv2.VideoCapture(0)

    # Set the resolution (optional)
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1920)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 1080)
    
    # Check if the video stream opened successfully
    if not cap.isOpened():
        print("Error: Couldn't open video stream")
        return

    # Main loop
    while True:
        # Capture frame-by-frame
        ret, frame = cap.read()

        # If frame is read correctly ret is True
        if not ret:
            print("Error: Couldn't read frame")
            break

        # Display the resulting frame
        cv2.imshow('Webcam', frame)

        # Check for key events
        key = cv2.waitKey(1)

        # Perform actions based on key pressed
        if key == ord('u'):
            detect_objects_and_speak(frame)  # Call the function for object detection and speech synthesis
        elif key == ord('r'):
            cv2.imwrite("temp_img.jpg", frame)  # Save a temporary image
            audio("temp_img.jpg")
        if key==ord('c')
            detect_distance(frame)# Call the audio function with the path to the temporary image
        elif key == ord('q'):
            break  # Break the loop if 'q' is pressed

    # Release the video stream and close OpenCV windows
    cap.release()
    cv2.destroyAllWindows()

# Call the detect function
detect()



0: 640x640 1 person, 956.9ms
Speed: 4.0ms preprocess, 956.9ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)
tensor([1057.69922,  687.37976, 1017.09930,  742.88696])
Distance: 0.43 meters

0: 640x640 1 person, 969.8ms
Speed: 7.0ms preprocess, 969.8ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)
tensor([1275.81384,  679.25800,  956.18604,  715.41833])
Distance: 0.45 meters


In [26]:
def detect_distance(frame):
    # Convert frame to grayscale
    frame_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # Run YOLOv8 tracking
    results = model.track(frame, persist=True)
    boxes = results[0].boxes.xywh.cpu()
    #probs = results.probs.cpu()
    
    if len(boxes) == 0:
        print("No moving objects detected.")
    else:
        objects_with_direction = []  # List to store objects along with their direction
        objects=[]
        for j,box in enumerate(boxes):
            # Extract box coordinates
            x, y, w, h = box
            # Calculate box center
            print(box)
            
            focal_length = 1.479685039370079  # Focal length of the camera
            object_height = 0.2  # Height of the object in meters
            image_height = 1080  # Image height in pixels
            object_pixel_height=h
            distance = (focal_length * object_height * image_height) / (object_pixel_height )
            print( f'Distance: {distance:.2f} meters')

100
