In [3]:
import cv2
import numpy as np
import pyttsx3
import speech_recognition as sr
import threading

# Constants
REAL_WORLD_OBJECT_HEIGHT = 1.7  # Average height of a person in meters
FOCAL_LENGTH = 800  # Example focal length in pixels; calibrate your camera for accurate results

# Load YOLO model
net = cv2.dnn.readNet("yolov3.weights", "yolov3.cfg")
with open("coco.names", "r") as f:
    classes = [line.strip() for line in f.readlines()]

layer_names = net.getLayerNames()
output_layers = [layer_names[i - 1] for i in net.getUnconnectedOutLayers()]

# Initialize text-to-speech engine
engine = pyttsx3.init()

# Initialize speech recognition
recognizer = sr.Recognizer()

def detect_objects(frame):
    height, width, _ = frame.shape

    # Preprocess the frame for YOLO
    blob = cv2.dnn.blobFromImage(frame, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
    net.setInput(blob)
    outs = net.forward(output_layers)

    detected_objects = []

    for out in outs:
        for detection in out:
            for obj in detection:
                obj = np.array(obj)
                
                # Ensure obj has enough elements
                if obj.size >= 6:
                    # Extract bounding box coordinates and class scores
                    center_x, center_y, w, h = obj[0:4] * [width, height, width, height]
                    scores = obj[5:]
                    class_id = np.argmax(scores)
                    confidence = scores[class_id]

                    if confidence > 0.5:
                        x = int(center_x - w / 2)
                        y = int(center_y - h / 2)

                        # Calculate distance
                        object_size = h  # or use w if more appropriate
                        distance = (REAL_WORLD_OBJECT_HEIGHT * FOCAL_LENGTH) / object_size

                        label = str(classes[class_id])
                        detected_objects.append((label, distance, x, y, w, h))
    
    return detected_objects

def speak_detected_objects(objects, frame):
    if not objects:
        engine.say("No new objects detected.")
    else:
        for obj in objects:
            label, distance, x, y, w, h = obj
            engine.say(f"I see a {label} at approximately {distance:.2f} meters away.")
            # Draw bounding boxes on the frame
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
            cv2.putText(frame, f"{label} {distance:.2f}m", (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    engine.runAndWait()

def voice_command():
    while True:
        with sr.Microphone() as source:
            print("Listening for commands...")
            recognizer.adjust_for_ambient_noise(source)
            audio = recognizer.listen(source)

            try:
                command = recognizer.recognize_google(audio).lower()
                print("Command:", command)
                if command == "stop":
                    return
            except sr.UnknownValueError:
                pass  # Ignore unrecognized commands
            except sr.RequestError as e:
                print(f"Google Speech Recognition request failed: {e}")

# Initialize video capture
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    print("Error: Unable to open video capture.")
    exit()

detected_objects = []

# Start voice command thread
command_thread = threading.Thread(target=voice_command, daemon=True)
command_thread.start()

# Start object detection
while True:
    ret, frame = cap.read()
    if not ret:
        print("Error: Unable to capture frame.")
        break

    # Perform object detection
    new_objects = detect_objects(frame)

    # Speak out detected objects with distance
    speak_detected_objects(new_objects, frame)

    # Display the frame
    cv2.imshow("Object Detection", frame)

    # Break the loop if the command thread has requested to stop
    if not command_thread.is_alive():
        break

# Release the video capture and close the text-to-speech engine
cap.release()
cv2.destroyAllWindows()
engine.stop()


error: OpenCV(4.10.0) D:\a\opencv-python\opencv-python\opencv\modules\dnn\src\darknet\darknet_importer.cpp:210: error: (-212:Parsing error) Failed to open NetParameter file: yolov3.cfg in function 'cv::dnn::dnn4_v20240521::readNetFromDarknet'


In [None]:
%pip uninstall pyaudio
%pip install pyaudio


In [None]:
%pip uninstall opencv-python
%pip uninstall opencv-python-headless


Note: you may need to restart the kernel to use updated packages.




In [1]:
%pip install opencv-python


Collecting opencv-python
  Downloading opencv_python-4.10.0.84-cp37-abi3-win_amd64.whl (38.8 MB)
     ---------------------------------------- 38.8/38.8 MB 1.3 MB/s eta 0:00:00
Installing collected packages: opencv-python
Successfully installed opencv-python-4.10.0.84
Note: you may need to restart the kernel to use updated packages.


In [1]:
%pip install matplotlib


Note: you may need to restart the kernel to use updated packages.


In [4]:
import cv2
import numpy as np
import pyttsx3
import speech_recognition as sr
import threading
import matplotlib.pyplot as plt

# Constants
REAL_WORLD_OBJECT_HEIGHT = 1.7  # Average height of a person in meters
FOCAL_LENGTH = 800  # Example focal length in pixels; calibrate your camera for accurate results
CONFIDENCE_THRESHOLD = 0.3  # Lowered confidence threshold

# Load YOLO model
net = cv2.dnn.readNet("yolov3.weights", "yolov3.cfg")
with open("coco.names", "r") as f:
    classes = [line.strip() for line in f.readlines()]

layer_names = net.getLayerNames()
output_layers = [layer_names[i - 1] for i in net.getUnconnectedOutLayers()]

# Initialize text-to-speech engine
engine = pyttsx3.init()

# Initialize speech recognition
recognizer = sr.Recognizer()

def detect_objects(frame):
    height, width, _ = frame.shape

    # Preprocess the frame for YOLO
    blob = cv2.dnn.blobFromImage(frame, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
    net.setInput(blob)
    outs = net.forward(output_layers)

    detected_objects = []

    for out in outs:
        for detection in out:
            for obj in detection:
                obj = np.array(obj)

                # Ensure obj has enough elements
                if obj.size >= 6:
                    # Extract bounding box coordinates and class scores
                    center_x, center_y, w, h = obj[0:4] * [width, height, width, height]
                    scores = obj[5:]
                    class_id = np.argmax(scores)
                    confidence = scores[class_id]

                    if confidence > CONFIDENCE_THRESHOLD:
                        x = int(center_x - w / 2)
                        y = int(center_y - h / 2)

                        # Calculate distance
                        object_size = h  # or use w if more appropriate
                        distance = (REAL_WORLD_OBJECT_HEIGHT * FOCAL_LENGTH) / object_size

                        label = str(classes[class_id])
                        detected_objects.append((label, distance, x, y, w, h))
    
    return detected_objects

def speak_detected_objects(objects, frame):
    if not objects:
        engine.say("No objects detected.")
    else:
        for obj in objects:
            label, distance, x, y, w, h = obj
            engine.say(f"I see a {label} at approximately {distance:.2f} meters away.")
            # Draw bounding boxes on the frame
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
            cv2.putText(frame, f"{label} {distance:.2f}m", (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    engine.runAndWait()

def show_frame(frame):
    # Convert the frame from BGR to RGB
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    
    plt.imshow(rgb_frame)
    plt.axis('off')  # Hide the axis
    plt.show()

def voice_command():
    while True:
        with sr.Microphone() as source:
            print("Listening for commands...")
            recognizer.adjust_for_ambient_noise(source)
            audio = recognizer.listen(source)

            try:
                command = recognizer.recognize_google(audio).lower()
                print("Command:", command)
                if command == "stop":
                    return
            except sr.UnknownValueError:
                pass  # Ignore unrecognized commands
            except sr.RequestError as e:
                print(f"Google Speech Recognition request failed: {e}")

# Initialize video capture
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    print("Error: Unable to open video capture.")
    exit()

detected_objects = []

# Start voice command thread
command_thread = threading.Thread(target=voice_command, daemon=True)
command_thread.start()

# Start object detection
frame_count = 0
while True:
    ret, frame = cap.read()
    if not ret:
        print("Error: Unable to capture frame.")
        break

    # Perform object detection
    new_objects = detect_objects(frame)

    # Debugging: Print detected objects
    if not new_objects:
        print("No objects detected.")
    else:
        for obj in new_objects:
            print(f"Detected: {obj[0]} at distance {obj[1]:.2f}m")

    # Speak out detected objects with distance
    speak_detected_objects(new_objects, frame)

    # Display the frame
    show_frame(frame)

    # Save the frame
    filename = f"frame_{frame_count}.jpg"
    cv2.imwrite(filename, frame)
    print(f"Saved {filename}")
    frame_count += 1

    # Break the loop if the command thread has requested to stop
    if not command_thread.is_alive():
        break

# Release the video capture and close the text-to-speech engine
cap.release()
cv2.destroyAllWindows()
engine.stop()


error: OpenCV(4.10.0) D:\a\opencv-python\opencv-python\opencv\modules\dnn\src\darknet\darknet_importer.cpp:210: error: (-212:Parsing error) Failed to open NetParameter file: yolov3.cfg in function 'cv::dnn::dnn4_v20240521::readNetFromDarknet'


# Gives Frames


In [8]:
import cv2
import numpy as np
import pyttsx3
import speech_recognition as sr
import threading
import os
import matplotlib.pyplot as plt

# Constants
REAL_WORLD_OBJECT_HEIGHT = 1.7  # Average height of a person in meters
FOCAL_LENGTH = 800  # Example focal length in pixels; calibrate your camera for accurate results
CONFIDENCE_THRESHOLD = 0.3  # Lowered confidence threshold

# Paths to the YOLO files
base_path = os.path.dirname(__file__)  # Directory of the script
cfg_path = os.path.join(base_path, "yolov3.cfg")
weights_path = os.path.join(base_path, "yolov3.weights")
names_path = os.path.join(base_path, "coco.names")

# Check if files exist
if not os.path.isfile(cfg_path):
    raise FileNotFoundError(f"Configuration file not found: {cfg_path}")
if not os.path.isfile(weights_path):
    raise FileNotFoundError(f"Weights file not found: {weights_path}")
if not os.path.isfile(names_path):
    raise FileNotFoundError(f"Names file not found: {names_path}")

# Load YOLO model
net = cv2.dnn.readNet(weights_path, cfg_path)
with open(names_path, "r") as f:
    classes = [line.strip() for line in f.readlines()]

layer_names = net.getLayerNames()
output_layers = [layer_names[i - 1] for i in net.getUnconnectedOutLayers()]

# Initialize text-to-speech engine
engine = pyttsx3.init()

# Initialize speech recognition
recognizer = sr.Recognizer()

def detect_objects(frame):
    height, width, _ = frame.shape

    # Preprocess the frame for YOLO
    blob = cv2.dnn.blobFromImage(frame, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
    net.setInput(blob)
    outs = net.forward(output_layers)

    detected_objects = []

    for out in outs:
        for detection in out:
            for obj in detection:
                obj = np.array(obj)

                # Ensure obj has enough elements
                if obj.size >= 6:
                    # Extract bounding box coordinates and class scores
                    center_x, center_y, w, h = obj[0:4] * [width, height, width, height]
                    scores = obj[5:]
                    class_id = np.argmax(scores)
                    confidence = scores[class_id]

                    if confidence > CONFIDENCE_THRESHOLD:
                        x = int(center_x - w / 2)
                        y = int(center_y - h / 2)

                        # Calculate distance
                        object_size = h  # or use w if more appropriate
                        distance = (REAL_WORLD_OBJECT_HEIGHT * FOCAL_LENGTH) / object_size

                        label = str(classes[class_id])
                        detected_objects.append((label, distance, x, y, w, h))
    
    return detected_objects

def speak_detected_objects(objects, frame):
    if not objects:
        engine.say("No objects detected.")
    else:
        for obj in objects:
            label, distance, x, y, w, h = obj
            engine.say(f"I see a {label} at approximately {distance:.2f} meters away.")
            # Draw bounding boxes on the frame
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
            cv2.putText(frame, f"{label} {distance:.2f}m", (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    engine.runAndWait()

def show_frame(frame):
    # Convert the frame from BGR to RGB
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    
    plt.imshow(rgb_frame)
    plt.axis('off')  # Hide the axis
    plt.show()

def voice_command():
    while True:
        with sr.Microphone() as source:
            print("Listening for commands...")
            recognizer.adjust_for_ambient_noise(source)
            audio = recognizer.listen(source)

            try:
                command = recognizer.recognize_google(audio).lower()
                print("Command:", command)
                if command == "stop":
                    return
            except sr.UnknownValueError:
                pass  # Ignore unrecognized commands
            except sr.RequestError as e:
                print(f"Google Speech Recognition request failed: {e}")

# Initialize video capture
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    print("Error: Unable to open video capture.")
    exit()

detected_objects = []

# Start voice command thread
command_thread = threading.Thread(target=voice_command, daemon=True)
command_thread.start()

# Start object detection
frame_count = 0
while True:
    ret, frame = cap.read()
    if not ret:
        print("Error: Unable to capture frame.")
        break

    # Perform object detection
    new_objects = detect_objects(frame)

    # Debugging: Print detected objects
    if not new_objects:
        print("No objects detected.")
    else:
        for obj in new_objects:
            print(f"Detected: {obj[0]} at distance {obj[1]:.2f}m")

    # Speak out detected objects with distance
    speak_detected_objects(new_objects, frame)

    # Display the frame
    show_frame(frame)

    # Save the frame
    filename = f"frame_{frame_count}.jpg"
    cv2.imwrite(filename, frame)
    print(f"Saved {filename}")
    frame_count += 1

    # Break the loop if the command thread has requested to stop
    if not command_thread.is_alive():
        break

# Release the video capture and close the text-to-speech engine
cap.release()
cv2.destroyAllWindows()
engine.stop()


NameError: name '__file__' is not defined

In [None]:
import cv2
import numpy as np
import pyttsx3
import speech_recognition as sr
import threading
import os
import matplotlib.pyplot as plt

# Constants
REAL_WORLD_OBJECT_HEIGHT = 1.7  # Average height of a person in meters
FOCAL_LENGTH = 800  # Example focal length in pixels; calibrate your camera for accurate results
CONFIDENCE_THRESHOLD = 0.3  # Lowered confidence threshold

# Paths to the YOLO files
cfg_path = "yolov3.cfg"
weights_path = "yolov3.weights"
names_path = "coco.names"

# Check if files exist
if not os.path.isfile(cfg_path):
    raise FileNotFoundError(f"Configuration file not found: {cfg_path}")
if not os.path.isfile(weights_path):
    raise FileNotFoundError(f"Weights file not found: {weights_path}")
if not os.path.isfile(names_path):
    raise FileNotFoundError(f"Names file not found: {names_path}")

# Load YOLO model
net = cv2.dnn.readNet(weights_path, cfg_path)
with open(names_path, "r") as f:
    classes = [line.strip() for line in f.readlines()]

layer_names = net.getLayerNames()
output_layers = [layer_names[i - 1] for i in net.getUnconnectedOutLayers()]

# Initialize text-to-speech engine
engine = pyttsx3.init()

# Initialize speech recognition
recognizer = sr.Recognizer()

def detect_objects(frame):
    height, width, _ = frame.shape

    # Preprocess the frame for YOLO
    blob = cv2.dnn.blobFromImage(frame, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
    net.setInput(blob)
    outs = net.forward(output_layers)

    detected_objects = []

    for out in outs:
        for detection in out:
            for obj in detection:
                obj = np.array(obj)

                # Ensure obj has enough elements
                if obj.size >= 6:
                    # Extract bounding box coordinates and class scores
                    center_x, center_y, w, h = obj[0:4] * [width, height, width, height]
                    scores = obj[5:]
                    class_id = np.argmax(scores)
                    confidence = scores[class_id]

                    if confidence > CONFIDENCE_THRESHOLD:
                        x = int(center_x - w / 2)
                        y = int(center_y - h / 2)

                        # Calculate distance
                        object_size = h  # or use w if more appropriate
                        distance = (REAL_WORLD_OBJECT_HEIGHT * FOCAL_LENGTH) / object_size

                        label = str(classes[class_id])
                        detected_objects.append((label, distance, x, y, w, h))
    
    return detected_objects

def speak_detected_objects(objects, frame):
    if not objects:
        engine.say("No objects detected.")
    else:
        for obj in objects:
            label, distance, x, y, w, h = obj
            engine.say(f"I see a {label} at approximately {distance:.2f} meters away.")
            # Draw bounding boxes on the frame
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
            cv2.putText(frame, f"{label} {distance:.2f}m", (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    engine.runAndWait()

def show_frame(frame):
    # Convert the frame from BGR to RGB
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    
    plt.imshow(rgb_frame)
    plt.axis('off')  # Hide the axis
    plt.show()

def voice_command():
    while True:
        with sr.Microphone() as source:
            print("Listening for commands...")
            recognizer.adjust_for_ambient_noise(source)
            audio = recognizer.listen(source)

            try:
                command = recognizer.recognize_google(audio).lower()
                print("Command:", command)
                if command == "stop":
                    return
            except sr.UnknownValueError:
                pass  # Ignore unrecognized commands
            except sr.RequestError as e:
                print(f"Google Speech Recognition request failed: {e}")

# Initialize video capture
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    print("Error: Unable to open video capture.")
    exit()

detected_objects = []

# Start voice command thread
command_thread = threading.Thread(target=voice_command, daemon=True)
command_thread.start()

# Start object detection
frame_count = 0
while True:
    ret, frame = cap.read()
    if not ret:
        print("Error: Unable to capture frame.")
        break

    # Perform object detection
    new_objects = detect_objects(frame)

    # Debugging: Print detected objects
    if not new_objects:
        print("No objects detected.")
    else:
        for obj in new_objects:
            print(f"Detected: {obj[0]} at distance {obj[1]:.2f}m")

    # Speak out detected objects with distance
    speak_detected_objects(new_objects, frame)

    # Display the frame
    show_frame(frame)

    # Save the frame
    filename = f"frame_{frame_count}.jpg"
    cv2.imwrite(filename, frame)
    print(f"Saved {filename}")
    frame_count += 1

    # Break the loop if the command thread has requested to stop
    if not command_thread.is_alive():
        break

# Release the video capture and close the text-to-speech engine
cap.release()
cv2.destroyAllWindows()
engine.stop()
