In [1]:
from ultralytics import YOLO

# Load the trained weights file into an Ultralytics YOLO model object.
# The webcam cell further down loads this same weights file again.
model = YOLO("best_v3_yolov11.pt")

In [2]:
import torch
import torchvision
# Report the installed torch / torchvision builds, one per line.
print(torch.__version__, torchvision.__version__, sep="\n")

2.5.0+cpu
0.20.0+cpu


In [1]:
# This code started as a direct copy from opencv
# https://docs.opencv.org/4.x/dd/d43/tutorial_py_video_display.html

#pyttsx3 courtesy of: https://pypi.org/project/pyttsx3/

import numpy as np
import cv2 as cv
import pyttsx3

# Info about YOLO import and loading the Yolo Model
# https://docs.ultralytics.com/tasks/classify/#train

# Added this to import YOLO
from ultralytics import YOLO

# When DEBUG is True the capture loop records every registered sign together
# with its confidence and prints that list when the loop ends.
DEBUG = True

# Create the pyttsx3 text-to-speech engine used for speaking and replaying.
engine = pyttsx3.init()

# Slow the speech down: read the engine's current rate and set it to 75% of
# that value (truncated to an int).
default_rate = engine.getProperty('rate')
engine.setProperty('rate', int(default_rate * 0.75))  # set the speed rate to 75%

def select_voice(engine, gender='m'):
    """Pick the text-to-speech voice on ``engine`` according to ``gender``.

    Parameters
    ----------
    engine
        A pyttsx3-style engine exposing ``getProperty('voices')`` and
        ``setProperty('voice', voice_id)``.
    gender : str, default 'm'
        ``'f'`` requests the voice at index 1; any other value requests the
        voice at index 0.

    Notes
    -----
    The index-to-gender mapping (0 = male, 1 = female) is the one assumed by
    the original notebook; it depends on the platform's speech driver and is
    not guaranteed -- TODO confirm on each target machine.

    If the driver reports no voices at all, the engine's current voice is left
    untouched. If only one voice is installed and ``'f'`` is requested, the
    function falls back to index 0 instead of raising ``IndexError``.
    """
    voices = engine.getProperty('voices')
    if not voices:
        # Nothing to choose from: keep whatever voice the driver defaults to.
        return

    wanted_index = 1 if gender == 'f' else 0
    if wanted_index >= len(voices):
        # Requested slot does not exist on this machine; use the first voice.
        wanted_index = 0

    engine.setProperty('voice', voices[wanted_index].id)

# Ask which voice to use. The answer is trimmed and lower-cased; select_voice
# treats 'f' as the female choice and anything else as the male choice.
gender = input("Select voice (m: male / f: female) : ").strip().lower()
select_voice(engine, gender)

# Load our custom-trained weights into an Ultralytics YOLO model.
# NOTE(review): the earlier comment called this "YOLOv8", but the weights file
# name says "yolov11" -- confirm which architecture was actually trained.
model = YOLO("best_v3_yolov11.pt")
 
# Open video capture device 0 (presumably the default webcam -- verify on the
# target machine).
cap = cv.VideoCapture(0)

# Labels of the signs registered by the capture loop (appended to by the loop)
captured_text = []

# Confidence values recorded alongside the entries of captured_text
captured_confidence = []

# Minimum top-1 confidence a frame needs before it counts towards a sign
confidence_requirement = 0.90
# Alternative, more permissive threshold (kept for experimentation):
#confidence_requirement = 0.30

# Number of consecutive confident frames that showed the same sign
count = 0

# Number of below-threshold frames seen since the last confident frame or reset
noise_count = 0

# Class index of the most recent confident frame (None until one is seen)
last = None

# Lookup from the model's top-1 class index to a label. Note that the final
# entry, the space gesture, is stored as " " and not as the string "SPACE".
translator = ["A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z","DEL","NOTHING"," "]
# Text overlaid on the video: a fixed "Text: " label followed by the signs so far
live_text = "Text: "


# ---------------------------------------------------------------------------
# Webcam capture loop
#
# Reads frames from `cap`, classifies each one with `model`, and registers a
# sign once the same class has been seen with confidence >=
# `confidence_requirement` on 3 consecutive confident frames. Registered signs
# are appended to `live_text`, which is drawn on the video. Pressing 'q' (or
# losing the stream) ends the loop; the recognized sentence is then stored in
# `spoken_text` and spoken with the pyttsx3 `engine`.
#
# Fixes relative to the first version of this cell:
#   * The space gesture is labelled " " in `translator`, so the old
#     `sign == "SPACE"` check never matched; both spellings are accepted now.
#   * The spoken sentence is derived from `live_text` (which honours DEL and
#     ignores NOTHING) instead of the DEBUG-only `captured_text` log.
#   * DEL can no longer delete characters of the "Text: " label.
#   * `spoken_text` is always defined after the loop, and the camera/windows
#     are released even if the loop is interrupted.
# ---------------------------------------------------------------------------

if not cap.isOpened():
    print("Cannot open camera")
    exit()

# Length of the fixed label already present in live_text (e.g. "Text: ").
# Everything after this offset is recognized content.
text_prefix_len = len(live_text)

try:
    while True:
        # Capture frame-by-frame; ret is False when no frame could be read.
        ret, frame = cap.read()
        if not ret:
            print("Can't receive frame (stream end?). Exiting ...")
            break

        # Inference pattern adapted from
        # https://docs.ultralytics.com/modes/predict/#thread-safe-inference
        # verbose=False silences per-frame logging, see
        # https://github.com/ultralytics/ultralytics/issues/1896
        results = model(frame, verbose=False)
        top_class = results[0].probs.top1
        top_confidence = results[0].probs.top1conf  # confidence of the top-class prediction

        if top_confidence >= confidence_requirement:
            # Same class as the previous confident frame -> extend the streak,
            # otherwise start a new streak for this class.
            if top_class == last:
                count += 1
            else:
                count = 1

            # Register the sign exactly once, when the streak reaches 3 frames.
            if count == 3:
                sign = translator[top_class]

                if DEBUG:
                    # Debug log only; kept in lock-step so both lists always
                    # have the same length.
                    captured_text.append(sign)
                    captured_confidence.append(top_confidence)

                if sign in ("SPACE", " "):
                    # Show "_" on screen so the space is visible in the overlay.
                    live_text += "_"
                elif sign == "DEL":
                    # Remove the last recognized character, never the label.
                    if len(live_text) > text_prefix_len:
                        live_text = live_text[:-1]
                elif sign != "NOTHING":
                    live_text += sign

            last = top_class
            # A confident frame is not noise.
            noise_count = 0
        else:
            noise_count += 1

        # After three low-confidence frames, reset the streak so the same sign
        # can be registered again (e.g. "A", "A").
        if noise_count == 3:
            count = 0
            noise_count = 0

        # Overlay the text; example from
        # https://www.geeksforgeeks.org/python-opencv-write-text-on-video/
        cv.putText(frame,
                   live_text,
                   (10, 460),
                   cv.FONT_HERSHEY_SIMPLEX, 1,
                   (0, 255, 255),
                   2,
                   cv.LINE_4)

        cv.imshow("Capture", frame)

        if cv.waitKey(1) == ord('q'):
            break
finally:
    # When everything is done (or on error/interrupt), release the capture.
    cap.release()
    cv.destroyAllWindows()

if DEBUG:
    print("\n\nCaptured Text:")
    for logged_sign, logged_confidence in zip(captured_text, captured_confidence):
        print("Translated text: ", logged_sign, " Confidence: ", logged_confidence.item())

# Turn the on-screen text into the sentence to speak: drop the label and
# convert the visible "_" markers back into real spaces.
spoken_text = live_text[text_prefix_len:].replace("_", " ").strip()
print("Speaking: ", spoken_text)
if spoken_text:
    engine.say(spoken_text)
    engine.runAndWait()

# Replay prompt shown after the webcam feed has ended: 'r' speaks the captured
# sentence again, 'e' leaves the loop, and any other input simply re-prompts.
replay_choice = ""
while replay_choice != 'e':
    replay_choice = input("Press 'R' to replay the sentence or 'E' to exit: ").strip().lower()
    if replay_choice == 'r':
        print("Replaying: ", spoken_text)
        engine.say(spoken_text)
        engine.runAndWait()
    elif replay_choice == 'e':
        print("Exiting.")



Captured Text:
Translated text:  U  Confidence:  0.9990386962890625
Translated text:  H  Confidence:  0.9212673306465149
Speaking:  UH
Replaying:  UH
Exiting.


In [2]:
#Courtesy of: https://www.geeksforgeeks.org/essential-opencv-functions-to-get-started-into-computer-vision/

# Folder that holds one reference image per sign: A.jpg ... Z.jpg and SPACE.jpg.
# This is the only place to edit when the dataset lives somewhere else.
ASL_IMAGE_DIR = r'C:\Users\jigsa\Downloads\ASL TTS Dataset'

# Map each letter, plus '_' for a space, to the corresponding ASL image path.
# Paths are joined with a backslash so they match the original Windows paths
# character for character.
asl_image_paths = {
    letter: ASL_IMAGE_DIR + '\\' + letter + '.jpg'
    for letter in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
}
asl_image_paths['_'] = ASL_IMAGE_DIR + '\\' + 'SPACE.jpg'

# Cache of ready-to-show gesture images keyed by letter ('_' for space), so the
# files are read from disk only once.
asl_images = {}

# Read every image up front and scale it to 500x500. cv.imread returns None
# when a file cannot be read; those letters are reported and left out.
for letter, path in asl_image_paths.items():
    img = cv.imread(path)
    if img is None:
        print(f"The image for '{letter}' is not found at : {path}")
        continue
    resized_img = cv.resize(img, (500, 500))
    asl_images[letter] = resized_img

def display_asl_gesture(text):
    """Show ``text`` one character at a time as ASL gesture images.

    Each character is upper-cased (a space becomes ``'_'``) and looked up in
    the module-level ``asl_images`` cache. A found image is shown in its own
    window for one second and the window is then closed. Characters without
    an image are reported on stdout and skipped.

    Parameters
    ----------
    text : str
        The text to spell out.
    """
    for raw_char in text:
        # The image cache stores the space gesture under '_'.
        key = raw_char.upper() if raw_char != ' ' else '_'

        gesture = asl_images.get(key)
        if gesture is None:
            print(f"ASL gesture for '{key}' is not available.")
            continue

        window_title = f"ASL Gesture for {key}"
        cv.imshow(window_title, gesture)
        cv.waitKey(1000)  # keep each image on screen for 1 second
        cv.destroyWindow(window_title)

# Ask for the text to translate; surrounding whitespace is stripped.
input_text = input("Enter text to translate to ASL gestures: ").strip()

# Spell the text out as ASL gesture images, one window per character.
display_asl_gesture(input_text)

# Close any OpenCV windows that are still open once the slideshow is over.
cv.destroyAllWindows()

In [None]:
# Courtesy of: https://pypi.org/project/SpeechRecognition/
# and: https://stackoverflow.com/questions/62659602/how-can-i-use-speech-recognition-in-python3-in-the-mac-i-downloaded-but-pyaudio

import speech_recognition as sr

# Create the shared SpeechRecognition recognizer used by speech_to_text_to_asl().
recognizer = sr.Recognizer()

def speech_to_text_to_asl():
    """Record one spoken phrase, transcribe it, and show it as ASL gestures.

    Steps:
      1. Record from the microphone, after calibrating the module-level
         ``recognizer`` for ambient noise.
      2. Send the recording to Google's speech recognition via
         ``recognizer.recognize_google``.
      3. Print the recognized text and pass it to ``display_asl_gesture``.

    The microphone is held open only for step 1; it is released before the
    network request and the gesture slideshow start. Recognition failures
    (``sr.UnknownValueError`` for unintelligible speech, ``sr.RequestError``
    for a failed request) are reported on stdout and end the function without
    displaying anything.
    """
    print("Say something...")

    # Capture audio; leaving the `with` block releases the microphone.
    with sr.Microphone() as source:
        recognizer.adjust_for_ambient_noise(source)
        audio_data = recognizer.listen(source)

    # Only the recognition call is guarded, so errors raised while displaying
    # gestures are not mistaken for recognition errors.
    try:
        text = recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        print("Speech was unclear. Please try again.")
        return
    except sr.RequestError:
        print("Could not request results; check your internet connection.")
        return

    print(f"Recognized Text: {text}")

    # Spell the recognized text out as ASL gesture images.
    display_asl_gesture(text)

# Run one listen -> transcribe -> display cycle. This relies on
# display_asl_gesture from the text-to-ASL cell above already being defined.
speech_to_text_to_asl()

Say something...
Recognized Text: hi my name is Andrew
