# Frist complete code

In [None]:
import cv2 as cv
import tkinter as tk
from tkinter import ttk
from tkinter.scrolledtext import ScrolledText
import ttkbootstrap as ttkb  # Modern UI styling
from ttkbootstrap.constants import PRIMARY
from ultralytics import YOLO
from PIL import Image, ImageTk
import pyttsx3
from googletrans import Translator, LANGUAGES
import speech_recognition as sr
import sys

# Initialize Text-to-Speech
engine = pyttsx3.init()
engine.setProperty(
    "rate", int(engine.getProperty("rate") * 0.75)
)  # Set slower speech rate
engine.setProperty(
    "voice", engine.getProperty("voices")[1].id
)  # Default to female voice

# Translator instance
translator = Translator()


# Function for Text-to-Speech
def textToSpeech(text="Hello"):
    spoken_text = "".join(text).replace("_", " ").strip()
    print("Speaking: ", spoken_text)
    engine.say(spoken_text)
    engine.runAndWait()


# Load the YOLO model
model = YOLO("small2diff2better.pt")
DEBUG = True
confidence_requirement = 0.90

captured_text = []
captured_confidence = []
count = 0
noise_count = 0
last = None
live_text = ""

letters = [
    "A",
    "B",
    "C",
    "D",
    "E",
    "F",
    "G",
    "H",
    "I",
    "J",
    "K",
    "L",
    "M",
    "N",
    "O",
    "P",
    "Q",
    "R",
    "S",
    "T",
    "U",
    "V",
    "W",
    "X",
    "Y",
    "Z",
    "DEL",
    "NOTHING",
    "_",
]


# Initialize the Tkinter window using ttkbootstrap
window = ttkb.Window(themename="darkly")  # Modern theme
window.title("Sign Language Detection App")
window.geometry("800x900")

# Create a Label to display video feed
video_frame = ttkb.Frame(window, padding=10)
video_frame.pack(pady=10)

video_label = ttkb.Label(video_frame)
video_label.pack()

# ScrolledText for displaying detected text
translated_text_display = ScrolledText(
    window, font=("Arial", 18), height=3, wrap=tk.WORD, bg="#282828", fg="#f1f1f1"
)
translated_text_display.pack(padx=10, pady=10, fill="x")

# Additional ScrolledText for translated text
translated_language_display = ScrolledText(
    window, font=("Arial", 18), height=2, wrap=tk.WORD, bg="#282828", fg="#f1f1f1"
)
translated_language_display.pack(padx=10, pady=10, fill="x")

# Dropdown for language selection
language_frame = ttkb.Frame(window, padding=10)
language_frame.pack(pady=10)

selected_language = tk.StringVar(value="es")  # Default to Spanish
language_label = ttkb.Label(language_frame, text="Select Language:", font=("Arial", 12))
language_label.pack(side="left", padx=5)

language_dropdown = ttk.Combobox(
    language_frame,
    values=[f"{lang} ({code})" for code, lang in LANGUAGES.items()],
    textvariable=selected_language,
    state="readonly",
    width=30,
)
language_dropdown.pack(side="left", padx=5)
language_dropdown.set("English (en)")  # Default to Spanish

# Speech Recognizer initialization
recognizer = sr.Recognizer()

# Mapping each letter+space to the corresponding ASL image path
asl_image_paths = {
    "A": r"ASL TTS Dataset\A.png",
    "B": r"ASL TTS Dataset\B.png",
    "C": r"ASL TTS Dataset\C.png",
    "D": r"ASL TTS Dataset\D.png",
    "E": r"ASL TTS Dataset\E.png",
    "F": r"ASL TTS Dataset\F.png",
    "G": r"ASL TTS Dataset\G.png",
    "H": r"ASL TTS Dataset\H.png",
    "I": r"ASL TTS Dataset\I.png",
    "J": r"ASL TTS Dataset\J.png",
    "K": r"ASL TTS Dataset\K.png",
    "L": r"ASL TTS Dataset\L.png",
    "M": r"ASL TTS Dataset\M.png",
    "N": r"ASL TTS Dataset\N.png",
    "O": r"ASL TTS Dataset\O.png",
    "P": r"ASL TTS Dataset\P.png",
    "Q": r"ASL TTS Dataset\Q.png",
    "R": r"ASL TTS Dataset\R.png",
    "S": r"ASL TTS Dataset\S.png",
    "T": r"ASL TTS Dataset\T.png",
    "U": r"ASL TTS Dataset\U.png",
    "V": r"ASL TTS Dataset\V.png",
    "W": r"ASL TTS Dataset\W.png",
    "X": r"ASL TTS Dataset\X.png",
    "Y": r"ASL TTS Dataset\Y.png",
    "Z": r"ASL TTS Dataset\Z.png",
    "_": r"ASL TTS Dataset\SPACE.png",
}

# a dictionary to store loaded and resized images to avoid loading each time
asl_images = {}

# pre-load and resize all images to 500x500
for letter, path in asl_image_paths.items():
    img = cv.imread(path)
    if img is not None:
        resized_img = cv.resize(img, (500, 500))
        asl_images[letter] = resized_img
    else:
        print(f"The image for '{letter}' is not found at : {path}")


# Function to display ASL gesture
def display_asl_gesture(text):
    for char in text:
        char = "_" if char == " " else char.upper()

        img = asl_images.get(char)
        if img is not None:
            cv.imshow(f"ASL Gesture for {char}", img)
            cv.waitKey(1000)
            cv.destroyWindow(f"ASL Gesture for {char}")
        else:
            print(f"ASL gesture for '{char}' is not available.")


# Function for Speech to Text to ASL
def speech_to_text_to_asl():
    print("Say something...")

    # Start capturing audio
    with sr.Microphone() as source:
        recognizer.adjust_for_ambient_noise(source)
        audio_data = recognizer.listen(source)

        try:
            # Recognize speech using Google's speech recognition
            text = recognizer.recognize_google(audio_data)
            print(f"Recognized Text: {text}")

            # Call the ASL display function with the recognized text
            display_asl_gesture(text)

        except sr.UnknownValueError:
            print("Speech was unclear. Please try again.")
        except sr.RequestError:
            print("Could not request results; check your internet connection.")


# Function for Text to ASL (direct text input)
def text_to_asl():
    text = translated_text_display.get("1.0", tk.END).strip()
    if text:
        print(f"Text to ASL: {text}")
        display_asl_gesture(text)


# Function to update the video frame
def update_frame():
    global last, count, noise_count, live_text

    ret, frame = cap.read()
    if ret:
        # Run YOLO inference
        results = model(frame, verbose=False)
        top_class = results[0].probs.top1
        top_confidence = results[0].probs.top1conf

        if top_confidence >= confidence_requirement:
            if top_class == last:
                count += 1
            else:
                count = 1

            if count == 3:
                if DEBUG:
                    captured_text.append(letters[top_class])
                    captured_confidence.append(top_confidence)

                if letters[top_class] not in ["DEL", "NOTHING"]:
                    live_text += letters[top_class]
                elif letters[top_class] == "DEL":
                    live_text = live_text[:-1]

            last = top_class
            noise_count = 0
        else:
            noise_count += 1

        if noise_count == 3:
            count = 0
            noise_count = 0

        # Convert frame to ImageTk
        img = cv.cvtColor(frame, cv.COLOR_BGR2RGB)
        img = Image.fromarray(img)
        img_tk = ImageTk.PhotoImage(image=img)

        video_label.imgtk = img_tk
        video_label.configure(image=img_tk)

        # Update the live text display
        translated_text_display.delete(1.0, tk.END)
        translated_text_display.insert(tk.END, live_text)
        translated_text_display.see(tk.END)

    window.after(10, update_frame)


# Function to translate and display the translated text
def translate_text():
    input_text = live_text.replace("_", " ")
    if input_text.strip():
        selected_lang_code = selected_language.get().split("(")[-1].strip(")")
        translation = translator.translate(input_text, dest=selected_lang_code)
        translated_language_display.delete(1.0, tk.END)
        translated_language_display.insert(tk.END, translation.text)


# Function to speak the predicted text (live_text)
def speak_text():
    predicted_text = live_text
    if predicted_text:
        textToSpeech(predicted_text)


# Function to clear the text areas
def clear_text():
    global live_text
    live_text = ""
    translated_text_display.delete(1.0, tk.END)
    translated_language_display.delete(1.0, tk.END)


# Function to quit the app
def quit_app():
    global cap
    if cap.isOpened():
        cap.release()
    cv.destroyAllWindows()
    window.quit()
    window.destroy()
    sys.exit()


# Add the buttons
button_frame = ttkb.Frame(window)
button_frame.pack(pady=10)

translate_button = ttkb.Button(
    button_frame, text="Translate Text", command=translate_text, bootstyle=PRIMARY
)
translate_button.pack(side="left", padx=5)

speak_button = ttkb.Button(
    button_frame, text="Speak Text", command=speak_text, bootstyle=PRIMARY
)
speak_button.pack(side="left", padx=5)

clear_button = ttkb.Button(
    button_frame, text="Clear", command=clear_text, bootstyle="danger"
)
clear_button.pack(side="left", padx=5)

speech_to_text_button = ttkb.Button(
    button_frame,
    text="Start Speech-to-Text",
    command=speech_to_text_to_asl,
    bootstyle=PRIMARY,
)
speech_to_text_button.pack(side="left", padx=5)

text_to_asl_button = ttkb.Button(
    button_frame, text="Text to ASL", command=text_to_asl, bootstyle=PRIMARY
)
text_to_asl_button.pack(side="left", padx=5)

quit_button = ttkb.Button(
    button_frame, text="Quit", command=quit_app, bootstyle="danger"
)
quit_button.pack(side="left", padx=5)

# Open webcam stream for video
cap = cv.VideoCapture(0)

# Start updating frames
update_frame()

# Start the Tkinter main loop
window.mainloop()

'something is happening here'

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Live text before correction: 'COW'
Corrected text: 'COW'
Say something...
Recognized Text: project demo
Translated to en: cow project demo
Translated to fr: Démo du projet de vache
Translated to en: Cow's demo


In [4]:
import cv2 as cv
import tkinter as tk
from tkinter import ttk
from tkinter.scrolledtext import ScrolledText
import ttkbootstrap as ttkb  # Modern UI styling
from ttkbootstrap.constants import PRIMARY
from ultralytics import YOLO
from PIL import Image, ImageTk
import pyttsx3
from googletrans import Translator, LANGUAGES
import speech_recognition as sr
import sys
from spell_checker import SpellCheck  # custom imported class
from playsound import playsound


# Initialize Text-to-Speech
engine = pyttsx3.init()
engine.setProperty(
    "rate", int(engine.getProperty("rate") * 0.75)
)  # Set slower speech rate
engine.setProperty(
    "voice", engine.getProperty("voices")[1].id
)  # Default to female voice

# Translator instance
translator = Translator()


# Function for Text-to-Speech
def textToSpeech(text="Hello"):
    spoken_text = "".join(text).replace("_", " ").strip()
    print("Speaking: ", spoken_text)
    engine.say(spoken_text)
    engine.runAndWait()


# Load the YOLO model
model = YOLO("small2diff2better.pt")
DEBUG = True
confidence_requirement = 0.90

captured_text = []
captured_confidence = []
count = 0
noise_count = 0
last = None
live_text = ""

letters = [
    "A",
    "B",
    "C",
    "D",
    "E",
    "F",
    "G",
    "H",
    "I",
    "J",
    "K",
    "L",
    "M",
    "N",
    "O",
    "P",
    "Q",
    "R",
    "S",
    "T",
    "U",
    "V",
    "W",
    "X",
    "Y",
    "Z",
    "DEL",
    "NOTHING",
    "_",
]


# # ASL image paths and loading them
asl_image_paths = {
    "A": r"ASL TTS Dataset\A.png",
    "B": r"ASL TTS Dataset\B.png",
    "C": r"ASL TTS Dataset\C.png",
    "D": r"ASL TTS Dataset\D.png",
    "E": r"ASL TTS Dataset\E.png",
    "F": r"ASL TTS Dataset\F.png",
    "G": r"ASL TTS Dataset\G.png",
    "H": r"ASL TTS Dataset\H.png",
    "I": r"ASL TTS Dataset\I.png",
    "J": r"ASL TTS Dataset\J.png",
    "K": r"ASL TTS Dataset\K.png",
    "L": r"ASL TTS Dataset\L.png",
    "M": r"ASL TTS Dataset\M.png",
    "N": r"ASL TTS Dataset\N.png",
    "O": r"ASL TTS Dataset\O.png",
    "P": r"ASL TTS Dataset\P.png",
    "Q": r"ASL TTS Dataset\Q.png",
    "R": r"ASL TTS Dataset\R.png",
    "S": r"ASL TTS Dataset\S.png",
    "T": r"ASL TTS Dataset\T.png",
    "U": r"ASL TTS Dataset\U.png",
    "V": r"ASL TTS Dataset\V.png",
    "W": r"ASL TTS Dataset\W.png",
    "X": r"ASL TTS Dataset\X.png",
    "Y": r"ASL TTS Dataset\Y.png",
    "Z": r"ASL TTS Dataset\Z.png",
    "_": r"ASL TTS Dataset\SPACE.png",
}


# Pre-load and resize all ASL images to 500x500
asl_images = {}
for letter, path in asl_image_paths.items():
    img = cv.imread(path)
    if img is not None:
        resized_img = cv.resize(img, (500, 500))
        asl_images[letter] = resized_img
    else:
        print(f"The image for '{letter}' is not found at: {path}")


# Function to display ASL gestures for each letter in the input text
def display_asl_gesture(text):
    for char in text:
        char = "_" if char == " " else char.upper()
        img = asl_images.get(char)
        if img is not None:
            cv.imshow(f"ASL Gesture for {char}", img)
            cv.waitKey(1000)  # Show each image for 1 second
            cv.destroyWindow(f"ASL Gesture for {char}")
        else:
            print(f"ASL gesture for '{char}' is not available.")


spell_check = SpellCheck()

# Initialize the Tkinter window using ttkbootstrap
window = ttkb.Window(themename="darkly")  # Modern theme
window.title("Sign Language Detection App")
window.geometry("800x900")

# Create a Label to display video feed
video_frame = ttkb.Frame(window, padding=10)
video_frame.pack(pady=10)

video_label = ttkb.Label(video_frame)
video_label.pack()

# ScrolledText for displaying detected text
translated_text_display = ScrolledText(
    window, font=("Arial", 18), height=3, wrap=tk.WORD, bg="#282828", fg="#f1f1f1"
)
translated_text_display.pack(padx=10, pady=10, fill="x")

# Additional ScrolledText for translated text
translated_language_display = ScrolledText(
    window, font=("Arial", 18), height=2, wrap=tk.WORD, bg="#282828", fg="#f1f1f1"
)
translated_language_display.pack(padx=10, pady=10, fill="x")

# Dropdown for language selection
language_frame = ttkb.Frame(window, padding=10)
language_frame.pack(pady=10)

selected_language = tk.StringVar(value="en")  # Default to Spanish
language_label = ttkb.Label(language_frame, text="Select Language:", font=("Arial", 12))
language_label.pack(side="left", padx=5)

language_dropdown = ttk.Combobox(
    language_frame,
    values=[f"{lang} ({code})" for code, lang in LANGUAGES.items()],
    textvariable=selected_language,
    state="readonly",
    width=30,
)
language_dropdown.pack(side="left", padx=5)
language_dropdown.set("English (en)")  # Default to Spanish

# Speech Recognizer initialization
recognizer = sr.Recognizer()


# # Function to display ASL gesture
# def display_asl_gesture(text):
#     for char in text:
#         char = "_" if char == " " else char.upper()
#         print(
#             f"ASL gesture for '{char}' would be displayed here."
#         )  # Placeholder action


# Function for Speech to Text to ASL
# def speech_to_text_to_asl():
#     print("Say something...")
#     with sr.Microphone() as source:
#         recognizer.adjust_for_ambient_noise(source)
#         audio_data = recognizer.listen(source)
#         try:
#             text = recognizer.recognize_google(audio_data)
#             print(f"Recognized Text: {text}")
#             display_asl_gesture(text)
#         except sr.UnknownValueError:
#             print("Speech was unclear. Please try again.")
#         except sr.RequestError:
#             print("Could not request results; check your internet connection.")


# Function for Speech to Text to ASL
# def speech_to_text_to_asl():
#     print("Say something...")
#     with sr.Microphone() as source:
#         recognizer.adjust_for_ambient_noise(source)
#         try:
#             audio_data = recognizer.listen(source)
#             text = recognizer.recognize_google(audio_data)
#             print(f"Recognized Text: {text}")

#             # Insert the recognized text into the second text field
#             current_text = translated_language_display.get(1.0, tk.END).strip()
#             updated_text = f"{current_text} {text}" if current_text else text
#             translated_language_display.delete(1.0, tk.END)
#             translated_language_display.insert(tk.END, updated_text)
#         except sr.UnknownValueError:
#             print("Speech was unclear. Please try again.")
#         except sr.RequestError:
#             print("Could not request results; check your internet connection.")


def speech_to_text_to_asl():
    # Play start sound
    playsound("./start_sound.mp3")  # Path to the start sound file
    print("Say something...")
    with sr.Microphone() as source:
        recognizer.adjust_for_ambient_noise(source)
        try:
            audio_data = recognizer.listen(source)
            text = recognizer.recognize_google(audio_data)
            print(f"Recognized Text: {text}")

            # Insert the recognized text into the second text field
            current_text = translated_language_display.get(1.0, tk.END).strip()
            updated_text = f"{current_text} {text}" if current_text else text
            translated_language_display.delete(1.0, tk.END)
            translated_language_display.insert(tk.END, updated_text)
        except sr.UnknownValueError:
            print("Speech was unclear. Please try again.")
        except sr.RequestError:
            print("Could not request results; check your internet connection.")
        finally:
            # Play stop sound
            playsound("stop_sound.mp3")


# Function for Text to ASL (direct text input)
# def text_to_asl():
#     text = captured_text.strip()
#     # print(captured_text)
#     if text:
#         print(f"Text to ASL: {text}")
#         display_asl_gesture(text)


# Function for Text to ASL (direct text input from the second text field)
def text_to_asl():
    # Get the content of the second text field
    text = translated_language_display.get(1.0, tk.END).strip()

    if text:
        print(f"Text to ASL: {text}")
        display_asl_gesture(text)
    else:
        print("No text available in the second text field for ASL conversion.")


def capture_text():
    global captured_text, live_text
    if live_text.strip():
        print(f"Live text before correction: '{live_text}'")
        try:
            captured_text = spell_check.spell_check(live_text.replace("_", " ").strip())
            print(f"Corrected text: '{captured_text}'")

            # Update the second text area with the corrected text
            current_text = translated_language_display.get(1.0, tk.END).strip()
            updated_text = (
                f"{current_text} {captured_text.lower()}"
                if current_text
                else captured_text.lower()
            )
            translated_language_display.delete(1.0, tk.END)
            translated_language_display.insert(tk.END, updated_text)

            # Clear the live text and the first text area
            live_text = ""
            translated_text_display.delete(1.0, tk.END)
        except Exception as e:
            print(f"Error during spell checking: {e}")


# def capture_text():
#     global captured_text
#     if live_text.strip():
#         print(f"Live text before correction: '{live_text}'")
#         try:
#             captured_text = spell_check.spell_check(live_text.replace("_", " ").strip())
#             print(f"Corrected text: '{captured_text}'")
#             current_text = translated_language_display.get(1.0, tk.END).strip()
#             updated_text = (
#                 f"{current_text} {captured_text.lower()}"
#                 if current_text
#                 else captured_text.lower()
#             )
#             translated_language_display.delete(1.0, tk.END)
#             translated_language_display.insert(tk.END, updated_text)
#         except Exception as e:
#             print(f"Error during spell checking: {e}")


# Function to update the video frame
def update_frame():
    global last, count, noise_count, live_text
    ret, frame = cap.read()
    if ret:
        results = model(frame, verbose=False)
        top_class = results[0].probs.top1
        top_confidence = results[0].probs.top1conf
        if top_confidence >= confidence_requirement:
            if top_class == last:
                count += 1
            else:
                count = 1
            if count == 3:
                if letters[top_class] not in ["DEL", "NOTHING"]:
                    live_text += letters[top_class]
                elif letters[top_class] == "DEL":
                    live_text = live_text[:-1]
            last = top_class
            noise_count = 0
        else:
            noise_count += 1
        if noise_count == 3:
            count = 0
            noise_count = 0
        img = cv.cvtColor(frame, cv.COLOR_BGR2RGB)
        img = Image.fromarray(img)
        img_tk = ImageTk.PhotoImage(image=img)
        video_label.imgtk = img_tk
        video_label.configure(image=img_tk)
        translated_text_display.delete(1.0, tk.END)
        translated_text_display.insert(tk.END, live_text)
        translated_text_display.see(tk.END)
    window.after(10, update_frame)


# Function to translate text
# def translate_text():
# input_text = live_text.replace("_", " ")
# if input_text.strip():
#     selected_lang_code = selected_language.get().split("(")[-1].strip(")")
#     translation = translator.translate(input_text, dest=selected_lang_code)
#     translated_language_display.delete(1.0, tk.END)
#     translated_language_display.insert(tk.END, translation.text)


def translate_text():
    # Fetch text from the second text field
    input_text = translated_language_display.get(1.0, tk.END).strip()

    # Fallback to live_text if the second text field is empty
    if not input_text:
        input_text = live_text.replace("_", " ").strip()

    if input_text:
        # Get the selected language code from the dropdown
        selected_lang_code = selected_language.get().split("(")[-1].strip(")")
        try:
            # Perform translation
            translation = translator.translate(input_text, dest=selected_lang_code)

            # Display the translated text in the second text field
            translated_language_display.delete(1.0, tk.END)
            translated_language_display.insert(tk.END, translation.text)
            print(f"Translated to {selected_lang_code}: {translation.text}")
        except Exception as e:
            print(f"Error during translation: {e}")
    else:
        print("No text available for translation.")


# Function to speak text
# def speak_text():
#     if live_text.strip():
#         textToSpeech(live_text)
# Function to speak the predicted text (from the second text field)


def speak_text():
    translated_text = translated_language_display.get(
        "1.0", tk.END
    ).strip()  # Get text from the second text field
    if translated_text:
        textToSpeech(translated_text)


# Function to clear text
def clear_text():
    global live_text
    live_text = ""
    translated_text_display.delete(1.0, tk.END)
    translated_language_display.delete(1.0, tk.END)


# Function to quit the app
def quit_app():
    global cap
    if cap.isOpened():
        cap.release()
    cv.destroyAllWindows()
    window.quit()
    window.destroy()
    sys.exit()


# Add buttons
button_frame = ttkb.Frame(window)
button_frame.pack(pady=10)

translate_button = ttkb.Button(
    button_frame, text="Translate Text", command=translate_text, bootstyle=PRIMARY
)
translate_button.pack(side="left", padx=5)

capture_button = ttkb.Button(
    button_frame, text="Capture Text", command=capture_text, bootstyle=PRIMARY
)
capture_button.pack(side="left", padx=5)

speak_button = ttkb.Button(
    button_frame, text="Speak Text", command=speak_text, bootstyle=PRIMARY
)
speak_button.pack(side="left", padx=5)

clear_button = ttkb.Button(
    button_frame, text="Clear", command=clear_text, bootstyle="danger"
)
clear_button.pack(side="left", padx=5)

speech_to_text_button = ttkb.Button(
    button_frame,
    text="Start Speech-to-Text",
    command=speech_to_text_to_asl,
    bootstyle=PRIMARY,
)
speech_to_text_button.pack(side="left", padx=5)

text_to_asl_button = ttkb.Button(
    button_frame, text="Text to ASL", command=text_to_asl, bootstyle=PRIMARY
)
text_to_asl_button.pack(side="left", padx=5)

quit_button = ttkb.Button(
    button_frame, text="Quit", command=quit_app, bootstyle="danger"
)
quit_button.pack(side="left", padx=5)

# Open webcam stream
cap = cv.VideoCapture(0)
update_frame()

# Start Tkinter main loop
window.mainloop()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Say something...



    Error 263 for command:
        open stop_sound.mp3
    The specified device is not open or is not recognized by MCI.

    Error 263 for command:
        close stop_sound.mp3
    The specified device is not open or is not recognized by MCI.
Failed to close the file: stop_sound.mp3


Recognized Text: I'm going to put it in my notification


Exception in Tkinter callback
Traceback (most recent call last):
  File "c:\Users\mbele\miniconda3\envs\capstone\lib\tkinter\__init__.py", line 1921, in __call__
    return self.func(*args)
  File "C:\Users\mbele\AppData\Local\Temp\ipykernel_2192\993080594.py", line 255, in speech_to_text_to_asl
    playsound("stop_sound.mp3")
  File "c:\Users\mbele\miniconda3\envs\capstone\lib\site-packages\playsound.py", line 72, in _playsoundWin
    winCommand(u'open {}'.format(sound))
  File "c:\Users\mbele\miniconda3\envs\capstone\lib\site-packages\playsound.py", line 64, in winCommand
    raise PlaysoundException(exceptionMessage)
playsound.PlaysoundException: 
    Error 263 for command:
        open stop_sound.mp3
    The specified device is not open or is not recognized by MCI.


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
from playsound import playsound


def speech_to_text_to_asl():
    # Play start sound
    playsound("start_sound.mp3")  # Path to the start sound file
    print("Say something...")
    with sr.Microphone() as source:
        recognizer.adjust_for_ambient_noise(source)
        try:
            audio_data = recognizer.listen(source)
            text = recognizer.recognize_google(audio_data)
            print(f"Recognized Text: {text}")

            # Insert the recognized text into the second text field
            current_text = translated_language_display.get(1.0, tk.END).strip()
            updated_text = f"{current_text} {text}" if current_text else text
            translated_language_display.delete(1.0, tk.END)
            translated_language_display.insert(tk.END, updated_text)
        except sr.UnknownValueError:
            print("Speech was unclear. Please try again.")
        except sr.RequestError:
            print("Could not request results; check your internet connection.")
        finally:
            # Play stop sound
            playsound("stop_sound.mp3")  # Path to the stop sound file

button

In [9]:
import cv2 as cv
import tkinter as tk
from tkinter import ttk
from tkinter.scrolledtext import ScrolledText
import ttkbootstrap as ttkb  # Modern UI styling
from ttkbootstrap.constants import PRIMARY
from ultralytics import YOLO
from PIL import Image, ImageTk
import pyttsx3
from googletrans import Translator, LANGUAGES
import speech_recognition as sr
import sys
from spell_checker import SpellCheck  # custom imported class
import threading


# Initialize Text-to-Speech
engine = pyttsx3.init()
engine.setProperty(
    "rate", int(engine.getProperty("rate") * 0.75)
)  # Set slower speech rate
engine.setProperty(
    "voice", engine.getProperty("voices")[1].id
)  # Default to female voice

# Translator instance
translator = Translator()


# Function for Text-to-Speech
def textToSpeech(text="Hello"):
    spoken_text = "".join(text).replace("_", " ").strip()
    print("Speaking: ", spoken_text)
    engine.say(spoken_text)
    engine.runAndWait()


# Load the YOLO model
model = YOLO("small2diff2better.pt")
DEBUG = True
confidence_requirement = 0.90

captured_text = []
captured_confidence = []
count = 0
noise_count = 0
last = None
live_text = ""

letters = [
    "A",
    "B",
    "C",
    "D",
    "E",
    "F",
    "G",
    "H",
    "I",
    "J",
    "K",
    "L",
    "M",
    "N",
    "O",
    "P",
    "Q",
    "R",
    "S",
    "T",
    "U",
    "V",
    "W",
    "X",
    "Y",
    "Z",
    "DEL",
    "NOTHING",
    "_",
]


# # ASL image paths and loading them
asl_image_paths = {
    "A": r"ASL TTS Dataset\A.png",
    "B": r"ASL TTS Dataset\B.png",
    "C": r"ASL TTS Dataset\C.png",
    "D": r"ASL TTS Dataset\D.png",
    "E": r"ASL TTS Dataset\E.png",
    "F": r"ASL TTS Dataset\F.png",
    "G": r"ASL TTS Dataset\G.png",
    "H": r"ASL TTS Dataset\H.png",
    "I": r"ASL TTS Dataset\I.png",
    "J": r"ASL TTS Dataset\J.png",
    "K": r"ASL TTS Dataset\K.png",
    "L": r"ASL TTS Dataset\L.png",
    "M": r"ASL TTS Dataset\M.png",
    "N": r"ASL TTS Dataset\N.png",
    "O": r"ASL TTS Dataset\O.png",
    "P": r"ASL TTS Dataset\P.png",
    "Q": r"ASL TTS Dataset\Q.png",
    "R": r"ASL TTS Dataset\R.png",
    "S": r"ASL TTS Dataset\S.png",
    "T": r"ASL TTS Dataset\T.png",
    "U": r"ASL TTS Dataset\U.png",
    "V": r"ASL TTS Dataset\V.png",
    "W": r"ASL TTS Dataset\W.png",
    "X": r"ASL TTS Dataset\X.png",
    "Y": r"ASL TTS Dataset\Y.png",
    "Z": r"ASL TTS Dataset\Z.png",
    "_": r"ASL TTS Dataset\SPACE.png",
}


# Function to handle long-running tasks in a separate thread
def run_in_thread(func):
    def wrapper(*args, **kwargs):
        thread = threading.Thread(target=func, args=args, kwargs=kwargs)
        thread.daemon = True  # Ensure the thread exits when the program ends
        thread.start()

    return wrapper


# Pre-load and resize all ASL images to 500x500
asl_images = {}
for letter, path in asl_image_paths.items():
    img = cv.imread(path)
    if img is not None:
        resized_img = cv.resize(img, (500, 500))
        asl_images[letter] = resized_img
    else:
        print(f"The image for '{letter}' is not found at: {path}")


# Function to display ASL gestures for each letter in the input text
def display_asl_gesture(text):
    for char in text:
        char = "_" if char == " " else char.upper()
        img = asl_images.get(char)
        if img is not None:
            cv.imshow(f"ASL Gesture for {char}", img)
            cv.waitKey(1000)  # Show each image for 1 second
            cv.destroyWindow(f"ASL Gesture for {char}")
        else:
            print(f"ASL gesture for '{char}' is not available.")


spell_check = SpellCheck()

# Initialize the Tkinter window using ttkbootstrap
window = ttkb.Window(themename="darkly")  # Modern theme
window.title("Sign Language Detection App")
window.geometry("800x900")

# Create a Label to display video feed
video_frame = ttkb.Frame(window, padding=10)
video_frame.pack(pady=10)

video_label = ttkb.Label(video_frame)
video_label.pack()

# ScrolledText for displaying detected text
translated_text_display = ScrolledText(
    window, font=("Arial", 18), height=3, wrap=tk.WORD, bg="#282828", fg="#f1f1f1"
)
translated_text_display.pack(padx=10, pady=10, fill="x")

# Additional ScrolledText for translated text
translated_language_display = ScrolledText(
    window, font=("Arial", 18), height=2, wrap=tk.WORD, bg="#282828", fg="#f1f1f1"
)
translated_language_display.pack(padx=10, pady=10, fill="x")

# Dropdown for language selection
language_frame = ttkb.Frame(window, padding=10)
language_frame.pack(pady=10)

selected_language = tk.StringVar(value="en")  # Default to Spanish
language_label = ttkb.Label(language_frame, text="Select Language:", font=("Arial", 12))
language_label.pack(side="left", padx=5)

language_dropdown = ttk.Combobox(
    language_frame,
    values=[f"{lang} ({code})" for code, lang in LANGUAGES.items()],
    textvariable=selected_language,
    state="readonly",
    width=30,
)
language_dropdown.pack(side="left", padx=5)
language_dropdown.set("English (en)")  # Default to Spanish

# Speech Recognizer initialization
recognizer = sr.Recognizer()


# Function for Speech to Text to ASL
@run_in_thread
def speech_to_text_to_asl():
    # Play start sound
    playsound("./start_sound.mp3")  # Path to the start sound file
    print("Say something...")
    with sr.Microphone() as source:
        recognizer.adjust_for_ambient_noise(source)
        try:
            audio_data = recognizer.listen(source)
            text = recognizer.recognize_google(audio_data)
            print(f"Recognized Text: {text}")

            # Insert the recognized text into the second text field
            current_text = translated_language_display.get(1.0, tk.END).strip()
            updated_text = f"{current_text} {text}" if current_text else text
            translated_language_display.delete(1.0, tk.END)
            translated_language_display.insert(tk.END, updated_text)
        except sr.UnknownValueError:
            print("Speech was unclear. Please try again.")
        except sr.RequestError:
            print("Could not request results; check your internet connection.")
        finally:
            # Play stop sound
            playsound("stop_sound.mp3")


# Function for Text to ASL (direct text input from the second text field)
@run_in_thread
def text_to_asl():
    # Get the content of the second text field
    text = translated_language_display.get(1.0, tk.END).strip()

    if text:
        print(f"Text to ASL: {text}")
        display_asl_gesture(text)
    else:
        print("No text available in the second text field for ASL conversion.")


def capture_text():
    global captured_text, live_text
    if live_text.strip():
        print(f"Live text before correction: '{live_text}'")
        try:
            captured_text = spell_check.spell_check(live_text.replace("_", " ").strip())
            print(f"Corrected text: '{captured_text}'")

            # Update the second text area with the corrected text
            current_text = translated_language_display.get(1.0, tk.END).strip()
            updated_text = (
                f"{current_text} {captured_text.lower()}"
                if current_text
                else captured_text.lower()
            )
            translated_language_display.delete(1.0, tk.END)
            translated_language_display.insert(tk.END, updated_text)

            # Clear the live text and the first text area
            live_text = ""
            translated_text_display.delete(1.0, tk.END)
        except Exception as e:
            print(f"Error during spell checking: {e}")


# Function to update the video frame
def update_frame():
    global last, count, noise_count, live_text
    ret, frame = cap.read()
    if ret:
        results = model(frame, verbose=False)
        top_class = results[0].probs.top1
        top_confidence = results[0].probs.top1conf
        if top_confidence >= confidence_requirement:
            if top_class == last:
                count += 1
            else:
                count = 1
            if count == 3:
                if letters[top_class] not in ["DEL", "NOTHING"]:
                    live_text += letters[top_class]
                elif letters[top_class] == "DEL":
                    live_text = live_text[:-1]
            last = top_class
            noise_count = 0
        else:
            noise_count += 1
        if noise_count == 3:
            count = 0
            noise_count = 0
        img = cv.cvtColor(frame, cv.COLOR_BGR2RGB)
        img = Image.fromarray(img)
        img_tk = ImageTk.PhotoImage(image=img)
        video_label.imgtk = img_tk
        video_label.configure(image=img_tk)
        translated_text_display.delete(1.0, tk.END)
        translated_text_display.insert(tk.END, live_text)
        translated_text_display.see(tk.END)
    window.after(10, update_frame)


# def translate_text():
#     # Fetch text from the second text field
#     input_text = translated_language_display.get(1.0, tk.END).strip()

#     # Fallback to live_text if the second text field is empty
#     if not input_text:
#         input_text = live_text.replace("_", " ").strip()

#     if input_text:
#         # Get the selected language code from the dropdown
#         selected_lang_code = selected_language.get().split("(")[-1].strip(")")
#         try:
#             # Perform translation
#             translation = translator.translate(input_text, dest=selected_lang_code)

#             # Display the translated text in the second text field
#             translated_language_display.delete(1.0, tk.END)
#             translated_language_display.insert(tk.END, translation.text)
#             print(f"Translated to {selected_lang_code}: {translation.text}")
#         except Exception as e:
#             print(f"Error during translation: {e}")
#     else:
#         print("No text available for translation.")


# Updated functions
@run_in_thread
def translate_text():
    input_text = translated_language_display.get(1.0, tk.END).strip()
    if not input_text:
        input_text = live_text.replace("_", " ").strip()
    if input_text:
        selected_lang_code = selected_language.get().split("(")[-1].strip(")")
        try:
            translation = translator.translate(input_text, dest=selected_lang_code)
            window.after(0, lambda: update_translated_text(translation.text))
        except Exception as e:
            print(f"Error during translation: {e}")
    else:
        print("No text available for translation.")


# Update the translated text in a thread-safe way
def update_translated_text(text):
    translated_language_display.delete(1.0, tk.END)
    translated_language_display.insert(tk.END, text)


@run_in_thread
def speak_text():
    translated_text = translated_language_display.get("1.0", tk.END).strip()
    if translated_text:
        engine.say(translated_text)
        engine.runAndWait()


# def speak_text():
#     translated_text = translated_language_display.get(
#         "1.0", tk.END
#     ).strip()  # Get text from the second text field
#     if translated_text:
#         textToSpeech(translated_text)


# Function to clear text
def clear_text():
    global live_text
    live_text = ""
    translated_text_display.delete(1.0, tk.END)
    translated_language_display.delete(1.0, tk.END)


# Function to quit the app
def quit_app():
    global cap
    if cap.isOpened():
        cap.release()
    cv.destroyAllWindows()
    window.quit()
    window.destroy()
    sys.exit()


# Add buttons
button_frame = ttkb.Frame(window)
button_frame.pack(pady=10)

translate_button = ttkb.Button(
    button_frame, text="Translate Text", command=translate_text, bootstyle=PRIMARY
)
translate_button.pack(side="left", padx=5)

capture_button = ttkb.Button(
    button_frame, text="Capture Text", command=capture_text, bootstyle=PRIMARY
)
capture_button.pack(side="left", padx=5)

speak_button = ttkb.Button(
    button_frame, text="Speak Text", command=speak_text, bootstyle=PRIMARY
)
speak_button.pack(side="left", padx=5)

clear_button = ttkb.Button(
    button_frame, text="Clear", command=clear_text, bootstyle="danger"
)
clear_button.pack(side="left", padx=5)

speech_to_text_button = ttkb.Button(
    button_frame,
    text="Start Speech-to-Text",
    command=speech_to_text_to_asl,
    bootstyle=PRIMARY,
)
speech_to_text_button.pack(side="left", padx=5)

text_to_asl_button = ttkb.Button(
    button_frame, text="Text to ASL", command=text_to_asl, bootstyle=PRIMARY
)
text_to_asl_button.pack(side="left", padx=5)

quit_button = ttkb.Button(
    button_frame, text="Quit", command=quit_app, bootstyle="danger"
)
quit_button.pack(side="left", padx=5)

# Open webcam stream
cap = cv.VideoCapture(0)
update_frame()

# Start Tkinter main loop
window.mainloop()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Live text before correction: '__CO_'
Corrected text: 'C'
Live text before correction: '_C'
Corrected text: 'C'
Live text before correction: 'COW'
Corrected text: 'COW'
Say something...
Say something...



    Error 275 for command:
        open stop_sound.mp3
    Cannot find the specified file.  Make sure the path and filename are correct.

    Error 263 for command:
        close stop_sound.mp3
    The specified device is not open or is not recognized by MCI.
Failed to close the file: stop_sound.mp3
Exception in thread Thread-63 (speech_to_text_to_asl):
Traceback (most recent call last):
  File "c:\Users\mbele\miniconda3\envs\capstone\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "c:\Users\mbele\miniconda3\envs\capstone\lib\site-packages\ipykernel\ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "c:\Users\mbele\miniconda3\envs\capstone\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\mbele\AppData\Local\Temp\ipykernel_2192\809132211.py", line 221, in speech_to_text_to_asl
  File "c:\Users\mbele\miniconda3\envs\capstone\lib\site-packages\playsound.py", line 72, in _playsoundWin
    

Recognized Text: play



    Error 263 for command:
        close ./start_sound.mp3
    The specified device is not open or is not recognized by MCI.
Failed to close the file: ./start_sound.mp3


Say something...



    Error 259 for command:
        play ./start_sound.mp3 wait
    The driver cannot recognize the specified command parameter.
Exception in thread Thread-67 (speech_to_text_to_asl):
Traceback (most recent call last):
  File "c:\Users\mbele\miniconda3\envs\capstone\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "c:\Users\mbele\miniconda3\envs\capstone\lib\site-packages\ipykernel\ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "c:\Users\mbele\miniconda3\envs\capstone\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\mbele\AppData\Local\Temp\ipykernel_2192\809132211.py", line 201, in speech_to_text_to_asl
  File "c:\Users\mbele\miniconda3\envs\capstone\lib\site-packages\playsound.py", line 73, in _playsoundWin
    winCommand(u'play {}{}'.format(sound, ' wait' if block else ''))
  File "c:\Users\mbele\miniconda3\envs\capstone\lib\site-packages\playsound.py", line 64, in winCommand
    

Speech was unclear. Please try again.



    Error 275 for command:
        open stop_sound.mp3
    Cannot find the specified file.  Make sure the path and filename are correct.

    Error 263 for command:
        close stop_sound.mp3
    The specified device is not open or is not recognized by MCI.
Failed to close the file: stop_sound.mp3
Exception in thread Thread-66 (speech_to_text_to_asl):
Traceback (most recent call last):
  File "c:\Users\mbele\miniconda3\envs\capstone\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "c:\Users\mbele\miniconda3\envs\capstone\lib\site-packages\ipykernel\ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "c:\Users\mbele\miniconda3\envs\capstone\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\mbele\AppData\Local\Temp\ipykernel_2192\809132211.py", line 221, in speech_to_text_to_asl
  File "c:\Users\mbele\miniconda3\envs\capstone\lib\site-packages\playsound.py", line 72, in _playsoundWin
    

Recognized Text: I did a great
Say something...



    Error 275 for command:
        open stop_sound.mp3
    Cannot find the specified file.  Make sure the path and filename are correct.

    Error 263 for command:
        close stop_sound.mp3
    The specified device is not open or is not recognized by MCI.
Failed to close the file: stop_sound.mp3
Exception in thread Thread-70 (speech_to_text_to_asl):
Traceback (most recent call last):
  File "c:\Users\mbele\miniconda3\envs\capstone\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "c:\Users\mbele\miniconda3\envs\capstone\lib\site-packages\ipykernel\ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "c:\Users\mbele\miniconda3\envs\capstone\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\mbele\AppData\Local\Temp\ipykernel_2192\809132211.py", line 221, in speech_to_text_to_asl
  File "c:\Users\mbele\miniconda3\envs\capstone\lib\site-packages\playsound.py", line 72, in _playsoundWin
    

Recognized Text: OK Google


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [5]:
import threading


# Function to handle long-running tasks in a separate thread
def run_in_thread(func):
    def wrapper(*args, **kwargs):
        thread = threading.Thread(target=func, args=args, kwargs=kwargs)
        thread.daemon = True  # Ensure the thread exits when the program ends
        thread.start()

    return wrapper


# Updated functions
@run_in_thread
def translate_text():
    input_text = translated_language_display.get(1.0, tk.END).strip()
    if not input_text:
        input_text = live_text.replace("_", " ").strip()
    if input_text:
        selected_lang_code = selected_language.get().split("(")[-1].strip(")")
        try:
            translation = translator.translate(input_text, dest=selected_lang_code)
            window.after(0, lambda: update_translated_text(translation.text))
        except Exception as e:
            print(f"Error during translation: {e}")
    else:
        print("No text available for translation.")


# Update the translated text in a thread-safe way
def update_translated_text(text):
    translated_language_display.delete(1.0, tk.END)
    translated_language_display.insert(tk.END, text)


@run_in_thread
def speak_text():
    translated_text = translated_language_display.get("1.0", tk.END).strip()
    if translated_text:
        engine.say(translated_text)
        engine.runAndWait()