# In [1]:
# import necessary packages
import cv2
import numpy as np
import mediapipe as mp
import tensorflow as tf
from tensorflow.keras.models import load_model
import pyttsx3
import os
import threading

# Initialize MediaPipe hand tracking: at most one hand per frame, and a
# detection is only accepted at confidence >= 0.8.
mpHands = mp.solutions.hands
hands = mpHands.Hands(max_num_hands=1, min_detection_confidence=0.8)
mpDraw = mp.solutions.drawing_utils

# Load the gesture recognizer model from the 'mp_hand_gesture' path
model = load_model('mp_hand_gesture')

# Load class names, one label per line. The list is indexed by the class id
# the model predicts. The context manager guarantees the file is closed even
# if reading fails.
with open('gesture.names', 'r') as names_file:
    classNames = names_file.read().split('\n')
print(classNames)

# Initialize the webcam (device index 0)
cap = cv2.VideoCapture(0)

# Initialize the text-to-speech engine
engine = pyttsx3.init()

# Set voice properties (optional): pick the first available voice, if the
# platform reports any at all; otherwise keep the engine's default.
voices = engine.getProperty('voices')
if voices:
    engine.setProperty('voice', voices[0].id)

# Flag that is True while the TTS engine is speaking
tts_running = False

# Guards the shared TTS engine so that the "is it busy?" check and the claim
# happen atomically, even when several threads call text_to_speech at once.
_tts_lock = threading.Lock()


def text_to_speech(text):
    """Speak *text* through the shared TTS engine unless it is already busy.

    The call returns immediately, without speaking, when *text* is empty or
    whitespace-only, or when another call is currently speaking. Otherwise it
    blocks until the utterance has finished.

    The module-level ``tts_running`` flag is kept in sync for any code that
    inspects it: it is True only while an utterance is in progress.

    Args:
        text: The phrase to speak.
    """
    global tts_running

    # Nothing to say: do not occupy the engine with an empty utterance.
    if text is None or not str(text).strip():
        return

    # Non-blocking acquire: if another thread is speaking, drop this request
    # instead of queueing it.
    if not _tts_lock.acquire(blocking=False):
        return

    try:
        tts_running = True
        engine.say(text)
        engine.runAndWait()
    finally:
        # Always reset the state, even if the engine raised; otherwise every
        # later request would be rejected as "busy" forever.
        tts_running = False
        _tts_lock.release()

# Main capture loop: grab a frame, detect hand landmarks, classify the
# gesture, overlay the label, speak it, and display the frame until 'q' is
# pressed or the camera stops delivering frames.
try:
    while True:
        # Read each frame from the webcam
        ok, frame = cap.read()
        if not ok or frame is None:
            # No frame available (camera missing, unplugged or stream ended)
            break

        # frame.shape is (rows, cols, channels), so x holds the frame height
        # and y holds the frame width.
        x, y, _ = frame.shape

        # Flip the frame horizontally (mirror view)
        frame = cv2.flip(frame, 1)
        framergb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Get hand landmark prediction
        result = hands.process(framergb)

        className = ''

        # Post-process the result
        if result.multi_hand_landmarks:
            for handslms in result.multi_hand_landmarks:
                # Fresh list per hand so landmarks never accumulate across
                # hands if more than one is ever reported.
                # NOTE(review): lm.x is scaled by the frame height and lm.y by
                # the frame width. Kept as-is because the model presumably
                # expects exactly this preprocessing; confirm against the
                # training pipeline before changing it.
                landmarks = [
                    [int(lm.x * x), int(lm.y * y)]
                    for lm in handslms.landmark
                ]

                # Drawing landmarks on frames
                mpDraw.draw_landmarks(frame, handslms, mpHands.HAND_CONNECTIONS)

                # Predict gesture: explicit batch-of-one array; verbose=0
                # suppresses the per-call progress output.
                prediction = model.predict(np.asarray([landmarks]), verbose=0)
                classID = np.argmax(prediction)
                className = classNames[classID]

        # Show the prediction on the frame, centered near the bottom edge
        text = className
        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 1
        font_thickness = 2
        text_size = cv2.getTextSize(text, font, font_scale, font_thickness)[0]
        text_x = int((frame.shape[1] - text_size[0]) / 2)
        text_y = int(frame.shape[0] - (text_size[1] / 2)) - 10
        cv2.rectangle(frame, (text_x - 5, text_y - text_size[1]), (text_x + text_size[0] + 5, text_y + 5), (0,0,0), -1)
        cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255,255,255), font_thickness)

        # Speak the label on a worker thread so the video loop keeps running.
        # Skip thread creation when there is nothing to say or the engine is
        # already busy; the request would be dropped anyway.
        if text and not tts_running:
            t = threading.Thread(target=text_to_speech, args=(text,))
            t.start()

        # Show the final output
        cv2.imshow("Hand Sign Recoginition", frame)

        # Mask to the low byte so the comparison works on platforms where
        # waitKey returns extra high bits.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the webcam and destroy all active windows, even on error
    cap.release()
    cv2.destroyAllWindows()


['okay', 'peace', 'Yes', 'No', 'call me', 'stop', 'I Like You', 'Hi', 'fist', 'smile', '']


