In [12]:
import cv2
import mediapipe as mp
import numpy as np
from sklearn.preprocessing import LabelEncoder
import keras
import threading
from gtts import gTTS
import playsound
import os

In [13]:
# Restore the trained classifier from the path "my_model".
model = keras.models.load_model("my_model")

# Rebuild the label encoder from the saved label array so that predicted class
# indices can be mapped back to class names through `le.classes_`.
le = LabelEncoder()
# np.load on an .npz archive returns a lazily-read NpzFile that keeps the file
# open; the context manager closes it once the array has been read out.
with np.load('landmarks/y.npz') as label_archive:
    y = label_archive['arr_0']
y = le.fit_transform(y)

In [14]:
def reference_from_wrist(results, hand):
    """Express the first detected hand's landmarks relative to its first landmark.

    Args:
        results: Object exposing ``multi_hand_landmarks``; only element ``[0]``
            is read, via its ``landmark`` sequence of points with ``x``, ``y``
            and ``z`` attributes.
        hand: Handedness label. ``'Right'`` keeps the x offset as
            ``point.x - origin.x``; any other value mirrors it to
            ``origin.x - point.x``.

    Returns:
        A list with one ``[dx, dy, z]`` entry per landmark. The first entry is
        always ``[0, 0, 0]``; ``z`` is copied through without re-centering.
    """
    keep_x_direction = hand == 'Right'
    origin_x = origin_y = 0
    relative = []

    for index, point in enumerate(results.multi_hand_landmarks[0].landmark):
        if index == 0:
            # The first landmark becomes the origin for every later point.
            origin_x, origin_y = point.x, point.y
            relative.append([0, 0, 0])
            continue

        if keep_x_direction:
            dx = point.x - origin_x
        else:
            dx = origin_x - point.x
        relative.append([dx, point.y - origin_y, point.z])

    return relative

def handsign_prediction(landmarks, show=True):
    """Classify one set of hand landmarks.

    Runs the module-level ``model`` on a batch containing only ``landmarks``
    and maps the highest-scoring output index to its class name through the
    module-level ``le`` encoder.

    Args:
        landmarks: Landmark coordinates for a single hand.
        show: Accepted for interface compatibility; not used by the function.

    Returns:
        A ``(label, probability)`` tuple for the top-scoring class.
    """
    scores = model.predict([landmarks], batch_size=1)[0]
    best = np.argmax(scores)
    label = le.classes_[best]
    return label, scores[best]

def text_to_speech(*text):
    """Speak the given text aloud through gTTS and playsound.

    Args:
        *text: String fragments; they are concatenated without a separator
            before being synthesized.

    Returns:
        None. Nothing is synthesized or played when the joined text is empty
        or whitespace-only.
    """
    text = ''.join(text)
    if not text.strip():
        # gTTS rejects empty input, so there is nothing to say; bail out
        # instead of letting the caller's thread die with an exception.
        return

    # One file per calling thread: this function is run from worker threads,
    # and a shared fixed filename lets overlapping calls overwrite or delete
    # each other's audio while it is still being played.
    filename = 'text_to_speech_{}.mp3'.format(threading.get_ident())
    try:
        text_speech = gTTS(text=text, lang='en')
        text_speech.save(filename)
        playsound.playsound(filename)
    finally:
        # Clean up even when synthesis or playback fails part-way through.
        if os.path.exists(filename):
            os.remove(filename)
    return

def draw_censor(results, image, padding=50):
    """Black out the first detected hand with a filled rectangle.

    Args:
        results: Object exposing ``multi_hand_landmarks``; only element ``[0]``
            is used, via its ``landmark`` sequence of points whose ``x``/``y``
            are scaled by the image width/height to get pixel positions.
        image: Image array; its first two ``shape`` entries are read as
            (height, width). The rectangle is drawn onto this array.
        padding: Pixels added to the left, top and right of the landmarks'
            bounding box. The bottom edge is left unpadded.

    Returns:
        The image returned by ``cv2.rectangle``, or ``image`` untouched when
        there is no hand (or no landmark) to cover.
    """
    hands_found = results.multi_hand_landmarks
    if not hands_found:
        # Without this guard the box would be drawn from all-zero bounds,
        # painting a black patch in the top-left corner of the frame.
        return image

    points = hands_found[0].landmark
    if len(points) == 0:
        return image

    image_height, image_width = image.shape[:2]
    xs = [point.x * image_width for point in points]
    ys = [point.y * image_height for point in points]

    # int() truncation is monotonic, so truncating the extreme equals the
    # extreme of the truncated values.
    left, right = int(min(xs)), int(max(xs))
    top, bottom = int(min(ys)), int(max(ys))

    image = cv2.rectangle(
        image,
        (left - padding, top - padding),
        (right + padding, bottom),
        (0, 0, 0),
        -1)  # negative thickness => filled rectangle
    return image

In [16]:
# Spelling state carried from one video frame to the next.
word = ''              # signs accepted so far; spoken and cleared on 'space'
current_word = ''      # sign currently being held and counted
STORE_FRAME = 8        # `frame` value at which the held sign is accepted
MIN_PROBABILITY = 0.6  # predictions at or below this score are ignored
frame = 0              # repeat counter for `current_word`
pred = ''
prob = 0

mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands

# Font used for every on-screen label.
font = cv2.FONT_HERSHEY_SIMPLEX

# For webcam input:
cap = cv2.VideoCapture(0)
try:
    with mp_hands.Hands(
        max_num_hands=2,
        min_detection_confidence=0.8,
        min_tracking_confidence=0.8) as hands:
        while cap.isOpened():
            success, image = cap.read()
            if not success:
                # If loading a video, use 'break' instead of 'continue'.
                continue
            # Flip the image horizontally for a later selfie-view display, and
            # convert the BGR image to RGB.
            image = cv2.cvtColor(cv2.flip(image, 1), cv2.COLOR_BGR2RGB)
            # To improve performance, optionally mark the image as not
            # writeable to pass by reference.
            image.flags.writeable = False
            results = hands.process(image)

            # Back to a writeable BGR frame for drawing and display.
            image.flags.writeable = True
            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
            # NOTE: annotated_image receives the landmark overlay but is not
            # the frame passed to imshow below.
            annotated_image = image.copy()
            censor_image = image.copy()
            if results.multi_hand_landmarks:
                # Only the first detected hand is classified.
                hand = results.multi_handedness[0].classification[0].label
                new_landmarks = reference_from_wrist(results, hand)

                for hand_landmarks in results.multi_hand_landmarks:
                    mp_drawing.draw_landmarks(
                        annotated_image, hand_landmarks, mp_hands.HAND_CONNECTIONS)
                pred, prob = handsign_prediction(new_landmarks, show=False)
            else:
                pred = ''
                prob = 0

            prediction = 'Prediction: ' + str(pred)
            probability = 'Probability: ' + str(prob)
            if prob > MIN_PROBABILITY:
                cv2.putText(image, probability, (50, 80),
                            font, 0.75, (0, 255, 0), 2, cv2.LINE_4)
                if pred == 'middle_finger':
                    # Replace the frame with the clean copy plus a black box;
                    # the probability label drawn above is not carried over.
                    image = draw_censor(results, censor_image)
                else:
                    cv2.putText(image, prediction, (50, 50),
                                font, 0.75, (0, 255, 0), 2, cv2.LINE_4)
                    # Count how long the same sign is held; a different sign
                    # restarts the count.
                    if current_word != pred:
                        frame = 0
                        current_word = pred
                    else:
                        frame += 1

                    if frame == STORE_FRAME:
                        if current_word == 'space':
                            # 'space' ends the word: speak it on a worker
                            # thread so the video loop keeps running. Skip the
                            # thread when nothing was spelled (e.g. 'space'
                            # held continuously), and pass the word as a
                            # one-element tuple rather than a bare string.
                            if word:
                                speech = threading.Thread(
                                    target=text_to_speech, name="speech", args=(word,))
                                speech.start()
                            word = ''
                            current_word = ''
                            frame = 0
                        else:
                            word += current_word
                            # Clearing the held sign forces a fresh count
                            # before the same sign can be accepted again.
                            current_word = ''

            cv2.putText(image, word, (50, 110),
                        font, 0.75, (0, 255, 0), 2, cv2.LINE_4)
            cv2.imshow('MediaPipe Hands', image)

            # Press 'q' in the preview window to stop.
            if cv2.waitKey(5) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, including when the loop
    # raises or the kernel is interrupted.
    cap.release()
    cv2.destroyAllWindows()