## Importing Dependencies:

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tensorflow as tf
import cv2
import mediapipe as mp
import time
import pyttsx3

## Importing Dataset:

In [3]:
train = pd.read_csv("train.csv")


In [4]:
pq_file = "train_landmark_files/16069/10042041.parquet"


In [5]:

xyz = pd.read_parquet(pq_file)

In [6]:
ROWS_PER_FRAME = 543  # number of landmarks per frame

def load_relevant_data_subset(pq_path):
    data_columns = ['x', 'y', 'z']
    data = pd.read_parquet(pq_path, columns=data_columns)
    n_frames = int(len(data) / ROWS_PER_FRAME)
    data = data.values.reshape(n_frames, ROWS_PER_FRAME, len(data_columns))
    return data.astype(np.float32)

## Converting Mediapipe to Dataframe:

In [7]:
def create_frame_landmark_df(results, frame):
    xyz_skel = xyz[['type','landmark_index']].drop_duplicates().reset_index(drop=True).copy()

    face = pd.DataFrame()
    pose = pd.DataFrame()
    left_hand = pd.DataFrame()
    right_hand = pd.DataFrame()

    if results.face_landmarks:
        for i, point in enumerate(results.face_landmarks.landmark):
            face.loc[i, ['x', 'y', 'z']] = [point.x, point.y, point.z]	
            #face['y'] = - face['y']
    if results.pose_landmarks:
        for i, point in enumerate(results.pose_landmarks.landmark):
            pose.loc[i, ['x', 'y', 'z']] = [point.x, point.y, point.z]

    if results.left_hand_landmarks:
        for i, point in enumerate(results.left_hand_landmarks.landmark):
            left_hand.loc[i, ['x', 'y', 'z']] = [point.x, point.y, point.z]


    if results.right_hand_landmarks:
        for i, point in enumerate(results.right_hand_landmarks.landmark):
            right_hand.loc[i, ['x', 'y', 'z']] = [point.x, point.y, point.z]

    face = face.reset_index().rename(columns = {'index':'landmark_index'}).assign(type = 'face')
    pose = pose.reset_index().rename(columns = {'index':'landmark_index'}).assign(type = 'pose')
    left_hand = left_hand.reset_index().rename(columns = {'index':'landmark_index'}).assign(type = 'left_hand')
    right_hand = right_hand.reset_index().rename(columns = {'index':'landmark_index'}).assign(type = 'right_hand')

    landmarks = pd.concat([face, pose, left_hand, right_hand]).reset_index(drop = True)
    landmarks = xyz_skel.merge(landmarks, on = ['type','landmark_index'], how = 'left')
    landmarks = landmarks.assign(frame = frame)
    return landmarks

## Loading Tensorflow Model:

In [8]:
interpreter = tf.lite.Interpreter("model.tflite")
found_signatures = list(interpreter.get_signature_list().keys())
prediction_fn = interpreter.get_signature_runner("serving_default")

## Testing using a dataset parquet file:

In [9]:
def get_prediction(prediction_fn, pq_file):
    xyz_np = load_relevant_data_subset(pq_file)
    prediction = prediction_fn(inputs = xyz_np)
    pred = prediction['outputs'].argmax()

    # Add ordinally Encoded Sign (assign number to each sign name)
    train['sign_ord'] = train['sign'].astype('category').cat.codes

    # Dictionaries to translate sign <-> ordinal encoded sign
    SIGN2ORD = train[['sign', 'sign_ord']].set_index('sign').squeeze().to_dict()
    ORD2SIGN = train[['sign_ord', 'sign']].set_index('sign_ord').squeeze().to_dict()
    sign = ORD2SIGN[pred]
    pred_conf = prediction['outputs'][pred]
    print(f'PREDICTED SIGN: {sign} [{pred}], CONFIDENCE:{ pred_conf : 0.4}')
    return sign
get_prediction(prediction_fn, pq_file) 

PREDICTED SIGN: green [102], CONFIDENCE: 0.8383


'green'

## Testing using Live-Feed and Text to Speech:

In [11]:
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
mp_holistic = mp.solutions.holistic

all_landmarks = []
predicted_sign = ""
text_position = (50, 50)
sign_value=''

# For webcam input:
cap = cv2.VideoCapture(0,cv2.CAP_DSHOW)
engine = pyttsx3.init()

with mp_holistic.Holistic(
  min_detection_confidence=0.5,
  min_tracking_confidence=0.5) as holistic:
  frame = 0
  start_time = time.time()
  while cap.isOpened():
    frame += 1
    success, image = cap.read()
    if not success:
      print("Ignoring empty camera frame.")
      # If loading a video, use 'break' instead of 'continue'.
      continue
    
    # To improve performance, optionally mark the image as not writeable to
    # pass by reference.
    image.flags.writeable = False
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = holistic.process(image)
    #Create landmark dataframe:
    landmarks = create_frame_landmark_df(results,frame)
    all_landmarks.append(landmarks)

    # Draw landmark annotation on the image.
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    mp_drawing.draw_landmarks(
        image,
        results.face_landmarks,
        mp_holistic.FACEMESH_CONTOURS,
        landmark_drawing_spec=None,
        connection_drawing_spec=mp_drawing_styles
        .get_default_face_mesh_contours_style())
    mp_drawing.draw_landmarks(
        image,
        results.pose_landmarks,
        mp_holistic.POSE_CONNECTIONS,
        landmark_drawing_spec=mp_drawing_styles
        .get_default_pose_landmarks_style())
   
    if time.time() - start_time >= 3:
      if len(all_landmarks)> 0:
        landmarks_df = pd.concat(all_landmarks).reset_index(drop=True)
        landmarks_df.to_parquet("output.parquet")
        pq_file = "output.parquet"
        sign_value = get_prediction(prediction_fn, pq_file)
                     
        # Update predicted_sign only when a new prediction is available
        if predicted_sign != sign_value:
                predicted_sign = sign_value
                engine.say(predicted_sign)
                engine.runAndWait()
        
        start_time = time.time()
        all_landmarks = []

    # Get the size of the text box
    text_size = cv2.getTextSize(sign_value, cv2.FONT_HERSHEY_SIMPLEX, 1, 2)[0]

    # Set the position for the rectangular box
    box_position = (text_position[0] - 10, text_position[1] - text_size[1] - 10)

    # Draw the white rectangular box
    image = cv2.rectangle(image, box_position, (text_position[0] + text_size[0] + 10, text_position[1] + 10), (255, 255, 255), -1)

    # Draw the text
    image = cv2.putText(image, predicted_sign, text_position, cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2, cv2.LINE_AA)
    
    # Display the image:
    cv2.imshow('MediaPipe Holistic', image)
    if cv2.waitKey(1) & 0xFF == ord('q'):
      break
cap.release()
cv2.destroyAllWindows()


PREDICTED SIGN: TV [0], CONFIDENCE: nan
PREDICTED SIGN: TV [0], CONFIDENCE: nan
PREDICTED SIGN: elephant [69], CONFIDENCE: 0.02939
PREDICTED SIGN: cat [38], CONFIDENCE: 0.04214


In [None]:
cap.release()
cv2.destroyAllWindows()