## Importing Libraries

In [3]:
import cv2
import json
import time
import numpy as np
import mediapipe as mp
import tensorflow as tf
from collections import deque
from concurrent.futures import ThreadPoolExecutor

# Raise TensorFlow's logger threshold so that only errors are reported.
tf.get_logger().setLevel('ERROR')




## MediaPipe Implementation

In [4]:
# Landmark indices kept from each detector's output: every hand point, and a
# hand-picked subset of the pose points.
filtered_hand = [*range(21)]
filtered_pose = [0, 2, 5, *range(7, 17)]

# Number of landmarks contributed by one hand and by the pose subset.
HAND_NUM, POSE_NUM = len(filtered_hand), len(filtered_pose)

In [5]:
# Module-level MediaPipe detectors, created once with default options and
# reused by every call to get_frame_landmarks (and reset by get_video_landmarks).
hands = mp.solutions.hands.Hands()
pose = mp.solutions.pose.Pose()

def get_frame_landmarks(frame):
    """Extract hand and pose landmarks from a single frame.

    The hand and pose detectors run concurrently on the same frame and write
    into disjoint row ranges of one shared array.

    Args:
        frame: Image handed unchanged to ``hands.process`` and ``pose.process``
            (callers in this notebook convert BGR to RGB first).

    Returns:
        Array of shape ``(HAND_NUM * 2 + POSE_NUM, 3)`` holding ``(x, y, z)``
        per landmark: rows ``[0, HAND_NUM)`` for the hand whose handedness
        index is 0, rows ``[HAND_NUM, 2 * HAND_NUM)`` for the other hand, then
        the ``filtered_pose`` subset of pose landmarks. Rows belonging to
        anything that was not detected stay zero.

    Raises:
        Exception: anything raised by either detector is re-raised here
            instead of being silently dropped inside the worker thread.
    """
    all_landmarks = np.zeros((HAND_NUM * 2 + POSE_NUM, 3))

    def to_xyz(landmark_list):
        # One (x, y, z) row per landmark, in detector order.
        return np.array([(lm.x, lm.y, lm.z) for lm in landmark_list.landmark])

    def get_hands(frame):
        results_hands = hands.process(frame)
        if results_hands.multi_hand_landmarks:
            for i, hand_landmarks in enumerate(results_hands.multi_hand_landmarks):
                if results_hands.multi_handedness[i].classification[0].index == 0:
                    all_landmarks[:HAND_NUM, :] = to_xyz(hand_landmarks)  # right
                else:
                    all_landmarks[HAND_NUM:HAND_NUM * 2, :] = to_xyz(hand_landmarks)  # left

    def get_pose(frame):
        results_pose = pose.process(frame)
        if results_pose.pose_landmarks:
            all_landmarks[HAND_NUM * 2:HAND_NUM * 2 + POSE_NUM, :] = to_xyz(
                results_pose.pose_landmarks)[filtered_pose]

    with ThreadPoolExecutor(max_workers=2) as executor:
        futures = [executor.submit(get_hands, frame),
                   executor.submit(get_pose, frame)]

    # Leaving the `with` block has already waited for both tasks; result()
    # re-raises any exception a worker hit, which would otherwise be lost and
    # leave that detector's rows silently at zero.
    for future in futures:
        future.result()

    return all_landmarks

## Load Trained Model

In [6]:
gloss_mapping_path = "590_gloss_mapping.json"
index_gloss_mapping_path = "590_index_gloss_mapping.json"
index_label_mapping_path = "590_index_label_mapping.json"

# Load each mapping inside a `with` block so the file handle is closed as soon
# as parsing finishes; JSON is read as UTF-8 regardless of the platform default.
with open(gloss_mapping_path, "r", encoding="utf-8") as mapping_file:
    gloss_mapping = json.load(mapping_file)
with open(index_gloss_mapping_path, "r", encoding="utf-8") as mapping_file:
    index_gloss_mapping = json.load(mapping_file)
# Keys are stringified class indices (looked up via str(argmax) further down).
with open(index_label_mapping_path, "r", encoding="utf-8") as mapping_file:
    index_label_mapping = json.load(mapping_file)

In [7]:
# Load the TFLite model and allocate its tensors; allocate_tensors() is called
# once here, before any predict() call sets or reads a tensor.
model_path = 'model.tflite'
interpreter = tf.lite.Interpreter(model_path=model_path)
interpreter.allocate_tensors()

In [8]:
def predict(input_data):
    """Run the TFLite interpreter on a single sample.

    Args:
        input_data: Array-like sample without a batch axis. A leading axis of
            size 1 is added and the values are cast to float32 before being
            written to the interpreter's first input tensor.

    Returns:
        The array read back from the interpreter's first output tensor.
    """
    batch = np.asarray(input_data)[np.newaxis, ...].astype(np.float32)

    input_index = interpreter.get_input_details()[0]['index']
    interpreter.set_tensor(input_index, batch)
    interpreter.invoke()

    output_index = interpreter.get_output_details()[0]['index']
    return interpreter.get_tensor(output_index)

In [9]:
# Shapes of the model's first input and output tensors as plain Python ints;
# the trailing tuple is displayed as the cell's output.
input_shape = [int(dim) for dim in interpreter.get_input_details()[0]['shape']]
output_shape = [int(dim) for dim in interpreter.get_output_details()[0]['shape']]
input_shape, output_shape

([1, 120, 55, 3], [1, 590])

---

## 1. Test Live Feed

In [58]:
cap = cv2.VideoCapture(0)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Rolling window of the most recent landmark frames, pre-filled with zeros so
# the model can be run before a full window has been captured.
sequence = deque(maxlen=input_shape[1])
for _ in range(input_shape[1]):
    sequence.append(np.zeros((input_shape[2], 3)))

step_length = 40                    # frames pushed into the window per prediction
TIME_PER_STEP = step_length / 30.0  # seconds per step, i.e. step_length frames at 30 fps
MAX_FAILED_READS = 30               # consecutive failed reads tolerated before giving up
step_time = time.time()
frame_time = 0
label = ''
fps = ''
step = []
failed_reads = 0

# try/finally guarantees the camera and window are released even when the loop
# is interrupted (e.g. kernel interrupt) or a frame raises.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            # A dead or missing camera would otherwise spin here forever with
            # no window to receive the 'q' key.
            failed_reads += 1
            if failed_reads >= MAX_FAILED_READS:
                print("Camera is not returning frames; stopping.")
                break
            continue
        failed_reads = 0

        now = time.time()
        elapsed = now - frame_time
        if elapsed > 0:  # two frames can share a clock tick; keep the last reading
            fps = str(int(1 / elapsed))
        frame_time = now

        # frame = cv2.flip(frame, 1)
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame_rgb.flags.writeable = False
        frame_landmarks = get_frame_landmarks(frame_rgb)

        for point in frame_landmarks:
            if not point.any():
                # Rows left at zero were not detected; don't draw them at (0, 0).
                continue
            x = int(point[0] * width)
            y = int(point[1] * height)
            cv2.circle(frame, (x, y), 2, (0, 255, 0), -1)
        cv2.putText(frame, fps, (30, 60), cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 0, 255), 2, cv2.LINE_AA)
        cv2.putText(frame, label, (30, 100), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

        step.append(frame_landmarks)

        if time.time() - step_time >= TIME_PER_STEP:
            # Resample however many frames were captured during this step to
            # exactly step_length frames by linear interpolation along time.
            captured = np.array(step)
            resampled = np.apply_along_axis(
                lambda arr: np.interp(np.linspace(0, 1, step_length),
                                      np.linspace(0, 1, arr.shape[0]), arr),
                axis=0, arr=captured)

            # The deque drops its oldest frames as the new ones are appended.
            sequence.extend(resampled)
            prediction = predict(np.array(sequence))
            prediction = prediction.reshape(-1)
            # softmax = np.exp(prediction) / np.sum(np.exp(prediction), axis=0)
            # print(f'confidence: {softmax.max()}')
            # if softmax.max() < 0.5:
            #     label = 'None'
            # else:
            prediction = prediction.argmax()
            label = index_label_mapping[str(prediction)]
            print(f'Label: {label}')

            step_time = time.time()
            step = []

        cv2.imshow("Test", frame)
        cv2.setWindowProperty("Test", cv2.WND_PROP_TOPMOST, 1)
        k = cv2.waitKey(1)
        if k == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()

Label: None
Label: and
Label: good
Label: good
Label: welcome
Label: welcome


In [20]:
# Manual cleanup cell: releases the camera and closes any OpenCV windows
# (e.g. after interrupting the cell above).
cap.release()
cv2.destroyAllWindows()

---

## 2. Test on Record

In [46]:
def get_video_landmarks(video_path):
    """Extract per-frame landmarks from every frame of a video file.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.

    Returns:
        Array of shape ``(num_frames, HAND_NUM * 2 + POSE_NUM, 3)``. A video
        that cannot be opened or yields no frames gives ``num_frames == 0``
        with the trailing dimensions intact, so the result can still be padded.
    """
    vid = cv2.VideoCapture(video_path)
    all_frame_landmarks = []

    try:
        while vid.isOpened():
            ret, frame = vid.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            # Mark the array that is actually passed to the detectors as
            # read-only; setting the flag before cvtColor had no effect because
            # cvtColor returns a new, writeable array.
            frame.flags.writeable = False
            all_frame_landmarks.append(get_frame_landmarks(frame))
    finally:
        # Always free the capture and reset both detectors (presumably so no
        # tracking state carries over to the next video), even on error.
        vid.release()
        hands.reset()
        pose.reset()

    if not all_frame_landmarks:
        return np.zeros((0, HAND_NUM * 2 + POSE_NUM, 3))
    return np.array(all_frame_landmarks)

In [47]:
def padding(X, length, pad=0):
    """Center-crop or center-pad ``X`` along its first axis to ``length``.

    Args:
        X: Sequence or array with at least one dimension; only axis 0 is
            resized, all trailing axes are left untouched.
        length: Target size of axis 0.
        pad: Constant used to fill the added entries.

    Returns:
        If ``X`` is longer than ``length``, the centred slice of ``length``
        entries. Otherwise ``X`` padded with ``pad`` before and after; when the
        missing amount is odd, the extra entry goes at the end.
    """
    if len(X) > length:
        start = (len(X) - length) // 2
        end = start + length
        X = X[start:end]
    else:
        pad_before = (length - len(X)) // 2
        pad_after = length - len(X) - pad_before
        # Build the pad widths from the actual rank instead of assuming 3-D, so
        # inputs of any dimensionality are padded along the leading axis only.
        pad_width = [(pad_before, pad_after)] + [(0, 0)] * (np.ndim(X) - 1)
        X = np.pad(X, pad_width, 'constant', constant_values=pad)
    return X

In [49]:
import tkinter as tk
import cv2
import os
from datetime import datetime
from PIL import Image, ImageTk
from tkinter import messagebox as msg

# --- Main window: fixed size, centred on the screen, kept above other windows ---
window = tk.Tk()
window.title("Record Translation")
window.attributes('-topmost', True)
window_width, window_height = 700, 550
screen_width, screen_height = window.winfo_screenwidth(), window.winfo_screenheight()
x_position = (screen_width - window_width) // 2
y_position = (screen_height - window_height) // 2
window.geometry(f"{window_width}x{window_height}+{x_position}+{y_position}")

# --- Webcam plus a canvas sized to the frames it reports ---
cap = cv2.VideoCapture(0)
width, height = (
    int(cap.get(prop))
    for prop in (cv2.CAP_PROP_FRAME_WIDTH, cv2.CAP_PROP_FRAME_HEIGHT)
)
canvas = tk.Canvas(window, width=width, height=height)
canvas.pack()

# --- Recording state shared by the callbacks defined below ---
os.makedirs("recordings", exist_ok=True)
is_recording = is_translating = False
output_record = None   # active cv2.VideoWriter while recording, else None
video_path = ""        # path of the most recent recording

def update():
    """Per-tick GUI callback: show the camera frame, record it, run translation.

    Reads one frame from ``cap`` and draws it on ``canvas``. While
    ``is_recording`` is set, the frame is also written to ``output_record`` and
    a red indicator is drawn. When ``is_translating`` is set, the record button
    is disabled, a "Processing..." message is shown and ``process_video`` runs
    synchronously. The function reschedules itself every 10 ms.
    """
    global is_translating
    ret, frame = cap.read()
    if ret:
        # frame = cv2.flip(frame, 1)
        photo = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # photo = cv2.flip(photo, 1)
        photo = ImageTk.PhotoImage(image=Image.fromarray(photo))
        # Drop the previous tick's items first; otherwise a new canvas item is
        # added every 10 ms and the canvas grows without bound.
        canvas.delete("all")
        canvas.create_image(0, 0, image=photo, anchor=tk.NW)
        # Keep a reference so the image is not garbage-collected while shown.
        canvas.photo = photo

    if is_recording:
        # Only write when a frame was actually read; `frame` is None otherwise.
        if ret and output_record is not None:
            output_record.write(frame)
        canvas.delete("recording_indicator")
        canvas.create_oval(10, 10, 30, 30, fill="red", tags="recording_indicator")

    if is_translating:
        btn_record.config(state="disabled")
        canvas.delete("all")
        canvas.create_text(width // 2, height // 2, text="Processing...", font=("Arial", 30))
        canvas.update()
        try:
            process_video()
        except Exception as exc:
            # Report the failure instead of letting it kill this callback,
            # which would stop the preview loop from being rescheduled.
            msg.showerror("Processing Failed", str(exc))
        finally:
            # Always restore the UI state, even if processing failed.
            btn_record.config(state="normal")
            is_translating = False

    window.after(10, update)

def toggle_record():
    """Button callback: start a recording when idle, otherwise stop the current one."""
    handler = stop_recording if is_recording else start_recording
    handler()

# Single button that toggles recording; its text is switched between
# "Start Recording" and "Stop Recording" by start_recording / stop_recording.
btn_record = tk.Button(window, text="Start Recording", width=20, command=toggle_record)
btn_record.pack(pady=10)

def start_recording():
    """Switch to the recording state and open a timestamped video writer.

    Sets ``is_recording``, clears ``is_translating``, relabels the button,
    stores the new file path in ``video_path`` and opens ``output_record`` as
    an mp4v writer at 30 fps with the camera's frame size.
    """
    global output_record, is_recording, is_translating, video_path
    is_recording, is_translating = True, False
    btn_record.config(text="Stop Recording")

    video_path = f"recordings/record_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.mp4"
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    frame_size = (width, height)
    output_record = cv2.VideoWriter(video_path, fourcc, 30, frame_size)

def stop_recording():
    """Leave the recording state, request translation and close the writer.

    Clears ``is_recording``, sets ``is_translating`` so the next ``update``
    tick processes the file, relabels the button, and releases the active
    writer if there is one.
    """
    global output_record, is_recording, is_translating
    is_recording, is_translating = False, True
    btn_record.config(text="Start Recording")

    if output_record is None:
        return

    output_record.release()
    output_record = None
    print("Recording saved.")
    
def process_video():
    """Translate the most recent recording and show the label in a dialog.

    Extracts landmarks from ``video_path``, fits them to the model's sequence
    length with ``padding``, runs ``predict`` and looks up the arg-max class in
    ``index_label_mapping``. If the recording yielded no frames (for example
    when recording is stopped immediately after starting), a warning is shown
    and no prediction is attempted.
    """
    print("Processing video...")
    landmarks = get_video_landmarks(video_path)
    if len(landmarks) == 0:
        # Nothing to pad or feed to the model; padding would fail on an
        # empty, shapeless array.
        msg.showwarning("No Frames Recorded",
                        "The recording contains no frames, so there is nothing to translate.")
        return

    landmarks = padding(landmarks, input_shape[1])
    prediction = predict(landmarks)
    prediction = prediction.reshape(-1)
    prediction = prediction.argmax()
    label = index_label_mapping[str(prediction)]
    msg.showinfo("Processing Completed", "Translation: " + label)
    
# Start the preview loop and block until the window is closed. The finally
# clause releases the camera, and any writer still open because the window was
# closed mid-recording, even if the event loop exits with an error.
try:
    update()
    window.mainloop()
finally:
    if output_record is not None:
        output_record.release()
        output_record = None
    cap.release()
    cv2.destroyAllWindows()

Recording saved.
Processing video...
Recording saved.
Processing video...
Recording saved.
Processing video...
Recording saved.
Processing video...


In [38]:
# Manual cleanup cell: releases the camera and closes any OpenCV windows
# (e.g. if the recorder cell above was interrupted before its own cleanup ran).
cap.release()
cv2.destroyAllWindows()

---

## 3. Test on Videos

In [43]:
# Clip used by the offline prediction cell below.
video_path = "Testing Videos/8507947567336565-AMAZING.mp4"

# Show an inline player for the clip as this cell's output.
from IPython.display import Video
Video(video_path, width=640, height=480)

In [44]:
# Offline pipeline: landmarks -> fixed-length sequence -> class index -> label.
landmarks = padding(get_video_landmarks(video_path), input_shape[1])
prediction = predict(landmarks).reshape(-1).argmax()
label = index_label_mapping[str(prediction)]
print("Label:", label)

Label: amazing


---