# Sign Language (ASL) recognition
An Artificial Intelligence project by Emile GATIGNON and Martin RAMPONT

> "Artificial Intelligence" - Course N° 12721 at Hanyang University with professor 백성용 / Sungyong Baik
> 
> Spring Semester 2023


In [None]:
# Installs
%pip install tensorflow==2.12 opencv-python mediapipe


In [None]:
# Dependencies
import cv2
import json
import matplotlib.pyplot as plt
import mediapipe as mp
import numpy as np
import tensorflow as tf

from tensorflow.keras.layers import Dense, Input, LSTM, Masking
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam


## cuDNN acceleration (optional: requires a GPU with the NVIDIA CUDA drivers installed)

In [None]:
# List the devices TensorFlow can see, using the public configuration API
# instead of importing from the private `tensorflow.python` package.
# NOTE(review): the previous `device_lib.list_local_devices()` call may
# instantiate the GPU devices before memory growth is configured in the next
# cell -- confirm against the TensorFlow documentation.
tf.config.list_physical_devices()


In [None]:
# Let TensorFlow grow its GPU memory allocation on demand instead of
# reserving the whole device up front.
physical_devices = tf.config.list_physical_devices('GPU')
try:
    # The setting is applied to every visible GPU, not only the first one, so
    # that all devices are configured the same way.
    for gpu in physical_devices:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as error:
    # Raised when the GPUs have already been initialised (e.g. when this cell
    # is re-run in a live kernel); the setting can then no longer be changed.
    print(f"Could not enable GPU memory growth: {error}")


## Landmarks detection

In [None]:
# Global parameters
labels_map_file = 'labels_map.json'  # JSON file whose keys are the class labels
checkpoint_file = 'ckpt.hdf5'  # weights file loaded into the LSTM model
guess_interval = 60  # number of consecutive frames fed to the model per prediction

# Calculated params
# x, y and z for each of the 33 pose, 21 left-hand and 21 right-hand landmarks
# (the 468 face landmarks are not part of the model input).
total_param_count = 3 * (33 + 21 + 21)


In [None]:
# Landmark detection - variables and functions

# Short aliases for the two MediaPipe modules used below:
# - MP_HOLISTIC provides the Holistic model and the pose/hand connection sets;
# - MP_DRAWING provides the helpers that draw landmarks onto an image.
MP_HOLISTIC = mp.solutions.holistic
MP_DRAWING = mp.solutions.drawing_utils


def mediapipe_detection(image: cv2.Mat, model):
    """Run a MediaPipe model on a single frame.

    Args:
        image: Frame to analyse; it is converted from BGR to RGB before being
            handed to the model.
        model: Object exposing a ``process(image)`` method (the callers in this
            file pass a MediaPipe Holistic instance).

    Returns:
        tuple: ``(image, results)`` where ``image`` is the frame converted back
        to BGR and ``results`` is whatever ``model.process`` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # The frame is marked read-only while the model processes it
    # (presumably so MediaPipe can use it without copying -- confirm).
    rgb_frame.flags.writeable = False
    detection = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), detection


def draw_styled_landmarks(image, results):
    """Draw the pose and both hands' landmarks onto ``image`` in place.

    Face landmarks are not drawn.

    Args:
        image: Image the landmarks are drawn on (modified in place).
        results: MediaPipe results exposing ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``.
    """
    # One row per layer, drawn in this order:
    # (results attribute, connection set, landmark colour, connection colour)
    layers = (
        ("pose_landmarks", MP_HOLISTIC.POSE_CONNECTIONS, (80, 22, 10), (80, 44, 121)),
        ("left_hand_landmarks", MP_HOLISTIC.HAND_CONNECTIONS, (121, 22, 76), (121, 44, 250)),
        ("right_hand_landmarks", MP_HOLISTIC.HAND_CONNECTIONS, (245, 117, 66), (245, 66, 230)),
    )
    for attribute, connections, landmark_color, connection_color in layers:
        MP_DRAWING.draw_landmarks(
            image,
            getattr(results, attribute),
            connections,
            MP_DRAWING.DrawingSpec(color=landmark_color, thickness=2, circle_radius=4),
            MP_DRAWING.DrawingSpec(color=connection_color, thickness=2, circle_radius=2),
        )


def _landmarks_to_array(landmark_list, count: int) -> np.ndarray:
    """Return a ``(count, 3)`` array of x, y, z coordinates, NaN where data is missing."""
    coordinates = np.full((count, 3), np.nan)
    if landmark_list is not None:
        for i, landmark in enumerate(landmark_list.landmark):
            coordinates[i] = landmark.x, landmark.y, landmark.z
    return coordinates


def extract_landmarks(results) -> np.ndarray:
    """Transform the results of a MediaPipe process into a NumPy array.

    Args:
        results: Results from a MediaPipe process, exposing ``face_landmarks``,
            ``pose_landmarks``, ``left_hand_landmarks`` and
            ``right_hand_landmarks`` (each either ``None`` or an object with a
            ``landmark`` sequence of points having ``x``, ``y`` and ``z``).

    Returns:
        np.ndarray: Object array of shape ``(4,)`` holding, in this order, the
        face ``(468, 3)``, pose ``(33, 3)``, left-hand ``(21, 3)`` and
        right-hand ``(21, 3)`` coordinates. Missing landmarks are represented
        as ``numpy.nan``.
    """
    parts = (
        _landmarks_to_array(results.face_landmarks, 468),
        _landmarks_to_array(results.pose_landmarks, 33),
        _landmarks_to_array(results.left_hand_landmarks, 21),
        _landmarks_to_array(results.right_hand_landmarks, 21),
    )

    # Fill a pre-allocated object array slot by slot so the result is always a
    # 1-D container of four arrays, independent of NumPy's shape inference for
    # nested sequences of differently-shaped arrays.
    landmarks = np.empty(len(parts), dtype=object)
    for i, part in enumerate(parts):
        landmarks[i] = part
    return landmarks


def video_to_landmarks(video_path: str, display: bool = False) -> "tuple[np.ndarray, bool]":
    """Extract the MediaPipe Holistic landmarks of every frame of a video.

    Args:
        video_path: Path of the video, passed to ``cv2.VideoCapture``.
        display: When true, each processed frame is shown in a window with the
            landmarks drawn on it. Pressing ``q`` closes the preview; the
            conversion itself continues.

    Returns:
        tuple: ``(landmark_frames, display)``.

        ``landmark_frames`` is an object array with one entry per frame. Each
        decoded frame holds the result of ``extract_landmarks``; entries for
        frames announced by the container but never decoded keep the
        placeholder value ``0``.

        ``display`` is the preview flag after processing: ``False`` if the
        user closed the preview with ``q``, otherwise the value passed in.
    """
    cap = cv2.VideoCapture(video_path)
    extracted = []
    try:
        # The frame count comes from container metadata and may be inaccurate,
        # so it is only used as a lower bound for the output length.
        reported_count = max(int(cap.get(cv2.CAP_PROP_FRAME_COUNT)), 0)

        with MP_HOLISTIC.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
            while cap.isOpened():
                success, frame = cap.read()
                if not success:
                    break

                image, results = mediapipe_detection(frame, holistic)
                extracted.append(extract_landmarks(results))

                if display:
                    draw_styled_landmarks(image, results)
                    cv2.imshow(f"Converting '{video_path}'...", image)
                    if cv2.waitKey(10) & 0xFF == ord('q'):
                        display = False
                        cv2.destroyAllWindows()
    finally:
        # Release the capture and close any preview window even when
        # processing fails part-way through.
        cap.release()
        cv2.destroyAllWindows()

    # Growing to len(extracted) avoids the IndexError the fixed-size buffer hit
    # when the video contained more frames than its metadata announced.
    landmark_frames = np.zeros(max(reported_count, len(extracted)), dtype=object)
    for i, frame_landmarks in enumerate(extracted):
        landmark_frames[i] = frame_landmarks
    return landmark_frames, display


## Data Preprocessing

In [None]:
# Data loading - functions
def vectorize_landmark(landmark: np.ndarray, *, include_face: bool = True, include_pose: bool = True,
                       include_righth: bool = True, include_lefth: bool = True) -> np.ndarray:
    """Flatten the selected landmark groups of one frame into a single vector.

    Args:
        landmark: Sequence of four coordinate arrays in the order face, pose,
            left hand, right hand (the layout produced by
            ``extract_landmarks``).
        include_face: Keep the face coordinates.
        include_pose: Keep the pose coordinates.
        include_righth: Keep the right-hand coordinates.
        include_lefth: Keep the left-hand coordinates.

    Returns:
        np.ndarray: 1-D array with the kept groups concatenated in the order
        face, pose, left hand, right hand. NaN values are replaced by ``0.0``.
        An empty array is returned when every group is excluded. The input
        arrays are not modified.
    """
    flags = (include_face, include_pose, include_lefth, include_righth)
    # Select by truthiness rather than by indexing with the flag list, so that
    # non-bool flags such as 0/1 cannot be taken for integer indices.
    selected = [part for part, keep in zip(landmark, flags) if keep]
    if not selected:
        return np.zeros(0)

    # np.concatenate returns a fresh array, so it can be viewed flat and
    # cleaned in place without touching the caller's data.
    return np.nan_to_num(np.concatenate(selected).ravel(), nan=0.0, copy=False)


def vectorize_landmark_frames(landmark_frames: np.ndarray, *, include_face: bool = True, include_pose: bool = True,
                              include_righth: bool = True, include_lefth: bool = True) -> np.ndarray:
    """Vectorize every frame of a landmark sequence, in place.

    Args:
        landmark_frames: Per-frame container. Entries that are NumPy arrays are
            passed to ``vectorize_landmark``; any other entry (for example the
            ``0`` placeholder of a frame that was never decoded) is treated as
            a frame without landmarks.
        include_face: Keep the face coordinates.
        include_pose: Keep the pose coordinates.
        include_righth: Keep the right-hand coordinates.
        include_lefth: Keep the left-hand coordinates.

    Returns:
        np.ndarray: The same ``landmark_frames`` object, each entry replaced by
        its flat feature vector (all zeros for frames without landmarks).
    """
    flags = (include_face, include_pose, include_lefth, include_righth)
    # Length of the vector for a frame without landmarks: three coordinates per
    # kept landmark (468 face, 33 pose, 21 per hand). It is computed directly
    # rather than from a ragged placeholder array, which recent NumPy versions
    # refuse to build without dtype=object.
    empty_size = 3 * sum(count for count, keep in zip((468, 33, 21, 21), flags) if keep)

    for i, landmarks in enumerate(landmark_frames):
        if isinstance(landmarks, np.ndarray):
            landmark_frames[i] = vectorize_landmark(landmarks, include_face=include_face, include_pose=include_pose,
                                                    include_lefth=include_lefth, include_righth=include_righth)
        else:
            # A fresh array per frame so that no two frames share storage.
            landmark_frames[i] = np.zeros(empty_size)
    return landmark_frames


In [None]:
# Labels loading
# The encoding is stated explicitly so that reading the JSON file does not
# depend on the platform's default encoding.
with open(labels_map_file, "r", encoding="utf-8") as labels_map_json:
    labels_map = json.load(labels_map_json)

# Class names, in the order of the keys in the JSON file.
# NOTE(review): index i is presumably the label of model output unit i --
# confirm against the training code.
labels_list = np.array(list(labels_map))


## LSTM Model

In [None]:
# Sequence classifier: three stacked LSTM layers followed by a dense head with
# one softmax output per label. Input is a window of `guess_interval` frames,
# each described by `total_param_count` values; frames whose values are all 0
# are masked out.
model = Sequential([
    Input(shape=(guess_interval, total_param_count)),
    Masking(mask_value=0.),
    LSTM(64, return_sequences=True, activation='tanh'),
    LSTM(128, return_sequences=True, activation='tanh'),
    LSTM(64, return_sequences=False, activation='tanh'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(len(labels_list), activation='softmax'),
])

model.compile(
    optimizer='Adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

# Restore the weights from the checkpoint file.
model.load_weights(checkpoint_file)


## Real-Time Detection

In [None]:
# Fill colours for the probability bars drawn by prob_viz, indexed by class.
# NOTE(review): presumably in OpenCV's BGR channel order -- confirm.
colors = [(245, 117, 16), (117, 245, 16), (16, 117, 245)]


def prob_viz(res, actions, input_frame, colors):
    """Draw one horizontal probability bar per class on a copy of a frame.

    Args:
        res: Sequence of class probabilities, one per class.
        actions: Class names, indexed like ``res``.
        input_frame: Image to annotate; it is copied, not modified.
        colors: Non-empty sequence of colour tuples for the bars. When there
            are more classes than colours, the colours are reused cyclically.

    Returns:
        A copy of ``input_frame`` with, for each class, a filled bar whose
        length is ``int(prob * 100)`` pixels and the class name written over
        it. Rows are 40 pixels apart, starting 60 pixels from the top.
    """
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        top = 60 + num * 40
        # Cycling through the palette avoids an IndexError when the model has
        # more classes than there are colours.
        bar_color = colors[num % len(colors)]
        cv2.rectangle(output_frame, (0, top), (int(prob * 100), top + 30), bar_color, -1)
        cv2.putText(output_frame, actions[num], (0, top + 25), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2,
                    cv2.LINE_AA)

    return output_frame


In [None]:
# Real-time detection state
sequence = []  # sliding window of the latest per-frame feature vectors
sentence = []  # recognised labels shown in the banner (at most 5)
predictions = []  # latest predicted class indices, used as a stability filter
threshold = 0.5  # minimum probability for a prediction to be accepted
stability_window = 10  # number of latest predictions that must all agree

cap = cv2.VideoCapture(0)
# Set mediapipe model
try:
    with MP_HOLISTIC.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            # 1. Input
            # Read feed
            ret, frame = cap.read()
            if not ret:
                # No frame could be read (camera unplugged or stream ended):
                # stop instead of passing an invalid frame to OpenCV.
                break

            # Make detections
            image, results = mediapipe_detection(frame, holistic)

            # Draw landmarks
            draw_styled_landmarks(image, results)

            # 2. Prediction logic
            keypoints = vectorize_landmark(extract_landmarks(results), include_face=False)
            sequence.append(keypoints)
            sequence = sequence[-guess_interval:]

            if len(sequence) == guess_interval:
                res = model.predict(np.expand_dims(sequence, axis=0))[0]
                best = int(np.argmax(res))
                label = labels_list[best]
                print(label)

                # Only the latest predictions are needed, so the history is
                # kept bounded instead of growing for the whole session.
                predictions.append(best)
                predictions = predictions[-stability_window:]

                # 3. Viz logic
                # Accept a label only when every recent prediction agrees with
                # the current one. The previous `np.unique(...)[0]` test
                # compared against the smallest recent class index, which is
                # not a stability check.
                is_stable = all(previous == best for previous in predictions)
                if is_stable and res[best] > threshold:
                    # Do not repeat the label that is already displayed last.
                    if not sentence or label != sentence[-1]:
                        sentence.append(label)

                # Keep only the five most recent labels on screen.
                sentence = sentence[-5:]

            # Banner across the full width of the frame, whatever the camera
            # resolution is.
            cv2.rectangle(image, (0, 0), (image.shape[1], 40), (245, 117, 16), -1)
            cv2.putText(image, ' '.join(sentence), (3, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            # Show to screen
            cv2.imshow("ASL Recognition Demo [exit with 'Q']", image)

            # Break gracefully
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even after an error.
    cap.release()
    cv2.destroyAllWindows()
