# Sign Language (ASL) recognition
An Artificial Intelligence project by Emile GATIGNON and Martin RAMPONT

> "Artificial Intelligence" - Course N° 12721 at Hanyang University with professor 백성용 / Sungyong Baik
> 
> Spring Semester 2023


In [2]:
# Installs
%pip install tensorflow==2.12 scikit-learn opencv-python mediapipe


Note: you may need to restart the kernel to use updated packages.


In [3]:
# Dependencies
import cv2
import hashlib
import json
import mediapipe as mp
import numpy as np
import os
import urllib.request
import zipfile
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical



In [4]:
# Global parameters

# Dataset
# ? Dataset source : https://www.kaggle.com/datasets/risangbaskoro/wlasl-processed
data_url = r'https://storage.googleapis.com/kaggle-data-sets/1589971/2632847/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20230528%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20230528T101545Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=3d32a329ea1d2da65583832ef994b2b0ada5e17e04f938afff9fdd04fca8643b22b889accfc807458f1f8616b28cc2b0412f464649c851bdd2d0d6d4bfca08bd85f37f0be1653fdd3c85fb44b6abf81faf3051ca1eb817c1a52158574d1545d7723f498008fb2b151c5a1a2ab855299e72727c9ecc3138965c81e33660024625a3779e065613c78c7520913c5279bdbe392010d66bab023509f1a1a792fb5567ddf7865fdb51f354fd737ac202a07d02481ec04eb5e26f44a94aa942d4dd395bd1f5984ba5eb60b46a80a5cd33b7229558c69bfd524c3e7ec49b18150956e3b0b96849ec2270cadb458b38f9cee415722a91fbbecc3acebabec79d25016c0217'
data_path = r'downloads/data'
videos_folder = r'videos'
landmarks_folder = r'landmarks'
data_description_file = r'WLASL_v0.3.json'
labels_file = r'labels.json'

# Dataset formating
inclued_landmarks = {
    "include_face": False,
    "include_pose": True,
    "include_righth": True,
    "include_lefth": True,
}


labels_file_path = os.path.join(data_path, labels_file)
data_description_file_path = os.path.join(data_path, data_description_file)
videos_folder_path = os.path.join(data_path, videos_folder)
landmarks_folder_path = os.path.join(data_path, landmarks_folder)


In [5]:
# Dataset download and extraction
data_zip_hash = '1b8198227bb3de21de639146016a7368'

if os.path.isdir(landmarks_folder_path):
    print("Landmarks found, skipping download")
elif os.path.isfile(data_description_file_path) \
        and os.path.isdir(videos_folder_path):
    print("Data already unpacked, skipping")
else:
    downloaded_hash = ''
    if os.path.isfile(data_path + '.zip'):
        print("Data already downloaded, checking intergrity...")
        hash_md5 = hashlib.md5()
        with open(data_path + '.zip', "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        downloaded_hash = hash_md5.hexdigest()

    if data_zip_hash != downloaded_hash:
        def report_hook(count, block_size, total_size):
            percentage = (count * block_size / total_size) * 100
            print(f"Downloading data... {percentage:.2f}%", end='\r')
        urllib.request.urlretrieve(data_url, data_path + '.zip', reporthook=report_hook)
        print("\n")
    else:
        print("Downloaded zip integrity ok")

    with zipfile.ZipFile(data_path + '.zip', 'r') as zip_ref:
        total_files = len(zip_ref.namelist())
        extracted_files = 0
        for file in zip_ref.namelist():
            zip_ref.extract(file, data_path)

            extracted_files += 1
            progress = (extracted_files / total_files) * 100
            print(f"Extracting... {progress:.2f}%", end='\r')
        print("\n")


Landmarks found, skipping download


In [6]:
# Landmark detection - variables and functions

MP_HOLISTIC = mp.solutions.holistic
MP_DRAWING = mp.solutions.drawing_utils


def mediapipe_detection(image: cv2.Mat, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    return image, results


def draw_styled_landmarks(image, results):
    # Draw face connections
    MP_DRAWING.draw_landmarks(image, results.face_landmarks, MP_HOLISTIC.FACEMESH_CONTOURS,
                              MP_DRAWING.DrawingSpec(color=(80, 110, 10), thickness=1, circle_radius=1),
                              MP_DRAWING.DrawingSpec(color=(80, 256, 121), thickness=1, circle_radius=1)
                              )
    # Draw pose connections
    MP_DRAWING.draw_landmarks(image, results.pose_landmarks, MP_HOLISTIC.POSE_CONNECTIONS,
                              MP_DRAWING.DrawingSpec(color=(80, 22, 10), thickness=2, circle_radius=4),
                              MP_DRAWING.DrawingSpec(color=(80, 44, 121), thickness=2, circle_radius=2)
                              )
    # Draw left hand connections
    MP_DRAWING.draw_landmarks(image, results.left_hand_landmarks, MP_HOLISTIC.HAND_CONNECTIONS,
                              MP_DRAWING.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=4),
                              MP_DRAWING.DrawingSpec(color=(121, 44, 250), thickness=2, circle_radius=2)
                              )
    # Draw right hand connections
    MP_DRAWING.draw_landmarks(image, results.right_hand_landmarks, MP_HOLISTIC.HAND_CONNECTIONS,
                              MP_DRAWING.DrawingSpec(color=(245, 117, 66), thickness=2, circle_radius=4),
                              MP_DRAWING.DrawingSpec(color=(245, 66, 230), thickness=2, circle_radius=2)
                              )


def extract_landmarks(results) -> np.ndarray:
    """Transforms the results from a mediapipe process to a NumPy Array

    Args:
        results: Results from a mediapipe process

    Returns:
        np.ndarray: Vectorized results, missing landmarks are representend as numpy.nan

        results.shape = (4,)

        results[0].shape = (468, 3), results[1].shape = (33, 3),
        results[2].shape = (21, 3), results[3].shape = (21, 3)
    """
    face_landmarks = np.zeros((468, 3))
    face_landmarks.fill(np.nan)
    if results.face_landmarks != None:
        for i, landmark in enumerate(results.face_landmarks.landmark):
            face_landmarks[i] = landmark.x, landmark.y, landmark.z
    else:
        face_landmarks.fill(np.nan)

    pose_landmarks = np.zeros((33, 3))
    pose_landmarks.fill(np.nan)
    if results.pose_landmarks != None:
        for i, landmark in enumerate(results.pose_landmarks.landmark):
            pose_landmarks[i] = landmark.x, landmark.y, landmark.z
    else:
        pose_landmarks.fill(np.nan)

    left_hand_landmarks = np.zeros((21, 3))
    left_hand_landmarks.fill(np.nan)
    if results.left_hand_landmarks != None:
        for i, landmark in enumerate(results.left_hand_landmarks.landmark):
            left_hand_landmarks[i] = landmark.x, landmark.y, landmark.z
    else:
        left_hand_landmarks.fill(np.nan)

    right_hand_landmarks = np.zeros((21, 3))
    right_hand_landmarks.fill(np.nan)
    if results.right_hand_landmarks != None:
        for i, landmark in enumerate(results.right_hand_landmarks.landmark):
            right_hand_landmarks[i] = landmark.x, landmark.y, landmark.z
    else:
        right_hand_landmarks.fill(np.nan)

    return np.array([face_landmarks, pose_landmarks, left_hand_landmarks, right_hand_landmarks], dtype=object)


def video_to_landmarks(video_path: str, display: bool = False) -> np.ndarray:
    cap = cv2.VideoCapture(video_path)
    landmark_frames = np.zeros((int(cap.get(cv2.CAP_PROP_FRAME_COUNT))), dtype=np.ndarray)
    i = 0
    with MP_HOLISTIC.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            success, frame = cap.read()
            if not success:
                break

            image, results = mediapipe_detection(frame, holistic)
            landmark_frames[i] = extract_landmarks(results)
            i += 1

            if display:
                draw_styled_landmarks(image, results)
                cv2.imshow(f"Converting '{video_path}'...", image)
                if cv2.waitKey(10) & 0xFF == ord('q'):
                    display = False
                    cv2.destroyAllWindows()
    cap.release()
    cv2.destroyAllWindows()
    return landmark_frames, display


In [7]:
# Landmark detection - dataset conversion
display_conversion = False


if not os.path.exists(landmarks_folder_path):
    os.mkdir(landmarks_folder_path)
video_count = len(os.listdir(videos_folder_path))
skipped = 0

try:
    for i, video_path in enumerate(os.listdir(videos_folder_path)):
        progress = 100 * i / video_count
        print(f"Generating landmarks: {progress:6.2f}% ({skipped:5} skipped) -> {video_path}", end='\r')

        array_path = os.path.join(landmarks_folder_path, video_path[:-4]) + '.npy'
        if video_path.endswith('.mp4') and not os.path.isfile(array_path):
            landmarks, display_conversion = video_to_landmarks(os.path.join(videos_folder_path, video_path), display_conversion)
            np.save(array_path, landmarks)
        else:
            skipped += 1
except KeyboardInterrupt:
    count = len(os.listdir(landmarks_folder_path))
    progress = 100 * count / video_count
    print(f"Interrupted landmark generation at {progress:6.2f}% -> {skipped} skipped, {count - skipped} generated")
print(f"Finished landmark generation -> {skipped} skipped, {len(os.listdir(landmarks_folder_path)) - skipped} generated")


Finished landmark generation -> 11380 skipped, 600 generated


In [8]:
# Labels extraction

if os.path.isfile(labels_file_path):
    print("Labels already generated, skipping")
else:
    labels = {}

    print("Loading data description and extracting labels...")
    with open(data_description_file_path, "r") as data_descriptor:
        data_desc = json.load(data_descriptor)
        for entry in data_desc:
            for instance in entry['instances']:
                labels[instance['video_id']] = entry['gloss']

    print("Saving labels...")
    with open(labels_file_path, "w") as labels_container:
        json.dump(labels, labels_container, indent=4)

    print("Labels generated.")


Labels already generated, skipping


## Data Preprocessing

In [9]:
# Actions that we try to detect
file = open(os.path.join(data_path,r'wlasl_class_list.txt'), 'r')
lines = file.readlines()
actions = [line.strip().split('\t', maxsplit=1)[1] for line in lines]

# Python list --> numpy array
actions = np.array(actions)

# numpy array --> dictionnary
label_map = {label:num for num, label in enumerate(actions)}
print(label_map)

{'book': 0, 'drink': 1, 'computer': 2, 'before': 3, 'chair': 4, 'go': 5, 'clothes': 6, 'who': 7, 'candy': 8, 'cousin': 9, 'deaf': 10, 'fine': 11, 'help': 12, 'no': 13, 'thin': 14, 'walk': 15, 'year': 16, 'yes': 17, 'all': 18, 'black': 19, 'cool': 20, 'finish': 21, 'hot': 22, 'like': 23, 'many': 24, 'mother': 25, 'now': 26, 'orange': 27, 'table': 28, 'thanksgiving': 29, 'what': 30, 'woman': 31, 'bed': 32, 'blue': 33, 'bowling': 34, 'can': 35, 'dog': 36, 'family': 37, 'fish': 38, 'graduate': 39, 'hat': 40, 'hearing': 41, 'kiss': 42, 'language': 43, 'later': 44, 'man': 45, 'shirt': 46, 'study': 47, 'tall': 48, 'white': 49, 'wrong': 50, 'accident': 51, 'apple': 52, 'bird': 53, 'change': 54, 'color': 55, 'corn': 56, 'cow': 57, 'dance': 58, 'dark': 59, 'doctor': 60, 'eat': 61, 'enjoy': 62, 'forget': 63, 'give': 64, 'last': 65, 'meet': 66, 'pink': 67, 'pizza': 68, 'play': 69, 'school': 70, 'secretary': 71, 'short': 72, 'time': 73, 'want': 74, 'work': 75, 'africa': 76, 'basketball': 77, 'birth

In [10]:
sequences, labels=[],[]
max_timesteps=0

def vectorize_landmark(landmark: np.ndarray, *, include_face: bool = True, include_pose: bool = True,
                       include_righth: bool = True, include_lefth: bool = True) -> np.ndarray:
    return np.concatenate(landmark[[include_face, include_pose, include_lefth, include_righth]]).flatten()


def vectorize_landmark_frames(landmark_frames: np.ndarray, *, include_face: bool = True, include_pose: bool = True,
                              include_righth: bool = True, include_lefth: bool = True) -> np.ndarray:
    for i, landmarks in enumerate(landmark_frames):
        if type(landmarks) is np.ndarray:
            landmark_frames[i] = vectorize_landmark(landmarks, include_face=include_face, include_pose=include_pose,
                                                    include_lefth=include_lefth, include_righth=include_righth)
    return landmark_frames


with open(labels_file_path, "r") as labels_json:
    labels_dic = json.load(labels_json)

i=0
for sequence_id in os.listdir(landmarks_folder_path):
    video_id = sequence_id[:-4]
    if(i<3):
        window = np.load(os.path.join(landmarks_folder_path, sequence_id), allow_pickle=True)
        if(window.size>max_timesteps):
            max_timesteps=window.size
        sequences.append(vectorize_landmark_frames(window))
        labels.append(label_map[labels_dic[video_id]])
    i=i+1

sequences = np.array(sequences)
labels = np.array(labels)
print('max_timesteps = ',max_timesteps)


max_timesteps =  72


  sequences = np.array(sequences)


In [11]:
print(sequences)
print(labels)
print(sequences.shape)
print(sequences.ndim)

[array([array([ 0.49909651,  0.27945885, -0.02198693, ...,         nan,
                       nan,         nan])                              ,
        array([ 0.49801421,  0.27918899, -0.02143088, ...,         nan,
                       nan,         nan])                              ,
        array([ 0.49669135,  0.27930447, -0.02142402, ...,         nan,
                       nan,         nan])                              ,
        array([ 0.49496615,  0.27906948, -0.02126633, ...,         nan,
                       nan,         nan])                              ,
        array([ 0.49417076,  0.27899098, -0.02145034, ...,         nan,
                       nan,         nan])                              ,
        array([ 0.49286398,  0.27866563, -0.02128364, ...,         nan,
                       nan,         nan])                              ,
        array([ 0.49160776,  0.27783728, -0.02161261, ...,         nan,
                       nan,         nan])                 

In [12]:
def dim_number_jaggedArray(arr):
    dimensions = 0
    arr_copy = arr.copy()

    while True:
        try:
            _ = arr_copy[0]
            dimensions += 1
            arr_copy = arr_copy[0]
        except (TypeError, IndexError):
            break

    return dimensions


def find_jagged_dimension(arr):
    if arr.ndim != dim_number_jaggedArray(arr):
        return arr.ndim+1
    else:
        return -1
    
print(dim_number_jaggedArray(sequences))
print(find_jagged_dimension(sequences))


3
2


In [13]:
# # tentative padding manuel - NOT FULLY OPERATIONAL YET
# # Padding
# padded_sequences = []
# for sequence in sequences:
#     if sequence.shape[0] < max_timesteps:
#         padded_sequence = np.pad(sequence, ((0, max_timesteps - sequence.shape[0]), (0, 0)), mode='constant')
#     else:
#         padded_sequence = sequence[:max_timesteps, :]
#     padded_sequences.append(padded_sequence)

# # Convertir en tableau numpy
# padded_sequences = np.array(padded_sequences)
# labels = np.array(labels)


In [14]:
for i in range(sequences.size):
        sequences[i] = sequences[i][:-1]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(sequences, labels, test_size=0.05)

In [16]:
print(X_train[0])

[array([ 0.56375277,  0.26887751, -0.02029927, ...,  0.56457007,
         0.98777699, -0.04537779])
 array([ 0.5647648 ,  0.27147025, -0.01931617, ...,  0.56294882,
         0.98698252, -0.04904999])
 array([ 0.5640043 ,  0.26959994, -0.01931689, ...,  0.5615651 ,
         0.98782015, -0.05095621])
 array([ 0.56383002,  0.27022573, -0.01931118, ...,  0.56063128,
         0.98815638, -0.04991805])
 array([ 0.56456256,  0.26955354, -0.0190877 , ...,  0.56033969,
         0.98862636, -0.0497687 ])
 array([ 0.56465429,  0.26908657, -0.01919399, ...,  0.56090879,
         0.99002552, -0.04988189])
 array([ 0.56418586,  0.26867801, -0.01889281, ...,  0.56136966,
         0.98905998, -0.04781205])
 array([ 0.56392151,  0.26795039, -0.0190555 , ...,  0.56131464,
         0.98852241, -0.04584948])
 array([ 0.5639267 ,  0.26724663, -0.01893209, ...,  0.56150538,
         0.98841393, -0.04465349])
 array([ 0.56366479,  0.26658541, -0.01917436, ...,  0.56176245,
         0.98637366, -0.04475087])


In [17]:
r_X_train=tf.ragged.constant(X_train)
print(r_X_train.shape)

(2, None, None)


In [20]:
# replace nan with 0. float64 in the ragged tensor to be able to compute loss and accuracy in the model fitting
def replace_nan_with_zero(ragged_tensor):
    nan_mask = tf.math.is_nan(ragged_tensor.values)
    zero_tensor = tf.zeros_like(ragged_tensor.values, dtype=tf.float64)
    filled_values = tf.where(nan_mask, zero_tensor, ragged_tensor.values)
    filled_ragged_tensor = tf.RaggedTensor.from_row_lengths(filled_values, ragged_tensor.row_lengths())
    return filled_ragged_tensor

no_nan_r_X_train=replace_nan_with_zero(r_X_train)

no_nan_r_X_train

<tf.RaggedTensor [[[0.5637527704238892, 0.2688775062561035, -0.0202992744743824, ...,
   0.5645700693130493, 0.9877769947052002, -0.045377787202596664],
  [0.5647647976875305, 0.27147024869918823, -0.0193161740899086, ...,
   0.5629488229751587, 0.986982524394989, -0.049049992114305496],
  [0.5640043020248413, 0.26959994435310364, -0.019316894933581352, ...,
   0.561565101146698, 0.9878201484680176, -0.050956208258867264],
  ...,
  [0.5438551306724548, 0.2799321413040161, -0.021055709570646286, ..., 0.0,
   0.0, 0.0],
  [0.5433005094528198, 0.28053921461105347, -0.02107234299182892, ..., 0.0,
   0.0, 0.0],
  [0.5425605177879333, 0.2802666127681732, -0.021331867203116417, ..., 0.0,
   0.0, 0.0]]                                                              ,
 [[0.49909651279449463, 0.2794588506221771, -0.021986933425068855, ...,
   0.0, 0.0, 0.0],
  [0.4980142116546631, 0.27918899059295654, -0.021430879831314087, ...,
   0.0, 0.0, 0.0],
  [0.49669134616851807, 0.27930447459220886, -0.021

In [18]:
y_train_cat = to_categorical(y_train).astype(int)
y_train_cat.shape

(2, 1284)

## LSTM Model

In [68]:
# TensorBoard to visualize model performances

log_dir_path = r'logs'
print(os.path.exists(log_dir_path))
tb_callback = TensorBoard(log_dir='C:\\Users\\Elève\\Documents\\GitHub\\Sign-Language--ASL--recognition\\logs')
if os.access('C:\\Users\\Elève\\Documents\\GitHub\\Sign-Language--ASL--recognition\\logs', os.W_OK):
    print("Le répertoire a les permissions d'accès en écriture.")
else:
    print("Le répertoire n'a pas les permissions d'accès en écriture.")

True
Le répertoire a les permissions d'accès en écriture.


In [43]:
model = tf.keras.Sequential()
model.add(Input(shape=(None,1629),ragged=True))
model.add(LSTM(64, return_sequences=True, activation='relu'))
model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(LSTM(64, return_sequences=False, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(y_train_cat.shape[1], activation='softmax'))

model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [65]:
# Model training with TensorBoard callback = data for visualization
model.fit(no_nan_r_X_train, y_train_cat, epochs=2000, callbacks=[tb_callback])

# Model summary
model.summary()

FailedPreconditionError: {{function_node __wrapped__CreateSummaryFileWriter_device_/job:localhost/replica:0/task:0/device:CPU:0}} C:\Users\Elève\Documents\GitHub\Sign-Language--ASL--recognition\logs is not a directory [Op:CreateSummaryFileWriter]