In [1]:
# 
!pip install mediapipe

Collecting mediapipe
  Downloading mediapipe-0.10.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting sounddevice>=0.4.4 (from mediapipe)
  Downloading sounddevice-0.4.6-py3-none-any.whl (31 kB)
Installing collected packages: sounddevice, mediapipe
Successfully installed mediapipe-0.10.2 sounddevice-0.4.6
[0m

In [2]:
import os
import shutil
import numpy as np
import pandas as pd
import pyarrow.parquet as pq # to read in parquet files
import tensorflow as tf
import json
import mediapipe
import matplotlib
import matplotlib.pyplot as plt
import random

from skimage.transform import resize
from mediapipe.framework.formats import landmark_pb2
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.losses import BinaryCrossentropy
from tqdm.notebook import tqdm
from matplotlib import animation, rc

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [3]:
# Report the library versions this notebook was executed with.
print(f"TensorFlow v{tf.__version__}")
print(f"Mediapipe v{mediapipe.__version__}")

TensorFlow v2.12.0
Mediapipe v0.10.2


In [4]:
# the processing code is from https://www.kaggle.com/code/gusthema/asl-fingerspelling-recognition-w-tensorflow

# Load the data set

# train.csv holds one row per recorded phrase (sequence).
dataset_df = pd.read_csv('/kaggle/input/asl-fingerspelling/train.csv')
print(f"Full train dataset shape is {dataset_df.shape}")

print("The first few rows of the train.csv frame is: ")
dataset_df.head()

Full train dataset shape is (67208, 5)
The first few rows of the train.csv frame is: 


Unnamed: 0,path,file_id,sequence_id,participant_id,phrase
0,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse
1,train_landmarks/5414471.parquet,5414471,1816825349,107,scales/kuhaylah
2,train_landmarks/5414471.parquet,5414471,1816909464,1,1383 william lanier
3,train_landmarks/5414471.parquet,5414471,1816967051,63,988 franklin lane
4,train_landmarks/5414471.parquet,5414471,1817123330,89,6920 northeast 661st road


In [5]:
# train.csv is an index: each of its 67208 rows describes one signed phrase (a "sequence")
# and names the parquet file that stores the landmark frames of that sequence.

# Fetch the frames of the first sequence, which represents the phrase "3 creekhouse".
first_row = dataset_df.iloc[0]
sequence_id, file_id, phrase = first_row[['sequence_id', 'file_id', 'phrase']]
sample_table = pq.read_table(
    f"/kaggle/input/asl-fingerspelling/train_landmarks/{str(file_id)}.parquet",
    filters=[[('sequence_id', '=', sequence_id)], ],  # keep only the rows of this sequence
)
sample_sequence_df = sample_table.to_pandas()

print("The full sequence data frame is:", sample_sequence_df, end="\n")


The full sequence data frame is:              frame  x_face_0  x_face_1  x_face_2  x_face_3  x_face_4  \
sequence_id                                                            
1816796431       0  0.710588  0.699951  0.705657  0.691768  0.699669   
1816796431       1  0.709525  0.697582  0.703713  0.691016  0.697576   
1816796431       2  0.711059  0.700858  0.706272  0.693285  0.700825   
1816796431       3  0.712799  0.702518  0.707840  0.694899  0.702445   
1816796431       4  0.712349  0.705451  0.709918  0.696006  0.705180   
...            ...       ...       ...       ...       ...       ...   
1816796431     118  0.700922  0.689774  0.695984  0.679756  0.688836   
1816796431     119  0.700576  0.692017  0.697875  0.682405  0.691249   
1816796431     120  0.700621  0.690338  0.696792  0.680982  0.689429   
1816796431     121  0.698651  0.693153  0.699358  0.683020  0.692136   
1816796431     122  0.698450  0.691408  0.697766  0.681728  0.690405   

             x_face_5  x_face_

In [6]:

# Inspect the column layout. There are four landmark types: face, left_hand, right_hand
# and pose; face has the most columns.
column_names = sample_sequence_df.columns.tolist()
print(column_names)  # 1630 columns in the recorded run: body landmarks + landmark index.

# Counts in the recorded run: face 1404, right_hand 63, left_hand 63, pose 99.
for landmark_type in ('face', 'right_hand', 'left_hand', 'pose'):
    n_columns = sum(landmark_type in col for col in column_names)
    print(f"the number of columns representing {landmark_type} landmarks: ", n_columns)


['frame', 'x_face_0', 'x_face_1', 'x_face_2', 'x_face_3', 'x_face_4', 'x_face_5', 'x_face_6', 'x_face_7', 'x_face_8', 'x_face_9', 'x_face_10', 'x_face_11', 'x_face_12', 'x_face_13', 'x_face_14', 'x_face_15', 'x_face_16', 'x_face_17', 'x_face_18', 'x_face_19', 'x_face_20', 'x_face_21', 'x_face_22', 'x_face_23', 'x_face_24', 'x_face_25', 'x_face_26', 'x_face_27', 'x_face_28', 'x_face_29', 'x_face_30', 'x_face_31', 'x_face_32', 'x_face_33', 'x_face_34', 'x_face_35', 'x_face_36', 'x_face_37', 'x_face_38', 'x_face_39', 'x_face_40', 'x_face_41', 'x_face_42', 'x_face_43', 'x_face_44', 'x_face_45', 'x_face_46', 'x_face_47', 'x_face_48', 'x_face_49', 'x_face_50', 'x_face_51', 'x_face_52', 'x_face_53', 'x_face_54', 'x_face_55', 'x_face_56', 'x_face_57', 'x_face_58', 'x_face_59', 'x_face_60', 'x_face_61', 'x_face_62', 'x_face_63', 'x_face_64', 'x_face_65', 'x_face_66', 'x_face_67', 'x_face_68', 'x_face_69', 'x_face_70', 'x_face_71', 'x_face_72', 'x_face_73', 'x_face_74', 'x_face_75', 'x_face_76',

In [7]:
# for full explanation of what each face landmark index represents: https://developers.google.com/mediapipe/solutions/vision/face_landmarker
# for full explanation of what each hand landmark index represents: https://developers.google.com/mediapipe/solutions/vision/hand_landmarker
# for full explanation of what each pose landmark index represents: https://developers.google.com/mediapipe/solutions/vision/pose_landmarker

In [8]:
# Get the saved TFRecord files into a list

# One TFRecord path per distinct parquet file id, in order of first appearance in train.csv.
tfrecord_paths = dataset_df["file_id"].map(lambda fid: f'/kaggle/working/preprocessed/{fid}.tfrecord')
tf_records = tfrecord_paths.unique()
print(f"List of {len(tf_records)} TFRecord files")

List of 68 TFRecord files


In [9]:
# mediapipe apis allow us to visualize the hand landmarks data

# Function create animation from images
matplotlib.rcParams['animation.embed_limit'] = 2**128 # effectively removes the size cap on animations embedded in the notebook.
matplotlib.rcParams['savefig.pad_inches'] = 0 # no padding between the image and the edge of a saved figure.
rc('animation', html='jshtml') # render animations inline with the interactive JavaScript/HTML ("jshtml") player.

def create_animation(images):
    """Build a matplotlib animation that plays `images` at 10 frames per second.

    Args:
        images: indexable sequence of image arrays; the first one initialises the plot.

    Returns:
        matplotlib.animation.FuncAnimation showing one image per frame on a
        borderless, axis-free 6x9 inch figure.
    """
    figure = plt.figure(figsize=(6, 9))
    # A single axes covering the whole figure, with ticks and frame hidden.
    axes = plt.Axes(figure, [0., 0., 1., 1.])
    axes.set_axis_off()
    figure.add_axes(axes)
    frame_artist = axes.imshow(images[0], cmap='gray')
    # Close the figure so the notebook does not also render it as a static plot.
    plt.close(figure)

    def _show_frame(frame_idx):
        frame_artist.set_array(images[frame_idx])
        return [frame_artist]

    frame_interval_ms = 1000 / 10
    # `animation` is the matplotlib.animation module imported at the top of the notebook.
    return animation.FuncAnimation(figure, _show_frame, frames=len(images), interval=frame_interval_ms)



In [10]:
# Extract the landmark data and convert it to an image using the mediapipe library.
# This function extracts the data for both hands.

mp_pose = mediapipe.solutions.pose
mp_hands = mediapipe.solutions.hands
mp_drawing = mediapipe.solutions.drawing_utils 
mp_drawing_styles = mediapipe.solutions.drawing_styles

def _draw_hand(frame_row, side):
    """Render one hand of one frame onto a blank 600x600 image.

    Args:
        frame_row: one row of the sequence DataFrame (a Series indexed by column name).
        side: "right" or "left"; selects the `x_/y_/z_{side}_hand*` columns.

    Returns:
        (image, landmarks): the drawn image (float array, not yet cast to uint8) and the
        NormalizedLandmarkList built from the row's coordinates.
    """
    x_hand = frame_row.filter(regex=f"x_{side}_hand.*").values
    y_hand = frame_row.filter(regex=f"y_{side}_hand.*").values
    z_hand = frame_row.filter(regex=f"z_{side}_hand.*").values

    hand_image = np.zeros((600, 600, 3))

    hand_landmarks = landmark_pb2.NormalizedLandmarkList()
    for x, y, z in zip(x_hand, y_hand, z_hand):
        hand_landmarks.landmark.add(x=x, y=y, z=z)

    mp_drawing.draw_landmarks(
            hand_image,
            hand_landmarks,
            mp_hands.HAND_CONNECTIONS,
            landmark_drawing_spec=mp_drawing_styles.get_default_hand_landmarks_style())
    return hand_image, hand_landmarks


def get_hands(seq_df):
    """Draw the right and left hand landmarks of every frame in a sequence.

    Args:
        seq_df: DataFrame with one row per frame and `{x,y,z}_{right,left}_hand_*` columns.

    Returns:
        (images, all_hand_landmarks): two lists with one entry per frame. Each entry of
        `images` is `[right_image, left_image]` (uint8 arrays of shape (600, 600, 3));
        each entry of `all_hand_landmarks` is `[right_landmarks, left_landmarks]`.
    """
    images = []
    all_hand_landmarks = []
    for seq_idx in range(len(seq_df)):
        # Look the row up once and reuse it for both hands.
        frame_row = seq_df.iloc[seq_idx]
        right_hand_image, right_hand_landmarks = _draw_hand(frame_row, "right")
        left_hand_image, left_hand_landmarks = _draw_hand(frame_row, "left")

        images.append([right_hand_image.astype(np.uint8), left_hand_image.astype(np.uint8)])
        all_hand_landmarks.append([right_hand_landmarks, left_hand_landmarks])
    return images, all_hand_landmarks

# Visualize the hand landmarks for the first phrase '3 creekhouse'

In [11]:
!pip install ffmpeg-python

Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Installing collected packages: ffmpeg-python
Successfully installed ffmpeg-python-0.2.0
[0m

In [12]:
# Render every frame of the sample sequence with the mediapipe drawing helpers.
hand_images, hand_landmarks = get_hands(sample_sequence_df)
# Each frame holds a [right, left] image pair; column 0 selects the right-hand images.
right_hand_frames = np.array(hand_images)[:, 0]
anim_0 = create_animation(right_hand_frames)
# The animation is saved to an .mp4 file with the ffmpeg writer in the next cell.

In [13]:
# Write the first phrase's right-hand animation to an MP4 file at 10 frames per second.
# `writervideo` is reused by the later save cells.
writervideo = animation.FFMpegWriter(fps=10)
anim_0.save('3_creekhouse.mp4', writer=writervideo)

# Visualize the hand landmarks for the second phrase


In [14]:

# Second row of train.csv: load only that sequence's frames from its parquet file.
second_row = dataset_df.iloc[1]
sequence_id_2, file_id_2, phrase_2 = second_row[['sequence_id', 'file_id', 'phrase']]
sample_table_2 = pq.read_table(
    f"/kaggle/input/asl-fingerspelling/train_landmarks/{str(file_id_2)}.parquet",
    filters=[[('sequence_id', '=', sequence_id_2)], ],  # keep only the rows of this sequence
)
sample_sequence_df_2 = sample_table_2.to_pandas()
print("The full sequence data frame is:", sample_sequence_df_2, end="\n")
print("Phrase name: ", phrase_2)

# Draw both hands for every frame, then animate the right-hand images (column 0).
hand_images_2, hand_landmarks_2 = get_hands(sample_sequence_df_2)
right_hand_frames_2 = np.array(hand_images_2)[:, 0]
anim_1 = create_animation(right_hand_frames_2)

The full sequence data frame is:              frame  x_face_0  x_face_1  x_face_2  x_face_3  x_face_4  \
sequence_id                                                            
1816825349       0  0.576603  0.572630  0.571910  0.561432  0.573156   
1816825349       1  0.577659  0.572186  0.571878  0.560978  0.572593   
1816825349       2  0.578764  0.571128  0.571279  0.560064  0.571451   
1816825349       3  0.577037  0.572955  0.572655  0.561425  0.573298   
1816825349       4  0.577897  0.573944  0.573857  0.562747  0.574342   
...            ...       ...       ...       ...       ...       ...   
1816825349     122  0.589847  0.587060  0.585885  0.574620  0.587475   
1816825349     123  0.590409  0.586016  0.585070  0.573692  0.586407   
1816825349     124  0.589767  0.586228  0.585293  0.574049  0.586645   
1816825349     125  0.589624  0.585901  0.585115  0.573770  0.586331   
1816825349     126  0.586389  0.586615  0.585369  0.574449  0.587103   

             x_face_5  x_face_

In [15]:
# Save the second phrase's animation, reusing the FFMpegWriter created in an earlier cell.
anim_1.save('scaleskuhaylah.mp4', writer=writervideo)

# Visualize the hand landmarks for the third phrase

In [16]:
# Third row of train.csv: load only that sequence's frames from its parquet file.
third_row = dataset_df.iloc[2]
sequence_id_3, file_id_3, phrase_3 = third_row[['sequence_id', 'file_id', 'phrase']]
sample_table_3 = pq.read_table(
    f"/kaggle/input/asl-fingerspelling/train_landmarks/{str(file_id_3)}.parquet",
    filters=[[('sequence_id', '=', sequence_id_3)], ],  # keep only the rows of this sequence
)
sample_sequence_df_3 = sample_table_3.to_pandas()
print("The full sequence data frame is:", sample_sequence_df_3, end="\n")
print("Phrase name: ", phrase_3)

# Draw both hands for every frame, then animate the right-hand images (column 0).
hand_images_3, hand_landmarks_3 = get_hands(sample_sequence_df_3)
right_hand_frames_3 = np.array(hand_images_3)[:, 0]
anim_2 = create_animation(right_hand_frames_3)

The full sequence data frame is:              frame  x_face_0  x_face_1  x_face_2  x_face_3  x_face_4  \
sequence_id                                                            
1816909464       0  0.633601  0.618904  0.623231  0.606461  0.618572   
1816909464       1  0.635823  0.636086  0.636008  0.621483  0.636530   
1816909464       2  0.637647  0.630321  0.632407  0.615979  0.630152   
1816909464       3  0.639373  0.631797  0.633379  0.618251  0.631952   
1816909464       4  0.643489  0.633707  0.634779  0.620286  0.633956   
...            ...       ...       ...       ...       ...       ...   
1816909464     231  0.635170  0.631320  0.632036  0.616967  0.631535   
1816909464     232  0.635081  0.629520  0.630180  0.615285  0.629818   
1816909464     233  0.634258  0.627670  0.629056  0.614137  0.627872   
1816909464     234  0.632718  0.625626  0.627087  0.612802  0.626109   
1816909464     235  0.636664  0.631540  0.632232  0.617529  0.631941   

             x_face_5  x_face_

In [17]:
# Save the third phrase's animation, reusing the FFMpegWriter created in an earlier cell.
anim_2.save('1383 william lanier.mp4', writer=writervideo)

# Preprocess the data

Per [this notebook](https://www.kaggle.com/code/gusthema/asl-fingerspelling-recognition-w-tensorflow#Fetch-the-pose-landmark-coordinates-related-to-hand-movement.), we should arrange the data so that each saved sequence contains not only the landmark coordinates but also the phrase it represents.

The new data should be saved as TFRecord format.
The sample notebook focusses on hand movement and pose coordinates.

# Fetch the pose landmark coordinates related to hand movement

In [18]:
# Pose landmark indices used for hand movement, split by body side.
LPOSE = [13, 15, 17, 19, 21]
RPOSE = [14, 16, 18, 20, 22]

# Left-side indices first, then right-side: [13, 15, 17, 19, 21, 14, 16, 18, 20, 22].
POSE = [*LPOSE, *RPOSE]
# Note: there are 33 pose landmarks per axis (99 pose columns / 3 axes). However, only the
# 10 pose indices above are related to the hand, arm, and shoulder.
print(POSE)

[13, 15, 17, 19, 21, 14, 16, 18, 20, 22]


In [19]:
# Column names per axis: the 21 right-hand landmarks, then the 21 left-hand landmarks,
# then the selected pose landmarks. X, Y and Z differ only in the axis prefix.
X, Y, Z = (
    [f"{axis}_right_hand_{i}" for i in range(21)]
    + [f"{axis}_left_hand_{i}" for i in range(21)]
    + [f"{axis}_pose_{i}" for i in POSE]
    for axis in "xyz"
)


In [20]:
# All feature columns, grouped by axis: every x column, then every y column, then every z column.
FEATURE_COLUMNS = [*X, *Y, *Z]
print(FEATURE_COLUMNS)

['x_right_hand_0', 'x_right_hand_1', 'x_right_hand_2', 'x_right_hand_3', 'x_right_hand_4', 'x_right_hand_5', 'x_right_hand_6', 'x_right_hand_7', 'x_right_hand_8', 'x_right_hand_9', 'x_right_hand_10', 'x_right_hand_11', 'x_right_hand_12', 'x_right_hand_13', 'x_right_hand_14', 'x_right_hand_15', 'x_right_hand_16', 'x_right_hand_17', 'x_right_hand_18', 'x_right_hand_19', 'x_right_hand_20', 'x_left_hand_0', 'x_left_hand_1', 'x_left_hand_2', 'x_left_hand_3', 'x_left_hand_4', 'x_left_hand_5', 'x_left_hand_6', 'x_left_hand_7', 'x_left_hand_8', 'x_left_hand_9', 'x_left_hand_10', 'x_left_hand_11', 'x_left_hand_12', 'x_left_hand_13', 'x_left_hand_14', 'x_left_hand_15', 'x_left_hand_16', 'x_left_hand_17', 'x_left_hand_18', 'x_left_hand_19', 'x_left_hand_20', 'x_pose_13', 'x_pose_15', 'x_pose_17', 'x_pose_19', 'x_pose_21', 'x_pose_14', 'x_pose_16', 'x_pose_18', 'x_pose_20', 'x_pose_22', 'y_right_hand_0', 'y_right_hand_1', 'y_right_hand_2', 'y_right_hand_3', 'y_right_hand_4', 'y_right_hand_5', 'y_r

In [21]:
# Positions (within FEATURE_COLUMNS) of the columns of each axis. The axis is the
# column-name prefix, so it is matched at the start of the name only.
X_IDX = [i for i, col in enumerate(FEATURE_COLUMNS) if col.startswith("x_")]
Y_IDX = [i for i, col in enumerate(FEATURE_COLUMNS) if col.startswith("y_")]
Z_IDX = [i for i, col in enumerate(FEATURE_COLUMNS) if col.startswith("z_")]


def _landmark_id(col):
    """Return the landmark number that ends a column name, e.g. "x_pose_13" -> 13."""
    return int(col.rsplit("_", 1)[1])


# Positions of the columns of each landmark group: right hand, left hand, and the
# pose points of the right (RPOSE) and left (LPOSE) side.
RHAND_IDX = [i for i, col in enumerate(FEATURE_COLUMNS) if "right" in col]
LHAND_IDX = [i for i, col in enumerate(FEATURE_COLUMNS) if "left" in col]
RPOSE_IDX = [i for i, col in enumerate(FEATURE_COLUMNS) if "pose" in col and _landmark_id(col) in RPOSE]
# A pose column belongs to the left side when its landmark number is in LPOSE ([13, 15, 17, 19, 21]).
LPOSE_IDX = [i for i, col in enumerate(FEATURE_COLUMNS) if "pose" in col and _landmark_id(col) in LPOSE]

print(LPOSE_IDX)

# The last two characters of a single-digit column are "_0", not a number, which is why
# the landmark number is parsed by splitting on "_" rather than slicing.
print(FEATURE_COLUMNS[0][-2:])
print(LPOSE)


[42, 43, 44, 45, 46, 94, 95, 96, 97, 98, 146, 147, 148, 149, 150]
_0
[13, 15, 17, 19, 21]


# Preprocess and write the dataset as TFRecord

# Using the extracted landmarks and phrases 

Note: `train.csv` lists 67208 sequences, stored across 68 parquet files (not 67208 files). The loop below processes every file; to speed up prototyping, restrict it to a subset of the file ids.

In [22]:
# List the distinct parquet file ids referenced by train.csv; the next cell writes one TFRecord file per id.
print(dataset_df.file_id.unique())

[   5414471  105143404  128822441  149822653  152029243  169560558
  175396851  234418913  296317215  349393104  388576474  425182931
  433948159  450474571  474255203  495378749  522550314  527708222
  532011803  546816846  566963657  568753759  614661748  638508439
  649779897  654436541  683666742  871280215  882979387  933868835
  939623093 1019715464 1021040628 1098899348 1099408314 1133664520
 1134756332 1255240050 1320204318 1341528257 1358493307 1365275733
 1365772051 1405046009 1448136004 1497621680 1552432300 1557244878
 1562234637 1643479812 1647220008 1662742697 1664666588 1726141437
 1785039512 1865557033 1880177496 1905462118 1906357076 1920330615
 1967755728 1969985709 1997878546 2026717426 2036580525 2072296290
 2072876091 2118949241]


In [23]:
# Set length of frames to 128

FRAME_LEN = 128

# Start from an empty "preprocessed" directory: remove the output of any previous run, then recreate it.
if os.path.isdir("preprocessed"):
    shutil.rmtree("preprocessed")
os.mkdir("preprocessed")

# Write one TFRecord file per parquet file. The tqdm wrapper shows progress.
for file_id in tqdm(dataset_df.file_id.unique()):
    # parquet file name
    pq_file = f"/kaggle/input/asl-fingerspelling/train_landmarks/{file_id}.parquet"
    # Rows of train.csv (sequence ids and phrases) that live in this parquet file.
    file_df = dataset_df.loc[dataset_df['file_id'] == file_id]

    # Load only the feature columns. The code below relies on sequence_id being the
    # DataFrame index after to_pandas(), so the columns line up with FEATURE_COLUMNS.
    parquet_df = pq.read_table(pq_file, columns=['sequence_id'] + FEATURE_COLUMNS).to_pandas()

    # File name for the converted data
    tf_file = f"preprocessed/{file_id}.tfrecord"
    parquet_numpy = parquet_df.to_numpy()

    # Every kept sequence is written as one serialized Example into this file.
    with tf.io.TFRecordWriter(tf_file) as file_writer:
        for seq_id, phrase in zip(file_df.sequence_id, file_df.phrase):
            # Frames (rows) that belong to this sequence.
            frames = parquet_numpy[parquet_df.index == seq_id]

            # Number of frames in which the right / left hand has no missing (NaN) coordinate.
            r_nonan = np.sum(np.sum(np.isnan(frames[:, RHAND_IDX]), axis = 1) == 0)
            l_nonan = np.sum(np.sum(np.isnan(frames[:, LHAND_IDX]), axis = 1) == 0)

            no_nan = max(r_nonan, l_nonan)

            # Keep the sequence only if the better-tracked hand is complete in more
            # than two frames per character of the phrase.
            if 2*len(phrase) < no_nan:
                # One float list per feature column (one value per frame), plus the phrase as bytes.
                features = {FEATURE_COLUMNS[i]: tf.train.Feature(
                    float_list = tf.train.FloatList(value=frames[:, i])) for i in range(len(FEATURE_COLUMNS))}
                features["phrase"] = tf.train.Feature(bytes_list = tf.train.BytesList(value = [bytes(phrase, 'utf-8')]))
                record_bytes = tf.train.Example(features = tf.train.Features(feature = features)).SerializeToString()
                file_writer.write(record_bytes)
            
            

  0%|          | 0/68 [00:00<?, ?it/s]

# Load character_to_prediction json file

The json file contains a character and its value. We will add 3 new characters, "<" and ">" to mark the start and end of a phrase and "P" for padding.



In [24]:
with open("/kaggle/input/asl-fingerspelling/character_to_prediction_index.json", "r") as f:
    char_to_num = json.load(f)

# char_to_num maps each character to its numeric label. It has no entries for padding or
# for the start/end of a phrase, so those three tokens are added here.
pad_token = 'P'
start_token = '<'
end_token = '>'

# Refuse to silently overwrite a character that already has a label.
clashing_tokens = {pad_token, start_token, end_token} & set(char_to_num)
if clashing_tokens:
    raise ValueError(f"special tokens already present in character map: {sorted(clashing_tokens)}")

# Use the first three integers after the largest existing label (59, 60, 61 for the
# character map shown in the output below), so no index is reused.
next_free_idx = max(char_to_num.values()) + 1
pad_token_idx = next_free_idx
start_token_idx = next_free_idx + 1
end_token_idx = next_free_idx + 2

char_to_num[pad_token] = pad_token_idx
char_to_num[start_token] = start_token_idx
char_to_num[end_token] = end_token_idx

# Inverse mapping (label -> character); needed to turn predicted label indices back into text.
num_to_char = {idx: char for char, idx in char_to_num.items()}
print(num_to_char)


{0: ' ', 1: '!', 2: '#', 3: '$', 4: '%', 5: '&', 6: "'", 7: '(', 8: ')', 9: '*', 10: '+', 11: ',', 12: '-', 13: '.', 14: '/', 15: '0', 16: '1', 17: '2', 18: '3', 19: '4', 20: '5', 21: '6', 22: '7', 23: '8', 24: '9', 25: ':', 26: ';', 27: '=', 28: '?', 29: '@', 30: '[', 31: '_', 32: 'a', 33: 'b', 34: 'c', 35: 'd', 36: 'e', 37: 'f', 38: 'g', 39: 'h', 40: 'i', 41: 'j', 42: 'k', 43: 'l', 44: 'm', 45: 'n', 46: 'o', 47: 'p', 48: 'q', 49: 'r', 50: 's', 51: 't', 52: 'u', 53: 'v', 54: 'w', 55: 'x', 56: 'y', 57: 'z', 58: '~', 59: 'P', 60: '<', 61: '>'}


In [25]:
def resize_pad(x):
    """Bring a sequence to exactly FRAME_LEN frames along its first axis.

    Sequences with fewer than FRAME_LEN frames are zero-padded at the end. All other
    sequences are resampled to FRAME_LEN rows with `tf.image.resize` (bilinear by
    default), keeping the size of the second axis.

    Args:
        x: rank-3 tensor whose first axis is the frame axis.

    Returns:
        Tensor with FRAME_LEN entries along the first axis.
    """
    n_frames = tf.shape(x)[0]
    if n_frames >= FRAME_LEN:
        x = tf.image.resize(x, (FRAME_LEN, tf.shape(x)[1]))
    else:
        x = tf.pad(x, ([[0, FRAME_LEN - n_frames], [0, 0], [0, 0]]))
    return x

def pre_process(x):
    """Turn the raw landmark frames of one sequence into a fixed-size model input.

    Steps:
    - Pick the dominant hand: the hand with fewer frames containing a NaN coordinate
      (the right hand wins ties), together with the pose points of the same side.
    - If the left hand is dominant, mirror its x coordinates (x -> 1 - x), and those of
      the left pose points.
    - Standardise the hand coordinates per frame and per axis (subtract the mean and
      divide by the standard deviation taken over the hand's landmarks).
    - Resize/pad to FRAME_LEN frames, replace NaN with 0 and flatten each frame.

    Args:
        x: tensor with one row per frame and one column per entry of FEATURE_COLUMNS.

    Returns:
        Tensor of shape (FRAME_LEN, len(LHAND_IDX) + len(LPOSE_IDX)).
    """
    n_hand = len(LHAND_IDX) // 3  # landmarks per hand
    n_pose = len(LPOSE_IDX) // 3  # pose points per side

    right_hand = tf.gather(x, RHAND_IDX, axis=1)
    left_hand = tf.gather(x, LHAND_IDX, axis=1)
    right_pose = tf.gather(x, RPOSE_IDX, axis=1)
    left_pose = tf.gather(x, LPOSE_IDX, axis=1)

    # For each hand, count the frames in which at least one coordinate is missing.
    right_missing = tf.math.count_nonzero(tf.reduce_any(tf.math.is_nan(right_hand), axis=1))
    left_missing = tf.math.count_nonzero(tf.reduce_any(tf.math.is_nan(left_hand), axis=1))

    if right_missing > left_missing:
        # Left hand is dominant: mirror the x block, keep the y and z blocks as they are.
        hand = tf.concat(
            [1 - left_hand[:, 0:n_hand], left_hand[:, n_hand:2*n_hand], left_hand[:, 2*n_hand:3*n_hand]],
            axis=1)
        pose = tf.concat(
            [1 - left_pose[:, 0:n_pose], left_pose[:, n_pose:2*n_pose], left_pose[:, 2*n_pose:3*n_pose]],
            axis=1)
    else:
        hand = right_hand
        pose = right_pose

    # Columns are laid out as [all x | all y | all z]; regroup them so the last axis
    # holds the (x, y, z) triple of each landmark.
    hand = tf.stack(
        [hand[:, 0:n_hand], hand[:, n_hand:2*n_hand], hand[:, 2*n_hand:3*n_hand]],
        axis=-1)

    # Normalize the hand coordinates over the landmark axis.
    hand_mean = tf.math.reduce_mean(hand, axis=1, keepdims=True)
    hand_std = tf.math.reduce_std(hand, axis=1, keepdims=True)
    hand = (hand - hand_mean) / hand_std

    # Same regrouping for the pose points (no normalization).
    pose = tf.stack(
        [pose[:, 0:n_pose], pose[:, n_pose:2*n_pose], pose[:, 2*n_pose:3*n_pose]],
        axis=-1)

    x = tf.concat([hand, pose], axis=1)
    x = resize_pad(x)

    # Replace every NaN with zero, then flatten each frame. Only the dominant hand and
    # the pose points of its side are kept, hence len(LHAND_IDX) + len(LPOSE_IDX) columns.
    x = tf.where(tf.math.is_nan(x), tf.zeros_like(x), x)
    return tf.reshape(x, (FRAME_LEN, len(LHAND_IDX) + len(LPOSE_IDX)))

# Create function to parse data from TFRecord format

In [26]:
def decode_fn(record_bytes):
    """Parse one serialized Example into its landmark matrix and phrase.

    Every feature column is parsed as a variable-length float32 list and the phrase
    as a single string.

    @param record_bytes: one serialized Example read from a .tfrecord file.
    @return (landmarks, phrase): landmarks is a rank-2 tensor with one column per entry
        of FEATURE_COLUMNS; phrase is a scalar tf.string.
    """
    feature_spec = {name: tf.io.VarLenFeature(dtype=tf.float32) for name in FEATURE_COLUMNS}
    feature_spec["phrase"] = tf.io.FixedLenFeature([], dtype=tf.string)
    parsed = tf.io.parse_single_example(record_bytes, feature_spec)

    # One dense vector per feature column ...
    per_column = [tf.sparse.to_dense(parsed[name]) for name in FEATURE_COLUMNS]
    # ... transposed so that the feature columns become the second axis again.
    landmarks = tf.transpose(per_column)

    return landmarks, parsed["phrase"]

# Create function to convert the data

In [27]:
# Lookup table that maps each character of a phrase to its numeric label from char_to_num.
# It is static: its contents are fixed when it is created.
char_label_initializer = tf.lookup.KeyValueTensorInitializer(
    keys = list(char_to_num),
    values = list(char_to_num.values()),
)
table = tf.lookup.StaticHashTable(
    initializer = char_label_initializer,
    default_value = tf.constant(-1), # returned when the key is not found in the table
    name = "class_weight" # an arbitrary name given to the table.
)

def convert_fn(landmarks, phrase):
    """Prepare one (landmarks, phrase) pair for training.

    The phrase is wrapped in the start and end tokens, split into single bytes, mapped
    to label indices through `table`, right-padded with the pad index to 64 labels and
    one-hot encoded. The landmarks are passed through `pre_process`.

    Returns:
        (features, labels): the pre-processed landmarks and the one-hot encoded phrase.
    """
    max_phrase_len = 64  # every label sequence is padded to this length

    # Split into bytes rather than unicode characters: the phrase is a tf.string tensor.
    tokens = tf.strings.bytes_split(start_token + phrase + end_token)
    labels = table.lookup(tokens)

    n_padding = max_phrase_len - tf.shape(labels)[0]
    labels = tf.pad(labels,
                    paddings=[[0, n_padding]],
                    mode = 'CONSTANT',
                    constant_values = pad_token_idx)
    labels = tf.one_hot(labels, depth=len(num_to_char), axis=-1)

    return pre_process(landmarks), labels
    

# Train and validation split/ Create the final datasets

In [28]:
# The whole data set cannot be loaded into memory at once, so it is streamed from the
# TFRecord files in batches of 64. The files are split roughly 70% / 15% / 15% into
# train, validation and test sets.

batch_size = 64
train_len = int(0.7 * len(tf_records))
valid_len = int(0.15 * len(tf_records))
# The test set takes every remaining file, so its size is the remainder of the split.
test_len = len(tf_records) - train_len - valid_len


def make_dataset(record_files, batch=batch_size):
    """Build the batched input pipeline for a list of TFRecord files.

    Args:
        record_files: paths of the TFRecord files to read.
        batch: number of sequences per batch.

    Returns:
        tf.data.Dataset of (features, labels) batches. The batches are cached after
        the first pass; prefetch comes last so it overlaps input with training.
    """
    return (
        tf.data.TFRecordDataset(record_files)
        .map(decode_fn)
        .map(convert_fn)
        .batch(batch)
        .cache()
        .prefetch(buffer_size=tf.data.AUTOTUNE)
    )


train_ds = make_dataset(tf_records[:train_len])
valid_ds = make_dataset(tf_records[train_len:train_len+valid_len])
test_ds = make_dataset(tf_records[train_len+valid_len:])


# Note: every element is a (features, labels) batch. Features have shape (batch, 128, 78)
# and dtype float32; labels are the one-hot encoded phrases produced by convert_fn.

# Start of the LSTM model

In [29]:
!pip install python-Levenshtein

[0m

In [31]:
num_classes = len(char_to_num)
# Read the input dimensions from one validation batch. Note: this rebinds batch_size to
# the size of that batch.
batch_size, frame_len, num_features = next(iter(valid_ds))[0].shape

# LSTM encoder-decoder:
# - the first LSTM summarises the whole sequence into one vector,
# - RepeatVector copies that vector FRAME_LEN times,
# - the second LSTM plus the per-step Dense layer emit class probabilities per step,
# - the Lambda keeps every second step, halving the sequence length.
model = keras.Sequential([
    layers.LSTM(75, input_shape=(frame_len, num_features)),
    layers.RepeatVector(FRAME_LEN),
    layers.LSTM(50, return_sequences=True),
    layers.TimeDistributed(layers.Dense(num_classes, activation='softmax')),
    layers.Lambda(lambda x: x[:, ::2, :]),
])

# TODO: find a way to use the Levenshtein distance as a metric. "accuracy" is only a
# generic stand-in; Levenshtein is a custom metric and has to be defined separately.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it must not be wrapped in print().
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 75)                46200     
                                                                 
 repeat_vector (RepeatVector  (None, 128, 75)          0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 128, 50)           25200     
                                                                 
 time_distributed (TimeDistr  (None, 128, 62)          3162      
 ibuted)                                                         
                                                                 
 lambda (Lambda)             (None, 64, 62)            0         
                                                                 
Total params: 74,562
Trainable params: 74,562
Non-traina

In [35]:
def decode_prediction(y_pred):
    """Decode per-step class scores (or one-hot labels) into strings.

    This function is used by the DisplayOutputs callback.

    Args:
        y_pred: array-like of shape (batch, steps, classes), e.g. (None, 64, 62).
            NumPy arrays and eager tensors are both accepted.

    Returns:
        list[str]: one string per batch element, built from the highest-scoring class
        of every step via `num_to_char`. Special tokens are not stripped.
    """
    # Reverse the one-hot encoding: index of the largest score at every step.
    label_ids = np.argmax(np.asarray(y_pred), axis=2)
    return [''.join(num_to_char[int(label)] for label in row) for row in label_ids]
    

In [None]:
class DisplayOutputs(keras.callbacks.Callback):
    """Keras callback that prints target and predicted phrases for one fixed batch."""

    def __init__(
        self, batch, every_n_epochs=4
    ):
        """Store the batch whose predictions are displayed during training.

        Args:
            batch: A test batch; `batch[0]` holds the model inputs and `batch[1]`
                the one-hot encoded target phrases.
            every_n_epochs: Print on epochs whose 0-based index is a multiple of this
                value (default 4, to limit the extra computation).
        """
        super().__init__()
        self.batch = batch
        self.every_n_epochs = every_n_epochs

    def on_epoch_end(self, epoch, logs=None):
        """Print target vs. predicted labels for the stored batch on selected epochs."""
        if epoch % self.every_n_epochs != 0:
            return
        source = self.batch[0]
        target = self.batch[1]
        preds = self.model(source)

        target_labels = decode_prediction(target)
        predicted_labels = decode_prediction(preds)

        for target_label, predicted_label in zip(target_labels, predicted_labels):
            print(f"Target Label: {target_label}\t Predicted Label: {predicted_label}")

In [None]:
# Use the first test batch as the fixed sample whose predictions the DisplayOutputs callback prints.
batch = next(iter(test_ds))
display_cb = DisplayOutputs(batch)

# Number of training epochs. 4 is only for the prototype; change it to 20 for a real run.
# Warning: training may take a lot of time.
num_epochs = 4


# Fit the model on train_ds, validating on valid_ds; `history` keeps the per-epoch metrics.
history = model.fit(train_ds, epochs=num_epochs, validation_data=valid_ds, callbacks=[display_cb])

In [None]:
# Predict on the first batch of test_ds.
first_test_batch = next(iter(test_ds))
next_test_input = first_test_batch[0]  # model inputs only; element 1 holds the labels
y_preds = model.predict(next_test_input)

# Display the predicted labels.
decode_prediction(y_preds)


In [None]:
# Per-epoch training and validation metrics recorded by model.fit.
metrics = history.history
train_loss, train_accuracy = metrics['loss'], metrics['accuracy']
valid_loss, valid_accuracy = metrics['val_loss'], metrics['val_accuracy']

# Print the training and validation loss/accuracy for each epoch.
for epoch in range(num_epochs):
    epoch_report = f"Epoch {epoch+1}:\n Training Loss={train_loss[epoch]} \n Validation Loss={valid_loss[epoch]} \n Training accuracy={train_accuracy[epoch]} \n Validation Accuracy={valid_accuracy[epoch]}\n\n"
    print(epoch_report)

# Accuracy after the last epoch.
final_train_accuracy = train_accuracy[-1]
final_valid_accuracy = valid_accuracy[-1]
print(f"Final Training Accuracy: {final_train_accuracy}")
print(f"Final Validation Accuracy: {final_valid_accuracy}")