In [1]:
import os
import json
import cv2
import mediapipe as mp
import numpy as np
from PIL import Image
from IPython.display import display

from tqdm.auto import tqdm

2024-01-18 12:56:41.068195: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Create a list of lowercase letters from the filenames in the specified directory
letters = [l.split('.')[0].lower() for l in os.listdir("/home/ant/projects/psl/dataset/Videos/alphabet")]

# Define the vocabulary as a list containing '<start>' and '<eos>' tokens, along with the letters
vocabulary = ['<pad>', '<start>', '<eos>'] + letters

# Create a dictionary mapping each vocabulary item to its corresponding index
# Indexing starts from 1, so '<start>' is assigned index 1, '<eos>' is assigned index 2, and so on
vocabulary = {l: i+1 for i, l in enumerate(vocabulary)}

# Display the resulting vocabulary dictionary
vocabulary

{'<pad>': 1,
 '<start>': 2,
 '<eos>': 3,
 'j': 4,
 'r': 5,
 'z': 6,
 't': 7,
 's': 8,
 'n': 9,
 'g': 10,
 'b': 11,
 'l': 12,
 'y': 13,
 'ch': 14,
 'u': 15,
 'ó': 16,
 'd': 17,
 'f': 18,
 'ż': 19,
 'k': 20,
 'e': 21,
 'cz': 22,
 'sz': 23,
 'o': 24,
 'ź': 25,
 'm': 26,
 'ń': 27,
 'ć': 28,
 'c': 29,
 'ę': 30,
 'i': 31,
 'ł': 32,
 'ą': 33,
 'w': 34,
 'h': 35,
 'ś': 36,
 'rz': 37,
 'a': 38,
 'p': 39}

In [3]:
# Function to extract hand landmarks from a video
def landmarks_timeseries(video_path):
    mp_hands = mp.solutions.hands
    hands = mp_hands.Hands()

    # Open the video file for reading
    cap = cv2.VideoCapture(video_path)

    # Get the total number of frames in the video
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # Get the frames per second (fps) of the video
    fps = cap.get(cv2.CAP_PROP_FPS)

    # Set the frame rate for extracting landmarks
    frame_rate = 0.5

    # Calculate the number of frames to skip based on the frame rate
    frames_to_skip = int(fps * frame_rate)

    landmarks_data = []
    current_frame = 0

    # Loop through the frames of the video
    while cap.isOpened():
        # Set the position to the current frame
        cap.set(cv2.CAP_PROP_POS_FRAMES, current_frame)

        # Read the current frame from the video
        ret, frame = cap.read()
        if not ret:
            break

        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Process the frame and get hand landmarks using Mediapipe
        results = hands.process(rgb_frame)
        if results.multi_hand_landmarks:
            hand_landmarks = results.multi_hand_landmarks[0].landmark

            # Append the 3D coordinates of hand landmarks to the list
            landmarks_data.append([[landmark.x, landmark.y, landmark.z] for landmark in hand_landmarks])

        # Move to the next frame based on the frames to skip
        current_frame += frames_to_skip

    # Release the video capture object
    cap.release()

    # Reshape the landmarks data into a 2D array
    landmarks_data = np.array(landmarks_data).reshape(len(landmarks_data), -1)

    return landmarks_data

In [4]:
videos_path = "/home/ant/projects/psl/dataset/Videos/alphabet"
labels = []
landmarks = []
# Iterate through each file in the dynamic alphabet directory
for i, filename in enumerate(tqdm(os.listdir(videos_path))):
    if filename.endswith('.mp4'):
        video_path = os.path.join(videos_path, filename)

        label = filename.split('.')[0].lower()
        label = ['<start>', label, '<eos>']

        # Convert labels to their corresponding vocabulary indices
        label = [vocabulary[l] for l in label]
        labels.append(label)

        # Call the function 'landmarks_timeseries' to get landmarks from the video
        land = landmarks_timeseries(video_path)

        landmarks.append(land)

  0%|          | 0/36 [00:00<?, ?it/s]

I0000 00:00:1705582603.316787   19478 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1705582603.334920   19515 gl_context.cc:344] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 535.146.02), renderer: NVIDIA GeForce GTX 970/PCIe/SSE2
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
  3%|▎         | 1/36 [00:00<00:25,  1.38it/s]I0000 00:00:1705582604.031034   19478 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1705582604.037516   19539 gl_context.cc:344] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 535.146.02), renderer: NVIDIA GeForce GTX 970/PCIe/SSE2
  6%|▌         | 2/36 [00:01<00:25,  1.33it/s]I0000 00:00:1705582604.794730   19478 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1705582604.801071   19556 gl_context.cc:344] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 535.146.02), renderer: NVIDIA GeForce GTX 970/PCIe/SSE2
  6%|▌         | 2/36 [00:02<00:36,  1.09s/it]


KeyboardInterrupt: 

In [5]:
videos_path = "/home/ant/projects/psl/dataset/Videos/words"
labels_words = []
landmarks_words = []
# Iterate through each file in the words directory
for i, filename in enumerate(tqdm(os.listdir(videos_path))):
    if filename.endswith('.mp4'):
        video_path = os.path.join(videos_path, filename)

        # Extract labels from the filename, including '<start>' and '<eos>' tokens
        label = ['<start>'] + list(filename.split('.')[0].lower()) + ['<eos>']

        # Convert labels to their corresponding vocabulary indices
        label = [vocabulary[l] for l in label]

        labels_words.append(label)

        # Call the function 'landmarks_timeseries' to get landmarks from the video
        land = landmarks_timeseries(video_path)
        landmarks_words.append(land)

  0%|          | 0/375 [00:00<?, ?it/s]

I0000 00:00:1705582609.469689   19478 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1705582609.479287   19574 gl_context.cc:344] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 535.146.02), renderer: NVIDIA GeForce GTX 970/PCIe/SSE2
  0%|          | 1/375 [00:01<06:30,  1.04s/it]I0000 00:00:1705582610.510816   19478 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1705582610.517787   19591 gl_context.cc:344] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 535.146.02), renderer: NVIDIA GeForce GTX 970/PCIe/SSE2
  0%|          | 1/375 [00:01<11:11,  1.80s/it]


KeyboardInterrupt: 

In [16]:
# Define a mapping to fix certain characters in the labels
fix = {
    'Ć': 'ć',
    'Ę': 'ę',
    'Ł': 'ł',
    'Ń': 'ń',
    'Ó': 'O',
    'Ś': 'ś',
    'Ź': 'ź',
    'Ż': 'ż',
}

# Function to preprocess data from JSON files in the folder
def preprocess_data(labels_folder):
    labeled_with_landmarks_count = 0
    labeled_without_landmarks_count = 0
    data_rows = []  # List to store data rows
    labels = []  # List to store labels

    # Loop through JSON files in the folder
    for filename in tqdm(os.listdir(labels_folder)):
        if filename.endswith('.json'):
            with open(os.path.join(labels_folder, filename), 'r', encoding='utf-8') as json_file:
                data = json.load(json_file)
                if 'hand_landmarks' in data:
                    labeled_with_landmarks_count += 1

                    # Extract landmarks data and flatten it into a list
                    landmarks_data = data['hand_landmarks']
                    row = []
                    for landmark_key in landmarks_data:
                        landmark = landmarks_data[landmark_key]
                        row.extend([landmark['x'], landmark['y'], landmark['z']])

                    # Extract and preprocess the label
                    l = data['label']
                    if l in fix:
                        l = fix[l]
                    label = ['<start>', l.lower(), '<eos>']
                    label = [vocabulary[l] for l in label]
                    data_rows.append(row)
                    labels.append(label)

                else:
                    labeled_without_landmarks_count += 1

    return data_rows, labels

labels_folder = "/home/ant/projects/psl/dataset/labels"
preprocessed_data, preprocessed_labels = preprocess_data(labels_folder)

100%|██████████| 3626/3626 [00:00<00:00, 18467.71it/s]


In [18]:
labels_folder = '../dataset/labels'
data_static, labels_static = preprocess_data(labels_folder)
data_static = np.array(data_static)

100%|██████████| 3626/3626 [00:00<00:00, 21445.95it/s]


In [19]:
# Create a new list to store the modified static data
data_static_new = []

# Iterate through each element in the original static data
for d in data_static:
    # Repeat the current element along a new axis a random number of times (between 2 and 6)
    d = np.repeat(d.reshape(1, -1), repeats=np.random.randint(2, 7), axis=0)
    data_static_new.append(d)
    

In [20]:
# Concatenate all lists
data = landmarks + landmarks_words + data_static_new 
all_labels = labels + labels_words + labels_static

In [29]:
data = data_static_new 
all_labels = labels_static

In [30]:
import tensorflow as tf

In [42]:
def masked_loss(y_true, y_pred):
    # Initialize SparseCategoricalCrossentropy loss with 'from_logits' and 'reduction' parameters
    loss_function = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    
    # Calculate the cross-entropy loss for each item in the batch
    loss = loss_function(y_true - 1, y_pred)

    # Create a binary mask to filter out padding elements (where y_true is 0)
    mask = tf.cast(y_true != 0, tf.float32)

    # Apply the mask to the calculated losses
    loss *= mask

    return tf.reduce_sum(loss) / tf.reduce_sum(mask)

In [43]:
def accuracy_for_letters(y_true, y_pred):
    result = tf.cast(tf.cast(y_true, tf.int64) == tf.argmax(y_pred, axis=-1), tf.float32)

    # Create a binary mask to filter out padding elements (where y_true is 0)
    mask = tf.cast(y_true != 0, tf.float32)

    # Apply the mask to the calculated losses
    result *= mask

    return tf.reduce_sum(result) / tf.reduce_sum(mask)

In [44]:
# Pad input sequences (data) with zeros using "post" padding
padded_inputs = tf.keras.utils.pad_sequences(data, dtype="float32", padding="post")
padded_outputs = tf.keras.utils.pad_sequences(all_labels, dtype="int32", padding="post")
padded_inputs.shape, padded_outputs.shape

((3262, 6, 63), (3262, 3))

In [45]:
from tensorflow import keras


# Define the input dimension, vocabulary size, and create a Sequential model
input_dim = 63
vocab_size = len(vocabulary)

model = keras.Sequential(
    [
        # Input layer with shape (sequence_length, input_dim)
        keras.Input(shape=(padded_inputs.shape[1], input_dim), dtype="float32"),

        # Masking layer to handle variable-length sequences
        keras.layers.Masking(),

        # LSTM layer with 32 units, returning a single output for each sequence
        keras.layers.LSTM(32, return_sequences=False),

        # Repeat the output vector for each time step in the output sequence
        keras.layers.RepeatVector(padded_outputs.shape[1]),

        # LSTM layer with 64 units, returning a sequence of vectors
        keras.layers.LSTM(64, return_sequences=True),

        # TimeDistributed layer to apply Dense layer to each time step independently
        keras.layers.TimeDistributed(keras.layers.Dense(vocab_size)),
    ]
)

# Display the model summary
model.summary()

# Compile the model using the custom masked loss function and Adam optimizer
model.compile(
    loss=masked_loss,
    optimizer=tf.keras.optimizers.legacy.Adam(1e-3),
    metrics=[accuracy_for_letters]
)

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_5 (Masking)         (None, 6, 63)             0         
                                                                 
 lstm_10 (LSTM)              (None, 32)                12288     
                                                                 
 repeat_vector_5 (RepeatVect  (None, 3, 32)            0         
 or)                                                             
                                                                 
 lstm_11 (LSTM)              (None, 3, 64)             24832     
                                                                 
 time_distributed_5 (TimeDis  (None, 3, 39)            2535      
 tributed)                                                       
                                                                 
Total params: 39,655
Trainable params: 39,655
Non-trai

In [46]:
model.fit(padded_inputs, padded_outputs)



<keras.callbacks.History at 0x7f3e941bf940>