In [1]:
import os
import json
import cv2
import mediapipe as mp
import numpy as np
from PIL import Image
from IPython.display import display

from tqdm.auto import tqdm

2024-01-23 21:02:24.117318: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Create a list of lowercase letters from the filenames in the specified directory
letters = [l.split('.')[0].lower() for l in os.listdir("/home/ant/projects/psl/dataset/Videos/alphabet")]

# Define the vocabulary as a list containing '<start>' and '<eos>' tokens, along with the letters
vocabulary = ['<pad>', '<start>', '<eos>'] + letters

# Create a dictionary mapping each vocabulary item to its corresponding index
# Indexing starts from 1, so '<start>' is assigned index 1, '<eos>' is assigned index 2, and so on
vocabulary = {l: i for i, l in enumerate(vocabulary)}

# Display the resulting vocabulary dictionary
vocabulary

{'<pad>': 0,
 '<start>': 1,
 '<eos>': 2,
 'j': 3,
 'r': 4,
 'z': 5,
 't': 6,
 's': 7,
 'n': 8,
 'g': 9,
 'b': 10,
 'l': 11,
 'y': 12,
 'ch': 13,
 'u': 14,
 'ó': 15,
 'd': 16,
 'f': 17,
 'ż': 18,
 'k': 19,
 'e': 20,
 'cz': 21,
 'sz': 22,
 'o': 23,
 'ź': 24,
 'm': 25,
 'ń': 26,
 'ć': 27,
 'c': 28,
 'ę': 29,
 'i': 30,
 'ł': 31,
 'ą': 32,
 'w': 33,
 'h': 34,
 'ś': 35,
 'rz': 36,
 'a': 37,
 'p': 38}

In [3]:
# Function to extract hand landmarks from a video
mp_hands = mp.solutions.hands
hands = mp_hands.Hands()
def landmarks_timeseries(video_path):
    # Open the video file for reading
    cap = cv2.VideoCapture(video_path)

    # Get the total number of frames in the video
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # Get the frames per second (fps) of the video
    fps = cap.get(cv2.CAP_PROP_FPS)

    # Set the frame rate for extracting landmarks
    frame_rate = 0.5

    # Calculate the number of frames to skip based on the frame rate
    frames_to_skip = int(fps * frame_rate)

    landmarks_data = []
    current_frame = 0

    # Loop through the frames of the video
    while cap.isOpened():
        # Set the position to the current frame
        cap.set(cv2.CAP_PROP_POS_FRAMES, current_frame)

        # Read the current frame from the video
        ret, frame = cap.read()
        if not ret:
            break

        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Process the frame and get hand landmarks using Mediapipe
        results = hands.process(rgb_frame)
        if results.multi_hand_landmarks:
            hand_landmarks = results.multi_hand_landmarks[0].landmark

            # Append the 3D coordinates of hand landmarks to the list
            landmarks_data.append([[landmark.x, landmark.y, landmark.z] for landmark in hand_landmarks])

        # Move to the next frame based on the frames to skip
        current_frame += frames_to_skip

    # Release the video capture object
    cap.release()
    del cap

    # Reshape the landmarks data into a 2D array
    landmarks_data = np.array(landmarks_data).reshape(len(landmarks_data), -1)

    return landmarks_data

I0000 00:00:1706043745.312884   57314 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1706043745.331806   57350 gl_context.cc:344] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 535.146.02), renderer: NVIDIA GeForce GTX 970/PCIe/SSE2
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


In [4]:
videos_path = "/home/ant/projects/psl/dataset/Videos/alphabet"
labels = []
landmarks = []
# Iterate through each file in the dynamic alphabet directory
for i, filename in enumerate(tqdm(os.listdir(videos_path))):
    if filename.endswith('.mp4'):
        video_path = os.path.join(videos_path, filename)

        label = filename.split('.')[0].lower()
        label = ['<start>', label, '<eos>']

        # Convert labels to their corresponding vocabulary indices
        label = [vocabulary[l] for l in label]
        labels.append(label)

        # Call the function 'landmarks_timeseries' to get landmarks from the video
        land = landmarks_timeseries(video_path)

        landmarks.append(land)

100%|██████████| 36/36 [00:26<00:00,  1.37it/s]


In [5]:
len(landmarks), len(labels)

(36, 36)

In [6]:
videos_path = "/home/ant/projects/psl/dataset/Videos/words"
labels_words = []
landmarks_words = []
# Iterate through each file in the words directory
for i, filename in enumerate(tqdm(os.listdir(videos_path))):
    if filename.endswith('.mp4'):
        video_path = os.path.join(videos_path, filename)

        # Extract labels from the filename, including '<start>' and '<eos>' tokens
        label = ['<start>'] + list(filename.split('.')[0].lower()) + ['<eos>']

        # Convert labels to their corresponding vocabulary indices
        label = [vocabulary[l] for l in label]

        labels_words.append(label)

        # Call the function 'landmarks_timeseries' to get landmarks from the video
        land = landmarks_timeseries(video_path)
        landmarks_words.append(land)

100%|██████████| 2/2 [00:03<00:00,  1.51s/it]


In [7]:
# Define a mapping to fix certain characters in the labels
fix = {
    'Ć': 'ć',
    'Ę': 'ę',
    'Ł': 'ł',
    'Ń': 'ń',
    'Ó': 'O',
    'Ś': 'ś',
    'Ź': 'ź',
    'Ż': 'ż',
}

# Function to preprocess data from JSON files in the folder
def preprocess_data(labels_folder):
    labeled_with_landmarks_count = 0
    labeled_without_landmarks_count = 0
    data_rows = []  # List to store data rows
    labels = []  # List to store labels

    # Loop through JSON files in the folder
    for filename in tqdm(os.listdir(labels_folder)):
        if filename.endswith('.json'):
            with open(os.path.join(labels_folder, filename), 'r', encoding='utf-8') as json_file:
                data = json.load(json_file)
                if 'hand_landmarks' in data:
                    labeled_with_landmarks_count += 1

                    # Extract landmarks data and flatten it into a list
                    landmarks_data = data['hand_landmarks']
                    row = []
                    for landmark_key in landmarks_data:
                        landmark = landmarks_data[landmark_key]
                        row.extend([landmark['x'], landmark['y'], landmark['z']])

                    # Extract and preprocess the label
                    l = data['label']
                    if l in fix:
                        l = fix[l]
                    label = ['<start>', l.lower(), '<eos>']
                    label = [vocabulary[l] for l in label]
                    data_rows.append(row)
                    labels.append(label)

                else:
                    labeled_without_landmarks_count += 1

    return data_rows, labels

labels_folder = "/home/ant/projects/psl/dataset/labels"
preprocessed_data, preprocessed_labels = preprocess_data(labels_folder)

100%|██████████| 3626/3626 [00:00<00:00, 19796.56it/s]


_______________

ASIOWE TESTY


In [8]:
# import pandas as pd

# # Load static labels from CSV file
# csv_file_path = '/home/ant/projects/psl/Polish-Sign-Language-Recognition/train_data.csv'
# static_labels_df = pd.read_csv(csv_file_path, header=None, names=['filename', 'label'], skiprows=1)



In [9]:
# csv_file_path_static = '/home/ant/projects/psl/Polish-Sign-Language-Recognition/train_data.csv'
# static_labels_df = pd.read_csv(csv_file_path_static, usecols=[0], header=None, names=['filename'], skiprows=1)
# data_static_new = []

In [10]:
def load_and_process_image(file_path):
    # Load the image in binary mode
    with open(file_path, 'rb') as file:
        image = Image.open(file)
        # You can add additional image processing logic here if needed
        processed_content = np.array(image)  # Convert the image to a NumPy array
    return processed_content

In [11]:
import tensorflow as tf

In [21]:

def read_static(landmarks_directory, filenames_df):
    ###LABELS
    # Iterate through each row in the static labels dataframe
    labels_static = []
    for i, row in filenames_df.iterrows():
        # Extract filename and label from the CSV file
        filename = row['filename']
        label = row['label']
        
        # Convert labels to their corresponding vocabulary indices
        label = ['<start>'] + list(label.lower()) + ['<eos>']
        label = [vocabulary[l] for l in label]
        labels_static.append(label)
    
    ### LANDMARKS
    # Iterate through each row in the filenames dataframe
    landmarks_static = []
    for i, row in filenames_df.iterrows():
        # Extract filename from the CSV file
        filename = row['filename']
        
        # Construct the full path to the file with landmarks
        landmarks_file_path = os.path.join(landmarks_directory, filename + '.json')
        
        # Check if the file exists before attempting to read landmarks
        if os.path.exists(landmarks_file_path):
            # Read landmarks from the file
            with open(landmarks_file_path, 'r') as landmarks_file:
                landmark_data = json.load(landmarks_file)
                
                # Extract relevant information from the JSON structure
                hand_landmarks = landmark_data.get("hand_landmarks", {})
                
                # Create a flat list of numerical values representing each hand landmark
                landmark_values = []
                for key, values in hand_landmarks.items():
                    if key.startswith("hand_landmark_"):
                        # Extract x, y, and z values directly
                        x, y, z = values.get("x", 0.0), values.get("y", 0.0), values.get("z", 0.0)
                        landmark_values.extend([x, y, z])
                
                # Debugging information
                print(f"Length of landmark_values for {filename}: {len(landmark_values)}")
                
                # Ensure the length of the list is exactly 63
                if len(landmark_values) != 63:
                    # Print the length of the list for debugging
                    print(f"Unexpected length of landmark_values for {filename}: {len(landmark_values)}")
                    
                    # Handle the unexpected length (optional)
                    # For example, you might choose to skip this sample
                    continue
                
                # Append the flat list of values to landmarks_static
                landmarks_static.append(landmark_values)
        else:
            # Handle the case when the file does not exist
            print(f"Landmarks file not found for {filename}")
    return landmarks_static, labels_static

# Print the extracted landmarks for verification
# for landmark_values in landmarks_static:
#     print(landmark_values)

In [28]:
# Load filenames from CSV file
csv_file_path = '/home/ant/projects/psl/Polish-Sign-Language-Recognition/train_data.csv'
filenames_df = pd.read_csv(csv_file_path, header=None, names=['filename', 'label'], skiprows=1)

# Specify the directory containing the files with landmarks
landmarks_directory = '/home/ant/projects/psl/dataset/labels'

train_landmarks_static, train_labels_static = read_static(landmarks_directory, filenames_df)

# Load filenames from CSV file
csv_file_path = '/home/ant/projects/psl/Polish-Sign-Language-Recognition/test_data.csv'
filenames_df = pd.read_csv(csv_file_path, header=None, names=['filename', 'label'], skiprows=1)

test_landmarks_static, test_labels_static = read_static(landmarks_directory, filenames_df)

Length of landmark_values for dcf7b38889e34d53847a54df13ba1eff: 63
Length of landmark_values for 18b38ea463a34c9fbcf3a46e0b2e8c8b: 63
Length of landmark_values for 2916f0f92bd749b9bd7c833fa6a5dbbd: 63
Length of landmark_values for 9db2cad8e8214edbb3f54992426f4761: 63
Length of landmark_values for 30d1726c3fe0475489281b728bd01803: 63
Length of landmark_values for 7907412e89344a5f9715a2cffd37d44c: 63
Length of landmark_values for 6e95930c2e34480899c3cc31041d3027: 63
Length of landmark_values for f470e515e8ae4ede92f8f3b5d5f125bb: 63
Length of landmark_values for 123d0afb29ac47da8f16a218f90c01cd: 63
Length of landmark_values for 905061e622954883a5847819aebf6383: 63
Length of landmark_values for 6231f4cd40634b8293a28102fc20e09b: 63
Length of landmark_values for 3be25370efae496a9a3487016a075c3c: 63
Length of landmark_values for 58f129975792440a8ff9cb133d734efd: 63
Length of landmark_values for 9d1fcf61da1f4e0aa37a4bc01d63a94b: 63
Length of landmark_values for e5df36b9c6be486d9cb9cd60334a8813

In [32]:
len(train_landmarks_static), len(train_labels_static), len(test_landmarks_static), len(test_labels_static)

(2609, 2609, 653, 653)

In [13]:
def reshape_landmarks_static(data):
    data = np.array(data)
    # Create a new list to store the modified static data
    data_static_new = []

    # Iterate through each element in the original static data
    for d in data:
        # Repeat the current element along a new axis a random number of times (between 2 and 6)
        d = np.repeat(d.reshape(1, -1), repeats=np.random.randint(2, 7), axis=0)
        data_static_new.append(d)
    return data_static_new

In [33]:
train_landmarks_static = reshape_landmarks_static(train_landmarks_static)
test_landmarks_static = reshape_landmarks_static(test_landmarks_static)

In [15]:
from sklearn.model_selection import train_test_split
train_landmarks_words, test_landmarks_words, train_labels_words, test_labels_words = train_test_split(landmarks_words, labels_words, test_size=0.2, random_state=42)

In [37]:
# Litery dynamiczne, slowa, litery statyczne
data = landmarks + train_landmarks_words + train_landmarks_static
all_labels = labels + train_labels_words + train_labels_static

In [34]:
# slowa, litery statyczne
test_data = test_landmarks_words + test_landmarks_static
test_all_labels = test_labels_words + test_labels_static

In [38]:
len(data), len(all_labels), len(test_data), len(test_all_labels)

(2646, 2646, 654, 654)

__________________ 

In [45]:
# Ensure that the number of samples is consistent between data and all_labels
assert len(data) == len(all_labels), "Number of samples in data and all_labels must be the same."


In [46]:
def masked_loss(y_true, y_pred):
    # Initialize SparseCategoricalCrossentropy loss with 'from_logits' and 'reduction' parameters
    loss_function = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    
    # Calculate the cross-entropy loss for each item in the batch
    loss = loss_function(y_true, y_pred)

    # Create a binary mask to filter out padding elements (where y_true is 0)
    mask = tf.cast(y_true != 0, tf.float32)

    # Apply the mask to the calculated losses
    loss *= mask

    return tf.reduce_sum(loss) / tf.reduce_sum(mask)

In [47]:
def accuracy_for_letters(y_true, y_pred):
    result = tf.cast(tf.cast(y_true, tf.int64) == tf.argmax(y_pred, axis=-1), tf.float32)

    # Create a binary mask to filter out padding elements (where y_true is 0)
    mask = tf.cast(y_true != 0, tf.float32)

    # Apply the mask to the calculated losses
    result *= mask

    return tf.reduce_sum(result) / tf.reduce_sum(mask)

In [39]:
max_len_input = 0
for d in data:
    max_len_input = max(max_len_input, len(d))
for d in test_data:
    max_len_input = max(max_len_input, len(d))
max_len_input

12

In [40]:
max_len_output = 0
for d in all_labels:
    max_len_output = max(max_len_output, len(d))
for d in test_all_labels:
    max_len_output = max(max_len_output, len(d))
max_len_output

6

In [None]:
len(data), len(all_labels), len(test_data), len(test_all_labels)

In [41]:
# Pad input sequences (data) with zeros using "post" padding

padded_train_inputs = tf.keras.utils.pad_sequences(data, maxlen=max_len_input, dtype="float32", padding="post")
padded_test_inputs = tf.keras.utils.pad_sequences(test_data, maxlen=max_len_input, dtype="float32", padding="post")

# padded_inputs.shape, padded_outputs.shape

In [42]:
# Pad output sequences (train_labels and test_labels) with zeros using "post" padding
padded_train_outputs = tf.keras.utils.pad_sequences(all_labels, maxlen=max_len_output, dtype="int32", padding="post")
padded_test_outputs = tf.keras.utils.pad_sequences(test_all_labels, maxlen=max_len_output, dtype="int32", padding="post")

In [43]:
padded_train_inputs.shape, padded_train_outputs.shape, padded_test_inputs.shape, padded_test_outputs.shape

((2646, 12, 63), (2646, 6), (654, 12, 63), (654, 6))

In [48]:
from tensorflow import keras


# Define the input dimension, vocabulary size, and create a Sequential model
input_dim = 63
vocab_size = len(vocabulary)

model = keras.Sequential(
    [
        # Input layer with shape (sequence_length, input_dim)
        keras.Input(shape=(padded_train_inputs.shape[1], input_dim), dtype="float32"),

        # Masking layer to handle variable-length sequences
        keras.layers.Masking(),

        # LSTM layer with 32 units, returning a single output for each sequence
        keras.layers.LSTM(32, return_sequences=False),

        # Repeat the output vector for each time step in the output sequence
        keras.layers.RepeatVector(padded_train_outputs.shape[1]),

        # LSTM layer with 64 units, returning a sequence of vectors
        keras.layers.LSTM(64, return_sequences=True),

        # TimeDistributed layer to apply Dense layer to each time step independently
        keras.layers.TimeDistributed(keras.layers.Dense(vocab_size)),
    ]
)

# Display the model summary
model.summary()

# Compile the model using the custom masked loss function and Adam optimizer
model.compile(
    loss=masked_loss,
    optimizer=tf.keras.optimizers.legacy.Adam(1e-3),
    metrics=[accuracy_for_letters]
)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_1 (Masking)         (None, 12, 63)            0         
                                                                 
 lstm_2 (LSTM)               (None, 32)                12288     
                                                                 
 repeat_vector_1 (RepeatVect  (None, 6, 32)            0         
 or)                                                             
                                                                 
 lstm_3 (LSTM)               (None, 6, 64)             24832     
                                                                 
 time_distributed_1 (TimeDis  (None, 6, 39)            2535      
 tributed)                                                       
                                                                 
Total params: 39,655
Trainable params: 39,655
Non-trai

In [49]:
model.fit(padded_train_inputs, padded_train_outputs, epochs=100)

Epoch 1/100


2024-01-23 21:14:14.221970: W tensorflow/core/common_runtime/type_inference.cc:339] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_INT32
    }
  }
}
 is neither a subtype nor a supertype of the combined inputs preceding it:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_FLOAT
    }
  }
}

	while inferring type of node 'cond_40/output/_23'
2024-01-23 21:14:14.380103: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:428] Loaded cuDNN version 8902


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100

KeyboardInterrupt: 

In [50]:
model.evaluate(padded_test_inputs, padded_test_outputs)



[0.3674933612346649, 0.8654042482376099]