## Load the necessary packages

In [None]:
import pandas as pd
import numpy as np 
import json

In [2]:
BASE_DIR = '../input/asl-fingerspelling'

train = pd.read_csv(f'{BASE_DIR}/train.csv')

## Remove pose and face columns 

**Due to the computational power restrictions, we will only analyze 9 of the provided Parquet files, which amounts to approximately 9,000 sentences.**

In [3]:
parquet_files = ['train_landmarks/1134756332.parquet',
 'train_landmarks/5414471.parquet',
 'train_landmarks/1664666588.parquet',
 'train_landmarks/1133664520.parquet',
 'train_landmarks/234418913.parquet',
 'train_landmarks/566963657.parquet',
 'train_landmarks/1920330615.parquet',
 'train_landmarks/105143404.parquet',
 'train_landmarks/933868835.parquet']

**We keep each dataframe corresponding to each parquet file in a list called 'dataframes'**

In [4]:
dataframes = []

# Iterar sobre cada archivo parquet
for file in parquet_files:
    # Leer el archivo parquet en un DataFrame
    df = pd.read_parquet(f'{BASE_DIR}/{file}')

    # Agregar el DataFrame a la lista
    dataframes.append(df)

**We removed some columns.**

In [5]:
columns = ['x_pose_', 'y_pose_', 'z_pose_', 'x_face_', 'y_face_', 'z_face_']

for i in range(len(dataframes)):
    dataframes[i] = dataframes[i].drop(columns=[col for col in dataframes[i].columns if any(col.startswith(column) for column in columns)])


**There are sentences that correspond to only a single frame, we will remove these sentences from our analysis as they are likely errors. It doesn't make sense for a lengthy phrase like a phone number to consist of a single sign language gesture. We will filter only those sentences that have more than 300 frames.**

In [6]:
for i in range(0,9):
    index_count = dataframes[i].index.value_counts()

    # Filtrar los índices que se repiten 300 o más veces
    filtered_indices = index_count[index_count >= 300].index

    # Filtrar el dataframe original
    dataframes[i] = dataframes[i].groupby(dataframes[i].index).filter(lambda x: x.index[0] in filtered_indices)


**We equalized the number of frames for each landmark sequence to 720 frames.**

In [8]:
 for i in range(9):
    sequence_id = dataframes[i].index.unique()
    for j in range(len(sequence_id)):
        current_frames = dataframes[i].index.value_counts()[sequence_id[j]]
        num_new_rows = 720 - current_frames
        if num_new_rows > 0:
            new_rows = pd.DataFrame([[current_frames + 1 + k] + [0] * 84 for k in range(num_new_rows)],
                                    columns=dataframes[0].columns, index=[sequence_id[j]] * num_new_rows)
            dataframes[i] = pd.concat([dataframes[i], new_rows], axis=0, ignore_index=False)
            
    print(i)

0
1
2
3
4
5
6
7
8


**Remove null values**

In [9]:
for i in range(9):
    dataframes[i] = dataframes[i].fillna(0)

In [10]:
dataframe = pd.concat(dataframes, ignore_index=False)

## Relating each phrase to its respective dataframe

**Creating a dictionary with the identifiers of each phrase as key and as values the corresponding dataframe.**

In [11]:
dataframe_dict = {}
for sequence_id in dataframe.index.unique():
    dataframe_dict[sequence_id] = dataframe.loc[sequence_id]

### Create a list to store the landmark dataframes and another list to store the corresponding phrases in the same order.

In [12]:
indices = dataframe.index.unique()

In [13]:
phrases = []
for index_phrase in indices:
    phrase = train.query(f'sequence_id=={index_phrase}')['phrase'].values[0]
    phrases.append(phrase)

In [14]:
sequence_phrase = {}
for i in range(len(phrases)):
    sequence_id_phrase = indices[i] # Identificador de la secuencia
    phrase = phrases[i]
    sequence_phrase[sequence_id_phrase] = phrase

In [15]:
landmarks_data = []  # Lista para almacenar los dataframes de landmarks
phrase_order = []  # Lista para mantener el orden de las frases

# Iterar sobre los sequence_id
for sequence_id, dataframe in dataframe_dict.items():
    landmarks_dataframe = dataframe.iloc[:, 1:]  # Seleccionar todas las columnas excepto la última (frames)
    landmarks_data.append(landmarks_dataframe)
    phrase = sequence_phrase.get(sequence_id)  # Obtener la frase correspondiente al sequence_id
    phrase_order.append(phrase)

**Now each landmark and phrase are related by the index in each of their respective lists.**

## Create a list to store the landmark dataframes and another list to store the corresponding letter in the same order.


### Code phrases

In [18]:
#Load the character_to_prediction_index.json file
with open('/kaggle/input/asl-fingerspelling/character_to_prediction_index.json') as f:
    character_to_index = json.load(f)
    
# Create an index dictionary to characters
index_to_character = {v: k for k, v in character_to_index.items()}

In [19]:
encoded_phrases = []

# Iterar sobre las etiquetas
for phrase in phrase_order:
    encoded_phrase = [character_to_index.get(c, 0) for c in phrase]
    encoded_phrases.append(encoded_phrase)

**We converted the sentences and our input data into arrays. Since each sentence has a different length, we equalized their lengths by padding them to a length of 30 characters. We filled these characters with 0, as 0 represents an empty space and relates well to the data in the landmarks where there are zeros.**

In [20]:
array_phrases = np.array(encoded_phrases)

  array_phrases = np.array(encoded_phrases)


In [24]:
from keras.utils import pad_sequences

In [26]:
# Realiza el padding para igualar las longitudes de las frases
padded_phrases = pad_sequences(array_phrases, maxlen=30, padding='post')

In [27]:
padded_phrases=padded_phrases.astype(np.int16)

In [30]:
for i in range(len(landmarks_data)):
    landmarks_data[i].reset_index(drop=True, inplace=True)

In [31]:
landmarks_array = np.array(landmarks_data)

In [32]:
landmarks_array=landmarks_array.astype(np.float32)

In [33]:
landmarks_array.shape

(575, 720, 84)

In [35]:
padded_phrases.shape

(575, 30)

### We store data to save resources

In [36]:
import h5py

In [37]:
with h5py.File('/kaggle/working/landmarks_array.h5', 'w') as f:
    f.create_dataset('landmarks_array', data=landmarks_array)

In [39]:
with h5py.File('/kaggle/working/y_data.h5', 'w') as f:
    f.create_dataset('y_data', data=padded_phrases)