# Data processing

In [1]:
%%capture
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
base_path='/kaggle/input/asl-fingerspelling/'

In [3]:
import warnings
warnings.filterwarnings("ignore","NumPy version", category=UserWarning)
import numpy as np 
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import re
from scipy.stats import skew, kurtosis


In [4]:
DEBUG = True

In [8]:
train_path=f'{base_path}/train.csv'
train = pd.read_csv(train_path).head(100) if DEBUG else pd.read_csv(file_path)
train

Unnamed: 0,path,file_id,sequence_id,participant_id,phrase
0,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse
1,train_landmarks/5414471.parquet,5414471,1816825349,107,scales/kuhaylah
2,train_landmarks/5414471.parquet,5414471,1816909464,1,1383 william lanier
3,train_landmarks/5414471.parquet,5414471,1816967051,63,988 franklin lane
4,train_landmarks/5414471.parquet,5414471,1817123330,89,6920 northeast 661st road
...,...,...,...,...,...
95,train_landmarks/5414471.parquet,5414471,1819642365,74,stars94.bg
96,train_landmarks/5414471.parquet,5414471,1819645763,81,1115 paradise meadow
97,train_landmarks/5414471.parquet,5414471,1819671153,242,malcolm hamilton
98,train_landmarks/5414471.parquet,5414471,1819699535,102,949-600-2398


In [10]:
train['phrase_char'] = train['phrase'].apply(tuple)
train['phrase_char_len'] = train['phrase_char'].apply(len)

train_sequence_id = train.set_index('sequence_id')

train.head()

Unnamed: 0,path,file_id,sequence_id,participant_id,phrase,phrase_char,phrase_char_len
0,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse,"(3, , c, r, e, e, k, h, o, u, s, e)",12
1,train_landmarks/5414471.parquet,5414471,1816825349,107,scales/kuhaylah,"(s, c, a, l, e, s, /, k, u, h, a, y, l, a, h)",15
2,train_landmarks/5414471.parquet,5414471,1816909464,1,1383 william lanier,"(1, 3, 8, 3, , w, i, l, l, i, a, m, , l, a, ...",19
3,train_landmarks/5414471.parquet,5414471,1816967051,63,988 franklin lane,"(9, 8, 8, , f, r, a, n, k, l, i, n, , l, a, ...",17
4,train_landmarks/5414471.parquet,5414471,1817123330,89,6920 northeast 661st road,"(6, 9, 2, 0, , n, o, r, t, h, e, a, s, t, , ...",25


In [6]:
with open(base_path+'character_to_prediction_index.json') as json_file:
    CHAR2ORD = json.load(json_file)

CHAR2ORD_df=pd.Series(CHAR2ORD).to_frame('Ordinal Encoding')
print(type(CHAR2ORD),type(CHAR2ORD_df))

N_UNIQUE_CHARACTERS = len(CHAR2ORD)
print(f'CHAR2ORD: {CHAR2ORD}\nN_UNIQUE_CHARACTERS: {N_UNIQUE_CHARACTERS}')

def encode_phrase(phrase):
    # .get(char, -1): if the char doesn't exist in dict, return -1.
    return [CHAR2ORD.get(char, -1) for char in phrase]

train['ordinal_encoding'] = train['phrase'].apply(encode_phrase)

train.head()


<class 'dict'> <class 'pandas.core.frame.DataFrame'>
CHAR2ORD: {' ': 0, '!': 1, '#': 2, '$': 3, '%': 4, '&': 5, "'": 6, '(': 7, ')': 8, '*': 9, '+': 10, ',': 11, '-': 12, '.': 13, '/': 14, '0': 15, '1': 16, '2': 17, '3': 18, '4': 19, '5': 20, '6': 21, '7': 22, '8': 23, '9': 24, ':': 25, ';': 26, '=': 27, '?': 28, '@': 29, '[': 30, '_': 31, 'a': 32, 'b': 33, 'c': 34, 'd': 35, 'e': 36, 'f': 37, 'g': 38, 'h': 39, 'i': 40, 'j': 41, 'k': 42, 'l': 43, 'm': 44, 'n': 45, 'o': 46, 'p': 47, 'q': 48, 'r': 49, 's': 50, 't': 51, 'u': 52, 'v': 53, 'w': 54, 'x': 55, 'y': 56, 'z': 57, '~': 58}
N_UNIQUE_CHARACTERS: 59


Unnamed: 0,path,file_id,sequence_id,participant_id,phrase,phrase_char,phrase_char_len,ordinal_encoding
0,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse,"(3, , c, r, e, e, k, h, o, u, s, e)",12,"[18, 0, 34, 49, 36, 36, 42, 39, 46, 52, 50, 36]"
1,train_landmarks/5414471.parquet,5414471,1816825349,107,scales/kuhaylah,"(s, c, a, l, e, s, /, k, u, h, a, y, l, a, h)",15,"[50, 34, 32, 43, 36, 50, 14, 42, 52, 39, 32, 5..."
2,train_landmarks/5414471.parquet,5414471,1816909464,1,1383 william lanier,"(1, 3, 8, 3, , w, i, l, l, i, a, m, , l, a, ...",19,"[16, 18, 23, 18, 0, 54, 40, 43, 43, 40, 32, 44..."
3,train_landmarks/5414471.parquet,5414471,1816967051,63,988 franklin lane,"(9, 8, 8, , f, r, a, n, k, l, i, n, , l, a, ...",17,"[24, 23, 23, 0, 37, 49, 32, 45, 42, 43, 40, 45..."
4,train_landmarks/5414471.parquet,5414471,1817123330,89,6920 northeast 661st road,"(6, 9, 2, 0, , n, o, r, t, h, e, a, s, t, , ...",25,"[21, 24, 17, 15, 0, 45, 46, 49, 51, 39, 36, 32..."


In [7]:
# Add complete file path to train
def get_file_path(path):
    return f'{base_path}{path}'

train['file_path'] = train['path'].apply(get_file_path)
train.head()

Unnamed: 0,path,file_id,sequence_id,participant_id,phrase,phrase_char,phrase_char_len,ordinal_encoding,file_path
0,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse,"(3, , c, r, e, e, k, h, o, u, s, e)",12,"[18, 0, 34, 49, 36, 36, 42, 39, 46, 52, 50, 36]",/kaggle/input/asl-fingerspelling/train_landmar...
1,train_landmarks/5414471.parquet,5414471,1816825349,107,scales/kuhaylah,"(s, c, a, l, e, s, /, k, u, h, a, y, l, a, h)",15,"[50, 34, 32, 43, 36, 50, 14, 42, 52, 39, 32, 5...",/kaggle/input/asl-fingerspelling/train_landmar...
2,train_landmarks/5414471.parquet,5414471,1816909464,1,1383 william lanier,"(1, 3, 8, 3, , w, i, l, l, i, a, m, , l, a, ...",19,"[16, 18, 23, 18, 0, 54, 40, 43, 43, 40, 32, 44...",/kaggle/input/asl-fingerspelling/train_landmar...
3,train_landmarks/5414471.parquet,5414471,1816967051,63,988 franklin lane,"(9, 8, 8, , f, r, a, n, k, l, i, n, , l, a, ...",17,"[24, 23, 23, 0, 37, 49, 32, 45, 42, 43, 40, 45...",/kaggle/input/asl-fingerspelling/train_landmar...
4,train_landmarks/5414471.parquet,5414471,1817123330,89,6920 northeast 661st road,"(6, 9, 2, 0, , n, o, r, t, h, e, a, s, t, , ...",25,"[21, 24, 17, 15, 0, 45, 46, 49, 51, 39, 36, 32...",/kaggle/input/asl-fingerspelling/train_landmar...


In [8]:
# Add phrase_type col
def get_phrase_type(phrase):
    if re.match(r'^[\d+-]+$', phrase):
        return 'phone_number'
    elif any([substr in phrase for substr in ['www', '.', '/']]) and ' ' not in phrase:
        return 'url'
    else:
        return 'address'
    
train['phrase_type'] = train['phrase'].apply(get_phrase_type)
train.head()

Unnamed: 0,path,file_id,sequence_id,participant_id,phrase,phrase_char,phrase_char_len,ordinal_encoding,file_path,phrase_type
0,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse,"(3, , c, r, e, e, k, h, o, u, s, e)",12,"[18, 0, 34, 49, 36, 36, 42, 39, 46, 52, 50, 36]",/kaggle/input/asl-fingerspelling/train_landmar...,address
1,train_landmarks/5414471.parquet,5414471,1816825349,107,scales/kuhaylah,"(s, c, a, l, e, s, /, k, u, h, a, y, l, a, h)",15,"[50, 34, 32, 43, 36, 50, 14, 42, 52, 39, 32, 5...",/kaggle/input/asl-fingerspelling/train_landmar...,url
2,train_landmarks/5414471.parquet,5414471,1816909464,1,1383 william lanier,"(1, 3, 8, 3, , w, i, l, l, i, a, m, , l, a, ...",19,"[16, 18, 23, 18, 0, 54, 40, 43, 43, 40, 32, 44...",/kaggle/input/asl-fingerspelling/train_landmar...,address
3,train_landmarks/5414471.parquet,5414471,1816967051,63,988 franklin lane,"(9, 8, 8, , f, r, a, n, k, l, i, n, , l, a, ...",17,"[24, 23, 23, 0, 37, 49, 32, 45, 42, 43, 40, 45...",/kaggle/input/asl-fingerspelling/train_landmar...,address
4,train_landmarks/5414471.parquet,5414471,1817123330,89,6920 northeast 661st road,"(6, 9, 2, 0, , n, o, r, t, h, e, a, s, t, , ...",25,"[21, 24, 17, 15, 0, 45, 46, 49, 51, 39, 36, 32...",/kaggle/input/asl-fingerspelling/train_landmar...,address


# Train model

In [9]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed
from sklearn.model_selection import train_test_split
from keras.metrics import SparseCategoricalAccuracy

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [10]:
def get_landmarks_by_sequence_id(sequence_id, file_path_column):
    file_path = file_path_column.loc[sequence_id]
    landmarks = pd.read_parquet(file_path)
    return landmarks.loc[sequence_id].values

In [11]:
%%time
max_seq_len = max(len(seq) for seq in train['ordinal_encoding'])
 
file_path_column = train.set_index('sequence_id')['file_path']

X = []
y = []

for seq_id, target_sequence in zip(tqdm(train['sequence_id']), train['ordinal_encoding']):
    landmarks_sequence = get_landmarks_by_sequence_id(seq_id, file_path_column)
    
    # 检查样本是否具有所需的形状，例如 (n, 1630)
    if len(landmarks_sequence.shape) != 2 or landmarks_sequence.shape[1] != 1630:
        print(f"Skipping sequence {seq_id} due to incorrect shape {landmarks_sequence.shape}")
        continue  # 跳过这个样本
    
    X.append(landmarks_sequence)
    y.append(np.array(target_sequence).reshape(-1, 1))

X = pad_sequences(X, maxlen=max_seq_len, padding='post', dtype='float32') 


y = pad_sequences(y, maxlen=max_seq_len, padding='post') 

#print(X,y)
 
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

 
model = Sequential()
model.add(LSTM(128, return_sequences=True, input_shape=(None, X_train.shape[2])))
model.add(TimeDistributed(Dense(N_UNIQUE_CHARACTERS, activation='softmax')))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', 
              metrics=[SparseCategoricalAccuracy()])

 
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=32)

sample_index = 0
sample_X = X_val[sample_index].reshape(1, *X_val[sample_index].shape) 
predicted_probs = model.predict(sample_X)
predicted_sequence = np.argmax(predicted_probs, axis=-1).flatten()  


  0%|          | 0/500 [00:00<?, ?it/s]

Skipping sequence 1818239060 due to incorrect shape (1630,)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 29min 49s, sys: 17min 8s, total: 46min 58s
Wall time: 27min 5s


In [17]:
sample_index = 0
sample_X = X_val[sample_index].reshape(1, *X_val[sample_index].shape) 
predicted_probs = model.predict(sample_X)
predicted_sequence = np.argmax(predicted_probs, axis=-1).flatten() 
predicted_sequence



array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])

In [13]:
loss, accuracy = model.evaluate(X_val, y_val)
print("Validation Loss:", loss)
print("Validation Accuracy:", accuracy)

Validation Loss: nan
Validation Accuracy: 0.42399999499320984


In [14]:
print(X_train.shape)
print(X_val.shape)

(399, 30, 1630)
(100, 30, 1630)


# XXXXXX

In [15]:
%%time
def process_landmarks(file_path, debug=False):
    landmarks = pd.read_parquet(file_path)
    
#     print(landmarks)
#     return

    
    hand_columns = [col for col in landmarks.columns if ('left_hand' in col or 'right_hand' in col) and not col.startswith('z_')]
    landmarks = landmarks[['frame']+hand_columns]

    processed_landmarks_list = []
    for seq_id in tqdm(landmarks.index.unique()):
        
#         if len(processed_landmarks_list)>1:
#             break
        
        '''*** Use [[]] to make sure it's a DataFrame but not Series !!!!
        Otherwise, if there are multiple rows that match the label, a DataFrame will be returned;
        If there is only one row that matches the label, return a Series'''
        sequence = landmarks.loc[[seq_id]].copy() 
        sequence['sequence_id'] = seq_id
        
#         if debug and len(processed_landmarks_list) >= 5:
#             break
    
        # Deal with NaN
        coordinate_columns = [col for col in sequence.columns if col.startswith(('x_', 'y_'))]
        sequence.loc[:, coordinate_columns] = sequence[coordinate_columns].fillna(0.0)
        
        # Others..unfinished....
        # ...

        processed_landmarks_list.append(sequence)

    processed_landmarks = pd.concat(processed_landmarks_list, ignore_index=True)
    return processed_landmarks


processed_landmarks_list = [process_landmarks(file_path, debug=DEBUG) for file_path in tqdm(train['file_path'])]

all_processed_landmarks = pd.concat(processed_landmarks_list, ignore_index=True)

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [16]:
all_processed_landmarks.head()

NameError: name 'all_processed_landmarks' is not defined

In [None]:
zero_ratios = (all_processed_landmarks == 0).mean()

plt.figure(figsize=(15, 6))
zero_ratios.plot(kind='bar')

plt.xlabel('Columns')
plt.ylabel('Zero Ratios')
plt.title('Proportion of Zero Values in Each Column')
plt.show()


In [None]:
columns = all_processed_landmarks.columns

parts = set()

for col in columns:
    if '_' in col:
        part_name = '_'.join(col.split('_')[1:-1])
        if len(part_name)>0:
            parts.add(part_name)

parts_list = list(parts)
print(parts_list)

In [None]:
print(len(list(all_processed_landmarks.iterrows())))
has_nan = all_processed_landmarks.isna().any()
columns_with_nan = has_nan[has_nan].index.tolist()
print("Columns with NaN values:", columns_with_nan)

In [None]:
# %%time
# final_coordinates_list = []

# for row_index, frame in tqdm(all_processed_landmarks.iterrows()):
# #     if DEBUG and len(final_coordinates_list) >= 1000:
# #         break
#     for part_name in parts_list:
#         max_index = max(int(col.split('_')[-1]) for col in columns if f'x_{part_name}_' in col)
#         for index in range(max_index + 1):
#             temp_coordinates = {
#                 'sequence_id': int(frame['sequence_id']),
#                 'frame': int(frame['frame']),
#                 'part': part_name,
#                 'index': index,
#                 'x': frame[f'x_{part_name}_{index}'],
#                 'y': frame[f'y_{part_name}_{index}'],
#                 'z': frame[f'z_{part_name}_{index}'],
#             }
#             final_coordinates_list.append(temp_coordinates)

# new_df = pd.DataFrame(final_coordinates_list)

In [None]:
# # Just want to try--
# # Set MultiIndex
# new_df.set_index(['part', 'index'], inplace=True)

# print(new_df.loc[('right_hand', 20)])
# print(new_df.loc[('pose')].head())
# new_df.tail()

# new_df.reset_index(inplace=True)

In [None]:
# merged_df = pd.merge(train, new_df, on='sequence_id', how='left')

# #merged_df.query("part=='pose'")
# merged_df

In [None]:
merged_df = pd.merge(train, all_processed_landmarks, on='sequence_id', how='left')

print(', '.join(merged_df.columns))
merged_df.head()

In [None]:
# # Feature cols
# features = merged_df.drop(columns=['sequence_id', 'participant_id', 'phrase', 'phrase_char', 'phrase_char_len', 'phrase_type'])

# # Label cols
# labels = final_data['ordinal_encoding']
