# Data processing

In [21]:
%%capture
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
base_path='/kaggle/input/asl-fingerspelling/'

In [22]:
import numpy as np 
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import re
from scipy.stats import skew, kurtosis
import warnings

#warnings.filterwarnings("ignore", message="NumPy version")

In [23]:
DEBUG = True

In [24]:
train_path=f'{base_path}/train.csv'
train = pd.read_csv(train_path).head(10) if DEBUG else pd.read_csv(file_path)
train

Unnamed: 0,path,file_id,sequence_id,participant_id,phrase
0,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse
1,train_landmarks/5414471.parquet,5414471,1816825349,107,scales/kuhaylah
2,train_landmarks/5414471.parquet,5414471,1816909464,1,1383 william lanier
3,train_landmarks/5414471.parquet,5414471,1816967051,63,988 franklin lane
4,train_landmarks/5414471.parquet,5414471,1817123330,89,6920 northeast 661st road
5,train_landmarks/5414471.parquet,5414471,1817141095,38,www.freem.ne.jp
6,train_landmarks/5414471.parquet,5414471,1817169529,70,https://jsi.is/hukuoka
7,train_landmarks/5414471.parquet,5414471,1817171518,202,239613 stolze street
8,train_landmarks/5414471.parquet,5414471,1817195757,136,242-197-6202
9,train_landmarks/5414471.parquet,5414471,1817216847,93,271097 bayshore boulevard


In [25]:
train['phrase_char'] = train['phrase'].apply(tuple)
train['phrase_char_len'] = train['phrase_char'].apply(len)

train_sequence_id = train.set_index('sequence_id')

train.head()

Unnamed: 0,path,file_id,sequence_id,participant_id,phrase,phrase_char,phrase_char_len
0,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse,"(3, , c, r, e, e, k, h, o, u, s, e)",12
1,train_landmarks/5414471.parquet,5414471,1816825349,107,scales/kuhaylah,"(s, c, a, l, e, s, /, k, u, h, a, y, l, a, h)",15
2,train_landmarks/5414471.parquet,5414471,1816909464,1,1383 william lanier,"(1, 3, 8, 3, , w, i, l, l, i, a, m, , l, a, ...",19
3,train_landmarks/5414471.parquet,5414471,1816967051,63,988 franklin lane,"(9, 8, 8, , f, r, a, n, k, l, i, n, , l, a, ...",17
4,train_landmarks/5414471.parquet,5414471,1817123330,89,6920 northeast 661st road,"(6, 9, 2, 0, , n, o, r, t, h, e, a, s, t, , ...",25


In [26]:
with open(base_path+'character_to_prediction_index.json') as json_file:
    CHAR2ORD = json.load(json_file)

CHAR2ORD_df=pd.Series(CHAR2ORD).to_frame('Ordinal Encoding')
print(type(CHAR2ORD),type(CHAR2ORD_df))


def encode_phrase(phrase):
    # .get(char, -1): if the char doesn't exist in dict, return -1.
    return [CHAR2ORD.get(char, -1) for char in phrase]

train['ordinal_encoding'] = train['phrase'].apply(encode_phrase)

train.head()


<class 'dict'> <class 'pandas.core.frame.DataFrame'>


Unnamed: 0,path,file_id,sequence_id,participant_id,phrase,phrase_char,phrase_char_len,ordinal_encoding
0,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse,"(3, , c, r, e, e, k, h, o, u, s, e)",12,"[18, 0, 34, 49, 36, 36, 42, 39, 46, 52, 50, 36]"
1,train_landmarks/5414471.parquet,5414471,1816825349,107,scales/kuhaylah,"(s, c, a, l, e, s, /, k, u, h, a, y, l, a, h)",15,"[50, 34, 32, 43, 36, 50, 14, 42, 52, 39, 32, 5..."
2,train_landmarks/5414471.parquet,5414471,1816909464,1,1383 william lanier,"(1, 3, 8, 3, , w, i, l, l, i, a, m, , l, a, ...",19,"[16, 18, 23, 18, 0, 54, 40, 43, 43, 40, 32, 44..."
3,train_landmarks/5414471.parquet,5414471,1816967051,63,988 franklin lane,"(9, 8, 8, , f, r, a, n, k, l, i, n, , l, a, ...",17,"[24, 23, 23, 0, 37, 49, 32, 45, 42, 43, 40, 45..."
4,train_landmarks/5414471.parquet,5414471,1817123330,89,6920 northeast 661st road,"(6, 9, 2, 0, , n, o, r, t, h, e, a, s, t, , ...",25,"[21, 24, 17, 15, 0, 45, 46, 49, 51, 39, 36, 32..."


In [27]:
# Add complete file path to train
def get_file_path(path):
    return f'{base_path}{path}'

train['file_path'] = train['path'].apply(get_file_path)
train.head()

Unnamed: 0,path,file_id,sequence_id,participant_id,phrase,phrase_char,phrase_char_len,ordinal_encoding,file_path
0,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse,"(3, , c, r, e, e, k, h, o, u, s, e)",12,"[18, 0, 34, 49, 36, 36, 42, 39, 46, 52, 50, 36]",/kaggle/input/asl-fingerspelling/train_landmar...
1,train_landmarks/5414471.parquet,5414471,1816825349,107,scales/kuhaylah,"(s, c, a, l, e, s, /, k, u, h, a, y, l, a, h)",15,"[50, 34, 32, 43, 36, 50, 14, 42, 52, 39, 32, 5...",/kaggle/input/asl-fingerspelling/train_landmar...
2,train_landmarks/5414471.parquet,5414471,1816909464,1,1383 william lanier,"(1, 3, 8, 3, , w, i, l, l, i, a, m, , l, a, ...",19,"[16, 18, 23, 18, 0, 54, 40, 43, 43, 40, 32, 44...",/kaggle/input/asl-fingerspelling/train_landmar...
3,train_landmarks/5414471.parquet,5414471,1816967051,63,988 franklin lane,"(9, 8, 8, , f, r, a, n, k, l, i, n, , l, a, ...",17,"[24, 23, 23, 0, 37, 49, 32, 45, 42, 43, 40, 45...",/kaggle/input/asl-fingerspelling/train_landmar...
4,train_landmarks/5414471.parquet,5414471,1817123330,89,6920 northeast 661st road,"(6, 9, 2, 0, , n, o, r, t, h, e, a, s, t, , ...",25,"[21, 24, 17, 15, 0, 45, 46, 49, 51, 39, 36, 32...",/kaggle/input/asl-fingerspelling/train_landmar...


In [28]:
def get_phrase_type(phrase):
    if re.match(r'^[\d+-]+$', phrase):
        return 'phone_number'
    elif any([substr in phrase for substr in ['www', '.', '/']]) and ' ' not in phrase:
        return 'url'
    else:
        return 'address'
    
train['phrase_type'] = train['phrase'].apply(get_phrase_type)
train.head()

Unnamed: 0,path,file_id,sequence_id,participant_id,phrase,phrase_char,phrase_char_len,ordinal_encoding,file_path,phrase_type
0,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse,"(3, , c, r, e, e, k, h, o, u, s, e)",12,"[18, 0, 34, 49, 36, 36, 42, 39, 46, 52, 50, 36]",/kaggle/input/asl-fingerspelling/train_landmar...,address
1,train_landmarks/5414471.parquet,5414471,1816825349,107,scales/kuhaylah,"(s, c, a, l, e, s, /, k, u, h, a, y, l, a, h)",15,"[50, 34, 32, 43, 36, 50, 14, 42, 52, 39, 32, 5...",/kaggle/input/asl-fingerspelling/train_landmar...,url
2,train_landmarks/5414471.parquet,5414471,1816909464,1,1383 william lanier,"(1, 3, 8, 3, , w, i, l, l, i, a, m, , l, a, ...",19,"[16, 18, 23, 18, 0, 54, 40, 43, 43, 40, 32, 44...",/kaggle/input/asl-fingerspelling/train_landmar...,address
3,train_landmarks/5414471.parquet,5414471,1816967051,63,988 franklin lane,"(9, 8, 8, , f, r, a, n, k, l, i, n, , l, a, ...",17,"[24, 23, 23, 0, 37, 49, 32, 45, 42, 43, 40, 45...",/kaggle/input/asl-fingerspelling/train_landmar...,address
4,train_landmarks/5414471.parquet,5414471,1817123330,89,6920 northeast 661st road,"(6, 9, 2, 0, , n, o, r, t, h, e, a, s, t, , ...",25,"[21, 24, 17, 15, 0, 45, 46, 49, 51, 39, 36, 32...",/kaggle/input/asl-fingerspelling/train_landmar...,address


In [31]:
%%time
def process_landmarks(file_path, debug=False):
    landmarks = pd.read_parquet(file_path)
    #print(landmarks.info())
    #return
    
    hand_columns = [col for col in landmarks.columns if ('left_hand' in col or 'right_hand' in col) and not col.startswith('z_')]
    landmarks = landmarks[['frame']+hand_columns]

    processed_landmarks_list = []
    for seq_id in tqdm(landmarks.index.unique()):
        '''*** Use [[]] to make sure it's a DataFrame but not Series !!!!
        Otherwise, if there are multiple rows that match the label, a DataFrame will be returned;
        If there is only one row that matches the label, return a Series'''
        sequence = landmarks.loc[[seq_id]].copy() 
        sequence['sequence_id'] = seq_id
        
        if debug and len(processed_landmarks_list) >= 5:
            break
    
        # Deal with NaN
        coordinate_columns = [col for col in sequence.columns if col.startswith(('x_', 'y_'))]
        sequence.loc[:, coordinate_columns] = sequence[coordinate_columns].fillna(0.0)
        
        # Others..unfinished....
        # ...

        processed_landmarks_list.append(sequence)

    processed_landmarks = pd.concat(processed_landmarks_list, ignore_index=True)
    return processed_landmarks


processed_landmarks_list = [process_landmarks(file_path, debug=DEBUG) for file_path in tqdm(train['file_path'])]

all_processed_landmarks = pd.concat(processed_landmarks_list, ignore_index=True)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

CPU times: user 38.9 s, sys: 27.7 s, total: 1min 6s
Wall time: 24.2 s


In [36]:
all_processed_landmarks.tail()

Unnamed: 0,frame,x_left_hand_0,x_left_hand_1,x_left_hand_2,x_left_hand_3,x_left_hand_4,x_left_hand_5,x_left_hand_6,x_left_hand_7,x_left_hand_8,...,y_right_hand_12,y_right_hand_13,y_right_hand_14,y_right_hand_15,y_right_hand_16,y_right_hand_17,y_right_hand_18,y_right_hand_19,y_right_hand_20,sequence_id
7145,97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1816967051
7146,98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1816967051
7147,99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.60621,0.656075,0.591323,0.585353,0.593196,0.679846,0.621291,0.588837,0.569418,1816967051
7148,100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.610716,0.659693,0.580883,0.578441,0.603747,0.682028,0.623064,0.587529,0.572168,1816967051
7149,101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.614005,0.645884,0.56996,0.582512,0.611837,0.66595,0.614185,0.586468,0.572873,1816967051


In [40]:
columns = all_processed_landmarks.columns

parts = set()

for col in columns:
    if '_' in col:
        part_name = '_'.join(col.split('_')[1:-1])
        if len(part_name)>0:
            parts.add(part_name)

parts_list = list(parts)
print(parts_list)

['left_hand', 'right_hand']


In [41]:
print(len(list(all_processed_landmarks.iterrows())))
has_nan = all_processed_landmarks.isna().any()
columns_with_nan = has_nan[has_nan].index.tolist()
print("Columns with NaN values:", columns_with_nan)

7150
Columns with NaN values: []


In [None]:
# %%time
# final_coordinates_list = []

# for row_index, frame in tqdm(all_processed_landmarks.iterrows()):
# #     if DEBUG and len(final_coordinates_list) >= 1000:
# #         break
#     for part_name in parts_list:
#         max_index = max(int(col.split('_')[-1]) for col in columns if f'x_{part_name}_' in col)
#         for index in range(max_index + 1):
#             temp_coordinates = {
#                 'sequence_id': int(frame['sequence_id']),
#                 'frame': int(frame['frame']),
#                 'part': part_name,
#                 'index': index,
#                 'x': frame[f'x_{part_name}_{index}'],
#                 'y': frame[f'y_{part_name}_{index}'],
#                 'z': frame[f'z_{part_name}_{index}'],
#             }
#             final_coordinates_list.append(temp_coordinates)

# new_df = pd.DataFrame(final_coordinates_list)

In [None]:
# # Just want to try--
# # Set MultiIndex
# new_df.set_index(['part', 'index'], inplace=True)

# print(new_df.loc[('right_hand', 20)])
# print(new_df.loc[('pose')].head())
# new_df.tail()

# new_df.reset_index(inplace=True)

In [None]:
# merged_df = pd.merge(train, new_df, on='sequence_id', how='left')

# #merged_df.query("part=='pose'")
# merged_df

In [43]:
merged_df = pd.merge(train, all_processed_landmarks, on='sequence_id', how='left')

print(', '.join(merged_df.columns))
merged_df.head()

path, file_id, sequence_id, participant_id, phrase, phrase_char, phrase_char_len, ordinal_encoding, file_path, phrase_type, frame, x_left_hand_0, x_left_hand_1, x_left_hand_2, x_left_hand_3, x_left_hand_4, x_left_hand_5, x_left_hand_6, x_left_hand_7, x_left_hand_8, x_left_hand_9, x_left_hand_10, x_left_hand_11, x_left_hand_12, x_left_hand_13, x_left_hand_14, x_left_hand_15, x_left_hand_16, x_left_hand_17, x_left_hand_18, x_left_hand_19, x_left_hand_20, x_right_hand_0, x_right_hand_1, x_right_hand_2, x_right_hand_3, x_right_hand_4, x_right_hand_5, x_right_hand_6, x_right_hand_7, x_right_hand_8, x_right_hand_9, x_right_hand_10, x_right_hand_11, x_right_hand_12, x_right_hand_13, x_right_hand_14, x_right_hand_15, x_right_hand_16, x_right_hand_17, x_right_hand_18, x_right_hand_19, x_right_hand_20, y_left_hand_0, y_left_hand_1, y_left_hand_2, y_left_hand_3, y_left_hand_4, y_left_hand_5, y_left_hand_6, y_left_hand_7, y_left_hand_8, y_left_hand_9, y_left_hand_10, y_left_hand_11, y_left_hand_12

Unnamed: 0,path,file_id,sequence_id,participant_id,phrase,phrase_char,phrase_char_len,ordinal_encoding,file_path,phrase_type,...,y_right_hand_11,y_right_hand_12,y_right_hand_13,y_right_hand_14,y_right_hand_15,y_right_hand_16,y_right_hand_17,y_right_hand_18,y_right_hand_19,y_right_hand_20
0,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse,"(3, , c, r, e, e, k, h, o, u, s, e)",12,"[18, 0, 34, 49, 36, 36, 42, 39, 46, 52, 50, 36]",/kaggle/input/asl-fingerspelling/train_landmar...,address,...,0.490615,0.439064,0.680148,0.645134,0.697051,0.73925,0.728379,0.730378,0.770165,0.798621
1,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse,"(3, , c, r, e, e, k, h, o, u, s, e)",12,"[18, 0, 34, 49, 36, 36, 42, 39, 46, 52, 50, 36]",/kaggle/input/asl-fingerspelling/train_landmar...,address,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse,"(3, , c, r, e, e, k, h, o, u, s, e)",12,"[18, 0, 34, 49, 36, 36, 42, 39, 46, 52, 50, 36]",/kaggle/input/asl-fingerspelling/train_landmar...,address,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse,"(3, , c, r, e, e, k, h, o, u, s, e)",12,"[18, 0, 34, 49, 36, 36, 42, 39, 46, 52, 50, 36]",/kaggle/input/asl-fingerspelling/train_landmar...,address,...,0.528952,0.493427,0.667076,0.677703,0.737446,0.783664,0.706994,0.722034,0.768912,0.79993
4,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse,"(3, , c, r, e, e, k, h, o, u, s, e)",12,"[18, 0, 34, 49, 36, 36, 42, 39, 46, 52, 50, 36]",/kaggle/input/asl-fingerspelling/train_landmar...,address,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# # Feature cols
# features = merged_df.drop(columns=['sequence_id', 'participant_id', 'phrase', 'phrase_char', 'phrase_char_len', 'phrase_type'])

# # Label cols
# labels = final_data['ordinal_encoding']
