# Data processing

In [63]:
%%capture
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
base_path='/kaggle/input/asl-fingerspelling/'

In [64]:
import numpy as np 
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import re
from scipy.stats import skew, kurtosis
import warnings

#warnings.filterwarnings("ignore", message="NumPy version")

In [65]:
DEBUG = True

In [66]:
train_path=f'{base_path}/train.csv'
train = pd.read_csv(train_path).head(50) if DEBUG else pd.read_csv(file_path)
train

Unnamed: 0,path,file_id,sequence_id,participant_id,phrase
0,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse
1,train_landmarks/5414471.parquet,5414471,1816825349,107,scales/kuhaylah
2,train_landmarks/5414471.parquet,5414471,1816909464,1,1383 william lanier
3,train_landmarks/5414471.parquet,5414471,1816967051,63,988 franklin lane
4,train_landmarks/5414471.parquet,5414471,1817123330,89,6920 northeast 661st road
5,train_landmarks/5414471.parquet,5414471,1817141095,38,www.freem.ne.jp
6,train_landmarks/5414471.parquet,5414471,1817169529,70,https://jsi.is/hukuoka
7,train_landmarks/5414471.parquet,5414471,1817171518,202,239613 stolze street
8,train_landmarks/5414471.parquet,5414471,1817195757,136,242-197-6202
9,train_landmarks/5414471.parquet,5414471,1817216847,93,271097 bayshore boulevard


In [67]:
train['phrase_char'] = train['phrase'].apply(tuple)
train['phrase_char_len'] = train['phrase_char'].apply(len)

train_sequence_id = train.set_index('sequence_id')

train.head()

Unnamed: 0,path,file_id,sequence_id,participant_id,phrase,phrase_char,phrase_char_len
0,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse,"(3, , c, r, e, e, k, h, o, u, s, e)",12
1,train_landmarks/5414471.parquet,5414471,1816825349,107,scales/kuhaylah,"(s, c, a, l, e, s, /, k, u, h, a, y, l, a, h)",15
2,train_landmarks/5414471.parquet,5414471,1816909464,1,1383 william lanier,"(1, 3, 8, 3, , w, i, l, l, i, a, m, , l, a, ...",19
3,train_landmarks/5414471.parquet,5414471,1816967051,63,988 franklin lane,"(9, 8, 8, , f, r, a, n, k, l, i, n, , l, a, ...",17
4,train_landmarks/5414471.parquet,5414471,1817123330,89,6920 northeast 661st road,"(6, 9, 2, 0, , n, o, r, t, h, e, a, s, t, , ...",25


In [68]:
with open(base_path+'character_to_prediction_index.json') as json_file:
    CHAR2ORD = json.load(json_file)

CHAR2ORD_df=pd.Series(CHAR2ORD).to_frame('Ordinal Encoding')
type(CHAR2ORD),type(CHAR2ORD_df)


def encode_phrase(phrase):
    # .get(char, -1): if the char doesn't exist in dict, return -1.
    return [char2ord_dict.get(char, -1) for char in phrase]

train['ordinal_encoding'] = train['phrase'].apply(encode_phrase)

train.head()


Unnamed: 0,path,file_id,sequence_id,participant_id,phrase,phrase_char,phrase_char_len,ordinal_encoding
0,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse,"(3, , c, r, e, e, k, h, o, u, s, e)",12,"[18, 0, 34, 49, 36, 36, 42, 39, 46, 52, 50, 36]"
1,train_landmarks/5414471.parquet,5414471,1816825349,107,scales/kuhaylah,"(s, c, a, l, e, s, /, k, u, h, a, y, l, a, h)",15,"[50, 34, 32, 43, 36, 50, 14, 42, 52, 39, 32, 5..."
2,train_landmarks/5414471.parquet,5414471,1816909464,1,1383 william lanier,"(1, 3, 8, 3, , w, i, l, l, i, a, m, , l, a, ...",19,"[16, 18, 23, 18, 0, 54, 40, 43, 43, 40, 32, 44..."
3,train_landmarks/5414471.parquet,5414471,1816967051,63,988 franklin lane,"(9, 8, 8, , f, r, a, n, k, l, i, n, , l, a, ...",17,"[24, 23, 23, 0, 37, 49, 32, 45, 42, 43, 40, 45..."
4,train_landmarks/5414471.parquet,5414471,1817123330,89,6920 northeast 661st road,"(6, 9, 2, 0, , n, o, r, t, h, e, a, s, t, , ...",25,"[21, 24, 17, 15, 0, 45, 46, 49, 51, 39, 36, 32..."


In [69]:
# Add complete file path to train
def get_file_path(path):
    return f'{base_path}{path}'

train['file_path'] = train['path'].apply(get_file_path)
train.head()

Unnamed: 0,path,file_id,sequence_id,participant_id,phrase,phrase_char,phrase_char_len,ordinal_encoding,file_path
0,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse,"(3, , c, r, e, e, k, h, o, u, s, e)",12,"[18, 0, 34, 49, 36, 36, 42, 39, 46, 52, 50, 36]",/kaggle/input/asl-fingerspelling/train_landmar...
1,train_landmarks/5414471.parquet,5414471,1816825349,107,scales/kuhaylah,"(s, c, a, l, e, s, /, k, u, h, a, y, l, a, h)",15,"[50, 34, 32, 43, 36, 50, 14, 42, 52, 39, 32, 5...",/kaggle/input/asl-fingerspelling/train_landmar...
2,train_landmarks/5414471.parquet,5414471,1816909464,1,1383 william lanier,"(1, 3, 8, 3, , w, i, l, l, i, a, m, , l, a, ...",19,"[16, 18, 23, 18, 0, 54, 40, 43, 43, 40, 32, 44...",/kaggle/input/asl-fingerspelling/train_landmar...
3,train_landmarks/5414471.parquet,5414471,1816967051,63,988 franklin lane,"(9, 8, 8, , f, r, a, n, k, l, i, n, , l, a, ...",17,"[24, 23, 23, 0, 37, 49, 32, 45, 42, 43, 40, 45...",/kaggle/input/asl-fingerspelling/train_landmar...
4,train_landmarks/5414471.parquet,5414471,1817123330,89,6920 northeast 661st road,"(6, 9, 2, 0, , n, o, r, t, h, e, a, s, t, , ...",25,"[21, 24, 17, 15, 0, 45, 46, 49, 51, 39, 36, 32...",/kaggle/input/asl-fingerspelling/train_landmar...


In [70]:
def get_phrase_type(phrase):
    if re.match(r'^[\d+-]+$', phrase):
        return 'phone_number'
    elif any([substr in phrase for substr in ['www', '.', '/']]) and ' ' not in phrase:
        return 'url'
    else:
        return 'address'
    
train['phrase_type'] = train['phrase'].apply(get_phrase_type)
train.head()

Unnamed: 0,path,file_id,sequence_id,participant_id,phrase,phrase_char,phrase_char_len,ordinal_encoding,file_path,phrase_type
0,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse,"(3, , c, r, e, e, k, h, o, u, s, e)",12,"[18, 0, 34, 49, 36, 36, 42, 39, 46, 52, 50, 36]",/kaggle/input/asl-fingerspelling/train_landmar...,address
1,train_landmarks/5414471.parquet,5414471,1816825349,107,scales/kuhaylah,"(s, c, a, l, e, s, /, k, u, h, a, y, l, a, h)",15,"[50, 34, 32, 43, 36, 50, 14, 42, 52, 39, 32, 5...",/kaggle/input/asl-fingerspelling/train_landmar...,url
2,train_landmarks/5414471.parquet,5414471,1816909464,1,1383 william lanier,"(1, 3, 8, 3, , w, i, l, l, i, a, m, , l, a, ...",19,"[16, 18, 23, 18, 0, 54, 40, 43, 43, 40, 32, 44...",/kaggle/input/asl-fingerspelling/train_landmar...,address
3,train_landmarks/5414471.parquet,5414471,1816967051,63,988 franklin lane,"(9, 8, 8, , f, r, a, n, k, l, i, n, , l, a, ...",17,"[24, 23, 23, 0, 37, 49, 32, 45, 42, 43, 40, 45...",/kaggle/input/asl-fingerspelling/train_landmar...,address
4,train_landmarks/5414471.parquet,5414471,1817123330,89,6920 northeast 661st road,"(6, 9, 2, 0, , n, o, r, t, h, e, a, s, t, , ...",25,"[21, 24, 17, 15, 0, 45, 46, 49, 51, 39, 36, 32...",/kaggle/input/asl-fingerspelling/train_landmar...,address


In [73]:
%%time
def process_landmarks(file_path, debug=False):
    landmarks = pd.read_parquet(file_path)

    processed_landmarks_list = []
    for seq_id in tqdm(landmarks.index.unique()):
        '''*** Use [[]] to make sure it's a DataFrame but not Series !!!!
        Otherwise, if there are multiple rows that match the label, a DataFrame will be returned;
        If there is only one row that matches the label, return a Series
        '''
        sequence = landmarks.loc[[seq_id]].copy() 
        #print(type(sequence))
        #return
        
        if debug and len(processed_landmarks_list) >= 5:
            break
    
        # Deal with NaN
        coordinate_columns = [col for col in sequence.columns if col.startswith(('x_', 'y_', 'z_'))]
        
        sequence.loc[:, coordinate_columns] = sequence[coordinate_columns].fillna(0.0)

        # Others..unfinished....
        # ...

        processed_landmarks_list.append(sequence)

    processed_landmarks = pd.concat(processed_landmarks_list, ignore_index=True)
    return processed_landmarks


processed_landmarks_list = [process_landmarks(file_path, debug=DEBUG) for file_path in tqdm(train['file_path'])]

all_processed_landmarks = pd.concat(processed_landmarks_list, ignore_index=True)

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

CPU times: user 6min 52s, sys: 1min 56s, total: 8min 49s
Wall time: 6min 27s


Unnamed: 0,frame,x_face_0,x_face_1,x_face_2,x_face_3,x_face_4,x_face_5,x_face_6,x_face_7,x_face_8,...,z_right_hand_11,z_right_hand_12,z_right_hand_13,z_right_hand_14,z_right_hand_15,z_right_hand_16,z_right_hand_17,z_right_hand_18,z_right_hand_19,z_right_hand_20
0,0,0.710588,0.699951,0.705657,0.691768,0.699669,0.701980,0.709724,0.610405,0.712660,...,-0.245855,-0.269148,-0.129743,-0.251501,-0.278687,-0.266530,-0.152852,-0.257519,-0.275822,-0.266876
1,1,0.709525,0.697582,0.703713,0.691016,0.697576,0.700467,0.709796,0.616540,0.713729,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,2,0.711059,0.700858,0.706272,0.693285,0.700825,0.703319,0.711549,0.615606,0.715143,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,3,0.712799,0.702518,0.707840,0.694899,0.702445,0.704794,0.712483,0.625044,0.715677,...,-0.370770,-0.408097,-0.185217,-0.325494,-0.343373,-0.328294,-0.203126,-0.315719,-0.326104,-0.314282
4,4,0.712349,0.705451,0.709918,0.696006,0.705180,0.706928,0.712685,0.614356,0.714875,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81495,51,0.586436,0.574976,0.579976,0.562985,0.573898,0.574319,0.577008,0.488136,0.576910,...,-0.068126,-0.079747,-0.032400,-0.056950,-0.071820,-0.079492,-0.042082,-0.064037,-0.072470,-0.075721
81496,52,0.587116,0.576780,0.581191,0.564485,0.575789,0.576084,0.578418,0.487272,0.578517,...,-0.112633,-0.125549,-0.072979,-0.112802,-0.128400,-0.134561,-0.095985,-0.127614,-0.138775,-0.143754
81497,53,0.586408,0.575569,0.580175,0.563062,0.574475,0.574683,0.576834,0.485611,0.576753,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
81498,54,0.585739,0.575382,0.579858,0.562774,0.574296,0.574451,0.576459,0.485235,0.576332,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [74]:
all_processed_landmarks

Unnamed: 0,frame,x_face_0,x_face_1,x_face_2,x_face_3,x_face_4,x_face_5,x_face_6,x_face_7,x_face_8,...,z_right_hand_11,z_right_hand_12,z_right_hand_13,z_right_hand_14,z_right_hand_15,z_right_hand_16,z_right_hand_17,z_right_hand_18,z_right_hand_19,z_right_hand_20
0,0,0.710588,0.699951,0.705657,0.691768,0.699669,0.701980,0.709724,0.610405,0.712660,...,-0.245855,-0.269148,-0.129743,-0.251501,-0.278687,-0.266530,-0.152852,-0.257519,-0.275822,-0.266876
1,1,0.709525,0.697582,0.703713,0.691016,0.697576,0.700467,0.709796,0.616540,0.713729,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,2,0.711059,0.700858,0.706272,0.693285,0.700825,0.703319,0.711549,0.615606,0.715143,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,3,0.712799,0.702518,0.707840,0.694899,0.702445,0.704794,0.712483,0.625044,0.715677,...,-0.370770,-0.408097,-0.185217,-0.325494,-0.343373,-0.328294,-0.203126,-0.315719,-0.326104,-0.314282
4,4,0.712349,0.705451,0.709918,0.696006,0.705180,0.706928,0.712685,0.614356,0.714875,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81495,51,0.586436,0.574976,0.579976,0.562985,0.573898,0.574319,0.577008,0.488136,0.576910,...,-0.068126,-0.079747,-0.032400,-0.056950,-0.071820,-0.079492,-0.042082,-0.064037,-0.072470,-0.075721
81496,52,0.587116,0.576780,0.581191,0.564485,0.575789,0.576084,0.578418,0.487272,0.578517,...,-0.112633,-0.125549,-0.072979,-0.112802,-0.128400,-0.134561,-0.095985,-0.127614,-0.138775,-0.143754
81497,53,0.586408,0.575569,0.580175,0.563062,0.574475,0.574683,0.576834,0.485611,0.576753,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
81498,54,0.585739,0.575382,0.579858,0.562774,0.574296,0.574451,0.576459,0.485235,0.576332,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
