# Data processing

In [11]:
%%capture
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
base_path='/kaggle/input/asl-fingerspelling/'

In [12]:
import numpy as np 
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import re
from scipy.stats import skew, kurtosis
import warnings

#warnings.filterwarnings("ignore", message="NumPy version")

In [13]:
# Debug switch: when True, only the first 10 rows of train.csv are kept, and
# the flag is also forwarded to process_landmarks() as its `debug` argument.
DEBUG = True

In [14]:
# base_path already ends with '/', so the file name is appended directly
# (the same way the other dataset paths in this notebook are built).
train_path = f'{base_path}train.csv'

# Both modes must read the same file: the non-debug branch used to reference
# an undefined `file_path`, which raised NameError as soon as DEBUG was False.
train = pd.read_csv(train_path)
if DEBUG:
    # Keep only a handful of rows so the rest of the notebook runs quickly.
    train = train.head(10)
train

Unnamed: 0,path,file_id,sequence_id,participant_id,phrase
0,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse
1,train_landmarks/5414471.parquet,5414471,1816825349,107,scales/kuhaylah
2,train_landmarks/5414471.parquet,5414471,1816909464,1,1383 william lanier


In [15]:
# Break every phrase into a tuple of its characters and store the tuple length.
train['phrase_char'] = train['phrase'].map(tuple)
train['phrase_char_len'] = train['phrase_char'].map(len)

# Separate frame holding the same rows, keyed by sequence_id.
train_sequence_id = train.set_index('sequence_id')

train.head()

Unnamed: 0,path,file_id,sequence_id,participant_id,phrase,phrase_char,phrase_char_len
0,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse,"(3, , c, r, e, e, k, h, o, u, s, e)",12
1,train_landmarks/5414471.parquet,5414471,1816825349,107,scales/kuhaylah,"(s, c, a, l, e, s, /, k, u, h, a, y, l, a, h)",15
2,train_landmarks/5414471.parquet,5414471,1816909464,1,1383 william lanier,"(1, 3, 8, 3, , w, i, l, l, i, a, m, , l, a, ...",19


In [16]:
# Read the character -> prediction-index mapping stored next to the dataset.
with open(base_path+'character_to_prediction_index.json') as mapping_file:
    CHAR2ORD = json.load(mapping_file)

# The same mapping as a one-column DataFrame.
CHAR2ORD_df = pd.Series(CHAR2ORD).to_frame(name='Ordinal Encoding')
print(type(CHAR2ORD), type(CHAR2ORD_df))


def encode_phrase(phrase):
    """Translate every character of ``phrase`` through the CHAR2ORD mapping.

    Returns a list with one entry per character, in order. A character that
    is not a key of CHAR2ORD is encoded as -1.
    """
    ordinals = []
    for symbol in phrase:
        ordinals.append(CHAR2ORD.get(symbol, -1))
    return ordinals

# Store the encoded form of every phrase next to the phrase itself.
train['ordinal_encoding'] = train['phrase'].map(encode_phrase)

train.head()


<class 'dict'> <class 'pandas.core.frame.DataFrame'>


Unnamed: 0,path,file_id,sequence_id,participant_id,phrase,phrase_char,phrase_char_len,ordinal_encoding
0,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse,"(3, , c, r, e, e, k, h, o, u, s, e)",12,"[18, 0, 34, 49, 36, 36, 42, 39, 46, 52, 50, 36]"
1,train_landmarks/5414471.parquet,5414471,1816825349,107,scales/kuhaylah,"(s, c, a, l, e, s, /, k, u, h, a, y, l, a, h)",15,"[50, 34, 32, 43, 36, 50, 14, 42, 52, 39, 32, 5..."
2,train_landmarks/5414471.parquet,5414471,1816909464,1,1383 william lanier,"(1, 3, 8, 3, , w, i, l, l, i, a, m, , l, a, ...",19,"[16, 18, 23, 18, 0, 54, 40, 43, 43, 40, 32, 44..."


In [17]:
# Add complete file path to train
def get_file_path(path):
    """Return ``path`` prefixed with the dataset root ``base_path``."""
    return '{}{}'.format(base_path, path)

# Resolve the relative `path` of every row against the dataset root.
train['file_path'] = train['path'].map(get_file_path)
train.head()

Unnamed: 0,path,file_id,sequence_id,participant_id,phrase,phrase_char,phrase_char_len,ordinal_encoding,file_path
0,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse,"(3, , c, r, e, e, k, h, o, u, s, e)",12,"[18, 0, 34, 49, 36, 36, 42, 39, 46, 52, 50, 36]",/kaggle/input/asl-fingerspelling/train_landmar...
1,train_landmarks/5414471.parquet,5414471,1816825349,107,scales/kuhaylah,"(s, c, a, l, e, s, /, k, u, h, a, y, l, a, h)",15,"[50, 34, 32, 43, 36, 50, 14, 42, 52, 39, 32, 5...",/kaggle/input/asl-fingerspelling/train_landmar...
2,train_landmarks/5414471.parquet,5414471,1816909464,1,1383 william lanier,"(1, 3, 8, 3, , w, i, l, l, i, a, m, , l, a, ...",19,"[16, 18, 23, 18, 0, 54, 40, 43, 43, 40, 32, 44...",/kaggle/input/asl-fingerspelling/train_landmar...


In [18]:
def get_phrase_type(phrase):
    """Classify a phrase as 'phone_number', 'url' or 'address'.

    * made up only of digits, '+' and '-'              -> 'phone_number'
    * contains 'www', '.' or '/' and has no space      -> 'url'
    * anything else                                    -> 'address'
    """
    if re.match(r'^[\d+-]+$', phrase):
        return 'phone_number'

    contains_space = ' ' in phrase
    contains_url_marker = any(marker in phrase for marker in ('www', '.', '/'))
    if contains_url_marker and not contains_space:
        return 'url'

    return 'address'
    
# Label every phrase with its detected type.
train['phrase_type'] = train['phrase'].map(get_phrase_type)
train.head()

Unnamed: 0,path,file_id,sequence_id,participant_id,phrase,phrase_char,phrase_char_len,ordinal_encoding,file_path,phrase_type
0,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse,"(3, , c, r, e, e, k, h, o, u, s, e)",12,"[18, 0, 34, 49, 36, 36, 42, 39, 46, 52, 50, 36]",/kaggle/input/asl-fingerspelling/train_landmar...,address
1,train_landmarks/5414471.parquet,5414471,1816825349,107,scales/kuhaylah,"(s, c, a, l, e, s, /, k, u, h, a, y, l, a, h)",15,"[50, 34, 32, 43, 36, 50, 14, 42, 52, 39, 32, 5...",/kaggle/input/asl-fingerspelling/train_landmar...,url
2,train_landmarks/5414471.parquet,5414471,1816909464,1,1383 william lanier,"(1, 3, 8, 3, , w, i, l, l, i, a, m, , l, a, ...",19,"[16, 18, 23, 18, 0, 54, 40, 43, 43, 40, 32, 44...",/kaggle/input/asl-fingerspelling/train_landmar...,address


In [19]:
%%time
def process_landmarks(file_path, debug=False):
    """Load one landmark parquet file and return its frames as a flat DataFrame.

    Missing values in the coordinate columns (names starting with ``x_``,
    ``y_`` or ``z_``) are replaced by 0.0. The index label of every row is
    copied into a ``sequence_id`` column before the index is discarded.

    Parameters
    ----------
    file_path
        Location of the parquet file, passed straight to ``pd.read_parquet``.
    debug : bool, optional
        Accepted for call compatibility; it currently has no effect.

    Returns
    -------
    pandas.DataFrame
        Every row of the file, grouped by index label in order of first
        appearance, with a fresh 0..n-1 index and a ``sequence_id`` column.
    """
    landmarks = pd.read_parquet(file_path)

    # Deal with NaN once for the whole file: neither the column list nor the
    # fill value depends on the sequence being processed.
    coordinate_columns = [col for col in landmarks.columns if col.startswith(('x_', 'y_', 'z_'))]
    if coordinate_columns:
        landmarks[coordinate_columns] = landmarks[coordinate_columns].fillna(0.0)

    processed_landmarks_list = []
    # groupby(level=0, sort=False) hands back one DataFrame per index label,
    # in order of first appearance, in a single pass. This replaces a
    # `.loc[[seq_id]]` lookup per label on a non-unique index, which rescanned
    # the index for every sequence.
    for seq_id, sequence in tqdm(landmarks.groupby(level=0, sort=False)):
        # Work on a copy so that adding a column never touches `landmarks`.
        sequence = sequence.copy()
        sequence['sequence_id'] = seq_id

        # Others..unfinished....
        # ...

        processed_landmarks_list.append(sequence)

    if not processed_landmarks_list:
        # A file without rows: pd.concat([]) would raise ValueError, so return
        # the (empty) frame in the same layout instead.
        landmarks['sequence_id'] = landmarks.index
        return landmarks.reset_index(drop=True)

    processed_landmarks = pd.concat(processed_landmarks_list, ignore_index=True)
    return processed_landmarks


# process_landmarks() always returns every sequence stored in a parquet file,
# and many rows of `train` point at the same file. Iterating over the rows
# therefore re-read the same file once per row and duplicated its frames in
# the concatenated result; each distinct file is processed exactly once here.
unique_file_paths = train['file_path'].unique()
processed_landmarks_list = [process_landmarks(file_path, debug=DEBUG) for file_path in tqdm(unique_file_paths)]

all_processed_landmarks = pd.concat(processed_landmarks_list, ignore_index=True)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [20]:
# Show the last rows of the combined landmark frame.
all_processed_landmarks.tail()

NameError: name 'all_processed_landmarks' is not defined

In [None]:
columns = all_processed_landmarks.columns

# Collect the part names embedded in the coordinate column names,
# e.g. 'x_right_hand_20' -> 'right_hand'.
parts = set()

for col in columns:
    # Only coordinate columns carry a part name. Any other column containing
    # underscores must not be mistaken for a landmark, otherwise the later
    # x_/y_/z_ lookups for that "part" would fail with a KeyError.
    if col.startswith(('x_', 'y_', 'z_')):
        part_name = '_'.join(col.split('_')[1:-1])
        if len(part_name)>0:
            parts.add(part_name)

# A set has no stable iteration order between kernel sessions; sorting makes
# parts_list (and everything built from it) reproducible.
parts_list = sorted(parts)
print(parts_list)

In [None]:
# Row count. len() reads it directly; materialising iterrows() into a list
# built one Series per row only to count them.
print(len(all_processed_landmarks))

# Boolean Series, one entry per column: does the column contain any NaN?
has_nan = all_processed_landmarks.isna().any()
columns_with_nan = has_nan[has_nan].index.tolist()
print("Columns with NaN values:", columns_with_nan)

In [None]:
%%time
# Resolve once, for every landmark, its part name, its index and the names of
# its x/y/z columns. None of this depends on the row being processed, so it is
# computed up front instead of rescanning all columns for every frame and part.
landmark_columns = []
for part_name in parts_list:
    # Highest landmark index of this part, taken from its x_ columns.
    # A prefix match is used so that only columns of this part are considered.
    max_index = max(int(col.split('_')[-1]) for col in columns if col.startswith(f'x_{part_name}_'))
    for index in range(max_index + 1):
        landmark_columns.append((
            part_name,
            index,
            f'x_{part_name}_{index}',
            f'y_{part_name}_{index}',
            f'z_{part_name}_{index}',
        ))

final_coordinates_list = []

# iterrows() is a generator without a length, so the total is passed explicitly
# to give tqdm a real progress bar.
for row_index, frame in tqdm(all_processed_landmarks.iterrows(), total=len(all_processed_landmarks)):
    # Identical for every landmark of this row, so converted only once.
    sequence_id = int(frame['sequence_id'])
    frame_number = int(frame['frame'])
    for part_name, index, x_col, y_col, z_col in landmark_columns:
        final_coordinates_list.append({
            'sequence_id': sequence_id,
            'frame': frame_number,
            'part': part_name,
            'index': index,
            'x': frame[x_col],
            'y': frame[y_col],
            'z': frame[z_col],
        })

# One row per (sequence, frame, part, landmark index).
new_df = pd.DataFrame(final_coordinates_list)

In [None]:
# Just want to try--
# Look landmarks up through a (part, index) MultiIndex.
# set_index() returns a new frame here, so new_df itself is never modified:
# the cell can be interrupted or re-run without leaving new_df half-indexed,
# and no reset_index() is needed afterwards.
indexed_df = new_df.set_index(['part', 'index'])

print(indexed_df.loc[('right_hand', 20)])
print(indexed_df.loc['pose'].head())

# Kept as the last expression of the cell so that it is actually displayed;
# in the middle of the cell its result was silently discarded.
indexed_df.tail()

In [None]:
# Left-join the rows of new_df onto train through their shared sequence_id.
merged_df = train.merge(new_df, how='left', on='sequence_id')

#merged_df.query("part=='pose'")
merged_df

In [None]:
# Left-join the landmark columns onto train through their shared sequence_id.
merged_df = train.merge(all_processed_landmarks, how='left', on='sequence_id')
merged_df.head()

In [None]:
# All column names of merged_df as one comma-separated string.
', '.join(merged_df.columns)

In [None]:
# Feature cols
features = merged_df.drop(columns=['sequence_id', 'participant_id', 'phrase', 'phrase_char', 'phrase_char_len', 'phrase_type'])

# Label cols
labels = final_data['ordinal_encoding']
