### Load Necessary Libraries

In [1]:
import pandas as pd
import csv
import pyarrow.parquet as pq
import numpy as np
import os
import json


### Define Column Headers for Landmarks

In [2]:
# Highest zero-based landmark index for each landmark group. The header
# builder iterates range(max + 1), so each group yields (max + 1) landmarks.
(
    max_face_index,
    max_left_hand_index,
    max_right_hand_index,
    max_pose_index,
) = 467, 20, 20, 32

In [3]:
# Per-group base names ("<group>_<index>"), one entry per landmark index
# from 0 up to and including the group's maximum index.
face_columns = list(map("face_{}".format, range(max_face_index + 1)))
left_hand_columns = list(map("left_hand_{}".format, range(max_left_hand_index + 1)))
right_hand_columns = list(map("right_hand_{}".format, range(max_right_hand_index + 1)))
pose_columns = list(map("pose_{}".format, range(max_pose_index + 1)))

# Full CSV header: "frame", then an x and a y column for every landmark
# (face, left hand, right hand, pose - in that order), then "label".
all_columns = [
    "frame",
    *(
        f"{base_name}_{axis}"
        for group in (face_columns, left_hand_columns, right_hand_columns, pose_columns)
        for base_name in group
        for axis in ("x", "y")
    ),
    "label",
]



In [4]:
# Display the generated header list to check the column names and their order.
all_columns

['frame',
 'face_0_x',
 'face_0_y',
 'face_1_x',
 'face_1_y',
 'face_2_x',
 'face_2_y',
 'face_3_x',
 'face_3_y',
 'face_4_x',
 'face_4_y',
 'face_5_x',
 'face_5_y',
 'face_6_x',
 'face_6_y',
 'face_7_x',
 'face_7_y',
 'face_8_x',
 'face_8_y',
 'face_9_x',
 'face_9_y',
 'face_10_x',
 'face_10_y',
 'face_11_x',
 'face_11_y',
 'face_12_x',
 'face_12_y',
 'face_13_x',
 'face_13_y',
 'face_14_x',
 'face_14_y',
 'face_15_x',
 'face_15_y',
 'face_16_x',
 'face_16_y',
 'face_17_x',
 'face_17_y',
 'face_18_x',
 'face_18_y',
 'face_19_x',
 'face_19_y',
 'face_20_x',
 'face_20_y',
 'face_21_x',
 'face_21_y',
 'face_22_x',
 'face_22_y',
 'face_23_x',
 'face_23_y',
 'face_24_x',
 'face_24_y',
 'face_25_x',
 'face_25_y',
 'face_26_x',
 'face_26_y',
 'face_27_x',
 'face_27_y',
 'face_28_x',
 'face_28_y',
 'face_29_x',
 'face_29_y',
 'face_30_x',
 'face_30_y',
 'face_31_x',
 'face_31_y',
 'face_32_x',
 'face_32_y',
 'face_33_x',
 'face_33_y',
 'face_34_x',
 'face_34_y',
 'face_35_x',
 'face_35_y',
 '

### Load Dataset

In [5]:
# Load the training index table; the cells below read its 'sign' column
# (word label) and 'path' column (location of each sequence's Parquet file).
train_df = pd.read_csv('Dataset_CSVs/train.csv')

In [6]:
# Words (values of the 'sign' column) to include in the exported dataset.
selected_words = ["TV", "after", "all", "alligator", "animal", "another", "any", "apple", "arm"]

# Maximum number of sequences kept for each selected word.
sequences_per_word = 15

# Keep only the rows whose sign is one of the selected words.
filtered_df = train_df[train_df['sign'].isin(selected_words)]

# For every sign keep its first `sequences_per_word` rows, in their original
# order; a sign with fewer rows than that keeps all of them.
sub_df = filtered_df.groupby('sign').head(sequences_per_word)

### Write Data to CSV

In [7]:


# Landmark groups that have columns in the CSV header, in header order.
# Rows whose 'type' is not one of these are ignored.
LANDMARK_TYPES = ('face', 'left_hand', 'right_hand', 'pose')

# Number of decimal places kept for numeric values written to the CSV.
COORD_DECIMALS = 6


def _frame_rows(landmarks_df, label):
    """Yield one wide row dict per frame of a long-format landmark table.

    Parameters
    ----------
    landmarks_df : pandas.DataFrame
        One row per (frame, landmark) with the columns 'frame', 'type',
        'landmark_index', 'x' and 'y'.
    label : object
        Value stored under the 'label' key of every yielded row.

    Yields
    ------
    dict
        {'frame': ..., '<type>_<index>_x': ..., '<type>_<index>_y': ...,
        'label': ...} with the landmark keys grouped as face, left hand,
        right hand, pose.
    """
    # A single groupby pass replaces re-filtering the whole table once per
    # frame. sort=False keeps frames in order of first appearance; rows whose
    # frame value is missing are skipped by groupby.
    for frame, frame_df in landmarks_df.groupby('frame', sort=False):
        coords_by_type = {landmark_type: {} for landmark_type in LANDMARK_TYPES}

        # Iterating the columns directly avoids building a Series per row.
        for landmark_type, landmark_index, x, y in zip(
            frame_df['type'], frame_df['landmark_index'], frame_df['x'], frame_df['y']
        ):
            coords = coords_by_type.get(landmark_type)
            if coords is None:
                continue  # landmark type without columns in the header
            coords[f"{landmark_type}_{landmark_index}_x"] = x
            coords[f"{landmark_type}_{landmark_index}_y"] = y

        row_data = {'frame': frame}
        for landmark_type in LANDMARK_TYPES:
            row_data.update(coords_by_type[landmark_type])
        row_data['label'] = label
        yield row_data


def _clean_value(value):
    """Round int/float values to COORD_DECIMALS places and map missing values to 0.0."""
    if isinstance(value, (int, float)):
        value = round(value, COORD_DECIMALS)
    return 0.0 if pd.isna(value) else value


# Collect one wide row per frame for every selected sequence.
all_rows = []
for path, label in zip(sub_df['path'], sub_df['sign']):
    # Read the sequence's Parquet file and convert it to a pandas DataFrame.
    landmarks_df = pq.read_table(path).to_pandas()
    all_rows.extend(_frame_rows(landmarks_df, label))

# Define CSV file path
csv_file = 'Dataset_CSVs/ASL_word_data.csv'

# Write rows to CSV file
with open(csv_file, 'w', newline='') as f:
    # restval=0.0 fills columns that are absent from a row with the same value
    # used for NaN coordinates, instead of leaving the cell empty.
    writer = csv.DictWriter(f, fieldnames=all_columns, restval=0.0)
    writer.writeheader()

    for row_data in all_rows:
        writer.writerow({key: _clean_value(value) for key, value in row_data.items()})

print(f"Data has been successfully written to {csv_file}")


Data has been successfully written to Dataset_CSVs/ASL_word_data.csv
