In [None]:
from google.colab import drive
import zipfile
import os

# --- Step 1: Mount Google Drive ---
# This will prompt you for authorization.
print("Mounting Google Drive...")
drive.mount('/content/drive')
print("Google Drive mounted successfully.")

# --- Step 2: Define File Paths ---
# '/content/drive/MyDrive/' is the root of the mounted drive.
drive_zip_path = '/content/drive/MyDrive/Sign_Language_Project_Dev_Data/dev_test_data_subset_archive.zip'
# Relative path: the archive is unpacked below the current working directory.
extract_path = 'dev_test_data'

# --- Step 3: Unzip the File ---
print(f"\nExtracting {drive_zip_path}...")
# Check for the archive *before* creating the target directory. Creating the
# directory first would leave an empty folder behind when the archive is
# missing, and any later "was the data extracted?" check on `extract_path`
# would then pass even though nothing was unpacked.
if os.path.exists(drive_zip_path):
    # exist_ok=True makes re-running this cell safe.
    os.makedirs(extract_path, exist_ok=True)
    with zipfile.ZipFile(drive_zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print(f"Extraction complete. Data is in the '{extract_path}' folder.")
else:
    print(f"ERROR: The file was not found at {drive_zip_path}")
    print("Please double-check the file path and folder names in your Google Drive.")

# --- Step 4: Displaying Folder Contents ---
print("\n--- Verifying Extracted Contents ---")

# Walk the extracted tree and print a compact summary of its structure.
if os.path.exists(extract_path):
    for root, dirs, files in os.walk(extract_path):
        # os.walk yields entries in arbitrary order. Sorting `dirs` in place
        # also fixes the order in which os.walk descends, so the report is
        # the same on every run.
        dirs.sort()
        files.sort()

        # Sequence folders (names ending in '-rgb_front') can hold many files,
        # so report a count and one example instead of listing every name.
        if os.path.basename(root).endswith('-rgb_front'):
            print(f"\nFolder: {root}")
            # Count only *.json so the number matches what the message claims.
            json_files = [name for name in files if name.endswith('.json')]
            if json_files:
                print(f"  Contains {len(json_files)} JSON files (e.g., {json_files[0]})")
            else:
                print("  Contains 0 JSON files.")
            continue

        # The extraction root itself gets no "Folder:" header.
        if root != extract_path:
            print(f"\nFolder: {root}")

        if dirs:
            print(f"  Subdirectories: {dirs}")

        if files:
            # Only list files for directories less than two levels below the
            # extraction root, to avoid clutter.
            if root.count(os.sep) - extract_path.count(os.sep) < 2:
                print("  Files:")
                for file in files:
                    print(f"    - {file}")
else:
    print(f"ERROR: The extraction directory '{extract_path}' was not created. Cannot list contents.")

print("\n--- Verification Finished ---")

Mounting Google Drive...
Mounted at /content/drive
Google Drive mounted successfully.

Extracting /content/drive/MyDrive/Sign_Language_Project_Dev_Data/dev_test_data_subset_archive.zip...
Extraction complete. Data is in the 'dev_test_data' folder.

--- Verifying Extracted Contents ---
  Subdirectories: ['json_keypoints', 'rendered_videos', 'raw_videos']

Folder: dev_test_data/json_keypoints
  Subdirectories: ['-fZc293MpJk_0-1-rgb_front', '-fZc293MpJk_2-1-rgb_front', '-fZc293MpJk_4-1-rgb_front', '-fZc293MpJk_3-1-rgb_front', '-fZc293MpJk_5-1-rgb_front']

Folder: dev_test_data/json_keypoints/-fZc293MpJk_0-1-rgb_front
  Contains 17 JSON files (e.g., -fZc293MpJk_0-1-rgb_front_000000000006_keypoints.json)

Folder: dev_test_data/json_keypoints/-fZc293MpJk_2-1-rgb_front
  Contains 412 JSON files (e.g., -fZc293MpJk_2-1-rgb_front_000000000145_keypoints.json)

Folder: dev_test_data/json_keypoints/-fZc293MpJk_4-1-rgb_front
  Contains 398 JSON files (e.g., -fZc293MpJk_4-1-rgb_front_000000000357_key

In [None]:
import json
import numpy as np
import os
from tensorflow.keras.preprocessing.sequence import pad_sequences

def load_and_process_keypoints(sequence_path, fill_missing=False):
    """
    Build a (frames, features) array from the keypoint JSON files of one sequence.

    Every ``*.json`` file in ``sequence_path`` is read in sorted filename order.
    For the first entry of each file's ``people`` list, the pose, face,
    left-hand and right-hand keypoint lists are interpreted as
    (x, y, confidence) triplets; the confidence column is dropped and the
    remaining x, y values are concatenated into one feature vector per frame.

    Args:
        sequence_path: Directory containing the per-frame JSON files.
        fill_missing: Controls frames in which no person was detected
            (``people`` empty or absent). When False (default) such frames are
            skipped, so the output has fewer rows than there are files. When
            True a zero vector is inserted instead, keeping one row per file
            and preserving the frame timing.

    Returns:
        A 2-D NumPy array with one row per kept frame, or an empty 1-D array
        (``size == 0``) when no frame contains a detected person.
    """
    keypoint_keys = (
        'pose_keypoints_2d',
        'face_keypoints_2d',
        'hand_left_keypoints_2d',
        'hand_right_keypoints_2d',
    )

    json_files = sorted(
        os.path.join(sequence_path, name)
        for name in os.listdir(sequence_path)
        if name.endswith('.json')
    )

    # One entry per kept frame; None is a placeholder for "no detection" and is
    # only used when fill_missing is True.
    frames = []
    for file_path in json_files:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # .get() treats a missing 'people' key like an empty detection list
        # instead of raising KeyError.
        people = data.get('people')
        if not people:
            if fill_missing:
                frames.append(None)
            continue

        person = people[0]

        # Reshape each flat list into (points, 3), keep x and y, discard confidence.
        parts = [
            np.array(person[key]).reshape(-1, 3)[:, :2].flatten()
            for key in keypoint_keys
        ]
        frames.append(np.concatenate(parts))

    # The feature length is taken from a real frame rather than hard-coded, so
    # zero vectors always match the frames that do have data.
    template = next((frame for frame in frames if frame is not None), None)
    if template is None:
        return np.array([])

    zero_frame = np.zeros_like(template)
    return np.array([zero_frame if frame is None else frame for frame in frames])

# --- Main Script ---
DATA_PATH = "dev_test_data/json_keypoints"

# os.listdir returns entries in arbitrary order. Sorting makes row i of X refer
# to the same sequence folder on every run, which is needed to align X with
# labels later.
sequence_folders = sorted(
    os.path.join(DATA_PATH, seq)
    for seq in os.listdir(DATA_PATH)
    if os.path.isdir(os.path.join(DATA_PATH, seq))
)

# Load all sequences from the disk
all_sequences = []
# Folder behind each row of X; sequences that yield no data are dropped, so
# this can be shorter than sequence_folders.
loaded_sequence_folders = []
for sequence_path in sequence_folders:
    sequence_keypoints = load_and_process_keypoints(sequence_path)
    if sequence_keypoints.size > 0:
        all_sequences.append(sequence_keypoints)
        loaded_sequence_folders.append(sequence_path)

if not all_sequences:
    print("No data was loaded. Please check the DATA_PATH and folder structure.")
else:
    # Pad sequences to ensure they all have the same length.
    # This is a requirement for creating a single NumPy array for the model.
    # padding='post' appends the padding after the real frames.
    X = pad_sequences(all_sequences, padding='post', dtype='float32')

    # The final shape is (num_samples, max_frames, num_keypoints * 2).
    # This is the "X" data for our model.
    print("--- Data Preprocessing Complete ---")
    print(f"Final data shape (X): {X.shape}")
    print(f"Number of sequences (videos) loaded: {X.shape[0]}")
    print(f"Maximum frames in a sequence (padded): {X.shape[1]}")
    print(f"Number of features per frame (keypoints): {X.shape[2]}")

--- Data Preprocessing Complete ---
Final data shape (X): (5, 412, 274)
Number of sequences (videos) loaded: 5
Maximum frames in a sequence (padded): 412
Number of features per frame (keypoints): 274
