In [None]:
# @title Download the Datasets
#
# This cell downloads the required TEST datasets from Google Drive.

import os
import gdown
import zipfile
import tarfile
import shutil

# --- Configuration ---
# Google Drive share links for the test datasets, taken from the project plan.
# The keys are short labels used only in the progress messages below.
DATA_URLS = {
    "keypoints": "https://drive.google.com/file/d/1g8tzzW5BNPzHXlamuMQOvdwlHRa-29Vp/view?usp=sharing",
    "rgb_clips": "https://drive.google.com/file/d/1qTIXFsu8M55HrCiaGv7vZ7GkdB3ubjaG/view?usp=sharing"
}

# Directory that receives the extracted development data
OUTPUT_DIR = "dev_test_data"


# --- Download and Extract Files ---
# For each dataset: download the archive, unpack it into OUTPUT_DIR, and delete
# the archive only once it has been unpacked successfully.
for name, url in DATA_URLS.items():
    print(f"--- Processing {name} ---")

    # No output path is passed, so gdown chooses the filename and saves into the
    # current directory; fuzzy=True lets it accept a ".../view?usp=sharing" link.
    print(f"Downloading {name} data...")
    downloaded_file_path = gdown.download(url, quiet=False, fuzzy=True)

    if downloaded_file_path is None or not os.path.exists(downloaded_file_path):
        print(f"‚ùå Error: Download failed for {name}. Please check the URL and permissions.")
        continue

    print(f"‚úÖ Download complete: {downloaded_file_path}")

    # --- Unpack Files ---
    print(f"Attempting to unpack {downloaded_file_path}...")
    extracted = False

    # Try to extract as a zip file
    if zipfile.is_zipfile(downloaded_file_path):
        try:
            with zipfile.ZipFile(downloaded_file_path, 'r') as zip_ref:
                zip_ref.extractall(OUTPUT_DIR)
            print(f"‚úÖ Unzipped successfully.")
            extracted = True
        except Exception as e:
            print(f"An error occurred during unzipping: {e}")

    # If not a zip, try to extract as a tar file
    elif tarfile.is_tarfile(downloaded_file_path):
        try:
            with tarfile.open(downloaded_file_path, 'r:*') as tar_ref:
                # The archive comes from the network, so use the 'data' extraction
                # filter where the running Python provides it: it refuses members
                # that would be written outside OUTPUT_DIR.
                if hasattr(tarfile, 'data_filter'):
                    tar_ref.extractall(path=OUTPUT_DIR, filter='data')
                else:
                    tar_ref.extractall(path=OUTPUT_DIR)
            print(f"‚úÖ Extracted tar archive successfully.")
            extracted = True
        except Exception as e:
            print(f"An error occurred during tar extraction: {e}")

    # Neither format matched: report it as an unrecognized file. Extraction
    # errors above are reported by their own messages instead.
    else:
        print(f"‚ùå Error: The file '{downloaded_file_path}' is not a recognized zip or tar archive. Manual inspection may be needed.")

    # --- Clean up the downloaded archive file ---
    # Delete the archive only after a successful extraction; otherwise keep it
    # so it can be inspected without repeating a multi-gigabyte download.
    if extracted:
        if os.path.exists(downloaded_file_path):
            os.remove(downloaded_file_path)
            print(f"Removed archive file: {downloaded_file_path}\n")
    else:
        print(f"Keeping '{downloaded_file_path}' so it can be inspected or extracted manually.\n")


print("All dataset operations are complete.")


--- Processing keypoints ---
Downloading keypoints data...


Downloading...
From (original): https://drive.google.com/uc?id=1g8tzzW5BNPzHXlamuMQOvdwlHRa-29Vp
From (redirected): https://drive.google.com/uc?id=1g8tzzW5BNPzHXlamuMQOvdwlHRa-29Vp&confirm=t&uuid=7ed29dea-277b-4d44-8fc7-c1a2f55c034f
To: /content/test_2D_keypoints.tar.gz
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1.70G/1.70G [00:17<00:00, 96.5MB/s]


‚úÖ Download complete: test_2D_keypoints.tar.gz
Attempting to unpack test_2D_keypoints.tar.gz...
‚úÖ Extracted tar archive successfully.
Removed archive file: test_2D_keypoints.tar.gz

--- Processing rgb_clips ---
Downloading rgb_clips data...


Downloading...
From (original): https://drive.google.com/uc?id=1qTIXFsu8M55HrCiaGv7vZ7GkdB3ubjaG
From (redirected): https://drive.google.com/uc?id=1qTIXFsu8M55HrCiaGv7vZ7GkdB3ubjaG&confirm=t&uuid=b5ba748b-8a2c-4c50-93b0-ebec6518c27f
To: /content/test_rgb_front_clips.zip
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2.41G/2.41G [00:38<00:00, 63.1MB/s]


‚úÖ Download complete: test_rgb_front_clips.zip
Attempting to unpack test_rgb_front_clips.zip...
‚úÖ Unzipped successfully.
Removed archive file: test_rgb_front_clips.zip

All dataset operations are complete.


In [None]:
# @title Verify Data Setup
#
# This cell checks if the data folders have been created successfully,
# meeting the success criterion.

# --- Verification ---
# Lists what the download cell left in OUTPUT_DIR and reports whether any
# dataset content is present.
print(f"Verifying contents of '{OUTPUT_DIR}':")

try:
    # Hidden entries (for example the '.ipynb_checkpoints' folder that shows up
    # in notebook working directories) are not dataset content, so they must not
    # make an empty extraction look like a success. Sorting keeps the listing
    # stable, since os.listdir() returns entries in arbitrary order.
    contents = sorted(
        entry for entry in os.listdir(OUTPUT_DIR) if not entry.startswith('.')
    )

    if contents:
        print("üéØ Success! The following files/folders are in the development directory:")
        for item in contents:
            print(f"- {item}")
    else:
        print("‚ö†Ô∏è Warning: The development directory is empty.")

except FileNotFoundError:
    # Raised by os.listdir() when OUTPUT_DIR was never created.
    print(f"‚ùå Error: The directory '{OUTPUT_DIR}' was not found.")



Verifying contents of 'dev_test_data':
üéØ Success! The following files/folders are in the development directory:
- openpose_output
- .ipynb_checkpoints
- raw_videos


In [None]:
# @title Now it is time to save some data to Drive
#
# This cell connects your Google Drive to this Colab notebook.
# You will be prompted to authorize this connection.

from google.colab import drive
# Attach Google Drive to the Colab filesystem at the mount point below.
mount_point = '/content/drive'
drive.mount(mount_point)

# Reached only when drive.mount() returned without raising.
print("\n‚úÖ Google Drive mounted successfully!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

‚úÖ Google Drive mounted successfully!


In [None]:
# @title Final Solution: Accurately Subset Data
#
# üìù **Goal:** Create a small but complete development subset with all relevant files (JSON keypoints, raw videos, etc.) based on the data structure we have confirmed is correct.
#
# **Action:** This script will automatically pick the first 5 video sequences from `openpose_output/json` and then find and copy all their corresponding relevant files.

import os
import shutil

# --- 1. Configuration (based on confirmed correct paths) ---
JSON_SOURCE_DIR = 'dev_test_data/openpose_output/json'
VIDEO_SOURCE_DIR = 'dev_test_data/openpose_output/video'
RAW_VIDEO_SOURCE_DIR = 'dev_test_data/raw_videos'

SUBSET_DIR = 'dev_test_data_subset'
NUM_SEQUENCES_TO_KEEP = 5 # Select 5 sequences as our development set, this quantity is entirely sufficient


def create_data_subset(json_source_dir=JSON_SOURCE_DIR,
                       video_source_dir=VIDEO_SOURCE_DIR,
                       raw_video_source_dir=RAW_VIDEO_SOURCE_DIR,
                       subset_dir=SUBSET_DIR,
                       num_sequences=NUM_SEQUENCES_TO_KEEP):
    """Copy the first `num_sequences` sequences and their videos into `subset_dir`.

    A sequence is a sub-folder of `json_source_dir`; sequences are taken in
    sorted name order. For each one, three things are copied when present:
      * the keypoint folder            -> <subset_dir>/json_keypoints/<sequence>/
      * `<sequence>.mp4` from `video_source_dir`     -> <subset_dir>/rendered_videos/
      * `<sequence>.mp4` from `raw_video_source_dir` -> <subset_dir>/raw_videos/
        (falling back to the name without a trailing '-rgb_front')

    Any existing `subset_dir` is deleted and rebuilt, but only after
    `json_source_dir` has been confirmed to exist, so a missing source never
    destroys a previously built subset.

    Args:
        json_source_dir: Directory holding one keypoint folder per sequence.
        video_source_dir: Directory holding the rendered videos.
        raw_video_source_dir: Directory holding the raw videos.
        subset_dir: Destination directory (removed and recreated).
        num_sequences: Maximum number of sequences to copy.

    Returns:
        The names of the sequences for which all three items were copied,
        in processing order. Empty when the source directory is missing.
    """
    print("--- Starting to create precise data subsets ---")

    # --- 2. Validate the Source BEFORE Touching the Existing Subset ---
    if not os.path.isdir(json_source_dir):
        print(f"‚ùå Error: Keypoint data source directory '{json_source_dir}' does not exist! Cannot proceed.")
        return []

    # --- 3. Clean and Create Subset Directory Structure ---
    if os.path.exists(subset_dir):
        shutil.rmtree(subset_dir)
        print(f"Cleaned up old subset directory: {subset_dir}")

    # Create a directory structure similar to the source data for clarity
    subset_json_dir = os.path.join(subset_dir, 'json_keypoints')
    subset_video_dir = os.path.join(subset_dir, 'rendered_videos')
    subset_raw_video_dir = os.path.join(subset_dir, 'raw_videos')
    for directory in (subset_json_dir, subset_video_dir, subset_raw_video_dir):
        os.makedirs(directory)
    print(f"Created new subset directory structure at: {subset_dir}")

    # --- 4. Select Video Sequences to Copy ---
    # Every sub-folder of the json directory is one sequence; sorting makes the
    # selection reproducible.
    all_sequences = sorted(
        d for d in os.listdir(json_source_dir)
        if os.path.isdir(os.path.join(json_source_dir, d))
    )
    sequences_to_copy = all_sequences[:num_sequences]

    print(f"\nFound {len(all_sequences)} video sequences in '{json_source_dir}'.")
    print(f"Selecting the first {len(sequences_to_copy)} as the subset:")
    for seq_name in sequences_to_copy:
        print(f"  - {seq_name}")

    # --- 5. Copy All Related Files ---
    raw_suffix = '-rgb_front'
    complete_sequences = []
    for seq_name in sequences_to_copy:
        print(f"\n--- Processing sequence: {seq_name} ---")
        all_found = True

        # 1. Copy JSON keypoint folder
        source_json_path = os.path.join(json_source_dir, seq_name)
        if os.path.isdir(source_json_path):
            shutil.copytree(source_json_path, os.path.join(subset_json_dir, seq_name))
            print(f"    ‚úÖ Keypoint data (JSONs) copied")
        else:
            print(f"    ‚ö†Ô∏è Warning: Keypoint folder not found {source_json_path}")
            all_found = False

        # 2. Copy rendered video (expected to be named <sequence>.mp4)
        video_filename = f"{seq_name}.mp4"
        source_video_path = os.path.join(video_source_dir, video_filename)
        if os.path.exists(source_video_path):
            shutil.copy(source_video_path, os.path.join(subset_video_dir, video_filename))
            print(f"    ‚úÖ Rendered video copied")
        else:
            print(f"    ‚ö†Ô∏è Warning: Rendered video not found {source_video_path}")
            all_found = False

        # 3. Copy raw video: try <sequence>.mp4 first, then the same name with
        #    a trailing '-rgb_front' removed (only as a suffix, never mid-name).
        raw_video_filename = f"{seq_name}.mp4"
        source_raw_video_path = os.path.join(raw_video_source_dir, raw_video_filename)
        if seq_name.endswith(raw_suffix):
            base_name = seq_name[:-len(raw_suffix)]
        else:
            base_name = seq_name
        raw_video_filename_alt = f"{base_name}.mp4"
        source_raw_video_path_alt = os.path.join(raw_video_source_dir, raw_video_filename_alt)

        if os.path.exists(source_raw_video_path):
            shutil.copy(source_raw_video_path, os.path.join(subset_raw_video_dir, raw_video_filename))
            print(f"    ‚úÖ Raw video copied")
        elif os.path.exists(source_raw_video_path_alt):
            shutil.copy(source_raw_video_path_alt, os.path.join(subset_raw_video_dir, raw_video_filename_alt))
            print(f"    ‚úÖ Raw video copied (alternate name: {raw_video_filename_alt})")
        else:
            print(f"    ‚ö†Ô∏è Warning: Raw video not found {source_raw_video_path} or {source_raw_video_path_alt}")
            all_found = False

        # Only sequences with nothing missing count as successfully processed.
        if all_found:
            complete_sequences.append(seq_name)

    # --- 6. Summary ---
    incomplete_sequences = [s for s in sequences_to_copy if s not in complete_sequences]

    print(f"\n--- üéØ Operation Complete! ---")
    print(f"Successfully processed {len(complete_sequences)} video sequences.")
    if not sequences_to_copy:
        print(f"Warning: no sequences were found in '{json_source_dir}', so the subset is empty.")
    elif incomplete_sequences:
        print(f"Warning: {len(incomplete_sequences)} sequence(s) are missing at least one file: {', '.join(incomplete_sequences)}")
    else:
        print(f"A complete, compact development dataset is ready in '{subset_dir}'.")
    if sequences_to_copy:
        print("Now, you can run the 'Package and Upload to Google Drive' cell to save it permanently.")

    return complete_sequences


complete_sequences = create_data_subset()



--- Starting to create precise data subsets ---
Cleaned up old subset directory: dev_test_data_subset
Created new subset directory structure at: dev_test_data_subset

Found 2343 video sequences in 'dev_test_data/openpose_output/json'.
Selecting the first 5 as the subset:
  - -fZc293MpJk_0-1-rgb_front
  - -fZc293MpJk_2-1-rgb_front
  - -fZc293MpJk_3-1-rgb_front
  - -fZc293MpJk_4-1-rgb_front
  - -fZc293MpJk_5-1-rgb_front

--- Processing sequence: -fZc293MpJk_0-1-rgb_front ---
    ‚úÖ Keypoint data (JSONs) copied
    ‚úÖ Rendered video copied
    ‚úÖ Raw video copied

--- Processing sequence: -fZc293MpJk_2-1-rgb_front ---
    ‚úÖ Keypoint data (JSONs) copied
    ‚úÖ Rendered video copied
    ‚úÖ Raw video copied

--- Processing sequence: -fZc293MpJk_3-1-rgb_front ---
    ‚úÖ Keypoint data (JSONs) copied
    ‚úÖ Rendered video copied
    ‚úÖ Raw video copied

--- Processing sequence: -fZc293MpJk_4-1-rgb_front ---
    ‚úÖ Keypoint data (JSONs) copied
    ‚úÖ Rendered video copied
    ‚úÖ Raw

In [None]:
# @title Package Development Subset and Save to Google Drive
#
# üìù **Goal:** Package the final, correct 'dev_test_data_subset' folder into a zip file and permanently save it to your Google Drive.
#
# **Action:** This cell will automatically complete the entire process of mounting Drive, packaging, and copying.

from google.colab import drive
import shutil
import os

# --- 1. Configuration ---
SOURCE_DIR_TO_PACKAGE = 'dev_test_data_subset'
ARCHIVE_NAME = 'dev_test_data_subset_archive'
# You can customize the folder name saved in Google Drive
DRIVE_FOLDER_PATH = '/content/drive/MyDrive/Sign_Language_Project_Dev_Data'

# --- 2. Check that the Source Folder Exists and Actually Holds Files ---
if not os.path.isdir(SOURCE_DIR_TO_PACKAGE):
    print(f"‚ùå Error: Source folder '{SOURCE_DIR_TO_PACKAGE}' does not exist. Please ensure the previous step ran successfully.")
elif not any(files for _, _, files in os.walk(SOURCE_DIR_TO_PACKAGE)):
    # A folder tree with no files in it would otherwise be zipped and reported
    # as a successfully saved dataset.
    print(f"Error: Source folder '{SOURCE_DIR_TO_PACKAGE}' contains no files, so there is nothing to package. Please re-run the subset step.")
else:
    # --- 3. Package the Folder as a .zip File ---
    print(f"Packaging '{SOURCE_DIR_TO_PACKAGE}' into '{ARCHIVE_NAME}.zip'...")
    # make_archive returns the name of the archive it wrote; use that rather
    # than rebuilding the filename by hand.
    source_file_path = shutil.make_archive(ARCHIVE_NAME, 'zip', SOURCE_DIR_TO_PACKAGE)
    print("‚úÖ Packaging successful!")

    # --- 4. Mount Google Drive ---
    print("\nConnecting to your Google Drive...")
    drive.mount('/content/drive')

    # --- 5. Create Destination Folder in Google Drive (if it doesn't exist) ---
    if not os.path.isdir(DRIVE_FOLDER_PATH):
        print(f"Creating new folder in your Google Drive: {DRIVE_FOLDER_PATH}")
    # exist_ok=True makes this a no-op when the folder is already there.
    os.makedirs(DRIVE_FOLDER_PATH, exist_ok=True)

    # --- 6. Copy the Packaged File to Google Drive ---
    # Only the file name goes under the Drive folder, even if the archive path
    # carries directory components.
    archive_filename = os.path.basename(source_file_path)
    destination_path = os.path.join(DRIVE_FOLDER_PATH, archive_filename)

    print(f"\nCopying file to: {destination_path}...")
    if os.path.exists(source_file_path):
        shutil.copy(source_file_path, destination_path)
        print(f"\n--- üéØ Operation Successful! ---")
        print(f"Development dataset '{archive_filename}' successfully saved to your Google Drive!")
        print("You can now directly download and extract this file from Google Drive in the future, without needing to re-download the original data.")
    else:
        print(f"‚ùå Error: Packaged file '{source_file_path}' not found.")



Packaging 'dev_test_data_subset' into 'dev_test_data_subset_archive.zip'...
‚úÖ Packaging successful!

Connecting to your Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Creating new folder in your Google Drive: /content/drive/MyDrive/Sign_Language_Project_Dev_Data

Copying file to: /content/drive/MyDrive/Sign_Language_Project_Dev_Data/dev_test_data_subset_archive.zip...

--- üéØ Operation Successful! ---
Development dataset 'dev_test_data_subset_archive.zip' successfully saved to your Google Drive!
You can now directly download and extract this file from Google Drive in the future, without needing to re-download the original data.
