In [1]:
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import sys

from asl_data_pipeline.models.s3 import InputType
from asl_data_pipeline.preprocessing.hand_landmark_extractor.extractor import (
    HandLandmarkExtractor,
    HandLandmarkExtractorConfig,
)
from asl_data_pipeline.preprocessing.hand_sa_converter.converter import HandSAConverter
from asl_data_pipeline.preprocessing.hand_sa_converter.models import HandSAConverterConfig
from asl_data_pipeline.preprocessing.pose_extractor.extractor import PoseExtractor, PoseExtractorConfig
from asl_data_pipeline.preprocessing.key_frame_extractor.extractor import KeyFrameExtractor, KeyFrameExtractorConfig
from asl_data_pipeline.preprocessing.key_frame_extractor.models import S3Config
from asl_data_pipeline.preprocessing.pose_sa_converter.converter import PoseSAConverter
from asl_data_pipeline.preprocessing.pose_sa_converter.models import PoseSAConverterConfig

In [2]:
# Set up application-wide logging for the notebook and imported modules.
logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s] [%(levelname)s] [%(name)s] %(message)s",
    stream=sys.stdout,
)

Extract representative frames from input videos stored on S3 and create a JSONL manifest listing all generated frames. This manifest is the input for downstream landmark extraction steps.

# Step 1: Frame Extraction

In [None]:
# Extract representative key frames from source videos on S3.
# Inputs:
#   - s3.input_uri: S3 prefix containing input videos (InputType.FOLDER)
# Outputs:
#   - Frames written to s3.output_uri preserving structure
#   - A JSONL manifest (manifest.jsonl) listing frame S3 paths and metadata
# Please check out the data model of KeyFrameExtractorConfig to better control this particular step to your liking.
key_frame_extractor_config = KeyFrameExtractorConfig(
    s3=S3Config(
        input_uri="s3://bucket-name/input-folder/",
        output_uri="s3://bucket-name/output-folder/video_keyframes/",
        aws_profile="aws-profile-name",
        input_type=InputType.FOLDER,
    ),
    max_videos=4,
    force_write=True,
)
frame_extractor = KeyFrameExtractor(key_frame_extractor_config)
frame_extractor.process_videos_batch()


[2025-11-14 05:56:09,339] [INFO] [botocore.credentials] Found credentials in shared credentials file: ~/.aws/credentials
[2025-11-14 05:56:10,200] [INFO] [asl_data_pipeline.preprocessing.key_frame_extractor.extractor] KeyFrameExtractor initialized with input_uri: s3://bucket-name/input-folder/, output_uri: s3://bucket-name/output-folder/video_keyframes/, temp_dir: /var/folders/dd/rh7q5rb51c14q3dbl3rg0czm0000gp/T/keyframe_extractor, max_workers: 4
[2025-11-14 05:56:10,300] [INFO] [asl_data_pipeline.preprocessing.key_frame_extractor.extractor] Starting batch processing for 1 videos with 4 max workers
[2025-11-14 05:56:44,283] [INFO] [asl_data_pipeline.preprocessing.key_frame_extractor.extractor] Video input-folder/5xwRgAwJrmW3VhrBTNhicgWpXS33_1740260341-707000000.webm processed. Success: True. Progress: 1/1
[2025-11-14 05:56:44,829] [INFO] [asl_data_pipeline.preprocessing.key_frame_extractor.extractor] Batch processing complete. Total videos: 1, successful: 1, failed: 0, total time: 34.2

BatchExtractionResult(total_videos=1, successful_extractions=1, failed_extractions=0, results=[ExtractionResult(video_metadata=VideoMetadata(video_filename='5xwRgAwJrmW3VhrBTNhicgWpXS33_1740260341-707000000.webm', video_s3_key='input-folder/5xwRgAwJrmW3VhrBTNhicgWpXS33_1740260341-707000000.webm', total_frame_count=270, extracted_frame_count=36, extraction_method='motion_analysis_with_sharpness', parameters=ExtractionParameters(peak_prominence=0.1, active_region_buffer=5, trough_distance=5, trough_window_size=3), processing_timestamp=datetime.datetime(2025, 11, 14, 13, 56, 32, 622960), motion_scores=[0.3936011493206024, 0.22314812242984772, 0.3628375828266144, 0.3193560838699341, 0.2821253836154938, 0.33778566122055054, 0.2192496359348297, 0.20522093772888184, 0.1879027932882309, 0.23596736788749695, 0.3702801465988159, 0.5381451845169067, 0.5150236487388611, 0.376987099647522, 0.3710940480232239, 0.24265794456005096, 0.4631064534187317, 0.3526644706726074, 0.3029594123363495, 0.2933424

Detect 21-keypoint hand landmarks on the frames listed in the key-frames manifest, then write per-frame JSON outputs and a new JSONL manifest for use by later steps.

# Step 2: Hand Landmark Extraction

In [None]:
# Run hand landmark detection on each key frame and save results to S3.
# Inputs:
#   - s3.input_uri: JSONL manifest of key frames (InputType.MANIFEST)
# Outputs:
#   - Per-frame JSON files with 21-keypoint hand landmarks
#   - New manifest.jsonl under s3.output_uri indexing produced annotations
# Please check out the data model of HandLandmarkExtractorConfig to better control this particular step to your liking.
hand_landmark_extractor_config = HandLandmarkExtractorConfig(
    s3=S3Config(
        input_uri="s3://bucket-name/output-folder/video_keyframes/manifest.jsonl",
        output_uri="s3://bucket-name/output-folder/video_hand_landmarks_extracted/",
        aws_profile="aws-profile-name",
        input_type=InputType.MANIFEST,
    ),
    out_of_bound_removal=True,
    max_workers=4,
    force_write=True,
)
hand_landmark_extractor = HandLandmarkExtractor(hand_landmark_extractor_config)
hand_landmark_extractor.process_batch()

[2025-11-14 05:59:43,877] [INFO] [botocore.credentials] Found credentials in shared credentials file: ~/.aws/credentials
[2025-11-14 05:59:44,435] [INFO] [asl_data_pipeline.preprocessing.hand_landmark_extractor.extractor] HandLandmarkExtractor initialized with input_uri: s3://bucket-name/output-folder/video_keyframes/manifest.jsonl, output_uri: s3://bucket-name/output-folder/video_hand_landmarks_extracted/, temp_dir: /var/folders/dd/rh7q5rb51c14q3dbl3rg0czm0000gp/T/hand_landmark_extractor, max_workers: 4


I0000 00:00:1763128784.435216 5857045 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 90.5), renderer: Apple M4
W0000 00:00:1763128784.446271 5862170 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1763128784.453263 5862167 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


[2025-11-14 05:59:48,407] [INFO] [asl_data_pipeline.preprocessing.hand_landmark_extractor.extractor] Removed 12 out-of-bound hand landmarks from 000109.jpg (image size: 1080x1080)
[2025-11-14 05:59:49,312] [INFO] [asl_data_pipeline.preprocessing.hand_landmark_extractor.extractor] Removed 11 out-of-bound hand landmarks from 000119.jpg (image size: 1080x1080)
[2025-11-14 05:59:49,312] [INFO] [asl_data_pipeline.preprocessing.hand_landmark_extractor.extractor] Removed 11 out-of-bound hand landmarks from 000126.jpg (image size: 1080x1080)
[2025-11-14 05:59:49,313] [INFO] [asl_data_pipeline.preprocessing.hand_landmark_extractor.extractor] Removed 11 out-of-bound hand landmarks from 000133.jpg (image size: 1080x1080)
[2025-11-14 05:59:49,349] [INFO] [asl_data_pipeline.preprocessing.hand_landmark_extractor.extractor] Removed 11 out-of-bound hand landmarks from 000141.jpg (image size: 1080x1080)
[2025-11-14 05:59:54,394] [INFO] [asl_data_pipeline.preprocessing.hand_landmark_extractor.extractor]

BatchHandLandmarkResult(total_frames=36, successful_extractions=36, no_hands_detected=0, unreadable_frames=0, manifest_s3_key='output-folder/video_hand_landmarks_extracted/manifest.jsonl', frame_results=[HandLandmarkMetadata(frame_source_s3_key='output-folder/video_keyframes/5xwRgAwJrmW3VhrBTNhicgWpXS33_1740260341-707000000/artifacts/000016.jpg', landmarks_json_s3_key='output-folder/video_hand_landmarks_extracted/5xwRgAwJrmW3VhrBTNhicgWpXS33_1740260341-707000000/landmarks/000016.json', annotated_image_s3_key='output-folder/video_hand_landmarks_extracted/5xwRgAwJrmW3VhrBTNhicgWpXS33_1740260341-707000000/annotated/000016.jpg', num_hands=1), HandLandmarkMetadata(frame_source_s3_key='output-folder/video_keyframes/5xwRgAwJrmW3VhrBTNhicgWpXS33_1740260341-707000000/artifacts/000008.jpg', landmarks_json_s3_key='output-folder/video_hand_landmarks_extracted/5xwRgAwJrmW3VhrBTNhicgWpXS33_1740260341-707000000/landmarks/000008.json', annotated_image_s3_key='output-folder/video_hand_landmarks_extract

Detect 33-keypoint whole-body pose landmarks for each key frame and emit per-frame JSON plus a manifest that indexes the generated pose annotations.

# Step 3: Pose Extraction

In [None]:
# Run whole-body pose landmark detection on each key frame and save to S3.
# Inputs:
#   - s3.input_uri: JSONL manifest produced by the key-frame step (InputType.MANIFEST)
# Outputs:
#   - Per-frame JSON with 33-keypoint pose landmarks
#   - New manifest.jsonl under s3.output_uri listing pose annotation files
# Please check out the data model of PoseExtractorConfig to better control this particular step to your liking.
pose_extractor_config = PoseExtractorConfig(
    s3=S3Config(
        input_uri="s3://bucket-name/output-folder/video_keyframes/manifest.jsonl",
        output_uri="s3://bucket-name/output-folder/video_pose_landmarks_extracted/",
        aws_profile="aws-profile-name",
        input_type=InputType.MANIFEST,
    ),
    out_of_bound_removal=True,
    max_workers=4,
    force_write=True,
)
pose_extractor = PoseExtractor(pose_extractor_config)
pose_extractor.process_batch()

[2025-11-14 06:00:05,279] [INFO] [botocore.credentials] Found credentials in shared credentials file: ~/.aws/credentials
[2025-11-14 06:00:05,827] [INFO] [asl_data_pipeline.preprocessing.pose_extractor.extractor] PoseExtractor initialized with input_uri: s3://bucket-name/output-folder/video_keyframes/manifest.jsonl, output_uri: s3://bucket-name/output-folder/video_pose_landmarks_extracted/, temp_dir: /var/folders/dd/rh7q5rb51c14q3dbl3rg0czm0000gp/T/pose_extractor, max_workers: 4


I0000 00:00:1763128805.827506 5857045 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 90.5), renderer: Apple M4
W0000 00:00:1763128805.884892 5862481 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1763128805.928980 5862488 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


[2025-11-14 06:00:06,545] [INFO] [asl_data_pipeline.preprocessing.pose_extractor.extractor] Removed 16 out-of-bound pose landmarks from 000016.jpg (image size: 1080x1080)
[2025-11-14 06:00:06,822] [INFO] [asl_data_pipeline.preprocessing.pose_extractor.extractor] Removed 16 out-of-bound pose landmarks from 000022.jpg (image size: 1080x1080)
[2025-11-14 06:00:06,822] [INFO] [asl_data_pipeline.preprocessing.pose_extractor.extractor] Removed 16 out-of-bound pose landmarks from 000008.jpg (image size: 1080x1080)
[2025-11-14 06:00:06,822] [INFO] [asl_data_pipeline.preprocessing.pose_extractor.extractor] Removed 16 out-of-bound pose landmarks from 000000.jpg (image size: 1080x1080)
[2025-11-14 06:00:07,798] [INFO] [asl_data_pipeline.preprocessing.pose_extractor.extractor] Removed 16 out-of-bound pose landmarks from 000027.jpg (image size: 1080x1080)
[2025-11-14 06:00:08,014] [INFO] [asl_data_pipeline.preprocessing.pose_extractor.extractor] Removed 15 out-of-bound pose landmarks from 000038.jp

BatchPoseResult(total_frames=36, successful_extractions=36, no_poses_detected=0, unreadable_frames=0, manifest_s3_key='output-folder/video_pose_landmarks_extracted/manifest.jsonl', frame_results=[PoseMetadata(frame_source_s3_key='output-folder/video_keyframes/5xwRgAwJrmW3VhrBTNhicgWpXS33_1740260341-707000000/artifacts/000016.jpg', landmarks_json_s3_key='output-folder/video_pose_landmarks_extracted/5xwRgAwJrmW3VhrBTNhicgWpXS33_1740260341-707000000/landmarks/000016.json', annotated_image_s3_key='output-folder/video_pose_landmarks_extracted/5xwRgAwJrmW3VhrBTNhicgWpXS33_1740260341-707000000/annotated/000016.jpg', num_poses=1), PoseMetadata(frame_source_s3_key='output-folder/video_keyframes/5xwRgAwJrmW3VhrBTNhicgWpXS33_1740260341-707000000/artifacts/000008.jpg', landmarks_json_s3_key='output-folder/video_pose_landmarks_extracted/5xwRgAwJrmW3VhrBTNhicgWpXS33_1740260341-707000000/landmarks/000008.json', annotated_image_s3_key='output-folder/video_pose_landmarks_extracted/5xwRgAwJrmW3VhrBTNhic

Convert the extracted hand landmark JSON into SuperAnnotate (SA) keypoint annotations compatible with SA tools, producing per-image SA JSON alongside a manifest for import.

# Step 4: Hand SuperAnnotate format conversion

In [None]:
# Transform hand landmark JSON into SuperAnnotate (SA) format for labeling tools.
# Inputs:
#   - s3.input_uri: manifest produced by the hand landmark extractor (InputType.MANIFEST)
# Outputs:
#   - Per-image SA JSON files under s3.output_uri
#   - New manifest.jsonl indexing the generated SA annotations
# Please check out the data model of HandSAConverterConfig to better control this particular step to your liking.
hand_sa_converter_config = HandSAConverterConfig(
    s3=S3Config(
        input_uri="s3://bucket-name/output-folder/video_hand_landmarks_extracted/manifest.jsonl",  # noqa: E501
        output_uri="s3://bucket-name/output-folder/video_hand_landmarks_sa/",
        aws_profile="aws-profile-name",
        input_type=InputType.MANIFEST,
    ),
    force_write=True,
    max_workers=4,
)
hand_sa_converter = HandSAConverter(hand_sa_converter_config)
hand_sa_converter.process_batch()


[2025-11-14 06:01:42,035] [INFO] [botocore.credentials] Found credentials in shared credentials file: ~/.aws/credentials
[2025-11-14 06:01:42,672] [INFO] [asl_data_pipeline.preprocessing.hand_sa_converter.converter] HandSAConverter initialized with input_uri: s3://bucket-name/output-folder/video_hand_landmarks_extracted/manifest.jsonl, output_uri: s3://bucket-name/output-folder/video_hand_landmarks_sa/, temp_dir: /var/folders/dd/rh7q5rb51c14q3dbl3rg0czm0000gp/T/sa_format_converter, max_workers: 4
[2025-11-14 06:01:43,197] [INFO] [asl_data_pipeline.preprocessing.hand_sa_converter.converter] Converted 000016 to SuperAnnotate format
[2025-11-14 06:01:43,386] [INFO] [asl_data_pipeline.preprocessing.hand_sa_converter.converter] Converted 000008 to SuperAnnotate format
[2025-11-14 06:01:43,413] [INFO] [asl_data_pipeline.preprocessing.hand_sa_converter.converter] Converted 000022 to SuperAnnotate format
[2025-11-14 06:01:43,444] [INFO] [asl_data_pipeline.preprocessing.hand_sa_converter.conver

{'total_files': 36,
 'successful_conversions': 36,
 'failed_conversions': 0,
 'output_files': ['output-folder/video_hand_landmarks_sa/hand_superannotate/5xwRgAwJrmW3VhrBTNhicgWpXS33_1740260341-707000000_000016.json',
  'output-folder/video_hand_landmarks_sa/hand_superannotate/5xwRgAwJrmW3VhrBTNhicgWpXS33_1740260341-707000000_000008.json',
  'output-folder/video_hand_landmarks_sa/hand_superannotate/5xwRgAwJrmW3VhrBTNhicgWpXS33_1740260341-707000000_000022.json',
  'output-folder/video_hand_landmarks_sa/hand_superannotate/5xwRgAwJrmW3VhrBTNhicgWpXS33_1740260341-707000000_000000.json',
  'output-folder/video_hand_landmarks_sa/hand_superannotate/5xwRgAwJrmW3VhrBTNhicgWpXS33_1740260341-707000000_000027.json',
  'output-folder/video_hand_landmarks_sa/hand_superannotate/5xwRgAwJrmW3VhrBTNhicgWpXS33_1740260341-707000000_000038.json',
  'output-folder/video_hand_landmarks_sa/hand_superannotate/5xwRgAwJrmW3VhrBTNhicgWpXS33_1740260341-707000000_000052.json',
  'output-folder/video_hand_landmarks_s

Convert pose landmark JSON into SuperAnnotate (SA) format so it can be reviewed or refined in SA, producing per-image SA JSON and a manifest suitable for import.

# Step 5: Pose SuperAnnotate format conversion

In [None]:
# Transform pose landmark JSON into SuperAnnotate (SA) format for SA tooling.
# Inputs:
#   - s3.input_uri: manifest from the pose landmark extractor (InputType.MANIFEST)
# Outputs:
#   - Per-image SA JSON files for pose under s3.output_uri
#   - New manifest.jsonl indexing the pose SA annotations
# Please check out the data model of PoseSAConverterConfig to better control this particular step to your liking.

pose_sa_converter_config = PoseSAConverterConfig(
    s3=S3Config(
        input_uri="s3://bucket-name/output-folder/video_pose_landmarks_extracted/manifest.jsonl",  # noqa: E501
        output_uri="s3://bucket-name/output-folder/video_pose_landmarks_sa/",
        aws_profile="aws-profile-name",
        input_type=InputType.MANIFEST,
    ),
    force_write=True,
    max_workers=4,
)
pose_sa_converter = PoseSAConverter(pose_sa_converter_config)
pose_sa_converter.process_batch()

[2025-11-14 06:02:39,772] [INFO] [botocore.credentials] Found credentials in shared credentials file: ~/.aws/credentials
[2025-11-14 06:02:40,356] [INFO] [asl_data_pipeline.preprocessing.pose_sa_converter.converter] PoseSAConverter initialized with input_uri: s3://bucket-name/output-folder/video_pose_landmarks_extracted/manifest.jsonl, output_uri: s3://bucket-name/output-folder/video_pose_landmarks_sa/, temp_dir: /var/folders/dd/rh7q5rb51c14q3dbl3rg0czm0000gp/T/pose_sa_converter, max_workers: 4
[2025-11-14 06:02:40,856] [INFO] [asl_data_pipeline.preprocessing.pose_sa_converter.converter] Converted 000008 to SuperAnnotate format
[2025-11-14 06:02:41,072] [INFO] [asl_data_pipeline.preprocessing.pose_sa_converter.converter] Converted 000016 to SuperAnnotate format
[2025-11-14 06:02:41,149] [INFO] [asl_data_pipeline.preprocessing.pose_sa_converter.converter] Converted 000000 to SuperAnnotate format
[2025-11-14 06:02:41,231] [INFO] [asl_data_pipeline.preprocessing.pose_sa_converter.converte

{'total_files': 36,
 'successful_conversions': 36,
 'failed_conversions': 0,
 'output_files': ['output-folder/video_pose_landmarks_sa/pose_superannotate/5xwRgAwJrmW3VhrBTNhicgWpXS33_1740260341-707000000_000008.json',
  'output-folder/video_pose_landmarks_sa/pose_superannotate/5xwRgAwJrmW3VhrBTNhicgWpXS33_1740260341-707000000_000016.json',
  'output-folder/video_pose_landmarks_sa/pose_superannotate/5xwRgAwJrmW3VhrBTNhicgWpXS33_1740260341-707000000_000000.json',
  'output-folder/video_pose_landmarks_sa/pose_superannotate/5xwRgAwJrmW3VhrBTNhicgWpXS33_1740260341-707000000_000022.json',
  'output-folder/video_pose_landmarks_sa/pose_superannotate/5xwRgAwJrmW3VhrBTNhicgWpXS33_1740260341-707000000_000027.json',
  'output-folder/video_pose_landmarks_sa/pose_superannotate/5xwRgAwJrmW3VhrBTNhicgWpXS33_1740260341-707000000_000047.json',
  'output-folder/video_pose_landmarks_sa/pose_superannotate/5xwRgAwJrmW3VhrBTNhicgWpXS33_1740260341-707000000_000038.json',
  'output-folder/video_pose_landmarks_s

# Step 6: Data Postprocessing (Optional)

In [None]:
# After annotating data in SuperAnnotate and exporting, use the postprocessing script 
# to redact emails and remove metadata before sharing.
#
# Compatible SuperAnnotate Export Formats: 
#   - JSON (single annotation)
#   - JSONL (one annotation per line)
#   Both formats are fully supported by the postprocessing script.
#
# Usage:
#   python scripts/asl_data_postprocessing.py <sa_export>.json -o <output>.json
#   python scripts/asl_data_postprocessing.py <sa_export>.jsonl -o <output>.jsonl
#
# For detailed usage, see scripts/asl_data_postprocessing.py or run:
#   python scripts/asl_data_postprocessing.py --help