In [1]:
# %pip installs into the environment of the running kernel; the version is pinned so a
# re-run resolves the same MediaPipe release instead of whatever is newest that day.
%pip install -q mediapipe==0.10.21

Collecting mediapipe
  Downloading mediapipe-0.10.21-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (9.7 kB)
Collecting protobuf<5,>=4.25.3 (from mediapipe)
  Downloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting sounddevice>=0.4.4 (from mediapipe)
  Downloading sounddevice-0.5.3-py3-none-any.whl.metadata (1.6 kB)
INFO: pip is looking at multiple versions of opencv-contrib-python to determine which version is compatible with other requirements. This could take a while.
Collecting opencv-contrib-python (from mediapipe)
  Downloading opencv_contrib_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Downloading mediapipe-0.10.21-cp311-cp311-manylinux_2_28_x86_64.whl (35.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.6/35.6 MB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
[2K   [90m━━━━━━━━━

In [2]:
import os
from IPython.display import display, clear_output
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import mediapipe as mp
from tqdm.auto import tqdm

2025-12-03 13:43:05.626090: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764769385.901996      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764769385.986510      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Shorthand for MediaPipe's holistic solution module; process_video creates its
# Holistic model through this alias.
mp_holistic = mp.solutions.holistic


In [4]:


def extract_and_normalize_keypoints(results):
    """
    Turn one holistic detection result into a flat, body-relative feature vector.

    The vector contains 48 (x, y, z) points in this order: the six upper-body
    pose landmarks (indices 11-16: shoulders, elbows, wrists), then 21 left-hand
    landmarks, then 21 right-hand landmarks. Each detected point is shifted so
    that the midpoint between the shoulders becomes the origin and is divided by
    the shoulder-to-shoulder distance. Leg and face landmarks are not included.

    Args:
        results: object exposing `pose_landmarks`, `left_hand_landmarks` and
            `right_hand_landmarks`. Each attribute is either falsy (nothing
            detected) or has a `.landmark` iterable whose items carry `x`, `y`
            and `z` attributes.

    Returns:
        1-D NumPy array with 144 values (48 points * 3 coordinates). It is all
        zeros when no pose was detected, and the 63-value block of a hand that
        was not detected is all zeros.
    """
    left_shoulder_idx, right_shoulder_idx = 11, 12
    # Shoulders (11, 12), elbows (13, 14), wrists (15, 16).
    kept_pose_indices = [11, 12, 13, 14, 15, 16]
    hand_point_count = 21

    def as_xyz(landmark_list):
        # One row per landmark, columns are x, y, z.
        return np.array([[point.x, point.y, point.z] for point in landmark_list.landmark])

    body = results.pose_landmarks
    if not body:
        # Without shoulders there is no reference frame, so nothing can be normalized.
        return np.zeros(144)

    pose_xyz = as_xyz(body)
    shoulder_l = pose_xyz[left_shoulder_idx]
    shoulder_r = pose_xyz[right_shoulder_idx]

    origin = (shoulder_l + shoulder_r) / 2.0
    scale = np.linalg.norm(shoulder_l - shoulder_r)
    if scale < 0.001:
        # Shoulders collapsed onto (almost) one point: fall back to unit scale
        # rather than dividing by ~0.
        scale = 1.0

    def body_relative(points):
        return (points - origin) / scale

    def hand_block(hand):
        # A missing hand stays at exactly zero; normalizing a zero placeholder
        # would turn it into -origin/scale and look like a real detection.
        if not hand:
            return np.zeros((hand_point_count, 3))
        return body_relative(as_xyz(hand))

    blocks = [
        body_relative(pose_xyz)[kept_pose_indices],
        hand_block(results.left_hand_landmarks),
        hand_block(results.right_hand_landmarks),
    ]
    return np.concatenate([block.flatten() for block in blocks])


In [5]:

def process_video(video_path, sequence_length=30):
    """
    Convert one video file into a fixed-length sequence of keypoint vectors.

    Every frame that can be read is converted from BGR to RGB, passed through a
    MediaPipe Holistic model and reduced to one feature vector by
    `extract_and_normalize_keypoints`. The per-frame vectors are then brought
    to exactly `sequence_length` rows:

    * at least `sequence_length` frames: evenly spaced frames are picked
      (first and last frame included);
    * fewer frames: all-zero rows are appended at the end.

    Args:
        video_path: path passed to `cv2.VideoCapture`.
        sequence_length: number of rows in the returned array.

    Returns:
        NumPy array of shape (sequence_length, n_features), where n_features is
        the length of one keypoint vector, or None when no frame could be read
        (missing, unreadable or empty video).
    """
    cap = cv2.VideoCapture(video_path)
    sequence = []

    # try/finally guarantees the capture handle is released even if colour
    # conversion, MediaPipe or keypoint extraction raises midway through a video.
    try:
        # A file that cannot be opened yields no frames, so skip building the
        # Holistic model for it; the result (None) is the same either way.
        if not cap.isOpened():
            return None

        with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                # OpenCV decodes to BGR; the frame is converted to RGB before inference.
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                # Read-only flag as in MediaPipe's own examples — presumably lets
                # process() avoid a defensive copy; TODO confirm.
                image.flags.writeable = False

                results = holistic.process(image)
                sequence.append(extract_and_normalize_keypoints(results))
    finally:
        cap.release()

    # Nothing decoded: signal the caller so it can record the file as corrupted.
    if not sequence:
        return None

    sequence = np.array(sequence)

    if len(sequence) >= sequence_length:
        # Downsample: evenly spaced indices from the first to the last frame.
        resample_indices = np.linspace(0, len(sequence) - 1, sequence_length, dtype=int)
        sequence = sequence[resample_indices]
    else:
        # Too short: pad with all-zero frames at the end.
        padding = np.zeros((sequence_length - len(sequence), sequence.shape[1]))
        sequence = np.concatenate([sequence, padding], axis=0)

    return sequence

In [6]:
# Dataset root: the cells below treat every entry in this directory as one word
# and every file inside that entry as one video of the word.
DATAPATH = "/kaggle/input/asl-20-words-dataset-v1/Full Data"

In [7]:
# Dataset inventory: print the number of videos per word and the overall totals.
# `words` keeps the raw os.listdir result (and its order) because the label ids
# built later are derived from it.
words = os.listdir(DATAPATH)

n = 0  # total number of videos over all words
m = 0  # number of word directories found
for word in words:
    word_dir = os.path.join(DATAPATH, word)
    # Same guard as the batch-loading cell: a stray file next to the word
    # folders would otherwise make os.listdir raise NotADirectoryError.
    if not os.path.isdir(word_dir):
        continue
    videos = os.listdir(word_dir)
    N = len(videos)
    print(f"the word {word} has {N} videos")
    n += N
    m += 1
print("-------------------------------------------")
print(f"The Data Has A total of {m} Words and {n} Videos")

the word mall has 414 videos
the word good has 436 videos
the word mosque has 427 videos
the word finish has 440 videos
the word thinking has 366 videos
the word mother has 406 videos
the word eat has 440 videos
the word sad has 420 videos
the word house has 421 videos
the word love has 435 videos
the word normal has 410 videos
the word me has 430 videos
the word worry has 409 videos
the word thanks has 412 videos
the word baby has 430 videos
the word father has 452 videos
the word hear has 433 videos
the word stop has 426 videos
the word important has 446 videos
the word happy has 445 videos
-------------------------------------------
The Data Has A total of 20 Words and 8498 Videos


# ------------------------------------------------------

In [11]:
# --- Configuration ---
MAX_VIDEOS_PER_WORD = 100
BATCH_NUM = 3  # <--- CHANGE THIS: 1 for first 100, 2 for second 100, etc.
SHUFFLE_SEED = 42  # fixed so every run sees the same per-word file order

# ---------------------

# BATCH_NUM is 1-based. With 0 or a negative value the slice below would start at
# a negative index and silently select nothing, so fail loudly instead.
if BATCH_NUM < 1:
    raise ValueError(f"BATCH_NUM must be >= 1, got {BATCH_NUM}")

# Label ids follow the order of `words` (an os.listdir result). The order is kept
# as-is so that ids stay identical to the ones used for previously saved batches.
label_map = {word: i for i, word in enumerate(words)}

X, y = [], []
corrupted_videos_paths = []

# Calculate start and end indices based on the batch number
start_index = (BATCH_NUM - 1) * MAX_VIDEOS_PER_WORD
end_index = start_index + MAX_VIDEOS_PER_WORD

print(f"--- Loading Batch {BATCH_NUM} ---")
print(f"Reading videos from index {start_index} to {end_index} for each word.")

for word in tqdm(words, desc="Processing words"):
    word_path = os.path.join(DATAPATH, word)

    if not os.path.isdir(word_path):
        continue

    # 1. Sort and shuffle consistently.
    # Sorting removes any dependence on os.listdir order. The shuffle uses a
    # private RandomState seeded per word, which yields exactly the permutation
    # that np.random.seed(42) + np.random.shuffle gave, but without resetting the
    # global NumPy RNG on every iteration. Batch 1 is therefore always the same
    # set of files and batch 2 the next set.
    video_files = sorted(os.listdir(word_path))
    np.random.RandomState(SHUFFLE_SEED).shuffle(video_files)

    # 2. Select this batch's slice of the shuffled list.
    target_files = video_files[start_index:end_index]

    if not target_files:
        print(f"Warning: No videos found for word '{word}' in batch range {start_index}-{end_index}")
        continue

    for video_file in target_files:
        # Hidden files are skipped only after slicing, so they still occupy a slot
        # in the batch; this keeps batch membership identical to earlier runs.
        if video_file.startswith('.'):
            continue

        video_path = os.path.join(word_path, video_file)

        # --- Preprocessing ---
        sequence = process_video(video_path, sequence_length=30)

        # None means no frame could be read; remember the path and move on.
        if sequence is None:
            corrupted_videos_paths.append(video_path)
            continue

        X.append(sequence)
        y.append(label_map[word])

X = np.array(X)
y = np.array(y)

print("\n--- Data Loading Complete ---")
print(f"Batch Number: {BATCH_NUM}")
print("Total Corrupted Videos Skipped:", len(corrupted_videos_paths))
print("X shape:", X.shape)
print("y shape:", y.shape)

--- Loading Batch 3 ---
Reading videos from index 200 to 300 for each word.


Processing words:   0%|          | 0/20 [00:00<?, ?it/s]

W0000 00:00:1764780181.797869   16133 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1764780181.833028   16133 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1764780181.837806   16135 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1764780181.837975   16134 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1764780181.838041   16136 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1764780181.851468   16135 inference_feedback_manager.cc:114] Feedback manager 


--- Data Loading Complete ---
Batch Number: 3
Total Corrupted Videos Skipped: 14
X shape: (1986, 30, 144)
y shape: (1986,)


In [12]:
# Sanity check: the first dimension of X must equal the number of labels in y.
(X.shape, y.shape)

((1986, 30, 144), (1986,))

In [13]:
# Persist this batch. The class names are stored next to the integer labels
# (classes[i] is the word with label id i) because the ids come from directory
# listing order; without the names the y values cannot be decoded or reconciled
# between batch files. Existing readers of 'X' and 'y' are unaffected.
# NOTE(review): the file name is hard-coded — keep it in sync with BATCH_NUM.
class_names = np.array(sorted(label_map, key=label_map.get))
np.savez('third100.npz', X=X, y=y, classes=class_names)