# WLASL100 Feature Extraction on Google Colab

This notebook extracts MediaPipe features for the 100-sign WLASL dataset (WLASL100).

**Runtime**: Select **GPU** runtime (Runtime → Change runtime type → GPU)

**Estimated time**: 2-3 hours on Colab GPU (vs 30+ hours on laptop)

## Step 1: Setup - Install Dependencies

In [None]:
# Install MediaPipe and dependencies
# (`!` hands the line to the notebook's shell; it is IPython syntax, not plain Python.)
!pip install mediapipe opencv-python tqdm

# Check GPU availability
# torch is imported in this cell only to report whether a CUDA device is visible.
import torch
print(f"GPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Index 0 = first visible CUDA device.
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")

## Step 2: Mount Google Drive

You'll need to upload your WLASL videos to Google Drive first.

**Folder structure on Google Drive:**
```
My Drive/
  └── asl_data/
      ├── videos_100/          # Upload this folder from your laptop
      │   ├── book/
      │   ├── drink/
      │   └── ...
      └── metadata.json        # Upload metadata file
```

In [None]:
# Mount Google Drive
# After mounting, Drive contents are read from /content/drive/MyDrive (see paths below).
from google.colab import drive
drive.mount('/content/drive')

# Verify files are there
import os
# data_dir / video_dir / metadata_path are notebook-level variables that the
# configuration and diagnostic cells further down read.
data_dir = '/content/drive/MyDrive/asl_data'
video_dir = f'{data_dir}/videos_100'
metadata_path = f'{data_dir}/metadata.json'

print(f"Video directory exists: {os.path.exists(video_dir)}")
print(f"Metadata exists: {os.path.exists(metadata_path)}")

if os.path.exists(video_dir):
    # Count subdirectories (glosses): one directory per gloss, files are ignored
    glosses = [d for d in os.listdir(video_dir) if os.path.isdir(f'{video_dir}/{d}')]
    print(f"Found {len(glosses)} gloss directories")

## Step 3: Download MediaPipe Models

In [None]:
# Create models directory
!mkdir -p /content/models/mediapipe

# Download MediaPipe models with verification
import os
import urllib.request

# One entry per model file:
#   name     - filename written under /content/models/mediapipe
#   url      - download location
#   min_size - smallest size (bytes) accepted as a plausible, complete download
# Note: the pose model is downloaded from pose_landmarker_full.task but saved
# locally as pose_landmarker.task.
models = [
    {
        'name': 'hand_landmarker.task',
        'url': 'https://storage.googleapis.com/mediapipe-models/hand_landmarker/hand_landmarker/float16/1/hand_landmarker.task',
        'min_size': 1_000_000  # At least 1MB
    },
    {
        'name': 'face_landmarker.task',
        'url': 'https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/1/face_landmarker.task',
        'min_size': 1_000_000
    },
    {
        'name': 'pose_landmarker.task',
        'url': 'https://storage.googleapis.com/mediapipe-models/pose_landmarker/pose_landmarker_full/float16/1/pose_landmarker_full.task',
        'min_size': 1_000_000
    }
]

print("Downloading MediaPipe models...\n")

# Pass 1: download anything that is missing or too small to be valid.
for model in models:
    filepath = f"/content/models/mediapipe/{model['name']}"
    
    # Check if already exists and valid
    if os.path.exists(filepath):
        size = os.path.getsize(filepath)
        if size >= model['min_size']:
            print(f"✓ {model['name']} already exists ({size:,} bytes)")
            continue
        else:
            # An undersized file is treated as a broken download and replaced.
            print(f"⚠ {model['name']} exists but too small ({size} bytes), re-downloading...")
            os.remove(filepath)
    
    # Download
    print(f"  Downloading {model['name']}...", end=" ")
    try:
        urllib.request.urlretrieve(model['url'], filepath)
        size = os.path.getsize(filepath)
        print(f"✓ ({size:,} bytes)")
    except Exception as e:
        # Report which model failed, then stop the cell by re-raising.
        print(f"✗ FAILED: {e}")
        raise

print("\n" + "="*70)
print("MODEL VERIFICATION")
print("="*70)

# Verify all models are accessible
# Pass 2: every file must exist, meet min_size, and be readable; any failure raises.
for model in models:
    filepath = f"/content/models/mediapipe/{model['name']}"
    
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"Missing: {filepath}")
    
    size = os.path.getsize(filepath)
    if size < model['min_size']:
        raise ValueError(f"File too small: {filepath} ({size} bytes)")
    
    # Check read permissions
    with open(filepath, 'rb') as f:
        f.read(1024)  # Try reading first 1KB
    
    print(f"✓ {model['name']:30s} {size:>10,} bytes  [OK]")

print("="*70)
print("✓ All MediaPipe models ready!")

## Step 4: Upload Code Files

Upload these files from your laptop:
- `mediapipe_extractor_v2.py`
- `features.py`

Or paste them directly in the cells below.

In [None]:
# Option 1: Upload files manually using Colab's file upload
# from google.colab import files
# uploaded = files.upload()  # Select mediapipe_extractor_v2.py and features.py

# Option 2: Copy from Google Drive (if you uploaded them there)
# Both files land directly in /content, where the later cells import them from.
!cp /content/drive/MyDrive/asl_data/code/mediapipe_extractor_v2.py /content/
!cp /content/drive/MyDrive/asl_data/code/features.py /content/

# FIX: Replace relative imports with absolute imports for Colab
# (the files are imported below as top-level modules, not as part of a package)
import os

# Fix features.py relative import
with open('/content/features.py', 'r') as f:
    content = f.read()

# Change: from .mediapipe_extractor_v2 import RawLandmarks
# To:     from mediapipe_extractor_v2 import RawLandmarks
# str.replace is a no-op when the pattern is absent, so re-running this cell is harmless.
content = content.replace('from .mediapipe_extractor_v2 import', 'from mediapipe_extractor_v2 import')

with open('/content/features.py', 'w') as f:
    f.write(content)

print("✓ Code files ready (import fix applied)")

## Step 4.5: Fix MediaPipe Model Paths for Colab

The extractor expects models in a specific location. We'll patch it to use Colab paths.

In [None]:
# Fix MediaPipe model paths in the extractor
# Rewrites the hard-coded, repo-relative model paths in the uploaded extractor
# so they point at the files downloaded to /content/models/mediapipe.
with open('/content/mediapipe_extractor_v2.py', 'r') as f:
    extractor_code = f.read()

# (source line expected in the extractor, Colab replacement)
replacements = [
    # Hand landmarker
    ('model_path = str(Path(__file__).parent.parent.parent / "models" / "mediapipe" / "hand_landmarker.task")',
     'model_path = "/content/models/mediapipe/hand_landmarker.task"'),

    # Face landmarker
    ('model_path = str(Path(__file__).parent.parent.parent / "models" / "mediapipe" / "face_landmarker.task")',
     'model_path = "/content/models/mediapipe/face_landmarker.task"'),

    # Pose landmarker (also fix the filename from pose_landmarker_full.task to pose_landmarker.task)
    ('model_path = str(Path(__file__).parent.parent.parent / "models" / "mediapipe" / "pose_landmarker_full.task")',
     'model_path = "/content/models/mediapipe/pose_landmarker.task"'),
]

# str.replace does nothing when the pattern is absent, so track which
# replacements neither applied now nor were already applied by an earlier run.
not_patched = []
for old, new in replacements:
    if old in extractor_code:
        extractor_code = extractor_code.replace(old, new)
    elif new not in extractor_code:
        not_patched.append(new)

with open('/content/mediapipe_extractor_v2.py', 'w') as f:
    f.write(extractor_code)

if not_patched:
    # Do not claim success: the extractor would still use its original paths.
    print("⚠ Some model paths could not be patched (expected source line not found):")
    for line in not_patched:
        print(f"  {line}")
    print("  Check how model_path is built in mediapipe_extractor_v2.py and update `replacements`.")
else:
    print("✓ MediaPipe model paths fixed for Colab")
    print("  hand_landmarker.task → /content/models/mediapipe/")
    print("  face_landmarker.task → /content/models/mediapipe/")
    print("  pose_landmarker.task → /content/models/mediapipe/")

## Step 4.6: Test MediaPipe Initialization

Quick test to ensure MediaPipe can load all models before starting extraction.

In [None]:
# Test MediaPipe initialization
# Smoke test: import the patched extractor and construct it once, so that
# problems surface here rather than partway through the long extraction run.
print("Testing MediaPipe extractor initialization...\n")

try:
    from mediapipe_extractor_v2 import MediaPipeExtractor

    # Constructing the extractor is expected to load all 3 models
    test_extractor = MediaPipeExtractor(
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5
    )
except Exception as e:
    # Catch broadly on purpose: import errors, missing files and MediaPipe
    # runtime errors should all show the troubleshooting hints. The exception
    # is re-raised so the cell still fails visibly.
    print("\n" + "="*70)
    print("✗ ERROR: MediaPipe initialization failed!")
    print("="*70)
    print(f"Error: {e}")
    print("\nPossible issues:")
    print("1. Model files not downloaded correctly (check Step 3)")
    print("2. Model paths not patched correctly (check Step 4.5)")
    print("3. Model files corrupted")
    print("4. Code files not uploaded or import fix not applied (check Step 4)")
    print("\nRun Step 3 again to re-download models.")
    raise
else:
    print("\n" + "="*70)
    print("✓ SUCCESS: MediaPipe initialized correctly!")
    print("="*70)
    print("All model files are accessible and working.")
    print("Ready to start feature extraction.")

import json
import pickle
import numpy as np
from pathlib import Path
from tqdm.notebook import tqdm
from collections import defaultdict
import shutil
import time

# Import our code
from mediapipe_extractor_v2 import MediaPipeExtractor
from features import FeatureExtractor

# Configuration
# metadata_path and video_dir must already exist in the notebook namespace;
# they are assigned in the Drive-mount cell.
METADATA_PATH = metadata_path
VIDEO_DIR = Path(video_dir)
OUTPUT_DIR = Path('/content/output')  # Save to Colab storage first
OUTPUT_DIR.mkdir(exist_ok=True)

# Create temp directory for local video copies
# (copy_video_to_local writes each video here before it is opened)
TEMP_DIR = Path('/content/temp_videos')
TEMP_DIR.mkdir(exist_ok=True)

# OPTIMIZATION SETTINGS
FRAME_SKIP = 2          # Process every 2nd frame (2x speedup)
MAX_FRAMES = 500        # Cap at 500 frames per video
MIN_SUCCESS_RATE = 0.7  # Skip videos with <70% frame detection

def load_metadata():
    """Load WLASL metadata and keep the glosses with the most video instances.

    Reads the JSON file at ``METADATA_PATH``, which is expected to be a list
    of entries each carrying a ``'gloss'`` name and an ``'instances'`` list.

    Returns:
        dict mapping gloss -> list of instances for at most 100 glosses,
        ordered by descending instance count.
    """
    # JSON is UTF-8 by specification; be explicit instead of relying on the locale.
    with open(METADATA_PATH, 'r', encoding='utf-8') as f:
        metadata = json.load(f)

    gloss_data = {entry['gloss']: entry['instances'] for entry in metadata}

    # Top 100 glosses by video count
    top_glosses = sorted(
        gloss_data.items(),
        key=lambda item: len(item[1]),
        reverse=True,
    )[:100]

    # Report the real number kept: metadata with fewer than 100 glosses
    # would otherwise be announced as "100 glosses".
    print(f"✓ Loaded metadata for {len(top_glosses)} glosses")
    print(f"  Total videos: {sum(len(v) for _, v in top_glosses)}")

    return dict(top_glosses)

def find_video_file(video_id: str, gloss: str, video_dir: Path) -> "Path | None":
    """Locate the video file for ``video_id`` inside ``video_dir / gloss``.

    Lookup order (first hit wins):
    1. ``{video_id}.mp4``
    2. ``{video_id}`` with another common extension
       (.avi, .mov, .mkv, .webm, .flv)
    3. Any regular file whose stem equals ``video_id`` exactly
    4. Any regular file whose stem equals ``video_id`` ignoring case

    Args:
        video_id: Identifier from the metadata; used as the filename stem.
        gloss: Name of the gloss subdirectory.
        video_dir: Root directory containing one subdirectory per gloss.

    Returns:
        Path to the matching file, or None if the gloss directory does not
        exist or nothing matches.
    """
    gloss_dir = video_dir / gloss
    if not gloss_dir.exists():
        return None

    # Strategies 1 + 2: direct existence checks, cheapest first.
    for ext in ('.mp4', '.avi', '.mov', '.mkv', '.webm', '.flv'):
        candidate = gloss_dir / f"{video_id}{ext}"
        if candidate.exists():
            return candidate

    # Strategies 3 + 4 in a single directory listing: an exact stem match
    # returns immediately; the first case-insensitive match is kept as a
    # fallback and only used when no exact match exists.
    wanted = video_id.lower()
    fallback = None
    for candidate in gloss_dir.glob("*"):
        if not candidate.is_file():
            continue
        if candidate.stem == video_id:
            return candidate
        if fallback is None and candidate.stem.lower() == wanted:
            fallback = candidate

    return fallback

def copy_video_to_local(video_path: Path, max_retries: int = 3) -> Path:
    """Copy a video from Google Drive to local Colab storage, with retries.

    FIX for errno=11: reading videos straight from the Google Drive mount
    failed during extraction, so each video is copied into ``TEMP_DIR`` and
    processed from there.

    Args:
        video_path: Source file (typically on the Drive mount).
        max_retries: Maximum number of copy attempts. Values below 1 are
            treated as 1 so the function never returns a path it did not copy.

    Returns:
        Path of the local copy, ``TEMP_DIR / video_path.name``.

    Raises:
        RuntimeError: If every attempt fails; the last OS error is chained
            as ``__cause__``.
    """
    local_path = TEMP_DIR / video_path.name
    attempts = max(1, max_retries)

    for attempt in range(attempts):
        try:
            shutil.copy2(str(video_path), str(local_path))
            return local_path
        except OSError as e:  # includes shutil.Error
            if attempt == attempts - 1:
                raise RuntimeError(
                    f"Failed to copy {video_path} after {attempts} attempts: {e}"
                ) from e
            # Linear backoff: 0.5s, 1.0s, 1.5s, ...
            time.sleep(0.5 * (attempt + 1))

    # Not reachable: the loop always returns or raises on its last iteration.
    return local_path

def extract_features_optimized(mp_extractor, feature_extractor, video_path):
    """Extract features from one video, sampling frames and working on a local copy.

    The video is first copied to local storage via ``copy_video_to_local``;
    the copy is deleted again before returning, whatever the outcome.

    Args:
        mp_extractor: Object with ``extract_frame(frame_rgb, timestamp)``
            returning landmarks, or None when nothing was detected.
        feature_extractor: Object with
            ``extract_features(landmarks, include_temporal=True)`` whose
            result provides ``concatenate()``.
        video_path: Path of the source video.

    Returns:
        ``(features, status)``. On success ``features`` is a float32 array
        built from one feature vector per frame that produced landmarks and
        ``status`` is ``"success"``. Otherwise ``features`` is None and
        ``status`` is one of ``"video_too_long"``, ``"low_quality"``,
        ``"no_detections"``, ``"copy_failed: ..."`` or
        ``"extraction_error: ..."``.
    """
    import cv2

    # FIX: Copy video to local storage first to avoid Google Drive I/O issues
    try:
        local_video_path = copy_video_to_local(video_path)
    except Exception as e:
        return None, f"copy_failed: {e}"

    cap = None
    try:
        cap = cv2.VideoCapture(str(local_video_path))
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Skip very long videos (9000 frames is 5 minutes at 30 fps)
        if total_frames > 9000:
            return None, "video_too_long"

        landmarks_sequence = []
        frame_idx = 0          # index of the frame just read (all frames)
        frames_processed = 0   # frames actually passed to the extractor

        # Stop reading as soon as the per-video cap is reached.
        while cap.isOpened() and frames_processed < MAX_FRAMES:
            success, frame = cap.read()
            if not success:
                break

            # Frame sampling: only every FRAME_SKIP-th frame is processed
            if frame_idx % FRAME_SKIP == 0:
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                # Fall back to a nominal 30 fps when the container reports none.
                timestamp = frame_idx / fps if fps > 0 else frame_idx * (1/30.0)

                landmarks = mp_extractor.extract_frame(frame_rgb, timestamp)
                if landmarks is not None:
                    landmarks_sequence.append(landmarks)

                frames_processed += 1

                # Early exit if success rate is too low (checked from 50 frames on)
                if frames_processed >= 50:
                    success_rate = len(landmarks_sequence) / frames_processed
                    if success_rate < MIN_SUCCESS_RATE:
                        return None, "low_quality"

            frame_idx += 1

        if not landmarks_sequence:
            return None, "no_detections"

        # Extract features: one concatenated vector per detected frame
        feature_sequence = [
            feature_extractor.extract_features(
                landmarks, include_temporal=True
            ).concatenate()
            for landmarks in landmarks_sequence
        ]
        features = np.array(feature_sequence, dtype=np.float32)

        return features, "success"

    except Exception as e:
        # One bad video must not abort the whole run; report it as a status.
        return None, f"extraction_error: {str(e)}"

    finally:
        # Single cleanup point for every exit path above.
        if cap is not None:
            cap.release()
        try:
            if local_video_path.exists():
                local_video_path.unlink()
        except OSError:
            # Best-effort: a leftover temp file is not worth failing the video.
            pass

def extract_split(gloss_data, split):
    """Extract features for every video of one split (train/val/test).

    Args:
        gloss_data: dict mapping gloss -> list of instance dicts; each
            instance provides ``'split'`` and ``'video_id'``.
        split: Split name to process; instances whose ``'split'`` differs
            are ignored.

    Returns:
        ``(cache_data, gloss_to_id)``:
        - ``cache_data`` maps video_id -> dict with keys ``'features'``,
          ``'gloss'``, ``'gloss_id'``, ``'video_id'``, ``'split'`` for every
          successfully processed video.
        - ``gloss_to_id`` maps every gloss in ``gloss_data`` to its ID.
          IDs start at 1 and follow the order of ``gloss_data``, so they are
          identical for every split. (Previously IDs were handed out per
          split in first-success order, which could give the same gloss
          different IDs in different splits.)
    """
    mp_extractor = MediaPipeExtractor(
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5
    )
    feature_extractor = FeatureExtractor()

    cache_data = {}
    # Split-independent label table. IDs start at 1; presumably 0 is reserved
    # by the training code (vocab_size is computed as count + 1) - confirm.
    gloss_to_id = {gloss: idx for idx, gloss in enumerate(gloss_data, start=1)}

    # Collect split instances
    split_instances = []
    for gloss, instances in gloss_data.items():
        for instance in instances:
            if instance['split'] == split:
                split_instances.append((gloss, instance))

    print(f"\n{'='*70}")
    print(f"Extracting {split.upper()} features (OPTIMIZED)")
    print(f"{'='*70}")
    print(f"Videos: {len(split_instances)}")
    print(f"Frame skip: {FRAME_SKIP} (sampling every {FRAME_SKIP}th frame)")
    print(f"Max frames: {MAX_FRAMES}")
    print(f"Min success rate: {MIN_SUCCESS_RATE:.0%}")
    print(f"FIX: Copying videos to local storage to avoid Drive I/O issues")
    print(f"{'='*70}\n")

    # Outcome counters keyed by status string ('success', 'missing_file', ...)
    stats = defaultdict(int)

    for gloss, instance in tqdm(split_instances, desc=f"{split} extraction"):
        video_id = instance['video_id']
        video_path = find_video_file(video_id, gloss, VIDEO_DIR)

        if not video_path or not video_path.exists():
            stats['missing_file'] += 1
            continue

        features, status = extract_features_optimized(mp_extractor, feature_extractor, video_path)

        if features is None:
            stats[status] += 1
            continue

        cache_data[video_id] = {
            'features': features,
            'gloss': gloss,
            'gloss_id': gloss_to_id[gloss],
            'video_id': video_id,
            'split': split,
        }

        stats['success'] += 1

    print(f"\n{'='*70}")
    print(f"{split.upper()} Summary")
    print(f"{'='*70}")
    for key, count in sorted(stats.items()):
        print(f"  {key}: {count}")
    # A split with no instances would otherwise divide by zero.
    if split_instances:
        print(f"  Success rate: {stats['success'] / len(split_instances) * 100:.1f}%")
    else:
        print("  Success rate: n/a (no videos in this split)")
    print(f"{'='*70}\n")

    return cache_data, gloss_to_id

print("✓ Feature extraction functions ready (with Drive I/O fix)")

In [None]:
import json
import pickle
import numpy as np
from pathlib import Path
from tqdm.notebook import tqdm
from collections import defaultdict

# Import our code
from mediapipe_extractor_v2 import MediaPipeExtractor
from features import FeatureExtractor

# Configuration
# NOTE(review): these assignments repeat an earlier configuration block in this
# notebook; this copy does not define TEMP_DIR.
# metadata_path and video_dir must already exist in the notebook namespace;
# they are assigned in the Drive-mount cell.
METADATA_PATH = metadata_path
VIDEO_DIR = Path(video_dir)
OUTPUT_DIR = Path('/content/output')  # Save to Colab storage first
OUTPUT_DIR.mkdir(exist_ok=True)

# OPTIMIZATION SETTINGS
FRAME_SKIP = 2          # Process every 2nd frame (2x speedup)
MAX_FRAMES = 500        # Cap at 500 frames per video
MIN_SUCCESS_RATE = 0.7  # Skip videos with <70% frame detection

def load_metadata():
    """Load WLASL metadata and keep the glosses with the most video instances.

    Reads the JSON file at ``METADATA_PATH``, which is expected to be a list
    of entries each carrying a ``'gloss'`` name and an ``'instances'`` list.

    Returns:
        dict mapping gloss -> list of instances for at most 100 glosses,
        ordered by descending instance count.
    """
    # JSON is UTF-8 by specification; be explicit instead of relying on the locale.
    with open(METADATA_PATH, 'r', encoding='utf-8') as f:
        metadata = json.load(f)

    gloss_data = {entry['gloss']: entry['instances'] for entry in metadata}

    # Top 100 glosses by video count
    top_glosses = sorted(
        gloss_data.items(),
        key=lambda item: len(item[1]),
        reverse=True,
    )[:100]

    # Report the real number kept: metadata with fewer than 100 glosses
    # would otherwise be announced as "100 glosses".
    print(f"✓ Loaded metadata for {len(top_glosses)} glosses")
    print(f"  Total videos: {sum(len(v) for _, v in top_glosses)}")

    return dict(top_glosses)

def find_video_file(video_id: str, gloss: str, video_dir: Path) -> "Path | None":
    """Locate the video file for ``video_id`` inside ``video_dir / gloss``.

    This definition replaces any earlier ``find_video_file`` when the cell is
    run, so it uses the same multi-strategy lookup rather than only
    checking ``{video_id}.mp4``.

    Lookup order (first hit wins):
    1. ``{video_id}.mp4``
    2. ``{video_id}`` with another common extension
       (.avi, .mov, .mkv, .webm, .flv)
    3. Any regular file whose stem equals ``video_id`` exactly
    4. Any regular file whose stem equals ``video_id`` ignoring case

    Args:
        video_id: Identifier from the metadata; used as the filename stem.
        gloss: Name of the gloss subdirectory.
        video_dir: Root directory containing one subdirectory per gloss.

    Returns:
        Path to the matching file, or None if the gloss directory does not
        exist or nothing matches.
    """
    gloss_dir = video_dir / gloss
    if not gloss_dir.exists():
        return None

    # Strategies 1 + 2: direct existence checks, cheapest first.
    for ext in ('.mp4', '.avi', '.mov', '.mkv', '.webm', '.flv'):
        candidate = gloss_dir / f"{video_id}{ext}"
        if candidate.exists():
            return candidate

    # Strategies 3 + 4 in one directory listing: exact stem match returns
    # immediately; the first case-insensitive match is kept as a fallback.
    wanted = video_id.lower()
    fallback = None
    for candidate in gloss_dir.glob("*"):
        if not candidate.is_file():
            continue
        if candidate.stem == video_id:
            return candidate
        if fallback is None and candidate.stem.lower() == wanted:
            fallback = candidate

    return fallback

def extract_features_optimized(mp_extractor, feature_extractor, video_path):
    """Extract features from one video, sampling every FRAME_SKIP-th frame.

    NOTE(review): this definition opens ``video_path`` directly. An earlier
    definition of the same function in this notebook copies the video to
    local storage first; running this cell afterwards replaces that version.
    Confirm which one is intended.

    Args:
        mp_extractor: Object with ``extract_frame(frame_rgb, timestamp)``
            returning landmarks, or None when nothing was detected.
        feature_extractor: Object with
            ``extract_features(landmarks, include_temporal=True)`` whose
            result provides ``concatenate()``.
        video_path: Path of the video to read.

    Returns:
        ``(features, status)``. On success ``features`` is a float32 array
        built from one feature vector per frame that produced landmarks and
        ``status`` is ``"success"``. Otherwise ``features`` is None and
        ``status`` is ``"video_too_long"``, ``"low_quality"``,
        ``"no_detections"`` or ``"extraction_error: ..."``.
    """
    import cv2

    cap = cv2.VideoCapture(str(video_path))
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Skip very long videos (9000 frames is 5 minutes at 30 fps)
        if total_frames > 9000:
            return None, "video_too_long"

        landmarks_sequence = []
        frame_idx = 0          # index of the frame just read (all frames)
        frames_processed = 0   # frames actually passed to the extractor

        # Stop reading as soon as the per-video cap is reached.
        while cap.isOpened() and frames_processed < MAX_FRAMES:
            success, frame = cap.read()
            if not success:
                break

            # Frame sampling: only every FRAME_SKIP-th frame is processed
            if frame_idx % FRAME_SKIP == 0:
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                # Fall back to a nominal 30 fps when the container reports none.
                timestamp = frame_idx / fps if fps > 0 else frame_idx * (1/30.0)

                landmarks = mp_extractor.extract_frame(frame_rgb, timestamp)
                if landmarks is not None:
                    landmarks_sequence.append(landmarks)

                frames_processed += 1

                # Early exit if success rate is too low (checked from 50 frames on)
                if frames_processed >= 50:
                    success_rate = len(landmarks_sequence) / frames_processed
                    if success_rate < MIN_SUCCESS_RATE:
                        return None, "low_quality"

            frame_idx += 1

        if not landmarks_sequence:
            return None, "no_detections"

        # Extract features: one concatenated vector per detected frame
        feature_sequence = [
            feature_extractor.extract_features(
                landmarks, include_temporal=True
            ).concatenate()
            for landmarks in landmarks_sequence
        ]
        features = np.array(feature_sequence, dtype=np.float32)

        return features, "success"

    except Exception as e:
        # One bad video must not abort the whole run; the caller counts this
        # status like any other failure.
        return None, f"extraction_error: {str(e)}"

    finally:
        # Release the capture on every exit path, including errors.
        cap.release()

def extract_split(gloss_data, split):
    """Extract features for every video of one split (train/val/test).

    Args:
        gloss_data: dict mapping gloss -> list of instance dicts; each
            instance provides ``'split'`` and ``'video_id'``.
        split: Split name to process; instances whose ``'split'`` differs
            are ignored.

    Returns:
        ``(cache_data, gloss_to_id)``:
        - ``cache_data`` maps video_id -> dict with keys ``'features'``,
          ``'gloss'``, ``'gloss_id'``, ``'video_id'``, ``'split'`` for every
          successfully processed video.
        - ``gloss_to_id`` maps every gloss in ``gloss_data`` to its ID.
          IDs start at 1 and follow the order of ``gloss_data``, so they are
          identical for every split. (Previously IDs were handed out per
          split in first-success order, which could give the same gloss
          different IDs in different splits.)
    """
    mp_extractor = MediaPipeExtractor(
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5
    )
    feature_extractor = FeatureExtractor()

    cache_data = {}
    # Split-independent label table. IDs start at 1; presumably 0 is reserved
    # by the training code (vocab_size is computed as count + 1) - confirm.
    gloss_to_id = {gloss: idx for idx, gloss in enumerate(gloss_data, start=1)}

    # Collect split instances
    split_instances = []
    for gloss, instances in gloss_data.items():
        for instance in instances:
            if instance['split'] == split:
                split_instances.append((gloss, instance))

    print(f"\n{'='*70}")
    print(f"Extracting {split.upper()} features (OPTIMIZED)")
    print(f"{'='*70}")
    print(f"Videos: {len(split_instances)}")
    print(f"Frame skip: {FRAME_SKIP} (sampling every {FRAME_SKIP}th frame)")
    print(f"Max frames: {MAX_FRAMES}")
    print(f"Min success rate: {MIN_SUCCESS_RATE:.0%}")
    print(f"{'='*70}\n")

    # Outcome counters keyed by status string ('success', 'missing_file', ...)
    stats = defaultdict(int)

    for gloss, instance in tqdm(split_instances, desc=f"{split} extraction"):
        video_id = instance['video_id']
        video_path = find_video_file(video_id, gloss, VIDEO_DIR)

        if not video_path or not video_path.exists():
            stats['missing_file'] += 1
            continue

        features, status = extract_features_optimized(mp_extractor, feature_extractor, video_path)

        if features is None:
            stats[status] += 1
            continue

        cache_data[video_id] = {
            'features': features,
            'gloss': gloss,
            'gloss_id': gloss_to_id[gloss],
            'video_id': video_id,
            'split': split,
        }

        stats['success'] += 1

    print(f"\n{'='*70}")
    print(f"{split.upper()} Summary")
    print(f"{'='*70}")
    for key, count in sorted(stats.items()):
        print(f"  {key}: {count}")
    # A split with no instances would otherwise divide by zero.
    if split_instances:
        print(f"  Success rate: {stats['success'] / len(split_instances) * 100:.1f}%")
    else:
        print("  Success rate: n/a (no videos in this split)")
    print(f"{'='*70}\n")

    return cache_data, gloss_to_id

print("✓ Feature extraction functions ready")

## Step 6: Run Extraction

The extraction cell itself comes after the Step 5.5 diagnostic below; run the diagnostic first.

## Step 5.5: Diagnostic - Check Video File Paths

Before running extraction, let's verify that video files can be found. This diagnostic will help identify any path or naming issues.

In [None]:
# Diagnostic: Check video file paths and naming
# Read-only checks: nothing on Drive is modified. Every problem is reported
# with a message instead of an exception, so the cell runs to the end even
# when Drive is not mounted or files are missing.
print("="*70)
print("DIAGNOSTIC: Checking video file paths")
print("="*70)

from pathlib import Path
import json
import os

# Check if Google Drive is mounted
drive_base = Path('/content/drive/MyDrive')
print(f"\nGoogle Drive base: {drive_base}")
print(f"Google Drive mounted: {drive_base.exists()}\n")

if not drive_base.exists():
    print("ERROR: Google Drive is not mounted!")
    print("Please run Step 2 (Mount Google Drive) first.\n")
else:
    # Check what's in MyDrive
    print("Contents of /content/drive/MyDrive:")
    try:
        items = list(drive_base.iterdir())
        for item in sorted(items)[:20]:  # Show first 20 items
            item_type = "DIR" if item.is_dir() else "FILE"
            print(f"  [{item_type}] {item.name}")
        if len(items) > 20:
            print(f"  ... and {len(items) - 20} more items")
    except OSError as e:
        print(f"  Error listing directory: {e}")
    print()

# Check expected paths
# Use the variables from Step 2 (Mount Google Drive) when that cell has been
# run; otherwise fall back to the default locations.
video_dir_diag = Path(video_dir) if 'video_dir' in globals() else Path('/content/drive/MyDrive/asl_data/videos_100')
metadata_path_diag = metadata_path if 'metadata_path' in globals() else '/content/drive/MyDrive/asl_data/metadata.json'

print(f"Expected video directory: {video_dir_diag}")
print(f"Expected metadata path: {metadata_path_diag}\n")

# Check if asl_data directory exists
asl_data_dir = video_dir_diag.parent
print(f"Checking parent directory: {asl_data_dir}")
if asl_data_dir.exists():
    print(f"✓ asl_data directory exists")
    print(f"\nContents of {asl_data_dir}:")
    try:
        items = list(asl_data_dir.iterdir())
        for item in sorted(items):
            item_type = "DIR" if item.is_dir() else "FILE"
            size = ""
            if item.is_file():
                try:
                    size = f" ({os.path.getsize(item):,} bytes)"
                except OSError:
                    # Size is informational only; list the file without it.
                    pass
            print(f"  [{item_type}] {item.name}{size}")
    except OSError as e:
        print(f"  Error: {e}")
else:
    print(f"✗ asl_data directory does not exist")
    print(f"\nPlease check:")
    print(f"  1. Did you upload the videos_100 folder to Google Drive?")
    print(f"  2. Is it in the correct location: MyDrive/asl_data/videos_100/")
    print(f"  3. Check the exact folder name (case-sensitive)")
    print()

# Load metadata to check video_id format
# A missing or unreadable metadata file is reported, not raised: it is one of
# the conditions this diagnostic exists to detect.
gloss_data_diag = {}
if not os.path.exists(metadata_path_diag):
    print(f"✗ Metadata file not found: {metadata_path_diag}")
else:
    try:
        with open(metadata_path_diag, 'r') as f:
            metadata = json.load(f)
    except (OSError, ValueError) as e:  # ValueError covers invalid JSON
        print(f"✗ Could not read metadata: {e}")
    else:
        for entry in metadata:
            gloss_data_diag[entry['gloss']] = entry['instances']

        # Top 100 glosses by video count
        sorted_glosses = sorted(gloss_data_diag.items(),
                               key=lambda x: len(x[1]),
                               reverse=True)[:100]
        gloss_data_diag = dict(sorted_glosses)

# Check first few glosses and their video files
print(f"\nVideo directory: {video_dir_diag}")
print(f"Video directory exists: {video_dir_diag.exists()}\n")

if video_dir_diag.exists():
    # List first few gloss directories
    gloss_dirs = [d for d in sorted(video_dir_diag.iterdir()) if d.is_dir()][:5]
    print(f"Sample gloss directories found: {[d.name for d in gloss_dirs]}\n")

    # Check a few examples from metadata (first 3 glosses)
    for gloss, instances in list(gloss_data_diag.items())[:3]:
        print(f"Gloss: {gloss}")
        print(f"  Instances: {len(instances)}")

        # Check if gloss directory exists
        gloss_dir = video_dir_diag / gloss
        print(f"  Directory exists: {gloss_dir.exists()}")

        if gloss_dir.exists():
            # List files in directory
            files = list(gloss_dir.glob("*"))
            print(f"  Files in directory: {len(files)}")
            if files:
                print(f"  Sample files: {[f.name for f in files[:3]]}")

            # Check first instance
            if instances:
                first_instance = instances[0]
                video_id = first_instance['video_id']
                print(f"  First video_id from metadata: {video_id}")
                expected_path = gloss_dir / f"{video_id}.mp4"
                print(f"  Expected path: {expected_path}")
                print(f"  File exists: {expected_path.exists()}")

                # Try to find any .mp4 file
                mp4_files = list(gloss_dir.glob("*.mp4"))
                if mp4_files:
                    print(f"  Found .mp4 files: {[f.name for f in mp4_files[:3]]}")
                    # Check if video_id matches any file (without extension)
                    matching = [f for f in mp4_files if f.stem == video_id]
                    if matching:
                        print(f"  ✓ Found matching file: {matching[0].name}")
                    else:
                        print(f"  ✗ No file matches video_id '{video_id}'")
                        print(f"    File stems: {[f.stem for f in mp4_files[:3]]}")
        print()
else:
    print(f"ERROR: Video directory does not exist: {video_dir_diag}")
    print(f"Please check Step 2 (Mount Google Drive) and verify the path is correct.")

print("="*70)

In [None]:
# Load metadata
gloss_data = load_metadata()

# One gloss -> ID table shared by all splits. IDs start at 1 and follow the
# order of gloss_data. Merging the per-split tables returned by extract_split
# is not safe when those tables are built independently per split: the same
# gloss can then carry different IDs in different splits, and the merged
# vocabulary would disagree with the labels stored in the pickles.
all_gloss_to_id = {gloss: idx for idx, gloss in enumerate(gloss_data, start=1)}

# Extract all splits
for split in ['train', 'val', 'test']:
    cache_data, _split_gloss_to_id = extract_split(gloss_data, split)

    # Relabel with the shared table so every pickle agrees with vocabulary.json.
    for entry in cache_data.values():
        entry['gloss_id'] = all_gloss_to_id[entry['gloss']]

    # Save to pickle
    output_file = OUTPUT_DIR / f"features_{split}_wlasl100.pkl"
    with open(output_file, 'wb') as f:
        pickle.dump(cache_data, f)

    print(f"✓ Saved {len(cache_data)} features to {output_file}")

# Save vocabulary
# The vocabulary lists every selected gloss, including any for which no video
# could be processed. json.dump writes the integer keys of id_to_gloss as
# strings, so readers must convert them back.
vocab_file = OUTPUT_DIR / "vocabulary.json"
vocab_data = {
    'gloss_to_id': all_gloss_to_id,
    'id_to_gloss': {v: k for k, v in all_gloss_to_id.items()},
    'vocab_size': len(all_gloss_to_id) + 1,  # +1 for the unused ID 0
    'num_glosses': len(all_gloss_to_id),
}

with open(vocab_file, 'w') as f:
    json.dump(vocab_data, f, indent=2)

print(f"\n{'='*70}")
print("EXTRACTION COMPLETE!")
print(f"{'='*70}")
print(f"Files created in {OUTPUT_DIR}:")
print(f"  - features_train_wlasl100.pkl")
print(f"  - features_val_wlasl100.pkl")
print(f"  - features_test_wlasl100.pkl")
print(f"  - vocabulary.json")
print(f"{'='*70}")

## Step 7: Download Results

Copy results back to Google Drive so you can download them later.

In [None]:
# Copy to Google Drive
import shutil

# Destination folder on Drive; created by the mkdir below if missing.
drive_output = '/content/drive/MyDrive/asl_data/extracted_features'
# `{drive_output}` is expanded by IPython before the shell command runs.
!mkdir -p {drive_output}

# Copy the feature pickles and the vocabulary JSON written to /content/output.
!cp /content/output/*.pkl {drive_output}/
!cp /content/output/*.json {drive_output}/

print(f"✓ Results saved to Google Drive: {drive_output}")
print("\nYou can now download these files to your laptop!")
print("\nYou can now download these files to your laptop!")