# Create file_to_label.json for ISL-123 Cache

**Problem:** Cache exists but file_to_label.json is missing  
**Solution:** Scan INCLUDE dataset and map each cache file to its class  
**Runtime:** ~30 seconds

---

### What this does:
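1. Scans the INCLUDE dataset folder structure to find the class folder of each source video
2. Matches cache filenames to source videos by file stem (exact match first, then substring match)
3. Creates the `file_to_label.json` mapping (cache filename -> class name)
4. Verifies that all 123 classes are represented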

In [None]:
# ============================================================
# CELL 1: Setup
# ============================================================
import json
import os
from pathlib import Path
from collections import Counter, defaultdict
from tqdm import tqdm

print("‚úÖ Imports ready")

In [None]:
# ============================================================
# CELL 2: Paths
# ============================================================
# Kaggle locations used by every later cell.
INCLUDE_DIR = Path('/kaggle/input/include')  # Source dataset
CACHE_DIR = Path('/kaggle/input/isl-123-cache/isl_cache_123')  # Cache directory
MAPPING_FILE = Path('/kaggle/input/isl-123-cache/label_mapping_123.json')  # Label mapping
OUTPUT_FILE = Path('/kaggle/working/file_to_label.json')  # Output

# Echo the configuration so a wrong dataset mount is easy to spot.
print(f"üìÇ Paths:")
for summary_line in (
    f"   INCLUDE: {INCLUDE_DIR}",
    f"   Cache: {CACHE_DIR}",
    f"   Mapping: {MAPPING_FILE}",
    f"   Output: {OUTPUT_FILE}",
):
    print(summary_line)

# Fail fast on the first missing input, checked in the order listed above.
# OUTPUT_FILE is deliberately absent: it is produced by this notebook.
required_inputs = (
    (INCLUDE_DIR, f"INCLUDE dataset not found at {INCLUDE_DIR}"),
    (CACHE_DIR, f"Cache not found at {CACHE_DIR}"),
    (MAPPING_FILE, f"Mapping not found at {MAPPING_FILE}"),
)
for required_path, failure_message in required_inputs:
    if not required_path.exists():
        raise FileNotFoundError(failure_message)

print("\n‚úÖ All paths verified")

In [None]:
# ============================================================
# CELL 3: Load Label Mapping
# ============================================================
# Load the label mapping JSON and keep its 'label_to_id' table, whose keys are
# later compared against the class names derived from the INCLUDE folders.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's locale default encoding.
with open(MAPPING_FILE, encoding="utf-8") as f:
    mapping = json.load(f)

label_to_id = mapping['label_to_id']
print(f"‚úÖ Loaded {len(label_to_id)} classes from mapping")

# Preview the first ten class names in the mapping's own order.
print(f"\nFirst 10 classes:")
for i, cls in enumerate(list(label_to_id)[:10], start=1):
    print(f"   {i}. {cls}")

In [None]:
# ============================================================
# CELL 4: Scan INCLUDE Dataset & Match to Cache
# ============================================================
rule = "="*60
print(rule)
print("üîç SCANNING INCLUDE DATASET")
print(rule)

# Index the cache directory: file stem -> full file name for every .npy file.
cache_files = {}
for npy_path in CACHE_DIR.glob('*.npy'):
    cache_files[npy_path.stem] = npy_path.name
print(f"\nüì¶ Found {len(cache_files)} cache files")

# Containers filled by the scan and matching steps that follow.
video_to_class = {}  # source video stem -> class name
file_to_label = {}  # cache file name -> class name

print(f"\nüîç Scanning INCLUDE directory structure...")

# Walk the INCLUDE tree and record, for every .mov/.mp4 file, the class named
# by its folder.
# Expected format: INCLUDE/Sign Videos/123 Classes/<number>. <class>/videos/file.mov
# The class folder is taken as the third path component below INCLUDE_DIR.
# NOTE(review): this assumes "INCLUDE" above is INCLUDE_DIR itself - confirm
# against the mounted dataset layout.
stem_classes = defaultdict(set)  # video stem -> every class it was seen under

for root, _dirs, files in os.walk(INCLUDE_DIR):
    # The class depends only on the directory, so derive it once per directory
    # rather than once per file.
    parts = Path(root).relative_to(INCLUDE_DIR).parts
    if len(parts) < 3:
        continue  # too shallow to contain a class folder

    class_folder = parts[2]  # e.g., "87. hot"
    # Drop the "<number>. " prefix when present; otherwise use the folder name.
    _number, separator, label = class_folder.partition('. ')
    class_name = (label if separator else class_folder).strip().lower()

    for file in files:
        if not file.lower().endswith(('.mov', '.mp4')):
            continue
        video_stem = Path(file).stem
        stem_classes[video_stem].add(class_name)
        # Later directories overwrite earlier ones for the same stem.
        video_to_class[video_stem] = class_name

print(f"‚úÖ Found {len(video_to_class)} source videos")

# A stem that occurs under several classes cannot be labelled reliably from
# its name alone; surface it instead of overwriting silently.
conflicting_stems = sorted(
    stem for stem, classes in stem_classes.items() if len(classes) > 1
)
if conflicting_stems:
    print(f"\nWARNING: {len(conflicting_stems)} video names occur under more than one class "
          f"(the last one scanned is used). First 10:")
    for stem in conflicting_stems[:10]:
        print(f"   {stem}: {sorted(stem_classes[stem])}")

# Match cache files to source videos
print(f"\nüîó Matching cache files to classes...")
matched = 0
unmatched = []
ambiguous = []  # cache stems whose closest partial candidates disagree on class

for cache_stem, cache_name in tqdm(cache_files.items(), desc="Matching"):
    # An exact stem match is authoritative.
    if cache_stem in video_to_class:
        file_to_label[cache_name] = video_to_class[cache_stem]
        matched += 1
        continue

    # Partial match (cache might have extra suffixes): collect every video
    # stem that contains, or is contained in, the cache stem.
    candidates = [
        video_stem for video_stem in video_to_class
        if video_stem in cache_stem or cache_stem in video_stem
    ]
    if not candidates:
        unmatched.append(cache_stem)
        continue

    # Prefer the candidate whose length is closest to the cache stem, so that
    # "MVI_12" wins over "MVI_1" for "MVI_12_x". Sorting the ties makes the
    # choice independent of directory scan order.
    gaps = {stem: abs(len(stem) - len(cache_stem)) for stem in candidates}
    smallest_gap = min(gaps.values())
    closest = sorted(stem for stem, gap in gaps.items() if gap == smallest_gap)
    if len({video_to_class[stem] for stem in closest}) > 1:
        ambiguous.append(cache_stem)
    file_to_label[cache_name] = video_to_class[closest[0]]
    matched += 1

print(f"\nüìä Matching Results:")
print(f"   Matched: {matched}/{len(cache_files)}")
print(f"   Unmatched: {len(unmatched)}")

if unmatched:
    print(f"\n‚ö†Ô∏è  Unmatched files (first 10):")
    for u in unmatched[:10]:
        print(f"   {u}")

# These files were still labelled (best guess), but the label should be checked.
if ambiguous:
    print(f"\nWARNING: {len(ambiguous)} partial matches were ambiguous between classes (first 10):")
    for stem in ambiguous[:10]:
        print(f"   {stem} -> {file_to_label[cache_files[stem]]}")

In [None]:
# ============================================================
# CELL 5: Verify Class Distribution
# ============================================================
print("="*60)
print("üìä VERIFYING CLASS DISTRIBUTION")
print("="*60)

# Number of cache files assigned to each class name.
class_counts = Counter(file_to_label.values())

print(f"\nüìà Statistics:")
print(f"   Total samples: {len(file_to_label)}")
print(f"   Total classes: {len(class_counts)}/{len(label_to_id)}")
if class_counts:
    print(f"   Min/class: {min(class_counts.values())}")
    print(f"   Max/class: {max(class_counts.values())}")
    print(f"   Avg/class: {len(file_to_label)/len(class_counts):.1f}")
else:
    # min()/max() raise on an empty sequence and the average would divide by
    # zero; report the situation instead of crashing.
    print("   Min/Max/Avg per class: n/a (no cache file was matched to a class)")

# Compare the classes that received samples with the classes in the mapping.
mapped_classes = set(class_counts)
expected_classes = set(label_to_id)
missing = expected_classes - mapped_classes
unexpected = mapped_classes - expected_classes

if missing:
    print(f"\n‚ö†Ô∏è  Warning: {len(missing)} classes have NO samples")
    # Sorted so the preview is the same on every run.
    print(f"   First 10: {sorted(missing)[:10]}")
else:
    print(f"\n‚úÖ All {len(label_to_id)} classes have samples!")

# Class names derived from folder names that the mapping does not know about
# usually point at a wrong folder depth or a naming mismatch.
if unexpected:
    print(f"\nWARNING: {len(unexpected)} mapped classes are not in the label mapping")
    print(f"   First 10: {sorted(unexpected)[:10]}")

# Show top classes
print(f"\nüîù Top 10 classes by sample count:")
for cls, count in class_counts.most_common(10):
    print(f"   {cls:20s}: {count:3d} samples")

# Show bottom classes
print(f"\nüîª Bottom 10 classes by sample count:")
for cls, count in sorted(class_counts.items(), key=lambda x: x[1])[:10]:
    print(f"   {cls:20s}: {count:3d} samples")

In [None]:
# ============================================================
# CELL 6: Save file_to_label.json
# ============================================================
rule = "="*60
print(rule)
print("üíæ SAVING file_to_label.json")
print(rule)

# Write the cache-file -> class mapping as indented JSON with sorted keys.
with OUTPUT_FILE.open('w') as out_handle:
    json.dump(file_to_label, out_handle, indent=2, sort_keys=True)

size_kb = OUTPUT_FILE.stat().st_size / 1024
entry_count = len(file_to_label)
print(f"\n‚úÖ Saved to: {OUTPUT_FILE}")
print(f"   File size: {size_kb:.1f} KB")
print(f"   Total entries: {entry_count}")

print("\n" + rule)
print("‚úÖ DONE!")
print(rule)

# Closing instructions for using the generated file.
for closing_line in (
    f"\nüìã Next steps:",
    f"   1. Download file_to_label.json from /kaggle/working",
    f"   2. Add it to your isl-123-cache dataset (re-upload)",
    f"   3. OR: Just use this file in the same notebook",
    f"\nüí° Tip: In training notebook, change Cell 3 to:",
    f"   file_to_label_path = Path('/kaggle/working/file_to_label.json')",
):
    print(closing_line)