# Sign Language Recognition - Training Notebook

This notebook automatically trains the **CNN + LSTM** model for sign language recognition.

## Quick Start:
1. **Runtime → Change runtime type → Select GPU**
2. **Runtime → Run all**

Or click the "Open in Colab" badge in the GitHub repository!

---

**Note:** Make sure your data is available in the `Data/` directory.


In [None]:
# Clone repository - delete existing and clone fresh
#
# WARNING: the previous checkout is removed wholesale, including anything
# written inside it by an earlier session (e.g. its Data/ and models/ folders).
import os
import shutil
import subprocess
from pathlib import Path

# Go to /content first (the checkout is created directly beneath it)
os.chdir('/content')
print(f"Current directory: {os.getcwd()}")

# Delete existing SignLanguage-Recognition directory if it exists,
# so the clone below never collides with a stale checkout
if os.path.exists('SignLanguage-Recognition'):
    print("🗑️  Deleting existing SignLanguage-Recognition directory...")
    shutil.rmtree('SignLanguage-Recognition')
    print("✅ Deleted existing directory")

# Clone fresh.
# check=True raises CalledProcessError when git exits non-zero, so a failed
# clone stops the notebook here instead of printing a false success message
# and failing later on the chdir below.
print("📥 Cloning repository...")
subprocess.run(
    ['git', 'clone', 'https://github.com/MAya0M/SignLanguage-Recognition.git'],
    check=True,
)
print("✅ Repository cloned")

# Change to the directory; later cells use paths relative to the repo root
os.chdir('/content/SignLanguage-Recognition')
print(f"✅ Now in: {os.getcwd()}")


In [None]:
# Report the TensorFlow build and the GPU devices visible to this runtime.
# An empty list means no GPU is attached to the runtime.
import tensorflow as tf

print("TensorFlow version:", tf.__version__)
visible_gpus = tf.config.list_physical_devices('GPU')
print("GPU Available:", visible_gpus)


## Install Dependencies


In [None]:
# Install required packages (-q keeps pip's output short).
# NOTE(review): no versions are pinned, so every run installs whatever is
# current at that moment; pin them (pkg==X.Y.Z) once a known-good set exists.
# NOTE(review): tensorflow is already imported by the GPU-check cell that runs
# before this one; if this command changes the installed version the runtime
# presumably needs a restart to pick it up - confirm.
%pip install -q tensorflow numpy pandas scikit-learn opencv-python mediapipe tqdm


In [None]:
# Optional: Re-extract keypoints with MINIMAL normalization (only translate)
# This preserves size/rotation differences which help distinguish classes
# ⚠️  This will OVERWRITE all existing keypoint files!
# Uncomment the line below if you want to re-extract keypoints:
# !python scripts/re_extract_with_minimal_normalization.py


In [None]:
## Verify Data


In [None]:
# Verify data exists and check dataset size + class imbalance
import os
import pandas as pd
from pathlib import Path
import numpy as np

# Thresholds used only for the status markers printed below.
MIN_SAMPLES_PER_CLASS = 20   # classes with fewer samples get a warning marker
IMBALANCE_WARN_RATIO = 2.0   # max/min class-count ratio that triggers the warning


def label_imbalance(label_counts):
    """Summarise how uneven a set of per-label sample counts is.

    Args:
        label_counts: pandas Series mapping label -> number of samples.

    Returns:
        (max_count, min_count, ratio) where ratio is max_count / min_count,
        ``inf`` when the smallest count is 0, and (0, 0, 1.0) for an empty
        Series so callers never divide by zero or index an empty index.
    """
    if len(label_counts) == 0:
        return 0, 0, 1.0
    max_count = int(label_counts.max())
    min_count = int(label_counts.min())
    ratio = max_count / min_count if min_count > 0 else float('inf')
    return max_count, min_count, ratio


def compare_label_counts(csv_counts, keypoint_counts):
    """Pair CSV label counts with keypoint-folder counts, ignoring case.

    Args:
        csv_counts: mapping label -> number of rows in the CSV.
        keypoint_counts: mapping folder name -> number of keypoint files.

    Returns:
        List of (display_name, csv_count, keypoint_count) tuples sorted by
        upper-cased label. A count of 0 means the label is absent on that
        side, so labels present only in the CSV are reported too. Names that
        differ only by case are merged and their counts summed; the keypoint
        folder spelling is preferred for display.
    """
    csv_by_key = {}
    kp_by_key = {}
    display_names = {}
    for name, count in keypoint_counts.items():
        key = str(name).upper()
        kp_by_key[key] = kp_by_key.get(key, 0) + int(count)
        display_names.setdefault(key, str(name))
    for label, count in csv_counts.items():
        key = str(label).upper()  # str(): labels may be parsed as numbers
        csv_by_key[key] = csv_by_key.get(key, 0) + int(count)
        display_names.setdefault(key, str(label))
    return [
        (display_names[key], csv_by_key.get(key, 0), kp_by_key.get(key, 0))
        for key in sorted(display_names)
    ]


data_dir = Path('Data')
csv_path = data_dir / 'Labels' / 'dataset.csv'
keypoints_dir = data_dir / 'Keypoints' / 'rawVideos'
# Stays None unless the CSV was loaded and contains usable labels; the
# CSV-vs-keypoints comparison further down is skipped in that case.
csv_label_counts = None

if data_dir.exists():
    print("✅ Data directory found")

    if csv_path.exists():
        print(f"✅ CSV file: {csv_path}")
        # Check dataset size
        df = pd.read_csv(csv_path)
        missing_columns = [col for col in ('label', 'split') if col not in df.columns]
        # groupby drops NaN labels, so this is empty for an empty CSV as well
        # as for one whose label column is entirely blank.
        label_counts = (
            df.groupby('label').size().sort_values(ascending=False)
            if not missing_columns else None
        )

        if missing_columns:
            print(f"❌ CSV is missing required column(s): {', '.join(missing_columns)}")
        elif label_counts.empty:
            print("❌ CSV contains no labelled samples")
        else:
            csv_label_counts = label_counts.to_dict()
            print(f"\n📊 Dataset Statistics:")
            print(f"   Total samples: {len(df)}")
            print(f"   Number of classes: {df['label'].nunique()}")

            print(f"\n   Per label:")
            max_count, min_count, imbalance_ratio = label_imbalance(label_counts)
            is_imbalanced = imbalance_ratio > IMBALANCE_WARN_RATIO
            most_common_label = label_counts.index[0]

            for label, count in label_counts.items():
                # Flag the majority class only when the imbalance is
                # significant; with balanced data every class shares the
                # maximum count and would otherwise all be marked red.
                if is_imbalanced and count == max_count:
                    status = "🔴"
                elif count >= MIN_SAMPLES_PER_CLASS:
                    status = "✅"
                else:
                    status = "⚠️"
                print(f"      {status} {str(label):12s}: {count:3d} samples ({(count/len(df)*100):5.2f}%)")

            print(f"\n   ⚠️  Class Imbalance Analysis:")
            print(f"      Most common: {most_common_label} ({max_count} samples)")
            print(f"      Least common: {label_counts.index[-1]} ({min_count} samples)")
            print(f"      Imbalance ratio: {imbalance_ratio:.2f}x")

            if is_imbalanced:
                print(f"\n      🔴 WARNING: Significant class imbalance detected!")
                print(f"         The model might favor '{most_common_label}' (majority class).")
                print(f"         If the model predicts '{most_common_label}' on everything, this is the likely cause.")
                print(f"         ✅ Solution: Class weights will be used during training.")
                print(f"         💡 Recommendation: Add more samples to minority classes (aim for 30-50 per class)")
            else:
                print(f"      ✅ Classes are relatively balanced")

            print(f"\n   Per split:")
            for split in ['train', 'val', 'test']:
                split_df = df[df['split'] == split]
                print(f"      {split:6s}: {len(split_df):3d} samples")

                # Show per-label distribution in split
                split_label_counts = split_df.groupby('label').size().sort_values(ascending=False)
                for label, count in split_label_counts.items():
                    print(f"         {str(label):12s}: {count:3d}")
    else:
        print(f"❌ CSV file not found: {csv_path}")
        print(f"   Run: !python scripts/create_dataset_csv.py")

    if keypoints_dir.exists():
        print(f"\n✅ Keypoints directory: {keypoints_dir}")
        # Count keypoint files (recursive: every .npy anywhere below the root)
        npy_files = list(keypoints_dir.rglob("*.npy"))
        print(f"   Found {len(npy_files)} keypoint files")

        # Count per label. Only files directly inside each label folder are
        # counted here, so this can be lower than the recursive total above
        # if files sit in deeper sub-folders.
        print(f"\n   Keypoints per label (actual files on disk):")
        keypoint_counts = {}
        for label_dir in sorted(keypoints_dir.iterdir()):
            if label_dir.is_dir():
                npy_count = len(list(label_dir.glob("*.npy")))
                if npy_count > 0:
                    keypoint_counts[label_dir.name] = npy_count
                    print(f"      {label_dir.name:12s}: {npy_count:3d} files")

        # Compare with CSV (both directions)
        if csv_label_counts is not None:
            print(f"\n   Comparison (CSV vs Keypoints):")
            for label_name, csv_count, keypoint_count in compare_label_counts(csv_label_counts, keypoint_counts):
                if csv_count > 0 and keypoint_count > 0:
                    status = "✅" if keypoint_count == csv_count else "⚠️"
                    print(f"      {status} {label_name:12s}: CSV={csv_count:3d}, Keypoints={keypoint_count:3d}")
                    if keypoint_count != csv_count:
                        print(f"         ⚠️  Mismatch! CSV has {csv_count} but {keypoint_count} keypoint files exist")
                elif keypoint_count > 0:
                    print(f"      ⚠️  {label_name:12s}: {keypoint_count:3d} keypoint files, but not in CSV!")
                else:
                    print(f"      ❌ {label_name:12s}: {csv_count:3d} CSV rows, but no keypoint files found!")
    else:
        print(f"❌ Keypoints directory not found: {keypoints_dir}")
        print(f"   Run: !python scripts/extract_keypoints.py")
else:
    print("❌ Data directory not found")
    print("Please upload data first!")


In [None]:
# Train CNN + LSTM model
# Architecture: CNN (spatial patterns) + LSTM (temporal patterns)
#
# NOTE(review): the two "NEW" behaviours below live in scripts/train_model.py,
# which is not visible from this notebook - confirm them against the script.
# ✅ NEW: Class weights are automatically used to handle class imbalance
# ✅ NEW: Smart frame sampling (skips similar start, focuses on gesture)
#
# Optimized parameters (each maps to a flag in the command below):
# - Batch size: 8 (better for small dataset)
# - Epochs: 200 (give model time to learn)
# - CNN filters: 64 (first layer), 128 (second layer)
#   (only 64 and the layer count of 2 are passed here; the second-layer width
#    is presumably derived inside the script - TODO confirm)
# - LSTM units: 128
# - Dropout: 0.3 (prevent overfitting)
# - Learning rate: 0.001
# - Class weights: Automatic (balanced) - handles class imbalance
#   (no flag is passed for this; it relies on the script's default)
#
# Paths are relative to the repository root, so the clone cell must have run
# first. Output is written under models/.

!python scripts/train_model.py \
    --csv Data/Labels/dataset.csv \
    --keypoints-dir Data/Keypoints/rawVideos \
    --output-dir models \
    --batch-size 8 \
    --epochs 200 \
    --cnn-filters 64 \
    --lstm-units 128 \
    --num-cnn-layers 2 \
    --dropout 0.3 \
    --learning-rate 0.001


## Check Training Results

After training, check the results:


In [None]:
# Check training results
import glob
import json
from pathlib import Path


def find_latest_run(models_root='models'):
    """Return the path of the newest ``run_*`` directory, or None.

    "Newest" means last in lexicographic order of the directory name, which
    matches chronological order only if run names sort that way (e.g. a
    zero-padded timestamp suffix). Plain files matching ``run_*`` are ignored
    so a stray archive can never be mistaken for a run.
    """
    pattern = str(Path(models_root) / 'run_*')
    run_dirs = [path for path in sorted(glob.glob(pattern)) if Path(path).is_dir()]
    return run_dirs[-1] if run_dirs else None


def best_metric(history, key='val_accuracy'):
    """Return the maximum recorded value of ``key`` in a history dict.

    Returns None when ``history`` is not a dict, the key is absent, or no
    values were recorded, instead of raising on ``max([])``.
    """
    if not isinstance(history, dict):
        return None
    values = history.get(key) or []
    return max(values) if values else None


def format_metric(value, spec):
    """Format ``value`` with ``spec``, or return 'n/a' when it is missing."""
    return format(value, spec) if value is not None else "n/a"


latest_run = find_latest_run('models')
if latest_run is not None:
    print(f"📊 Latest training run: {Path(latest_run).name}")
    print("=" * 60)

    # Check test results
    test_results_path = Path(latest_run) / "test_results.json"
    if test_results_path.exists():
        with open(test_results_path, 'r') as f:
            results = json.load(f)
        print(f"\n✅ Test Results:")
        # .get(): a partially written or older results file may lack a key
        print(f"   Accuracy: {format_metric(results.get('test_accuracy'), '.2%')}")
        print(f"   Loss: {format_metric(results.get('test_loss'), '.4f')}")
    else:
        print("⚠️  Test results not found - model may still be training")

    # Check training history
    history_path = Path(latest_run) / "training_history.json"
    if history_path.exists():
        with open(history_path, 'r') as f:
            history = json.load(f)
        best_val_acc = best_metric(history, 'val_accuracy')
        if best_val_acc is not None:
            print(f"\n✅ Best Validation Accuracy: {best_val_acc:.2%}")
    else:
        print("⚠️  Training history not found")
else:
    print("❌ No models found - train the model first!")


## Download Model (Optional)

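To download your latest trained model to your computer as a zip file (the cell also contains a commented-out alternative that saves it to Google Drive instead):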


In [None]:
# Download model to your computer (Colab only: uses google.colab.files)
from google.colab import files
import shutil
import glob
from pathlib import Path

# Keep directories only: a stray file matching run_* (for example an archive
# placed in models/) would otherwise sort last and make make_archive fail.
# The last entry in lexicographic order is treated as the latest run.
models_dir = sorted(path for path in glob.glob('models/run_*') if Path(path).is_dir())
if models_dir:
    latest_run = models_dir[-1]  # Latest run
    print(f"📦 Preparing model: {Path(latest_run).name}")

    # Create a zip file in the current directory; make_archive returns the
    # path of the archive it wrote, so reuse that instead of rebuilding it.
    zip_name = f"{Path(latest_run).name}"
    archive_path = shutil.make_archive(zip_name, 'zip', latest_run)

    # Download
    print(f"⬇️ Downloading {zip_name}.zip...")
    files.download(archive_path)
    print("✅ Model downloaded! Extract and add to your GitHub repo.")
else:
    print("❌ No models found - train the model first!")

# Alternative: Save to Google Drive (uncomment to use).
# Requires latest_run from above, i.e. at least one trained run must exist.
# from google.colab import drive
# drive.mount('/content/drive')
# dest = f'/content/drive/MyDrive/{Path(latest_run).name}'
# shutil.copytree(latest_run, dest, dirs_exist_ok=True)
# print(f"✅ Model saved to Google Drive: {Path(latest_run).name}")
