In [1]:
import os
import cv2
import numpy as np
from glob import glob
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv3D, MaxPooling3D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

from sklearn.model_selection import train_test_split


In [4]:
# ============================================
# Cell 2: Import Libraries
# ============================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import mediapipe as mp
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model
import json
import warnings
warnings.filterwarnings('ignore')

# Import sklearn components with fallback
try:
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
    print("✓ scikit-learn imported successfully")
except ImportError as e:
    print(f"⚠ scikit-learn import issue: {e}")
    print("  Installing latest scikit-learn...")
    import subprocess
    import sys
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-U', 'scikit-learn'])
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
    print("✓ scikit-learn re-imported successfully")

print(f"TensorFlow Version: {tf.__version__}")
print(f"MediaPipe Version: {mp.__version__}")
print(f"OpenCV Version: {cv2.__version__}")

# GPU Check
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print(f"✓ GPU Available: {len(gpus)} device(s)")
    for gpu in gpus:
        print(f"  - {gpu}")
else:
    print("⚠ No GPU detected - will use CPU (slower)")

print("\nAll imports successful!")

ModuleNotFoundError: No module named 'seaborn'

In [None]:
# ============================================
# Cell 3: Configuration
# ============================================

CONFIG = {
    # Data
    'NUM_CLASSES': 28,  # A-Z (26) + space + nothing
    'SEQUENCE_LENGTH': 30,  # frames per gesture
    'NUM_KEYPOINTS': 21,  # MediaPipe hand keypoints
    'KEYPOINT_DIMS': 2,  # x, y coordinates (no z for simplicity)
    
    # Model
    'LSTM_UNITS_1': 128,
    'LSTM_UNITS_2': 64,
    'DENSE_UNITS': 32,
    'DROPOUT': 0.3,
    'BATCH_SIZE': 32,
    'EPOCHS': 50,
    
    # Training
    'LEARNING_RATE': 0.001,
    'VALIDATION_SPLIT': 0.2,
    'TEST_SPLIT': 0.1,
    
    # Paths
    'OUTPUT_DIR': 'output',
    'MODEL_PATH': 'output/asl_lstm_model.h5',
    'SAVEDMODEL_PATH': 'models/asl_lstm_savedmodel',
    'TFJS_PATH': 'output/tfjs_asl_lstm',
}

# Create output directory
os.makedirs(CONFIG['OUTPUT_DIR'], exist_ok=True)
os.makedirs('models', exist_ok=True)

# Class labels
CLASSES = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ') + ['space', 'nothing']
CLASS_TO_IDX = {c: i for i, c in enumerate(CLASSES)}
IDX_TO_CLASS = {i: c for i, c in enumerate(CLASSES)}

print("\nConfiguration:")
for key, value in CONFIG.items():
    if not key.endswith('_PATH'):
        print(f"  {key:20s}: {value}")

print(f"\nClasses ({len(CLASSES)}): {', '.join(CLASSES)}")

In [7]:

# Cell 4 (New): Keypoint Extraction Setup
import cv2
import mediapipe as mp
import numpy as np
import csv
from pathlib import Path
from tqdm import tqdm # For progress bar

# --- Configuration for Keypoint Extraction ---
# Update DATA_DIR to point to the correct, *full* ASL Alphabet directory (A/B/.../Z/space/)
DATA_DIR_IMAGES = Path("../datasets/ASL Dataset/asl_alphabet_train") 
OUTPUT_CSV_PATH = Path("../datasets/ASL_Keypoints.csv")
MAX_NUM_HANDS = 1 # We are classifying single letters

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=True, 
    max_num_hands=MAX_NUM_HANDS, 
    min_detection_confidence=0.5
)

def extract_and_save_keypoints(image_dir, output_csv):
    print(f"Starting keypoint extraction from: {image_dir}")
    all_landmarks = []
    
    # Header for CSV: 'class', 'x0', 'y0', 'z0', 'x1', 'y1', 'z1', ..., 'x20', 'y20', 'z20'
    header = ['class']
    for i in range(21): # 21 landmarks per hand
        header.extend([f'x{i}', f'y{i}', f'z{i}'])

    with open(output_csv, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)

        # Iterate through subdirectories (classes)
        class_dirs = [d for d in image_dir.iterdir() if d.is_dir()]
        for class_dir in tqdm(class_dirs, desc="Processing Classes"):
            class_name = class_dir.name
            
            # Iterate through images in class directory
            for image_path in class_dir.glob('*.jpg'):
                try:
                    image = cv2.imread(str(image_path))
                    if image is None: continue
                    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                    
                    results = hands.process(image)
                    
                    if results.multi_hand_landmarks:
                        # Only take the first detected hand for simplicity
                        landmarks = results.multi_hand_landmarks[0].landmark
                        
                        # Flatten the normalized coordinates (x, y, z) into a single row vector
                        row = [class_name]
                        for landmark in landmarks:
                            # Normalize coordinates by hand location (e.g., relative to wrist)
                            # Simple normalization: raw (x, y, z) here. Better: relative to wrist (lm[0])
                            row.extend([landmark.x, landmark.y, landmark.z])
                        
                        writer.writerow(row)
                except Exception as e:
                    # print(f"Error processing {image_path}: {e}")
                    continue
        
    print(f"\n✓ Keypoint data saved to: {output_csv}")
    print("  Run this cell only once to generate the CSV dataset!")

# UNCOMMENT THE LINE BELOW TO RUN KEYPOINT EXTRACTION!
# extract_and_save_keypoints(DATA_DIR_IMAGES, OUTPUT_CSV_PATH)

In [8]:
# ============================================
# Cell 5: Data Generation Functions
# ============================================

def extract_gesture_sequence(video_path, gesture_class, sequence_length=30):
    """
    Extract hand keypoint sequence from a video file.
    
    Args:
        video_path: path to video file
        gesture_class: class label (e.g., 'A', 'B', 'space')
        sequence_length: number of frames to extract
    
    Returns:
        sequence: array of shape (sequence_length, 21, 2) or None
        gesture_class: class label
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return None, gesture_class
    
    frames_keypoints = []
    frame_count = 0
    
    while len(frames_keypoints) < sequence_length:
        ret, frame = cap.read()
        if not ret:
            break
        
        # Extract hand keypoints
        keypoints = extractor.extract(frame)
        
        if keypoints is not None:
            frames_keypoints.append(keypoints)
        
        frame_count += 1
    
    cap.release()
    
    # Pad or trim to sequence_length
    if len(frames_keypoints) == 0:
        return None, gesture_class
    
    if len(frames_keypoints) < sequence_length:
        # Pad with last frame
        last_frame = frames_keypoints[-1]
        while len(frames_keypoints) < sequence_length:
            frames_keypoints.append(last_frame)
    else:
        # Subsample if too long
        indices = np.linspace(0, len(frames_keypoints)-1, sequence_length, dtype=int)
        frames_keypoints = [frames_keypoints[i] for i in indices]
    
    sequence = np.array(frames_keypoints)  # shape: (sequence_length, 21, 2)
    return sequence, gesture_class

def generate_synthetic_data(num_samples_per_class=50, sequence_length=30):
    """
    Generate synthetic hand gesture data for testing and demonstration.
    
    In production, replace this with actual gesture videos from:
    - ASL Alphabet Dataset
    - ASL-LEX Dataset
    - Custom recordings
    
    Returns:
        X: sequences of shape (total_samples, sequence_length, 21, 2)
        y: class labels
    """
    X = []
    y = []
    
    print(f"Generating {num_samples_per_class} synthetic samples per class...")
    
    for class_idx, class_name in enumerate(CLASSES):
        for sample_idx in range(num_samples_per_class):
            # Generate random walk for hand keypoints
            sequence = np.zeros((sequence_length, 21, 2))
            
            # Create realistic motion patterns
            base_x = np.random.uniform(0.3, 0.7)
            base_y = np.random.uniform(0.3, 0.7)
            
            for frame_idx in range(sequence_length):
                # Add motion
                t = frame_idx / sequence_length
                
                # Different motion patterns per class (for variety)
                motion_x = np.sin(class_idx * t * np.pi) * 0.1
                motion_y = np.cos(class_idx * t * np.pi * 2) * 0.1
                
                # Random noise for each keypoint
                for kp_idx in range(21):
                    noise = np.random.normal(0, 0.02, 2)
                    x = base_x + motion_x + noise[0]
                    y = base_y + motion_y + noise[1]
                    
                    # Clip to [0, 1]
                    sequence[frame_idx, kp_idx] = [
                        np.clip(x, 0, 1),
                        np.clip(y, 0, 1)
                    ]
            
            X.append(sequence)
            y.append(CLASS_TO_IDX[class_name])
        
        if (class_idx + 1) % 7 == 0:
            print(f"  Generated {(class_idx + 1) * num_samples_per_class} samples...")
    
    return np.array(X), np.array(y)

print("\n✓ Data generation functions ready")
print(f"  - Can load video files and extract keypoints")
print(f"  - Can generate synthetic data for testing")


✓ Data generation functions ready
  - Can load video files and extract keypoints
  - Can generate synthetic data for testing


In [13]:
# ============================================
# Cell 6: Generate or Load Data
# ============================================

print("\nPreparing dataset...")
print("="*60)

# For production, replace with actual videos
# X, y = load_real_videos_from_dataset()

# For demonstration, use synthetic data
NUM_SAMPLES_PER_CLASS = 20  # Increase to 100+ for better model

X, y = generate_synthetic_data(
    num_samples_per_class=NUM_SAMPLES_PER_CLASS,
    sequence_length=CONFIG['SEQUENCE_LENGTH']
)

print(f"\nDataset Shape:")
print(f"  X: {X.shape} (samples, frames, keypoints, coords)")
print(f"  y: {y.shape} (class indices)")
print(f"  Total samples: {len(X)}")
print(f"  Total classes: {len(np.unique(y))}")
print(f"  Samples per class: {NUM_SAMPLES_PER_CLASS}")

# Normalize keypoint coordinates to [-1, 1]
print(f"\nNormalizing data...")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X.reshape(len(X), -1))
X_scaled = X_scaled.reshape(X.shape)

print(f"✓ Data normalized")
print(f"  - Mean: {X_scaled.mean():.4f}")
print(f"  - Std: {X_scaled.std():.4f}")


Preparing dataset...
Generating 20 synthetic samples per class...


AttributeError: 'numpy.float64' object has no attribute 'append'

In [14]:
# ============================================
# Cell 7: Split Data
# ============================================

# First split: train + val vs test
X_temp, X_test, y_temp, y_test = train_test_split(
    X_scaled, y,
    test_size=CONFIG['TEST_SPLIT'],
    random_state=42,
    stratify=y
)

# Second split: train vs validation
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=CONFIG['VALIDATION_SPLIT'],
    random_state=42,
    stratify=y_temp
)

print("\nData Split:")
print(f"  Training   : {X_train.shape[0]} samples")
print(f"  Validation : {X_val.shape[0]} samples")
print(f"  Test       : {X_test.shape[0]} samples")
print(f"  Total      : {len(X)} samples")

# Class distribution
print(f"\nClass Distribution (Training):")
unique, counts = np.unique(y_train, return_counts=True)
for class_idx, count in zip(unique[:5], counts[:5]):
    print(f"  {CLASSES[class_idx]:10s}: {count:3d} samples")
print(f"  ... ({len(unique)} total classes)")

SyntaxError: unterminated string literal (detected at line 45) (2035478964.py, line 45)

In [None]:
# ============================================
# Cell 8: Build LSTM Model
# ============================================

print("\nBuilding LSTM model for ASL recognition...")
print("="*60)

def build_lstm_model(config):
    """
    Build LSTM model for hand gesture recognition.
    
    Input shape: (batch_size, sequence_length, num_keypoints, keypoint_dims)
    Output shape: (batch_size, num_classes)
    """
    input_shape = (
        config['SEQUENCE_LENGTH'],
        config['NUM_KEYPOINTS'],
        config['KEYPOINT_DIMS']
    )
    
    # Reshape input to 3D for LSTM
    # (sequence_length, num_keypoints*keypoint_dims)
    inputs = keras.Input(shape=input_shape, name='hand_sequence')
    
    # Flatten keypoints for LSTM
    x = layers.Reshape((
        config['SEQUENCE_LENGTH'],
        config['NUM_KEYPOINTS'] * config['KEYPOINT_DIMS']
    ))(inputs)
    
    # LSTM layers
    x = layers.LSTM(
        config['LSTM_UNITS_1'],
        return_sequences=True,
        name='lstm_1'
    )(x)
    x = layers.Dropout(config['DROPOUT'])(x)
    
    x = layers.LSTM(
        config['LSTM_UNITS_2'],
        return_sequences=False,
        name='lstm_2'
    )(x)
    x = layers.Dropout(config['DROPOUT'])(x)
    
    # Dense layers
    x = layers.Dense(config['DENSE_UNITS'], activation='relu', name='dense')(x)
    x = layers.Dropout(config['DROPOUT'])(x)
    
    # Output
    outputs = layers.Dense(config['NUM_CLASSES'], activation='softmax', name='output')(x)
    
    model = Model(inputs=inputs, outputs=outputs, name='ASL_LSTM')
    
    return model

# Build model
model = build_lstm_model(CONFIG)

# Compile
optimizer = keras.optimizers.Adam(learning_rate=CONFIG['LEARNING_RATE'])
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Summary
model.summary()

print(f"\n✓ Model built successfully")
print(f"  - Input shape: {model.input_shape}")
print(f"  - Output shape: {model.output_shape}")
print(f"  - Total parameters: {model.count_params():,}")

In [None]:
# ============================================
# Cell 9: Train Model
# ============================================

print("\nTraining model...")
print("="*60)

# Callbacks
callbacks = [
    keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
        verbose=1
    ),
    keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=3,
        min_lr=1e-7,
        verbose=1
    )
]

# Train
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=CONFIG['BATCH_SIZE'],
    epochs=CONFIG['EPOCHS'],
    callbacks=callbacks,
    verbose=1
)

print("\n✓ Training complete")

In [None]:
# ============================================
# Cell 10: Evaluate Model
# ============================================

print("\nEvaluating model...")
print("="*60)

# Predictions
y_pred_prob = model.predict(X_test, verbose=0)
y_pred = np.argmax(y_pred_prob, axis=1)

# Metrics
test_acc = accuracy_score(y_test, y_pred)
print(f"\nTest Accuracy: {test_acc:.4f} ({test_acc*100:.2f}%)")

# Classification report
print(f"\nClassification Report:")
print(classification_report(
    y_test, y_pred,
    target_names=[CLASSES[i] for i in range(CONFIG['NUM_CLASSES'])],
    digits=4
))

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print(f"\nConfusion Matrix Shape: {conf_matrix.shape}")
print(f"Correct predictions (diagonal): {np.trace(conf_matrix)}")

In [None]:
# ============================================
# Cell 11: Visualize Results
# ============================================

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Training history
axes[0, 0].plot(history.history['loss'], label='Training Loss')
axes[0, 0].plot(history.history['val_loss'], label='Validation Loss')
axes[0, 0].set_title('Model Loss')
axes[0, 0].set_xlabel('Epoch')
axes[0, 0].set_ylabel('Loss')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

axes[0, 1].plot(history.history['accuracy'], label='Training Accuracy')
axes[0, 1].plot(history.history['val_accuracy'], label='Validation Accuracy')
axes[0, 1].set_title('Model Accuracy')
axes[0, 1].set_xlabel('Epoch')
axes[0, 1].set_ylabel('Accuracy')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Confusion matrix
sns.heatmap(
    conf_matrix[:10, :10],  # First 10 classes for visibility
    annot=False,
    fmt='d',
    cmap='Blues',
    ax=axes[1, 0]
)
axes[1, 0].set_title('Confusion Matrix (First 10 Classes)')
axes[1, 0].set_xlabel('Predicted')
axes[1, 0].set_ylabel('Actual')

# Class-wise accuracy
class_accuracies = np.diag(conf_matrix) / conf_matrix.sum(axis=1)
axes[1, 1].barh(CLASSES, class_accuracies)
axes[1, 1].set_title('Per-Class Accuracy')
axes[1, 1].set_xlabel('Accuracy')
axes[1, 1].set_xlim([0, 1])

plt.tight_layout()
plt.savefig(f"{CONFIG['OUTPUT_DIR']}/training_results.png", dpi=100, bbox_inches='tight')
print("\n✓ Saved training_results.png")
plt.show()

In [None]:
# ============================================
# Cell 12: Save Model
# ============================================

print("\nSaving model...")
print("="*60)

# Save as HDF5
model.save(CONFIG['MODEL_PATH'])
print(f"✓ Saved: {CONFIG['MODEL_PATH']}")

# Save as SavedModel
tf.saved_model.save(model, CONFIG['SAVEDMODEL_PATH'])
print(f"✓ Saved: {CONFIG['SAVEDMODEL_PATH']}")

# Save metadata
metadata = {
    'model_type': 'LSTM Hand Gesture Recognition',
    'num_classes': CONFIG['NUM_CLASSES'],
    'sequence_length': CONFIG['SEQUENCE_LENGTH'],
    'num_keypoints': CONFIG['NUM_KEYPOINTS'],
    'classes': CLASSES,
    'test_accuracy': float(test_acc),
    'input_shape': [None, CONFIG['SEQUENCE_LENGTH'], CONFIG['NUM_KEYPOINTS'], CONFIG['KEYPOINT_DIMS']],
    'output_shape': [None, CONFIG['NUM_CLASSES']],
    'architecture': 'LSTM(128) -> LSTM(64) -> Dense(32)',
    'training_samples': len(X_train),
    'validation_samples': len(X_val),
    'test_samples': len(X_test)
}

with open(f"{CONFIG['OUTPUT_DIR']}/model_metadata.json", 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"✓ Saved: model_metadata.json")
print(f"\nModel Info:")
for key, value in metadata.items():
    if key != 'classes':
        print(f"  {key:20s}: {value}")

In [None]:
# ============================================
# Cell 13: Convert to TFJS (Optional)
# ============================================

import subprocess

print("\nConverting to TensorFlow.js format...")
print("="*60)

# Note: This requires tensorflowjs package
# pip install tensorflowjs

try:
    import tensorflowjs as tfjs
    
    # Create TFJS output directory
    os.makedirs(CONFIG['TFJS_PATH'], exist_ok=True)
    
    # Convert
    tfjs.converters.save_keras_model(model, CONFIG['TFJS_PATH'])
    
    print(f"✓ Converted to TFJS: {CONFIG['TFJS_PATH']}")
    print(f"\nFiles created:")
    for file in os.listdir(CONFIG['TFJS_PATH']):
        file_path = os.path.join(CONFIG['TFJS_PATH'], file)
        size = os.path.getsize(file_path)
        print(f"  - {file:40s} ({size:,} bytes)")

except ImportError:
    print("⚠ tensorflowjs not installed")
    print("  To convert, run: pip install tensorflowjs")
    print("  Then: tensorflowjs_converter --input_format=keras_saved_model --output_format=tfjs_graph_model")
    print(f"       {CONFIG['SAVEDMODEL_PATH']} {CONFIG['TFJS_PATH']}")

In [None]:
# ============================================
# Cell 14: Test Real-Time Inference
# ============================================

print("\nTesting real-time inference...")
print("="*60)

def predict_gesture(sequence, model, classes):
    """
    Predict gesture from a sequence of hand keypoints.
    
    Args:
        sequence: array of shape (sequence_length, num_keypoints, 2)
        model: trained keras model
        classes: list of class names
    
    Returns:
        prediction: dict with class, confidence, top-3
    """
    # Add batch dimension
    input_seq = np.expand_dims(sequence, axis=0)
    
    # Predict
    probabilities = model.predict(input_seq, verbose=0)[0]
    
    # Get top-3
    top_3_idx = np.argsort(probabilities)[-3:][::-1]
    
    return {
        'predicted_class': classes[np.argmax(probabilities)],
        'confidence': float(np.max(probabilities)),
        'top_3': [
            {'class': classes[idx], 'probability': float(probabilities[idx])}
            for idx in top_3_idx
        ]
    }

# Test on a few samples
print(f"\nTest Predictions (First 5 test samples):")
for i in range(min(5, len(X_test))):
    test_seq = X_test[i]
    true_class = CLASSES[y_test[i]]
    
    result = predict_gesture(test_seq, model, CLASSES)
    
    match = "✓" if result['predicted_class'] == true_class else "✗"
    print(f"\n  Sample {i+1}: {match}")
print(f"    True Class    : {true_class}")
    print(f"    Predicted    : {result['predicted_class']}")
    print(f"    Confidence   : {result['confidence']*100:.2f}%")
    print(f"    Top-3        : ", end="")
    for pred in result['top_3']:
        print(f"{pred['class']} ({pred['probability']*100:.1f}%) ", end="")
    print()

In [None]:
# ============================================
# Cell 15: Integration with Web App
# ============================================

print("\nWeb App Integration Guide")
print("="*60)

print("""
## How to Use in Web Application

### 1. Copy TFJS Model Files
Copy the generated TFJS files to your web app:
```
cp -r output/tfjs_asl_lstm/* public/models/
```

### 2. Update Web Component

In your React/Vue/etc component:

```javascript
// Load model
const model = await tf.loadLayersModel('/models/model.json');

// Load hand landmarks (use MediaPipe)
const landmarks = await extractHandLandmarks(videoFrame);

// Create sequence (30 frames x 21 keypoints x 2 coords)
const sequence = tf.tensor3d(your_landmarks_sequence);

// Predict
const output = model.predict(sequence);
const probabilities = await output.data();
const classIndex = tf.argMax(probabilities).dataSync()[0];
const className = CLASSES[classIndex];
```

### 3. Benefits of LSTM Model
✓ Captures temporal hand movements
✓ Robust to lighting conditions
✓ Works with different hand sizes
✓ Small model size (fast inference)
✓ Privacy-friendly (keypoints only)

### 4. Production Improvements
- Use real gesture videos (not synthetic data)
- Collect more samples per class (100+)
- Fine-tune with user-specific data
- Add data augmentation
- Implement confidence thresholds
- Add gesture sequences (multi-letter words)
""")