# Hand Gesture Recognition - Model Training
## This notebook will help you:
1. Collect hand gesture data from your webcam
2. Preprocess the data
3. Train a CNN model using TensorFlow
4. Save the trained model for real-time application

## Step 1: Import Required Libraries

In [None]:
import cv2
import numpy as np
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from datetime import datetime
import mediapipe as mp

# Report the TensorFlow build in use and the GPUs it can see
print(f"TensorFlow Version: {tf.__version__}")
print(f"GPU Available: {tf.config.list_physical_devices('GPU')}")

## Step 2: Define Gesture Classes
We'll define 6 gestures; each one is mapped to a control action (listed in the comments of the cell below).

In [None]:
# Gesture vocabulary: class id -> gesture name.
# Ids are assigned by position, so the order of this list defines the labels.
GESTURES = dict(enumerate([
    'palm',           # Move cursor
    'index',          # Left click
    'peace',          # Right click
    'fist',           # Scroll
    'thumb_pinky',    # Open app 1
    'okay',           # Open app 2
]))

# Derived / tunable settings shared by the cells below
NUM_CLASSES = len(GESTURES)
IMG_SIZE = 128
SAMPLES_PER_GESTURE = 500  # Collect 500 samples per gesture

print("Gestures to be trained:")
for class_id in GESTURES:
    print(f"{class_id}: {GESTURES[class_id]}")

## Step 3: Data Collection Function
This will capture images from your webcam for each gesture

In [None]:
def collect_gesture_data(gesture_name, gesture_id, num_samples=500):
    """
    Collect hand gesture images from webcam

    Opens camera device 0, shows a mirrored preview with MediaPipe hand
    landmarks drawn on it and, while capturing is toggled on with SPACE,
    saves a padded crop around each detected hand to
    ``../data/raw/<gesture_name>/``.

    Args:
        gesture_name: Gesture label; used for the output directory, the file
            names and the on-screen caption.
        gesture_id: Numeric class id. Not used inside this function; kept so
            existing callers keep working.
        num_samples: Number of crops to save before the loop stops.

    Returns:
        int: Number of images written during this call.

    Notes:
        - File names restart at ``<gesture_name>_0.jpg`` on every call, so
          running again for the same gesture overwrites earlier files.
        - Crops are taken from the frame after the landmarks were drawn on
          it, so the saved images contain the landmark overlay.
    """
    # Create directory for this gesture
    gesture_dir = f"../data/raw/{gesture_name}"
    os.makedirs(gesture_dir, exist_ok=True)

    # Initialize MediaPipe hands
    mp_hands = mp.solutions.hands
    mp_drawing = mp.solutions.drawing_utils
    hands = mp_hands.Hands(min_detection_confidence=0.7, min_tracking_confidence=0.5)

    cap = cv2.VideoCapture(0)

    padding = 20  # extra pixels kept around the landmark bounding box
    count = 0
    capturing = False

    # try/finally so the camera, the preview window and the MediaPipe object
    # are released even when something raises mid-loop; otherwise the device
    # may stay held by the running kernel.
    try:
        if not cap.isOpened():
            print("Warning: could not open webcam (device 0)")

        cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
        cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

        print(f"\nCollecting data for gesture: {gesture_name}")
        print("Press SPACE to start capturing")
        print("Press 'q' to quit")

        while count < num_samples:
            ret, frame = cap.read()
            if not ret:
                break

            # Flip frame horizontally for mirror view
            frame = cv2.flip(frame, 1)
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Process with MediaPipe
            results = hands.process(rgb_frame)

            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    # Draw hand landmarks
                    mp_drawing.draw_landmarks(
                        frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                    # More than one hand can be reported per frame; never
                    # save more than num_samples crops in total.
                    if not capturing or count >= num_samples:
                        continue

                    # Bounding box of the hand: landmark coordinates are
                    # scaled by the frame size, then padded.
                    h, w = frame.shape[:2]
                    x_coords = [landmark.x for landmark in hand_landmarks.landmark]
                    y_coords = [landmark.y for landmark in hand_landmarks.landmark]

                    # Ensure coordinates are within frame
                    x_min = max(0, int(min(x_coords) * w) - padding)
                    y_min = max(0, int(min(y_coords) * h) - padding)
                    x_max = min(w, int(max(x_coords) * w) + padding)
                    y_max = min(h, int(max(y_coords) * h) + padding)

                    # Extract hand region
                    hand_roi = frame[y_min:y_max, x_min:x_max]
                    if hand_roi.size == 0:
                        continue

                    # Resize to fixed size
                    hand_roi = cv2.resize(hand_roi, (IMG_SIZE, IMG_SIZE))

                    # Save image; only count it when the write succeeded
                    img_path = os.path.join(gesture_dir, f"{gesture_name}_{count}.jpg")
                    if cv2.imwrite(img_path, hand_roi):
                        count += 1

                    # Draw rectangle on original frame
                    cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)

            # Display info
            status = "CAPTURING" if capturing else "Press SPACE to start"
            cv2.putText(frame, f"{gesture_name.upper()}", (10, 30),
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            cv2.putText(frame, f"Status: {status}", (10, 70),
                       cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 255), 2)
            cv2.putText(frame, f"Samples: {count}/{num_samples}", (10, 110),
                       cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 0), 2)

            cv2.imshow('Data Collection', frame)

            # SPACE toggles capturing, 'q' ends this gesture early
            key = cv2.waitKey(1) & 0xFF
            if key == ord(' '):
                capturing = not capturing
            elif key == ord('q'):
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()
        hands.close()

    print(f"Collected {count} samples for {gesture_name}")
    return count

## Step 4: Collect Data for All Gestures
**Instructions:**
- Make sure you have good lighting
- Position your hand clearly in front of the camera
- Press SPACE to start/stop capturing
- Move your hand slightly while capturing to add variation
- Press 'q' to stop early and move on to the next gesture

In [None]:
# Walk through every gesture in GESTURES and record samples for it.
# Each gesture waits for Enter first so there is time to get the hand ready.
rule = "=" * 50
print(rule, "GESTURE DATA COLLECTION", rule, sep="\n")

for class_id, label in GESTURES.items():
    input(f"\nPress Enter to start collecting '{label}' gesture...")
    collect_gesture_data(label, class_id, SAMPLES_PER_GESTURE)

print("", rule, "Data collection completed!", rule, sep="\n")

## Step 5: Load and Preprocess Data

In [None]:
def load_data():
    """
    Load all collected gesture images and create dataset

    For every entry in ``GESTURES`` this reads each readable image under
    ``../data/raw/<gesture_name>/``, converts it from BGR to RGB and scales
    the pixel values to the 0-1 range. Missing gesture folders are reported
    and skipped; files that OpenCV cannot decode are ignored.

    Returns:
        tuple: ``(X, y)`` - ``X`` is the array of preprocessed images and
        ``y`` the array of matching gesture ids, in the same order.
    """
    X = []
    y = []

    data_dir = "../data/raw"

    for gesture_id, gesture_name in GESTURES.items():
        gesture_path = os.path.join(data_dir, gesture_name)

        if not os.path.exists(gesture_path):
            print(f"Warning: {gesture_path} not found")
            continue

        # os.listdir() order is arbitrary; sorting keeps the sample order
        # (and therefore any seeded split done on it) the same across runs.
        images = sorted(os.listdir(gesture_path))
        print(f"Loading {len(images)} images for {gesture_name}...")

        for img_name in images:
            img_path = os.path.join(gesture_path, img_name)
            img = cv2.imread(img_path)

            # imread returns None for anything it cannot decode
            if img is None:
                continue

            # Bring stray sizes to the expected input size so the samples
            # stack into one array (no-op for images already IMG_SIZE square).
            if img.shape[:2] != (IMG_SIZE, IMG_SIZE):
                img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))

            # Convert BGR to RGB
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            # Normalize pixel values
            img = img.astype('float32') / 255.0

            X.append(img)
            y.append(gesture_id)

    X = np.array(X)
    y = np.array(y)

    print(f"\nTotal samples loaded: {len(X)}")
    print(f"Data shape: {X.shape}")
    print(f"Labels shape: {y.shape}")

    return X, y

# Load the data
X, y = load_data()

# Fail early with a clear message instead of an obscure error further down
# when nothing was collected yet.
if len(X) == 0:
    raise ValueError(
        "No gesture images were loaded from ../data/raw - "
        "run the data collection step first."
    )

# Convert labels to categorical (one-hot rows, NUM_CLASSES columns)
y_categorical = to_categorical(y, NUM_CLASSES)

# Split into train and test sets.
# stratify uses the integer labels so both splits keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_categorical, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTraining set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")

## Step 6: Visualize Sample Data

In [None]:
# Display sample images from each gesture
# Grid is sized from GESTURES (3 per row) so it still fits if gestures are
# added or removed; with 6 gestures this is the same 2x3 / 12x8 layout.
n_cols = 3
n_rows = (len(GESTURES) + n_cols - 1) // n_cols  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 4 * n_rows))
axes = np.atleast_1d(axes).ravel()

for ax, (gesture_id, gesture_name) in zip(axes, GESTURES.items()):
    ax.set_title(f"{gesture_name.upper()}", fontsize=12, fontweight='bold')
    ax.axis('off')

    # Find first image of this gesture; a gesture without any loaded
    # samples leaves its panel empty instead of raising IndexError.
    matches = np.flatnonzero(y == gesture_id)
    if matches.size == 0:
        print(f"Warning: no samples loaded for {gesture_name}")
        continue
    ax.imshow(X[matches[0]])

# Hide grid cells that have no gesture assigned
for ax in axes[len(GESTURES):]:
    ax.axis('off')

plt.tight_layout()
plt.savefig('../data/sample_gestures.png', dpi=150, bbox_inches='tight')
plt.show()

print("Sample gestures visualized!")

## Step 7: Build CNN Model

In [None]:
def create_model():
    """
    Create a CNN model for gesture recognition

    The network is three convolutional blocks (32, 64 and 128 filters)
    followed by two fully connected blocks (256 and 128 units) and a softmax
    layer with ``NUM_CLASSES`` outputs. The input is an
    ``IMG_SIZE x IMG_SIZE`` image with 3 channels.

    Returns:
        An uncompiled Keras ``Sequential`` model.
    """
    def conv_block(filters, **first_conv_kwargs):
        # Two 3x3 conv + batch-norm pairs, then 2x2 max-pooling and dropout
        return [
            Conv2D(filters, (3, 3), activation='relu', **first_conv_kwargs),
            BatchNormalization(),
            Conv2D(filters, (3, 3), activation='relu'),
            BatchNormalization(),
            MaxPooling2D((2, 2)),
            Dropout(0.25),
        ]

    def dense_block(units):
        # Fully connected layer with batch-norm and dropout
        return [
            Dense(units, activation='relu'),
            BatchNormalization(),
            Dropout(0.5),
        ]

    stack = []

    # Convolutional feature extractor (only the first layer declares the input)
    stack += conv_block(32, input_shape=(IMG_SIZE, IMG_SIZE, 3))
    stack += conv_block(64)
    stack += conv_block(128)

    # Classifier head
    stack.append(Flatten())
    stack += dense_block(256)
    stack += dense_block(128)

    # Output layer: one probability per gesture class
    stack.append(Dense(NUM_CLASSES, activation='softmax'))

    return Sequential(stack)

# Build the network
model = create_model()

# Configure training: Adam optimizer, categorical cross-entropy to match the
# one-hot labels, accuracy as the reported metric
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Print the layer-by-layer overview
model.summary()

## Step 8: Data Augmentation

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Create data augmentation generator: random rotations, shifts, shear, zoom
# and horizontal flips are applied on the fly to the training batches.
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# No datagen.fit(X_train) here: fit() only computes the statistics needed by
# featurewise_center / featurewise_std_normalization / zca_whitening, and
# none of those options is enabled above. Call it again if one is turned on.

print("Data augmentation configured!")

## Step 9: Train the Model

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

# Make sure the output directory exists before the first checkpoint is written
os.makedirs('../models', exist_ok=True)

# Define callbacks
# Keep the weights of the epoch with the highest validation accuracy on disk
checkpoint = ModelCheckpoint(
    '../models/best_gesture_model.h5',
    monitor='val_accuracy',
    save_best_only=True,
    mode='max',
    verbose=1
)

# Stop when validation loss has not improved for 15 epochs and roll the model
# back to its best weights
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True,
    verbose=1
)

# Halve the learning rate after 5 epochs without validation-loss improvement
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    min_lr=1e-7,
    verbose=1
)

# Train the model
EPOCHS = 50
BATCH_SIZE = 32

print("Starting training...")
print(f"Epochs: {EPOCHS}")
print(f"Batch size: {BATCH_SIZE}")

# NOTE(review): the test split doubles as validation data here, so the
# callbacks select the model on the same samples that are later reported as
# "test" results. Consider carving a separate validation split off X_train.
history = model.fit(
    datagen.flow(X_train, y_train, batch_size=BATCH_SIZE),
    validation_data=(X_test, y_test),
    epochs=EPOCHS,
    callbacks=[checkpoint, early_stop, reduce_lr],
    verbose=1
)

print("\nTraining completed!")

print("\nTraining completed!")

## Step 10: Evaluate Model Performance

In [None]:
# Score the trained model on the held-out split (loss first, then accuracy)
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)

rule = "=" * 50
print(
    rule,
    "MODEL EVALUATION RESULTS",
    rule,
    f"Test Loss: {test_loss:.4f}",
    f"Test Accuracy: {test_accuracy*100:.2f}%",
    rule,
    sep="\n",
)

## Step 11: Plot Training History

In [None]:
# Training curves: accuracy on the left, loss on the right
fig, panels = plt.subplots(1, 2, figsize=(14, 5))

# (history key, legend position) for each panel
curve_specs = [('accuracy', 'lower right'), ('loss', 'upper right')]

for panel, (metric, legend_loc) in zip(panels, curve_specs):
    pretty = metric.capitalize()
    panel.plot(history.history[metric], label=f'Training {pretty}', linewidth=2)
    panel.plot(history.history[f'val_{metric}'], label=f'Validation {pretty}', linewidth=2)
    panel.set_title(f'Model {pretty}', fontsize=14, fontweight='bold')
    panel.set_xlabel('Epoch', fontsize=12)
    panel.set_ylabel(pretty, fontsize=12)
    panel.legend(loc=legend_loc, fontsize=10)
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../models/training_history.png', dpi=150, bbox_inches='tight')
plt.show()

print("Training history plotted!")

## Step 12: Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

# Fix the class order explicitly so the matrix and the report always cover
# every gesture, even one that has no sample in the test split; otherwise the
# tick labels / target_names would no longer line up with the classes found.
class_ids = list(GESTURES.keys())
class_names = list(GESTURES.values())

# Predictions: pick the most probable class per sample; the one-hot test
# labels are turned back into class ids the same way
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(y_test, axis=1)

# Confusion matrix
cm = confusion_matrix(y_true_classes, y_pred_classes, labels=class_ids)

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title('Confusion Matrix', fontsize=16, fontweight='bold')
plt.ylabel('True Label', fontsize=12)
plt.xlabel('Predicted Label', fontsize=12)
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.tight_layout()
plt.savefig('../models/confusion_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

# Classification report
# zero_division=0 reports 0 (without warnings) for classes that have no
# true or predicted samples.
print("\nClassification Report:")
print("="*70)
report = classification_report(y_true_classes, y_pred_classes,
                               labels=class_ids,
                               target_names=class_names,
                               zero_division=0)
print(report)

## Step 13: Save Final Model

In [None]:
# Persist the trained network (architecture + weights) in HDF5 format
model.save('../models/gesture_model.h5')
print("Model saved as 'gesture_model.h5'")

# Architecture only, as a JSON document
with open("../models/model_architecture.json", "w") as json_file:
    json_file.write(model.to_json())
print("Model architecture saved as 'model_architecture.json'")

# Class id -> gesture name table for the consumer of the model.
# JSON object keys are always strings, so the integer ids are written as
# "0", "1", ... and have to be converted back with int() when loading.
import json
with open('../models/gesture_mapping.json', 'w') as f:
    f.write(json.dumps(GESTURES, indent=4))
print("Gesture mapping saved as 'gesture_mapping.json'")

rule = "=" * 70
print(
    "",
    rule,
    "MODEL TRAINING COMPLETED SUCCESSFULLY!",
    rule,
    "",
    "You can now use this model in the real-time gesture controller!",
    "Run the 'gesture_controller.py' file from VS Code.",
    sep="\n",
)

## Step 14: Test Model with Live Camera (Optional)

In [None]:
def test_model_live():
    """
    Test the trained model with live camera feed

    Opens camera device 0, detects hands with MediaPipe, crops a padded box
    around each hand, preprocesses the crop the same way as the training
    images (resize to ``IMG_SIZE``, BGR to RGB, scale to 0-1) and draws the
    predicted gesture name with its confidence on the preview. Uses the
    module-level ``model`` and ``GESTURES``. Press 'q' to quit.

    Returns:
        None
    """
    mp_hands = mp.solutions.hands
    mp_drawing = mp.solutions.drawing_utils
    hands = mp_hands.Hands(min_detection_confidence=0.7, min_tracking_confidence=0.5)

    cap = cv2.VideoCapture(0)

    padding = 20  # extra pixels kept around the landmark bounding box

    # try/finally so the camera, the preview window and the MediaPipe object
    # are released even when prediction or drawing raises mid-loop.
    try:
        if not cap.isOpened():
            print("Warning: could not open webcam (device 0)")

        print("Testing model with live camera...")
        print("Press 'q' to quit")

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Mirror view, then RGB copy for MediaPipe
            frame = cv2.flip(frame, 1)
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = hands.process(rgb_frame)

            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                    # Get hand bounding box: landmark coordinates are scaled
                    # by the frame size, padded and clamped to the frame
                    h, w = frame.shape[:2]
                    x_coords = [landmark.x for landmark in hand_landmarks.landmark]
                    y_coords = [landmark.y for landmark in hand_landmarks.landmark]

                    x_min = max(0, int(min(x_coords) * w) - padding)
                    y_min = max(0, int(min(y_coords) * h) - padding)
                    x_max = min(w, int(max(x_coords) * w) + padding)
                    y_max = min(h, int(max(y_coords) * h) + padding)

                    hand_roi = frame[y_min:y_max, x_min:x_max]
                    if hand_roi.size == 0:
                        continue

                    # Preprocess for prediction
                    hand_roi_resized = cv2.resize(hand_roi, (IMG_SIZE, IMG_SIZE))
                    hand_roi_rgb = cv2.cvtColor(hand_roi_resized, cv2.COLOR_BGR2RGB)
                    hand_roi_normalized = hand_roi_rgb.astype('float32') / 255.0
                    hand_roi_batch = np.expand_dims(hand_roi_normalized, axis=0)

                    # Predict; plain int so the id is an ordinary dict key
                    prediction = model.predict(hand_roi_batch, verbose=0)
                    gesture_id = int(np.argmax(prediction))
                    confidence = prediction[0][gesture_id]

                    gesture_name = GESTURES[gesture_id]

                    # Draw prediction; keep the caption inside the frame when
                    # the hand touches the top edge
                    label_y = max(y_min - 10, 20)
                    cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
                    cv2.putText(frame, f"{gesture_name}: {confidence:.2f}",
                               (x_min, label_y), cv2.FONT_HERSHEY_SIMPLEX,
                               0.7, (0, 255, 0), 2)

            cv2.imshow('Model Test', frame)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()
        hands.close()

# Uncomment to test
# test_model_live()