# Hand Sign Model Training
This file is used to train machine learning models for hand sign recognition. It uses the MediaPipe Hands model to detect hands and the Tensorflow Object Detection API to train a model for object detection. The model is then saved to a file for future use.

## Chores

In [1]:
from colorama import Fore, Back, Style
class Logger:
    @staticmethod
    def log_success(message):
        print(Fore.GREEN + f"[SUCCESS] : {message}" + Style.RESET_ALL)

    @staticmethod
    def log_error(message):
        print(Fore.RED + f"[ERROR] : {message}" + Style.RESET_ALL)


In [2]:
## Importing Libraries
import os
import zipfile
import cv2
import mediapipe as mp
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle
import kaggle

Logger.log_success("Import Library Success")

[32m[SUCCESS] : Import Library Success[0m


In [3]:
import zipfile
from tqdm import tqdm
import kaggle
import os

KAGGLE_DATASET = "teukumariefafwan/indonesian-sign-language-bahasa-isyarat-indonesia"
DATASET_FOLDER = "BISINDO_Word_Dataset"
VIDEOS_PATH = os.path.join(DATASET_FOLDER, "BISINDO Sign Language Dataset", "Video")

# Define the expected zip file name based on the dataset slug
zip_filename = f"{KAGGLE_DATASET.split('/')[1]}.zip"

if not os.path.exists(DATASET_FOLDER):
    print(f"Downloading dataset '{KAGGLE_DATASET}'...")
    try:
        # Step 1: Download the file WITHOUT unzipping it
        # After
        kaggle.api.dataset_download_files(KAGGLE_DATASET, path='.', unzip=False)
        print(f"Download complete. Now extracting '{zip_filename}'...")

        # Step 2: Unzip the file manually with a tqdm progress bar
        with zipfile.ZipFile(zip_filename, 'r') as zf:
            # Wrap zf.infolist() with tqdm to create the progress bar
            for member in tqdm(zf.infolist(), desc='Extracting '):
                try:
                    # Extract each file into the target folder
                    zf.extract(member, path=DATASET_FOLDER)
                except zipfile.error as e:
                    print(f"Error extracting file {member.filename}: {e}")
        
        # Clean up the downloaded zip file after extraction
        os.remove(zip_filename)
        print("Extraction complete and zip file removed.")

    except Exception as e:
        print(f"An error occurred: {e}")
        raise
else:
    print(f"Dataset folder '{DATASET_FOLDER}' already exists.")

if not os.path.exists(VIDEOS_PATH):
    raise FileNotFoundError(f"Error: Videos path not found at '{VIDEOS_PATH}'. Check the folder structure.")

Downloading dataset 'teukumariefafwan/indonesian-sign-language-bahasa-isyarat-indonesia'...
Dataset URL: https://www.kaggle.com/datasets/teukumariefafwan/indonesian-sign-language-bahasa-isyarat-indonesia
Download complete. Now extracting 'indonesian-sign-language-bahasa-isyarat-indonesia.zip'...


Extracting : 100%|██████████| 2600/2600 [00:24<00:00, 106.65it/s]


Extraction complete and zip file removed.


FileNotFoundError: Error: Videos path not found at 'BISINDO_Word_Dataset\BISINDO Sign Language Dataset\Video'. Check the folder structure.

In [None]:
SEQUENCE_LENGTH = 30  # Number of frames per video to analyze
NUM_LANDMARKS = 21 * 3 # 21 landmarks * (x, y, z)
DATA_FILE = "landmark_sequences.npy"

if not os.path.exists(DATA_FILE):
    print(f"'{DATA_FILE}' not found. Starting video processing...")
    mp_hands = mp.solutions.hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5)
    
    sequences, labels = [], []
    
    # The dataset has category folders (e.g., 'WH-Question'), and inside them are the word videos
    for category in os.listdir(VIDEOS_PATH):
        category_path = os.path.join(VIDEOS_PATH, category)
        if not os.path.isdir(category_path): continue

        for video_file in tqdm(os.listdir(category_path), desc=f"Processing {category}"):
            video_path = os.path.join(category_path, video_file)
            word_label = os.path.splitext(video_file)[0] # Label is the video filename without extension

            cap = cv2.VideoCapture(video_path)
            video_sequence = []

            for frame_num in range(SEQUENCE_LENGTH):
                ret, frame = cap.read()
                if not ret: break # Break if video ends early
                
                results = mp_hands.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                if results.multi_hand_landmarks:
                    landmarks = np.array([[lm.x, lm.y, lm.z] for lm in results.multi_hand_landmarks[0].landmark]).flatten()
                    video_sequence.append(landmarks)
                else:
                    # Append zeros if no hand is detected to maintain sequence length
                    video_sequence.append(np.zeros(NUM_LANDMARKS))
            
            cap.release()
            
            # Only include sequences that have the correct length
            if len(video_sequence) == SEQUENCE_LENGTH:
                sequences.append(video_sequence)
                labels.append(word_label)
    
    mp_hands.close()
    
    X = np.array(sequences)
    y = np.array(labels)
    
    np.save(DATA_FILE, {'features': X, 'labels': y})
    print(f"Processed data saved to '{DATA_FILE}'. Shape: {X.shape}")
else:
    print(f"'{DATA_FILE}' already exists. Loading data.")
    data = np.load(DATA_FILE, allow_pickle=True).item()
    X = data['features']
    y = data['labels']
    print(f"Data loaded. Shape: {X.shape}")

## EDA

### Statistics

In [None]:
# # Dataset Info
# print(df_raw.info())

In [None]:
# # Nullity Check
# print(df_raw.isnull().sum())      

In [None]:
# # First 5 Rows
# print(df_raw.head())

### Sampling

In [None]:
# # Class Distribution
# plt.figure(figsize=(12, 8))
# sns.countplot(y='label', data=df_raw, order=df_raw['label'].value_counts().index, palette='viridis')
# plt.title('Class Distribution in the Dataset', fontsize=16)
# plt.xlabel('Number of Samples', fontsize=12)
# plt.ylabel('Gesture Label', fontsize=12)
# plt.tight_layout()
# plt.show()    

In [None]:
# # Sample Hand Landmarks
# HAND_CONNECTIONS = mp.solutions.hands.HAND_CONNECTIONS

# def visualize_landmarks(landmarks_row):
#     """Plots a 2D representation of the hand landmarks."""
#     landmarks = landmarks_row.values.reshape(21, 3)
#     x = landmarks[:, 0]
#     y = landmarks[:, 1]
    
#     plt.figure(figsize=(5, 5))
#     plt.scatter(x, y)
#     # Invert y-axis to match image coordinates (origin at top-left)
#     plt.gca().invert_yaxis()
    
#     # Draw connections
#     for connection in HAND_CONNECTIONS:
#         start_idx = connection[0]
#         end_idx = connection[1]
#         plt.plot([x[start_idx], x[end_idx]], [y[start_idx], y[end_idx]], 'r-')
        
#     plt.xlabel('X coordinate')
#     plt.ylabel('Y coordinate')
#     plt.title(f"Landmark Visualization for Label: {landmarks_row.name}")
#     plt.axis('equal')
#     plt.show()

# # Visualize one sample from a few different classes
# sample_labels = ['A', 'B', 'C', 'D']
# for label in sample_labels:
#     sample_row = df_raw[df_raw['label'] == label].iloc[0]
#     visualize_landmarks(sample_row.drop('label'))

# Logger.log_success("Sample hand landmarks visualization complete.")

In [None]:
Logger.log_success("EDA complete.")

## Preprocessing

### Data Split

In [None]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.20, random_state=42, stratify=y_encoded
)

print(f"Data split complete. Training samples: {len(X_train)}, Testing samples: {len(X_test)}")
print(f"Number of classes: {len(label_encoder.classes_)}")
print("Cell 4: Preprocessing complete.")

### Preprocessing

In [None]:
# df = pd.read_csv(RAW_CSV_FILE)
# X = df.drop('label', axis=1).values
# y = df['label'].values

# # Feature Engineering: Normalize landmarks
# X_processed = []
# for row in X:
#     landmarks = row.reshape(21, 3)
#     wrist = landmarks[0]
    
#     # Translation invariance
#     relative_landmarks = landmarks - wrist
    
#     # Scale invariance
#     max_dist = np.max(np.linalg.norm(relative_landmarks, axis=1))
#     if max_dist == 0: max_dist = 1 # Avoid division by zero
#     normalized_landmarks = relative_landmarks / max_dist
    
#     X_processed.append(normalized_landmarks.flatten())

# X_processed = np.array(X_processed)

# # Encode labels
# label_encoder = LabelEncoder()
# y_encoded = label_encoder.fit_transform(y)

# print(f"Data preprocessed. Feature shape: {X_processed.shape}")
# print("Cell 5: Preprocessing complete.")

## Modeling and Validation

### Modeling

In [None]:
model = Sequential([
    # Input shape: (SEQUENCE_LENGTH, NUM_FEATURES) -> (30, 63)
    LSTM(64, return_sequences=True, activation='relu', input_shape=(SEQUENCE_LENGTH, NUM_LANDMARKS)),
    Dropout(0.5),
    LSTM(128, return_sequences=False, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    # Output layer with one neuron per class
    Dense(len(label_encoder.classes_), activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()
print("Cell 5: LSTM model defined.")

In [None]:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=10, min_lr=0.00001)
]

history = model.fit(X_train, y_train,
                    epochs=200,
                    batch_size=16,
                    validation_data=(X_test, y_test),
                    callbacks=callbacks)
print("Cell 6: Model training complete.")


### Validation

In [None]:
print("--- Model Validation ---")
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"\nModel Accuracy on Test Set: {accuracy * 100:.2f}%")

y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_, zero_division=0))

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(18, 15))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.title('Confusion Matrix', fontsize=20)
plt.ylabel('Actual Label', fontsize=15)
plt.xlabel('Predicted Label', fontsize=15)
plt.show()
print("Cell 7: Validation complete.")


## Error Analytics

In [None]:
# misclassified_indices = np.where(y_pred != y_test)[0]
# if len(misclassified_indices) > 0:
#     print(f"Found {len(misclassified_indices)} misclassified samples. Analyzing a few...")
    
#     # Find the original image paths (this is a bit slow but good for analysis)
#     df_test_indices = pd.DataFrame({'original_index': X_test.shape[0]*[0]}).index
#     df_test_indices = train_test_split(df.index, test_size=0.2, random_state=42, stratify=y)[1]

#     plt.figure(figsize=(15, 10))
#     for i, idx in enumerate(misclassified_indices[:5]): # Show first 5 errors
#         original_idx = df_test_indices[idx]
#         true_label = label_encoder.inverse_transform([y_test[idx]])[0]
#         pred_label = label_encoder.inverse_transform([y_pred[idx]])[0]
        
#         # Reconstruct image path
#         image_path = os.path.join(DATASET_PATH, true_label, f"{true_label}{original_idx % 100 + 1}.jpg") # This is an assumption on file naming
        
#         # We will just display the labels as finding the exact image is complex
#         print(f"Sample {i+1}: True='{true_label}', Predicted='{pred_label}'")

# else:
#     print("No misclassified samples found on the test set. Excellent model!")

## Export

In [None]:
MODEL_FILE = 'sibi_word_model.h5'
ENCODER_FILE = 'word_label_encoder.pkl'

model.save(MODEL_FILE)
with open(ENCODER_FILE, 'wb') as f:
    pickle.dump(label_encoder, f)

print(f"Model saved to '{MODEL_FILE}'")
print(f"Label encoder saved to '{ENCODER_FILE}'")
print("Cell 8: Export complete.")

## TESTING REAL TIME

In [None]:
print("\n--- Starting Real-Time Prediction ---")
print("Run this cell to start the webcam feed. Press 'q' in the window to quit.")

# Load artifacts
try:
    model = tf.keras.models.load_model(MODEL_FILE)
    with open(ENCODER_FILE, 'rb') as f:
        label_encoder = pickle.load(f)
except (FileNotFoundError, IOError) as e:
    print(f"Error loading artifacts: {e}. Please run all preceding cells.")
    raise

# Initialize MediaPipe and OpenCV
live_hands = mp.solutions.hands.Hands(model_complexity=0, min_detection_confidence=0.5, min_tracking_confidence=0.5)
cap = cv2.VideoCapture(0, cv2.CAP_DSHOW)

# Variables for real-time prediction
sequence = []
current_prediction = ""
prediction_confidence = 0.0

while cap.isOpened():
    success, image = cap.read()
    if not success: continue

    image = cv2.flip(image, 1)
    results = live_hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

    if results.multi_hand_landmarks:
        hand_landmarks = results.multi_hand_landmarks[0]
        mp.solutions.drawing_utils.draw_landmarks(image, hand_landmarks, mp.solutions.hands.HAND_CONNECTIONS)
        
        # Append landmarks to the sequence
        landmarks = np.array([[lm.x, lm.y, lm.z] for lm in hand_landmarks]).flatten()
        sequence.append(landmarks)
        # Keep the sequence at the correct length
        sequence = sequence[-SEQUENCE_LENGTH:]

        # Make a prediction once we have a full sequence
        if len(sequence) == SEQUENCE_LENGTH:
            input_data = np.expand_dims(sequence, axis=0)
            prediction = model.predict(input_data, verbose=0)
            
            predicted_class_index = np.argmax(prediction)
            prediction_confidence = prediction[0][predicted_class_index]
            
            # Only update if confidence is high enough
            if prediction_confidence > 0.7:
                current_prediction = label_encoder.inverse_transform([predicted_class_index])[0]

    # Display Prediction
    cv2.rectangle(image, (0, 0), (400, 60), (0, 0, 0), -1)
    display_text = f'{current_prediction} ({prediction_confidence:.2f})'
    cv2.putText(image, display_text, (10, 40), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (255, 255, 255), 2, cv2.LINE_AA)

    cv2.imshow('SIBI Word Recognition', image)

    if cv2.waitKey(5) & 0xFF == ord('q'): break

cap.release()
cv2.destroyAllWindows()
live_hands.close()
print("Webcam feed stopped.")
