In [3]:
import os
import cv2

# Path to store collected data (modified for your location)
DATA_DIR = r'C:\Users\Chait\welllog.ipynb\project_directory\data'
if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)

# Define the alphabet and the number of images per class
alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
dataset_size = 100  # Number of images for each letter

# Initialize webcam
cap = cv2.VideoCapture(0)

# Loop over each letter in the alphabet
for letter in alphabet:
    letter_dir = os.path.join(DATA_DIR, letter)
    if not os.path.exists(letter_dir):
        os.makedirs(letter_dir)

    print(f'Collecting data for class {letter}')
    input("Press Enter when ready to start capturing images for " + letter)

    counter = 0
    while counter < dataset_size:
        ret, frame = cap.read()
        if not ret:
            print("Failed to grab frame")
            continue
        
        cv2.imshow('frame', frame)
        cv2.waitKey(25)
        
        # Save the captured frames as images
        cv2.imwrite(os.path.join(letter_dir, f'{counter}.jpg'), frame)
        counter += 1

        # Exit on pressing 'q'
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

# Release webcam and close window
cap.release()
cv2.destroyAllWindows()


Collecting data for class A


Press Enter when ready to start capturing images for A 


Collecting data for class B


Press Enter when ready to start capturing images for B 


Collecting data for class C


Press Enter when ready to start capturing images for C 


Collecting data for class D


Press Enter when ready to start capturing images for D 


Collecting data for class E


Press Enter when ready to start capturing images for E 


Collecting data for class F


Press Enter when ready to start capturing images for F 


Collecting data for class G


Press Enter when ready to start capturing images for G 


Collecting data for class H


Press Enter when ready to start capturing images for H 


Collecting data for class I


Press Enter when ready to start capturing images for I 


Collecting data for class J


Press Enter when ready to start capturing images for J 


Collecting data for class K


Press Enter when ready to start capturing images for K 


Collecting data for class L


Press Enter when ready to start capturing images for L 


Collecting data for class M


Press Enter when ready to start capturing images for M 


Collecting data for class N


Press Enter when ready to start capturing images for N 


Collecting data for class O


Press Enter when ready to start capturing images for O 


Collecting data for class P


Press Enter when ready to start capturing images for P 


Collecting data for class Q


Press Enter when ready to start capturing images for Q 


Collecting data for class R


Press Enter when ready to start capturing images for R 


Collecting data for class S


Press Enter when ready to start capturing images for S 


Collecting data for class T


Press Enter when ready to start capturing images for T 


Collecting data for class U


Press Enter when ready to start capturing images for U 


Collecting data for class V


Press Enter when ready to start capturing images for V 


Collecting data for class W


Press Enter when ready to start capturing images for W 


Collecting data for class X


Press Enter when ready to start capturing images for X 


Collecting data for class Y


Press Enter when ready to start capturing images for Y 


Collecting data for class Z


Press Enter when ready to start capturing images for Z 


In [4]:
!pip install scikit-learn



In [5]:
!pip install mediapipe



In [6]:
!pip install tqdm



In [7]:
pip install torch torchvision torchaudio

Collecting torch
  Obtaining dependency information for torch from https://files.pythonhosted.org/packages/5a/6a/775b93d6888c31f1f1fc457e4f5cc89f0984412d5dcdef792b8f2aa6e812/torch-2.4.1-cp311-cp311-win_amd64.whl.metadata
  Downloading torch-2.4.1-cp311-cp311-win_amd64.whl.metadata (27 kB)
Collecting torchvision
  Obtaining dependency information for torchvision from https://files.pythonhosted.org/packages/f8/69/dc769cf54df8e828c0b8957b4521f35178f5bd4cc5b8fbe8a37ffd89a27c/torchvision-0.19.1-cp311-cp311-win_amd64.whl.metadata
  Downloading torchvision-0.19.1-cp311-cp311-win_amd64.whl.metadata (6.1 kB)
Collecting torchaudio
  Obtaining dependency information for torchaudio from https://files.pythonhosted.org/packages/b7/ba/6dde28d32906dba5e9a1b240c9b328f564ce3ac020c0f159cc13c2d47d9d/torchaudio-2.4.1-cp311-cp311-win_amd64.whl.metadata
  Downloading torchaudio-2.4.1-cp311-cp311-win_amd64.whl.metadata (6.5 kB)
Collecting typing-extensions>=4.8.0 (from torch)
  Obtaining dependency informatio

In [8]:
import os
import cv2
import mediapipe as mp
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm  # Progress bar library
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Check if GPU is available and set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=True, min_detection_confidence=0.3)

# Path to dataset (modified for your location)
DATA_DIR = r'C:\Users\Chait\welllog.ipynb\project_directory\data'
alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

# Prepare the data and labels
data = []
labels = []

# Count total images for progress tracking
total_images = sum([len(files) for r, d, files in os.walk(DATA_DIR)])

# Loop through each letter in the dataset and display progress
for label in tqdm(alphabet, desc="Processing letters", unit="letter"):
    label_dir = os.path.join(DATA_DIR, label)
    for img_path in tqdm(os.listdir(label_dir), desc=f"Processing images for {label}", unit="image", leave=False):
        img = cv2.imread(os.path.join(label_dir, img_path))
        if img is None:
            continue

        # Convert the image to RGB for MediaPipe processing
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Process the image with MediaPipe Hands
        results = hands.process(img_rgb)

        # Collect hand landmarks if found
        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                data_aux = []
                x_ = []
                y_ = []

                # Collect x and y coordinates
                for landmark in hand_landmarks.landmark:
                    x = landmark.x
                    y = landmark.y
                    x_.append(x)
                    y_.append(y)

                # Normalize the coordinates
                for landmark in hand_landmarks.landmark:
                    data_aux.append(landmark.x - min(x_))
                    data_aux.append(landmark.y - min(y_))

                # Append the data and label
                data.append(data_aux)
                labels.append(label)

# Convert labels to numeric format
labels = [alphabet.index(label) for label in labels]

# Convert data and labels to torch tensors and move them to the GPU if available
data_tensor = torch.tensor(np.array(data), dtype=torch.float32).to(device)
labels_tensor = torch.tensor(np.array(labels), dtype=torch.long).to(device)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data_tensor, labels_tensor, test_size=0.2, random_state=42)

# Create a simple neural network model
class SignLanguageModel(nn.Module):
    def __init__(self):
        super(SignLanguageModel, self).__init__()
        self.fc1 = nn.Linear(X_train.shape[1], 128)  # Input layer
        self.fc2 = nn.Linear(128, 64)                # Hidden layer
        self.fc3 = nn.Linear(64, len(alphabet))      # Output layer

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)  # No activation, as we'll use CrossEntropyLoss
        return x

# Initialize the model and move it to the GPU if available
model = SignLanguageModel().to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Prepare DataLoader for batch processing
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(train_loader):.4f}")

# Test the model
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total * 100
print(f'Model Accuracy: {accuracy:.2f}%')

# Save the model
torch.save(model.state_dict(), 'sign_language_model.pth')
print("Model saved as 'sign_language_model.pth'")


Using device: cpu


Processing letters:   0%|          | 0/26 [00:00<?, ?letter/s]

Processing images for A:   2%|▏         | 2/100 [00:00<00:05, 18.27image/s][A
Processing images for A:   5%|▌         | 5/100 [00:00<00:04, 20.51image/s][A
Processing images for A:   8%|▊         | 8/100 [00:00<00:04, 22.32image/s][A
Processing images for A:  11%|█         | 11/100 [00:00<00:04, 20.54image/s][A
Processing images for A:  14%|█▍        | 14/100 [00:00<00:03, 22.41image/s][A
Processing images for A:  17%|█▋        | 17/100 [00:00<00:04, 19.89image/s][A
Processing images for A:  20%|██        | 20/100 [00:00<00:03, 21.31image/s][A
Processing images for A:  23%|██▎       | 23/100 [00:01<00:03, 21.33image/s][A
Processing images for A:  27%|██▋       | 27/100 [00:01<00:03, 23.89image/s][A
Processing images for A:  30%|███       | 30/100 [00:01<00:02, 23.69image/s][A
Processing images for A:  33%|███▎      | 33/100 [00:01<00:02, 23.63image/s][A
Processing images for A:  36%|███▌      | 36/100 [00:01<00:

Epoch [1/10], Loss: 3.1793
Epoch [2/10], Loss: 2.5499
Epoch [3/10], Loss: 1.6311
Epoch [4/10], Loss: 1.0460
Epoch [5/10], Loss: 0.6904
Epoch [6/10], Loss: 0.4743
Epoch [7/10], Loss: 0.3588
Epoch [8/10], Loss: 0.2724
Epoch [9/10], Loss: 0.2128
Epoch [10/10], Loss: 0.1764
Model Accuracy: 95.85%
Model saved as 'sign_language_model.pth'


In [9]:
import os
import pickle
import cv2
import mediapipe as mp
import numpy as np
import torch
from tqdm import tqdm  # Progress bar
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=True, min_detection_confidence=0.3)

# Path to dataset and test images (updated for your directory structure)
DATA_DIR = r'C:\Users\Chait\welllog.ipynb\project_directory\data'  # Path to your dataset
TEST_IMAGE_PATH = r'C:\Users\Chait\welllog.ipynb\project_directory\projectsign\testimages'  # Path to your test images
MODEL_PATH = r'C:\Users\Chait\welllog.ipynb\project_directory\model.p'  # Path to save the trained model
alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

# Prepare the data and labels
data = []
labels = []

# Count total images for progress tracking
total_images = sum([len(files) for r, d, files in os.walk(DATA_DIR)])

# Loop through each letter in the dataset and display progress
for label in tqdm(alphabet, desc="Processing letters", unit="letter"):
    label_dir = os.path.join(DATA_DIR, label)
    for img_path in tqdm(os.listdir(label_dir), desc=f"Processing images for {label}", unit="image", leave=False):
        img = cv2.imread(os.path.join(label_dir, img_path))
        if img is None:
            continue

        # Convert the image to RGB for MediaPipe processing
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Process the image with MediaPipe Hands
        results = hands.process(img_rgb)

        # Collect hand landmarks if found
        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                data_aux = []
                x_ = []
                y_ = []

                # Collect x and y coordinates
                for landmark in hand_landmarks.landmark:
                    x = landmark.x
                    y = landmark.y
                    x_.append(x)
                    y_.append(y)

                # Normalize the coordinates
                for landmark in hand_landmarks.landmark:
                    data_aux.append(landmark.x - min(x_))
                    data_aux.append(landmark.y - min(y_))

                # Append the data and label
                data.append(data_aux)
                labels.append(label)

# Convert labels to numeric format
labels = [alphabet.index(label) for label in labels]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(np.array(data), np.array(labels), test_size=0.2, random_state=42)

# Train a RandomForestClassifier
print("Training the model...")
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Test the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy * 100:.2f}%')

# Save the model to a file
with open(MODEL_PATH, 'wb') as f:
    pickle.dump({'model': model}, f)

print(f"Model saved as '{MODEL_PATH}'")

# Function to predict sign language from test images
def predict_sign_language(image_path, model, hands):
    img = cv2.imread(image_path)
    if img is None:
        print(f"Image not found: {image_path}")
        return None

    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    results = hands.process(img_rgb)

    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            data_aux = []
            x_ = []
            y_ = []

            for landmark in hand_landmarks.landmark:
                x = landmark.x
                y = landmark.y
                x_.append(x)
                y_.append(y)

            for landmark in hand_landmarks.landmark:
                data_aux.append(landmark.x - min(x_))
                data_aux.append(landmark.y - min(y_))

            data_aux = np.array(data_aux).reshape(1, -1)
            prediction = model.predict(data_aux)
            return alphabet[prediction[0]]
    return None

# Load the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
with open(MODEL_PATH, 'rb') as f:
    model_dict = pickle.load(f)
model = model_dict['model']

# Loop through the test images and predict each one
test_images = os.listdir(TEST_IMAGE_PATH)

print("Starting prediction on test images...")

# Display progress using tqdm
for image_file in tqdm(test_images, desc="Predicting signs"):
    image_path = os.path.join(TEST_IMAGE_PATH, image_file)
    prediction = predict_sign_language(image_path, model, hands)
    if prediction:
        print(f"Prediction for {image_file}: {prediction}")
    else:
        print(f"No hand detected in {image_file}")

print("Finished processing all test images.")


Processing letters:   0%|          | 0/26 [00:00<?, ?letter/s]

Processing images for A:   2%|▏         | 2/100 [00:00<00:05, 16.68image/s][A
Processing images for A:   5%|▌         | 5/100 [00:00<00:04, 21.47image/s][A
Processing images for A:   8%|▊         | 8/100 [00:00<00:04, 21.11image/s][A
Processing images for A:  11%|█         | 11/100 [00:00<00:03, 23.83image/s][A
Processing images for A:  14%|█▍        | 14/100 [00:00<00:03, 24.23image/s][A
Processing images for A:  17%|█▋        | 17/100 [00:00<00:03, 23.53image/s][A
Processing images for A:  20%|██        | 20/100 [00:00<00:03, 24.98image/s][A
Processing images for A:  23%|██▎       | 23/100 [00:00<00:03, 23.38image/s][A
Processing images for A:  26%|██▌       | 26/100 [00:01<00:03, 23.51image/s][A
Processing images for A:  30%|███       | 30/100 [00:01<00:02, 26.27image/s][A
Processing images for A:  33%|███▎      | 33/100 [00:01<00:02, 26.10image/s][A
Processing images for A:  36%|███▌      | 36/100 [00:01<00:

Training the model...
Model Accuracy: 99.80%
Model saved as 'C:\Users\Chait\welllog.ipynb\project_directory\model.p'


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:\\Users\\Chait\\welllog.ipynb\\project_directory\\projectsign\\testimages'

In [10]:
TEST_IMAGE_PATH = r'C:\Users\Chait\welllog.ipynb\project_directory\testimages'


In [11]:
# Load the model
with open(MODEL_PATH, 'rb') as f:
    model_dict = pickle.load(f)
model = model_dict['model']

# Loop through the test images and predict each one
test_images = os.listdir(TEST_IMAGE_PATH)

print("Starting prediction on test images...")

# Display progress using tqdm
for image_file in tqdm(test_images, desc="Predicting signs"):
    image_path = os.path.join(TEST_IMAGE_PATH, image_file)
    prediction = predict_sign_language(image_path, model, hands)
    if prediction:
        print(f"Prediction for {image_file}: {prediction}")
    else:
        print(f"No hand detected in {image_file}")

print("Finished processing all test images.")


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:\\Users\\Chait\\welllog.ipynb\\project_directory\\testimages'

In [12]:
import os
import pickle
import cv2
import mediapipe as mp
import numpy as np
import torch
from tqdm import tqdm  # Progress bar
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Ensure required libraries are installed
try:
    import cv2
    import mediapipe as mp
    import torch
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score
    from tqdm import tqdm
except ImportError:
    print("Some required libraries are missing, installing them now...")
    os.system('pip install opencv-python mediapipe torch scikit-learn tqdm')

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=True, min_detection_confidence=0.3)

# Define paths (update the paths as per your system)
DATA_DIR = r'C:\Users\Chait\welllog.ipynb\project_directory\data'  # Path to your dataset
TEST_IMAGE_PATH = r'C:\Users\Chait\welllog.ipynb\project_directory\testimages'  # Path to your test images
MODEL_PATH = r'C:\Users\Chait\welllog.ipynb\project_directory\model.p'  # Path to save the trained model
alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

# Check if dataset and test image directories exist
if not os.path.exists(DATA_DIR):
    print(f"Dataset directory does not exist: {DATA_DIR}")
    exit(1)
if not os.path.exists(TEST_IMAGE_PATH):
    print(f"Test image directory does not exist: {TEST_IMAGE_PATH}")
    exit(1)

# Prepare the data and labels
data = []
labels = []

# Count total images for progress tracking
total_images = sum([len(files) for r, d, files in os.walk(DATA_DIR)])

# Loop through each letter in the dataset and display progress
for label in tqdm(alphabet, desc="Processing letters", unit="letter"):
    label_dir = os.path.join(DATA_DIR, label)
    for img_path in tqdm(os.listdir(label_dir), desc=f"Processing images for {label}", unit="image", leave=False):
        img = cv2.imread(os.path.join(label_dir, img_path))
        if img is None:
            continue

        # Convert the image to RGB for MediaPipe processing
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Process the image with MediaPipe Hands
        results = hands.process(img_rgb)

        # Collect hand landmarks if found
        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                data_aux = []
                x_ = []
                y_ = []

                # Collect x and y coordinates
                for landmark in hand_landmarks.landmark:
                    x = landmark.x
                    y = landmark.y
                    x_.append(x)
                    y_.append(y)

                # Normalize the coordinates
                for landmark in hand_landmarks.landmark:
                    data_aux.append(landmark.x - min(x_))
                    data_aux.append(landmark.y - min(y_))

                # Append the data and label
                data.append(data_aux)
                labels.append(label)

# Convert labels to numeric format
labels = [alphabet.index(label) for label in labels]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(np.array(data), np.array(labels), test_size=0.2, random_state=42)

# Train a RandomForestClassifier if model doesn't already exist
if not os.path.exists(MODEL_PATH):
    print("Training the model...")
    model = RandomForestClassifier()
    model.fit(X_train, y_train)

    # Test the model
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Model Accuracy: {accuracy * 100:.2f}%')

    # Save the model to a file
    with open(MODEL_PATH, 'wb') as f:
        pickle.dump({'model': model}, f)

    print(f"Model saved as '{MODEL_PATH}'")
else:
    # Load the existing model
    print(f"Loading existing model from '{MODEL_PATH}'...")
    with open(MODEL_PATH, 'rb') as f:
        model_dict = pickle.load(f)
    model = model_dict['model']

# Function to predict sign language from test images
def predict_sign_language(image_path, model, hands):
    img = cv2.imread(image_path)
    if img is None:
        print(f"Image not found: {image_path}")
        return None

    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    results = hands.process(img_rgb)

    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            data_aux = []
            x_ = []
            y_ = []

            for landmark in hand_landmarks.landmark:
                x = landmark.x
                y = landmark.y
                x_.append(x)
                y_.append(y)

            for landmark in hand_landmarks.landmark:
                data_aux.append(landmark.x - min(x_))
                data_aux.append(landmark.y - min(y_))

            data_aux = np.array(data_aux).reshape(1, -1)
            prediction = model.predict(data_aux)
            return alphabet[prediction[0]]
    return None

# Loop through the test images and predict each one
test_images = os.listdir(TEST_IMAGE_PATH)
print("Starting prediction on test images...")

# Display progress using tqdm
for image_file in tqdm(test_images, desc="Predicting signs"):
    image_path = os.path.join(TEST_IMAGE_PATH, image_file)
    prediction = predict_sign_language(image_path, model, hands)
    if prediction:
        print(f"Prediction for {image_file}: {prediction}")
    else:
        print(f"No hand detected in {image_file}")

print("Finished processing all test images.")


Test image directory does not exist: C:\Users\Chait\welllog.ipynb\project_directory\testimages


Processing letters:   0%|          | 0/26 [00:00<?, ?letter/s]

Processing images for A:   2%|▏         | 2/100 [00:00<00:05, 16.36image/s][A
Processing images for A:   5%|▌         | 5/100 [00:00<00:04, 20.05image/s][A
Processing images for A:   8%|▊         | 8/100 [00:00<00:04, 22.91image/s][A
Processing images for A:  11%|█         | 11/100 [00:00<00:03, 23.56image/s][A
Processing images for A:  14%|█▍        | 14/100 [00:00<00:03, 23.58image/s][A
Processing images for A:  17%|█▋        | 17/100 [00:00<00:03, 23.96image/s][A
Processing images for A:  20%|██        | 20/100 [00:00<00:03, 25.17image/s][A
Processing images for A:  23%|██▎       | 23/100 [00:00<00:03, 24.39image/s][A
Processing images for A:  26%|██▌       | 26/100 [00:01<00:03, 23.81image/s][A
Processing images for A:  29%|██▉       | 29/100 [00:01<00:03, 23.57image/s][A
Processing images for A:  33%|███▎      | 33/100 [00:01<00:02, 26.02image/s][A
Processing images for A:  36%|███▌      | 36/100 [00:01<00:

Loading existing model from 'C:\Users\Chait\welllog.ipynb\project_directory\model.p'...





FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:\\Users\\Chait\\welllog.ipynb\\project_directory\\testimages'

In [2]:
import cv2
import mediapipe as mp
import numpy as np
import pickle
from collections import deque
from autocorrect import Speller
from tqdm import tqdm  # Progress bar for processing

# Load the trained model from the specified path
model_path = r'C:\users\chait\welllog.ipynb\project_directory\model.p'
with open(model_path, 'rb') as f:
    model_dict = pickle.load(f)
model = model_dict['model']

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, min_detection_confidence=0.3)

# Speller for autocorrecting words
spell = Speller(lang='en')

# Define a deque to store detected letters
detected_letters = deque(maxlen=20)  # Adjust maxlen based on how many letters you want to store

# Define a function to combine letters into words and autocorrect them
def form_words_and_autocorrect(detected_letters):
    word = ''.join(detected_letters)
    corrected_word = spell(word)
    return corrected_word

# Function to capture and process webcam frames
def capture_webcam_frame():
    cap = cv2.VideoCapture(0)  # Use the first webcam device
    if not cap.isOpened():
        print("Error: Could not open webcam.")
        return None

    frame_count = 0
    frame_skip = 5  # You can adjust this to process every nth frame
    total_frames = 100  # Define the total number of frames you want to process

    with tqdm(total=total_frames, desc="Processing Frames", unit="frame") as pbar:  # Progress bar
        while True:
            ret, frame = cap.read()
            if not ret:
                print("Error: Could not read frame.")
                break

            # Skip frames for performance
            if frame_count % frame_skip == 0:
                # Process the frame and predict the letter
                prediction = process_frame_with_model(frame)

                # If a letter is detected, append it to the detected_letters deque
                if prediction is not None:
                    detected_letter = prediction
                    detected_letters.append(detected_letter)
                    print(f"Detected Letter: {detected_letter}")

                    # Combine letters into a word, autocorrect, and display the word
                    corrected_word = form_words_and_autocorrect(detected_letters)
                    print(f"Corrected Word: {corrected_word}")

            frame_count += 1
            pbar.update(1)  # Update the progress bar

            # Press 'q' to quit
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

            # Stop the progress bar after the total frames are processed
            if frame_count >= total_frames:
                break

    cap.release()
    cv2.destroyAllWindows()

# Function to process the frame with the trained model and detect a letter
def process_frame_with_model(frame):
    img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert to RGB for MediaPipe processing
    results = hands.process(img_rgb)

    # If hand landmarks are detected, predict the letter using the model
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            data_aux = []
            x_ = []
            y_ = []

            # Collect x and y coordinates
            for landmark in hand_landmarks.landmark:
                x = landmark.x
                y = landmark.y
                x_.append(x)
                y_.append(y)

            # Normalize the coordinates
            for landmark in hand_landmarks.landmark:
                data_aux.append(landmark.x - min(x_))
                data_aux.append(landmark.y - min(y_))

            # Convert the landmarks to a numpy array
            data_aux = np.array(data_aux).reshape(1, -1)

            # Predict using the model
            prediction = model.predict(data_aux)

            # Get the predicted letter (assuming it's mapped as an alphabet letter)
            alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
            return alphabet[prediction[0]]

    return None

# Start capturing frames and detecting letters
capture_webcam_frame()


Processing Frames: 100%|██████████| 100/100 [00:03<00:00, 25.32frame/s]


In [5]:
def display_hand_landmarks(frame, results):
    # Draw the landmarks on the frame
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            mp.solutions.drawing_utils.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

def capture_webcam_frame():
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        print("Error: Could not open webcam.")
        return None

    frame_count = 0
    frame_skip = 5
    total_frames = 100

    with tqdm(total=total_frames, desc="Processing Frames", unit="frame") as pbar:
        while True:
            ret, frame = cap.read()
            if not ret:
                print("Error: Could not read frame.")
                break

            if frame_count % frame_skip == 0:
                # Process the frame
                prediction = process_frame_with_model(frame)

                # Visualize hand landmarks
                img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = hands.process(img_rgb)
                display_hand_landmarks(frame, results)

                if prediction is not None:
                    detected_letters.append(prediction)
                    corrected_word = form_words_and_autocorrect(detected_letters)
                    print(f"Corrected Word: {corrected_word}")

            frame_count += 1
            pbar.update(1)

            # Show the frame with hand landmarks
            cv2.imshow('Webcam', frame)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

            if frame_count >= total_frames:
                break

    cap.release()
    cv2.destroyAllWindows()


In [6]:
import cv2
import mediapipe as mp
import numpy as np
import pickle
from collections import deque
from autocorrect import Speller
from tqdm import tqdm  # Progress bar for processing

# Load the trained model from the specified path
model_path = 'model.p'  # Replace with the correct path if needed
with open(model_path, 'rb') as f:
    model_dict = pickle.load(f)
model = model_dict['model']

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, min_detection_confidence=0.3)

# Speller for autocorrecting words
spell = Speller(lang='en')

# Define a deque to store detected letters
detected_letters = deque(maxlen=20)  # Adjust maxlen based on how many letters you want to store

# Define a function to combine letters into words and autocorrect them
def form_words_and_autocorrect(detected_letters):
    word = ''.join(detected_letters)
    corrected_word = spell(word)
    return corrected_word

# Function to display hand landmarks on the frame for debugging
def display_hand_landmarks(frame, results):
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            mp.solutions.drawing_utils.draw_landmarks(
                frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

# Function to capture and process webcam frames
def capture_webcam_frame():
    cap = cv2.VideoCapture(0)  # Use the first webcam device
    if not cap.isOpened():
        print("Error: Could not open webcam.")
        return None

    frame_count = 0
    frame_skip = 5  # You can adjust this to process every nth frame
    total_frames = 100  # Define the total number of frames you want to process

    with tqdm(total=total_frames, desc="Processing Frames", unit="frame") as pbar:  # Progress bar
        while True:
            ret, frame = cap.read()
            if not ret:
                print("Error: Could not read frame.")
                break

            # Skip frames for performance
            if frame_count % frame_skip == 0:
                # Process the frame and predict the letter
                prediction = process_frame_with_model(frame)

                # Display hand landmarks
                img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = hands.process(img_rgb)
                display_hand_landmarks(frame, results)

                # If a letter is detected, append it to the detected_letters deque
                if prediction is not None:
                    detected_letter = prediction
                    detected_letters.append(detected_letter)
                    print(f"Detected Letter: {detected_letter}")

                    # Combine letters into a word, autocorrect, and display the word
                    corrected_word = form_words_and_autocorrect(detected_letters)
                    print(f"Corrected Word: {corrected_word}")

            frame_count += 1
            pbar.update(1)  # Update the progress bar

            # Show the frame with hand landmarks
            cv2.imshow('Webcam', frame)

            # Press 'q' to quit
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

            # Stop the progress bar after the total frames are processed
            if frame_count >= total_frames:
                break

    cap.release()
    cv2.destroyAllWindows()

# Function to process the frame with the trained model and detect a letter
def process_frame_with_model(frame):
    img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert to RGB for MediaPipe processing
    results = hands.process(img_rgb)

    # If hand landmarks are detected, predict the letter using the model
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            data_aux = []
            x_ = []
            y_ = []

            # Collect x and y coordinates
            for landmark in hand_landmarks.landmark:
                x = landmark.x
                y = landmark.y
                x_.append(x)
                y_.append(y)

            # Normalize the coordinates
            for landmark in hand_landmarks.landmark:
                data_aux.append(landmark.x - min(x_))
                data_aux.append(landmark.y - min(y_))

            # Convert the landmarks to a numpy array
            data_aux = np.array(data_aux).reshape(1, -1)

            # Predict using the model
            prediction = model.predict(data_aux)

            # Get the predicted letter (assuming it's mapped as an alphabet letter)
            alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
            return alphabet[prediction[0]]

    return None

# Start capturing frames and detecting letters
capture_webcam_frame()


Processing Frames: 100%|██████████| 100/100 [00:04<00:00, 23.23frame/s]
