In [1]:
import cv2
import os
import numpy as np
import mediapipe as mp
from datetime import datetime

# === [1] Configurations === #
# Keyboard character -> sign label. Each label is also used as a sub-folder name.
labels_map = dict(zip(
    "123456",
    ("hello", "goodbye", "please", "thank_you", "yes", "no"),
))

# All captures live under one root, split into an image tree and a landmark tree.
output_dir = "captured_data"
images_dir, landmarks_dir = (
    os.path.join(output_dir, kind) for kind in ("images", "landmarks")
)

# Make sure <tree>/<label>/ exists for every label before anything is captured.
for tree_root in (images_dir, landmarks_dir):
    for sign_name in labels_map.values():
        os.makedirs(os.path.join(tree_root, sign_name), exist_ok=True)

# === [2] Initialize MediaPipe and Webcam === #
# Index of the capture device passed to OpenCV. Change it here if this machine
# numbers its cameras differently.
CAMERA_INDEX = 1

mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
# Hand tracker limited to a single hand per frame (max_num_hands=1).
hands = mp_hands.Hands(static_image_mode=False,
                       max_num_hands=1,
                       min_detection_confidence=0.7,
                       min_tracking_confidence=0.5)
cap = cv2.VideoCapture(CAMERA_INDEX)

if cap.isOpened():
    print("Ready to capture. Press keys:")
    for key, label in labels_map.items():
        print(f"{key} = {label}")
    print("Press 'q' to quit.")
else:
    # The capture loop is guarded by cap.isOpened(), so without this message a
    # missing or busy camera would make the cell finish without any explanation.
    print(f"Could not open camera index {CAMERA_INDEX}. "
          "Check that the device is connected and not used by another program, "
          "or change CAMERA_INDEX.")

# === [3] Main Loop === #
# The loop runs inside try/finally so that the camera, the preview window and
# the MediaPipe hand tracker are released even when the kernel is interrupted
# or a frame raises an exception.
try:
    while cap.isOpened():
        success, frame = cap.read()
        if not success:
            print("Failed to capture frame.")
            break

        # Mirror the frame horizontally, then convert OpenCV's BGR layout to
        # RGB before passing it to the hand tracker.
        frame = cv2.flip(frame, 1)
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        result = hands.process(rgb_frame)

        # Draw landmarks if detected
        if result.multi_hand_landmarks:
            for hand_landmarks in result.multi_hand_landmarks:
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

        # Display instructions
        cv2.putText(frame, "Press 1-6 to label and save, q to quit", (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 0), 2)

        cv2.imshow("Sign Capture", frame)
        # Only the low byte of the key code is kept and turned into a character
        # so it can be looked up in labels_map.
        key = cv2.waitKey(1) & 0xFF
        key_char = chr(key)

        # Handle quit
        if key_char == 'q':
            break

        # A sample is stored only when a label key is pressed AND a hand was
        # detected in this very frame.
        if key_char in labels_map and result.multi_hand_landmarks:
            label = labels_map[key_char]
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")

            # Save image. At this point `frame` already carries the drawn
            # landmarks and the instruction text.
            img_path = os.path.join(images_dir, label, f"{label}_{timestamp}.jpg")
            # imwrite signals failure through its return value rather than an
            # exception; skip the sample so image and landmark files stay paired.
            if not cv2.imwrite(img_path, frame):
                print(f"Failed to write image {img_path}; sample skipped.")
                continue

            # Save landmarks: one [x, y, z] row per landmark of the first hand.
            hand_landmark = result.multi_hand_landmarks[0]
            landmarks = np.array([[lm.x, lm.y, lm.z] for lm in hand_landmark.landmark])
            lm_path = os.path.join(landmarks_dir, label, f"{label}_{timestamp}.npy")
            np.save(lm_path, landmarks)

            print(f"Saved {label} → {img_path}, {lm_path}")
finally:
    # === [4] Cleanup === #
    cap.release()
    cv2.destroyAllWindows()
    hands.close()

  from pandas.core import (
I0000 00:00:1744304659.546836 6340516 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1744304659.572294 6340761 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1744304659.579820 6340761 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Ready to capture. Press keys:
1 = hello
2 = goodbye
3 = please
4 = thank_you
5 = yes
6 = no
Press 'q' to quit.


W0000 00:00:1744304661.353050 6340756 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.
2025-04-10 13:04:21.757 python[94141:6340516] +[IMKClient subclass]: chose IMKClient_Modern
2025-04-10 13:04:21.757 python[94141:6340516] +[IMKInputSession subclass]: chose IMKInputSession_Modern


Saved please → captured_data/images/please/please_20250410_130525_369566.jpg, captured_data/landmarks/please/please_20250410_130525_369566.npy
Saved please → captured_data/images/please/please_20250410_130526_659234.jpg, captured_data/landmarks/please/please_20250410_130526_659234.npy
Saved please → captured_data/images/please/please_20250410_130527_958850.jpg, captured_data/landmarks/please/please_20250410_130527_958850.npy
Saved please → captured_data/images/please/please_20250410_130529_124361.jpg, captured_data/landmarks/please/please_20250410_130529_124361.npy
Saved please → captured_data/images/please/please_20250410_130530_191839.jpg, captured_data/landmarks/please/please_20250410_130530_191839.npy


The first version of the code did not work for keys 4, 5 and 6 (thank_you, yes, no), and there were also some problems with q (quit).

In the initial version of the code, the keypress detection was implemented using `cv2.waitKey()` in combination with `chr(key)` to convert the detected key code into a character. While this approach worked correctly for keys like '1', '2', and '3', it caused issues for other keys such as '4', '5', '6', and even 'q' on certain systems, particularly macOS. This is because the `cv2.waitKey()` function behaves differently across platforms, and the values returned for certain keys may not directly map to their corresponding character using `chr(key)`. As a result, pressing these keys did not trigger the expected conditions in the code, leading to the failure in capturing images and landmark data for the last three signs.

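To resolve this issue and make the key detection platform-independent and reliable, the updated code uses `ord()` to directly compare the detected key with its ASCII value. By using conditions like `if key == ord('1')` or `if key == ord('q')`, the code ensures that the correct label is assigned or the program exits cleanly regardless of the operating system or terminal behavior. This modification provides a more robust and consistent way to handle keyboard inputs in OpenCV, making the code reliable across different environments. Note, however, that for the 0–255 values left after `& 0xFF`, `chr(key) == 'q'` and `key == ord('q')` are logically equivalent, so the earlier failures may also have had another cause — for example, the OpenCV window not having keyboard focus, or no hand being detected in the frame at the moment the key was pressed (a sample is only saved when a hand is detected).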

In [1]:
import cv2
import os
import numpy as np
import mediapipe as mp
from datetime import datetime

# === [1] Configurations === #
# Key code (as returned by the keyboard poll) -> sign label. Each label is also
# used as a sub-folder name.
labels_map = {
    ord(digit): sign_name
    for digit, sign_name in zip(
        "123456",
        ("hello", "goodbye", "please", "thank_you", "yes", "no"),
    )
}

# All captures live under one root, split into an image tree and a landmark tree.
output_dir = "captured_data"
images_dir, landmarks_dir = (
    os.path.join(output_dir, kind) for kind in ("images", "landmarks")
)

# Make sure <tree>/<label>/ exists for every label before anything is captured.
for tree_root in (images_dir, landmarks_dir):
    for class_name in labels_map.values():
        os.makedirs(os.path.join(tree_root, class_name), exist_ok=True)

# === [2] Initialize MediaPipe and Webcam === #
# Index of the capture device passed to OpenCV. Change it here if this machine
# numbers its cameras differently.
CAMERA_INDEX = 1

mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
# Hand tracker limited to a single hand per frame (max_num_hands=1).
hands = mp_hands.Hands(static_image_mode=False,
                       max_num_hands=1,
                       min_detection_confidence=0.7,
                       min_tracking_confidence=0.5)
cap = cv2.VideoCapture(CAMERA_INDEX)

if cap.isOpened():
    print("Ready to capture. Press keys:")
    for k, v in labels_map.items():
        # Keys are stored as integer codes; show them as characters.
        print(f"{chr(k)} = {v}")
    print("Press 'q' to quit.")
else:
    # The capture loop is guarded by cap.isOpened(), so without this message a
    # missing or busy camera would make the cell finish without any explanation.
    print(f"Could not open camera index {CAMERA_INDEX}. "
          "Check that the device is connected and not used by another program, "
          "or change CAMERA_INDEX.")

# === [3] Main Loop === #
# The loop runs inside try/finally so that the camera, the preview window and
# the MediaPipe hand tracker are released even when the kernel is interrupted
# or a frame raises an exception.
try:
    while cap.isOpened():
        success, frame = cap.read()
        if not success:
            print("Failed to capture frame.")
            break

        # Mirror the frame horizontally, then convert OpenCV's BGR layout to
        # RGB before passing it to the hand tracker.
        frame = cv2.flip(frame, 1)
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        result = hands.process(rgb_frame)

        # Draw landmarks if detected
        if result.multi_hand_landmarks:
            for hand_landmarks in result.multi_hand_landmarks:
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

        # Display instructions on screen
        cv2.putText(frame, "Press 1-6 to label & save | Press 'q' to quit", (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 0), 2)

        cv2.imshow("Sign Capture", frame)
        # Only the low byte of the key code is kept; it is compared directly
        # with the integer keys of labels_map.
        key = cv2.waitKey(1) & 0xFF

        if key == ord('q'):
            break

        # A sample is stored only when a label key is pressed AND a hand was
        # detected in this very frame.
        if key in labels_map and result.multi_hand_landmarks:
            label = labels_map[key]
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")

            # Save image. At this point `frame` already carries the drawn
            # landmarks and the instruction text.
            img_path = os.path.join(images_dir, label, f"{label}_{timestamp}.jpg")
            # imwrite signals failure through its return value rather than an
            # exception; skip the sample so image and landmark files stay paired.
            if not cv2.imwrite(img_path, frame):
                print(f"[Error] Could not write image: {img_path}; sample skipped.")
                continue

            # Save landmarks: one [x, y, z] row per landmark of the first hand.
            landmarks = np.array([[lm.x, lm.y, lm.z] for lm in result.multi_hand_landmarks[0].landmark])
            lm_path = os.path.join(landmarks_dir, label, f"{label}_{timestamp}.npy")
            np.save(lm_path, landmarks)

            print(f"[Captured] Label: {label} → Image: {img_path}, Landmarks: {lm_path}")
finally:
    # === [4] Cleanup === #
    cap.release()
    cv2.destroyAllWindows()
    hands.close()

  from pandas.core import (
I0000 00:00:1744305597.653992 6352851 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1744305597.738327 6354688 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1744305597.750354 6354688 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Ready to capture. Press keys:
1 = hello
2 = goodbye
3 = please
4 = thank_you
5 = yes
6 = no
Press 'q' to quit.


2025-04-10 13:20:01.726 python[94509:6352851] +[IMKClient subclass]: chose IMKClient_Modern
2025-04-10 13:20:01.727 python[94509:6352851] +[IMKInputSession subclass]: chose IMKInputSession_Modern
W0000 00:00:1744305604.100228 6354689 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.


[Captured] Label: thank_you → Image: captured_data/images/thank_you/thank_you_20250410_132054_201415.jpg, Landmarks: captured_data/landmarks/thank_you/thank_you_20250410_132054_201415.npy
[Captured] Label: thank_you → Image: captured_data/images/thank_you/thank_you_20250410_132055_092697.jpg, Landmarks: captured_data/landmarks/thank_you/thank_you_20250410_132055_092697.npy
[Captured] Label: thank_you → Image: captured_data/images/thank_you/thank_you_20250410_132056_025295.jpg, Landmarks: captured_data/landmarks/thank_you/thank_you_20250410_132056_025295.npy
[Captured] Label: thank_you → Image: captured_data/images/thank_you/thank_you_20250410_132056_925161.jpg, Landmarks: captured_data/landmarks/thank_you/thank_you_20250410_132056_925161.npy
[Captured] Label: thank_you → Image: captured_data/images/thank_you/thank_you_20250410_132057_990932.jpg, Landmarks: captured_data/landmarks/thank_you/thank_you_20250410_132057_990932.npy
[Captured] Label: thank_you → Image: captured_data/images/th

[Captured] Label: yes → Image: captured_data/images/yes/yes_20250410_132315_908047.jpg, Landmarks: captured_data/landmarks/yes/yes_20250410_132315_908047.npy
[Captured] Label: yes → Image: captured_data/images/yes/yes_20250410_132316_835851.jpg, Landmarks: captured_data/landmarks/yes/yes_20250410_132316_835851.npy
[Captured] Label: yes → Image: captured_data/images/yes/yes_20250410_132318_431855.jpg, Landmarks: captured_data/landmarks/yes/yes_20250410_132318_431855.npy
[Captured] Label: yes → Image: captured_data/images/yes/yes_20250410_132319_299686.jpg, Landmarks: captured_data/landmarks/yes/yes_20250410_132319_299686.npy
[Captured] Label: yes → Image: captured_data/images/yes/yes_20250410_132320_133232.jpg, Landmarks: captured_data/landmarks/yes/yes_20250410_132320_133232.npy
[Captured] Label: yes → Image: captured_data/images/yes/yes_20250410_132321_265908.jpg, Landmarks: captured_data/landmarks/yes/yes_20250410_132321_265908.npy
[Captured] Label: yes → Image: captured_data/images/

### Explanation of Data Collection using Webcam and Saving Landmark Data

In the first part of the project, a Python script was developed using OpenCV and MediaPipe to collect sign language data through the webcam. The system was designed to detect the user's hand in real-time, identify 21 key landmarks on the hand, and draw connections between these points for visualization.

When a specific key (1 to 6) was pressed, corresponding to a particular sign (hello, goodbye, please, thank_you, yes, no), the script captured the current frame and saved two things:  
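1. The image with the hand visible (the mirrored webcam frame, including the drawn landmark overlay and the on-screen instruction text) — for reference.  
2. The landmark coordinates extracted by MediaPipe (the x, y, z values of each of the 21 landmarks, i.e. a 21 × 3 array) — saved as a `.npy` file for later use in machine learning.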

This process helped in creating a labeled dataset of hand signs, where both visual data and numerical landmark data were stored and organized systematically into folders based on their respective classes.