In [1]:
import cv2
import os
import mediapipe as mp
import json
import pandas as pd
from datetime import datetime
import keyboard
import numpy as np

In [2]:
# Initialize MediaPipe models for hands, face, and pose
mp_hands = mp.solutions.hands.Hands(static_image_mode=False, max_num_hands=2, min_detection_confidence=0.5)
mp_face = mp.solutions.face_mesh.FaceMesh(static_image_mode=False, min_detection_confidence=0.5)
mp_pose = mp.solutions.pose.Pose(static_image_mode=False, min_detection_confidence=0.5)


In [3]:
# Create directories if they don't exist
output_dir = 'sign_language_dataset'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)


In [4]:
# Function to process a single frame and extract keypoints for hands, face, and pose
def process_frame(frame):
    # Convert the BGR image to RGB
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    
    # Process the hands, face, and pose detections
    results_hands = mp_hands.process(rgb_frame)
    results_face = mp_face.process(rgb_frame)
    results_pose = mp_pose.process(rgb_frame)
    # print(results_hands)

    
    # Initialize variables to store landmarks
    right_hand_landmarks = None
    left_hand_landmarks = None
    face_landmarks = None
    pose_landmarks = None

    # Extract keypoints from hands
    left_hand_landmarks, right_hand_landmarks = None, None
    if results_hands.multi_hand_landmarks:
        for hand_landmarks, handedness in zip(results_hands.multi_hand_landmarks, results_hands.multi_handedness):
            if handedness.classification[0].label == 'Left':
                left_hand_landmarks = hand_landmarks
                print('lft')
            elif handedness.classification[0].label == 'Right':
                right_hand_landmarks = hand_landmarks
                print('right')
    
    # Extract keypoints from face
    if results_face.multi_face_landmarks:
        face_landmarks = results_face.multi_face_landmarks[0]  # Assuming only one face is detected
    
    # Extract keypoints from pose
    if results_pose.pose_landmarks:
        pose_landmarks = results_pose.pose_landmarks
    
    # Return extracted keypoints for hands, face, and pose
    return right_hand_landmarks, left_hand_landmarks, face_landmarks, pose_landmarks

In [5]:
# Initialize webcam capture
cap = cv2.VideoCapture(0)

# Initialize variables for recording
record = False
gesture_label = None
start_time = None
frame_count = 0
max_frames = 1 # Number of frames to record for each gesture

# Data collection variables
data = []

In [6]:
def preprocess_landmarks(left_hand_landmarks, right_hand_landmarks, pose_landmarks, face_landmarks):
    # Convert landmarks to arrays
    if face_landmarks:
        face = np.array([[lm.x, lm.y] for lm in face_landmarks.landmark])
    else:
        face = np.zeros((0, 2))  # Empty array if no face landmarks
    
    if left_hand_landmarks:
        left_hand = np.array([[lm.x, lm.y] for lm in left_hand_landmarks.landmark])
    else:
        left_hand = np.zeros((0, 2))  # Empty array if no left hand landmarks
    
    if pose_landmarks:
        pose = np.array([[lm.x, lm.y] for lm in pose_landmarks.landmark])
    else:
        pose = np.zeros((0, 2))  # Empty array if no pose landmarks
    
    if right_hand_landmarks:
        right_hand = np.array([[lm.x, lm.y] for lm in right_hand_landmarks.landmark])
    else:
        right_hand = np.zeros((0, 2))  # Empty array if no right hand landmarks
    
    # Ensure the landmarks have consistent lengths
    max_landmarks = max(len(face), len(left_hand), len(pose), len(right_hand))
    face = np.pad(face, ((0, max_landmarks - len(face)), (0, 0)), mode='constant')
    left_hand = np.pad(left_hand, ((0, max_landmarks - len(left_hand)), (0, 0)), mode='constant')
    pose = np.pad(pose, ((0, max_landmarks - len(pose)), (0, 0)), mode='constant')
    right_hand = np.pad(right_hand, ((0, max_landmarks - len(right_hand)), (0, 0)), mode='constant')
    

    return face, left_hand, pose, right_hand

In [7]:
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        print("Failed to capture frame from webcam.")
        break
    
    # Flip the frame horizontally for natural viewing
    frame = cv2.flip(frame, 1)
    
    # Process the current frame to extract keypoints for hands, face, and pose
    right_hand_landmarks, left_hand_landmarks, face_landmarks, pose_landmarks = process_frame(frame)
    face, left_hand, pose, right_hand= preprocess_landmarks(left_hand_landmarks, right_hand_landmarks, pose_landmarks, face_landmarks)
    
    # Display hand keypoints on the frame
    if right_hand_landmarks:
        for landmark in right_hand_landmarks.landmark:
            # Draw each landmark point if it's visible
            if landmark.visibility > 0.5:
                cx, cy = int(landmark.x * frame.shape[1]), int(landmark.y * frame.shape[0])
                cv2.circle(frame, (cx, cy), 5, (255, 0, 0), -1)
    
    if left_hand_landmarks:
        for landmark in left_hand_landmarks.landmark:
            # Draw each landmark point if it's visible
            if landmark.visibility > 0.5:
                cx, cy = int(landmark.x * frame.shape[1]), int(landmark.y * frame.shape[0])
                cv2.circle(frame, (cx, cy), 5, (0, 255, 0), -1)
    
    # Display face keypoints on the frame
    if face_landmarks:
        for landmark in face_landmarks.landmark:
            # Draw each landmark point if it's visible
            if landmark.visibility > 0.5:
                cx, cy = int(landmark.x * frame.shape[1]), int(landmark.y * frame.shape[0])
                cv2.circle(frame, (cx, cy), 5, (0, 0, 255), -1)
    
    # Display pose keypoints on the frame
    if pose_landmarks:
        for landmark in pose_landmarks.landmark:
            # Draw each landmark point if it's visible
            if landmark.visibility > 0.5:
                cx, cy = int(landmark.x * frame.shape[1]), int(landmark.y * frame.shape[0])
                cv2.circle(frame, (cx, cy), 5, (255, 255, 0), -1)
    
    # Display instructions
    if not record:
        cv2.putText(frame, "Press 'r' to start recording, 's' to stop recording", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2)
    
    # Start/stop recording based on user input
    if keyboard.is_pressed('r') and not record:
        record = True
        gesture_label = input("Enter gesture label (A-Z): ")
        start_time = datetime.now()
        # Create directory for the current gesture label if it doesn't exist
        gesture_dir = os.path.join(output_dir, gesture_label)
        if not os.path.exists(gesture_dir):
            os.makedirs(gesture_dir)
        print(f"Recording gesture '{gesture_label}'...")
    elif keyboard.is_pressed('s') and record:
        record = False
        gesture_label = None
        start_time = None
        frame_count = 0
        print("Stopped recording.")
    
    # Record frames if recording is active
    if record:
        # Save frame keypoints as JSON
        keypoints_data = {
            'right_hand': right_hand,
            'left_hand': left_hand,
            'face': face,
            'pose': pose
        }
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        json_filename = f"{gesture_label}_{timestamp}.json"
        json_path = os.path.join(gesture_dir, json_filename)
        with open(json_path, 'w') as f:
            json.dump(keypoints_data, f)
        
        # Append path and label to data list
        data.append({'path': json_path, 'label': gesture_label})
        
        frame_count += 1
        if frame_count >= max_frames:
            record = False
            gesture_label = None
            start_time = None
            frame_count = 0
            print(f"Stopped recording for gesture '{gesture_label}'.")
    
    # Display the frame
    cv2.imshow('Frame', frame)
    
    # Press 'q' to exit the loop
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

lft
lft
lft
lft
lft
lft
lft
lft
lft
lft
lft
lft
lft
lft
lft
lft
lft
lft
lft
lft
lft
lft
lft
lft
lft
lft
lft
Recording gesture '1'...


TypeError: Object of type ndarray is not JSON serializable

In [None]:

# Release the capture and close all windows
cap.release()
cv2.destroyAllWindows()

# Save paths and labels to a single CSV file
df = pd.DataFrame(data)
csv_path = os.path.join(output_dir, 'dataset.csv')
df.to_csv(csv_path, index=False)
# Release MediaPipe models
mp_hands.close()
mp_face.close()
mp_pose.close()