In [None]:
#collect_imgs
import os

import cv2


# Root folder for the captured images; one numbered sub-folder is created per class.
DATA_DIR = './data'
os.makedirs(DATA_DIR, exist_ok=True)

number_of_classes = 9  # number of gesture classes to record (folders 0..8)
dataset_size = 200     # frames saved per class

cap = cv2.VideoCapture(0)
if not cap.isOpened():
    raise RuntimeError('Could not open the webcam (device index 0).')

try:
    for j in range(number_of_classes):
        class_dir = os.path.join(DATA_DIR, str(j))
        os.makedirs(class_dir, exist_ok=True)

        print('Collecting data for class {}'.format(j))

        # Live preview: wait until the user presses "q" before recording this class.
        while True:
            ret, frame = cap.read()
            if not ret:
                # Without this check a failed read passes frame=None to putText/imshow.
                raise RuntimeError('Failed to read a frame from the webcam.')
            cv2.putText(frame, 'Ready? Press "Q" ! :)', (100, 50), cv2.FONT_HERSHEY_SIMPLEX, 1.3, (0, 255, 0), 3,
                        cv2.LINE_AA)
            cv2.imshow('frame', frame)
            # Mask to the low byte so the comparison ignores modifier bits in the key code.
            if cv2.waitKey(25) & 0xFF == ord('q'):
                break

        # Record dataset_size consecutive frames as <class>/<index>.jpg.
        for counter in range(dataset_size):
            ret, frame = cap.read()
            if not ret:
                raise RuntimeError('Failed to read a frame from the webcam.')
            cv2.imshow('frame', frame)
            cv2.waitKey(25)
            cv2.imwrite(os.path.join(class_dir, '{}.jpg'.format(counter)), frame)
finally:
    # Always free the camera and close the preview window, even after an error.
    cap.release()
    cv2.destroyAllWindows()

In [None]:
#create_dataset

import os
import pickle
import mediapipe as mp
import cv2

mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

DATA_DIR = './data'

# Feature layout: 21 landmarks * (x, y) = 42 values per hand, two hand slots per sample.
FEATURES_PER_HAND = 42
expected_feature_size = FEATURES_PER_HAND * 2

data = []
labels = []

# Set max_num_hands to 2. The context manager closes the MediaPipe graph when done.
with mp_hands.Hands(static_image_mode=True, max_num_hands=2, min_detection_confidence=0.3) as hands:
    # sorted() makes the sample order deterministic across runs and platforms.
    for dir_ in sorted(os.listdir(DATA_DIR)):
        class_dir = os.path.join(DATA_DIR, dir_)
        if not os.path.isdir(class_dir):
            continue

        for img_path in sorted(os.listdir(class_dir)):
            if not img_path.lower().endswith(('.png', '.jpg', '.jpeg')):
                continue

            img = cv2.imread(os.path.join(class_dir, img_path))
            if img is None:
                continue  # unreadable / corrupt image

            results = hands.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
            if not results.multi_hand_landmarks:
                continue  # no hand detected in this image

            # Order hands left-to-right by the x of landmark 0 so each hand always
            # lands in the same feature slot; keep at most two hands.
            detected_hands = sorted(
                results.multi_hand_landmarks,
                key=lambda hand_landmarks: hand_landmarks.landmark[0].x
            )[:2]

            data_aux = []
            for hand_landmarks in detected_hands:
                x_ = [landmark.x for landmark in hand_landmarks.landmark]
                y_ = [landmark.y for landmark in hand_landmarks.landmark]

                # Shift each hand so its minimum x/y is 0, making the features
                # independent of where the hand sits in the frame.
                min_x, min_y = min(x_), min(y_)
                for x, y in zip(x_, y_):
                    data_aux.append(x - min_x)
                    data_aux.append(y - min_y)

            # A single detected hand is zero-padded into the second hand slot.
            if len(data_aux) == FEATURES_PER_HAND:
                data_aux.extend([0.0] * FEATURES_PER_HAND)

            # Drop anything that still does not have the fixed feature length.
            if len(data_aux) != expected_feature_size:
                continue

            data.append(data_aux)
            labels.append(dir_)

# The folder name is the label; "with" guarantees the file is flushed and closed.
with open('data.pickle', 'wb') as f:
    pickle.dump({'data': data, 'labels': labels}, f)

In [4]:
#train_classifier
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np


# Fixed seed so the train/test split and the forest (and therefore the reported
# accuracy) are reproducible across runs.
SEED = 42

# NOTE: pickle.load can execute arbitrary code; only load a data.pickle that was
# produced locally by the dataset-creation cell.
with open('./data.pickle', 'rb') as f:
    data_dict = pickle.load(f)

data = np.asarray(data_dict['data'])
labels = np.asarray(data_dict['labels'])

# Stratified 80/20 split keeps the class proportions equal in both partitions.
x_train, x_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.2, shuffle=True, stratify=labels, random_state=SEED)

model = RandomForestClassifier(random_state=SEED)

model.fit(x_train, y_train)

y_predict = model.predict(x_test)

# accuracy_score expects (y_true, y_pred); the value is symmetric but the order
# matters if this call is later swapped for another metric.
score = accuracy_score(y_test, y_predict)

print('{}% of samples were classified correctly !'.format(score * 100))

# Persist the trained model for the interactive cell.
with open('model.p', 'wb') as f:
    pickle.dump({'model': model}, f)

98.41772151898735% of samples were classified correctly !


In [None]:
#synapse_interactive.py
import pickle
import cv2
import mediapipe as mp
import numpy as np
import os
import threading
import time

from gtts import gTTS
from playsound import playsound
import speech_recognition as sr

# List the available input devices so the right device_index can be chosen
# for the voice-command listener.
print("Available microphones:")
for index, name in enumerate(sr.Microphone.list_microphone_names()):
    print(f"  Microphone \"{name}\" found for `Microphone(device_index={index})`")

# NOTE: pickle.load can execute arbitrary code; only load a model.p that was
# produced locally by the training cell. "with" closes the file handle promptly.
with open('./model.p', 'rb') as model_file:
    model_dict = pickle.load(model_file)
model = model_dict['model']
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=2, min_detection_confidence=0.3)

# Predicted class id -> phrase that is drawn on the frame and spoken.
# Several class ids intentionally share one phrase.
gesture_to_phrase = {
    0: 'Hello',
    1: 'Hello',
    2: 'Good Job',
    3: 'Good Job',
    4: 'Namaste',
    5: 'Home',
    6: 'Peace',
    7: 'Peace',
    8: 'NO!',
}

# Lower-cased spoken command -> image file name inside the gesture_images folder.
phrase_to_gesture = {
    "hello": "hello.jpg",
    "good job": "goodjob.jpeg",
    "namaste": "namaste.jpeg",
    "home": "home.jpg",
    "peace": "Peace.jpeg",
    "no": "no.jpg"
}

def text_to_speech(text):
    """Speak ``text`` aloud, caching the synthesized audio on disk.

    The audio for each phrase is stored as ``./temp_<phrase>.mp3`` (spaces
    become underscores, ``!`` is dropped) and reused on later calls.

    Args:
        text: Phrase to synthesize and play.

    Returns:
        None. Any error is caught and printed instead of being raised, so a
        failure never takes down the calling thread.
    """
    try:
        filename = f"./temp_{text.replace(' ', '_').replace('!', '')}.mp3"

        if not os.path.exists(filename):
            print(f"Generating audio for: '{text}'")
            # Synthesize into a per-thread temporary file and only move it into
            # place once it is complete. Saving directly to the cache path would
            # leave a truncated file behind when synthesis fails, and that file
            # would then be treated as a valid cache entry on every later call.
            partial = f"{filename}.{threading.get_ident()}.part"
            try:
                tts = gTTS(text=text, lang='en')
                tts.save(partial)
                os.replace(partial, filename)
            finally:
                if os.path.exists(partial):
                    os.remove(partial)

        playsound(filename)

    except Exception as e:
        print(f"Error in text_to_speech: {e}")

def display_gesture_image(gesture_name):
    """Show the picture for a gesture in a 'Spoken Gesture' window for ~2 seconds.

    Args:
        gesture_name: File name of the image inside the ``gesture_images`` folder.

    Returns:
        None. A missing or unreadable image is reported with a printed message
        and no window is opened.
    """
    image_path = os.path.join('gesture_images', gesture_name)
    if not os.path.exists(image_path):
        print(f"Image not found: {image_path}")
        return

    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None (it does not raise) when the file cannot be
        # decoded; passing that to imshow would raise inside OpenCV.
        print(f"Could not read image: {image_path}")
        return

    cv2.imshow('Spoken Gesture', img)
    cv2.waitKey(2000)  # keep the window up for 2000 ms
    cv2.destroyWindow('Spoken Gesture')

def listen_for_commands(device_index=1):
    """Listen for spoken commands forever and show the matching gesture image.

    Intended to run as the target of a background thread. After a 2-second
    ambient-noise calibration it repeatedly records a short phrase, sends it to
    Google speech recognition, lower-cases the transcript and, when it exactly
    matches a key of ``phrase_to_gesture``, calls ``display_gesture_image``.

    Args:
        device_index: Index of the microphone to use. Defaults to 1, the value
            that was previously hard-coded.

    Returns:
        Never returns normally; the loop runs until the process exits.
    """
    # NOTE(review): display_gesture_image drives OpenCV windows from whichever
    # thread runs this function; confirm GUI calls off the main thread work on
    # the target platform.
    recognizer = sr.Recognizer()
    microphone = sr.Microphone(device_index=device_index)

    with microphone as source:
        print("Calibrating for ambient noise, please be quiet...")
        recognizer.adjust_for_ambient_noise(source, duration=2)
        print("Calibration complete. Listening for commands...")

    while True:
        with microphone as source:
            try:
                audio = recognizer.listen(source, timeout=5, phrase_time_limit=4)
                text = recognizer.recognize_google(audio).lower()
                print(f"You said: {text}")
                if text in phrase_to_gesture:
                    gesture_image_name = phrase_to_gesture[text]
                    print(f"Recognized command! Displaying: {gesture_image_name}")
                    display_gesture_image(gesture_image_name)

            except sr.WaitTimeoutError:
                # Nobody spoke within the timeout: expected, just listen again.
                pass
            except sr.UnknownValueError:
                print("DEBUG: Could not understand the audio. Please speak more clearly.")
            except sr.RequestError as e:
                print(f"Could not request results from Google API; {e}")
            except Exception as e:
                # Keep the listener alive, but do not hide unexpected failures.
                print(f"Unexpected error while listening for commands: {e}")
        time.sleep(0.1)

# Voice commands are handled on a daemon thread so they never block the camera loop.
listener_thread = threading.Thread(target=listen_for_commands, daemon=True)
listener_thread.start()

cap = cv2.VideoCapture(0)

# Holds the last *phrase* that was spoken. Several class ids map to the same
# phrase, so comparing phrases (not class ids) is what keeps the cooldown from
# being bypassed when the prediction flips between two ids of one gesture.
last_gesture_spoken = None
last_speech_time = time.time()
speech_cooldown = 3  # seconds before the same phrase may be spoken again

print("\nWebcam and gesture recognition started. Look at the camera!")

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        H, W, _ = frame.shape
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(frame_rgb)

        if results.multi_hand_landmarks:
            data_aux = []
            x_all, y_all = [], []

            # Order hands left-to-right by the x of landmark 0, the same ordering
            # the dataset-creation cell uses when building training features.
            detected_hands = sorted(
                results.multi_hand_landmarks,
                key=lambda hand_landmarks: hand_landmarks.landmark[0].x
            )

            for hand_landmarks in detected_hands:
                mp_drawing.draw_landmarks(
                    frame, hand_landmarks, mp_hands.HAND_CONNECTIONS,
                    mp_drawing_styles.get_default_hand_landmarks_style(),
                    mp_drawing_styles.get_default_hand_connections_style())

                x_ = [landmark.x for landmark in hand_landmarks.landmark]
                y_ = [landmark.y for landmark in hand_landmarks.landmark]
                x_all.extend(x_)
                y_all.extend(y_)

                # Shift each hand so its minimum x/y is 0 (position-independent features).
                min_x, min_y = min(x_), min(y_)
                for x, y in zip(x_, y_):
                    data_aux.append(x - min_x)
                    data_aux.append(y - min_y)

            # One hand: zero-pad the second hand slot to reach the 84-value layout.
            if len(data_aux) == 42:
                data_aux.extend([0.0] * 42)

            if len(data_aux) == 84:
                prediction = model.predict([np.asarray(data_aux)])
                predicted_class = int(prediction[0])
                predicted_character = gesture_to_phrase.get(predicted_class, 'Unknown')

                # Bounding box around all detected hands, padded by 10 px.
                x1, y1 = int(min(x_all) * W) - 10, int(min(y_all) * H) - 10
                x2, y2 = int(max(x_all) * W) + 10, int(max(y_all) * H) + 10
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 0), 4)
                cv2.putText(frame, predicted_character, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 1.3, (0, 0, 0), 3, cv2.LINE_AA)

                # Speak when the phrase changes, or repeat it once the cooldown elapsed.
                current_time = time.time()
                phrase_changed = predicted_character != last_gesture_spoken
                if phrase_changed or (current_time - last_speech_time) > speech_cooldown:
                    tts_thread = threading.Thread(target=text_to_speech, args=(predicted_character,), daemon=True)
                    tts_thread.start()
                    last_gesture_spoken = predicted_character
                    last_speech_time = current_time

        cv2.imshow('frame', frame)
        # Mask to the low byte so the comparison ignores modifier bits in the key code.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the camera, windows and the MediaPipe graph even if the loop raised.
    cap.release()
    cv2.destroyAllWindows()
    hands.close()