# In [None]:
import cv2
import mediapipe as mp
from gtts import gTTS
import pygame
import threading
import time
import os
import tempfile

# -------------------- MEDIAPIPE SETUP --------------------
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands

# Single shared hand tracker; configured to follow at most two hands.
hands = mp_hands.Hands(
    min_tracking_confidence=0.5,
    min_detection_confidence=0.7,
    max_num_hands=2,
)

# -------------------- AUDIO SETUP --------------------
# State shared between the main loop and the background speech thread.
last_spoken_gesture = ""  # name of the gesture most recently spoken ("" = none)
gesture_hold_time = 2.0   # seconds a gesture must be held before it is spoken
is_speaking = False       # True while a speech thread is active

pygame.mixer.init()

# -------------------- SPEAK FUNCTION --------------------
def speak(text):
    """Speak ``text`` aloud on a background thread.

    The text is synthesised to an MP3 with gTTS, played through
    ``pygame.mixer.music`` and the file is deleted afterwards. If speech is
    already in progress (``is_speaking`` is True) the call returns
    immediately without queuing anything.

    Args:
        text: The sentence to synthesise and play.

    Side effects:
        Sets the module-level ``is_speaking`` flag to True for the duration
        of the speech and back to False when the worker finishes, whether or
        not synthesis/playback succeeded.
    """
    global is_speaking

    if is_speaking:
        return

    # Claim the flag on the caller's thread, *before* the worker starts.
    # Setting it only inside the worker leaves a window in which a second
    # call still sees False and starts overlapping speech.
    is_speaking = True

    def run():
        global is_speaking
        temp_file = None
        try:
            # Unique file per utterance so a stale or concurrently used file
            # in the shared temp directory can never be loaded or clobbered.
            fd, temp_file = tempfile.mkstemp(prefix="gesture_speech_", suffix=".mp3")
            os.close(fd)
            gTTS(text=text, lang="en").save(temp_file)

            pygame.mixer.music.load(temp_file)
            pygame.mixer.music.play()

            # Block this worker (not the caller) until playback ends.
            while pygame.mixer.music.get_busy():
                time.sleep(0.1)

        except Exception as e:
            # Best effort: speech failures must not take down the video loop.
            print("Speech Error:", e)

        finally:
            try:
                # Release the file handle held by the mixer before deleting.
                pygame.mixer.music.unload()
            except Exception as e:
                print("Speech Error:", e)
            try:
                if temp_file is not None and os.path.exists(temp_file):
                    os.remove(temp_file)
            except OSError as e:
                print("Speech Error:", e)
            is_speaking = False

    try:
        threading.Thread(target=run, daemon=True).start()
    except Exception:
        # The worker never ran, so nothing else will clear the flag.
        is_speaking = False
        raise

# -------------------- FINGER STATES --------------------
def get_finger_states(hand_landmarks, handedness):
    """Return the open/closed state of each finger of one hand.

    Args:
        hand_landmarks: Object exposing ``landmark``, an indexable sequence of
            points with ``x`` and ``y`` attributes.
        handedness: Hand label; ``"Right"`` selects the right-hand thumb
            test, any other value selects the left-hand test.

    Returns:
        A list of five ints (1 = extended, 0 = folded) ordered
        ``[thumb, index, middle, ring, pinky]``.
    """
    points = hand_landmarks.landmark

    # (tip, PIP joint) landmark indices for index, middle, ring and pinky.
    # A finger counts as extended when its tip has a smaller y than its PIP.
    tip_pip_pairs = ((8, 6), (12, 10), (16, 14), (20, 18))
    states = [int(points[tip].y < points[pip].y) for tip, pip in tip_pip_pairs]

    # The thumb is judged horizontally; the direction depends on the hand.
    tip, ip_joint = points[4], points[3]
    thumb_open = tip.x < ip_joint.x if handedness == "Right" else tip.x > ip_joint.x

    return [int(thumb_open)] + states

# -------------------- GESTURE DICTIONARY --------------------
# Two-hand gestures keyed by a 10-value tuple of finger states (1 = extended,
# 0 = folded). The first five values belong to the first hand processed and
# the last five to the second; within each hand the order is
# (thumb, index, middle, ring, pinky). Each entry carries the on-screen
# "name" and the "sentence" that is spoken.
gesture_dict = {
    (1, 1, 1, 1, 1, 0, 1, 0, 0, 0): {"name": "Good Morning", "sentence": "Good morning"},
    (0, 1, 1, 0, 0, 0, 1, 0, 0, 0): {"name": "How Are You", "sentence": "How are you"},
    (1, 0, 0, 0, 0, 1, 0, 0, 1, 0): {"name": "I Am Fine", "sentence": "I am fine"},
    (0, 0, 0, 1, 1, 1, 1, 1, 1, 1): {"name": "Thank You", "sentence": "Thank you very much"},
    (1, 0, 0, 0, 0, 1, 1, 1, 0, 1): {"name": "Hungry", "sentence": "I am hungry"},
    (0, 0, 1, 1, 1, 0, 1, 1, 1, 0): {"name": "Stop", "sentence": "Stop please"},
}

# -------------------- MAIN LOOP --------------------
# Capture webcam frames, classify the two-hand finger pattern on each frame,
# and speak the matching sentence once the same gesture has been held
# continuously for `gesture_hold_time` seconds. Press Esc to quit.
cap = cv2.VideoCapture(0)

current_gesture = None      # name of the gesture currently being held, if any
gesture_start_time = None   # time.time() at which the current hold began

if not cap.isOpened():
    print("Error: could not open webcam (device 0)")

print(f"\nGesture Recognition Started (Hold {gesture_hold_time:g} sec for speech)\n")

try:
    while cap.isOpened():
        success, frame = cap.read()
        if not success:
            break

        # Mirror the frame horizontally, then convert BGR -> RGB for MediaPipe.
        frame = cv2.flip(frame, 1)
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(rgb)

        current_time = time.time()
        fingers_combined = []
        gesture_info = None

        if results.multi_hand_landmarks:
            for hand_landmarks, hand_class in zip(
                results.multi_hand_landmarks, results.multi_handedness
            ):
                handedness = hand_class.classification[0].label

                mp_drawing.draw_landmarks(
                    frame, hand_landmarks, mp_hands.HAND_CONNECTIONS
                )

                fingers_combined.extend(
                    get_finger_states(hand_landmarks, handedness)
                )

            # Gestures are defined for exactly two hands (5 states each).
            # NOTE(review): the key depends on the order in which the hands
            # are reported; confirm that order is stable enough in practice.
            if len(fingers_combined) == 10:
                gesture_info = gesture_dict.get(tuple(fingers_combined))
        else:
            cv2.putText(frame, "No Hand Detected", (10, 40),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        if gesture_info:
            name = gesture_info["name"]
            sentence = gesture_info["sentence"]

            cv2.putText(frame, f"Gesture: {name}", (10, 40),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            # A new gesture restarts the hold timer and re-arms speech.
            if current_gesture != name:
                current_gesture = name
                gesture_start_time = current_time
                last_spoken_gesture = ""

            if current_time - gesture_start_time >= gesture_hold_time:
                # Speak once per hold; if speech is busy, retry next frame.
                if last_spoken_gesture != name and not is_speaking:
                    speak(sentence)
                    last_spoken_gesture = name
        else:
            # No recognised gesture this frame (no hands, one hand, or an
            # unknown pattern): the hold is broken, so drop the timer.
            # Otherwise a stale start time lets a gesture fire immediately
            # when the user returns to it after an unrecognised pose.
            current_gesture = None
            gesture_start_time = None

        # ASCII title: the original arrow character was mis-encoded.
        cv2.imshow("Hand Gesture -> Speech", frame)

        # Esc (key code 27) exits.
        if cv2.waitKey(1) & 0xFF == 27:
            break
finally:
    # Always release the camera, windows, tracker and mixer, even on error.
    cap.release()
    cv2.destroyAllWindows()
    hands.close()
    pygame.mixer.quit()



# Output:
# Gesture Recognition Started (Hold 2 sec for speech)

