# In [2]:
import cv2
import mediapipe as mp
from gtts import gTTS
import pygame
import threading
import time
import os
import tempfile

# ------------- Config -------------
MAX_HANDS = 2                # passed to MediaPipe as max_num_hands
GESTURE_HOLD_TIME = 1.5      # seconds a gesture must be held before it is spoken
DISPLAY_DURATION = 3.5       # seconds the spoken sentence stays on screen
DEBOUNCE_SECONDS = 3.0       # minimum gap (seconds) between two spoken phrases
# ----------------------------------

# MediaPipe Hands: module handles plus the detector instance used by the main loop.
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(
    max_num_hands=MAX_HANDS,
    min_detection_confidence=0.7,
    min_tracking_confidence=0.5,
)

# The pygame mixer plays the MP3 files produced by gTTS.
pygame.mixer.init()

# Speech state: is_speaking is toggled by the speech worker thread; the
# last_spoken_* pair records what the main loop triggered most recently.
is_speaking = False
last_spoken_gesture, last_spoken_time = "", 0.0

# Sentence currently shown in the on-screen banner and when it was set.
display_message, display_message_time = "", 0.0

def speak(text):
    """Speak *text* aloud with gTTS + pygame on a background daemon thread.

    The call returns immediately. If an earlier phrase is still being
    generated or played (``is_speaking`` is true) the request is dropped,
    not queued.

    Args:
        text: Sentence to synthesise (English voice).

    Raises:
        RuntimeError: If the worker thread cannot be started.
    """
    global is_speaking

    # Claim the "speaking" slot here, in the caller's thread. Setting the flag
    # only inside the worker would let a second call that arrives before the
    # worker starts running pass the check too and launch overlapping playback.
    if is_speaking:
        return
    is_speaking = True

    def run_speech():
        global is_speaking
        tmp = None
        try:
            print("Speaking:", text)
            # A unique file per utterance avoids clashing with another running
            # instance or with a stale file that could not be removed earlier.
            fd, tmp = tempfile.mkstemp(prefix="gesture_speech_", suffix=".mp3")
            os.close(fd)
            gTTS(text=text, lang='en', slow=False).save(tmp)
            pygame.mixer.music.load(tmp)
            pygame.mixer.music.play()
            while pygame.mixer.music.get_busy():
                time.sleep(0.1)
        except Exception as e:
            # Worker-thread boundary: report and carry on, never kill the app.
            print("Speech error:", e)
        finally:
            # Cleanup runs on failure as well, so no temp file is left behind.
            try:
                pygame.mixer.music.unload()
            except Exception:
                # Best effort: unload() may be unavailable or the mixer may
                # already be shut down; neither should block the flag reset.
                pass
            if tmp is not None:
                try:
                    os.remove(tmp)
                except OSError:
                    pass
            is_speaking = False

    # don't block main thread
    try:
        threading.Thread(target=run_speech, daemon=True).start()
    except RuntimeError:
        # The worker never ran, so release the slot ourselves.
        is_speaking = False
        raise

def get_finger_states(hand_landmarks, handedness):
    """
    Classify each finger of one hand as extended (1) or folded (0).

    hand_landmarks: object whose ``landmark`` sequence is indexed by hand
        landmark number and whose items expose ``x`` and ``y``.
    handedness: hand label; "Right" selects one thumb rule, anything else
        the mirrored rule.

    Returns a 5-tuple ordered (thumb, index, middle, ring, pinky).
    """
    points = hand_landmarks.landmark

    # (tip, PIP joint) landmark indices for index, middle, ring, pinky.
    # A finger counts as extended when its tip is above its PIP joint
    # (smaller y).
    tip_pip_pairs = ((8, 6), (12, 10), (16, 14), (20, 18))
    fingers = [int(points[tip].y < points[pip].y) for tip, pip in tip_pip_pairs]

    # The thumb is judged horizontally: tip (4) versus IP joint (3), with the
    # direction flipped according to which hand it is.
    tip_x, ip_x = points[4].x, points[3].x
    if handedness == "Right":
        thumb = int(tip_x < ip_x)
    else:
        thumb = int(tip_x > ip_x)

    return (thumb, *fingers)

# ----------------- Gesture dictionary -----------------
# Maps finger-state tuples to {"name": HUD label, "sentence": spoken text}.
# Single-hand gestures: 5-tuples (thumb, index, middle, ring, pinky)
# Two-hand gestures: 10-tuples (Left five then Right five)
# 1 = extended, 0 = folded.
# === NOTE ===: these are *approximate static* mappings you can tune.
# Every key must be unique: a dict literal silently keeps only the LAST entry
# for a repeated key, which makes the earlier gesture unreachable.
gesture_dict = {
    # Basic single-hand gestures (existing + requested)
    (1,1,1,1,1): {"name":"Open Hand", "sentence":"My hand is fully open"},
    (0,0,0,0,0): {"name":"Fist", "sentence":"I am showing a closed fist"},
    (1,0,0,0,0): {"name":"Thumbs Up", "sentence":"Thumbs up, okay"},
    (0,1,1,0,0): {"name":"Peace", "sentence":"Peace sign, victory"},
    (0,1,0,0,0): {"name":"Pointing", "sentence":"I am pointing"},
    (0,0,1,0,0): {"name":"Middle Finger", "sentence":"(rude)"},
    (1,0,0,1,0): {"name":"Love You", "sentence":"I love you"},
    (1,1,1,1,0): {"name":"Thank You (approx)", "sentence":"Thank you"},   # chosen pattern for "thank you"
    (0,0,1,1,0): {"name":"No (approx)", "sentence":"No"},                # chosen pattern for "no"
    (1,0,1,0,1): {"name":"Please (approx)", "sentence":"Please"},        # chosen pattern for "please"
    (0,1,0,1,0): {"name":"Rock", "sentence":"Rock and roll"},
    (0,1,1,1,1): {"name":"Wait", "sentence":"Please wait a moment"},
    (1,0,1,0,0): {"name":"Go", "sentence":"Go ahead"},
    # "Hungry" used to share (0,1,1,0,1) with "Why" below and was shadowed by
    # it; it now has its own, previously unused pattern: index+middle+ring.
    (0,1,1,1,0): {"name":"Hungry", "sentence":"I am hungry, I need food"},
    (1,1,0,0,1): {"name":"Need Water", "sentence":"I need water, I am thirsty"},
    # New: who/where/why approximations as single-hand patterns (may be ambiguous).
    (0,1,0,1,1): {"name":"Where (approx)", "sentence":"Where?"},        # index+ring+pinky
    (1,1,0,1,0): {"name":"Who (approx)", "sentence":"Who?"},            # thumb+index+ring
    (0,1,1,0,1): {"name":"Why (approx)", "sentence":"Why?"},            # index+middle+pinky

    # Two-hand gestures (Left then Right) - chose patterns unlikely to collide
    (0,1,0,0,0,  0,1,0,0,0): {"name":"Both Index", "sentence":"thankyou"},
    (0,0,0,0,0,  0,0,0,0,0): {"name":"Both Fists", "sentence":"Both hands are yes "},
    (1,1,1,1,1,  1,1,1,1,1): {"name":"Both Hands Open", "sentence":"My hands are fully open"},
    (1,0,0,0,0,  1,0,0,0,0): {"name":"Double Thumbs Up", "sentence":"Thumbs up with both hands"},
    (1,1,0,0,0,  0,1,0,0,0): {"name":"Both Pointing", "sentence":"Pointing with both hands"},
    (0,1,1,0,0,  0,1,1,0,0): {"name":"Double Peace", "sentence":"Peace sign with both hands"},
    (1,0,1,0,1,  1,0,1,0,1): {"name":"Love You Both", "sentence":"I love you both"},
    # Add more two-hand mappings here if you want...
}
# ------------------------------------------------------

# runtime tracking: name of the gesture currently being held and the
# timestamp at which it was first seen (both None when nothing is recognised)
current_gesture = None
gesture_start_time = None

# open webcam (device index 0)
cap = cv2.VideoCapture(0)
print("Hand Gesture Recognition - Text+TTS")
print(f"Hold a gesture for {GESTURE_HOLD_TIME:.1f} seconds to convert to text/speech.")
print("Press ESC to exit.")

# The capture loop is guarded by cap.isOpened(), so without this message an
# unavailable camera makes the script end right after the banner with no
# explanation.
if not cap.isOpened():
    print("Warning: could not open webcam (device 0); no frames will be processed.")

# Main capture loop: read a frame, derive finger states for each detected
# hand, look the pattern up in gesture_dict, and speak/show the mapped
# sentence once the same gesture has been held for GESTURE_HOLD_TIME seconds.
try:
    while cap.isOpened():
        success, frame = cap.read()
        if not success:
            break

        # Flip horizontally so the preview behaves like a mirror.
        frame = cv2.flip(frame, 1)
        h, w, _ = frame.shape
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(rgb)
        now = time.time()

        left_states = None
        right_states = None
        gesture_info = None

        if results.multi_hand_landmarks and results.multi_handedness:
            # gather states for each detected hand
            for hand_landmarks, handedness_struct in zip(results.multi_hand_landmarks, results.multi_handedness):
                label = handedness_struct.classification[0].label  # "Left" or "Right"
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
                states = get_finger_states(hand_landmarks, label)

                # debug near wrist
                wrist = hand_landmarks.landmark[0]
                cx, cy = int(wrist.x * w), int(wrist.y * h)
                cv2.putText(frame, f"{label}:{states}", (max(cx-120,5), max(cy-20,20)),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.45, (255,255,255), 1)

                # If two hands carry the same label, the later one wins.
                if label == "Left":
                    left_states = states
                else:
                    right_states = states

            # prefer two-hand combined detection first (Left five, then Right five)
            if left_states is not None and right_states is not None:
                combined = tuple(left_states + right_states)
                gesture_info = gesture_dict.get(combined)

            # single-hand fallback: left hand first, then right
            if gesture_info is None:
                if left_states is not None:
                    gesture_info = gesture_dict.get(tuple(left_states))
                if gesture_info is None and right_states is not None:
                    gesture_info = gesture_dict.get(tuple(right_states))

            # If recognized, manage timing, display, and speak
            if gesture_info:
                gesture_name = gesture_info["name"]
                sentence = gesture_info["sentence"]

                # Restart the hold timer whenever the gesture changes. The
                # debounce below is global: it is measured from the last
                # spoken phrase regardless of which gesture produced it.
                if current_gesture != gesture_name:
                    current_gesture = gesture_name
                    gesture_start_time = now
                    print("New gesture:", gesture_name)

                hold_duration = now - (gesture_start_time or now)
                # draw small HUD
                cv2.putText(frame, f"Gesture: {gesture_name}", (10,40),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0,255,0), 2)
                cv2.putText(frame, f"Hold: {hold_duration:.1f}/{GESTURE_HOLD_TIME:.1f}s", (10,80),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255,255,0), 2)

                # trigger if held long enough, debounce passed, and nothing is playing
                if (hold_duration >= GESTURE_HOLD_TIME
                        and (now - last_spoken_time) >= DEBOUNCE_SECONDS
                        and not is_speaking):
                    # speak + show
                    print("Triggering:", sentence)
                    speak(sentence)
                    # set persistent on-screen message
                    display_message = sentence
                    display_message_time = now
                    last_spoken_gesture = gesture_name
                    last_spoken_time = now

            else:
                # hands found but not recognized
                current_gesture = None
                gesture_start_time = None
                cv2.putText(frame, "Gesture: Unknown", (10,40),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0,0,255), 2)
        else:
            # no hands
            current_gesture = None
            gesture_start_time = None
            cv2.putText(frame, "No hand detected", (10,40),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0,0,255), 2)

        # Draw persistent display message if within DISPLAY_DURATION
        if display_message and (now - display_message_time) <= DISPLAY_DURATION:
            # solid dark banner along the bottom edge + text
            cv2.rectangle(frame, (10, frame.shape[0]-80), (frame.shape[1]-10, frame.shape[0]-10), (20,20,20), -1)
            cv2.putText(frame, display_message, (20, frame.shape[0]-30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255,255,255), 2)
        else:
            display_message = ""

        cv2.imshow("Gesture->Text(TTS)", frame)
        if cv2.waitKey(1) & 0xFF == 27:   # ESC to quit
            break

finally:
    cap.release()
    cv2.destroyAllWindows()
    # Release the MediaPipe detector as well; it was previously left open.
    # Each cleanup step is guarded so one failure does not skip the rest.
    try:
        hands.close()
    except Exception as e:
        print("MediaPipe cleanup error:", e)
    try:
        pygame.mixer.quit()
    except Exception as e:
        print("Mixer cleanup error:", e)


# Captured notebook output from running the cell above:
#   Hand Gesture Recognition - Text+TTS
#   Hold a gesture for 1.5 seconds to convert to text/speech.
#   Press ESC to exit.
