In [1]:
pip install opencv-python mediapipe pyttsx3





In [None]:
"""
Hand Gesture Recognition (10-finger support) with offline voice (pyttsx3)

- MediaPipe Hands for detection (max 2 hands)
- pyttsx3 for offline TTS
- Majority-vote smoothing + hold duration before speaking
- If exact two-hand pattern exists in gesture_dict -> use it
- Otherwise fallback to count-based sentences for total fingers (0..10)

This version formats displayed finger-state strings WITHOUT commas:
  Left:11010  Right:10101
and Combined:1101010101
"""

import cv2
import mediapipe as mp
import pyttsx3
import threading
import time
from collections import deque, Counter

# -------------------------
# Config
# -------------------------
MAX_NUM_HANDS = 2
MIN_DET_CONF = 0.65
MIN_TRACK_CONF = 0.5

VOTE_WINDOW = 8            # smoothing window (frames)
VOTE_THRESHOLD = 0.6       # fraction of window that must agree
HOLD_TIME = 1.5            # seconds hold to trigger speech

# -------------------------
# MediaPipe init
# -------------------------
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(
    max_num_hands=MAX_NUM_HANDS,
    min_detection_confidence=MIN_DET_CONF,
    min_tracking_confidence=MIN_TRACK_CONF
)

# -------------------------
# pyttsx3 init (offline tts)
# -------------------------
tts_engine = pyttsx3.init()
tts_engine.setProperty("rate", 160)  # words per minute

_speaking = False
_speaking_lock = threading.Lock()

def speak_offline(text):
    """Speak text using pyttsx3 in background thread."""
    global _speaking
    def run():
        global _speaking
        with _speaking_lock:
            try:
                _speaking = True
                print("Speaking:", text)
                tts_engine.say(text)
                tts_engine.runAndWait()
            except Exception as e:
                print("TTS error:", e)
            finally:
                _speaking = False
    if not _speaking:
        t = threading.Thread(target=run, daemon=True)
        t.start()

# -------------------------
# Gesture dictionaries
# - gesture_dict: explicit keys for 5-tuple (single-hand) or 10-tuple (left+right)
# - count_sentences: fallback sentences for total fingers 0..10
# -------------------------
gesture_dict = {
    # Single-hand (examples)
    (0,0,0,0,0): {"name": "Fist", "sentence": "I am showing a closed fist"},
    (1,1,1,1,1): {"name": "Hii/Hello", "sentence": "My hand is fully open"},
    (1,0,0,0,0): {"name": "Thumbs Up", "sentence": "Thumbs up, everything is good"},
    (0,1,0,0,0): {"name": "Pointing", "sentence": "I am pointing at something"},
    (0,1,1,0,0): {"name": "Peace", "sentence": "Peace sign, victory gesture"},
    (0,0,1,1,1): {"name": "OK", "sentence": "OK sign, everything is okay"},
    (1,0,0,1,0): {"name": "Love You", "sentence": "I love you"},
    (0,1,0,1,0): {"name": "Rock", "sentence": "Rock and roll hand sign"},
    (1,0,1,1,1): {"name": "Waving Hi", "sentence": "Hello, I am greeting you"},
    (1,1,1,0,1): {"name": "Stop", "sentence": "Stop, please wait"},
    (1,1,1,1,0): {"name": "Thank You", "sentence": "Thank you very much"},
    (1,0,1,0,0): {"name": "Go", "sentence": "Go ahead, you can proceed"},
    (0,0,1,1,0): {"name": "No", "sentence": "No, I disagree"},
    (1,0,0,1,1): {"name": "Yes", "sentence": "Yes, I agree"},
    (0,1,0,1,1): {"name": "Help", "sentence": "I need help, please assist me"},
    (1,1,0,0,1): {"name": "Need Water", "sentence": "I need water, I am thirsty"},
    (0,1,1,0,1): {"name": "Hungry", "sentence": "I am hungry, I need food"},
    (1,0,1,1,0): {"name": "Good Morning", "sentence": "Good morning, have a great day"},
    (0,0,1,0,1): {"name": "Good Night", "sentence": "Good night, sleep well"},
    (1,1,0,1,1): {"name": "Sorry", "sentence": "I am sorry, please forgive me"},
    (0,1,0,0,1): {"name": "Okay", "sentence": "Okay, I understand"},

    # Two-hand (examples)
    (1,1,1,1,1, 1,1,1,1,1): {"name": "All Ten Fingers", "sentence": "I have all ten fingers up"},
    (0,0,0,0,0, 0,0,0,0,0): {"name": "Both Fists", "sentence": "I am showing closed fists with both hands"},
    (1,0,0,0,0, 1,0,0,0,0): {"name": "Double Thumbs Up", "sentence": "Thumbs up with both hands, everything is great"},
    (0,1,0,0,0, 0,1,0,0,0): {"name": "Both Pointing", "sentence": "Pointing forward with both index fingers"},
    (0,1,1,0,0, 0,1,1,0,0): {"name": "Double Peace", "sentence": "Peace sign with both hands"},
    (1,1,1,1,1, 0,0,0,0,0): {"name": "Left Open Right Fist", "sentence": "Left hand open, right hand closed"},
    (0,0,0,0,0, 1,1,1,1,1): {"name": "Right Open Left Fist", "sentence": "Right hand open, left hand closed"},
    (1,0,0,0,0, 0,0,0,0,0): {"name": "Left Thumbs Up", "sentence": "Left hand thumbs up"},
    (0,0,0,0,0, 1,0,0,0,0): {"name": "Right Thumbs Up", "sentence": "Right hand thumbs up"},
    (1,1,1,1,1, 1,0,0,0,0): {"name": "Open Left, Right Thumb", "sentence": "Left open and right thumbs up"},
    (1,0,0,0,0, 1,1,1,1,1): {"name": "Right Open, Left Thumb", "sentence": "Right open and left thumbs up"},
    (0,1,1,0,0, 1,0,0,0,0): {"name": "Left Peace Right Thumb", "sentence": "Left peace sign and right thumbs up"},
    (1,0,0,0,0, 0,1,1,0,0): {"name": "Left Thumb Right Peace", "sentence": "Left thumbs up and right peace sign"},
    (0,1,0,0,0, 0,0,0,0,0): {"name": "Left Pointing Only", "sentence": "Left is pointing"},
    (0,0,0,0,0, 0,1,0,0,0): {"name": "Right Pointing Only", "sentence": "Right is pointing"},
    (1,1,0,0,1, 1,1,0,0,1): {"name": "Both Need Water", "sentence": "Both hands signal: I need water"},
    (0,1,1,0,1, 0,1,1,0,1): {"name": "Both Hungry", "sentence": "Both hands signal: I am hungry"},
    (1,0,1,0,0, 1,0,1,0,0): {"name": "Both Hi", "sentence": "Hello from both hands"},
    (1,1,0,1,0, 1,1,0,1,0): {"name": "Both Call Me", "sentence": "Call me with both hands"},
    (1,1,1,0,1, 1,1,1,0,1): {"name": "Both Stop", "sentence": "Stop, please hold for a moment"},
    (0,1,0,1,1, 0,1,0,1,1): {"name": "Both Help", "sentence": "Both hands: I need help"},
    (1,1,1,0,0, 1,1,0,0,0): {"name": "Left Three Right Two", "sentence": "Left three and right two, total five"},
    (1,1,0,0,0, 1,1,0,0,0): {"name": "Two and Two", "sentence": "Two on each hand"},
    (0,1,1,0,0, 0,1,0,0,0): {"name": "Left Peace Right Point", "sentence": "Left peace sign and right is pointing"},
    (0,1,0,0,1, 0,1,0,0,1): {"name": "Both Fancy Wave", "sentence": "Both hands show the fancy wave"},
}

# Fallback sentences for total finger counts 0..10
count_sentences = {
    0: "No fingers are raised",
    1: "One finger is raised",
    2: "Two fingers are raised",
    3: "Three fingers are raised",
    4: "Four fingers are raised",
    5: "Five fingers are raised",
    6: "Six fingers are raised",
    7: "Seven fingers are raised",
    8: "Eight fingers are raised",
    9: "Nine fingers are raised",
    10: "All ten fingers are raised"
}

# -------------------------
# Finger detection utility
# Returns tuple (thumb, index, middle, ring, pinky)
# -------------------------
def get_finger_states(hand_landmarks, handedness_label):
    tips = [8,12,16,20]
    pips = [6,10,14,18]
    states = []
    for tip_idx, pip_idx in zip(tips, pips):
        states.append(1 if hand_landmarks.landmark[tip_idx].y < hand_landmarks.landmark[pip_idx].y else 0)
    thumb_tip = hand_landmarks.landmark[4]
    thumb_ip = hand_landmarks.landmark[3]
    if handedness_label == "Right":
        thumb_up = thumb_tip.x < thumb_ip.x
    else:
        thumb_up = thumb_tip.x > thumb_ip.x
    states.insert(0, 1 if thumb_up else 0)
    return tuple(states)

# Helper: format a 5-tuple as compact string '11010' (no commas)
def fmt_states_compact(states_tuple):
    return ''.join(str(int(x)) for x in states_tuple)

# -------------------------
# Smoothing window (store detected gesture name or None)
# -------------------------
gesture_window = deque(maxlen=VOTE_WINDOW)

def majority_vote_name():
    if not gesture_window:
        return None
    counts = Counter(gesture_window)
    counts.pop(None, None)
    if not counts:
        return None
    most_common, cnt = counts.most_common(1)[0]
    if cnt / VOTE_WINDOW >= VOTE_THRESHOLD:
        return most_common
    return None

# -------------------------
# Main loop
# -------------------------
cap = cv2.VideoCapture(0)

current_stable_name = None
stable_since = None
last_spoken = None

print("\n" + "="*50)
print("Hand Gesture Recognition (10-finger) - Offline voice (no commas in state display)")
print("="*50)
print(f"Hold a gesture for {HOLD_TIME:.1f}s (smoothed over {VOTE_WINDOW} frames). Press ESC to exit.")
print("="*50 + "\n")

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame = cv2.flip(frame, 1)
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(rgb)

        detected_name = None
        detected_sentence = None
        left_states = (0,0,0,0,0)
        right_states = (0,0,0,0,0)

        combined_compact = None  # compact string for combined 10-bit state

        if results.multi_hand_landmarks and results.multi_handedness:
            hands_by_label = {}
            for idx, hand_landmarks in enumerate(results.multi_hand_landmarks):
                label = results.multi_handedness[idx].classification[0].label
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
                states = get_finger_states(hand_landmarks, label)
                hands_by_label[label] = states

            left_states = hands_by_label.get("Left", left_states)
            right_states = hands_by_label.get("Right", right_states)

            # If both hands present, create 10-tuple and compact string without commas
            if len(hands_by_label) == 2:
                combined_key = tuple(left_states + right_states)
                combined_compact = fmt_states_compact(left_states) + fmt_states_compact(right_states)
                info = gesture_dict.get(combined_key)
                if info:
                    detected_name = info["name"]
                    detected_sentence = info["sentence"]
            else:
                only_label = next(iter(hands_by_label))
                key5 = hands_by_label[only_label]
                info = gesture_dict.get(key5)
                if info:
                    detected_name = info["name"]
                    detected_sentence = info["sentence"]

            # Show compact per-hand states (no commas)
            y = 140
            if "Left" in hands_by_label:
                left_compact = fmt_states_compact(hands_by_label["Left"])
                cv2.putText(frame, f"Left:{left_compact}", (10, y),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, (220,220,220), 1)
                y += 22
            if "Right" in hands_by_label:
                right_compact = fmt_states_compact(hands_by_label["Right"])
                cv2.putText(frame, f"Right:{right_compact}", (10, y),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, (220,220,220), 1)
                y += 22
            if combined_compact:
                cv2.putText(frame, f"Combined:{combined_compact}", (10, y),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, (180,255,180), 1)
                y += 22

        # Fallback to count-based if no explicit mapping found
        if detected_name is None:
            total = sum(left_states) + sum(right_states)
            detected_name = f"{total}_fingers"
            detected_sentence = count_sentences.get(total, f"{total} fingers detected")

        # Append to smoothing window (we store the detected string name)
        gesture_window.append(detected_name)

        # Majority vote
        stable_name = majority_vote_name()

        if stable_name != current_stable_name:
            current_stable_name = stable_name
            stable_since = time.time() if stable_name is not None else None

        if current_stable_name is not None and stable_since is not None:
            held = time.time() - stable_since
            progress = min(held / HOLD_TIME, 1.0)
            sentence_to_speak = None

            # search explicit dict by name
            for key, info in gesture_dict.items():
                if info["name"] == current_stable_name:
                    sentence_to_speak = info["sentence"]
                    break

            if sentence_to_speak is None and current_stable_name.endswith("_fingers"):
                try:
                    count_val = int(current_stable_name.split("_")[0])
                    sentence_to_speak = count_sentences.get(count_val, None)
                except Exception:
                    sentence_to_speak = None

            if held >= HOLD_TIME and sentence_to_speak and (last_spoken != current_stable_name) and (not _speaking):
                speak_offline(sentence_to_speak)
                last_spoken = current_stable_name

            status_text = f"Stable: {current_stable_name} ({held:.1f}s)"
        else:
            progress = 0.0
            status_text = "No stable gesture"

        # UI overlays
        display_name = current_stable_name if current_stable_name else "Unknown"
        cv2.putText(frame, f"Gesture: {display_name}", (10, 40),
                    cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0,255,0) if current_stable_name else (0,0,255), 2)
        cv2.putText(frame, status_text, (10, 80),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255,255,0), 2)

        # progress bar
        bar_w = 300
        filled = int(progress * bar_w)
        cv2.rectangle(frame, (10,100), (10+bar_w,120), (70,70,70), -1)
        cv2.rectangle(frame, (10,100), (10+filled,120), (0,200,0), -1)

        # Last spoken
        cv2.putText(frame, f"Last spoken: {last_spoken}", (10, 155),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (200,200,200), 1)

        cv2.imshow("10-Finger Gesture Recognition - Offline Voice", frame)
        key = cv2.waitKey(1) & 0xFF
        if key == 27:
            break

finally:
    cap.release()
    cv2.destroyAllWindows()
    try:
        tts_engine.stop()
    except Exception:
        pass
    print("Exited.")



Hand Gesture Recognition (10-finger) - Offline voice (no commas in state display)
Hold a gesture for 1.5s (smoothed over 8 frames). Press ESC to exit.

Speaking: Thumbs up, everything is good
Speaking: No fingers are raised
Speaking: My hand is fully open
Speaking: No fingers are raised
Speaking: My hand is fully open
Speaking: No fingers are raised
Speaking: Thumbs up, everything is good
Speaking: No fingers are raised
Speaking: Peace sign, victory gesture
Speaking: I am hungry, I need food
Speaking: No fingers are raised
Speaking: I need water, I am thirsty
Speaking: No fingers are raised
Speaking: Four fingers are raised
Speaking: No fingers are raised
Speaking: My hand is fully open
Speaking: No fingers are raised
Speaking: Thumbs up, everything is good
Speaking: I am showing a closed fist
Speaking: No fingers are raised
Speaking: My hand is fully open
Speaking: No fingers are raised
Speaking: I am showing a closed fist
Speaking: Thumbs up, everything is good
Speaking: No fingers 