In [10]:
!pip install h5py



In [11]:
import cv2
import numpy as np
import mediapipe as mp
import pyautogui
import pygetwindow as gw
import math
from tensorflow.keras.models import load_model
import pickle
from collections import Counter, deque
import pyttsx3

# Disable PyAutoGUI failsafe
pyautogui.FAILSAFE = False

# Initialize text-to-speech engine
engine = pyttsx3.init()

# Get screen size
screen_width, screen_height = pyautogui.size()

# Define actions and statuses
actions = np.array(["Swipe Up", "Swipe Down", "Swipe Left", "Swipe Right", "Backspace", "Tab", "Enter", "Ctrl_A"])
statuses = ["like", "love", "request", "victory", "dislike", "closed_fist", "none"]

# Initialize MediaPipe holistic model
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

# Load pre-trained model and weights
model = load_model(r'AllRemain-LSTMv2.h5')
model.load_weights(r'AllRemain-LSTMv2.h5')

# Load SVM model for static gesture recognition
with open('svm_model.pkl', 'rb') as file:
    smodel = pickle.load(file)

# Load feature vectors
loaded_list = np.load(r'avg_600_feature_vector1.npy', allow_pickle=True)

# Helper functions
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    return image, results

def draw_landmarks(image, results, clr):
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=clr))
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=clr))

def draw_styled_landmarks(image, results):
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=4),
                              mp_drawing.DrawingSpec(color=(121, 44, 250), thickness=2, circle_radius=2))
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(245, 117, 66), thickness=2, circle_radius=4),
                              mp_drawing.DrawingSpec(color=(245, 66, 230), thickness=2, circle_radius=2))

def extract_keypoints(results):
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21 * 3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21 * 3)
    return np.concatenate([lh, rh])

def s_extract_keypoints(results):
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21 * 3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21 * 3)
    res = np.concatenate([lh, rh])
    for i in range(len(res)):
        if i < 63:
            k = 0
        else:
            k = 63
        if i % 3 == 0:
            res[i] = res[i] - res[k]
        elif i % 3 == 1:
            res[i] = res[i] - res[k + 1]
        elif i % 3 == 2:
            res[i] = res[i] - res[k + 2]
    return res

async def do_map(k):
    action_msgs = {
        "Swipe Up": "Scrolling up",
        "Swipe Down": "Scrolling down",
        "Swipe Right": "Scrolling right",
        "Swipe Left": "Scrolling left",
        "Ctrl_A": "Select All",
        "Tab": "Tab",
        "Backspace": "Backspace",
        "Enter": "Enter"
    }
    return action_msgs.get(actions[k], "None")

async def fun1(initial_sequence, flag):
    sequence = initial_sequence if flag else []
    msg = ""
    prediction_history = deque(maxlen=5)
    while len(sequence) <= 20:
        ret, frame = cap.read()
        image, results = mediapipe_detection(frame, holistic)
        draw_styled_landmarks(image, results)
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        if len(sequence) == 10:
            if sum(np.all(seq == 0) for seq in sequence) > 5:
                return ""
            sequence = [np.zeros(126)] * 5 + sequence + [np.zeros(126)] * 5
            res = model.predict(np.array([sequence]))
            prediction = np.argmax(res)
            confidence = np.max(res)
            if confidence >= 0.60:
                prediction_history.append(actions[prediction])
            if len(prediction_history) == prediction_history.maxlen:
                most_common_prediction = Counter(prediction_history).most_common(1)[0][0]
                msg = most_common_prediction
            else:
                msg = actions[prediction]
            cv2.rectangle(image, (0, 0), (640, 40), (245, 117, 16), -1)
            print(msg)
            engine.say(msg)  # Add this line for voice output
            engine.runAndWait()  # Add this line to wait until the speech is finished
        if cv2.waitKey(10) & 0xff == ord('q'):
            break
        cv2.putText(image, "Last Gesture : " + msg, (3, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        cv2.imshow('OpenCV Feed', image)
    return msg

def eucal(p1, p2):
    return math.sqrt((p1.x - p2.x) ** 2 + (p1.y - p2.y) ** 2)

def pointer(dis, msg):
    first_time = 1
    buffer = 5 if dis < 0 else -2
    while dis < 0.10 and (first_time == 1 or (results.right_hand_landmarks and results.left_hand_landmarks is None)):
        first_time = 0
        ret, frame = cap.read()
        image, results = mediapipe_detection(frame, holistic)
        if results.right_hand_landmarks:
            l4 = results.right_hand_landmarks.landmark[4]
            l12 = results.right_hand_landmarks.landmark[12]
            draw_landmarks(image, results, (0, 0, 255))
            dis = eucal(l4, l12)
            if dis < 0.10:
                l1 = results.right_hand_landmarks.landmark[8]
                ix = (screen_width + 10) - (screen_width + 10) * l1.x
                iy = (screen_height + 10) * l1.y
                pyautogui.moveTo(ix, iy)
                if buffer > 0:
                    return 1, msg
        if dis >= 0.10 or results.right_hand_landmarks is None:
            if buffer == -2:
                val, msg = pointer(-1, msg)
                if val == 1:
                    dis = 0
                    first_time = 1
            elif buffer > 0:
                cv2.putText(image, "In rec", (3, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2, cv2.LINE_AA)
                if results.right_hand_landmarks:
                    l11 = results.right_hand_landmarks.landmark[11].y
                    l7 = results.right_hand_landmarks.landmark[7].y
                    if l11 < l7:
                        pyautogui.press('ctrl', presses=5)
                        pyautogui.click()
                        msg = "Click"
                        print("click occurred")
                        engine.say(msg)  # Add this line for voice output
                        engine.runAndWait()  # Add this line to wait until the speech is finished
                        return 1, msg
                buffer -= 1
                dis = 0
                first_time = 1
            else:
                return 0, msg
        cv2.putText(image, "Action : Pointer", (3, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        cv2.putText(image, "Last Gesture : " + msg, (3, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        cv2.imshow('OpenCV Feed', image)
        if cv2.waitKey(10) & 0xff == ord('q'):
            return

def map_stat(msg):
    stat_msgs = {
        "like": "Like",
        "love": "Love",
        "dislike": "Dislike",
        "request": "Request",
        "victory": "Victory",
        "closed_fist": "Fist Closed",
        "none": "None"
    }
    print(stat_msgs.get(msg, "None"))
    engine.say(stat_msgs.get(msg, "None"))  # Add this line for voice output
    engine.runAndWait()  # Add this line to wait until the speech is finished

def static():
    res = []
    total = 5
    msg = ""
    ret_keypoints = []
    for _ in range(total):
        ret, frame = cap.read()
        image, results = mediapipe_detection(frame, holistic)
        draw_landmarks(image, results, (0, 255, 0))
        keypoints = s_extract_keypoints(results)
        keypts = extract_keypoints(results)
        ret_keypoints.append(keypts)
        gest = smodel.predict([keypoints])
        res.append(statuses[gest[0]])
        cv2.putText(image, "NO HANDS", (3, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        cv2.putText(image, "Last Gesture : " + msg, (3, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        cv2.imshow('OpenCV Feed', image)
        if cv2.waitKey(10) & 0xff == ord('q'):
            break
    frequency = Counter(res)
    for element, count in frequency.items():
        if count == total:
            msg = element
            break
    if msg and msg not in {"none", "closed_fist"}:
        map_stat(msg)
        return 1, ret_keypoints, msg
    else:
        return 0, ret_keypoints, "None"
    
# Implement PyAutoGUI actions based on gestures
def execute_action(action):
    if action == "Swipe Up":
        pyautogui.hotkey('volumeup')  # volume up
    elif action == "Swipe Down":
        pyautogui.hotkey('volumedown')  # volume down
    elif action == "Swipe Right":
#         pyautogui.hotkey('p')  # for previous song
        pyautogui.hotkey('right')
    elif action == "Swipe Left":
#         pyautogui.hotkey('n')  # for next song
        pyautogui.hotkey('left')
    elif action == "Enter":
        pyautogui.press('space')  # pause/play
    elif action == "Backspace":
        pyautogui.press('backspace')
    elif action == "Tab":
        pyautogui.press('tab')
    elif action == "Ctrl_A":
        pyautogui.hotkey('ctrl', 'a')  # select all

# Main code
cap = cv2.VideoCapture(0)
sequence = []
msg = ""
s_msg = ""
c = 0

with mp_holistic.Holistic(min_detection_confidence=0.6, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        ret, frame = cap.read()
        image, results = mediapipe_detection(frame, holistic)
        cv2.putText(image, "NO HANDS", (3, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2, cv2.LINE_AA)
        cv2.putText(image, "Last D-Gesture : " + msg, (3, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
        cv2.putText(image, "Last S-Gesture : " + s_msg, (3, 90), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv2.LINE_AA)
        cv2.imshow('OpenCV Feed', image)
        
        if results.left_hand_landmarks or results.right_hand_landmarks:
            if results.right_hand_landmarks and not results.left_hand_landmarks:
                l4 = results.right_hand_landmarks.landmark[4]
                l12 = results.right_hand_landmarks.landmark[12]
                dis = eucal(l4, l12)
                if dis < 0.10:
                    pointer(dis, "")
                    continue
            t_msg = s_msg
            s, keys, s_msg = static()
            if s_msg == "None":
                s_msg = t_msg
            if s == 1:
                continue
            else:
                msg = await fun1(keys, 1)
            c = 0
        if cv2.waitKey(10) & 0xff == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()

ImportError: `load_model()` using h5 format requires h5py. Could not import h5py.

In [2]:
#smooth logic 24/06(1)

import cv2
import numpy as np
import mediapipe as mp
import pyautogui
import pygetwindow as gw
import math
import pickle
from collections import Counter, deque
import threading
import pyttsx3

# Disable PyAutoGUI failsafe
pyautogui.FAILSAFE = False

# Initialize text-to-speech engine
engine = pyttsx3.init()

# Get screen size
screen_width, screen_height = pyautogui.size()

# Define actions and statuses
actions = np.array(["Swipe Up", "Swipe Down", "Swipe Left", "Swipe Right", "Backspace", "Tab", "Enter", "Ctrl_A"])
statuses = ["like", "love", "request", "victory", "dislike", "closed_fist", "none"]

# Initialize MediaPipe holistic model
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

# Load pre-trained model and weights
model = load_model(r'AllRemain-LSTMv2.h5')
model.load_weights(r'AllRemain-LSTMv2.h5')

# Load SVM model for static gesture recognition
with open('svm_model.pkl', 'rb') as file:
    smodel = pickle.load(file)

# Load feature vectors
loaded_list = np.load(r'avg_600_feature_vector1.npy', allow_pickle=True)

# Helper functions
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    return image, results

def draw_landmarks(image, results, clr):
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=clr))
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=clr))

def draw_styled_landmarks(image, results):
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=4),
                              mp_drawing.DrawingSpec(color=(121, 44, 250), thickness=2, circle_radius=2))
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(245, 117, 66), thickness=2, circle_radius=4),
                              mp_drawing.DrawingSpec(color=(245, 66, 230), thickness=2, circle_radius=2))

def extract_keypoints(results):
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21 * 3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21 * 3)
    return np.concatenate([lh, rh])

def s_extract_keypoints(results):
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21 * 3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21 * 3)
    res = np.concatenate([lh, rh])
    for i in range(len(res)):
        if i < 63:
            k = 0
        else:
            k = 63
        if i % 3 == 0:
            res[i] = res[i] - res[k]
        elif i % 3 == 1:
            res[i] = res[i] - res[k + 1]
        elif i % 3 == 2:
            res[i] = res[i] - res[k + 2]
    return res

def speak(msg):
    engine.say(msg)
    engine.runAndWait()

def eucal(p1, p2):
    return math.sqrt((p1.x - p2.x) ** 2 + (p1.y - p2.y) ** 2)

def pointer(dis, msg):
    first_time = 1
    buffer = 5 if dis < 0 else -2
    while dis < 0.10 and (first_time == 1 or (results.right_hand_landmarks and results.left_hand_landmarks is None)):
        first_time = 0
        ret, frame = cap.read()
        image, results = mediapipe_detection(frame, holistic)
        if results.right_hand_landmarks:
            l4 = results.right_hand_landmarks.landmark[4]
            l12 = results.right_hand_landmarks.landmark[12]
            draw_landmarks(image, results, (0, 0, 255))
            dis = eucal(l4, l12)
            if dis < 0.10:
                l1 = results.right_hand_landmarks.landmark[8]
                ix = (screen_width + 10) - (screen_width + 10) * l1.x
                iy = (screen_height + 10) * l1.y
                pyautogui.moveTo(ix, iy)
                if buffer > 0:
                    return 1, msg
        if dis >= 0.10 or results.right_hand_landmarks is None:
            if buffer == -2:
                val, msg = pointer(-1, msg)
                if val == 1:
                    dis = 0
                    first_time = 1
            elif buffer > 0:
                cv2.putText(image, "In rec", (3, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2, cv2.LINE_AA)
                if results.right_hand_landmarks:
                    l11 = results.right_hand_landmarks.landmark[11].y
                    l7 = results.right_hand_landmarks.landmark[7].y
                    if l11 < l7:
                        pyautogui.press('ctrl', presses=5)
                        pyautogui.click()
                        msg = "Click"
                        threading.Thread(target=speak, args=(msg,)).start()  # Use threading for voice output
                        return 1, msg
                buffer -= 1
                dis = 0
                first_time = 1
            else:
                return 0, msg
        cv2.putText(image, "Action : Pointer", (3, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        cv2.putText(image, "Last Gesture : " + msg, (3, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        cv2.imshow('OpenCV Feed', image)
        if cv2.waitKey(10) & 0xff == ord('q'):
            return

def map_stat(msg):
    stat_msgs = {
        "like": "Like",
        "love": "Love",
        "dislike": "Dislike",
        "request": "Request",
        "victory": "Victory",
        "closed_fist": "Fist Closed",
        "none": "None"
    }
    stat_msg = stat_msgs.get(msg, "None")
    print(stat_msg)
    threading.Thread(target=speak, args=(stat_msg,)).start()  # Use threading for voice output

def static():
    res = []
    total = 5
    msg = ""
    ret_keypoints = []
    for _ in range(total):
        ret, frame = cap.read()
        image, results = mediapipe_detection(frame, holistic)
        draw_landmarks(image, results, (0, 255, 0))
        keypoints = s_extract_keypoints(results)
        keypts = extract_keypoints(results)
        ret_keypoints.append(keypts)
        gest = smodel.predict([keypoints])
        res.append(statuses[gest[0]])
        cv2.putText(image, "NO HANDS", (3, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        cv2.putText(image, "Last Gesture : " + msg, (3, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        cv2.imshow('OpenCV Feed', image)
        if cv2.waitKey(10) & 0xff == ord('q'):
            break
    frequency = Counter(res)
    for element, count in frequency.items():
        if count == total:
            msg = element
            break
    if msg and msg not in {"none", "closed_fist"}:
        map_stat(msg)
        return 1, ret_keypoints, msg
    else:
        return 0, ret_keypoints, "None"

def process_gesture(gesture_seq, prediction_history, msg):
    res = model.predict(np.array([gesture_seq]))
    prediction = np.argmax(res)
    confidence = np.max(res)
    if confidence >= 0.60:
        prediction_history.append(actions[prediction])
    if len(prediction_history) == prediction_history.maxlen:
        most_common_prediction = Counter(prediction_history).most_common(1)[0][0]
        msg = most_common_prediction
    else:
        msg = actions[prediction]
    return msg

async def fun1(initial_sequence, flag):
    sequence = initial_sequence if flag else []
    msg = ""
    prediction_history = deque(maxlen=5)
    while len(sequence) <= 20:
        ret, frame = cap.read()
        image, results = mediapipe_detection(frame, holistic)
        draw_styled_landmarks(image, results)
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        if len(sequence) == 10:
            if sum(np.all(seq == 0) for seq in sequence) > 5:
                return ""
            sequence = [np.zeros(126)] * 5 + sequence + [np.zeros(126)] * 5
            msg = process_gesture(sequence, prediction_history, msg)
            threading.Thread(target=speak, args=(msg,)).start()  # Use threading for voice output
        if cv2.waitKey(10) & 0xff == ord('q'):
            break
        cv2.putText(image, "Last Gesture : " + msg, (3, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        cv2.imshow('OpenCV Feed', image)
    return msg

# Implement PyAutoGUI actions based on gestures
def execute_action(action):
    if action == "Swipe Up":
        pyautogui.hotkey('volumeup')  # volume up
    elif action == "Swipe Down":
        pyautogui.hotkey('volumedown')  # volume down
    elif action == "Swipe Right":
#         pyautogui.hotkey('p')  # for previous song
        pyautogui.hotkey('right')
    elif action == "Swipe Left":
#         pyautogui.hotkey('n')  # for next song
        pyautogui.hotkey('left')
    elif action == "Enter":
        pyautogui.press('space')  # pause/play
    elif action == "Backspace":
        pyautogui.press('backspace')
    elif action == "Tab":
        pyautogui.press('tab')
    elif action == "Ctrl_A":
        pyautogui.hotkey('ctrl', 'a')  # select all
        
        
# Main code
cap = cv2.VideoCapture(0)
sequence = []
msg = ""
s_msg = ""
c = 0

with mp_holistic.Holistic(min_detection_confidence=0.6, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        ret, frame = cap.read()
        image, results = mediapipe_detection(frame, holistic)
        cv2.putText(image, "NO HANDS", (3, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2, cv2.LINE_AA)
        cv2.putText(image, "Last D-Gesture : " + msg, (3, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
        cv2.putText(image, "Last S-Gesture : " + s_msg, (3, 90), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv2.LINE_AA)
        cv2.imshow('OpenCV Feed', image)
        
        if results.left_hand_landmarks or results.right_hand_landmarks:
            if results.right_hand_landmarks and not results.left_hand_landmarks:
                l4 = results.right_hand_landmarks.landmark[4]
                l12 = results.right_hand_landmarks.landmark[12]
                dis = eucal(l4, l12)
                if dis < 0.10:
                    pointer(dis, "")
                    continue
            t_msg = s_msg
            s, keys, s_msg = static()
            if s_msg == "None":
                s_msg = t_msg
            if s == 1:
                continue
            else:
                msg = await fun1(keys, 1)
            c = 0
        if cv2.waitKey(10) & 0xff == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()


NameError: name 'load_model' is not defined

In [3]:
    cap.release()
    cv2.destroyAllWindows()

In [3]:
#smooth logic 24/06(2)


import cv2
import numpy as np
import mediapipe as mp
import pyautogui
import pygetwindow as gw
import math
import pickle
from collections import Counter, deque
import threading
import pyttsx3
import queue
from tensorflow.keras.models import load_model
import time

# Disable PyAutoGUI failsafe
pyautogui.FAILSAFE = False

# Initialize text-to-speech engine
engine = pyttsx3.init()

# Create a queue for text-to-speech
tts_queue = queue.Queue()

# Function to handle text-to-speech requests
def tts_worker():
    while True:
        msg = tts_queue.get()
        if msg is None:
            break
        engine.say(msg)
        engine.runAndWait()
        tts_queue.task_done()

# Start the text-to-speech thread
tts_thread = threading.Thread(target=tts_worker)
tts_thread.daemon = True
tts_thread.start()

# Get screen size
screen_width, screen_height = pyautogui.size()

# Define actions and statuses
actions = np.array(["Swipe Up", "Swipe Down", "Swipe Left", "Swipe Right", "Backspace", "Tab", "Enter", "Ctrl_A"])
statuses = ["like", "love", "request", "victory", "dislike", "closed_fist", "none"]

# Initialize MediaPipe holistic model
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

# Load pre-trained model and weights
model = load_model(r'AllRemain-LSTMv2.h5')
model.load_weights(r'AllRemain-LSTMv2.h5')

# Load SVM model for static gesture recognition
with open('svm_model.pkl', 'rb') as file:
    smodel = pickle.load(file)

# Load feature vectors
loaded_list = np.load(r'avg_600_feature_vector1.npy', allow_pickle=True)

# Helper functions
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    return image, results

def draw_landmarks(image, results, clr):
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=clr))
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=clr))

def draw_styled_landmarks(image, results):
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=4),
                              mp_drawing.DrawingSpec(color=(121, 44, 250), thickness=2, circle_radius=2))
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(245, 117, 66), thickness=2, circle_radius=4),
                              mp_drawing.DrawingSpec(color=(245, 66, 230), thickness=2, circle_radius=2))

def extract_keypoints(results):
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21 * 3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21 * 3)
    return np.concatenate([lh, rh])

def s_extract_keypoints(results):
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21 * 3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21 * 3)
    res = np.concatenate([lh, rh])
    for i in range(len(res)):
        if i < 63:
            k = 0
        else:
            k = 63
        if i % 3 == 0:
            res[i] = res[i] - res[k]
        elif i % 3 == 1:
            res[i] = res[i] - res[k + 1]
        elif i % 3 == 2:
            res[i] = res[i] - res[k + 2]
    return res

def speak(msg):
    tts_queue.put(msg)

def eucal(p1, p2):
    return math.sqrt((p1.x - p2.x) ** 2 + (p1.y - p2.y) ** 2)

def pointer(dis, msg):
    first_time = 1
    buffer = 5 if dis < 0 else -2
    while dis < 0.10 and (first_time == 1 or (results.right_hand_landmarks and results.left_hand_landmarks is None)):
        first_time = 0
        ret, frame = cap.read()
        image, results = mediapipe_detection(frame, holistic)
        if results.right_hand_landmarks:
            l4 = results.right_hand_landmarks.landmark[4]
            l12 = results.right_hand_landmarks.landmark[12]
            draw_landmarks(image, results, (0, 0, 255))
            dis = eucal(l4, l12)
            if dis < 0.10:
                l1 = results.right_hand_landmarks.landmark[8]
                ix = (screen_width + 10) - (screen_width + 10) * l1.x
                iy = (screen_height + 10) * l1.y
                pyautogui.moveTo(ix, iy)
                if buffer > 0:
                    return 1, msg
        if dis >= 0.10 or results.right_hand_landmarks is None:
            if buffer == -2:
                val, msg = pointer(-1, msg)
                if val == 1:
                    dis = 0
                    first_time = 1
            elif buffer > 0:
                cv2.putText(image, "In rec", (3, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2, cv2.LINE_AA)
                if results.right_hand_landmarks:
                    l11 = results.right_hand_landmarks.landmark[11].y
                    l7 = results.right_hand_landmarks.landmark[7].y
                    if l11 < l7:
                        pyautogui.press('ctrl', presses=5)
                        pyautogui.click()
                        msg = "Click"
                        threading.Thread(target=speak, args=(msg,)).start()  # Use threading for voice output
                        return 1, msg
                buffer -= 1
                dis = 0
                first_time = 1
            else:
                return 0, msg
        cv2.putText(image, "Action : Pointer", (3, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        cv2.putText(image, "Last Gesture : " + msg, (3, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        cv2.imshow('OpenCV Feed', image)
        if cv2.waitKey(10) & 0xff == ord('q'):
            return

def map_stat(msg):
    stat_msgs = {
        "like": "Like",
        "love": "Love",
        "dislike": "Dislike",
        "request": "Request",
        "victory": "Victory",
        "closed_fist": "Fist Closed",
        "none": "None"
    }
    stat_msg = stat_msgs.get(msg, "None")
    print(stat_msg)
    speak(stat_msg)  # Use speak function to add the message to the queue

def static():
    res = []
    total = 5
    msg = ""
    ret_keypoints = []
    for _ in range(total):
        ret, frame = cap.read()
        image, results = mediapipe_detection(frame, holistic)
        draw_landmarks(image, results, (0, 255, 0))
        keypoints = s_extract_keypoints(results)
        keypts = extract_keypoints(results)
        ret_keypoints.append(keypts)
        gest = smodel.predict([keypoints])
        res.append(statuses[gest[0]])
        cv2.putText(image, "NO HANDS", (3, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        cv2.putText(image, "Last Gesture : " + res[-1], (3, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        cv2.imshow('OpenCV Feed', image)
        if cv2.waitKey(10) & 0xff == ord('q'):
            break
    frequency = Counter(res)
    for element, count in frequency.items():
        if count == total:
            msg = element
            break
    if msg and msg not in {"none", "closed_fist"}:
        map_stat(msg)
        return 1, ret_keypoints, msg
    else:
        return 0, ret_keypoints, "None"

def process_gesture(gesture_seq, prediction_history, msg):
    res = model.predict(np.array([gesture_seq]))
    prediction = np.argmax(res)
    confidence = np.max(res)
    if confidence >= 0.60:
        prediction_history.append(actions[prediction])
    if len(prediction_history) == prediction_history.maxlen:
        most_common_prediction = Counter(prediction_history).most_common(1)[0][0]
        msg = most_common_prediction
    else:
        msg = actions[prediction]
    return msg

async def fun1(initial_sequence, flag):
    sequence = initial_sequence if flag else []
    msg = ""
    prediction_history = deque(maxlen=5)
    while len(sequence) <= 20:
        ret, frame = cap.read()
        image, results = mediapipe_detection(frame, holistic)
        draw_styled_landmarks(image, results)
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        if len(sequence) == 10:
            if sum(np.all(seq == 0) for seq in sequence) > 5:
                return ""
            sequence = [np.zeros(126)] * 5 + sequence + [np.zeros(126)] * 5
            msg = process_gesture(sequence, prediction_history, msg)
            threading.Thread(target=speak, args=(msg,)).start()  # Use threading for voice output
        if cv2.waitKey(10) & 0xff == ord('q'):
            break
        cv2.putText(image, "Last Gesture : " + msg, (3, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        cv2.imshow('OpenCV Feed', image)
    return msg


# Implement PyAutoGUI actions based on gestures
def execute_action(action):
    if action == "Swipe Up":
        pyautogui.hotkey('volumeup')  # volume up
    elif action == "Swipe Down":
        pyautogui.hotkey('volumedown')  # volume down
    elif action == "Swipe Right":
#         pyautogui.hotkey('p')  # for previous song
        pyautogui.hotkey('right')
    elif action == "Swipe Left":
#         pyautogui.hotkey('n')  # for next song
        pyautogui.hotkey('left')
    elif action == "Enter":
        pyautogui.press('space')  # pause/play
    elif action == "Backspace":
        pyautogui.press('backspace')
    elif action == "Tab":
        pyautogui.press('tab')
    elif action == "Ctrl_A":
        pyautogui.hotkey('ctrl', 'a')  # select all
        
        
    time.sleep(1.0) 


# Main code
cap = cv2.VideoCapture(0)
sequence = []
msg = ""
s_msg = ""
c = 0

with mp_holistic.Holistic(min_detection_confidence=0.6, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        ret, frame = cap.read()
        image, results = mediapipe_detection(frame, holistic)
        cv2.putText(image, "NO HANDS", (3, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2, cv2.LINE_AA)
        cv2.putText(image, "Last D-Gesture : " + msg, (3, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
        cv2.putText(image, "Last S-Gesture : " + s_msg, (3, 90), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv2.LINE_AA)
        cv2.imshow('OpenCV Feed', image)
        
        if results.left_hand_landmarks or results.right_hand_landmarks:
            if results.right_hand_landmarks and not results.left_hand_landmarks:
                l4 = results.right_hand_landmarks.landmark[4]
                l12 = results.right_hand_landmarks.landmark[12]
                dis = eucal(l4, l12)
                if dis < 0.10:
                    pointer(dis, "")
                    continue
            t_msg = s_msg
            s, keys, s_msg = static()
            if s_msg == "None":
                s_msg = t_msg
            if s == 1:
                continue
            else:
                msg = await fun1(keys, 1)
            c = 0
        if cv2.waitKey(10) & 0xff == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()

# Stop the text-to-speech thread
tts_queue.put(None)
tts_thread.join()

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


