In [1]:
!pip install opencv-python mediapipe torch torchvision torchaudio pyttsx3 fer





[notice] A new release of pip is available: 24.1.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting pyttsx3
  Downloading pyttsx3-2.99-py3-none-any.whl.metadata (6.2 kB)
Collecting fer
  Downloading fer-22.5.1-py3-none-any.whl.metadata (6.4 kB)
Collecting pypiwin32 (from pyttsx3)
  Downloading pypiwin32-223-py3-none-any.whl.metadata (236 bytes)
Collecting facenet-pytorch (from fer)
  Downloading facenet_pytorch-2.6.0-py3-none-any.whl.metadata (12 kB)
Collecting moviepy (from fer)
  Downloading moviepy-2.2.1-py3-none-any.whl.metadata (6.9 kB)
Collecting ffmpeg==1.4 (from fer)
  Downloading ffmpeg-1.4.tar.gz (5.1 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting pillow!=8.3.*,>=5.3.0 (from torchvision)
  Downloading pillow-10.2.0-cp38-cp38-win_amd64.whl.metadata (9.9 kB)
INFO: pip is looking at multiple versions of facenet-pytorch to determine which version is compatible with other requirements. This could take a while.
Collecting facenet-pytorch (from fer)
  Downloading facenet_pytorch-2.5.3-py3-none-any.whl

In [1]:
import cv2
import mediapipe as mp
import torch
import numpy as np
import collections
import pyttsx3
import torch.nn as nn
from fer import FER
import time
import os

# -----------------------
# Settings
# -----------------------

SEQ_LEN = 20  # number of frames for LSTM gesture
GESTURES = ["Food", "I", "Sorry", "Thank You", "Water"]  # your trained gestures
DEVICE = torch.device("cpu")
MODEL_PATH = "gesture_lstm_cpu_2.pth"

# -----------------------
# Load Hand Gesture Model (Module 1 LSTM)
# -----------------------
class GestureLSTM(nn.Module):
    def __init__(self, input_dim=126, hidden_dim=64, num_layers=1, num_classes=5, dropout=0.3):
        super(GestureLSTM, self).__init__()
        
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout if num_layers > 1 else 0
        )
        self.fc = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden_dim * 2, num_classes)
        )

    def forward(self, x):
        out, _ = self.lstm(x)
        out = out[:, -1, :]     # last timestep
        out = self.fc(out)
        return out
device='cpu'

def load_lstm_model():
    model = GestureLSTM(input_dim=126, hidden_dim=64, num_classes=len(GESTURES)).to(device)
    if os.path.exists(MODEL_PATH):
        model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
        model.eval()
        print("Loaded trained model from", MODEL_PATH)
    else:
        raise FileNotFoundError(f"{MODEL_PATH} not found. Train model first.")
    return model

model_kp = load_lstm_model()

# -----------------------
# Initialize MediaPipe Hands & FER
# -----------------------
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(max_num_hands=2, min_detection_confidence=0.3, min_tracking_confidence=0.3)
buf = collections.deque(maxlen=SEQ_LEN)

face_detector = FER(mtcnn=True)

# -----------------------
# Text-to-Speech
# -----------------------
engine = pyttsx3.init()
last_gesture = None
last_emotion = None


Loaded trained model from gesture_lstm_cpu_2.pth


INFO:comtypes.client._code_cache:Imported existing <module 'comtypes.gen' from 'C:\\Users\\harshit kumar\\anaconda3\\lib\\site-packages\\comtypes\\gen\\__init__.py'>
INFO:comtypes.client._code_cache:Using writeable comtypes cache directory: 'C:\Users\harshit kumar\anaconda3\lib\site-packages\comtypes\gen'


In [2]:
import cv2
import mediapipe as mp
import torch
import numpy as np
import collections
import pyttsx3
from fer import FER  # pre-trained facial expression recognition

# Hand Gesture Setup (reuse Module 1)
SEQ_LEN = 20
GESTURES = ["Food", "I", "Sorry", "Thank You", "Water"]
device = torch.device("cpu")

# Load LSTM gesture model (Module 1)
model_kp = GestureLSTM(input_dim=126, hidden_dim=64, num_classes=len(GESTURES)).to(device)
model_kp.load_state_dict(torch.load("gesture_lstm_cpu_2.pth", map_location=device))
model_kp.eval()

# Mediapipe Hands
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(max_num_hands=2, min_detection_confidence=0.3, min_tracking_confidence=0.3)
buf = collections.deque(maxlen=SEQ_LEN)

# FER Detector
face_detector = FER(mtcnn=True)

# Text-to-Speech
#engine = pyttsx3.init()


In [3]:
cap = cv2.VideoCapture(0)
prev_time = 0
buf = collections.deque(maxlen=SEQ_LEN)

while True:
    ret, frame = cap.read()
    if not ret:
        break
    
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # ---- Hand Gesture ----
    res = hands.process(rgb)
    data = []
    hands_detected = 0
    if res.multi_hand_landmarks:
        hands_detected = len(res.multi_hand_landmarks)
        for i, hand in enumerate(res.multi_hand_landmarks):
            if i >= 2: break
            for lm in hand.landmark:
                data += [lm.x, lm.y, lm.z]
        while len(data) < 126:
            data += [0]
        for hand_landmarks in res.multi_hand_landmarks:
            mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
    else:
        data = [0]*126

    buf.append(np.array(data, dtype=np.float32))

    # Predict Gesture only if buffer is full and hands detected
    gesture_pred = None
    if len(buf) == SEQ_LEN and hands_detected > 0:
        seq_input = torch.tensor([list(buf)], dtype=torch.float32).to(DEVICE)
        with torch.no_grad():
            out = model_kp(seq_input)
            pred = out.argmax(dim=1).item()
        gesture_pred = GESTURES[pred]
        cv2.putText(frame, f"Gesture: {gesture_pred}", (10,40), cv2.FONT_HERSHEY_SIMPLEX,1,(0,255,0),2)

    # ---- Facial Emotion ----
    face_emotions = face_detector.top_emotion(frame)
    emotion_label = "Unknown"
    if face_emotions is not None:
        emotion_label = face_emotions[0]
        cv2.putText(frame, f"Emotion: {emotion_label}", (10,80), cv2.FONT_HERSHEY_SIMPLEX,1,(255,0,0),2)

    # ---- FPS display ----
    curr_time = time.time()
    fps = 1 / (curr_time - prev_time) if prev_time else 0
    prev_time = curr_time
    cv2.putText(frame, f"FPS: {int(fps)}", (10,110), cv2.FONT_HERSHEY_SIMPLEX,0.8,(255,0,255),2)

    cv2.imshow("Gesture & Emotion Detection", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


  seq_input = torch.tensor([list(buf)], dtype=torch.float32).to(DEVICE)

