In [None]:
# Install every third-party package this notebook imports.
# %pip (rather than !pip) installs into the environment of the running kernel.
# scikit-learn provides sklearn.metrics (evaluation cell); pyttsx3 provides
# text-to-speech (real-time cells).
%pip install opencv-python mediapipe torch torchvision torchaudio tqdm scikit-learn pyttsx3


In [None]:
# -----------------------------
# Single Gesture Recorder
# -----------------------------
# Shows a live webcam preview. Each time 'r' is pressed a ~2 second clip is
# written to dataset/<gesture>/<gesture>_<n>.mp4. Press 'q' in the preview to quit.

import cv2, os, time

# ✅ Change this to the gesture you want to record
gesture = "Water"

BASE_DIR = "dataset"
CLIP_SECONDS = 2.0  # wall-clock duration of one recording
gesture_dir = os.path.join(BASE_DIR, gesture)
os.makedirs(gesture_dir, exist_ok=True)  # also creates BASE_DIR

# Initialize webcam
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    print("Could not open webcam 0.")

# Frame rate reported by the camera; fall back to 20 when the backend reports
# nothing usable. The writer uses this value so the container's frame rate
# follows the camera instead of a hard-coded constant.
fps = cap.get(cv2.CAP_PROP_FPS)
if not fps or fps <= 0:
    fps = 20.0

# Count existing recordings (used as the starting index for new file names)
count = len(os.listdir(gesture_dir))
print(f"Recording gesture: {gesture}")
print("Press 'r' to start recording a ~2s clip. Press 'q' to quit.")

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        cv2.imshow("Single Gesture Recorder", frame)
        key = cv2.waitKey(1) & 0xFF

        if key == ord('q'):
            break

        # Press 'r' to record
        if key == ord('r'):
            out_path = os.path.join(gesture_dir, f"{gesture}_{count}.mp4")
            # If earlier clips were deleted, the directory count can point at a
            # name that still exists; advance until the name is free so no
            # recording is overwritten.
            while os.path.exists(out_path):
                count += 1
                out_path = os.path.join(gesture_dir, f"{gesture}_{count}.mp4")

            h, w = frame.shape[:2]
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            writer = cv2.VideoWriter(out_path, fourcc, fps, (w, h))
            if not writer.isOpened():
                print(f"Could not open {out_path} for writing")
                writer.release()
                continue

            print(f"Recording {gesture} -> {out_path}")
            start = time.time()
            try:
                # Record ~2 seconds; 'q' here only ends the current clip early.
                while time.time() - start < CLIP_SECONDS:
                    ret, frame = cap.read()
                    if not ret:
                        break
                    writer.write(frame)
                    cv2.imshow("Single Gesture Recorder", frame)
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        break
            finally:
                writer.release()
            count += 1
            print(f"Saved {out_path}")
finally:
    # Always free the camera and close the preview window, even on error.
    cap.release()
    cv2.destroyAllWindows()


In [None]:
import os, glob, numpy as np, cv2, mediapipe as mp
# Configuration for keypoint extraction.
BASE = "dataset"              # input root: one sub-folder of .mp4 clips per gesture
OUT = "processed/keypoints"   # output root for the saved .npy keypoint arrays
SEQ_LEN = 20                  # smaller seq_len for CPU

# Handle to the MediaPipe Hands solution module.
mp_hands = mp.solutions.hands

os.makedirs(OUT, exist_ok=True)

def extract_keypoints_from_video(path, seq_len=SEQ_LEN):
    """Extract a fixed-length sequence of hand keypoints from a video file.

    Reads at most ``seq_len`` frames from the start of the video and runs
    MediaPipe Hands on each one. Every frame becomes one row of 126 values:
    63 values (x, y, z per landmark) for each of up to two hands, in the order
    MediaPipe returns the hands. A missing second hand, or a frame with no
    detected hand at all, is zero-filled. If the video has fewer than
    ``seq_len`` frames, the remaining rows are zeros.

    Args:
        path: Path of the video file to read.
        seq_len: Number of rows (frames) in the returned array.

    Returns:
        ``np.float32`` array of shape ``(seq_len, 126)``.
    """
    features_per_frame = 126  # 2 hands x 63 values
    seq = []
    cap = cv2.VideoCapture(path)
    try:
        # The context manager closes the MediaPipe graph even if a frame fails.
        with mp_hands.Hands(static_image_mode=False, max_num_hands=2,
                            min_detection_confidence=0.5) as hands:
            while len(seq) < seq_len:
                ret, frame = cap.read()
                if not ret:
                    break
                # OpenCV decodes frames as BGR; MediaPipe expects RGB.
                res = hands.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                row = []
                for hand in res.multi_hand_landmarks or []:
                    for lm in hand.landmark:
                        row.extend((lm.x, lm.y, lm.z))
                # Force every row to exactly 126 values: truncate any excess and
                # zero-fill what is missing (a negative repeat count yields []).
                row = row[:features_per_frame]
                row += [0.0] * (features_per_frame - len(row))
                seq.append(row)
    finally:
        cap.release()

    # Pad short videos with all-zero frames.
    seq.extend([0.0] * features_per_frame for _ in range(seq_len - len(seq)))
    # reshape keeps the result 2-D even when seq_len is 0.
    return np.array(seq, dtype=np.float32).reshape(-1, features_per_frame)

# Convert every clip in BASE/<gesture>/ into OUT/<gesture>/<clip>.npy.
# Clips whose output already exists are skipped, so the cell can be re-run.
for gesture in sorted(os.listdir(BASE)):
    in_dir = os.path.join(BASE, gesture)
    # Ignore stray files (e.g. .DS_Store): only directories are gesture classes.
    # Creating an output folder for them would add a bogus, empty class later.
    if not os.path.isdir(in_dir):
        continue
    out_dir = os.path.join(OUT, gesture)
    os.makedirs(out_dir, exist_ok=True)
    # Sorted so the processing order is reproducible across runs.
    for mp4 in sorted(glob.glob(os.path.join(in_dir, "*.mp4"))):
        name = os.path.splitext(os.path.basename(mp4))[0]
        out_path = os.path.join(out_dir, name + ".npy")
        if os.path.exists(out_path):
            continue
        arr = extract_keypoints_from_video(mp4)
        np.save(out_path, arr)
        print("Saved", out_path)


In [None]:
# Configuration for raw-frame extraction.
TARGET_FRAMES = 8       # fewer frames
TARGET_SIZE = (64, 64)  # smaller resolution

BASE = "dataset"            # input root: one sub-folder of .mp4 clips per gesture
OUT = "processed/frames"    # output root for the saved .npy frame stacks
os.makedirs(OUT, exist_ok=True)

def extract_video_clip(path, target_frames=TARGET_FRAMES, size=TARGET_SIZE):
    """Load the first ``target_frames`` frames of a video as a uint8 array.

    Each frame is resized with ``cv2.resize(frame, size)`` and its channel axis
    is reversed (OpenCV decodes BGR, so the stored frames are RGB). If the
    video is shorter than ``target_frames``, the last frame read is repeated;
    if no frame could be read at all, black frames are used instead.

    Args:
        path: Path of the video file to read.
        target_frames: Number of frames in the returned clip.
        size: ``(width, height)`` passed to ``cv2.resize``.

    Returns:
        ``np.uint8`` array of shape ``(target_frames, size[1], size[0], 3)``.
    """
    frames = []
    cap = cv2.VideoCapture(path)
    try:
        while len(frames) < target_frames:
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.resize(frame, size)
            frames.append(frame[..., ::-1])  # BGR -> RGB
    finally:
        # Release the capture even if resizing raises.
        cap.release()

    missing = target_frames - len(frames)
    if missing > 0:
        filler = frames[-1] if frames else np.zeros((size[1], size[0], 3), dtype=np.uint8)
        frames.extend([filler] * missing)

    if not frames:
        # target_frames <= 0: np.stack cannot stack an empty list.
        return np.zeros((0, size[1], size[0], 3), dtype=np.uint8)
    return np.stack(frames, axis=0).astype(np.uint8)

# Convert every clip in BASE/<gesture>/ into OUT/<gesture>/<clip>.npy.
# Clips whose output already exists are skipped, so the cell can be re-run.
for gesture in sorted(os.listdir(BASE)):
    in_dir = os.path.join(BASE, gesture)
    # Ignore stray files (e.g. .DS_Store): only directories are gesture classes.
    # Creating an output folder for them would add a bogus, empty class later.
    if not os.path.isdir(in_dir):
        continue
    out_dir = os.path.join(OUT, gesture)
    os.makedirs(out_dir, exist_ok=True)
    # Sorted so the processing order is reproducible across runs.
    for mp4 in sorted(glob.glob(os.path.join(in_dir, "*.mp4"))):
        name = os.path.splitext(os.path.basename(mp4))[0]
        out_path = os.path.join(out_dir, name + ".npy")
        if os.path.exists(out_path):
            continue
        clip = extract_video_clip(mp4)
        np.save(out_path, clip)
        print("Saved frames", out_path)


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np, os, glob

device = torch.device("cpu")  # CPU only

# Keypoint dataset
class KeypointDataset(Dataset):
    """Dataset of saved keypoint sequences, one sub-directory per class.

    ``root`` is expected to contain one directory per gesture, each holding
    ``.npy`` files. The class index of a sample is the position of its
    directory name in the sorted list of class directories.

    Attributes set by ``__init__``:
        classes: Sorted list of class (directory) names.
        samples: Paths of all ``.npy`` files, grouped by class.
        labels:  Integer class index for each entry of ``samples``.
    """

    def __init__(self, root="processed/keypoints"):
        self.samples = []
        self.labels = []
        # Only directories are classes; a stray file in ``root`` would otherwise
        # shift every label index and inflate the class count.
        self.classes = sorted(
            entry for entry in os.listdir(root)
            if os.path.isdir(os.path.join(root, entry))
        )
        for i, cls in enumerate(self.classes):
            # glob.escape protects class names containing glob metacharacters;
            # sorting makes the sample order reproducible.
            pattern = os.path.join(glob.escape(os.path.join(root, cls)), "*.npy")
            for npy in sorted(glob.glob(pattern)):
                self.samples.append(npy)
                self.labels.append(i)

    def __len__(self):
        """Return the number of samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(sequence, label)``: the saved array as float32 and its class index."""
        arr = np.load(self.samples[idx])
        return torch.tensor(arr, dtype=torch.float32), self.labels[idx]

# Frame dataset
import os, glob
import numpy as np
import torch
from torch.utils.data import Dataset
import cv2

class VideoDataset(Dataset):
    """Dataset of saved frame stacks, one sub-directory per class.

    Each ``.npy`` file is loaded as a ``[T, H, W, C]`` array, scaled to
    ``[0, 1]``, padded with zero frames or cropped to ``seq_len`` frames,
    optionally resized, and returned as a ``[C, seq_len, H, W]`` float tensor.

    Args:
        root: Directory containing one sub-directory per class.
        seq_len: Number of frames in every returned clip.
        resize: ``(width, height)`` passed to ``cv2.resize`` for each frame,
            or a falsy value to keep the stored resolution.
    """

    def __init__(self, root="processed/frames", seq_len=12, resize=(32,32)):
        self.samples = []
        self.labels = []
        self.seq_len = seq_len
        self.resize = resize
        # Only directories are classes; a stray file in ``root`` would otherwise
        # shift every label index and inflate the class count.
        self.classes = sorted(
            entry for entry in os.listdir(root)
            if os.path.isdir(os.path.join(root, entry))
        )
        self.class2idx = {c: i for i, c in enumerate(self.classes)}

        for cls in self.classes:
            # glob.escape protects class names containing glob metacharacters;
            # sorting makes the sample order reproducible.
            pattern = os.path.join(glob.escape(os.path.join(root, cls)), "*.npy")
            for npy in sorted(glob.glob(pattern)):
                self.samples.append(npy)
                self.labels.append(self.class2idx[cls])

    def __len__(self):
        """Return the number of samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(clip, label)`` with ``clip`` shaped ``[C, seq_len, H, W]``."""
        arr = np.load(self.samples[idx]).astype(np.float32) / 255.0  # normalize
        # Pad (with zero frames) or crop sequence to seq_len
        T, H, W, C = arr.shape
        if T < self.seq_len:
            pad = np.zeros((self.seq_len - T, H, W, C), dtype=np.float32)
            arr = np.concatenate([arr, pad], axis=0)
        elif T > self.seq_len:
            arr = arr[:self.seq_len]

        # Resize frames if needed
        if self.resize:
            arr = np.stack([cv2.resize(f, self.resize) for f in arr], axis=0)

        # Transpose to [C, T, H, W] for Conv3D
        arr = np.transpose(arr, (3, 0, 1, 2))
        return torch.tensor(arr, dtype=torch.float32), self.labels[idx]


In [None]:
import torch.nn as nn

class GestureLSTM(nn.Module):
    """Bidirectional single-layer LSTM classifier over keypoint sequences.

    Args:
        input_dim: Number of features per timestep.
        hidden_dim: Hidden size of each LSTM direction.
        num_classes: Number of output logits.

    ``forward`` takes a ``[batch, seq, input_dim]`` tensor (the LSTM is built
    with ``batch_first=True``) and returns ``[batch, num_classes]`` logits.
    """

    def __init__(self, input_dim=126, hidden_dim=64, num_classes=5):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim*2, num_classes)

    def forward(self, x):
        # Classify from the FINAL hidden state of each direction. Taking
        # out[:, -1, :] would be wrong for the backward direction: at the last
        # timestep the backward pass has only seen that one frame. h_n holds
        # the state after each direction has consumed the whole sequence.
        _, (h_n, _) = self.lstm(x)
        # h_n is [2, batch, hidden] (forward, backward) -> [batch, 2 * hidden]
        features = h_n.permute(1, 0, 2).reshape(h_n.size(1), -1)
        return self.fc(features)

class Simple3DCNN(nn.Module):
    """Small three-stage 3D CNN classifier for ``[batch, 3, T, H, W]`` clips.

    Three Conv3d+ReLU stages (16, 32, 64 channels). The first pooling halves
    only the spatial axes, the second halves time and space, and the last is a
    global average pool, so the linear head always receives 64 features.
    """

    def __init__(self, num_classes=5):
        super().__init__()
        stages = [
            nn.Conv3d(3, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(1, 2, 2)),
            nn.Conv3d(16, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(2, 2, 2)),
            nn.Conv3d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool3d(output_size=(1, 1, 1)),
        ]
        self.net = nn.Sequential(*stages)
        self.fc = nn.Linear(in_features=64, out_features=num_classes)

    def forward(self, x):
        pooled = self.net(x)
        # Collapse [batch, 64, 1, 1, 1] to [batch, 64] before the linear head.
        return self.fc(pooled.flatten(start_dim=1))


In [None]:
import numpy as np

# Sanity check of one saved keypoint file.
# Replace with your file path
file_path = "processed/keypoints/I/I_0.npy"
data = np.load(file_path)

shape = data.shape
print(shape)   # Check the shape of the array
print(data)    # See the actual data

# Axis 0 indexes frames, axis 1 indexes the per-frame features.
print("Sequence length:", shape[0])
print("Number of features per frame:", shape[1])
print("First frame keypoints:", data[0])


In [None]:
# Train the keypoint LSTM on everything under processed/keypoints and save
# its weights to gesture_lstm_cpu.pth.
kp_dataset = KeypointDataset("processed/keypoints")
if len(kp_dataset) == 0:
    # Fail with a clear message instead of an IndexError / sampler error below.
    raise RuntimeError("No keypoint samples found in processed/keypoints; "
                       "run the keypoint extraction cell first.")
kp_loader = DataLoader(kp_dataset, batch_size=4, shuffle=True)  # small batch

# The feature width is read from the first sample; the class count from the dataset.
model_kp = GestureLSTM(input_dim=kp_dataset[0][0].shape[1], hidden_dim=64, num_classes=len(kp_dataset.classes)).to(device)
opt = torch.optim.Adam(model_kp.parameters(), lr=1e-3)
crit = nn.CrossEntropyLoss()

for ep in range(50):  # fewer epochs for CPU
    total_loss = 0.0
    model_kp.train()
    for X, y in kp_loader:
        X = X.to(device)
        # The DataLoader already collates the integer labels into a tensor;
        # re-wrapping it with torch.tensor() copies it and triggers a warning.
        y = y.to(device)
        opt.zero_grad()
        out = model_kp(X)
        loss = crit(out, y)
        loss.backward()
        opt.step()
        total_loss += loss.item()
    # Mean loss over the batches of this epoch.
    print(f"Epoch {ep+1}, loss={total_loss/len(kp_loader):.4f}")
torch.save(model_kp.state_dict(),"gesture_lstm_cpu.pth")


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import os

# Device
device = torch.device("cpu")

# --- Dataset ---
# Make sure your VideoDataset class accepts seq_len and resize
# NOTE(review): the frame-extraction cell stores TARGET_FRAMES = 8 frames per
# clip, so with seq_len=12 every sample ends in 4 zero-padded frames — confirm
# this is intended.
video_ds = VideoDataset("processed/frames", seq_len=12, resize=(32,32))
if len(video_ds) == 0:
    # Fail with a clear message instead of a sampler / division error below.
    raise RuntimeError("No frame clips found in processed/frames; "
                       "run the frame extraction cell first.")
video_loader = DataLoader(video_ds, batch_size=4, shuffle=True)

# --- Model ---
model_3d = Simple3DCNN(num_classes=len(video_ds.classes)).to(device)
opt3 = torch.optim.Adam(model_3d.parameters(), lr=1e-4)
crit3 = nn.CrossEntropyLoss()

# --- Training parameters ---
num_epochs = 50  # can increase further for better accuracy

for ep in range(num_epochs):
    total_loss = 0.0
    model_3d.train()

    for X, y in video_loader:
        X = X.float().to(device)       # [batch, C, seq_len, H, W]
        # The DataLoader already collates the labels into a tensor; convert it
        # in place of torch.tensor(y), which copies and triggers a warning.
        y = y.to(device=device, dtype=torch.long)

        opt3.zero_grad()
        out = model_3d(X)
        loss = crit3(out, y)
        loss.backward()
        opt3.step()

        total_loss += loss.item()

    # Mean loss over the batches of this epoch.
    print(f"3D Epoch {ep+1}/{num_epochs}, loss={total_loss/len(video_loader):.4f}")

# --- Save model ---
MODEL_SAVE_PATH = "gesture_3d_cpu.pth"
torch.save(model_3d.state_dict(), MODEL_SAVE_PATH)
print(f"Saved improved 3D-CNN model to {MODEL_SAVE_PATH}!")


In [None]:
from sklearn.metrics import accuracy_score
import time

def eval_model(model, dataset, batch_size=1):
    """Return the classification accuracy of ``model`` over ``dataset``.

    The model is switched to eval mode and run without gradient tracking on
    the module-level ``device``. Predictions are the argmax over dimension 1
    of the model output.

    Args:
        model: Module mapping a batch of inputs to ``[batch, num_classes]`` scores.
        dataset: Dataset yielding ``(input, label)`` pairs.
        batch_size: Batch size used while iterating the dataset.

    Returns:
        The value of ``sklearn.metrics.accuracy_score(true_labels, predictions)``.
    """
    model.eval()
    preds = []
    trues = []
    with torch.no_grad():
        for X, y in DataLoader(dataset, batch_size=batch_size):
            out = model(X.to(device))
            preds += out.argmax(1).cpu().numpy().tolist()
            # The collated labels are a tensor; extend with plain ints rather
            # than with 0-d tensors (which is what ``trues += y`` would do).
            trues += torch.as_tensor(y).tolist()
    return accuracy_score(trues, preds)

# Accuracy of both trained models. The datasets passed here are the same
# objects the training loaders were built from, so these are training-set
# accuracies rather than held-out scores.
acc_kp = eval_model(model=model_kp, dataset=kp_dataset)
acc_3d = eval_model(model=model_3d, dataset=video_ds)
for label, acc in (("LSTM CPU acc:", acc_kp), ("3D-CNN CPU acc:", acc_3d)):
    print(label, acc)


In [None]:
import collections
import cv2, mediapipe as mp
import torch
import numpy as np
import time
import os
import pyttsx3

# -----------------------
# Real-time LSTM Gesture Recognition (CPU)
# -----------------------

# Settings for real-time inference.
SEQ_LEN = 20  # should match training
GESTURES = ["Food", "I", "Sorry", "Thank You", "Water"]  # same as training
MODEL_PATH = "gesture_lstm_cpu.pth"
device = torch.device("cpu")

# MediaPipe helpers: the hand detector module and the landmark drawing utilities.
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

# Function to load model
def load_lstm_model():
    """Build a GestureLSTM and load its weights from ``MODEL_PATH``.

    The network is created with 126 input features, 64 hidden units and one
    output per entry of ``GESTURES``, moved to ``device`` and put in eval mode.

    Returns:
        The loaded model.

    Raises:
        FileNotFoundError: If ``MODEL_PATH`` does not exist.
    """
    model = GestureLSTM(input_dim=126, hidden_dim=64, num_classes=len(GESTURES)).to(device)
    # Guard clause: without a checkpoint there is nothing to load.
    if not os.path.exists(MODEL_PATH):
        raise FileNotFoundError(f"{MODEL_PATH} not found. Train model first.")
    state = torch.load(MODEL_PATH, map_location=device)
    model.load_state_dict(state)
    model.eval()
    print("Loaded trained model from", MODEL_PATH)
    return model

# Load model once
model_kp = load_lstm_model()

tts_engine = pyttsx3.init()


# Initialize webcam
cap = cv2.VideoCapture(0)
hands = mp_hands.Hands(
    max_num_hands=2,
    min_detection_confidence=0.3,  # lower threshold
    min_tracking_confidence=0.3
)
# Sliding window holding the keypoint rows of the most recent SEQ_LEN frames.
buf = collections.deque(maxlen=SEQ_LEN)

prev_time = 0  # for FPS calculation
# Last gesture that was announced. runAndWait() blocks the loop while speaking,
# so a gesture is only spoken (and logged) when the prediction changes instead
# of on every frame.
last_spoken = None

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        res = hands.process(rgb)

        # Extract keypoints for LSTM: 63 values per hand, zero-padded to 126.
        data = []
        hands_detected = 0
        if res.multi_hand_landmarks:
            hands_detected = len(res.multi_hand_landmarks)
            for i, hand in enumerate(res.multi_hand_landmarks):
                if i >= 2:
                    break
                for lm in hand.landmark:
                    data += [lm.x, lm.y, lm.z]
            data += [0] * (126 - len(data))
            for hand_landmarks in res.multi_hand_landmarks:
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
        else:
            data = [0]*126

        buf.append(np.array(data, dtype=np.float32))

        # Show buffer status
        cv2.putText(frame, f"Buffer: {len(buf)}/{SEQ_LEN}", (10, 110),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 255), 2)

        # Show warning if no hands
        if hands_detected == 0:
            cv2.putText(frame, "No hands detected!", (10, 140),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)
            # Forget the last announcement so the same gesture is spoken again
            # once a hand reappears.
            last_spoken = None

        # Predict gesture only if at least one hand is detected
        if len(buf) == SEQ_LEN and hands_detected > 0:
            # Stack into one [SEQ_LEN, 126] array first; building a tensor
            # from a list of ndarrays is slow and emits a warning.
            seq_input = torch.from_numpy(np.stack(list(buf))).unsqueeze(0).to(device)
            with torch.no_grad():
                out = model_kp(seq_input)
                pred = out.argmax(dim=1).item()
            gesture_word = GESTURES[pred]
            cv2.putText(frame, f"Gesture: {GESTURES[pred]}", (10, 40),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            if gesture_word != last_spoken:
                print(f"[DEBUG] Hands detected: {hands_detected}, Predicted gesture: {GESTURES[pred]}")
                tts_engine.say(gesture_word)
                tts_engine.runAndWait()
                last_spoken = gesture_word

        # Calculate and display FPS (guarding against a zero time delta)
        curr_time = time.time()
        elapsed = curr_time - prev_time
        fps = 1 / elapsed if prev_time and elapsed > 0 else 0
        prev_time = curr_time
        cv2.putText(frame, f"FPS: {int(fps)}", (10, 80),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 0, 0), 2)

        cv2.imshow("Real-time Gesture Recognition", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the detector, the camera and the window, even on error.
    hands.close()
    cap.release()
    cv2.destroyAllWindows()


[DEBUG] Hands detected: 1, Predicted gesture: Water
[DEBUG] Hands detected: 1, Predicted gesture: Water
[DEBUG] Hands detected: 1, Predicted gesture: Water
[DEBUG] Hands detected: 1, Predicted gesture: Water
[DEBUG] Hands detected: 1, Predicted gesture: Water
[DEBUG] Hands detected: 1, Predicted gesture: Water
[DEBUG] Hands detected: 1, Predicted gesture: Water
[DEBUG] Hands detected: 1, Predicted gesture: Water
[DEBUG] Hands detected: 1, Predicted gesture: Water
[DEBUG] Hands detected: 1, Predicted gesture: Water
[DEBUG] Hands detected: 1, Predicted gesture: Water
[DEBUG] Hands detected: 1, Predicted gesture: Water
[DEBUG] Hands detected: 1, Predicted gesture: Water
[DEBUG] Hands detected: 1, Predicted gesture: Water
[DEBUG] Hands detected: 1, Predicted gesture: Water
[DEBUG] Hands detected: 1, Predicted gesture: Water
[DEBUG] Hands detected: 1, Predicted gesture: Water
[DEBUG] Hands detected: 1, Predicted gesture: Water
[DEBUG] Hands detected: 1, Predicted gesture: Water
[DEBUG] Hand

In [None]:
import pyttsx3
# Real-time 3D-CNN gesture recognition (CPU).
# Relies on names defined by earlier cells: Simple3DCNN, GESTURES, device,
# cv2, np and torch.
from collections import deque  # `deque` was used below without being imported

model_3d = Simple3DCNN(num_classes=len(GESTURES)).to(device)
model_3d.load_state_dict(torch.load("gesture_3d_cpu.pth", map_location=device))
model_3d.eval()
print("Loaded 3D-CNN model successfully!")

tts_engine = pyttsx3.init()  # Add TTS engine

# Mirror the training pipeline: clips were stored at 64x64 (frame-extraction
# cell) and fed to the network as 12 frames of 32x32 RGB in [0, 1]
# (VideoDataset(seq_len=12, resize=(32,32)) in the training cell).
# NOTE(review): stored clips hold only 8 frames, so training samples ended in
# zero-padded frames that a live window never contains — confirm the intended
# clip length.
SEQ_LEN = 12
STORED_SIZE = (64, 64)
MODEL_INPUT_SIZE = (32, 32)
frame_buffer = deque(maxlen=SEQ_LEN)

# runAndWait() blocks while speaking, so only announce a gesture when the
# prediction changes instead of on every frame.
last_spoken = None

cap = cv2.VideoCapture(0)

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Same preprocessing as training: resize to the stored size, convert
        # OpenCV's BGR to RGB, scale to [0, 1], then resize to the model input.
        stored = cv2.cvtColor(cv2.resize(frame, STORED_SIZE), cv2.COLOR_BGR2RGB)
        model_frame = cv2.resize(stored.astype(np.float32) / 255.0, MODEL_INPUT_SIZE)
        frame_buffer.append(model_frame)

        # Predict when buffer is full
        if len(frame_buffer) == SEQ_LEN:
            # Convert buffer to tensor: [batch=1, C=3, seq, H, W]
            frames_np = np.stack(list(frame_buffer), axis=0)  # [seq, H, W, C]
            frames_np = frames_np.transpose(3, 0, 1, 2)        # [C, seq, H, W]
            frames_tensor = torch.tensor(frames_np, dtype=torch.float32).unsqueeze(0).to(device)

            with torch.no_grad():
                out = model_3d(frames_tensor)
                pred = out.argmax(dim=1).item()
            gesture_word = GESTURES[pred]
            cv2.putText(frame, f"3D-CNN Gesture: {gesture_word}", (10,40),
                        cv2.FONT_HERSHEY_SIMPLEX,1,(0,255,0),2)
            if gesture_word != last_spoken:
                tts_engine.say(gesture_word)
                tts_engine.runAndWait()
                last_spoken = gesture_word

        cv2.imshow("3D-CNN Gesture Recognition", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the window, even on error.
    cap.release()
    cv2.destroyAllWindows()