# In [None]:
import cv2
import numpy as np
import pyttsx3
import mediapipe as mp
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Recreate the CNN layer by layer, then restore its weights from disk.
# Input: 64x64 images with 3 channels. Output: softmax over 29 classes.
asl_model = Sequential()

# Three convolution + max-pooling stages with 32, 64 and 128 filters.
asl_model.add(Conv2D(filters=32, kernel_size=(3, 3), activation='relu',
                     input_shape=(64, 64, 3)))
asl_model.add(MaxPooling2D(pool_size=(2, 2)))
asl_model.add(Conv2D(filters=64, kernel_size=(3, 3), activation='relu'))
asl_model.add(MaxPooling2D(pool_size=(2, 2)))
asl_model.add(Conv2D(filters=128, kernel_size=(3, 3), activation='relu'))
asl_model.add(MaxPooling2D(pool_size=(2, 2)))

# Classification head.
asl_model.add(Flatten())
asl_model.add(Dense(units=128, activation='relu'))
asl_model.add(Dropout(rate=0.5))
asl_model.add(Dense(units=29, activation='softmax'))

# The architecture above must match the one the weight file was saved from.
asl_model.load_weights("asl_cnn_model.h5")

# Label map: the 26 uppercase letters A-Z followed by three control labels,
# 29 entries in total (one per model output class).
asl_labels = [chr(code) for code in range(ord("A"), ord("Z") + 1)]
asl_labels += ['del', 'nothing', 'space']

# Text-to-speech engine (pyttsx3 default driver).
engine = pyttsx3.init()

# MediaPipe hand tracker: video (non-static) mode, at most one hand per frame.
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
)
# Drawing helpers are kept available for visualising landmarks.
mp_drawing = mp.solutions.drawing_utils

# Video input
video_path = "sign_language_video.mp4"
cap = cv2.VideoCapture(video_path)
# Fail loudly when the file is missing or cannot be decoded. Without this
# check the frame loop never runs and an empty sentence is reported as if
# the video contained no signs.
if not cap.isOpened():
    raise IOError("Could not open video file: " + video_path)

# Only every `frame_interval`-th frame is analysed.
frame_interval = 5
# Index of the frame currently being read.
frame_count = 0
# Most recent per-frame predictions; a label is accepted only once this
# buffer is full and all of its entries agree.
pred_buffer = deque(maxlen=3)
# Accepted labels, in the order they were recognised.
final_sequence = []

def _crop_hand(frame, hand_landmarks, margin=20):
    """Return the margin-padded crop of ``frame`` around one detected hand.

    Args:
        frame: Image array indexed as ``[row, column, ...]``.
        hand_landmarks: Detection result whose ``.landmark`` items expose
            ``x`` / ``y`` as fractions of the frame width / height.
        margin: Padding in pixels added on every side of the bounding box.

    Returns:
        A view into ``frame`` covering the padded bounding box, or ``None``
        when the box has no area inside the frame.
    """
    img_h, img_w = frame.shape[:2]
    xs = [lm.x for lm in hand_landmarks.landmark]
    ys = [lm.y for lm in hand_landmarks.landmark]

    def _clamp(value, upper):
        # Keep both box edges inside [0, upper]. An unclamped negative upper
        # edge would be read by NumPy as a "from the end" index and silently
        # crop the wrong region.
        return max(0, min(value, upper))

    x_min = _clamp(int(min(xs) * img_w) - margin, img_w)
    x_max = _clamp(int(max(xs) * img_w) + margin, img_w)
    y_min = _clamp(int(min(ys) * img_h) - margin, img_h)
    y_max = _clamp(int(max(ys) * img_h) + margin, img_h)

    # Landmarks lying outside the frame can collapse the box to nothing;
    # resizing an empty crop would raise inside OpenCV.
    if x_max <= x_min or y_max <= y_min:
        return None
    return frame[y_min:y_max, x_min:x_max]


# Read the video frame by frame, classify the hand on every
# `frame_interval`-th frame, and append a label to `final_sequence` once the
# prediction buffer is full of identical labels.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            # End of stream or read failure.
            break

        if frame_count % frame_interval == 0:
            img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = hands.process(img_rgb)

            # `multi_hand_landmarks` is falsy when no hand was detected.
            for hand_landmarks in results.multi_hand_landmarks or ():
                hand_img = _crop_hand(frame, hand_landmarks)
                if hand_img is None:
                    continue

                # NOTE(review): the crop comes from the BGR `frame`, not from
                # `img_rgb`. Confirm the channel order the model was trained
                # with; if it was RGB, crop from `img_rgb` instead.
                hand_img = cv2.resize(hand_img, (64, 64))
                hand_img = hand_img.astype("float32") / 255.0
                hand_img = np.expand_dims(hand_img, axis=0)

                # Predict sign
                pred = asl_model.predict(hand_img, verbose=0)
                label = asl_labels[int(np.argmax(pred))]
                pred_buffer.append(label)

                # Smoothing: accept a label only when the buffer is full and
                # every entry in it is the same.
                if len(pred_buffer) == pred_buffer.maxlen and len(set(pred_buffer)) == 1:
                    stable = pred_buffer[0]
                    # "nothing" is never recorded. A label equal to the
                    # previously accepted one is dropped too, so the same
                    # label cannot be emitted twice in a row.
                    if stable != "nothing" and (not final_sequence or final_sequence[-1] != stable):
                        final_sequence.append(stable)
                    pred_buffer.clear()

        frame_count += 1
finally:
    # Release the capture even if detection or prediction raised.
    cap.release()

# Decode the accepted labels into text, one character at a time:
#   "space" -> append a blank
#   "del"   -> drop the last character (no effect on empty text)
#   other   -> append the label's characters unchanged
decoded_chars = []
for label in final_sequence:
    if label == "del":
        # Slice deletion is a no-op on an empty list.
        del decoded_chars[-1:]
    elif label == "space":
        decoded_chars.append(" ")
    else:
        decoded_chars.extend(label)
final_text = "".join(decoded_chars)

# Report the sentence on stdout, then speak it; runAndWait blocks until done.
print("Predicted Sentence:", final_text)
engine.say(final_text)
engine.runAndWait()
