In [1]:
from flask import Flask, Response
from flask_cors import CORS
import cv2
import mediapipe as mp
import numpy as np
import pickle
import threading


# Initialize Flask app
app = Flask(__name__)
CORS(app)

# Load trained model
model_dict = pickle.load(open('./model.p', 'rb'))
model = model_dict['model']

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, min_detection_confidence=0.5)

# Labels dictionary for predictions
labels_dict = {i: chr(65 + i) for i in range(26)}

# List to store detected letters and form the sentence
sentence = []
current_letter = "?"  # Track the current detected letter
final_sentence = ""  # Store the printed sentence


def generate_frames():
    global current_letter, sentence, final_sentence  # Access global variables
    cap = cv2.VideoCapture(0)  # Open camera

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Convert to RGB for MediaPipe processing
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(frame_rgb)

        # Default to "?" if no hand is detected
        current_letter = "?"

        # If hand landmarks are detected, predict the character
        if results.multi_hand_landmarks:
            landmarks = results.multi_hand_landmarks[0]
            data_aux = []

            x_, y_ = [lm.x for lm in landmarks.landmark], [lm.y for lm in landmarks.landmark]
            for lm in landmarks.landmark:
                data_aux.append(lm.x - min(x_))
                data_aux.append(lm.y - min(y_))

            # Ensure proper data length for model input
            data_aux = data_aux[:42] + [0] * max(0, 42 - len(data_aux))
            prediction = model.predict([np.asarray(data_aux)])
            current_letter = labels_dict.get(int(prediction[0]), "?")

        # Add the current forming sentence, detected letter, and final sentence to the video frame
        cv2.putText(frame, f"Current Letter: {current_letter}", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1.3, (0, 0, 255), 3)
        cv2.putText(frame, "Forming Sentence: " + "".join(sentence), (50, 100), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 2)
        cv2.putText(frame, "Final Sentence: " + final_sentence, (50, 150), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 255, 0), 2)

        # Display the frame
        cv2.imshow("Sign Language Detection", frame)

        # Listen for key presses
        key = cv2.waitKey(1) & 0xFF
        if key == ord('k'):  # Save the detected letter
            if current_letter != "?":
                sentence.append(current_letter)
                print(f"Saved Letter: {current_letter}")
        elif key == ord('s'):  # Add a space
            sentence.append(" ")
            print("Added Space.")
        elif key == ord('d'):  # Delete the last letter (Backspace functionality)
            if sentence:
                deleted_letter = sentence.pop()
                print(f"Deleted Letter: {deleted_letter}")
            else:
                print("No letter to delete.")
        elif key == ord('p'):  # Print the final sentence on screen and reset `sentence`
            final_sentence = "".join(sentence)
            print(f"Final Sentence Printed: {final_sentence}")
            sentence = []  # Clear the forming sentence after printing
        elif key == ord('q'):  # Quit the loop and stop the program
            print("Exiting...")
            
            break

    cap.release()
    cv2.destroyAllWindows()

@app.route('/video_feed')

def video_feed():
    return Response(generate_frames(), mimetype='multipart/x-mixed-replace; boundary=frame')

# Run Flask app in a separate thread
def run_app():
    app.run(debug=True, use_reloader=False)

flask_thread = threading.Thread(target=run_app)
flask_thread.start()

# Run video capture and key detection in the main thread
# generate_frames()

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit


Saved Letter: A
Saved Letter: C
Exiting...


127.0.0.1 - - [22/Jun/2025 23:06:12] "GET /video_feed HTTP/1.1" 200 -


Added Space.
Added Space.
Deleted Letter:  
Deleted Letter:  
Deleted Letter: C
Deleted Letter: A
No letter to delete.
No letter to delete.
No letter to delete.
No letter to delete.
No letter to delete.
No letter to delete.
No letter to delete.
Exiting...


127.0.0.1 - - [22/Jun/2025 23:12:45] "GET /video_feed HTTP/1.1" 200 -


Saved Letter: P
Saved Letter: P
Saved Letter: H
Saved Letter: Y
Saved Letter: A
Saved Letter: A
Saved Letter: P
Exiting...


127.0.0.1 - - [22/Jun/2025 23:15:24] "GET /video_feed HTTP/1.1" 200 -


Saved Letter: A
Saved Letter: A
Saved Letter: P
Saved Letter: Y
Saved Letter: K
Saved Letter: L
Saved Letter: L
Final Sentence Printed: PPHYAAPAAPYKLL
Saved Letter: A
Saved Letter: A
Saved Letter: P
Saved Letter: K
Saved Letter: A
Deleted Letter: A
Deleted Letter: K
Deleted Letter: P
Deleted Letter: A
Final Sentence Printed: A
Exiting...


127.0.0.1 - - [22/Jun/2025 23:17:04] "GET /video_feed HTTP/1.1" 200 -


In [2]:
pip install flask-cors

Note: you may need to restart the kernel to use updated packages.


In [4]:
from flask import Flask, Response
from flask_cors import CORS
import cv2
import mediapipe as mp
import numpy as np
import pickle
import threading

# Initialize Flask web app
app = Flask(__name__)  # Fixed typo: "_name_" should be "__name__"
CORS(app)  # Enable Cross-Origin Resource Sharing (CORS)

# Load the pre-trained model
model_dict = pickle.load(open('./model.p', 'rb'))  # Load model from pickle file
model = model_dict['model']

# Initialize MediaPipe for hand detection and drawing
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, min_detection_confidence=0.5)
mp_drawing = mp.solutions.drawing_utils  # For drawing landmarks and connections

# Create a dictionary that maps model predictions (0–25) to letters A–Z
labels_dict = {i: chr(65 + i) for i in range(26)}

# Variables to track the current state
sentence = []          # Letters forming the sentence
current_letter = "?"   # Currently predicted letter
final_sentence = ""    # Final complete sentence

# Function to capture video frames and process hand gestures
def generate_frames():
    global current_letter, sentence, final_sentence
    cap = cv2.VideoCapture(0)  # Open webcam

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Convert the frame from BGR to RGB for MediaPipe
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(frame_rgb)  # Process the frame to detect hands

        current_letter = "?"

        if results.multi_hand_landmarks:
            landmarks = results.multi_hand_landmarks[0]

            # Draw hand landmarks and connections
            for hand_landmarks in results.multi_hand_landmarks:
                mp_drawing.draw_landmarks(
                    frame, hand_landmarks, mp_hands.HAND_CONNECTIONS,
                    mp_drawing.DrawingSpec(color=(0, 255, 0), thickness=2, circle_radius=4),
                    mp_drawing.DrawingSpec(color=(255, 0, 0), thickness=2)
                )

            # Prepare data for prediction
            data_aux = []
            x_, y_ = [lm.x for lm in landmarks.landmark], [lm.y for lm in landmarks.landmark]
            for lm in landmarks.landmark:
                data_aux.append(lm.x - min(x_))
                data_aux.append(lm.y - min(y_))

            # Ensure the input size matches expected size for the model (padding if necessary)
            data_aux = data_aux[:42] + [0] * max(0, 42 - len(data_aux))
            prediction = model.predict([np.asarray(data_aux)])  # Predict gesture
            current_letter = labels_dict.get(int(prediction[0]), "?")  # Map prediction to letter

        # Overlay current state on the frame
        cv2.putText(frame, f"Current Letter: {current_letter}", (50, 50),
                    cv2.FONT_HERSHEY_SIMPLEX, 1.3, (0, 0, 255), 3)
        cv2.putText(frame, "Forming Sentence: " + "".join(sentence), (50, 100),
                    cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 2)
        cv2.putText(frame, "Final Sentence: " + final_sentence, (50, 150),
                    cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 255, 0), 2)

        # Show the processed frame
        cv2.imshow("Sign Language Detection", frame)

        # Handle keyboard inputs:
        key = cv2.waitKey(1) & 0xFF
        if key == ord('k'):
            if current_letter != "?":
                sentence.append(current_letter)  # Save current letter
                print(f"Saved Letter: {current_letter}")
        elif key == ord('s'):
            sentence.append(" ")  # Add space
            print("Added Space.")
        elif key == ord('d'):
            if sentence:
                deleted_letter = sentence.pop()  # Delete last letter
                print(f"Deleted Letter: {deleted_letter}")
            else:
                print("No letter to delete.")
        elif key == ord('p'):
            final_sentence = "".join(sentence)  # Finalize sentence
            print(f"Final Sentence Printed: {final_sentence}")
            sentence = []
        elif key == ord('q'):
            print("Exiting...")  # Quit the loop
            break

    # Release resources
    cap.release()
    cv2.destroyAllWindows()

# Flask route to serve the video feed
@app.route('/video_feed')
def video_feed():
    return Response(generate_frames(), mimetype='multipart/x-mixed-replace; boundary=frame')

# Function to run the Flask app in a separate thread
def run_app():
    app.run(debug=False, use_reloader=False)

# Start Flask server in a background thread
flask_thread = threading.Thread(target=run_app)
flask_thread.start()

# Start gesture detection loop in the main thread
generate_frames()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit


Exiting...
Saved Letter: H
Saved Letter: E
Saved Letter: L
Saved Letter: L
Saved Letter: O
Final Sentence Printed: HELLO
Exiting...


127.0.0.1 - - [28/Jun/2025 13:04:37] "GET /video_feed HTTP/1.1" 200 -


Exiting...


127.0.0.1 - - [28/Jun/2025 13:14:29] "GET /video_feed HTTP/1.1" 200 -
