In [None]:
import cv2
import numpy as np
import pyttsx3
from tensorflow.keras.models import load_model
import tensorflow as tf
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import img_to_array
import time

# Load models with error handling
def load_asl_model(model_path):
    try:
        model = tf.keras.models.load_model(model_path, compile=False)
        return model
    except ValueError as e:
        print("Error loading model:", e)
        return None

asl_model = load_asl_model('asl_cnn_model.h5')
emotion_model = load_asl_model('fer2013_emotion_model.h5')

# Define labels
asl_labels = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F', 6: 'G', 7: 'H', 8: 'I', 9: 'J', 10: 'K', 11: 'L', 12: 'M', 13: 'N', 14: 'O', 15: 'P', 16: 'Q', 17: 'R', 18: 'S', 19: 'T', 20: 'U', 21: 'V', 22: 'W', 23: 'X', 24: 'Y', 25: 'Z'}
emotion_labels = {0: 'Angry', 1: 'Disgust', 2: 'Fear', 3: 'Happy', 4: 'Sad', 5: 'Surprise', 6: 'Neutral'}

# Initialize pyttsx3 for speech
engine = pyttsx3.init()

# Function to predict emotion
def predict_emotion(face_img):
    face_img = cv2.cvtColor(face_img, cv2.COLOR_BGR2GRAY)
    face_img = cv2.resize(face_img, (48, 48))
    face_img = face_img.astype("float32") / 255.0
    face_img = np.expand_dims(face_img, axis=-1)
    face_img = np.expand_dims(face_img, axis=0)
    emotion_prediction = emotion_model.predict(face_img)
    emotion_index = np.argmax(emotion_prediction)
    return emotion_labels[emotion_index]

# Function to predict ASL
def predict_asl(sign_img):
    sign_img = cv2.resize(sign_img, (64, 64))
    sign_img = sign_img.astype('float32') / 255.0
    sign_img = np.expand_dims(sign_img, axis=-1)
    sign_img = np.expand_dims(sign_img, axis=0)
    asl_prediction = asl_model.predict(sign_img)
    asl_index = np.argmax(asl_prediction)
    return asl_labels[asl_index]

# Start webcam
cap = cv2.VideoCapture(0)

while True:
    ret, frame = cap.read()
    
    if not ret:
        break

    # Display the frame
    cv2.imshow("Webcam Input", frame)

    # Detect face and predict emotion
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))

    if len(faces) > 0:
        x, y, w, h = faces[0]
        face_img = frame[y:y + h, x:x + w]
        emotion = predict_emotion(face_img)
        cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
        cv2.putText(frame, emotion, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)

    # Predict ASL
    asl_sign = predict_asl(frame)
    cv2.putText(frame, f"ASL: {asl_sign}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)
    
    # Text to speech
    engine.say(f"Sign Detected: {asl_sign}")
    engine.runAndWait()

    cv2.imshow("ASL and Emotion Recognition", frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


In [3]:
!pip install python-pptx


Collecting python-pptx




  Downloading python_pptx-1.0.2-py3-none-any.whl (472 kB)
Installing collected packages: python-pptx
Successfully installed python-pptx-1.0.2


In [10]:
from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.dml.color import RGBColor

# Create a presentation object
prs = Presentation()

# Slide 1: Title Slide
slide_1 = prs.slides.add_slide(prs.slide_layouts[0])
title = slide_1.shapes.title
subtitle = slide_1.placeholders[1]

title.text = "Real-Time American Sign Language to Speech System using Deep Learning"
subtitle.text = "Integrating Sign Recognition and Emotion-Aware Speech Synthesis\nPresented by: [Your Name]\nInstitution: [Your Institution Name]\nCourse: Design and Analysis of Algorithms\nDate: [Insert Date]"

# Styling title
title.text_frame.paragraphs[0].font.size = Pt(18)
subtitle.text_frame.paragraphs[0].font.size = Pt(18)
subtitle.text_frame.paragraphs[0].font.bold = True
subtitle.text_frame.paragraphs[0].font.color.rgb = RGBColor(0, 102, 204)

# Slide 2: Introduction
slide_2 = prs.slides.add_slide(prs.slide_layouts[1])
title = slide_2.shapes.title
content = slide_2.shapes.placeholders[1]

title.text = "Introduction"
content.text = (
    "Communication is a basic human need, but individuals who are deaf or speech-impaired often struggle due to lack of accessible communication tools.\n"
    "Sign Language is a rich and expressive visual language.\n"
    "This project bridges communication by recognizing ASL signs and converting them into emotionally contextualized speech using AI.\n"
    "Emphasis on real-time interaction and accurate prediction through deep learning."
)

# Slide 3: Problem Statement
slide_3 = prs.slides.add_slide(prs.slide_layouts[1])
title = slide_3.shapes.title
content = slide_3.shapes.placeholders[1]

title.text = "Problem Statement"
content.text = (
    "Challenge: Bridging the communication barrier between hearing-impaired and hearing individuals.\n\n"
    "Gap in Current Systems:\n"
    "- Many systems are expensive, require gloves or external sensors.\n"
    "- Lack of emotion awareness.\n"
    "- Not hardware-friendly or deployable.\n\n"
    "Goal: Develop a webcam-based, real-time, emotion-aware ASL-to-speech system using CNN."
)

# Slide 4: Objectives
slide_4 = prs.slides.add_slide(prs.slide_layouts[1])
title = slide_4.shapes.title
content = slide_4.shapes.placeholders[1]

title.text = "Objectives"
content.text = (
    "1. Train deep learning models to recognize static ASL alphabet signs.\n"
    "2. Recognize facial expressions to capture emotional tone.\n"
    "3. Generate natural, expressive speech from sign input.\n"
    "4. Deploy the system using a simple camera without extra hardware."
)

# Slide 5: Sustainable Development Goals (SDGs)
slide_5 = prs.slides.add_slide(prs.slide_layouts[1])
title = slide_5.shapes.title
content = slide_5.shapes.placeholders[1]

title.text = "Sustainable Development Goals (SDGs)"
content.text = (
    "Goal 4: Quality Education – Provides educational tools for inclusive communication.\n"
    "Goal 10: Reduced Inequality – Promotes digital inclusion for hearing-impaired individuals.\n"
    "Goal 9: Industry, Innovation and Infrastructure – Utilizes AI in developing accessible tools."
)

# Slide 6: Dataset Overview
slide_6 = prs.slides.add_slide(prs.slide_layouts[1])
title = slide_6.shapes.title
content = slide_6.shapes.placeholders[1]

title.text = "Dataset Overview"
content.text = (
    "ASL Dataset:\n"
    "- 29 Classes (A–Z, 'del', 'nothing', 'space')\n"
    "- ~87,000 images, RGB, 200x200 px\n\n"
    "FER2013 Emotion Dataset:\n"
    "- 7 emotions: Angry, Disgust, Fear, Happy, Sad, Surprise, Neutral\n"
    "- Grayscale, 48x48 px, ~35,000 images"
)

# Slide 7: Preprocessing Steps
slide_7 = prs.slides.add_slide(prs.slide_layouts[1])
title = slide_7.shapes.title
content = slide_7.shapes.placeholders[1]

title.text = "Preprocessing Steps"
content.text = (
    "1. Resize and normalize images\n"
    "2. Label encoding for ASL letters and emotions\n"
    "3. One-hot encoding of labels\n"
    "4. Augmentation: Random flip, rotation, shift (optional)\n"
    "5. Split: 80% training, 20% validation"
)

# Slide 8: CNN Architecture for ASL Recognition
slide_8 = prs.slides.add_slide(prs.slide_layouts[1])
title = slide_8.shapes.title
content = slide_8.shapes.placeholders[1]

title.text = "CNN Architecture for ASL Recognition"
content.text = (
    "Input: 200x200x3 image\n\n"
    "3 Convolutional Blocks:\n"
    "- Conv2D → ReLU → MaxPooling\n\n"
    "Flatten Layer\n"
    "Dense(128) → Dropout(0.5)\n"
    "Output Layer: 29 neurons (softmax)\n\n"
    "Accuracy: ~98%"
)

# Slide 9: CNN Architecture for Emotion Detection
slide_9 = prs.slides.add_slide(prs.slide_layouts[1])
title = slide_9.shapes.title
content = slide_9.shapes.placeholders[1]

title.text = "CNN Architecture for Emotion Detection"
content.text = (
    "Input: 48x48 grayscale face image\n\n"
    "Conv2D (3 layers): ReLU + MaxPooling\n\n"
    "Flatten → Dense(128) → Dropout\n"
    "Output: 7-class softmax\n\n"
    "Accuracy: ~89%"
)

# Slide 10: System Flow (Architecture Diagram)
slide_10 = prs.slides.add_slide(prs.slide_layouts[5])
title = slide_10.shapes.title
title.text = "System Flow (Architecture Diagram)"
# Insert the actual diagram as an image
slide_10.shapes.add_picture('sign_to_speech_archi.png', Inches(0.5), Inches(1.5), width=Inches(9))

# Slide 11: Real-Time Pipeline Details
slide_11 = prs.slides.add_slide(prs.slide_layouts[1])
title = slide_11.shapes.title
content = slide_11.shapes.placeholders[1]

title.text = "Real-Time Pipeline Details"
content.text = (
    "1. Open webcam\n"
    "2. Define hand and face regions\n"
    "3. Predict character every 10 frames\n"
    "4. Maintain sliding buffer for prediction stability\n"
    "5. Capture sentence\n"
    "6. Detect emotion\n"
    "7. Generate speech with appropriate tone and volume"
)

# Slide 14: Experimental Setup
slide_14 = prs.slides.add_slide(prs.slide_layouts[1])
title = slide_14.shapes.title
content = slide_14.shapes.placeholders[1]

title.text = "Experimental Setup"
content.text = (
    "Hardware: i5 Processor, 8GB RAM, No GPU\n"
    "Environment: Jupyter Notebook\n\n"
    "Models:\n"
    "- ASL CNN Model (.keras)\n"
    "- FER2013 CNN Model (.keras)\n\n"
    "Accuracy:\n"
    "- ASL Model: ~98%\n"
    "- Emotion Model: ~89%\n\n"
    "Real-time latency: ~150ms/frame"
)

# Slide 15: Existing Solutions vs Our Model
slide_15 = prs.slides.add_slide(prs.slide_layouts[1])
title = slide_15.shapes.title
content = slide_15.shapes.placeholders[1]

title.text = "Existing Solutions vs Our Model"
content.text = (
    "Feature        | SignAll | DeepASL | Smart Gloves | Proposed System\n"
    "---------------------------------------------------------\n"
    "Hardware-Free  | No      | No      | No           | Yes\n"
    "Emotion Detection| No    | No      | No           | Yes\n"
    "Real-Time      | Yes     | No      | Yes          | Yes\n"
    "Cost-effective | No      | No      | No           | Yes\n"
    "Open Source    | No      | Yes     | No           | Yes"
)

# Slide 16: Key Challenges
slide_16 = prs.slides.add_slide(prs.slide_layouts[1])
title = slide_16.shapes.title
content = slide_16.shapes.placeholders[1]

title.text = "Key Challenges"
content.text = (
    "1. Visual similarity between some signs (e.g., M vs N)\n"
    "2. Lighting and background noise\n"
    "3. Misclassification due to hand misplacement\n"
    "4. Real-time performance on low-end hardware\n"
    "5. Integrating emotional tone to speech output"
)

# Slide 17: Future Scope
slide_17 = prs.slides.add_slide(prs.slide_layouts[1])
title = slide_17.shapes.title
content = slide_17.shapes.placeholders[1]

title.text = "Future Scope"
content.text = (
    "1. Use hand landmark models like MediaPipe\n"
    "2. Extend support for dynamic gestures\n"
    "3. Add full sentence and grammar construction\n"
    "4. Deploy as mobile/web app\n"
    "5. Add multilingual speech output"
)

# Slide 18: Conclusion
slide_18 = prs.slides.add_slide(prs.slide_layouts[1])
title = slide_18.shapes.title
content = slide_18.shapes.placeholders[1]

title.text = "Conclusion"
content.text = (
    "1. Successfully created a novel, real-time ASL-to-speech system\n"
    "2. Incorporated emotion detection to enhance expressiveness\n"
    "3. Outperforms many existing solutions in affordability and scope\n"
    "4. Promotes inclusion and accessibility using AI"
)

# Slide 19: References
slide_19 = prs.slides.add_slide(prs.slide_layouts[1])
title = slide_19.shapes.title
content = slide_19.shapes.placeholders[1]

title.text = "References"
content.text = (
    "1. Kaggle ASL Alphabet Dataset\n"
    "2. FER2013 Emotion Dataset\n"
    "3. TensorFlow/Keras Documentation\n"
    "4. OpenCV Python Docs\n"
    "5. Pyttsx3 TTS Engine\n"
    "6. Google Mediapipe"
)

# Save presentation
prs.save("ASL_to_Speech_Full_Presentation.pptx")

