In [None]:
!pip install openai-whisper
!pip install pyaudio sounddevice ffmpeg-python
!pip install flask flask_cors
!pip install pyngrok
!pip install TTS  
!pip install pronouncing

In [None]:
!ngrok config add-authtoken my_token

In [None]:
import os
import time
import tempfile
import re
import torch
import whisper
from flask import Flask, request, jsonify, send_file
from flask_cors import CORS
from pyngrok import ngrok
from TTS.api import TTS

# Flask application; flask_cors is applied to the whole app.
app = Flask(__name__)
CORS(app)

# Pick the torch device once: CUDA when torch reports it, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Speech-to-text (Whisper "tiny.en") and text-to-speech (Glow-TTS, LJSpeech)
# models, both moved onto the selected device at import time.
whisper_model = whisper.load_model("tiny.en").to(device)
tts_model = TTS(model_name="tts_models/en/ljspeech/glow-tts").to(device)

# In-memory session state keyed by session id; entries are created on demand
# by get_session() and live only as long as the process does.
sessions = {}

# Words that trigger the "kind words" reply in the safety filter.
BAD_WORDS = ["badword", "idiot", "stupid", "dumb", "hate", "kill"]

# Curriculum: every letter of ALPHABET has three real example words that
# start with that letter (the first two are read out when a letter is taught).
ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
COMMON_WORDS = {
    "A": ["Apple", "Ant", "Alligator"],
    "B": ["Ball", "Bat", "Banana"],
    "C": ["Cat", "Car", "Cookie"],
    "D": ["Dog", "Duck", "Door"],
    "E": ["Elephant", "Egg", "Elbow"],
    "F": ["Fish", "Fox", "Flower"],
    "G": ["Goat", "Guitar", "Grapes"],
    "H": ["Hat", "House", "Horse"],
    "I": ["Igloo", "Insect", "Ice"],
    "J": ["Juice", "Jump", "Jacket"],
    "K": ["Kite", "King", "Key"],
    "L": ["Lion", "Leaf", "Lamp"],
    "M": ["Moon", "Monkey", "Milk"],
    "N": ["Nest", "Nose", "Nut"],
    "O": ["Orange", "Owl", "Octopus"],
    "P": ["Pig", "Pen", "Pizza"],
    "Q": ["Queen", "Quilt", "Quack"],
    "R": ["Rabbit", "Rain", "Robot"],
    "S": ["Sun", "Snake", "Sock"],
    "T": ["Tiger", "Tree", "Train"],
    "U": ["Umbrella", "Unicorn", "Up"],
    "V": ["Van", "Violin", "Vase"],
    "W": ["Whale", "Water", "Window"],
    "X": ["Xylophone", "X-ray", "Xenon"],
    "Y": ["Yak", "Yellow", "Yogurt"],
    "Z": ["Zebra", "Zoo", "Zipper"],
}

# Maps what a speech recogniser is likely to transcribe when a child says a
# letter name (lower-case word or phrase) to the upper-case letter it stands
# for.  Every letter has at least its bare single-character form as a key, so
# a lone letter is recognised inside a longer phrase as well as on its own.
PRONUNCIATION_MAP = {
    "a": "A",
    "b": "B", "be": "B", "bee": "B",
    "c": "C", "see": "C", "sea": "C", "cee": "C",
    "d": "D", "dee": "D",
    "e": "E", "ee": "E",
    "f": "F", "ef": "F",
    "g": "G", "gee": "G", "jee": "G",
    "h": "H", "aitch": "H",
    "i": "I", "eye": "I",
    "j": "J", "jay": "J",
    "k": "K", "kay": "K",
    "l": "L", "el": "L",
    "m": "M", "em": "M",
    "n": "N", "en": "N",
    "o": "O", "oh": "O",
    "p": "P", "pee": "P",
    "q": "Q", "cue": "Q", "kyu": "Q", "queue": "Q",
    "r": "R", "are": "R", "ar": "R",
    "s": "S", "ess": "S",
    "t": "T", "tee": "T",
    "u": "U", "you": "U",
    "v": "V", "vee": "V",
    "w": "W", "double you": "W", "doubleyou": "W", "double u": "W",
    "x": "X", "ex": "X",
    "y": "Y", "why": "Y",
    "z": "Z", "zed": "Z", "zee": "Z",
}

# Coaching hints for letters that are easy to confuse.  Each row is
# (letter, target sound, how to produce it, letters it is often mistaken for);
# the comprehension expands rows into the per-letter dicts read elsewhere.
PHONETIC_COACHING = {
    letter: {
        "sound": sound,
        "instruction": instruction,
        "common_mistakes": mistakes,
    }
    for letter, sound, instruction, mistakes in (
        ("B", "/b/", "press your lips together and voice it", ["P", "D", "V"]),
        ("P", "/p/", "press your lips together and puff air out", ["B", "F"]),
        ("D", "/d/", "touch your tongue to the roof of your mouth and voice it", ["T", "B", "G"]),
        ("T", "/t/", "touch your tongue to the roof of your mouth and puff air", ["D", "P", "K"]),
        ("C", "/k/", "make a 'kuh' sound from the back of your throat", ["S", "T", "K"]),
        ("H", "/h/", "breathe out gently while saying 'huh'", ["A", "E"]),
    )
}

# Guarantee that COMMON_WORDS has an entry for every letter of ALPHABET by
# generating placeholder words for any letter without a curated list.
# NOTE(review): the generated placeholders ("Kox", "Kat", "Kog", ...) are
# mostly not real words; prefer curated entries in COMMON_WORDS.
for letter in ALPHABET:
    if letter not in COMMON_WORDS:
        COMMON_WORDS[letter] = [f"{letter}ox", f"{letter}at", f"{letter}og"]

def get_session(session_id):
    """Return the state dict for ``session_id``, creating it on first use.

    A new session starts with an empty memory list, no child name, no
    current letter, zero stars and an empty set of completed letters.  The
    same dict object is returned on every call, so callers mutate it in place.
    """
    try:
        return sessions[session_id]
    except KeyError:
        fresh_state = {
            "memory": [],
            "child_name": None,
            "current_letter": None,
            "stars": 0,
            "completed_letters": set(),
        }
        sessions[session_id] = fresh_state
        return fresh_state

def add_memory(session_id, user_text, assistant_text):
    """Record one user/assistant exchange in the session's short-term memory.

    Each entry stores both texts plus the current ``time.time()`` timestamp.
    Only the three most recent exchanges are retained.
    """
    state = get_session(session_id)
    exchange = {
        "user": user_text,
        "assistant": assistant_text,
        "timestamp": time.time(),
    }
    history = state["memory"]
    history.append(exchange)
    # Re-bind to a trimmed copy so the memory never grows past 3 entries.
    state["memory"] = history[-3:]

def extract_letter(text):
    """Return the upper-case letter referred to by ``text``, or ``None``.

    The text is lower-cased, filler words ("the", "letter", "is", "it's",
    "that's") and punctuation are removed, and whitespace is collapsed.
    The result is then resolved in this order:

    1. the whole text is a single ASCII letter;
    2. an explicit request ("teach me X", "show me X", "learn X",
       "i want X", "practice X", "try X") whose following *word* is a letter
       or a spelled-out letter name -- checked before the loose word scan so
       that "I want B" yields B rather than the pronoun "I";
    3. the whole text is a key of ``PRONUNCIATION_MAP`` (covers multi-word
       names such as "double you");
    4. the first word that is a single letter or a ``PRONUNCIATION_MAP`` key.

    Args:
        text: Raw utterance (typed text or a speech transcript).

    Returns:
        A one-character upper-case string, or ``None`` if no letter is found.
    """
    if not text:
        return None

    def resolve(token):
        # A bare ASCII letter stands for itself; anything else must be a
        # known spelled-out letter name.
        if len(token) == 1 and "a" <= token <= "z":
            return token.upper()
        return PRONUNCIATION_MAP.get(token)

    cleaned = text.lower().strip()
    cleaned = re.sub(r'\b(the|letter|is|it\'s|that\'s)\b', '', cleaned)
    cleaned = re.sub(r'[.!?,;:\'"()\[\]{}<>-]', '', cleaned)
    # Removing fillers leaves runs of spaces; collapse them so that phrase
    # lookups and the request patterns below see single spaces.
    cleaned = " ".join(cleaned.split())

    # 1. Direct single-letter answer.
    if len(cleaned) == 1 and "a" <= cleaned <= "z":
        return cleaned.upper()

    # 2. Explicit requests.  The whole following word is captured (not just
    #    its first character) so "teach me something" does not become "S".
    request_pattern = r"\b(?:teach me|show me|learn|i want|practice|try)\s+(\w+)"
    for match in re.finditer(request_pattern, cleaned):
        letter = resolve(match.group(1))
        if letter:
            return letter

    # 3. Whole utterance is a spelled-out letter name.
    if cleaned in PRONUNCIATION_MAP:
        return PRONUNCIATION_MAP[cleaned]

    # 4. First recognisable word anywhere in the utterance.
    for word in cleaned.split():
        letter = resolve(word)
        if letter:
            return letter

    return None

def detect_intent(text, session_id):
    """Classify an utterance into an intent dict for ``generate_response``.

    Checks run in priority order and the first match wins:

    1. introduction ("my name is X", "I'm X", "I am X", "call me X")
       -> ``{"type": "name", "value": X}``;
    2. a teaching request (a word starting with teach/learn/show/want/
       letter/practice) -> ``{"type": "teach", "letter": ...}``, defaulting
       to "A" when no letter is named;
    3. an affirmative answer to a "Ready for ...?" prompt -> teach the next
       letter;
    4. while a letter is being practised, any recognisable letter
       -> ``{"type": "pronunciation", ...}``;
    5. "next" / "skip this" -> teach the next letter;
    6. help/repeat/again/what/how -> ``{"type": "help"}``;
    7. while a letter is being practised, text that merely starts with that
       letter counts as an attempt at it;
    8. hello/hi/hey/start -> ``{"type": "greeting"}``;
    9. anything else -> ``{"type": "chat", "text": <normalised text>}``.

    Keywords are matched against whole words rather than substrings so that
    "this" is not a greeting and "good" is not "go".

    Args:
        text: Raw utterance; empty/None yields ``{"type": "unclear"}``.
        session_id: Key of the session whose state and memory are consulted.

    Returns:
        A dict with a ``"type"`` key plus intent-specific fields.
    """
    if not text:
        return {"type": "unclear"}

    def following_letter(current):
        # With no lesson in progress "next" starts the alphabet at A;
        # otherwise advance by one, wrapping from Z back to A.
        if not current:
            return "A"
        return chr(ord(current) + 1) if current < "Z" else "A"

    # Clean the text first - remove punctuation and extra spaces
    original_text = text
    text = re.sub(r'[.!?,;:\'"()\[\]{}<>]', '', text).lower().strip()
    tokens = set(text.split())

    session = get_session(session_id)
    current_letter = session.get("current_letter")

    # Name detection.  Apostrophes were stripped above, so "i'm" arrives as
    # "im"; states such as "I'm ready" must not be mistaken for a name.
    not_names = {
        "ready", "done", "tired", "finished", "good", "fine", "ok", "okay",
        "not", "sorry", "here", "sure", "bored", "stuck",
    }
    name_match = re.search(r"\b(?:my name is|i'?m|i am|call me) (\w+)", text)
    if name_match and name_match.group(1) not in not_names:
        return {"type": "name", "value": name_match.group(1).title()}

    # Letter learning requests (prefix match keeps "teaching", "letters", ...)
    teach_keywords = ("teach", "learn", "show", "want", "letter", "practice")
    if any(token.startswith(teach_keywords) for token in tokens):
        letter = extract_letter(text)
        if letter:
            return {"type": "teach", "letter": letter}
        # If no specific letter mentioned, start with A
        return {"type": "teach", "letter": "A"}

    # Affirmative reply to the assistant's "Ready for X?" prompt
    last_assistant_msg = session["memory"][-1]["assistant"].lower() if session["memory"] else ""
    is_ready_prompt = "ready for" in last_assistant_msg or "ready?" in last_assistant_msg
    ready_words = {"ready", "yes", "ok", "okay", "sure", "yeah", "yep", "go"}
    if is_ready_prompt and tokens & ready_words:
        return {"type": "teach", "letter": following_letter(current_letter)}

    # Pronunciation attempt: a recognisable letter while one is being practised
    if current_letter:
        detected_letter = extract_letter(text)
        if detected_letter:
            return {"type": "pronunciation", "letter": detected_letter, "original_text": original_text}

    # Explicit navigation
    if "next" in tokens or "skip this" in text:
        return {"type": "teach", "letter": following_letter(current_letter)}

    if tokens & {"help", "repeat", "again", "what", "whats", "how"}:
        return {"type": "help"}

    # A word starting with the current letter counts as an attempt at it.
    # This runs after navigation/help so that "help" while practising H or
    # "next" while practising N is not scored as a correct answer.
    if current_letter and text.startswith(current_letter.lower()):
        return {"type": "pronunciation", "letter": current_letter, "original_text": original_text}

    # Greeting detection
    if tokens & {"hello", "hi", "hey", "start"}:
        return {"type": "greeting"}

    return {"type": "chat", "text": text}

def generate_response(intent, session_id):
    """Build the assistant's reply for ``intent`` and update session state.

    Side effects by intent type: ``name`` stores the child's name, ``teach``
    sets the current letter, and a correct ``pronunciation`` adds a star and
    marks the letter completed.  Unknown intent types get a generic prompt.
    The "<name>, " prefix uses the name stored *before* this call.
    """
    state = get_session(session_id)
    known_name = state['child_name']
    name = f"{known_name}, " if known_name else ""
    kind = intent["type"]

    if kind == "name":
        state["child_name"] = intent["value"]
        return f"Nice to meet you, {intent['value']}! Which letter would you like to learn first? You can say 'teach me A' or any letter!"

    if kind == "greeting":
        return f"{name}Hello! Ready to learn letters? Which letter would you like to start with?"

    if kind == "teach":
        letter = intent.get("letter") or "A"
        state["current_letter"] = letter
        examples = ", ".join(COMMON_WORDS.get(letter, [f"{letter}ox"])[:2])

        # Only letters with a coaching entry get the extra sound reminder.
        coaching = PHONETIC_COACHING.get(letter, {})
        if coaching:
            sound_guide = f" Remember: {coaching['sound']} - {coaching['instruction']}"
        else:
            sound_guide = ""

        return f"{name}Let's learn {letter}! For example: {examples}. Now say '{letter}' for me!{sound_guide}"

    if kind == "pronunciation":
        expected = state.get("current_letter")
        heard = intent.get("letter")

        if not expected:
            return f"{name}Which letter would you like to practice? Say 'teach me' and a letter!"

        if heard == expected:
            state["stars"] += 1
            state["completed_letters"].add(expected)

            # Finishing Z gets the end-of-alphabet message instead of a
            # "Ready for ...?" prompt.
            if expected == "Z":
                return f"🎉 Fantastic! You said {expected} correctly! You've completed the entire alphabet! You earned {state['stars']} stars total! Want to practice more letters?"

            next_letter = chr(ord(expected) + 1) if expected < "Z" else "A"
            return f"Perfect! You said {expected} correctly! You earned a star! ⭐ Ready for {next_letter}?"

        # Wrong or missing letter: both replies quote the target sound.
        coaching = PHONETIC_COACHING.get(expected, {})
        sound = coaching.get("sound", f"/{expected.lower()}/")

        if heard:
            instruction = coaching.get("instruction", "try again")
            return f"Good try! I heard '{heard}' but let's practice {expected}. Try {sound} - {instruction}. Say '{expected}' again!"

        return f"I couldn't hear a clear letter. Try saying '{expected}' loudly and clearly! Remember: {sound}"

    if kind == "help":
        current = state.get("current_letter")
        if not current:
            return f"{name}Say 'teach me' and a letter to start learning, like 'teach me C'!"

        help_text = f"Try saying '{current}' clearly!"
        coaching = PHONETIC_COACHING.get(current, {})
        if coaching:
            help_text += f" Remember: {coaching['sound']} - {coaching['instruction']}"
        return help_text

    if kind == "chat":
        # Casual conversation: react to praise or to wanting to stop.
        said = intent.get("text", "").lower()
        if any(word in said for word in ["good", "great", "awesome", "cool"]):
            return f"{name}That's wonderful! Which letter would you like to learn next?"
        if any(word in said for word in ["tired", "stop", "done", "finish"]):
            return f"{name}Great job today! You earned {state['stars']} stars! Come back anytime to learn more letters!"
        return f"{name}That's interesting! Would you like to learn a letter? Say 'teach me' and any letter!"

    return f"{name}I didn't quite understand. Try saying 'teach me' and a letter, or just say the letter clearly!"

@app.route("/")
def home():
    """Serve a minimal HTML landing page confirming the API is up."""
    heading = "<h1>Alphabet Teacher API</h1>"
    status_line = "<p>Ready for voice and text input</p>"
    return heading + status_line

@app.route("/text_input", methods=["POST"])
def text_input():
    """Handle a typed utterance: POST JSON ``{"text": ..., "session_id": ...}``.

    Returns 400 when the body is not a JSON object or carries no non-blank
    string ``text``.  Text containing a word that starts with an entry of
    ``BAD_WORDS`` gets the same "kind words" reply as the ``/asr`` endpoint.
    Otherwise the utterance goes through intent detection and response
    generation, is appended to the session memory, and the reply is returned
    with the session state and progress.

    ``feedback.ok`` is true only for a pronunciation attempt whose detected
    letter equals the letter being practised.
    """
    # silent=True: a missing/incorrect Content-Type or malformed JSON becomes
    # a clean 400 below instead of an exception raised by Flask.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}
    text = data.get("text", "")
    # Session ids are dict keys and URL segments elsewhere; normalise to str.
    session_id = str(data.get("session_id") or "default")

    if not isinstance(text, str) or not text.strip():
        return jsonify({"error": "No text provided"}), 400

    # Safety filter.  Anchored at a word start so "killing" is caught but
    # "skill" and "whatever" are not.
    bad_word_pattern = r"\b(?:" + "|".join(re.escape(word) for word in BAD_WORDS) + ")"
    if BAD_WORDS and re.search(bad_word_pattern, text.lower()):
        return jsonify({
            "transcript": "[filtered]",
            "assistant_text": "Let's use kind words! Which letter would you like to learn?",
            "feedback": {"ok": False}
        })

    intent = detect_intent(text, session_id)
    response_text = generate_response(intent, session_id)
    add_memory(session_id, text, response_text)
    session = get_session(session_id)

    print(f"Intent: {intent}")
    print(f"Response: {response_text}")

    # Judge success from state rather than from the wording of the reply
    # (the end-of-alphabet success message does not contain "Perfect!").
    is_correct = (
        intent["type"] == "pronunciation"
        and session["current_letter"] is not None
        and intent.get("letter") == session["current_letter"]
    )

    return jsonify({
        "transcript": text,
        "assistant_text": response_text,
        "feedback": {"ok": is_correct},
        "memory": session["memory"],
        "state": {"child_name": session["child_name"], "current_letter": session["current_letter"]},
        "progress": {"stars": session["stars"], "completed_letters": list(session["completed_letters"])},
        "intent": intent  # Added for debugging
    })

@app.route("/asr", methods=["POST"])
def asr():
    """Handle a spoken utterance uploaded as multipart form data.

    Expects an ``audio`` file and an optional ``session_id`` form field.
    The upload is written to a temporary file, transcribed with the Whisper
    model, and the temporary file is always removed afterwards.  An empty or
    failed transcription yields a "try again" reply; a transcript containing
    a word that starts with an entry of ``BAD_WORDS`` yields the "kind words"
    reply.  Otherwise the transcript goes through intent detection and
    response generation and is recorded in the session memory.

    ``feedback.ok`` is true only for a pronunciation attempt whose detected
    letter equals the letter being practised.
    """
    audio_file = request.files.get("audio")
    session_id = request.form.get("session_id", "default")

    if not audio_file:
        return jsonify({"error": "No audio file"}), 400

    # Reserve a path and close the handle right away; the upload and the
    # transcriber then open the file by name.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        tmp_path = tmp.name
    try:
        # Saving happens inside the try so a failed write still reaches the
        # cleanup in ``finally`` instead of leaking the temporary file.
        audio_file.save(tmp_path)
        result = whisper_model.transcribe(tmp_path, language="en")
        transcript = result.get("text", "").strip()
    except Exception as e:
        # Best effort: any save/decoding/model failure is reported to the
        # client as "couldn't hear you" rather than a server error.
        print(f"Transcription error: {e}")
        transcript = ""
    finally:
        try:
            os.unlink(tmp_path)
        except OSError as e:
            print(f"Temp file cleanup error: {e}")

    if not transcript:
        return jsonify({
            "transcript": "",
            "assistant_text": "I couldn't hear you clearly. Please try again!",
            "feedback": {"ok": False}
        })

    # Safety filter.  Anchored at a word start so "killing" is caught but
    # "skill" and "whatever" are not.
    bad_word_pattern = r"\b(?:" + "|".join(re.escape(word) for word in BAD_WORDS) + ")"
    if BAD_WORDS and re.search(bad_word_pattern, transcript.lower()):
        return jsonify({
            "transcript": "[filtered]",
            "assistant_text": "Let's use kind words! Which letter would you like to learn?",
            "feedback": {"ok": False}
        })

    intent = detect_intent(transcript, session_id)
    response_text = generate_response(intent, session_id)
    add_memory(session_id, transcript, response_text)
    session = get_session(session_id)

    print(f"Transcript: {transcript}")
    print(f"Intent: {intent}")
    print(f"Response: {response_text}")

    # For pronunciation attempts show the detected letter instead of the raw
    # transcript (e.g. "C" rather than "see").
    display_text = transcript
    if intent["type"] == "pronunciation" and intent.get("letter"):
        display_text = intent["letter"]

    # Judge success from state rather than from the wording of the reply
    # (the end-of-alphabet success message does not contain "Perfect!").
    is_correct = (
        intent["type"] == "pronunciation"
        and session["current_letter"] is not None
        and intent.get("letter") == session["current_letter"]
    )

    return jsonify({
        "transcript": display_text,
        "assistant_text": response_text,
        "feedback": {"ok": is_correct},
        "memory": session["memory"],
        "state": {"child_name": session["child_name"], "current_letter": session["current_letter"]},
        "progress": {"stars": session["stars"], "completed_letters": list(session["completed_letters"])},
        "intent": intent  # Added for debugging
    })

@app.route("/tts", methods=["POST"])
def tts():
    """Synthesise speech for POST JSON ``{"text": ...}`` and return WAV audio.

    Returns 400 when no usable text is supplied (including text that is
    empty once emojis and special characters are stripped) and 500 when
    synthesis fails.  The audio is rendered to a temporary file, read into
    memory, and the file is removed before the response is sent, so no
    temporary files accumulate across requests.
    """
    import io  # local import: only this handler needs an in-memory buffer

    data = request.get_json(silent=True)
    text = data.get("text", "") if isinstance(data, dict) else ""

    if not text or not isinstance(text, str):
        return jsonify({"error": "No text provided"}), 400

    # Keep word characters, whitespace and basic punctuation only.  This
    # already drops emojis, so no separate emoji pass is needed.
    clean_text = re.sub(r'[^\w\s\.,!?\-]', '', text).strip()
    if not clean_text:
        return jsonify({"error": "No text provided"}), 400

    # Reserve a path and close the handle; the TTS model writes by name.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        wav_path = tmp.name
    try:
        tts_model.tts_to_file(text=clean_text, file_path=wav_path)
        with open(wav_path, "rb") as wav_file:
            audio = io.BytesIO(wav_file.read())
    except Exception as e:
        print(f"TTS error: {e}")
        return jsonify({"error": "TTS failed"}), 500
    finally:
        # The bytes are in memory (or synthesis failed); either way the file
        # on disk is no longer needed.
        try:
            os.unlink(wav_path)
        except OSError as e:
            print(f"Temp file cleanup error: {e}")

    return send_file(audio, mimetype="audio/wav")

@app.route("/reset/<session_id>", methods=["POST"])
def reset_session(session_id):
    """Discard all stored state for ``session_id``; unknown ids are a no-op."""
    sessions.pop(session_id, None)
    payload = {"status": "reset", "message": "Session reset successfully"}
    return jsonify(payload)

@app.route("/session/<session_id>", methods=["GET"])
def get_session_info(session_id):
    """Return a JSON summary of the session's name, letter and progress.

    The session is looked up through ``get_session``, so requesting an
    unknown id creates a fresh session and reports its initial values.
    """
    state = get_session(session_id)
    summary = {"session_id": session_id}
    for field in ("child_name", "current_letter", "stars"):
        summary[field] = state[field]
    # Sets are not JSON-serialisable; expose completed letters as a list.
    summary["completed_letters"] = list(state["completed_letters"])
    summary["memory_count"] = len(state["memory"])
    return jsonify(summary)

if __name__ == "__main__":
    # Open the ngrok tunnel to local port 3000 first so its public URL can be
    # announced, then start Flask on that same port on all interfaces.
    public_url = ngrok.connect(3000)
    print(
        f"🚀 Alphabet Teacher Server running at: {public_url}",
        "📚 Ready to teach letters with enhanced phonetic recognition!",
        sep="\n",
    )
    app.run(host="0.0.0.0", port=3000)