In [None]:
import os
import torch
import logging
import tempfile
import requests
import sounddevice as sd
import wave
import asyncio
from flask import Flask, render_template, request, jsonify, Response
from deepgram import Deepgram
# from google import genai
from waitress import serve
import concurrent.futures
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the DialoGPT-medium model
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")


# API Configuration
# SECURITY: credentials belong in the environment, not in source control.
# Each key is read from an environment variable of the same name first; the
# literals are kept only as fallbacks so existing setups keep working. They
# are visible to anyone who can read this file and should be rotated, after
# which the fallbacks can be deleted.
DEEPGRAM_API_KEY = os.environ.get(
    "DEEPGRAM_API_KEY", "14a53259c35bbe5d06ba288ed295228348aaebe6"
)
ELEVEN_LABS_API_KEY = os.environ.get(
    "ELEVEN_LABS_API_KEY", "sk_b10646eba101273b1379f61958fec54857162174f82ec117"
)
ELEVEN_LABS_VOICE_ID = "2EiwWnXFnvU5JabPnv8n"
# The Gemini backend is optional: no key means the Gemini client is not created.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

# Initialize Flask App
app = Flask(__name__)

# Configure Logging
logging.basicConfig(
    level=logging.INFO,
    format="%(message)s",
    handlers=[logging.StreamHandler()]
)

# Initialize Gemini AI Client (optional)
# The top-of-file `from google import genai` import is commented out, so the
# module is imported here and tolerated when missing. `client` stays None
# unless both the library and a key are available; previously these lines
# raised NameError because neither `GEMINI_API_KEY` nor `genai` was defined.
try:
    from google import genai
except ImportError:
    genai = None

client = None
if genai is not None and GEMINI_API_KEY:
    client = genai.Client(api_key=GEMINI_API_KEY)

# Initialize Deepgram Client
dg_client = Deepgram(DEEPGRAM_API_KEY)

# Chatbot State
class State:
    """In-memory conversation log kept as an ordered list of messages.

    Each message is a dict with the keys ``"role"`` and ``"content"``.
    """

    def __init__(self):
        # Messages in the order they were added.
        self.messages = []

    def add_message(self, role, content):
        """Append one message with the given ``role`` and ``content``."""
        self.messages.append({"role": role, "content": content})

    def get_messages(self):
        """Return the live message list (the same object, not a copy)."""
        return self.messages


# Module-wide conversation state, seeded with the system prompt.
state = State()
state.add_message("system", "You are a helpful servant of the royal family. Keep responses simple and royal.")

# Generate AI Response using Google Gemini
def call_model(state):
    """Send the whole conversation to Gemini and return the reply text.

    Every message from ``state.get_messages()`` is rendered as
    ``"<role>: <content>"``; the lines are joined with newlines and passed as
    the ``contents`` of a ``gemini-2.0-flash-001`` request made through the
    module-level ``client``.
    """
    rendered = (
        f"{entry['role']}: {entry['content']}" for entry in state.get_messages()
    )
    transcript = "\n".join(rendered)
    reply = client.models.generate_content(
        model='gemini-2.0-flash-001',
        contents=transcript,
    )
    return reply.text

def get_gemini_response(user_text):
    """Answer ``user_text`` through the Gemini-backed ``call_model`` helper.

    The user message and the reply are both recorded in the module-level
    ``state``. This function was previously a first definition of
    ``get_chatbot_response`` that the DialoGPT definition below silently
    replaced; it now has its own name so it stays callable, while
    ``get_chatbot_response`` still resolves to the DialoGPT version.
    """
    state.add_message("user", user_text)
    response = call_model(state)
    state.add_message("assistant", response)
    return response


# Token ids of the DialoGPT conversation so far; None until the first turn.
# It must exist at module level because get_chatbot_response declares it
# ``global`` and reads it before ever assigning it.
chat_history_ids = None


def get_chatbot_response(user_text):
    """Generate a DialoGPT reply to ``user_text``, continuing the conversation.

    The encoded user turn is appended to the stored history, the model
    generates a continuation, and the full sequence becomes the new history.

    Args:
        user_text: The user's utterance as plain text.

    Returns:
        The decoded text of the newly generated tokens only.
    """
    global chat_history_ids

    # Total budget (prompt + reply) handed to generate(), as before.
    max_total_tokens = 1000
    # Tokens kept free for the reply. Without trimming, a long conversation
    # would eventually fill the whole budget and leave no room to generate.
    reply_reserve = 128

    # Encode user input, add EOS token
    new_user_input_ids = tokenizer.encode(user_text + tokenizer.eos_token, return_tensors='pt')

    # If there's already chat history, append
    if chat_history_ids is not None:
        bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)
    else:
        bot_input_ids = new_user_input_ids

    # Keep only the most recent tokens so the prompt always fits the budget.
    bot_input_ids = bot_input_ids[:, -(max_total_tokens - reply_reserve):]

    # pad_token_id equals eos_token_id here, so the mask cannot be inferred;
    # the prompt is a single unpadded sequence, hence an all-ones mask.
    chat_history_ids = model.generate(
        bot_input_ids,
        attention_mask=torch.ones_like(bot_input_ids),
        max_length=max_total_tokens,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode only the newly generated tokens
    bot_response = tokenizer.decode(
        chat_history_ids[:, bot_input_ids.shape[-1]:][0],
        skip_special_tokens=True
    )
    return bot_response


# Convert Text to Speech using Eleven Labs
def synthesize_speech(text):
    """Request streamed speech audio for ``text`` from Eleven Labs.

    Posts to the text-to-speech ``/stream`` endpoint of the configured voice
    with a 10 second timeout.

    Returns:
        The streaming ``requests`` response when the status code is 200,
        otherwise ``None``. Failures are logged, never raised.
    """
    endpoint = f"https://api.elevenlabs.io/v1/text-to-speech/{ELEVEN_LABS_VOICE_ID}/stream"
    request_headers = {
        "xi-api-key": ELEVEN_LABS_API_KEY,
        "Content-Type": "application/json",
    }
    voice_settings = {"stability": 0.5, "similarity_boost": 0.75}
    body = {"text": text, "voice_settings": voice_settings}

    try:
        reply = requests.post(
            endpoint,
            headers=request_headers,
            json=body,
            stream=True,
            timeout=10,
        )
        if reply.status_code != 200:
            logging.error(f"Speech synthesis failed: {reply.text}")
            return None
        return reply
    except Exception as exc:
        logging.error(f"Unexpected error during speech synthesis: {exc}")
        return None

# Record Audio from User
def record_audio(duration=3, sample_rate=16000):
    """Record mono 16-bit audio and save it to a temporary WAV file.

    Args:
        duration: Recording length in seconds.
        sample_rate: Sampling rate in Hz.

    Returns:
        Path of the WAV file. The caller is responsible for deleting it.
    """
    logging.info("Recording audio...")
    frame_count = int(duration * sample_rate)
    audio_data = sd.rec(frame_count, samplerate=sample_rate, channels=1, dtype='int16')
    sd.wait()  # block until the recording has finished

    # Reserve a unique path and close the handle immediately, so the file is
    # not left open when wave reopens it by name.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        file_path = tmp.name

    try:
        with wave.open(file_path, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)  # 2 bytes per sample, matching dtype='int16'
            wf.setframerate(sample_rate)
            wf.writeframes(audio_data.tobytes())
    except Exception:
        # Do not leave a half-written temp file behind on failure.
        os.remove(file_path)
        raise

    logging.info(f"Audio saved at {file_path}")
    return file_path

# Transcribe Audio using Deepgram
async def transcribe_audio(file_path):
    """Transcribe the WAV file at ``file_path`` with Deepgram.

    Returns:
        The transcript of the first alternative on the first channel, or the
        literal ``"(No speech detected)"`` when that channel has no
        alternatives.
    """
    options = {'punctuate': True}
    with open(file_path, 'rb') as audio:
        payload = {'buffer': audio, 'mimetype': 'audio/wav'}
        result = await dg_client.transcription.prerecorded(payload, options)

    first_channel = result['results']['channels'][0]
    alternatives = first_channel.get('alternatives', [])
    if not alternatives:
        return "(No speech detected)"
    return alternatives[0]['transcript']

# Flask Routes
@app.route('/')
def index():
    """Serve the landing page rendered from the ``index.html`` template."""
    page = render_template('index.html')
    return page
    
@app.route('/start', methods=['POST'])
def start_conversation():
    """Run one voice turn: record, transcribe, reply, and speak.

    Returns:
        A streamed ``audio/mpeg`` response with the spoken reply on success,
        otherwise a JSON object with an ``"error"`` message.
    """
    try:
        # 1) Record user speech
        audio_file_path = record_audio()
        try:
            user_text = asyncio.run(transcribe_audio(audio_file_path))
        finally:
            # Remove the temp recording even when transcription fails.
            os.remove(audio_file_path)

        if not user_text or user_text == "(No speech detected)":
            return jsonify({"error": "No speech detected, please try again."})

        print("User:", user_text)

        # 2) Attempt to get DialoGPT response within 2 seconds
        # The executor is deliberately not used as a context manager: leaving
        # a `with` block calls shutdown(wait=True), which would block until
        # generation finished and make the timeout below meaningless.
        executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
        future = executor.submit(get_chatbot_response, user_text)
        try:
            bot_response = future.result(timeout=2)
        except concurrent.futures.TimeoutError:
            bot_response = "I'm sorry, I didn't catch that. Please say again."
        finally:
            # Release the executor without waiting for a still-running worker.
            executor.shutdown(wait=False)

        print("ChatBot:", bot_response)

        # 3) Convert Response to Speech (Eleven Labs)
        speech_response = synthesize_speech(bot_response)
        if speech_response:
            return Response(speech_response.iter_content(chunk_size=1024), content_type="audio/mpeg")

        return jsonify({"error": "Failed to generate speech response."})

    except Exception as e:
        # Top-level boundary for the route: log and answer with a JSON error.
        logging.error(f"Error during request processing: {e}")
        return jsonify({"error": "An error occurred while processing your request."})

@app.route('/stop', methods=['POST'])
def stop_conversation():
    """Acknowledge a stop request with a fixed JSON status message."""
    payload = {"status": "Conversation stopped."}
    return jsonify(payload)

# Run Flask Server
if __name__ == '__main__':
    # Log the startup, then serve the Flask app with Waitress on 0.0.0.0:5001.
    logging.info("Starting the server with Waitress...")
    bind_options = {"host": '0.0.0.0', "port": 5001}
    serve(app, **bind_options)


Starting the server with Waitress...
Serving on http://0.0.0.0:5001
Recording audio...
Audio saved at C:\Users\NABANI~1\AppData\Local\Temp\tmpezf2xwfj.wav
Error during request processing: name 'chat_history_ids' is not defined


User: Good morning, India.


In [None]:
import os
import logging
import tempfile
import requests
import sounddevice as sd
import wave
import asyncio
from flask import Flask, render_template, request, jsonify, Response
from deepgram import Deepgram
from waitress import serve
import concurrent.futures

# --- DialoGPT Imports ---
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# 1) Global chat_history_ids for multi-turn conversation
# None until the first reply has been generated.
chat_history_ids = None

# 2) Load DialoGPT (Medium)
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

# API Configuration
# SECURITY: credentials belong in the environment, not in source control.
# Each key is read from an environment variable of the same name first; the
# literals are kept only as fallbacks so existing setups keep working. They
# are visible to anyone who can read this file and should be rotated, after
# which the fallbacks can be deleted.
DEEPGRAM_API_KEY = os.environ.get(
    "DEEPGRAM_API_KEY", "14a53259c35bbe5d06ba288ed295228348aaebe6"
)
ELEVEN_LABS_API_KEY = os.environ.get(
    "ELEVEN_LABS_API_KEY", "sk_b10646eba101273b1379f61958fec54857162174f82ec117"
)
ELEVEN_LABS_VOICE_ID = "2EiwWnXFnvU5JabPnv8n"

# Initialize Flask App
app = Flask(__name__)

# Configure Logging
logging.basicConfig(
    level=logging.INFO,
    format="%(message)s",
    handlers=[logging.StreamHandler()]
)

# Initialize Deepgram
dg_client = Deepgram(DEEPGRAM_API_KEY)

# --- Synthesize Speech with Eleven Labs ---
def synthesize_speech(text):
    """Ask Eleven Labs for streamed speech audio of ``text``.

    Posts to the text-to-speech ``/stream`` endpoint of the configured voice
    with a 10 second timeout.

    Returns:
        The streaming ``requests`` response on HTTP 200, otherwise ``None``.
        Any failure is logged instead of being raised.
    """
    stream_url = f"https://api.elevenlabs.io/v1/text-to-speech/{ELEVEN_LABS_VOICE_ID}/stream"
    auth_headers = {
        "xi-api-key": ELEVEN_LABS_API_KEY,
        "Content-Type": "application/json",
    }
    request_body = {
        "text": text,
        "voice_settings": {"stability": 0.5, "similarity_boost": 0.75},
    }

    try:
        tts_reply = requests.post(
            stream_url,
            headers=auth_headers,
            json=request_body,
            stream=True,
            timeout=10,
        )
        if tts_reply.status_code != 200:
            logging.error(f"Speech synthesis failed: {tts_reply.text}")
            return None
        return tts_reply
    except Exception as err:
        logging.error(f"Unexpected error during speech synthesis: {err}")
        return None

# --- Record Audio ---
def record_audio(duration=3, sample_rate=16000):
    """Record mono 16-bit audio and write it to a temporary WAV file.

    Args:
        duration: Recording length in seconds.
        sample_rate: Sampling rate in Hz.

    Returns:
        Path of the WAV file. The caller is responsible for deleting it.
    """
    logging.info("Recording audio...")
    total_frames = int(duration * sample_rate)
    audio_data = sd.rec(total_frames, samplerate=sample_rate, channels=1, dtype='int16')
    sd.wait()  # block until the recording has finished

    # Create the temp file only to obtain a unique path, and close its handle
    # right away so it is not left open when wave reopens the path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        file_path = tmp.name

    try:
        with wave.open(file_path, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)  # 2 bytes per sample, matching dtype='int16'
            wf.setframerate(sample_rate)
            wf.writeframes(audio_data.tobytes())
    except Exception:
        # Clean up the partially written file before propagating the error.
        os.remove(file_path)
        raise

    logging.info(f"Audio saved at {file_path}")
    return file_path

# --- Transcribe Audio (Deepgram) ---
async def transcribe_audio(file_path):
    """Send the WAV file at ``file_path`` to Deepgram and return its transcript.

    Returns:
        The transcript of the first alternative on the first channel, or the
        literal ``"(No speech detected)"`` when that channel has no
        alternatives.
    """
    request_options = {'punctuate': True}
    with open(file_path, 'rb') as wav_file:
        audio_source = {'buffer': wav_file, 'mimetype': 'audio/wav'}
        response = await dg_client.transcription.prerecorded(audio_source, request_options)

    candidates = response['results']['channels'][0].get('alternatives', [])
    if candidates:
        return candidates[0]['transcript']
    return "(No speech detected)"

# --- Your Snippet: get_Chat_response ---
def get_Chat_response(text):
    """Generate a DialoGPT reply to ``text``, continuing the stored conversation.

    The earlier version wrapped the body in ``for step in range(5)`` and
    returned on the first pass, so ``step > 0`` was never true and the saved
    history was never fed back to the model. The history is now appended
    whenever one exists.

    Args:
        text: The user's utterance; converted with ``str()`` before encoding.

    Returns:
        The decoded text of the newly generated tokens only.
    """
    global chat_history_ids

    # Total budget (prompt + reply) handed to generate(), as before.
    max_total_tokens = 1000
    # Tokens kept free for the reply so a long history cannot fill the budget.
    reply_reserve = 128

    new_user_input_ids = tokenizer.encode(str(text) + tokenizer.eos_token, return_tensors='pt')

    if chat_history_ids is not None:
        bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)
    else:
        bot_input_ids = new_user_input_ids

    # Keep only the most recent tokens so the prompt always fits the budget.
    bot_input_ids = bot_input_ids[:, -(max_total_tokens - reply_reserve):]

    # pad_token_id equals eos_token_id here, so the mask cannot be inferred;
    # the prompt is a single unpadded sequence, hence an all-ones mask.
    chat_history_ids = model.generate(
        bot_input_ids,
        attention_mask=torch.ones_like(bot_input_ids),
        max_length=max_total_tokens,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Everything after the prompt is the model's new reply.
    return tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)

# --- Flask Routes ---
@app.route('/')
def index():
    """Render and return the ``index.html`` template for the root URL."""
    rendered_page = render_template('index.html')
    return rendered_page

@app.route('/start', methods=['POST'])
def start_conversation():
    """Run one voice turn: record, transcribe, reply, and speak.

    Returns:
        A streamed ``audio/mpeg`` response with the spoken reply on success,
        otherwise a JSON object with an ``"error"`` message.
    """
    try:
        # 1) Record User Speech
        audio_file_path = record_audio()
        try:
            user_text = asyncio.run(transcribe_audio(audio_file_path))
        finally:
            # Remove the temp recording even when transcription fails.
            os.remove(audio_file_path)

        if not user_text or user_text == "(No speech detected)":
            return jsonify({"error": "No speech detected, please try again."})

        print("User:", user_text)

        # 2) Attempt to get DialoGPT snippet response within 2 seconds
        # The executor is deliberately not used as a context manager: leaving
        # a `with` block calls shutdown(wait=True), which would block until
        # generation finished and make the timeout below meaningless.
        executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
        future = executor.submit(get_Chat_response, user_text)
        try:
            bot_response = future.result(timeout=2)
        except concurrent.futures.TimeoutError:
            bot_response = "I'm sorry, I didn't catch that. Please say again."
        finally:
            # Release the executor without waiting for a still-running worker.
            executor.shutdown(wait=False)

        print("ChatBot:", bot_response)

        # 3) Convert to Speech (Eleven Labs)
        speech_response = synthesize_speech(bot_response)
        if speech_response:
            return Response(speech_response.iter_content(chunk_size=1024), content_type="audio/mpeg")

        return jsonify({"error": "Failed to generate speech response."})

    except Exception as e:
        # Top-level boundary for the route: log and answer with a JSON error.
        logging.error(f"Error during request processing: {e}")
        return jsonify({"error": "An error occurred while processing your request."})

@app.route('/stop', methods=['POST'])
def stop_conversation():
    """Reply to a stop request with a fixed JSON status message."""
    status_body = dict(status="Conversation stopped.")
    return jsonify(status_body)

# --- Run Waitress ---
if __name__ == '__main__':
    # Log the startup, then serve the Flask app with Waitress on 0.0.0.0:5001.
    logging.info("Starting the server with Waitress...")
    listen_host, listen_port = '0.0.0.0', 5001
    serve(app, host=listen_host, port=listen_port)


Starting the server with Waitress...
Serving on http://0.0.0.0:5001
Recording audio...
Audio saved at C:\Users\NABANI~1\AppData\Local\Temp\tmpl1_0vrea.wav
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


User: Hey brothers. Good morning.
ChatBot: Good morning!


Recording audio...
Audio saved at C:\Users\NABANI~1\AppData\Local\Temp\tmpotf46nwa.wav


User: What are you doing today?
ChatBot: I'm going to a party.


Recording audio...
Audio saved at C:\Users\NABANI~1\AppData\Local\Temp\tmp2t5mvxv1.wav


User: Should we go and play with talk?
ChatBot: I'm not sure what you mean by that.


Recording audio...
Audio saved at C:\Users\NABANI~1\AppData\Local\Temp\tmphg7x8gvk.wav


User: Well, I'm talking about, you know, playing with dogs.
ChatBot: I'm sorry, I didn't catch that. Please say again.


Recording audio...
Audio saved at C:\Users\NABANI~1\AppData\Local\Temp\tmpkw1zjspt.wav


User: I'm thinking you of playing with my dog.
ChatBot: I'm thinking you of playing with my dog.


Recording audio...
Audio saved at C:\Users\NABANI~1\AppData\Local\Temp\tmpyvhjd7sg.wav


User: Yeah. Exactly. My dog shi.
ChatBot: I'm sorry for your loss.


Recording audio...
Audio saved at C:\Users\NABANI~1\AppData\Local\Temp\tmpc41a7sw2.wav


User: No. My dog doesn't die. You.
ChatBot: I'm sorry for your loss.


Recording audio...
Audio saved at C:\Users\NABANI~1\AppData\Local\Temp\tmp999mfpeq.wav


User: Why are you sorry? You
ChatBot: I'm sorry, I didn't catch that. Please say again.


Recording audio...
Audio saved at C:\Users\NABANI~1\AppData\Local\Temp\tmpj_hg6rbu.wav


User: I'm saying good morning.
ChatBot: I'm saying good morning.


Recording audio...
Audio saved at C:\Users\NABANI~1\AppData\Local\Temp\tmprg6_22uf.wav


User: Good morning to you too.
ChatBot: Good morning!


Recording audio...
Audio saved at C:\Users\NABANI~1\AppData\Local\Temp\tmpap_72pp5.wav


User: Are you playing any games today?
ChatBot: I'm playing a game of league of legends right now.


Recording audio...
Audio saved at C:\Users\NABANI~1\AppData\Local\Temp\tmppwehmnuw.wav


User: Quiz cactus that you have?
ChatBot: I have a few, but I'm not sure what they're worth.


Recording audio...
Audio saved at C:\Users\NABANI~1\AppData\Local\Temp\tmp27dihdqr.wav


User: Which characters do you know of
ChatBot: I'm sorry, I didn't catch that. Please say again.


Recording audio...
Audio saved at C:\Users\NABANI~1\AppData\Local\Temp\tmpq6114fak.wav


User: I said to which characters do you know op league of legends?
ChatBot: I'm not sure what you're trying to say.


Recording audio...
Audio saved at C:\Users\NABANI~1\AppData\Local\Temp\tmpc9f404sq.wav


User: Do even play league planes.
ChatBot: I do, but I'm not a big fan of the game.


Recording audio...
Audio saved at C:\Users\NABANI~1\AppData\Local\Temp\tmp89_tyk2m.wav


User: Did we one give you know more about?
ChatBot: I'm not sure, I just saw it on the front page.


Recording audio...
Audio saved at C:\Users\NABANI~1\AppData\Local\Temp\tmp8u5thqta.wav


User: From page of what?
ChatBot: The article.


Recording audio...
Audio saved at C:\Users\NABANI~1\AppData\Local\Temp\tmpksa14g9x.wav


User: What do you really articles from?
ChatBot: I'm not sure, but I think it's from the Onion.


Recording audio...
Audio saved at C:\Users\NABANI~1\AppData\Local\Temp\tmpockukqfz.wav


User: What Donnie?
ChatBot: I think it's a reference to the movie Donnie Darko.


Recording audio...
Audio saved at C:\Users\NABANI~1\AppData\Local\Temp\tmpxn7tkgi4.wav
Recording audio...
Audio saved at C:\Users\NABANI~1\AppData\Local\Temp\tmpzmns5z4k.wav


User: What onion?
ChatBot: The one that's in the picture.


Recording audio...
Audio saved at C:\Users\NABANI~1\AppData\Local\Temp\tmptla_qjk3.wav


User: Which picture?
ChatBot: The one with the guy in the middle.


Recording audio...
Audio saved at C:\Users\NABANI~1\AppData\Local\Temp\tmpybuc2km7.wav


User: Describe me to picture.
ChatBot: I'm a guy.


Recording audio...
Audio saved at C:\Users\NABANI~1\AppData\Local\Temp\tmpee4y5xhy.wav


User: What kind of guy?
ChatBot: A guy who likes to party.


Recording audio...
