In [19]:
import os
from langchain_openai import AzureOpenAIEmbeddings
from langchain_openai import AzureChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA


import azure.cognitiveservices.speech as speechsdk
import base64
import tempfile
import uuid
import os

# --- Azure OpenAI settings ---------------------------------------------------
# Fill in the endpoint and key locally, or leave them empty and provide
# AZURE_OPENAI_ENDPOINT / AZURE_OPENAI_API_KEY through the environment instead.
# Never commit real credentials together with the notebook.
OPENAI_API_BASE = ""
OPENAI_API_KEY = ""
OPENAI_API_TYPE = "azure"  # not referenced by the cells visible here

OPENAI_EMBS_DEPLOYMENT_NAME = ""
OPENAI_EMBS_MODEL_NAME="text-embedding-ada-002"  # not referenced by the cells visible here
OPENAI_EMB_DEPLOYMENT_VERSION = "2"  # not referenced by the cells visible here

# --- Azure Speech settings (used by speech_to_text / text_to_speech) ---------
AZURE_SPEECH_KEY = ''
AZURE_SPEECH_REGION = ''
VOICE_NAME = ''

# Export only values that were actually filled in: writing the empty
# placeholders would overwrite credentials already present in the environment.
# (Presumably the LangChain Azure clients below read these variables.)
if OPENAI_API_KEY:
    os.environ["AZURE_OPENAI_API_KEY"] = OPENAI_API_KEY
if OPENAI_API_BASE:
    os.environ["AZURE_OPENAI_ENDPOINT"] = OPENAI_API_BASE

embeddings = AzureOpenAIEmbeddings(
    azure_deployment=OPENAI_EMBS_DEPLOYMENT_NAME,
    openai_api_version="2023-05-15",
)

llm = AzureChatOpenAI(
    # Alternative deployment that was tried: "gpt-35-turbo-16k"
    deployment_name="gpt-4",
    temperature=0.7,
    openai_api_version="2023-05-15"
)

# Vector store read from the local "knowledge/" directory, queried through
# the same embedding function.
saved_db = Chroma(persist_directory="knowledge/", embedding_function=embeddings)

# Retrieval-augmented QA chain shared by get_response() and the chat UI.
retrieval_chain = RetrievalQA.from_chain_type(llm, chain_type="stuff", retriever=saved_db.as_retriever())

def get_response(query):
    """Run ``query`` through the retrieval QA chain and return its answer.

    Args:
        query: The question passed to ``retrieval_chain.invoke``.

    Returns:
        The value stored under the ``'result'`` key of the chain output.
    """
    chain_output = retrieval_chain.invoke(query, return_only_outputs=True)
    return chain_output['result']

In [24]:
def speech_to_text(audio_file):
    """Transcribe an audio file with the Azure Speech service.

    Args:
        audio_file: Path to the audio file to transcribe. A falsy value
            (``None`` or ``""``) means nothing was recorded.

    Returns:
        The recognized text, or ``None`` when no file was given or the
        recognizer did not report ``RecognizedSpeech``.
    """
    if not audio_file:
        # Nothing to transcribe; building an AudioConfig without a filename
        # would raise instead of reporting "no speech".
        return None

    speech_config = speechsdk.SpeechConfig(
        subscription=AZURE_SPEECH_KEY,
        region=AZURE_SPEECH_REGION
    )
    audio_config = speechsdk.audio.AudioConfig(filename=audio_file)
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    # Single-shot recognition; see the Azure Speech SDK docs for how much of
    # the recording one call covers.
    result = speech_recognizer.recognize_once()
    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        return result.text

    return None

def text_to_speech(audio_file_path):
    """Answer a spoken question and synthesize the answer as a WAV file.

    Despite its name, this runs the whole voice pipeline: the recording is
    transcribed with ``speech_to_text``, the transcript is answered by
    ``get_response``, and the answer is synthesized with Azure Speech.

    Args:
        audio_file_path: Path to the recorded question. May be ``None`` when
            nothing was recorded.

    Returns:
        Always a two-item list ``[text, wav_path]`` so it matches the two
        outputs (text box, audio player) of the Gradio interface that uses it.
        ``wav_path`` is ``None`` when no audio could be produced; in that
        case ``text`` is either the answer (synthesis failed) or a short
        notice (nothing was recognized).
    """
    query = speech_to_text(audio_file_path) if audio_file_path else None
    if not query:
        # Do not send an empty/None question to the QA chain.
        return ["Could not recognize any speech in the recording.", None]

    text = get_response(query)

    speech_config = speechsdk.SpeechConfig(
        subscription=AZURE_SPEECH_KEY,
        region=AZURE_SPEECH_REGION
    )
    speech_config.speech_synthesis_voice_name = VOICE_NAME
    # The synthesized bytes are read from result.audio_data below.
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)
    result = synthesizer.speak_text_async(text).get()

    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        # delete=False: the file must outlive this call so the caller can
        # read it from the returned path.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as audio_file:
            audio_file.write(result.audio_data)
        return [text, audio_file.name]

    if result.reason == speechsdk.ResultReason.Canceled:
        print(f"Speech synthesis canceled: {result.cancellation_details.reason}")

    # Synthesis failed: still return the textual answer, just without audio.
    return [text, None]

In [None]:
import gradio as gr

# Voice assistant UI: a microphone recording is handed to text_to_speech as a
# file path; its result fills the text box and the audio player.
demo = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath", label="Record Audio"),
    ],
    outputs=[
        gr.Textbox(label='Generated text'),
        gr.Audio(label='Generated audio'),
    ],
    allow_flagging='never',
)

# NOTE(review): share=True asks Gradio for a public link; confirm that is
# intended before running this with real credentials.
demo.launch(debug=True, share=True)

In [None]:
import gradio as gr
import base64
import uuid
import os

def add_text(history, text):
    """Append a new user turn to the chat history.

    Args:
        history: Current list of ``(user, bot)`` turns; it is not modified.
        text: The message the user just submitted.

    Returns:
        A ``(new_history, "")`` pair: a new list ending with ``(text, None)``
        and an empty string, which the chat UI writes back into the text box.
    """
    pending_turn = (text, None)
    return history + [pending_turn], ""

def add_file(history, file):
    """Append an uploaded file to the chat history as a user turn.

    Args:
        history: Current list of ``(user, bot)`` turns; it is not modified.
        file: Object whose ``name`` attribute is recorded in the new turn.

    Returns:
        A new list ending with ``((file.name,), None)``.
    """
    file_turn = ((file.name,), None)
    return history + [file_turn]

def bot(history):
    """Fill in the bot reply for the most recent turn of the chat history.

    The last turn's user message is answered with the shared
    ``get_response`` helper and the turn is replaced by
    ``[user_message, response]``.

    Args:
        history: List of ``(user, bot)`` turns. Turns may be tuples or lists.

    Returns:
        The same ``history`` list, with its last turn completed. An empty
        history is returned unchanged.
    """
    if not history:
        return history

    user_message = history[-1][0]
    # Replace the whole turn rather than assigning to index 1: the turns built
    # by add_text/add_file are tuples, which do not support item assignment.
    history[-1] = [user_message, get_response(user_message)]
    return history

# Text chat UI backed by the retrieval QA chain.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(value=[], elem_id="chatbot")
    txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter")

    # On submit: add_text appends the user turn and clears the box, then bot
    # fills in the reply for that turn.
    txt.submit(
        fn=add_text,
        inputs=[chatbot, txt],
        outputs=[chatbot, txt],
    ).then(fn=bot, inputs=chatbot, outputs=chatbot)

# NOTE(review): share=True asks Gradio for a public link; confirm that is
# intended before running this with real credentials.
demo.launch(debug=True, share=True)
