# Speech-to-Speech Assistant Testing

## 0 Global settings

In [1]:
import sys
import os
import json
import ssl
import urllib.request
import azure.cognitiveservices.speech as speechsdk
import openai
from dotenv import load_dotenv

# Report the interpreter and library versions this notebook was run with,
# one "<name> version: <value>" line per component.
print("\n".join([
    f"System version: {sys.version}",
    f"speechsdk version: {speechsdk.__version__}",
    f"openai version: {openai.__version__}",
    f"json version: {json.__version__}",
]))
print('Setup Complete')

System version: 3.11.5 (tags/v3.11.5:cce6ba9, Aug 24 2023, 14:38:34) [MSC v.1936 64 bit (AMD64)]
speechsdk version: 1.37.0
openai version: 1.33.0
json version: 2.0.9
Setup Complete


In [2]:
# Load environment variables (python-dotenv's load_dotenv)
load_dotenv()

# AML Setup
# The endpoint variable was historically read under the misspelled name
# 'AML_EDNPOINT'. Prefer the correctly spelled 'AML_ENDPOINT' and fall back to
# the old spelling so existing environments keep working.
aml_endpoint = os.getenv("AML_ENDPOINT") or os.getenv("AML_EDNPOINT")
aml_key = os.getenv("AML_KEY")

# Azure OpenAI setup
endpoint = os.getenv("OPEN_AI_ENDPOINT")
api_key = os.getenv("OPEN_AI_KEY")
deployment = os.getenv("OPEN_AI_DEPLOYMENT_NAME")

# Azure AI Search setup
search_endpoint = os.getenv("SEARCH_ENDPOINT")
search_key = os.getenv("SEARCH_KEY")
search_index_name = os.getenv("SEARCH_INDEX")

# Speech service configuration shared by the recognizer and synthesizer cells
region = os.getenv("SPEECH_REGION")
speech_config = speechsdk.SpeechConfig(
    subscription=os.getenv("SPEECH_KEY"),
    region=region,
)

## 1 Speech to Speech Copilot RAG

In [5]:
# Capture one utterance from the default microphone and transcribe it
speech_config.speech_recognition_language = "en-US"
audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
speech_recognizer = speechsdk.SpeechRecognizer(
    speech_config=speech_config,
    audio_config=audio_config,
)

print("Say something...")
# recognize_once_async() returns a future-like object; .get() waits for the result
recognition_future = speech_recognizer.recognize_once_async()
speech_result = recognition_future.get()

print(f"User: {speech_result.text}")

# Chat messages sent to the model: a fixed system prompt describing the
# 'Adventuress' storyteller persona, followed by the recognized user speech.
# The system prompt is written as adjacent string literals (one per prompt
# line) so that no literal spans a raw line break in the source.
message_text = [
    {
        "role": "system",
        "content": (
            "## System Message for Storyteller Assistant 'Adventuress'\n"
            "- **Role**: I am 'Adventuress', a storytelling assistant designed to engage children with visual disabilities in immersive story experiences.\n"
            "- **Capabilities**: My primary function is to provide an interactive and accessible storytelling experience. I can understand and respond to the child's input regarding their name, age, and preferred story genre.\n"
            "- **Limitations**: I do not store personal information and will not remember past interactions. My knowledge is limited to the indexed database of stories provided.\n"
            "- **Output Format**: My responses will be in the form of friendly and engaging dialogue, suitable for children. I will use placeholders like [child's name] and [story type] to personalize the interaction.\n"
            "- **Safety**: I am programmed to ensure a safe and positive experience, avoiding any content that may be inappropriate or harmful for children.\n"
            "- **Behavioral Guardrails**: I will maintain a cheerful and supportive tone throughout the interaction, encouraging the child's love for stories and imagination.\n\n"
            "## Example Interaction Flow\n"
            "- Greeting: \"Hello! My name is Adventuress, your story assistant. I'm here to take you to worlds of fantasy and adventure through books. What's your name?\"\n"
            "- Personalization: \"What a lovely name, [child's name]! It's a pleasure to meet you. Tell me, how old are you?\"\n"
            "- Story Selection: \"[Child's age] years is a wonderful age to discover incredible stories! What kind of stories would you like to hear? Adventure,fiction, fantasy mystery, animals, or perhaps classic tales?\"\n"
            "- Story Introduction: \"I see, you like [story type] stories. I have the perfect tale for you. It's called '[Story Name]', and I think you're going to love it. Allow me to read it to you...\"\n"
            "- Feedback Request: \"Did you enjoy the story '[Story Name]'? On a scale from zero to five, with zero being not at all and 5 being very much, how much did you like it? "
            "Your feedback helps me choose even better stories for you next time!\"\n"
            "- Feedback Storage: \"I have recorded your [child rating] rating for '[Story Name]'. Thank you for helping me learn about your tastes!\"\n\n"
        ),
    },
    {
        "role": "user",
        "content": speech_result.text,
    },
]

# Client pointed at the deployment's "extensions" route, which accepts the
# extra "dataSources" field sent below.
extensions_url = f"{endpoint}/openai/deployments/{deployment}/extensions"
client = openai.AzureOpenAI(
    base_url=extensions_url,
    api_key=api_key,
    api_version="2023-08-01-preview",
)

# Azure Cognitive Search index attached to the request as a data source.
# os.environ[...] is used on purpose: a missing variable raises KeyError here.
search_data_source = {
    "type": "AzureCognitiveSearch",
    "parameters": {
        "endpoint": os.environ["SEARCH_ENDPOINT"],
        "key": os.environ["SEARCH_KEY"],
        "indexName": os.environ["SEARCH_INDEX"],
    },
}

completion = client.chat.completions.create(
    model=deployment,
    messages=message_text,
    extra_body={"dataSources": [search_data_source]},
)

# Show the first choice's reply, then read it aloud
assistant_reply = completion.choices[0].message.content
print(f"Assistant: {assistant_reply}")

# Synthesize the reply with the chosen voice; no audio config is passed,
# so the synthesizer's default output is used.
speech_config.speech_synthesis_voice_name = "en-US-AnaNeural"
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)
speech_synthesizer.speak_text(assistant_reply)


Say something...
User: Hello.
Assistant: Hi there! How can I assist you today?


<azure.cognitiveservices.speech.SpeechSynthesisResult at 0x2e2b9f5ef10>

## 2 RecSys Endpoint Test

In [6]:
def allowSelfSignedHttps(allowed):
    """Optionally turn off client-side HTTPS certificate verification.

    When ``allowed`` is truthy, the ``PYTHONHTTPSVERIFY`` environment variable
    is unset or empty, and the ``ssl`` module exposes
    ``_create_unverified_context``, that factory is installed as
    ``ssl._create_default_https_context``. Otherwise nothing is changed.
    """
    if not allowed:
        return
    if os.environ.get('PYTHONHTTPSVERIFY', ''):
        # An explicit PYTHONHTTPSVERIFY setting takes precedence; leave ssl alone.
        return
    unverified_factory = getattr(ssl, '_create_unverified_context', None)
    if unverified_factory:
        ssl._create_default_https_context = unverified_factory

# Needed when the scoring service uses a self-signed certificate.
allowSelfSignedHttps(True)

# Request payload for the scoring endpoint.
# The JSON layout below is an example and may need updating.
# Each tuple is one (user_id, book_id, rating) row of "input1".
data = {
    "Inputs": {
        "input1": [
            {"user_id": user_id, "book_id": book_id, "rating": rating}
            for user_id, book_id, rating in (
                ("9773b4c5dc241e38d76aff2cb96c96fd", 19337, 3),
                ("5792082567cfacbc2d71bceb212a3065", 1001896, 4),
                ("80d52f5e70f023bd0098ab96599a3530", 375711, 5),
            )
        ]
    },
    "GlobalParameters": {},
}

# Serialize the payload as UTF-8 encoded JSON for the POST body
body = json.dumps(data).encode("utf-8")

# Fail early with a clear message if either credential is missing;
# a missing endpoint would otherwise surface as an obscure error inside urllib.
if not aml_key:
    raise Exception("A key should be provided to invoke the endpoint")
if not aml_endpoint:
    raise Exception("An endpoint URL should be provided to invoke the endpoint")

headers = {'Content-Type':'application/json', 'Authorization':('Bearer '+ aml_key)}

req = urllib.request.Request(aml_endpoint, body, headers)

try:
    # The context manager closes the HTTP response once the body has been read
    with urllib.request.urlopen(req) as response:
        result = response.read()
    print(result)
except urllib.error.HTTPError as error:
    print("The request failed with status code: " + str(error.code))

    # Print the headers - they include the request ID and the timestamp, which are useful for debugging the failure
    print(error.info())
    print(error.read().decode("utf8", 'ignore'))

b'{"Results": {"WebServiceOutput0": [{"User": "5792082567cfacbc2d71bceb212a3065", "Recommended Item 1": "65832", "Predicted Rating 1": 2.043990135192871, "Recommended Item 2": "1001896", "Predicted Rating 2": 2.0283126831054688, "Recommended Item 3": "533675", "Predicted Rating 3": 1.8523731231689453}, {"User": "80d52f5e70f023bd0098ab96599a3530", "Recommended Item 1": "65832", "Predicted Rating 1": 4.156232833862305, "Recommended Item 2": "533675", "Predicted Rating 2": 3.8327646255493164, "Recommended Item 3": "1577950", "Predicted Rating 3": 3.8082356452941895}, {"User": "9773b4c5dc241e38d76aff2cb96c96fd", "Recommended Item 1": "533675", "Predicted Rating 1": 4.389653205871582, "Recommended Item 2": "852724", "Predicted Rating 2": 4.380564212799072, "Recommended Item 3": "1577950", "Predicted Rating 3": 4.188114166259766}]}}'


## 3 Speech to Speech OpenAI Assistant Example

In [7]:
# Azure OpenAI client used by the streaming chat helper below
client = openai.AzureOpenAI(
    azure_endpoint=endpoint,
    api_key=api_key,
    api_version="2023-05-15",
)

# Custom name chosen for the model deployment when it was deployed
deployment_id = deployment

# Use the machine's default speaker for output and default microphone for input
audio_output_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=True)
audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)

# Recognition locale: should match the language the user speaks
speech_config.speech_recognition_language = "en-US"
speech_recognizer = speechsdk.SpeechRecognizer(
    speech_config=speech_config,
    audio_config=audio_config,
)

# Voice that reads the Azure OpenAI replies aloud
speech_config.speech_synthesis_voice_name = 'en-US-AnaNeural'
speech_synthesizer = speechsdk.SpeechSynthesizer(
    speech_config=speech_config,
    audio_config=audio_output_config,
)

# Characters that mark the end of a sentence for text-to-speech chunking
# (ASCII and full-width punctuation, plus newline)
tts_sentence_end = list(".!?;。！？；\n")

# Prompts Azure OpenAI with a request and synthesizes the response.
def ask_openai(prompt):
    """Send ``prompt`` to Azure OpenAI and speak the streamed reply sentence by sentence.

    The reply is requested in streaming mode. Chunks are buffered until one ends
    with a mark from ``tts_sentence_end``; the buffered text is then handed to
    ``speech_synthesizer``. Any text still buffered when the stream finishes
    (for example a reply cut off by ``max_tokens``) is spoken as well.

    Args:
        prompt: The user's text, sent as a single "user" message.

    Returns:
        None. The function prints each synthesized sentence and waits on the
        last synthesis request before returning.
    """
    # Ask Azure OpenAI in a streaming way
    response = client.chat.completions.create(
        model=deployment_id,
        max_tokens=200,
        stream=True,
        messages=[{"role": "user", "content": prompt}],
    )
    sentence_marks = tuple(tts_sentence_end)
    collected_messages = []
    last_tts_request = None

    def speak_collected():
        """Speak the buffered text (if any) and empty the buffer."""
        nonlocal last_tts_request
        text = ''.join(collected_messages).strip()
        collected_messages.clear()
        if text:  # skip buffers that only hold whitespace/newlines
            print(f"Speech synthesized to speaker for: {text}")
            last_tts_request = speech_synthesizer.speak_text_async(text)

    # Iterate through the streamed response
    for chunk in response:
        if not chunk.choices:
            continue
        chunk_message = chunk.choices[0].delta.content  # extract the message
        if chunk_message is None:
            continue
        collected_messages.append(chunk_message)  # save the message
        # A chunk may be a bare mark (".") or carry one at its end (" you?")
        if chunk_message.endswith(sentence_marks):
            speak_collected()

    # Flush text that never reached a sentence-end mark so it is not dropped
    speak_collected()

    # Only the last request is awaited; this assumes the synthesizer handles
    # requests in submission order - TODO confirm against the Speech SDK docs.
    if last_tts_request:
        last_tts_request.get()

# Continuously listens for speech input to recognize and send as text to Azure OpenAI
def chat_with_open_ai():
    """Run a voice conversation loop until the user stops it or recognition fails.

    Each iteration recognizes one utterance with ``speech_recognizer``:

    * Recognized speech is forwarded to ``ask_openai`` unless the user said
      "stop" (compared case-insensitively, ignoring surrounding whitespace and
      trailing ``.``/``!``/``?``), which ends the conversation.
    * No match ends the loop after reporting the details.
    * A canceled recognition is reported and ends the loop, so a persistent
      error cannot make the loop retry forever.

    An ``EOFError`` raised while listening also ends the loop.
    """
    while True:
        print("Azure OpenAI is listening. Say 'Stop' or press Ctrl-Z to end the conversation.")
        try:
            # Capture one utterance from the microphone and transcribe it (speech-to-text).
            speech_recognition_result = speech_recognizer.recognize_once_async().get()
            reason = speech_recognition_result.reason

            # If speech is recognized, send it to Azure OpenAI and speak the response.
            if reason == speechsdk.ResultReason.RecognizedSpeech:
                spoken_text = speech_recognition_result.text
                # Accept "Stop", "stop.", "STOP!" ... not only the exact string "Stop."
                if spoken_text.strip().rstrip(".!?").strip().lower() == "stop":
                    print("Conversation ended.")
                    break
                print("Recognized speech: {}".format(spoken_text))
                ask_openai(spoken_text)
            elif reason == speechsdk.ResultReason.NoMatch:
                print("No speech could be recognized: {}".format(speech_recognition_result.no_match_details))
                break
            elif reason == speechsdk.ResultReason.Canceled:
                cancellation_details = speech_recognition_result.cancellation_details
                print("Speech Recognition canceled: {}".format(cancellation_details.reason))
                if cancellation_details.reason == speechsdk.CancellationReason.Error:
                    print("Error details: {}".format(cancellation_details.error_details))
                # Leave the loop: retrying after a cancellation would spin forever
                break
        except EOFError:
            break

# Entry point: start the voice conversation loop.
# Any exception escaping the loop is reported as a single line
# rather than a traceback.
try:
    chat_with_open_ai()
except Exception as err:
    print(f"Encountered exception. {err}")

Azure OpenAI is listening. Say 'Stop' or press Ctrl-Z to end the conversation.
Recognized speech: Hi, how are you?
Speech synthesized to speaker for: Hello!
Speech synthesized to speaker for: I'm an AI, so I don't have feelings, but I'm here to help.
Speech synthesized to speaker for: How can I assist you today?
Azure OpenAI is listening. Say 'Stop' or press Ctrl-Z to end the conversation.
Conversation ended.
