# Prerequisites

- Access granted to Azure OpenAI in the desired Azure subscription. Currently, access to this service is granted only by application. You can apply for access to Azure OpenAI by completing the form at https://aka.ms/oai/access. Open an issue on this repo to contact us if you have an issue.
- Python 3.8 or later version.
- An Azure OpenAI Service resource with a GPT-4 Turbo with Vision model deployed. See [GPT-4 and GPT-4 Turbo Preview model availability](https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models#gpt-4-and-gpt-4-turbo-model-availability) for available regions. For more information about resource creation, see the [resource deployment guide](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/create-resource).

- [Create a Speech resource](https://portal.azure.com/#create/Microsoft.CognitiveServicesSpeechServices) in the Azure portal.
- Your Speech resource key and region. After your Speech resource is deployed, select Go to resource to view and manage keys. For more information about Azure AI services resources, see [Get the keys for your resource](https://learn.microsoft.com/en-us/azure/ai-services/multi-service-resource?pivots=azportal#get-the-keys-for-your-resource).

# Dependancies

In [1]:
! pip install openai
! pip install tiktoken
! pip install azure-cognitiveservices-speech

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-win_amd64.whl.metadata (6.8 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Downloading regex-2024.5.15-cp310-cp310-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/42.0 kB ? eta -:--:--
     ---------------------------------------  41.0/42.0 kB 2.0 MB/s eta 0:00:01
     -------------------------------------- 42.0/42.0 kB 675.2 kB/s eta 0:00:00
Downloading tiktoken-0.7.0-cp310-cp310-win_amd64.whl (798 kB)
   ---------------------------------------- 0.0/798.9 kB ? eta -:--:--
   --- ------------------------------------ 61.4/798.9 kB ? eta -:--:--
   ----- ---------------------------------- 102.4/798.9 kB 1.5 MB/s eta 0:00:01
   ------- -------------------------------- 153.6/798.9 kB 1.1 MB/s eta 0:00:01
   ------------- -------------------------- 266.2/798.9 kB 1.3 MB/s eta 0:00:01
   ------------- -------------------------- 266.2/798.9 kB 1.3 MB/s eta 0:00:01
   ----------------- ------------

# Setup

In [2]:
import base64
from mimetypes import guess_type
import tiktoken
from openai import AzureOpenAI
import azure.cognitiveservices.speech as speechsdk

In [3]:
# Function to encode a local image into data URL 
def local_image_to_data_url(image_path):
    if image_path=="":
        return ""
    # Guess the MIME type of the image based on the file extension
    mime_type, _ = guess_type(image_path)
    if mime_type is None:
        mime_type = 'application/octet-stream'  # Default MIME type if none is found

    # Read and encode the image file
    with open(image_path, "rb") as image_file:
        base64_encoded_data = base64.b64encode(image_file.read()).decode('utf-8')

    # Construct the data URL
    return f"data:{mime_type};base64,{base64_encoded_data}"

In [7]:
IN_SPEECH=True #set this to true if you want to talk to the bot via speech
OUT_SPEECH=True #set this to true if you want the bot to speak
MAX_RESPONSE_TOKENS = 500
TOKEN_LIMIT = 10000 # this is based on the model you use
SPEECH_SERVICE_KEY="4b8cf2765b3c4401b3ceacef9bf0c9e0"
SPEECH_SERVICE_REGION="eastus"
AZURE_OPENAI_KEY="c19b9fc79d1647bb880e3422e560a649"
AZURE_OPENAI_ENDPOINT="https://oai23.openai.azure.com/"

speech_config = speechsdk.SpeechConfig(subscription=SPEECH_SERVICE_KEY, region=SPEECH_SERVICE_REGION)
speech_config.speech_recognition_language="en-US"
speech_config.speech_synthesis_voice_name='en-US-AvaMultilingualNeural'
audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
audio_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=True)
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)

client = AzureOpenAI(
  api_key = AZURE_OPENAI_KEY,  
  api_version = "2024-02-01",
  azure_endpoint = AZURE_OPENAI_ENDPOINT
)

# Chatbot

In [8]:
system_message = {"role": "system", "content": "You are a helpful assistant."}
conversation = []
conversation.append(system_message)

def num_tokens_from_messages(messages, model="gpt-4-0125-Preview-0314"): # this is based on the model you use
    """Return the number of tokens used by a list of messages."""
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")
    if model in {
        "gpt-3.5-turbo-0613",
        "gpt-3.5-turbo-16k-0613",
        "gpt-4-0314",
        "gpt-4-0125-Preview-0314",
        "gpt-4-0613",
        "gpt-4-0125-Preview-0613",
        }:
        tokens_per_message = 3
        tokens_per_name = 1
    elif model == "gpt-3.5-turbo-0301":
        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
        tokens_per_name = -1  # if there's a name, the role is omitted
    elif "gpt-3.5-turbo" in model:
        print("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.")
        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613")
    elif "gpt-4" in model:
        print("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
        return num_tokens_from_messages(messages, model="gpt-4-0613")
    else:
        raise NotImplementedError(
            f"""num_tokens_from_messages() is not implemented for model {model}."""
        )
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            if type(value)==list:
                continue
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens


print("AI: " + "Hello how can I help you today?\n")
conversation.append({"role": "assistant", "content": "Hello how can I help you today?"})
while True:
    data_url=local_image_to_data_url(input("Q: Please enter the image URL otherwise press enter."))
    if IN_SPEECH:
        print("Speak into your microphone.")
        user_input = speech_recognizer.recognize_once_async().get().text
    else:
        user_input = input("Q:")
        
    print("User: " + user_input + "\n")
    
    if data_url=="":
        conversation.append({"role": "user", "content": user_input})
    else:
        conversation.append({"role": "user", "content": [{"type": "text","text": user_input},{"type": "image_url","image_url": {"url": data_url}}]})

    conv_history_tokens = num_tokens_from_messages(conversation)

    while conv_history_tokens + MAX_RESPONSE_TOKENS >= TOKEN_LIMIT:
        del conversation[1] 
        conv_history_tokens = num_tokens_from_messages(conversation)
    
    if OUT_SPEECH:
        
        response = client.chat.completions.create(
        model="gpt40", #depending on your deployment name
        messages=conversation,
        max_tokens=MAX_RESPONSE_TOKENS
        )
        
        conversation.append({"role": "assistant", "content": response.choices[0].message.content})
        speech_synthesizer.speak_text_async(response.choices[0].message.content).get()
        print("AI: " + response.choices[0].message.content + "\n")
        
    else:

        response = client.chat.completions.create(
            model="gpt40", #depending on your deployment name
            messages=conversation,
            stream=True,
            max_tokens=MAX_RESPONSE_TOKENS
        )
        
        full_message=""
        print("AI: ",end="")
        for chunk in response:
            if hasattr(chunk, 'choices') and len(chunk.choices) > 0:
                if hasattr(chunk.choices[0].delta, 'content'):
                    content_chunk = chunk.choices[0].delta.content
                    if content_chunk:  # Check if content_chunk is not None
                        print(content_chunk, end="")  # Print partial response
                        full_message += content_chunk
        print("\n")
        response=full_message
        conversation.append({"role": "assistant", "content": full_message})

AI: Hello how can I help you today?



User: what is the color of the car

AI: The car in the image is primarily white.



KeyboardInterrupt: Interrupted by user