# API-M's AI Gateway: Azure OpenAI load-balancing

This notebook explores circuit breaking and load balancing between two Azure OpenAI (AOAI) endpoints exposed through Azure API Management (API-M).

### Setting up environment

In [1]:
# Importing required packages
import requests
import time
import os

In [2]:
# Read the gateway connection settings from the process environment.
# Any variable that is not set resolves to None.
APIM_LB_URL = os.environ.get("APIM_LB_URL")                  # base URL of the API-M endpoint
APIM_LB_SUB_KEY = os.environ.get("APIM_LB_SUB_KEY")          # key sent in the "api-key" header
AOAI_API_VERSION = os.environ.get("APIM_LB_API_VERSION")     # value of the "api-version" query parameter
AOAI_DEPLOYMENT = os.environ.get("APIM_LB_AOAI_DEPLOY")      # deployment name used in the request path

In [3]:
# Tunable settings for the experiment, grouped by purpose.

# Chat messages sent with every request
SYSTEM_PROMPT = "You are a standup comedian."
USER_PROMPT = "Tell me a joke about red panda."

# Completion parameters included in the request body
TEMPERATURE = 0.7
MAX_TOKENS = 200

# Test-loop control: how many requests to send and the pause (seconds) between them
NUMBER_OF_RUNS = 10
SLEEP_TIME = 2

### API-M load-balance testing with REST requests

In [4]:
# Helper function for REST API call
def get_rest_completion(subscription_key, system_prompt, user_prompt, timeout=60):
    """Send one chat-completions request to the API-M load-balanced endpoint.

    The request targets ``{APIM_LB_URL}/openai/deployments/{AOAI_DEPLOYMENT}/chat/completions``
    and uses the module-level ``AOAI_API_VERSION``, ``TEMPERATURE`` and
    ``MAX_TOKENS`` settings.

    Args:
        subscription_key: Key sent in the ``api-key`` request header.
        system_prompt: Content of the ``system`` message.
        user_prompt: Content of the ``user`` message.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.post``. Without a timeout a stalled connection
            would block the notebook indefinitely.

    Returns:
        The ``requests.Response`` object as received. The status code is not
        checked here, so callers can inspect error responses as well.

    Raises:
        ValueError: If ``APIM_LB_URL`` is unset or empty.
        requests.exceptions.RequestException: On connection failure or timeout.
    """
    if not APIM_LB_URL:
        raise ValueError("APIM_LB_URL is not set; cannot build the request URL.")

    # Normalise the join so the base URL works with or without a trailing slash.
    base_url = APIM_LB_URL.rstrip("/")
    url = f"{base_url}/openai/deployments/{AOAI_DEPLOYMENT}/chat/completions"

    payload = {
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "temperature": TEMPERATURE,
        "max_tokens": MAX_TOKENS,
    }

    response = requests.post(
        url=url,
        headers={
            "Content-Type": "application/json",
            "api-key": subscription_key,
        },
        params={"api-version": AOAI_API_VERSION},
        json=payload,
        timeout=timeout,
    )
    return response

In [5]:
# Testing load-balancing with REST API
# Sends NUMBER_OF_RUNS identical requests and reports, for each one, the value
# of the "x-ms-region" response header, the elapsed time and the HTTP status.
for i in range(NUMBER_OF_RUNS):
    # perf_counter() is a monotonic, high-resolution clock, so the measured
    # duration is not affected by system clock adjustments.
    start_time = time.perf_counter()
    response = get_rest_completion(
        subscription_key = APIM_LB_SUB_KEY,
        system_prompt = SYSTEM_PROMPT,
        user_prompt = USER_PROMPT
    )
    end_time = time.perf_counter()
    # None when the response carries no "x-ms-region" header.
    region = response.headers.get('x-ms-region')
    print(f"Run # {i}: {region}, Duration: {end_time - start_time:.2f}, Response Code: {response.status_code}")

    # Pause between runs, but not after the last one.
    if i < NUMBER_OF_RUNS - 1:
        print(f"Pausing for {SLEEP_TIME} seconds...")
        time.sleep(SLEEP_TIME)
print("-----------------------------")

Run # 0: Sweden Central, Duration: 0.93, Response Code: 200
Pausing for 2 seconds...
Run # 1: None, Duration: 1.36, Response Code: 503
Pausing for 2 seconds...
Run # 2: France Central, Duration: 2.95, Response Code: 200
Pausing for 2 seconds...
Run # 3: France Central, Duration: 2.33, Response Code: 200
Pausing for 2 seconds...
Run # 4: France Central, Duration: 3.14, Response Code: 200
Pausing for 2 seconds...
Run # 5: France Central, Duration: 3.01, Response Code: 200
Pausing for 2 seconds...
Run # 6: France Central, Duration: 5.98, Response Code: 200
Pausing for 2 seconds...
Run # 7: Sweden Central, Duration: 0.86, Response Code: 200
Pausing for 2 seconds...
Run # 8: None, Duration: 1.27, Response Code: 503
Pausing for 2 seconds...
Run # 9: France Central, Duration: 4.35, Response Code: 200
-----------------------------
