# API-M's AI Gateway: Azure OpenAI token limit

This notebook explores the enforcement of a custom token-per-minute (TPM) quota per app or user.

### Setting up environment

In [1]:
# Importing required packages
import os
import time

import requests
from openai import APIStatusError, AzureOpenAI

In [2]:
# Reading the API-M connection settings from the process environment.
# Each value is None when the corresponding variable is not set.
APIM_TPM_URL = os.environ.get("APIM_TPM_URL")
APIM_TPM_SUB_KEY = os.environ.get("APIM_TPM_SUB_KEY")
AOAI_API_VERSION = os.environ.get("APIM_TPM_API_VERSION")
AOAI_DEPLOYMENT = os.environ.get("APIM_TPM_AOAI_DEPLOY")

In [3]:
# Defining custom variables
# Prompts sent with every test request ("system" and "user" messages respectively)
SYSTEM_PROMPT = "You are a standup comedian."
USER_PROMPT = "Tell me a joke about red panda."
# Number of requests each test loop sends
NUMBER_OF_RUNS = 5
# Pause between consecutive requests, in seconds (passed to time.sleep)
SLEEP_TIME = 15
# Value forwarded as the "temperature" field of each chat completion request
TEMPERATURE = 0.7

### Testing API-M endpoint with REST request

In [4]:
# Helper function for REST API call
def get_rest_completion(system_prompt, user_prompt, timeout=60):
    """Send one chat completion request to the API-M endpoint over plain REST.

    Args:
        system_prompt: Content of the "system" message.
        user_prompt: Content of the "user" message.
        timeout: Seconds to wait for the gateway before `requests` raises a
            timeout error, so a stalled connection cannot hang the notebook.

    Returns:
        The `requests.Response` object, returned as-is for any HTTP status so
        the caller can inspect the status code, headers and body.

    Raises:
        ValueError: If the APIM_TPM_URL setting is empty or unset.
    """
    if not APIM_TPM_URL:
        raise ValueError("APIM_TPM_URL is not set; cannot build the request URL.")

    # Accept a base URL configured with or without a trailing slash
    base_url = APIM_TPM_URL.rstrip("/")

    response = requests.post(
        url=f"{base_url}/openai/deployments/{AOAI_DEPLOYMENT}/chat/completions",
        headers={
            "Content-Type": "application/json",
            "api-key": APIM_TPM_SUB_KEY,
        },
        params={"api-version": AOAI_API_VERSION},
        json={
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            "temperature": TEMPERATURE,
        },
        timeout=timeout,
    )
    return response

In [5]:
# Testing REST API
# Sends NUMBER_OF_RUNS requests, pausing SLEEP_TIME seconds between them, and
# prints either the token counters (HTTP 200) or the error details (any other status).
for i in range(NUMBER_OF_RUNS):
    # perf_counter is monotonic, so the measured interval is immune to clock adjustments
    start_time = time.perf_counter()
    response = get_rest_completion(SYSTEM_PROMPT, USER_PROMPT)
    elapsed = time.perf_counter() - start_time
    print(f"Run # {i} completed in {elapsed:.2f} seconds")

    if response.status_code == 200:
        # .get() keeps the loop running even if the gateway omits these headers
        print(f"Consumed tokens: {response.headers.get('consumed-tokens', 'n/a')}")
        print(f"Remaining tokens: {response.headers.get('remaining-tokens', 'n/a')}")
    else:
        print(f"Response code: {response.status_code}")
        # The error body is not guaranteed to be JSON, nor to carry a top-level
        # "message" field; fall back progressively instead of raising.
        try:
            error_body = response.json()
        except ValueError:
            error_body = None
        message = None
        if isinstance(error_body, dict):
            message = error_body.get("message")
            nested_error = error_body.get("error")
            if message is None and isinstance(nested_error, dict):
                message = nested_error.get("message")
        if message is None:
            message = response.text
        print(f"Response message: {message}")

    if i < NUMBER_OF_RUNS - 1:
        print(f"Pausing for {SLEEP_TIME} seconds...")
        time.sleep(SLEEP_TIME)
    print("-----------------------------")

Run # 0 completed in 0.96 seconds
Consumed tokens: 54
Remaining tokens: 46
Pausing for 15 seconds...
-----------------------------
Run # 1 completed in 0.74 seconds
Consumed tokens: 59
Remaining tokens: 0
Pausing for 15 seconds...
-----------------------------
Run # 2 completed in 0.24 seconds
Response code: 429
Response message: Token limit is exceeded. Try again in 29 seconds.
Pausing for 15 seconds...
-----------------------------
Run # 3 completed in 0.23 seconds
Response code: 429
Response message: Token limit is exceeded. Try again in 14 seconds.
Pausing for 15 seconds...
-----------------------------
Run # 4 completed in 0.73 seconds
Consumed tokens: 54
Remaining tokens: 0
-----------------------------


### Testing API-M endpoint with OpenAI SDK v1

In [6]:
# Creating the Azure OpenAI SDK client that talks to the API-M endpoint.
# The API-M subscription key is supplied in place of an Azure OpenAI key.
client = AzureOpenAI(
    api_version=AOAI_API_VERSION,
    api_key=APIM_TPM_SUB_KEY,
    azure_endpoint=APIM_TPM_URL,
)

In [7]:
# Helper function for SDK call
def get_sdk_completion(system_prompt, prompt):
    """Request one chat completion through the SDK client.

    Args:
        system_prompt: Content of the "system" message.
        prompt: Content of the "user" message.

    Returns:
        The raw-response wrapper produced by `with_raw_response.create`,
        which exposes the HTTP headers in addition to the completion.
    """
    return client.chat.completions.with_raw_response.create(
        temperature=TEMPERATURE,
        messages=[
            dict(role="system", content=system_prompt),
            dict(role="user", content=prompt),
        ],
        model=AOAI_DEPLOYMENT,
    )

In [8]:
# Making a single SDK call and displaying the raw response object
# NOTE(review): this request presumably counts against the same token quota as the test loops — confirm
response = get_sdk_completion(SYSTEM_PROMPT, USER_PROMPT)
response

<APIResponse [200 OK] type=<class 'openai.types.chat.chat_completion.ChatCompletion'>>

In [9]:
# Testing SDK
# Sends NUMBER_OF_RUNS requests through the SDK, pausing SLEEP_TIME seconds
# between them, and prints either the token counters or the error details.
for i in range(NUMBER_OF_RUNS):
    # perf_counter is monotonic; the interval includes any retries the SDK performs internally
    start_time = time.perf_counter()
    try:
        response = get_sdk_completion(SYSTEM_PROMPT, USER_PROMPT)
        status_error = None
    except APIStatusError as exc:
        # The SDK raises on non-2xx responses (e.g. 429) rather than returning
        # them, so the failure has to be captured here to be reported.
        response = None
        status_error = exc
    elapsed = time.perf_counter() - start_time
    print(f"Run # {i} completed in {elapsed:.2f} seconds")

    if status_error is None:
        # .get() keeps the loop running even if the gateway omits these headers
        print(f"Consumed tokens: {response.headers.get('consumed-tokens', 'n/a')}")
        print(f"Remaining tokens: {response.headers.get('remaining-tokens', 'n/a')}")
    else:
        print(f"Response code: {status_error.status_code}")
        print(f"Response message: {status_error.message}")

    if i < NUMBER_OF_RUNS - 1:
        print(f"Pausing for {SLEEP_TIME} seconds...")
        time.sleep(SLEEP_TIME)
    print("-----------------------------")

Run # 0 completed in 45.82 seconds
Consumed tokens: 55
Remaining tokens: 0
Pausing for 15 seconds...
-----------------------------
Run # 1 completed in 0.65 seconds
Consumed tokens: 55
Remaining tokens: 0
Pausing for 15 seconds...
-----------------------------
Run # 2 completed in 30.87 seconds
Consumed tokens: 55
Remaining tokens: 0
Pausing for 15 seconds...
-----------------------------
Run # 3 completed in 1.20 seconds
Consumed tokens: 63
Remaining tokens: 0
Pausing for 15 seconds...
-----------------------------
Run # 4 completed in 29.88 seconds
Consumed tokens: 56
Remaining tokens: 0
-----------------------------
