## Configuration
Populate the environment variables or edit the variables below before running:
- `MIDDLEWARE_URL` (e.g., http://localhost:8000)
- `MIDDLEWARE_API_KEY` (local.api_key from config.yaml)
- `MIDDLEWARE_API_VERSION` (defaults to 2024-02-01)
- `CHAT_MODEL` (e.g., gpt-4.1-nano)
- `THINKING_MODEL` (e.g., gpt-5-nano)
- `EMBEDDING_MODEL` (e.g., text-embedding-3-small)

In [1]:
import os
import json
import base64
import requests
from typing import Any

# Connection settings. Each value is read from the environment variable named
# in the Configuration section and falls back to the local-development default
# when that variable is unset.
BASE_URL = os.environ.get("MIDDLEWARE_URL", "http://localhost:8000")
API_KEY = os.environ.get("MIDDLEWARE_API_KEY", "sk-12345678")
API_VERSION = os.environ.get("MIDDLEWARE_API_VERSION", "2024-02-01")
DEPLOYMENT_CHAT = os.environ.get("CHAT_MODEL", "gpt-4.1-nano")
DEPLOYMENT_THINKING = os.environ.get("THINKING_MODEL", "gpt-5-nano")
DEPLOYMENT_EMBEDDING = os.environ.get("EMBEDDING_MODEL", "text-embedding-3-small")

# Headers attached to every raw `requests` call made through post_json().
HEADERS = {"api-key": API_KEY, "Content-Type": "application/json"}

def build_url(path: str) -> str:
    """Return the absolute URL for ``path`` on the configured middleware.

    Trailing slashes on ``BASE_URL`` are dropped and ``path`` is appended
    verbatim, so ``path`` is expected to start with ``/``.
    """
    root = BASE_URL.rstrip("/")
    return f"{root}{path}"

def post_json(path: str, payload: dict[str, Any], *, stream: bool = False) -> requests.Response:
    """POST ``payload`` as JSON to a middleware endpoint and return the response.

    Args:
        path: Endpoint path, resolved against ``BASE_URL`` by ``build_url``.
        payload: Request body, serialized to JSON by ``requests``.
        stream: Passed through to ``requests.post``; set it to ``True`` to
            read the body incrementally (e.g. with ``iter_sse``).

    Returns:
        The ``requests.Response`` for a successful call.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status`` for an error status.
    """
    url = build_url(path)
    query = {"api-version": API_VERSION}
    reply = requests.post(
        url,
        params=query,
        headers=HEADERS,
        json=payload,
        stream=stream,
        timeout=120,
    )
    # Fail loudly here so callers never try to parse an error body as a result.
    reply.raise_for_status()
    return reply

def print_json(data: Any) -> None:
    """Pretty-print ``data`` to stdout as JSON indented by two spaces."""
    rendered = json.dumps(data, indent=2)
    print(rendered)

def iter_sse(resp: requests.Response):
    """Yield the payload of every ``data`` line in a server-sent-events body.

    Lines are read with ``resp.iter_lines()``. Blank lines and lines for other
    SSE fields (``event:``, ``id:``, comments, ...) are skipped.

    Args:
        resp: A response whose ``iter_lines()`` yields raw ``bytes`` lines,
            typically obtained with ``stream=True``.

    Yields:
        The UTF-8 decoded text following the ``data:`` field name, with at
        most one leading space removed.
    """
    prefix = b"data:"
    for line in resp.iter_lines():
        if not line.startswith(prefix):
            continue
        payload = line[len(prefix):]
        # The SSE format makes the space after the colon optional and strips
        # only one, so both "data: x" and "data:x" yield "x".
        if payload.startswith(b" "):
            payload = payload[1:]
        yield payload.decode("utf-8")

In [2]:
# Health and metrics: both endpoints are plain GETs that return JSON.
for label, route in (('Health:', '/health'), ('Metrics:', '/metrics')):
    reply = requests.get(build_url(route), timeout=10)
    print(label, reply.json())

Health: {'status': 'healthy', 'timestamp': '2025-12-14T14:55:25.634504+00:00'}
Metrics: {'daily_cost_eur': 0.2469, 'daily_cap_eur': 5.0, 'date': '2025-12-14', 'percentage_used': 4.94}


## Chat completions: basic request
Minimal call using gpt-4.1-nano.

In [None]:
# Smallest useful request: a system prompt, one user turn and a token cap.
chat_messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "List three planets."},
]
chat_payload = {"messages": chat_messages, "max_completion_tokens": 100}

resp = post_json(
    f"/openai/deployments/{DEPLOYMENT_CHAT}/chat/completions",
    chat_payload,
)
print_json(resp.json())

{
  "choices": [
    {
      "content_filter_results": {
        "hate": {
          "filtered": false,
          "severity": "safe"
        },
        "protected_material_code": {
          "filtered": false,
          "detected": false
        },
        "protected_material_text": {
          "filtered": false,
          "detected": false
        },
        "self_harm": {
          "filtered": false,
          "severity": "safe"
        },
        "sexual": {
          "filtered": false,
          "severity": "safe"
        },
        "violence": {
          "filtered": false,
          "severity": "safe"
        }
      },
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "annotations": [],
        "content": "Mars, Jupiter, Saturn",
        "refusal": null,
        "role": "assistant"
      }
    }
  ],
  "created": 1765724137,
  "id": "chatcmpl-CmhnVzXVVTlqGiMOqjDgJUw6rQynz",
  "model": "gpt-4.1-nano-2025-04-14",
  "object": "chat.

: 

## Chat completions: parameter coverage
Showcase of the supported knobs (temperature, top_p, stop, penalties, n, seed, user).

In [19]:
# Conversation to send.
param_messages = [
    {"role": "system", "content": "Answer in two short bullets."},
    {"role": "user", "content": "Give quick travel tips for Tokyo."},
]

# Optional request parameters demonstrated by this cell.
sampling_options = {
    "temperature": 0.6,
    "top_p": 0.9,
    "stop": ["Stop"],
    "presence_penalty": 0.2,
    "frequency_penalty": 0.1,
    "n": 1,
    "seed": 1234,
    "user": "sample-user-123",
}

param_payload = {
    "messages": param_messages,
    "max_completion_tokens": 120,
    **sampling_options,
}
resp = post_json(
    f"/openai/deployments/{DEPLOYMENT_CHAT}/chat/completions",
    param_payload,
)
print_json(resp.json())

{
  "choices": [
    {
      "content_filter_results": {
        "hate": {
          "filtered": false,
          "severity": "safe"
        },
        "protected_material_code": {
          "filtered": false,
          "detected": false
        },
        "protected_material_text": {
          "filtered": false,
          "detected": false
        },
        "self_harm": {
          "filtered": false,
          "severity": "safe"
        },
        "sexual": {
          "filtered": false,
          "severity": "safe"
        },
        "violence": {
          "filtered": false,
          "severity": "safe"
        }
      },
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "annotations": [],
        "content": "- Use the Tokyo Metro and JR Pass for cost-effective and convenient transportation around the city.  \n- Explore popular neighborhoods like Shibuya, Shinjuku, and Asakusa, and don't miss local attractions such as Meiji Shrine a

## Streaming chat
Uses `stream=True` and reads SSE lines.

In [20]:
stream_payload = {
    "messages": [
        {"role": "user", "content": "Count from 1 to 50, one number per token."}
    ],
    "max_completion_tokens": 50,
    "stream": True,
}
# Using the response as a context manager closes the streamed connection even
# though the loop stops early at the [DONE] sentinel.
with post_json(
    f"/openai/deployments/{DEPLOYMENT_CHAT}/chat/completions",
    stream_payload,
    stream=True,
) as resp:
    for raw in iter_sse(resp):
        if raw == '[DONE]':
            break
        chunk = json.loads(raw)
        # Some chunks have empty choices (e.g., initial metadata, final usage stats)
        choices = chunk.get('choices', [])
        if not choices:
            continue
        content = choices[0].get('delta', {}).get('content')
        if content:
            # Flush each fragment so the text appears as it is streamed.
            print(content, end='', flush=True)
print()

1  
2  
3  
4  
5  
6  
7  
8  
9  
10  
11  
12  
13  
14  
15  
16  
17  
18  
19  
20  
21  
22  
23  
24  
25  



## Thinking model example (gpt-5-nano)
Let the model use reasoning tokens; output may be empty if all tokens are used for reasoning.

In [21]:
# Note: Thinking models do NOT support temperature, top_p, or other sampling
# parameters, so the payload carries only the conversation.
thinking_payload = {
    "messages": [
        {"role": "user", "content": "What is 15 + 27? Explain briefly."},
    ],
}
resp = post_json(
    f"/openai/deployments/{DEPLOYMENT_THINKING}/chat/completions",
    thinking_payload,
)
data = resp.json()

# The visible answer can be empty when the reply has no content.
message = data['choices'][0]['message']
content = message.get('content')
answer = content if content else '(reasoning-only tokens)'
print('Answer:', answer)

# Report the reasoning-token count only when the usage details are present.
usage = data.get('usage', {})
details = usage.get('completion_tokens_details')
if details:
    reasoning_tokens = details.get('reasoning_tokens')
    print('Reasoning tokens:', reasoning_tokens)

Answer: 42. 
Reason: 15 = 10 + 5 and 27 = 20 + 7, so (10+20) + (5+7) = 30 + 12 = 42.
Reasoning tokens: 320


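## Structured output (JSON mode)
Requests a JSON object via `response_format={"type": "json_object"}`. This mode asks the model for a valid JSON object but does not enforce a schema, so the prompt itself must ask for JSON. Note: if your middleware version strips unknown fields, add them to `ChatCompletionRequest` or allow extras.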

In [22]:
import os
import json
from openai import AzureOpenAI
from datetime import datetime
from zoneinfo import ZoneInfo

# Middleware connection settings, read from the environment variables named in
# the Configuration section; the literals are the local-development fallbacks.
endpoint = os.environ.get("MIDDLEWARE_URL", "http://localhost:8000")  # Base URL the SDK targets
api_key = os.environ.get("MIDDLEWARE_API_KEY", "sk-12345678")
deployment_name = os.environ.get("CHAT_MODEL", "gpt-4.1-nano")  # Chat deployment name

# Build the SDK client from the settings above.
client = AzureOpenAI(
    azure_endpoint=endpoint,
    api_key=api_key,
    api_version="2024-12-01-preview",
)

# The system prompt asks for JSON and response_format selects JSON-object mode.
json_mode_messages = [
    {"role": "system", "content": "You are a helpful assistant designed to output JSON."},
    {"role": "user", "content": "Who won the world series in 2020?"},
]
response = client.chat.completions.create(
    model=deployment_name,  # must match the name of your model deployment
    messages=json_mode_messages,
    response_format={"type": "json_object"},
)
print(response.choices[0].message.content)

{
  "winner": "Los Angeles Dodgers",
  "series": "2020 World Series"
}


## Tool calling
Send tool definitions; inspect tool calls in the response. Note: ensure your middleware forwards `tools` and `tool_choice` without stripping them.

In [23]:
import os
import json
from openai import AzureOpenAI
from datetime import datetime
from zoneinfo import ZoneInfo

# Middleware connection settings, read from the environment variables named in
# the Configuration section; the literals are the local-development fallbacks.
endpoint = os.environ.get("MIDDLEWARE_URL", "http://localhost:8000")  # Base URL the SDK targets
api_key = os.environ.get("MIDDLEWARE_API_KEY", "sk-12345678")
deployment_name = os.environ.get("CHAT_MODEL", "gpt-4.1-nano")  # Chat deployment name

# Build the SDK client from the settings above.
client = AzureOpenAI(
    azure_endpoint=endpoint,
    api_key=api_key,
    api_version="2024-12-01-preview",
)

# Simplified timezone data: lowercase city name -> IANA timezone identifier.
TIMEZONE_DATA = {
    "tokyo": "Asia/Tokyo",
    "san francisco": "America/Los_Angeles",
    "paris": "Europe/Paris"
}

def get_current_time(location):
    """Get the current time for a given location.

    Args:
        location: Free-form location text. It matches a city when a key of
            ``TIMEZONE_DATA`` occurs anywhere in it, case-insensitively.
            Non-string values (for example ``None`` when the tool call omits
            the argument) are treated as an unknown location.

    Returns:
        A JSON string with ``location`` (echoed back unchanged) and
        ``current_time``, formatted as ``HH:MM AM/PM`` in the matched
        timezone, or ``"unknown"`` when no city matches.
    """
    print(f"get_current_time called with location: {location}")
    # Guard against a missing or non-text argument: the caller passes
    # function_args.get("location"), which is None when the model omits it.
    location_lower = location.lower() if isinstance(location, str) else ""

    for key, timezone in TIMEZONE_DATA.items():
        if key in location_lower:
            print(f"Timezone found for {key}")
            current_time = datetime.now(ZoneInfo(timezone)).strftime("%I:%M %p")
            return json.dumps({
                "location": location,
                "current_time": current_time
            })

    print(f"No timezone data found for {location_lower}")
    return json.dumps({"location": location, "current_time": "unknown"})

def run_conversation():
    """Run one forced tool-call round trip and return the model's final text.

    The first request forces a call to ``get_current_time``. Each such call is
    executed locally and its result appended as a ``tool`` message. A second
    request then asks the model for the final answer.
    """
    # Initial user message
    messages = [{"role": "user", "content": "What's the current time in San Francisco"}]  # Single function call
    # Alternative prompt that triggers parallel calls of the single tool defined below:
    #messages = [{"role": "user", "content": "What's the current time in San Francisco, Tokyo, and Paris?"}] # Parallel function call with a single tool/function defined

    # Tool definition advertised to the model.
    location_param = {
        "type": "string",
        "description": "The city name, e.g. San Francisco",
    }
    time_tool = {
        "type": "function",
        "function": {
            "name": "get_current_time",
            "description": "Get the current time in a given location",
            "parameters": {
                "type": "object",
                "properties": {"location": location_param},
                "required": ["location"],
            },
        },
    }
    tools = [time_tool]

    # Forces this specific function rather than letting the model decide.
    forced_choice = {
        "type": "function",
        "function": {"name": "get_current_time"},
    }

    # First API call: ask the model to use the function
    response = client.chat.completions.create(
        model=deployment_name,
        messages=messages,
        tools=tools,
        tool_choice=forced_choice,
    )

    # The assistant message must be in the history before any tool results.
    response_message = response.choices[0].message
    messages.append(response_message)

    print("Model's response:")
    print(response_message)

    # Execute each requested tool call and record its result.
    if not response_message.tool_calls:
        print("No tool calls were made by the model.")
    else:
        for tool_call in response_message.tool_calls:
            if tool_call.function.name != "get_current_time":
                continue
            function_args = json.loads(tool_call.function.arguments)
            print(f"Function arguments: {function_args}")
            time_response = get_current_time(location=function_args.get("location"))
            tool_message = {
                "tool_call_id": tool_call.id,
                "role": "tool",
                "name": "get_current_time",
                "content": time_response,
            }
            messages.append(tool_message)

    # Second API call: get the final response from the model
    final_response = client.chat.completions.create(
        model=deployment_name,
        messages=messages,
    )
    return final_response.choices[0].message.content

# Run the conversation and print the result
print(run_conversation())

Model's response:
ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool_calls=[ChatCompletionMessageFunctionToolCall(id='call_OapbiMJQkHd4d0m2wuTKliSv', function=Function(arguments='{"location":"San Francisco"}', name='get_current_time'), type='function')])
Function arguments: {'location': 'San Francisco'}
get_current_time called with location: San Francisco
Timezone found for san francisco
The current time in San Francisco is 2:16 AM.


## Vision: sending a photo (image_url)
Uses a tiny inline PNG. Replace with your own base64 or hosted URL. Note: make sure your deployment supports vision.

## Vision: base64 from repo image
Encode `examples/element/example.png` to base64 and send as `image_url`. This avoids embedding large binaries directly in the notebook.

In [24]:
import os
from openai import AzureOpenAI
import base64
from mimetypes import guess_type

# Middleware connection settings, read from the environment variables named in
# the Configuration section; the literals are the local-development fallbacks.
endpoint = os.environ.get("MIDDLEWARE_URL", "http://localhost:8000")  # Base URL the SDK targets
api_key = os.environ.get("MIDDLEWARE_API_KEY", "sk-12345678")
deployment_name = os.environ.get("CHAT_MODEL", "gpt-4.1-nano")  # Chat deployment name

# Build the SDK client from the settings above.
client = AzureOpenAI(
    azure_endpoint=endpoint,
    api_key=api_key,
    api_version="2024-12-01-preview",
)

# Turn a local file into a base64 data URL
def local_image_to_data_url(image_path):
    """Return the file at ``image_path`` encoded as a ``data:`` URL.

    The MIME type is guessed from the file name and falls back to
    ``application/octet-stream`` when it cannot be determined.
    """
    guessed = guess_type(image_path)[0]
    mime_type = "application/octet-stream" if guessed is None else guessed
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes).decode("utf-8")
    return f"data:{mime_type};base64,{encoded}"
# ---------------------------
# Image input: encode the local file as a data URL
# ---------------------------
image_path = r"P:\Alan\Github\AzureMiddleware\examples\element\example.png"  # Replace with your local image path
data_url = local_image_to_data_url(image_path)

# ---------------------------
# Chat completion call: one user turn combining a text part and an image part
# ---------------------------
vision_user_content = [
    {"type": "text", "text": "Describe this picture:"},
    {"type": "image_url", "image_url": {"url": data_url}},
]
vision_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": vision_user_content},
]
response = client.chat.completions.create(
    model=deployment_name,
    messages=vision_messages,
)

# ---------------------------
# Show the assistant's description
# ---------------------------
print(response.choices[0].message.content)


The picture shows a dog standing on four soda cans, with a hat placed on its head. The cans are labeled with humorous text: 
- The front can is labeled "Indian guys on YouTube."
- The middle can is labeled "Stack Overflow."
- The back can is labeled "Luck."
- The dog itself has the caption "My code," suggesting the dog's balancing act represents the challenges or chaos of coding. 
The setting appears to be against a tiled wall on a red cloth or surface.


## Embeddings
Create embeddings and inspect dimensions.

In [14]:
# Embed two texts, requesting 256-dimensional float vectors.
emb_payload = {
    "input": ["First text", "Second text"],
    "dimensions": 256,
    "encoding_format": "float",
}
resp = post_json(
    f"/openai/deployments/{DEPLOYMENT_EMBEDDING}/embeddings",
    emb_payload,
)
data = resp.json()

# Report the number of returned vectors and the length of the first one.
vectors = data['data']
first_dimension = len(vectors[0]['embedding'])
print('Vectors:', len(vectors), 'Dimension of first vector:', first_dimension)

Vectors: 2 Dimension of first vector: 256


In [15]:
from langchain_openai import AzureOpenAIEmbeddings

# LangChain embeddings client pointed at the middleware.
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint="http://localhost:8000",
    api_key="sk-12345678",
    openai_api_version="2024-12-01-preview",
    model="text-embedding-3-large",
)

text = "LangChain is the framework for building context-aware reasoning applications"
text2 = "LangGraph is a library for building stateful, multi-actor applications with LLMs"

# Embed both documents in a single call.
two_vectors = embeddings.embed_documents([text, text2])
for vector in two_vectors:
    preview = str(vector)[:100]  # Show the first 100 characters of the vector
    print(preview)

[-0.019244952127337456, 0.0037762185093015432, -0.03293963521718979, 0.0037592509761452675, 0.008121
[-0.01016364898532629, 0.02342759631574154, -0.04225384443998337, -0.0015080638695508242, -0.0235117


## Concurrent requests example (LangChain)
Sends several prompts in parallel through `AzureChatOpenAI` using a thread pool and reports the latency of each call.

In [3]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain_openai import AzureChatOpenAI
from langchain.messages import HumanMessage
import time

# -----------------------------
# Shared LangChain chat model used by every worker thread
# -----------------------------
llm = AzureChatOpenAI(
    azure_endpoint="http://localhost:8000",
    api_key="sk-12345678",
    openai_api_version="2024-02-01",
    model="gpt-4.1-nano",
    max_tokens=200,
    temperature=0.2,
)

# -----------------------------
# Worker function (one call per pool thread)
# -----------------------------
def invoke_llm(prompt: str) -> dict:
    """Send one prompt to the shared ``llm`` client and time the call.

    Each thread runs this function.
    NOTE(review): this relies on the shared ``llm`` object being safe to call
    from several threads at once — confirm against the client documentation.

    Args:
        prompt: Text sent as a single human message.

    Returns:
        A dict with ``prompt``, ``response`` (the reply content) and
        ``latency_sec`` (elapsed seconds, rounded to two decimals).
    """
    # perf_counter is monotonic, so the measured latency cannot be skewed by
    # wall-clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    response = llm.invoke(
        [HumanMessage(content=prompt)]
    )

    return {
        "prompt": prompt,
        "response": response.content,
        "latency_sec": round(time.perf_counter() - start_time, 2),
    }

# -----------------------------
# Thread-pool fan-out over all prompts
# -----------------------------
def run_multithreaded(prompts, max_workers=5):
    """Run ``invoke_llm`` for every prompt on a thread pool.

    Results are collected in completion order, not submission order. A prompt
    whose call raises is reported as ``{"prompt": ..., "error": ...}`` instead
    of aborting the whole batch.
    """
    collected = []

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Remember which prompt each future belongs to for error reporting.
        pending = {}
        for prompt in prompts:
            pending[pool.submit(invoke_llm, prompt)] = prompt

        for finished in as_completed(pending):
            try:
                collected.append(finished.result())
            except Exception as exc:
                failure = {"prompt": pending[finished], "error": str(exc)}
                collected.append(failure)

    return collected

# -----------------------------
# Example usage
# -----------------------------
if __name__ == "__main__":
    base_prompts = [
        "Explain Value at Risk in simple terms.",
        "Summarize Basel III market risk framework.",
        "What is expected shortfall?",
        "Explain stress testing in banks.",
        "Difference between VaR and ES?",
    ]
    # The same five prompts are submitted twice (ten requests in total).
    prompts = base_prompts * 2

    outputs = run_multithreaded(prompts, max_workers=4)

    for o in outputs:
        print("=" * 50)
        print("PROMPT:", o.get("prompt"))
        # Failed calls carry an "error" key instead of response/latency; show
        # it rather than printing "RESPONSE: None".
        if "error" in o:
            print("ERROR:", o["error"])
            continue
        print("RESPONSE:", o.get("response"))
        print("LATENCY:", o.get("latency_sec"))


PROMPT: Summarize Basel III market risk framework.
RESPONSE: The Basel III market risk framework is a set of international banking regulations developed by the Basel Committee on Banking Supervision to strengthen the resilience of banks against market risks. It builds upon the previous Basel II standards and introduces more rigorous standards for measuring, managing, and capitalizing against market risk exposures.

**Key Components of the Basel III Market Risk Framework:**

1. **Scope and Objectives:**
   - Enhances the measurement and management of market risks, including interest rate risk, equity risk, foreign exchange risk, and commodity risk.
   - Applies to all banking organizations with significant trading activities.

2. **Standardized and Internal Models Approaches:**
   - **Standardized Approach:** Provides a consistent, regulator-approved method for calculating market risk capital requirements based on predefined risk weights and sensitivities.
   - **Internal Models Approac