# LLM Backend Onboarding Tests

This notebook provides comprehensive testing for LLM backend onboarding deployments. It validates:
- Backend configuration and connectivity
- Load balancing across backend pools
- Circuit breaker functionality
- Response latency and performance

## Prerequisites
- Azure subscription with deployed APIM instance
- LLM backends onboarded via the `llm-backend-onboarding` Bicep deployment
- Python 3.11+ with required packages

In [None]:
# Install required packages
%pip install azure-identity azure-mgmt-apimanagement openai requests pandas matplotlib httpx tenacity --quiet

In [None]:
import os
import json
import time
import asyncio
from datetime import datetime
from collections import defaultdict
from typing import Optional

import httpx
import requests
import pandas as pd
import matplotlib.pyplot as plt
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from azure.mgmt.apimanagement import ApiManagementClient
from openai import AzureOpenAI
from tenacity import retry, stop_after_attempt, wait_exponential

# Configuration - Update these values
# Each setting is read from an environment variable; the "<your-...>" placeholder
# is used when the variable is unset and must be replaced before running.
SUBSCRIPTION_ID = os.environ.get("AZURE_SUBSCRIPTION_ID", "<your-subscription-id>")
RESOURCE_GROUP = os.environ.get("APIM_RESOURCE_GROUP", "<your-rg>")
APIM_NAME = os.environ.get("APIM_NAME", "<your-apim-name>")
# When APIM_GATEWAY_URL is unset, the gateway URL is derived from APIM_NAME.
APIM_GATEWAY_URL = os.environ.get("APIM_GATEWAY_URL", f"https://{APIM_NAME}.azure-api.net")
# Sent as the "Ocp-Apim-Subscription-Key" header by the HTTP tests below.
APIM_SUBSCRIPTION_KEY = os.environ.get("APIM_SUBSCRIPTION_KEY", "<your-subscription-key>")

# Initialize Azure credentials
# The credential object is passed to the APIM management client in the next cell.
credential = DefaultAzureCredential()
print(f"Configuration loaded for APIM: {APIM_NAME}")

## 1. Verify LLM Backend Configuration

Retrieve and validate the current LLM backend configuration from APIM.

In [None]:
# Initialize APIM Management Client
# Shared by the helper functions in this notebook that list or delete
# backends and policy fragments.
apim_client = ApiManagementClient(credential, SUBSCRIPTION_ID)

def get_llm_backends():
    """Return the APIM backends whose name starts with ``llm-``.

    Lists every backend of the APIM service identified by ``RESOURCE_GROUP``
    and ``APIM_NAME`` and keeps those with a non-empty name carrying the
    ``llm-`` prefix.

    Returns:
        list: The matching backend objects, in the order the service lists them.
    """
    return [
        backend
        for backend in apim_client.backend.list_by_service(RESOURCE_GROUP, APIM_NAME)
        if backend.name and backend.name.startswith("llm-")
    ]

def get_backend_pools():
    """Return the APIM backends that are pools with at least one member.

    Lists every backend of the APIM service identified by ``RESOURCE_GROUP``
    and ``APIM_NAME`` and keeps those whose ``pool.services`` is non-empty.

    Returns:
        list: The backend objects that represent non-empty pools.
    """
    backend_pools = []
    for candidate in apim_client.backend.list_by_service(RESOURCE_GROUP, APIM_NAME):
        # Read `pool` defensively, mirroring the hasattr() guard this notebook
        # uses for `circuit_breaker`.
        # NOTE(review): SDK releases built against older API versions appear
        # not to define `pool` on the backend model - confirm.
        pool = getattr(candidate, "pool", None)
        if pool and getattr(pool, "services", None):
            backend_pools.append(candidate)
    return backend_pools

def get_policy_fragments():
    """Return the APIM policy fragments whose name contains ``llm``.

    The match is case-insensitive and fragments without a name are skipped.

    Returns:
        list: The matching policy fragment objects.
    """
    matching = []
    for fragment in apim_client.policy_fragment.list_by_service(RESOURCE_GROUP, APIM_NAME):
        fragment_name = fragment.name
        if fragment_name and "llm" in fragment_name.lower():
            matching.append(fragment)
    return matching

# Display backend configuration
# Prints three sections: LLM backends, backend pools, and LLM policy fragments.
print("=" * 60)
print("LLM BACKEND CONFIGURATION")
print("=" * 60)

backends = get_llm_backends()
print(f"\nüì¶ Found {len(backends)} LLM backends:")
for b in backends:
    print(f"  - {b.name}: {b.url}")
    # hasattr guard: the attribute is only reported when the backend object
    # exposes it and it is set.
    if hasattr(b, 'circuit_breaker') and b.circuit_breaker:
        print(f"    Circuit Breaker: enabled")

pools = get_backend_pools()
print(f"\nüîÑ Found {len(pools)} backend pools:")
for p in pools:
    # get_backend_pools() already filters on this condition; the check here
    # is a defensive repeat.
    if p.pool and p.pool.services:
        print(f"  - {p.name}:")
        for svc in p.pool.services:
            # priority/weight fall back to 'N/A' when the member has no such attribute.
            print(f"    ‚Ä¢ {svc.id} (priority: {getattr(svc, 'priority', 'N/A')}, weight: {getattr(svc, 'weight', 'N/A')})")

fragments = get_policy_fragments()
print(f"\nüìã Found {len(fragments)} LLM policy fragments:")
for f in fragments:
    print(f"  - {f.name}")

## 2. Test LLM API Connectivity

Test basic connectivity to the Universal LLM API through APIM.

In [None]:
# Test API connectivity
INFERENCE_API_PATH = "/inference"  # Update based on your deployment

def test_chat_completion(model: str, prompt: str = "Hello, how are you?") -> dict:
    """POST a single-turn chat completion to the APIM gateway and time it.

    Args:
        model: Model name placed in the request body.
        prompt: Content of the single user message.

    Returns:
        A dict with ``status_code``, ``elapsed_ms`` (wall-clock time of the
        POST, rounded to two decimals), ``response`` (parsed JSON on HTTP 200,
        otherwise the raw body text) and ``headers`` (the response headers
        copied into a plain dict).
    """
    endpoint = f"{APIM_GATEWAY_URL}{INFERENCE_API_PATH}/chat/completions"
    request_headers = {
        "Content-Type": "application/json",
        "Ocp-Apim-Subscription-Key": APIM_SUBSCRIPTION_KEY,
    }
    body = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "max_tokens": 100
    }

    began = time.time()
    reply = requests.post(endpoint, headers=request_headers, json=body, timeout=30)
    duration_s = time.time() - began

    # Only a 200 body is parsed as JSON; anything else is returned verbatim.
    if reply.status_code == 200:
        parsed_body = reply.json()
    else:
        parsed_body = reply.text

    return {
        "status_code": reply.status_code,
        "elapsed_ms": round(duration_s * 1000, 2),
        "response": parsed_body,
        "headers": dict(reply.headers)
    }

# Test with a sample model
TEST_MODEL = "gpt-4o"  # Update based on your deployed models

print(f"Testing chat completion with model: {TEST_MODEL}")
result = test_chat_completion(TEST_MODEL)

# Pick the icon from the actual status so a failed call is not shown as a success.
status_icon = "‚úÖ" if result['status_code'] == 200 else "‚ùå"
print(f"\n{status_icon} Status: {result['status_code']}")
print(f"‚è±Ô∏è  Latency: {result['elapsed_ms']} ms")

if result['status_code'] == 200:
    # Tolerate an empty `choices` list and a null message/content instead of
    # raising IndexError/TypeError on an otherwise successful call.
    choices = result['response'].get('choices') or [{}]
    response_content = (choices[0].get('message') or {}).get('content') or ''
    print(f"üí¨ Response: {response_content[:200]}...")

    # Check for backend info in headers.
    # `headers` is a plain dict, so its keys keep whatever casing the gateway
    # sent; compare case-insensitively.
    backend_id = next(
        (value for name, value in result['headers'].items() if name.lower() == 'x-backend-id'),
        None,
    )
    if backend_id is not None:
        print(f"üéØ Backend: {backend_id}")
else:
    print(f"‚ùå Error: {result['response']}")

## 3. Test Load Balancing Across Backend Pools

Send multiple requests to verify load distribution across backends in a pool.

In [None]:
async def send_concurrent_requests(model: str, num_requests: int = 20) -> list:
    """Issue ``num_requests`` chat completions concurrently through APIM.

    All requests share one ``httpx.AsyncClient`` and are awaited together.

    Args:
        model: Model name placed in every request body.
        num_requests: Number of requests to launch.

    Returns:
        One dict per request, in launch order, with ``request_id``, ``status``,
        ``latency_ms``, ``backend`` (the ``x-backend-id`` response header or
        ``"unknown"``) and ``success``. A request that raised is reported with
        status ``0``, backend ``"error"`` and an extra ``error`` message.
    """
    endpoint = f"{APIM_GATEWAY_URL}{INFERENCE_API_PATH}/chat/completions"
    request_headers = {
        "Content-Type": "application/json",
        "Ocp-Apim-Subscription-Key": APIM_SUBSCRIPTION_KEY,
    }
    body = {
        "model": model,
        "messages": [{"role": "user", "content": "Say 'test' and nothing else."}],
        "max_tokens": 10
    }

    async with httpx.AsyncClient(timeout=30.0) as session:

        async def fire(index: int):
            # Each coroutine times its own request and never raises.
            began = time.time()
            try:
                reply = await session.post(endpoint, headers=request_headers, json=body)
                duration_s = time.time() - began
                return {
                    "request_id": index,
                    "status": reply.status_code,
                    "latency_ms": round(duration_s * 1000, 2),
                    "backend": reply.headers.get("x-backend-id", "unknown"),
                    "success": reply.status_code == 200
                }
            except Exception as exc:
                return {
                    "request_id": index,
                    "status": 0,
                    "latency_ms": round((time.time() - began) * 1000, 2),
                    "backend": "error",
                    "success": False,
                    "error": str(exc)
                }

        pending = [fire(index) for index in range(num_requests)]
        outcomes = await asyncio.gather(*pending)

    return outcomes

# Run load balancing test
# NOTE: the request count appears both in the message below and in the call;
# keep the two in sync when changing it.
print(f"Sending 20 concurrent requests to model: {TEST_MODEL}")
print("=" * 60)

results = await send_concurrent_requests(TEST_MODEL, 20)

# Analyze results
# `df` and `backend_counts` are reused by the visualization cell that follows.
df = pd.DataFrame(results)
success_count = df['success'].sum()
print(f"\n‚úÖ Successful requests: {success_count}/{len(results)}")
# Latency figures are computed over every request, including failed ones.
print(f"‚è±Ô∏è  Average latency: {df['latency_ms'].mean():.2f} ms")
print(f"‚è±Ô∏è  Min latency: {df['latency_ms'].min():.2f} ms")
print(f"‚è±Ô∏è  Max latency: {df['latency_ms'].max():.2f} ms")

# Backend distribution
# Counts include the placeholder values "unknown" (header absent) and
# "error" (request raised) produced by send_concurrent_requests.
print("\nüìä Backend Distribution:")
backend_counts = df['backend'].value_counts()
for backend, count in backend_counts.items():
    pct = (count / len(results)) * 100
    print(f"  {backend}: {count} requests ({pct:.1f}%)")

## 4. Visualize Backend Distribution

Create charts showing request distribution and latency across backends.

In [None]:
# Visualize distribution
# Left: share of requests per backend. Right: latency spread per backend,
# successful requests only. Reads `df` and `backend_counts` from the load
# balancing cell.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Pie chart - Request distribution
ax1 = axes[0]
colors = plt.cm.Pastel1.colors
ax1.pie(backend_counts.values, labels=backend_counts.index, autopct='%1.1f%%', colors=colors)
ax1.set_title('Request Distribution Across Backends')

# Box plot - Latency by backend
ax2 = axes[1]
df_success = df[df['success']]
# Computed unconditionally and under its own name: the statistics loop below
# must never fall back to the `backends` list of APIM objects from the
# configuration cell, and that list must not be overwritten here.
latency_backends = list(df_success['backend'].unique())
if latency_backends:
    latency_data = [df_success[df_success['backend'] == b]['latency_ms'].values for b in latency_backends]
    # Tick labels are set separately rather than via boxplot(labels=...),
    # which newer Matplotlib releases deprecate.
    bp = ax2.boxplot(latency_data, patch_artist=True)
    ax2.set_xticklabels(latency_backends)
    for position, patch in enumerate(bp['boxes']):
        # Cycle through the palette so every box is coloured even when there
        # are more backends than palette entries.
        patch.set_facecolor(colors[position % len(colors)])
    ax2.set_ylabel('Latency (ms)')
    ax2.set_title('Response Latency by Backend')
    ax2.tick_params(axis='x', rotation=45)
else:
    # Nothing succeeded: say so instead of leaving an unexplained empty axis.
    ax2.set_title('Response Latency by Backend')
    ax2.text(0.5, 0.5, 'No successful requests', ha='center', va='center', transform=ax2.transAxes)

plt.tight_layout()
plt.show()

print("\nüìà Latency Statistics by Backend:")
for backend in latency_backends:
    backend_df = df_success[df_success['backend'] == backend]
    print(f"\n  {backend}:")
    print(f"    Mean: {backend_df['latency_ms'].mean():.2f} ms")
    # Std is NaN for a backend that served a single request.
    print(f"    Std:  {backend_df['latency_ms'].std():.2f} ms")

## 5. Test Circuit Breaker Failover

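Send sequential requests and record which backend serves each one, so that failover can be observed. This cell does not induce failures itself; to exercise the circuit breaker, take a backend out of service while the requests are running.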

In [None]:
def test_failover_behavior(model: str, num_requests: int = 10) -> list:
    """
    Test circuit breaker failover by monitoring backend switching.

    Sends ``num_requests`` chat completions one after another, pausing 0.5 s
    between consecutive requests, and records which backend served each one.

    Note: To truly test failover, you would need to:
    1. Disable a backend in APIM
    2. Observe requests redirecting to healthy backends
    3. Re-enable the backend

    Args:
        model: Model name placed in every request body.
        num_requests: Number of sequential requests to send.

    Returns:
        A list with one dict per request, in send order, containing
        ``request_num`` (1-based), ``timestamp`` (ISO format, local time),
        ``status``, ``backend`` (the ``x-backend-id`` response header or
        ``"unknown"``), ``latency_ms`` and ``success``. A request that raised
        is recorded with status ``0``, backend ``"error"``, the time elapsed
        until the failure, and an extra ``error`` message.
    """
    url = f"{APIM_GATEWAY_URL}{INFERENCE_API_PATH}/chat/completions"

    headers = {
        "Content-Type": "application/json",
        "Ocp-Apim-Subscription-Key": APIM_SUBSCRIPTION_KEY,
    }

    payload = {
        "model": model,
        "messages": [{"role": "user", "content": "Test"}],
        "max_tokens": 5
    }

    timeline = []

    for i in range(num_requests):
        start = time.time()
        try:
            response = requests.post(url, headers=headers, json=payload, timeout=30)
        except Exception as e:
            timeline.append({
                "request_num": i + 1,
                "timestamp": datetime.now().isoformat(),
                "status": 0,
                "backend": "error",
                # Record how long the failed attempt took (e.g. a full
                # timeout) rather than 0, matching send_concurrent_requests.
                "latency_ms": round((time.time() - start) * 1000, 2),
                "success": False,
                "error": str(e)
            })
        else:
            elapsed = time.time() - start
            timeline.append({
                "request_num": i + 1,
                "timestamp": datetime.now().isoformat(),
                "status": response.status_code,
                "backend": response.headers.get("x-backend-id", "unknown"),
                "latency_ms": round(elapsed * 1000, 2),
                "success": response.status_code == 200
            })

        # Small delay between requests; nothing follows the last one, so skip it there.
        if i < num_requests - 1:
            time.sleep(0.5)

    return timeline

# Run the sequential failover probe and print one table row per request.
print("Testing failover behavior (sequential requests with delays)...")
print("=" * 60)

timeline = test_failover_behavior(TEST_MODEL, 10)

# Display timeline
print("\nüìä Request Timeline:")
print("-" * 70)
print(f"{'#':<4} {'Backend':<30} {'Status':<8} {'Latency':<12}")
print("-" * 70)

for entry in timeline:
    status_icon = "‚úÖ" if entry['success'] else "‚ùå"
    print(f"{entry['request_num']:<4} {entry['backend']:<30} {status_icon} {entry['status']:<5} {entry['latency_ms']:<10} ms")

# Check for backend switches
# Only successful requests are considered; a switch is counted whenever two
# consecutive successful requests report different backends.
backends_used = [e['backend'] for e in timeline if e['success']]
unique_backends = set(backends_used)
print(f"\nüîÑ Backends used: {', '.join(unique_backends)}")
print(f"üìà Backend switches detected: {sum(1 for i in range(1, len(backends_used)) if backends_used[i] != backends_used[i-1])}")

## 6. Test Multiple Models

Verify that different models are routed to appropriate backends.

In [None]:
# Define models to test - Update based on your deployment
MODELS_TO_TEST = [
    "gpt-4o",
    "gpt-4o-mini", 
    "text-embedding-ada-002"
]

def test_model_routing(models: list) -> dict:
    """Send one request per model and report which backend answered.

    Models whose name contains ``embedding`` (case-insensitive) are sent to
    the embeddings endpoint; every other model is sent to chat completions.
    Progress is printed as each model is tested.

    Args:
        models: Model names to probe.

    Returns:
        A dict keyed by model name. Each value holds ``status``, ``backend``
        (the ``x-backend-id`` response header or ``"unknown"``),
        ``latency_ms`` and ``success``; a request that raised is recorded with
        status ``0``, backend ``"error"``, latency ``0`` and an ``error``
        message.
    """
    outcomes = {}

    for model in models:
        print(f"\nüîç Testing model: {model}")

        request_headers = {
            "Content-Type": "application/json",
            "Ocp-Apim-Subscription-Key": APIM_SUBSCRIPTION_KEY,
        }

        # Choose endpoint and body together so the two cannot drift apart.
        if "embedding" in model.lower():
            endpoint = f"{APIM_GATEWAY_URL}{INFERENCE_API_PATH}/embeddings"
            body = {
                "model": model,
                "input": "Test embedding"
            }
        else:
            endpoint = f"{APIM_GATEWAY_URL}{INFERENCE_API_PATH}/chat/completions"
            body = {
                "model": model,
                "messages": [{"role": "user", "content": "Hello"}],
                "max_tokens": 10
            }

        try:
            began = time.time()
            reply = requests.post(endpoint, headers=request_headers, json=body, timeout=30)
            duration_s = time.time() - began

            record = {
                "status": reply.status_code,
                "backend": reply.headers.get("x-backend-id", "unknown"),
                "latency_ms": round(duration_s * 1000, 2),
                "success": reply.status_code == 200
            }
            outcomes[model] = record

            if reply.status_code == 200:
                status_icon = "‚úÖ"
            else:
                status_icon = "‚ùå"
            print(f"  {status_icon} Status: {reply.status_code}")
            print(f"  üéØ Backend: {record['backend']}")
            print(f"  ‚è±Ô∏è  Latency: {record['latency_ms']} ms")

        except Exception as exc:
            outcomes[model] = {
                "status": 0,
                "backend": "error",
                "latency_ms": 0,
                "success": False,
                "error": str(exc)
            }
            print(f"  ‚ùå Error: {str(exc)}")

    return outcomes

# Probe every model in MODELS_TO_TEST, then print the results as a table.
print("Testing model routing...")
print("=" * 60)

model_results = test_model_routing(MODELS_TO_TEST)

# Summary
print("\n" + "=" * 60)
print("MODEL ROUTING SUMMARY")
print("=" * 60)

# One row per model; the column selection drops the optional `error` column
# that is only present when a request raised.
summary_df = pd.DataFrame.from_dict(model_results, orient='index')
summary_df.index.name = 'Model'
print(summary_df[['status', 'backend', 'latency_ms', 'success']].to_string())

## 7. Test Using OpenAI SDK

Verify compatibility with the official OpenAI Python SDK.

In [None]:
# Test with OpenAI SDK
def test_with_openai_sdk():
    """Test LLM API using the official OpenAI Python SDK.

    Sends one chat completion for ``TEST_MODEL`` through an ``AzureOpenAI``
    client pointed at the APIM gateway and prints the outcome.

    Returns:
        bool: True when the SDK call returned a response, False when it raised.
    """

    # Configure client for APIM endpoint
    # NOTE(review): unlike the raw HTTP tests, this client is not given
    # INFERENCE_API_PATH and does not set the "Ocp-Apim-Subscription-Key"
    # header explicitly - confirm the gateway accepts the SDK's default URL
    # layout and key header.
    client = AzureOpenAI(
        azure_endpoint=APIM_GATEWAY_URL,
        api_key=APIM_SUBSCRIPTION_KEY,
        api_version="2024-10-21"
    )

    print("Testing with OpenAI SDK...")
    print("=" * 60)

    # Only the API call sits inside the try block, so a problem while
    # formatting the output cannot be misreported as a failed request.
    try:
        start = time.time()
        response = client.chat.completions.create(
            model=TEST_MODEL,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "What is 2 + 2? Answer with just the number."}
            ],
            max_tokens=10
        )
        elapsed = time.time() - start
    except Exception as e:
        print(f"‚ùå Error: {str(e)}")
        return False

    # `choices` can be empty and `usage` can be absent on a successful
    # response; report placeholders instead of raising.
    content = response.choices[0].message.content if response.choices else ""
    total_tokens = response.usage.total_tokens if response.usage is not None else "N/A"

    print(f"‚úÖ Success!")
    print(f"üìù Model: {response.model}")
    print(f"üí¨ Response: {content}")
    print(f"üî¢ Tokens: {total_tokens}")
    print(f"‚è±Ô∏è  Latency: {round(elapsed * 1000, 2)} ms")

    return True

sdk_success = test_with_openai_sdk()

## 8. Latency Comparison Test

Run extended tests to measure latency distribution across all backends.

In [None]:
async def extended_latency_test(model: str, num_requests: int = 50) -> pd.DataFrame:
    """Fire a batch of concurrent requests and tabulate the per-request results.

    Args:
        model: Model name forwarded to ``send_concurrent_requests``.
        num_requests: Number of concurrent requests to send.

    Returns:
        A DataFrame with one row per request, built from the result dicts of
        ``send_concurrent_requests``.
    """
    print(f"Running extended latency test ({num_requests} requests)...")
    return pd.DataFrame(await send_concurrent_requests(model, num_requests))

# Run extended test
latency_df = await extended_latency_test(TEST_MODEL, 50)

# Visualize results
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. Latency histogram
ax1 = axes[0, 0]
latency_df[latency_df['success']]['latency_ms'].hist(bins=20, ax=ax1, color='steelblue', edgecolor='white')
ax1.set_xlabel('Latency (ms)')
ax1.set_ylabel('Frequency')
ax1.set_title('Latency Distribution')
ax1.axvline(latency_df['latency_ms'].mean(), color='red', linestyle='--', label=f"Mean: {latency_df['latency_ms'].mean():.0f}ms")
ax1.legend()

# 2. Latency over time
ax2 = axes[0, 1]
ax2.plot(latency_df['request_id'], latency_df['latency_ms'], marker='o', markersize=4, linestyle='-', alpha=0.7)
ax2.set_xlabel('Request #')
ax2.set_ylabel('Latency (ms)')
ax2.set_title('Latency Over Time')

# 3. Success rate by backend
ax3 = axes[1, 0]
success_by_backend = latency_df.groupby('backend')['success'].mean() * 100
success_by_backend.plot(kind='bar', ax=ax3, color='green', edgecolor='white')
ax3.set_ylabel('Success Rate (%)')
ax3.set_title('Success Rate by Backend')
ax3.set_ylim(0, 105)
ax3.tick_params(axis='x', rotation=45)

# 4. Request count by backend
ax4 = axes[1, 1]
backend_counts = latency_df['backend'].value_counts()
backend_counts.plot(kind='bar', ax=ax4, color='orange', edgecolor='white')
ax4.set_ylabel('Request Count')
ax4.set_title('Request Distribution by Backend')
ax4.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Print summary statistics
print("\n" + "=" * 60)
print("EXTENDED LATENCY TEST SUMMARY")
print("=" * 60)
print(f"\nTotal Requests: {len(latency_df)}")
print(f"Successful: {latency_df['success'].sum()} ({latency_df['success'].mean()*100:.1f}%)")
print(f"\nLatency Percentiles:")
print(f"  P50: {latency_df['latency_ms'].quantile(0.50):.0f} ms")
print(f"  P90: {latency_df['latency_ms'].quantile(0.90):.0f} ms")
print(f"  P95: {latency_df['latency_ms'].quantile(0.95):.0f} ms")
print(f"  P99: {latency_df['latency_ms'].quantile(0.99):.0f} ms")

## 9. Cleanup Test Resources

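Optional cleanup. Note that the cell below removes the onboarded `llm-` backends and the LLM policy fragments themselves, not just temporary test artifacts. It runs in dry-run mode and only lists resources unless `dry_run=False` is passed.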

In [None]:
# Cleanup function for LLM backend onboarding resources
def cleanup_llm_backends(prefix: str = "llm-", dry_run: bool = True):
    """
    Remove LLM backend resources from APIM.

    Backends are selected by name prefix. Policy fragments are selected by
    ``get_policy_fragments()`` and are not affected by ``prefix``. A failed
    deletion is printed and the remaining resources are still processed.

    Args:
        prefix: Prefix to identify LLM backend resources
        dry_run: If True, only list resources without deleting
    """
    print(f"{'DRY RUN - ' if dry_run else ''}Cleaning up LLM backend resources...")
    print("=" * 60)

    # List backends to delete.
    # Filter on `prefix` here so the argument is honoured; with the default
    # "llm-" this selects the same backends as get_llm_backends().
    backends = [
        b
        for b in apim_client.backend.list_by_service(RESOURCE_GROUP, APIM_NAME)
        if b.name and b.name.startswith(prefix)
    ]
    print(f"\nüì¶ Found {len(backends)} LLM backends to remove:")
    for b in backends:
        print(f"  - {b.name}")
        if not dry_run:
            try:
                # if_match="*" deletes regardless of the resource's current ETag.
                apim_client.backend.delete(RESOURCE_GROUP, APIM_NAME, b.name, if_match="*")
                print(f"    ‚úÖ Deleted")
            except Exception as e:
                print(f"    ‚ùå Error: {e}")

    # List policy fragments to delete
    fragments = get_policy_fragments()
    print(f"\nüìã Found {len(fragments)} LLM policy fragments to remove:")
    for f in fragments:
        print(f"  - {f.name}")
        if not dry_run:
            try:
                apim_client.policy_fragment.delete(RESOURCE_GROUP, APIM_NAME, f.name, if_match="*")
                print(f"    ‚úÖ Deleted")
            except Exception as e:
                print(f"    ‚ùå Error: {e}")

    if dry_run:
        print("\n‚ö†Ô∏è  DRY RUN - No resources were deleted.")
        print("   Set dry_run=False to actually delete resources.")

# Run cleanup in dry run mode first
cleanup_llm_backends(dry_run=True)

# Uncomment the following line to actually delete resources:
# cleanup_llm_backends(dry_run=False)

---

## Summary

This notebook validated:
- ✅ LLM backend configuration retrieval
- ✅ API connectivity through APIM
- ✅ Load balancing across backend pools
- ✅ Backend distribution visualization
- ✅ Circuit breaker failover behavior
- ✅ Multi-model routing
- ✅ OpenAI SDK compatibility
- ✅ Latency performance metrics

For more information, see the [LLM Backend Onboarding Guide](../guides/LLM-Backend-Onboarding-Guide.md).