## Deploying NIMs on AzureML using AzureML CLI

**Prerequisites:**
- AzureML account with minimum 24 vCPUs of NC Series A100 GPU provisioned.
- Host machine (CPU only) to download the NIM container and model. It needs at least 50 GB plus **your LLM model size** (in GB) of free disk space to store the container and model.
- [Azure CLI](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli)
- [Azure ML extension](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-configure-cli?view=azureml-api-2&tabs=public)
- [NGC API Key](https://catalog.ngc.nvidia.com/)

In [None]:
import subprocess
import requests
import os
import json

### Fill the configuration variables

Example:
subscription_id = "xxxxxxxxxxxxxxxxxxxxxx"  
resource_group = "nim-demo"  
workspace = "nim-test"  
location = "westeurope"  # e.g., "southcentralus", "westeurope"  
ngc_container = "nvcr.io/nim/microsoft/phi-3-mini-4k-instruct:1.2.3"  
ngc_api_key = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"  
email_address = "vikalluru@nvidia.com"  
endpoint_name = "phi3-nim-endpoint-aml-1"  
deployment_name = "phi3-nim-deployment-aml-1"  
instance_type = "Standard_NC24ads_A100_v4"  
acr_registry_name = "nimdemocr"  

In [None]:
# --- Azure subscription and AzureML workspace ---
subscription_id = "<your-azure-subscription-id>"
resource_group = "<your-resource-group>"
workspace = "<your-azureml-workspace-name>"
location = "<your-azureml-region>"  # e.g., "southcentralus", "westeurope"

# --- Azure Container Registry that gets linked to the workspace ---
acr_registry_name = "<your-azureml-registry-name>"

# --- NGC credentials and the NIM image to deploy ---
ngc_api_key = "<your-ngc-api-key>"
ngc_container = "nvcr.io/nim/microsoft/phi-3-mini-4k-instruct:1.2.3"

# --- Account used as the assignee in the role assignments below ---
email_address = "<your-email-address>"

# --- AzureML managed online endpoint and deployment ---
endpoint_name = "phi3-nim-endpoint-aml-1"
deployment_name = "phi3-nim-deployment-aml-1"
instance_type = "Standard_NC24ads_A100_v4"

In [None]:
def run_command(command, description="", return_output=False):
    """Run an external command and report the outcome.

    Args:
        command: Argument list handed to ``subprocess.run`` (no shell is used).
        description: Human-readable label shown in the status messages.
        return_output: When true, capture the command's output and return the
            stripped stdout instead of printing a success message.

    Returns:
        The stripped stdout when ``return_output`` is true and the command
        succeeds; ``None`` in every other case, including a failed command.

    Note:
        The full command line is echoed in the status messages, so any secret
        passed as an argument ends up in the cell output.
    """
    try:
        if return_output:
            completed = subprocess.run(
                command,
                check=True,
                capture_output=True,
                text=True
            )
            return completed.stdout.strip()
        subprocess.run(command, check=True)
        print(f"\033[92mSuccess: {description}\033[0m \n Command: {' '.join(command)}")
    except subprocess.CalledProcessError as e:
        print(f"\033[91mError: {description}\033[0m \n Command: {' '.join(command)}", e)
        # When output is captured the child's stderr never reaches the console,
        # so print it here; otherwise the reason for the failure is lost.
        if e.stderr:
            print(e.stderr.strip())
    return None

### Create new workspace

In [None]:
# Authenticate the Azure CLI using the device-code flow.
run_command(
    command=["az", "login", "--use-device-code"],
    description="Azure login",
)

In [None]:
# Create the AzureML workspace in the configured resource group and region.
run_command(
    command=[
        "az", "ml", "workspace", "create",
        "--name", workspace,
        "--resource-group", resource_group,
        "--location", location,
    ],
    description="AzureML workspace creation",
)

In [None]:
# Full resource ID of the container registry to attach to the workspace.
acr_resource_id = (
    f"/subscriptions/{subscription_id}/resourceGroups/{resource_group}"
    f"/providers/Microsoft.ContainerRegistry/registries/{acr_registry_name}"
)

run_command(
    command=[
        "az", "ml", "workspace", "update",
        "--name", workspace,
        "--resource-group", resource_group,
        "--container-registry", acr_resource_id,
        # NOTE(review): presumably the short form of
        # --update-dependent-resources; confirm against the CLI reference.
        "-u",
    ],
    description="Link Azure Container Registry to Azure ML Workspace",
)

### Store NGC API KEY in Azure

#### (Option 1) Add NGC API Key to workspace key vault

In [None]:
# Ask the workspace for the identifier of its attached key vault.
workspace_keyvault_uri = run_command(
    ["az", "ml", "workspace", "show", "--name", workspace, "--resource-group", resource_group, "--query", "key_vault", "-o", "tsv"],
    "Fetching workspace keyvault URI",
    True
)

# run_command returns None when the CLI call fails; stop here with a clear
# message instead of an AttributeError on `.split` below.
if not workspace_keyvault_uri:
    raise RuntimeError(
        "Could not fetch the workspace key vault; check the workspace name, "
        "resource group and Azure login."
    )

# The vault name is the last path segment of the identifier returned above.
keyvault_name = workspace_keyvault_uri.split("/")[-1]

print("Workspace keyvault URI: ", workspace_keyvault_uri)
print("Keyvault name: ", keyvault_name)

# Assign role to allow access to the Key Vault
run_command([
    "az", "role", "assignment", "create",
    "--role", "Key Vault Secrets Officer",
    "--assignee", email_address,
    "--scope", workspace_keyvault_uri
], "Role assignment to access key vault")

# Set a secret in the Key Vault
# NOTE(review): run_command echoes the full command line, so the key passed
# via --value appears in the cell output.
run_command([
    "az", "keyvault", "secret", "set",
    "--vault-name", keyvault_name,
    "--name", "NGC-KEY",
    "--value", ngc_api_key
], "Add NGC secret to key vault")

# Show the secret in the Key Vault (for verification, if needed)
run_command([
    "az", "keyvault", "secret", "show",
    "--vault-name", keyvault_name,
    "--name", "NGC-KEY"
], "Verify NGC secret in key vault")

#### (Option 2) Add NGC API KEY as an AzureML workspace connection

In [None]:
# Assign role permission to read secrets from workspace connections
run_command([
    "az", "role", "assignment", "create",
    "--assignee", email_address,
    "--role", "Azure Machine Learning Workspace Connection Secrets Reader",
    "--scope", f"/subscriptions/{subscription_id}/resourcegroups/{resource_group}/providers/Microsoft.MachineLearningServices/workspaces/{workspace}"
], "AzureML secrets reader role assignment")

# Get an access token for the REST calls below. return_output=True is
# required: without it run_command returns None and the Authorization header
# would read "Bearer None".
token = run_command(
    ["az", "account", "get-access-token", "--query", "accessToken", "-o", "tsv"],
    "Getting access token for workspace",
    return_output=True
)
if not token:
    raise RuntimeError("Could not obtain an Azure access token; run the Azure login cell first.")

# Define URLs
url = f"https://management.azure.com/subscriptions/{subscription_id}/resourceGroups/{resource_group}/providers/Microsoft.MachineLearningServices/workspaces/{workspace}/connections/ngc?api-version=2023-08-01-preview"
verify_url = f"https://management.azure.com/subscriptions/{subscription_id}/resourceGroups/{resource_group}/providers/Microsoft.MachineLearningServices/workspaces/{workspace}/connections/ngc/listsecrets?api-version=2023-08-01-preview"

# Add a workspace connection to store NGC API key
print("Adding NGC API key to workspace")
headers = {
    "Authorization": f"Bearer {token}",
    "Content-Type": "application/json"
}
data = {
    "properties": {
        "authType": "CustomKeys",
        "category": "CustomKeys",
        "credentials": {
            "keys": {
                "NGC_API_KEY": ngc_api_key
            }
        },
        "expiryTime": None,
        "target": "_",
        "isSharedToAll": False,
        "sharedUserList": []
    }
}

response = requests.put(url, headers=headers, json=data)
if response.status_code == 200:
    print("NGC API key added to workspace successfully.")
else:
    print(f"Failed to add NGC API key to workspace: {response.text}")

# Verify if the key got added by reading it back and comparing it locally
print("Verifying if the NGC API key was added")
verify_response = requests.post(verify_url, headers=headers, json={})
if verify_response.status_code == 200:
    verify_result = verify_response.json()
    ngc_api_key_value = verify_result.get("properties", {}).get("credentials", {}).get("keys", {}).get("NGC_API_KEY")

    if ngc_api_key_value == ngc_api_key:
        print("The NGC_API_KEY value matches the provided key.")
    else:
        print("The NGC_API_KEY value does not match the provided key.")
else:
    print(f"Failed to verify NGC API key: {verify_response.text}")

### Pull NIM container

In [None]:
# Log in to nvcr.io, then pull the NIM image. "$oauthtoken" is passed verbatim
# as the user name: run_command uses an argument list, so no shell expands it.
for docker_args, step_label in (
    (["docker", "login", "nvcr.io", "-u", "$oauthtoken", "-p", ngc_api_key], "Docker login"),
    (["docker", "pull", ngc_container], "Pulling NIM container"),
):
    run_command(docker_args, step_label)

### (Optional) Cache NIM model for airgapped deployment

#### List all compatible models

In [None]:
# model_cache_path = "<path-to-cache-model-repo>"
model_cache_path = "/mnt/models"

# Assemble the `docker run` invocation piece by piece.
docker_run_command = ["docker", "run", "--rm", "--name=nim_list_profiles"]
# Environment handed to the container.
docker_run_command += ["-e", "LOG_LEVEL=info", "-e", f"NGC_API_KEY={ngc_api_key}"]
docker_run_command += ["--gpus", "all"]
# Host cache directory mounted at /model-repo inside the container.
docker_run_command += ["-v", f"{model_cache_path}:/model-repo"]
docker_run_command += ["-u", "root"]
docker_run_command += [f"{ngc_container}"]
# Command executed inside the container.
docker_run_command += ["bash", "-i", "-c", "list-model-profiles"]

# Start the container; its output is not captured by run_command here, so the
# profile list is printed directly.
run_command(docker_run_command, "Run Docker container to list model profiles")

# Read the compatible model profile IDs from the printed output
# (see the example in the next section).

#### Select model profile and download the required NIM model

Select a profile from the compatible profiles from the previous output.

Example:

```
===========================================
== NVIDIA Inference Microservice LLM NIM ==
===========================================
...
...
...
SYSTEM INFO
- Free GPUs:
  -  [20b5:10de] (0) NVIDIA A100 80GB PCIe [current utilization: 0%]
MODEL PROFILES
- Compatible with system and runnable:
  - cc2e0f9cb33ad6f9d31f64c0c1188342b00f427569a62a46397dfa33a2db7695 (vllm-bf16-tp1)
  - With LoRA support:
- Incompatible with system:
  - 94efad505d9248f0453e13c8a24420d5da4a909dd060c22904bc9fad923823b9 (tensorrt_llm-h100-fp8-tp1-throughput)
...
```
selected_profile = "cc2e0f9cb33ad6f9d31f64c0c1188342b00f427569a62a46397dfa33a2db7695"

In [None]:
# Profile ID used to build the model store; replace it with one listed as
# compatible in the output of the previous step.
selected_profile = (
    "cc2e0f9cb33ad6f9d31f64c0c1188342b00f427569a62a46397dfa33a2db7695"
)
print(f"Selected compatible profile ID: {selected_profile}")

In [None]:
# Host directory that receives the downloaded model (mounted at /model-repo).
model_cache_path = "/home/azureuser/phi3-mini-4k-nim"

# Command run inside the container: build the model store for the chosen
# profile under the mounted directory.
create_store_script = f"create-model-store --profile {selected_profile} --model-store /model-repo"

docker_create_model_command = (
    ["docker", "run", "--rm", "--name=nim_model_store"]
    + ["-e", "LOG_LEVEL=info"]
    + ["-e", f"NGC_API_KEY={ngc_api_key}"]
    + ["--gpus", "all"]
    + ["-v", f"{model_cache_path}:/model-repo"]
    + ["-u", "root"]
    + [f"{ngc_container}"]
    + ["bash", "-c", create_store_script]
)

print("Running model store creation command in Docker...")
run_command(docker_create_model_command, "Create model store in Docker")

In [None]:
# List the cache directory to confirm the model files were written.
run_command(
    command=["ls", model_cache_path],
    description="Model cache verification",
)

#### Push downloaded model to your AzureML workspace

In [None]:
# Name and version under which the cached model is registered in AzureML.
cached_model_name = "phi3-mini-4k-nim"
model_version = "2"

# Local directory holding the model store that gets uploaded.
model_cache_path = "/home/azureuser/phi3-mini-4k-nim"

In [None]:
# Upload the local model store and register it in the AzureML workspace.
# A description is passed so the status line is labelled like the other steps.
run_command([
    "az", "ml", "model", "create",
    "--name", cached_model_name,
    "--version", model_version,
    "--path", model_cache_path,
    "--resource-group", resource_group,
    "--workspace-name", workspace
], "Registering cached NIM model in AzureML workspace")

### Push container to Azure Container Registry

In [None]:
# Same image reference as on NGC, but with the "nvcr.io" part swapped for the
# ACR login server.
acr_login_server = f"{acr_registry_name}.azurecr.io"
nim_acr_name = ngc_container.replace("nvcr.io", acr_login_server)
print("NIM image name as saved in ACR: ", nim_acr_name)

In [None]:
# Create Dockerfile content for NIM container
# NOTE(review): the CMD echoes $NGC_API_KEY when the container starts, which
# prints the key to the container's stdout; consider dropping that echo.
dockerfile_content = f"""FROM {ngc_container}
EXPOSE 8000
USER root
CMD bash -c "echo 'Displaying the NGC API Key:' && echo $NGC_API_KEY && \
             echo 'Displaying the NIM Model Name:' && echo $NIM_MODEL_NAME && \
             echo 'Listing the contents of /model-repo:' && ls /model-repo && \
             /opt/nim/start-server.sh"
"""

# Write Dockerfile to disk
with open("Dockerfile", "w") as dockerfile:
    dockerfile.write(dockerfile_content)
print("NIM Dockerfile has been created.")

try:
    # Login to Azure Container Registry
    print("Logging into Azure Container Registry")
    run_command(["az", "acr", "login", "-n", acr_registry_name], "Azure Container Registry login")

    # Build and tag the Docker image
    print("Building the new Docker image and tagging it")
    run_command(["docker", "build", "-t", nim_acr_name, "-f", "Dockerfile", "."], "Building azure NIM image")
finally:
    # Clean up Dockerfile after build. run_command only handles
    # CalledProcessError, so any other exception (e.g. a missing executable)
    # would otherwise leave the temporary file behind.
    os.remove("Dockerfile")

# Push the image to ACR
print("Pushing the image to ACR")
run_command(["docker", "push", nim_acr_name], "Pushing NIM image to Azure container registry")

### Create AzureML managed endpoint

In [None]:
# Create the endpoint YAML configuration
endpoint_yaml_content = f"""$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json
name: {endpoint_name}
auth_mode: key
properties:
    enforce_access_to_default_secret_stores: enabled  # default: disabled
"""

# Write endpoint YAML configuration to file
with open("actual_endpoint_aml.yml", "w") as endpoint_yaml:
    endpoint_yaml.write(endpoint_yaml_content)
print("Endpoint YAML configuration created.")

try:
    # Create the endpoint using the Azure CLI and capture its JSON description
    print(f"Creating Online Endpoint {endpoint_name}")
    output = run_command([
        "az", "ml", "online-endpoint", "create", "-f", "actual_endpoint_aml.yml",
        "--resource-group", resource_group, "--workspace-name", workspace
    ], "Creating AzureML online endpoint", return_output=True)

    # Extract the endpoint identity's principal ID from the CLI output
    endpointidentityid = ""
    if output:
        endpoint_data = json.loads(output)
        endpointidentityid = endpoint_data.get("identity", {}).get("principal_id")
        if endpointidentityid:
            print(f"Principal ID: {endpointidentityid}")
        else:
            print("Principal ID not found.")
    else:
        print("No output received.")
finally:
    # Clean up the generated YAML file even when an unexpected exception
    # (anything run_command does not handle, or a JSON parse error) is raised.
    os.remove("actual_endpoint_aml.yml")
    print("Cleaned up temporary endpoint YAML file.")

In [None]:
# Let the endpoint's identity read secrets from the workspace key vault.
# NOTE: this assignment replaces any value left in `endpointidentityid` by the
# endpoint-creation cell; fill in the principal ID before running.
endpointidentityid = "<provide-your-endpoint-id>"

keyvault_scope = (
    f"/subscriptions/{subscription_id}/resourcegroups/{resource_group}"
    f"/providers/Microsoft.KeyVault/vaults/{keyvault_name}"
)

run_command(
    command=[
        "az", "role", "assignment", "create",
        "--assignee", endpointidentityid,
        "--role", "Key Vault Secrets User",
        "--scope", keyvault_scope,
    ],
    description="Providing permissions for endpoint to access NGC key from key vault",
)

### Create NIM deployment

#### Method 1: Normal deployment yaml file

Run this cell to generate a deployment YAML file for deploying NIMs in a non-airgapped environment, utilizing the internet to fetch the NIM profile and models. The deployment instance requires the NGC API KEY to retrieve these models, which is provided by injecting it from our workspace’s Azure Key Vault.

In [None]:
# Key-vault reference substituted into the deployment YAML as the value of
# NGC_API_KEY. This is a plain (non-f) string, so the doubled braces are kept
# literally. Replace the placeholder with the secret's URI.
keyvault_uri = "${{keyvault:<provide-your-NGC-key-URI>}}"
# Ex: keyvault_uri = "${{keyvault:https://keyvaultname.vault.azure.net/secrets/NGC-KEY/acbfcec9c19e4a2ab6b64197382a5ecb}}"

In [None]:
# Method 1 deployment spec: runs the NIM image pushed to ACR, routes the
# liveness/readiness probes and scoring traffic to port 8000, and sets
# NGC_API_KEY from the key-vault reference in `keyvault_uri`.
# The Method 2 cell assigns the same variable, so whichever cell ran last is
# the spec that gets deployed.
deployment_yaml_content = f"""$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
name: {deployment_name}
endpoint_name: {endpoint_name}
environment: 
  name: nim-env
  image: {nim_acr_name}
  inference_config:
      liveness_route:
          path: /v1/health/ready
          port: 8000
      readiness_route:
          path: /v1/health/ready
          port: 8000
      scoring_route:
          path: /
          port: 8000
instance_type: {instance_type}
instance_count: 1
environment_variables:
    NGC_API_KEY: {keyvault_uri}
"""

#### Method 2: Airgapped deployment yaml file

Run this cell to generate a deployment YAML file for deploying NIMs in an airgapped environment. This deployment requires mounting a cached NIM model and setting the NIM container’s NIM_MODEL_NAME environment variable, which instructs NIM to avoid downloading from the internet and instead utilize the model hosted on the instance. The NIM model is sourced by mounting the previously pushed model from our AzureML workspace’s model registry.

In [None]:
# Method 2 (airgapped) deployment spec: same image and routes as Method 1, plus
# the registered model `cached_model_name`:`model_version` mounted at
# /model-repo and NIM_MODEL_NAME pointing into that mount.
# NOTE(review): the NIM_MODEL_NAME path hard-codes "phi3-mini-4k-nim"; confirm
# it matches the directory name of the model you registered.
# The Method 1 cell assigns the same variable, so whichever cell ran last is
# the spec that gets deployed.
deployment_yaml_content = f"""$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
name: {deployment_name}
endpoint_name: {endpoint_name}
model: azureml:{cached_model_name}:{model_version}
model_mount_path: /model-repo
environment: 
  name: nim-env
  image: {nim_acr_name}
  inference_config:
      liveness_route:
          path: /v1/health/ready
          port: 8000
      readiness_route:
          path: /v1/health/ready
          port: 8000
      scoring_route:
          path: /
          port: 8000
instance_type: {instance_type}
instance_count: 1
environment_variables:
    NIM_MODEL_NAME: /model-repo/phi3-mini-4k-nim/
    NGC_API_KEY: {keyvault_uri}
"""

In [None]:
# Parse the deployment template and print it back in normalized form; a
# malformed template fails here rather than at deployment time.
import yaml

yaml_content = yaml.safe_load(deployment_yaml_content)
rendered_yaml = yaml.dump(
    yaml_content,
    sort_keys=False,
    default_flow_style=False,
    indent=2,
)
print(rendered_yaml)

In [None]:
# Write deployment YAML configuration to file
with open("actual_deployment_aml.yml", "w") as deployment_yaml:
    deployment_yaml.write(deployment_yaml_content)
print("Deployment YAML configuration created.")

try:
    # Display the written YAML file for confirmation
    with open("actual_deployment_aml.yml", "r") as file:
        print(file.read())

    # Create the online deployment using the Azure CLI
    print(f"Creating Online Deployment {deployment_name}")
    run_command([
        "az", "ml", "online-deployment", "create", "-f", "actual_deployment_aml.yml",
        "--resource-group", resource_group, "--workspace-name", workspace
    ], "Creating AzureML online deployment")
finally:
    # Clean up the generated YAML file. run_command only handles
    # CalledProcessError, so any other exception would otherwise leave the
    # temporary file behind.
    os.remove("actual_deployment_aml.yml")
    print("Cleaned up temporary deployment YAML file.")

### Test your connection

Verify your deployment using the code below. Modify the code to add your endpoint URL, Endpoint authorization token and AzureML deployment name obtained from "Consume" tab under your AzureML endpoint page as shown below.

![Endpoint details](./endpoint_details.png)

Look at your deployment logs for the available serving endpoints and an example curl request.

![Serving endpoints](./serving_endpoints.png)

![Example request](./example_request.png)

In [None]:
# Connection details for the deployed endpoint.
base_url = 'https://phi3-nim-endpoint-aml-1.westeurope.inference.ml.azure.com' # modify this URL
# Placeholder only: paste your endpoint key here at run time and never commit
# a real key to the notebook.
token = '<your-endpoint-key>'  # modify this token
model_name = "microsoft/phi-3-mini-4k-instruct"

In [None]:
import requests
import json
from urllib.parse import urljoin

# Chat-completions route on the deployed endpoint.
url = urljoin(base_url, "v1/chat/completions")

# Bearer auth plus the header naming the deployment to call.
headers = {
    'accept': 'application/json',
    'Authorization': f'Bearer {token}', # modify this token
    'Content-Type': 'application/json',
    'azureml-model-deployment': deployment_name
}

# Two-turn conversation: a system instruction followed by the user question.
system_turn = {
    "content": "You are a polite and respectful chatbot helping people plan a vacation.",
    "role": "system"
}
user_turn = {
    "content": "What should I do for a 4 day vacation in Spain?",
    "role": "user"
}

data = {
    "messages": [system_turn, user_turn],
    "model": model_name,
    "max_tokens": 16,
    "top_p": 1,
    "n": 1,
    "stream": False,
    "stop": "\n",
    "frequency_penalty": 0.0
}

response = requests.post(url, headers=headers, json=data)

# Pretty print the JSON response
response_body = response.json()
print(json.dumps(response_body, indent=4))

#### (Optional) Launch a Gradio interface

In [None]:
# ! pip install gradio

In [None]:
import gradio as gr
import requests
import json

# Function to handle the chat with LLM
def chat_with_llm(user_message, history):
    """Send the conversation to the chat-completions endpoint and return the reply.

    Args:
        user_message: Latest message typed by the user.
        history: Earlier turns as supplied by ``gr.ChatInterface``. Two layouts
            are accepted: ``(user, assistant)`` pairs, and dictionaries with
            ``"role"`` and ``"content"`` keys.

    Returns:
        The assistant's reply text; ``"No response"`` when the response has no
        message content; or an ``"Error: ..."`` string for a non-200 status.
    """
    # Format messages for the LLM with initial system prompt
    formatted_messages = [
        {"content": "You are a polite and respectful AI assistant.", "role": "system"}
    ]

    # Add previous messages from the history
    for turn in history or []:
        if isinstance(turn, dict):
            # "messages" layout: each entry already carries its own role.
            formatted_messages.append(
                {"content": turn.get("content"), "role": turn.get("role")}
            )
            continue
        # Pair layout: one user message followed by the assistant's answer.
        user_msg, assistant_msg = turn
        formatted_messages.append({"content": user_msg, "role": "user"})
        formatted_messages.append({"content": assistant_msg, "role": "assistant"})

    # Add the user's latest message
    formatted_messages.append({"content": user_message, "role": "user"})

    data = {
        "messages": formatted_messages,
        "model": model_name,
        "max_tokens": 2000,
        "top_p": 1,
        "n": 1,
        "stream": False,  # Set to True if streaming responses are supported by your API
        "stop": "\n",
        "frequency_penalty": 0.1
    }

    response = requests.post(url, headers=headers, json=data)

    if response.status_code != 200:
        return f"Error: {response.status_code} - {response.text}"

    # A missing or empty "choices" list falls back to an empty choice, so the
    # lookup below yields "No response" instead of raising IndexError.
    choices = response.json().get("choices") or [{}]
    return choices[0].get("message", {}).get("content", "No response")

# Settings for the Gradio chat UI; `fn` is called for every user message.
chat_ui_options = {
    "fn": chat_with_llm,
    "title": "Multi-Turn LLM Chatbot with gr.ChatInterface",
    "description": "A chatbot interface that interacts with your LLM endpoint for multi-turn conversations.",
    "examples": [["What should I do for a 4-day vacation in Spain?"]],
}

# Build the interface and start serving it.
chat_interface = gr.ChatInterface(**chat_ui_options)
chat_interface.launch()