## This is a Demo Notebook of Deploying Air Gapped NIM LLama 3.1-70b on AzureML using AzureML CLI

**Prerequisites:**
- AzureML account with minimum 48 vCPUs of NC Series A100 GPU provisioned.
- Host machine (CPU only) to download NIM container and model. Needs to have atleast 140 GB  amount of disk space to store the container and model.
- [Azure CLI](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli)
- [Azure ML extension](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-configure-cli?view=azureml-api-2&tabs=public)
- [NGC API Key](https://catalog.ngc.nvidia.com/)

In [8]:
import subprocess
import requests
import os
import json

In [None]:
### Fill the configuration variables

subscription_id="XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
resource_group="XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
workspace="XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
location="eastus" # e.g., "southcentralus", "westeurope"
keyvault_name="XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
email_address = "youremail@nvidia.com"  
ngc_api_key="XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
instance_type="Standard_NC48ads_A100_v4"
acr_registry_name="XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
ngc_container = "nvcr.io/nim/meta/llama-3.1-70b-instruct:1.3.3"
endpoint_name = "llama31-70b-nim-trt"
deployment_name = "llama31-70b-nim-trt-tp1"

In [10]:
def run_command(command, description="", return_output=False):
    try:
        if return_output:
            return subprocess.run(
                command, 
                check=True, 
                capture_output=True,  # capture_output should be set to True directly
                text=True
            ).stdout.strip()            
        else:
            subprocess.run(command, check=True)
            print(f"\033[92mSuccess: {description}\033[0m \n Command: {' '.join(command)}")
    except subprocess.CalledProcessError as e:
        print(f"\033[91mError: {description}\033[0m \n Command: {' '.join(command)}", e)

### Create new workspace

In [None]:
run_command(["az", "login", "--use-device-code"], "Azure login")

In [None]:
run_command([
        "az", "ml", "workspace", "create",
        "--name", workspace,
        "--resource-group", resource_group,
        "--location", location
], "AzureML workspace creation")

In [None]:
run_command([
    "az", "ml", "workspace", "update",
    "--name", workspace,
    "--resource-group", resource_group,
    "--container-registry", f"/subscriptions/{subscription_id}/resourceGroups/{resource_group}/providers/Microsoft.ContainerRegistry/registries/{acr_registry_name}"
    ,"-u",
], "Link Azure Container Registry to Azure ML Workspace")

### Store NGC API KEY in Azure

#### (Option 1) Add NGC API Key to workspace key vault

In [None]:
workspace_keyvault_uri = run_command(
    ["az", "ml", "workspace", "show", "--name", workspace, "--resource-group", resource_group, "--query", "key_vault", "-o", "tsv"], 
    "Fetching workspace keyvault URI", 
    True
)

keyvault_name = workspace_keyvault_uri.split("/")[-1]

print("Workspace keyvault URI: ", workspace_keyvault_uri)
print("Keyvault name: ", keyvault_name)

# Assign role to allow access to the Key Vault
run_command([
    "az", "role", "assignment", "create",
    "--role", "Key Vault Secrets Officer",
    "--assignee", email_address,
    "--scope", workspace_keyvault_uri
], "Role assignment to access key vault")

# Set a secret in the Key Vault
run_command([
    "az", "keyvault", "secret", "set",
    "--vault-name", keyvault_name,
    "--name", "NGC-KEY",
    "--value", ngc_api_key
], "Add NGC secret to key vault")

# Show the secret in the Key Vault (for verification, if needed)
run_command([
    "az", "keyvault", "secret", "show",
    "--vault-name", keyvault_name,
    "--name", "NGC-KEY"
], "Verify NGC secret in key vault")

#### (Option 2) Add NGC API KEY as an AzureML workspace connection

In [None]:
# Assign role permission to read secrets from workspace connections
run_command([
    "az", "role", "assignment", "create",
    "--assignee", email_address,
    "--role", "Azure Machine Learning Workspace Connection Secrets Reader",
    "--scope", f"/subscriptions/{subscription_id}/resourcegroups/{resource_group}/providers/Microsoft.MachineLearningServices/workspaces/{workspace}"
], "AzureML secrets reader role assignment")

# Get a personal access token for the workspace
token = run_command(["az", "account", "get-access-token", "--query", "accessToken", "-o", "tsv"], "Getting access token for workspace")

# Define URLs
url = f"https://management.azure.com/subscriptions/{subscription_id}/resourceGroups/{resource_group}/providers/Microsoft.MachineLearningServices/workspaces/{workspace}/connections/ngc?api-version=2023-08-01-preview"
verify_url = f"https://management.azure.com/subscriptions/{subscription_id}/resourceGroups/{resource_group}/providers/Microsoft.MachineLearningServices/workspaces/{workspace}/connections/ngc/listsecrets?api-version=2023-08-01-preview"

# Add a workspace connection to store NGC API key
print("Adding NGC API key to workspace")
headers = {
    "Authorization": f"Bearer {token}",
    "Content-Type": "application/json"
}
data = {
    "properties": {
        "authType": "CustomKeys",
        "category": "CustomKeys",
        "credentials": {
            "keys": {
                "NGC_API_KEY": ngc_api_key
            }
        },
        "expiryTime": None,
        "target": "_",
        "isSharedToAll": False,
        "sharedUserList": []
    }
}

response = requests.put(url, headers=headers, json=data)
if response.status_code == 200:
    print("NGC API key added to workspace successfully.")
else:
    print(f"Failed to add NGC API key to workspace: {response.text}")

# Verify if the key got added
print("Verifying if the NGC API key was added")
verify_response = requests.post(verify_url, headers=headers, json={})
if verify_response.status_code == 200:
    verify_result = verify_response.json()
    ngc_api_key_value = verify_result.get("properties", {}).get("credentials", {}).get("keys", {}).get("NGC_API_KEY")
    
    if ngc_api_key_value == ngc_api_key:
        print("The NGC_API_KEY value matches the provided key.")
    else:
        print("The NGC_API_KEY value does not match the provided key.")
else:
    print(f"Failed to verify NGC API key: {verify_response.text}")

### Pull NIM container

In [None]:
run_command(["docker", "login", "nvcr.io", "-u", "$oauthtoken", "-p", ngc_api_key], "Docker login")
run_command(["docker", "pull", ngc_container], "Pulling NIM container")

### (Cache NIM model for airgapped deployment

#### List all compatible models

In [None]:
# model_cache_path = "<path-to-cache-model-repo>"
model_cache_path="/home/azureuser/llama-3.1-70b-nim"

# Run the Docker container and list model profiles inside it
docker_run_command = [
    "docker", "run", "--rm", "--name=nim_list_profiles",
    "-e", "LOG_LEVEL=info",
    "-e", "NIM_MANIFEST_ALLOW_UNSAFE",
    "-e", f"NGC_API_KEY={ngc_api_key}",
    "--gpus", "all",
    "-v", f"{model_cache_path}:/model-repo",
    "-u", "root",
    f"{ngc_container}",
    "bash", "-i", "-c", "list-model-profiles"
]

# Execute the command to start Docker and list model profiles
run_command(docker_run_command, "Run Docker container to list model profiles")

# Parse compatible model profile IDs from output (assuming the format provided)

#### Select model profile and download the required NIM model

Select a profile from the compatible profiles from the previous output.

Example:

```
MODEL PROFILES
- Compatible with system and runnable:
  - 395082aa40085d35f004dd3056d7583aea330417ed509b4315099a66cfc72bdd (vllm-bf16-tp2)
  - With LoRA support:
- Compilable to TRT-LLM using just-in-time compilation of HF models to TRTLLM engines:
  - b7b6fa584441d9536091ce5cf80ccc31765780b8a46540da4e7bada5c5108ed9 (tensorrt_llm-trtllm_buildable-bf16-tp2)
  - With LoRA support:
- Incompatible with system:
  - a29dc20fff4ad67746205295ccb4af9e010f8f31207235c75e27786fb834e574 (tensorrt_llm-h100-fp8-tp8-pp1-latency)
  - 852c2c07610526d83d0fa80c656c5ee32c54f91df2626b9e7f7dfb575e25dabf (tensorrt_llm-h100_nvl-fp8-tp4-pp1-latency)
  
...
```
selected_profile = "b7b6fa584441d9536091ce5cf80ccc31765780b8a46540da4e7bada5c5108ed9"

In [6]:
# Select a compatible profile ID for the model store creation
# 8b09858c4fc22c360e5e6dda70d67751671c9f5a1182059ebaa91b4babce884c (tensorrt_llm-a100-fp16-tp1-throughput)
selected_profile = "b7b6fa584441d9536091ce5cf80ccc31765780b8a46540da4e7bada5c5108ed9"
print(f"Selected compatible profile ID: {selected_profile}")


Selected compatible profile ID: b7b6fa584441d9536091ce5cf80ccc31765780b8a46540da4e7bada5c5108ed9


```
	mkdir /home/azureuser/llama-3.1-70b-nim
	export LOCAL_NIM_CACHE=/home/azureuser/llama-3.1-70b-nim
	export NIM_MODEL_PROFILE=trt-engine
	
	docker run -it --rm --gpus all --shm-size=16GB -e NIM_CUSTOM_MODEL_NAME=70b-trt-engine -e NGC_API_KEY=xxxxxx  -e NIM_LOW_MEMORY_MODE=1 -v "$LOCAL_NIM_CACHE:/opt/nim/.cache" -u $(id -u) -p 8000:8000  nvcr.io/nim/meta/llama-3.1-70b-instruct:1.3.3

	docker run -it --rm  --gpus all     --shm-size=16GB  -e NIM_MODEL_PROFILE  -e NGC_API_KEY=xxxxxx -v "$LOCAL_NIM_CACHE:/opt/nim/.cache"     -u $(id -u)     -p 8000:8000 nvcr.io/nim/meta/llama-3.1-70b-instruct:1.3.3 bash -i -c list-model-profiles
```

In [None]:
model_cache_path="/home/azureuser/llama-3.1-70b-nim"
nim_custom_model_name="70b-trt-engine"
# Run the command to create model store for the chosen profile
docker_create_model_command = [
    "docker", "run", "--rm", "--name=nim_model_cache_custom_name",
    "-e", "LOG_LEVEL=info",
    "-e", "NIM_LOW_MEMORY_MODE=1",
    "-e", f"NIM_CUSTOM_MODEL_NAME={nim_custom_model_name}",    
    "-e", f"NGC_API_KEY={ngc_api_key}",
    "--gpus", "all",
    "-v", f"{model_cache_path}:/model-repo",
    "-u", "root",
    f"{ngc_container}"
]

print("Cache lama-3.1-70b-instruct:1.3.3 to model_cache_path command in Docker...")
run_command(docker_create_model_command, "Cache Model command in Docker")

In [15]:
model_cache_path="/home/azureuser/llama-3.1-70b-nim/local_cache/70b-trt-engine/"

In [16]:
# Verify if the model was stored successfully
run_command(["ls",model_cache_path], "Model cache verification")

config.json
generation_config.json
model.safetensors.index.json
special_tokens_map.json
tokenizer.json
tokenizer_config.json
tool_use_config.json
trtllm_engine
[92mSuccess: Model cache verification[0m 
 Command: ls /home/azureuser/llama-3.1-70b-nim/local_cache/70b-trt-engine/


In [None]:
# Run the Docker container and list model profiles inside it to make sure the new custom model profile is created
docker_run_command = [
    "docker", "run", "--rm", "--name=nim_list_profiles",
    "-e", "LOG_LEVEL=info",
    "-e", "NIM_MODEL_PROFILE=trt-engine",
    "-e", f"NGC_API_KEY={ngc_api_key}",
    "--gpus", "all",
    "-v", f"{model_cache_path}:/model-repo",
    "-u", "root",
    f"{ngc_container}",
    "bash", "-i", "-c", "list-model-profiles"
]

# Execute the command to start Docker and list model profiles
run_command(docker_run_command, "Run Docker container to list model profiles")

### When you run the command above, you should get the newly cached profile listed in list-model-profile command (70b-trt-engine): 
```
942435195ceb8d93a1035c94844200e79e21c1b3744c5b4324ca40b71b164e75 (70b-trt-engine)
```

```
===========================================
	== NVIDIA Inference Microservice LLM NIM ==
	===========================================

	NVIDIA Inference Microservice LLM NIM Version 1.3.3
	Model: meta/llama-3.1-70b-instruct

	......
	Profile 942435195ceb8d93a1035c94844200e79e21c1b3744c5b4324ca40b71b164e75 is not fully defined with checksums
	SYSTEM INFO
	- Free GPUs:
	  -  [20b5:10de] (0) NVIDIA A100 80GB PCIe [current utilization: 0%]
	  -  [20b5:10de] (1) NVIDIA A100 80GB PCIe [current utilization: 0%]
	Profile 0462612f0f2de63b2d423bc3863030835c0fbdbc13b531868670cc416e030029 is not fully defined with checksums
	Profile 09af04392c0375ae5493ca5e6ea0134890ac28f75efd244a57f414f86e97b133 is not fully defined with checksums
	Profile 0b0193d56ec0bba1840ea429993c776f9168a1ca4699e81f4db48319dd7e5c3a is not fully defined with checksums
	Profile 162948dba7374caeb8f7886f7c62a105fd198cfc2dd533aa1cdb34eaea872af0 is not fully defined with checksums
	.......
	MODEL PROFILES
	- Compatible with system and runnable:
	  - 942435195ceb8d93a1035c94844200e79e21c1b3744c5b4324ca40b71b164e75 (70b-trt-engine)
	  - 395082aa40085d35f004dd3056d7583aea330417ed509b4315099a66cfc72bdd (vllm-bf16-tp2)
	  - With LoRA support:
	- Compilable to TRT-LLM using just-in-time compilation of HF models to TRTLLM engines:
	  - b7b6fa584441d9536091ce5cf80ccc31765780b8a46540da4e7bada5c5108ed9 (tensorrt_llm-trtllm_buildable-bf16-tp2)
	  - With LoRA support:
```

#### Push downloaded model to your AzureML workspace. Make sure to point to the local_cache/70b-trt-engine/  path this is where .engine files will be stored
```
	 az ml model create --name llama-3-1-70b-nim-tensorrt-bf16-tp2 --version 1 --path /home/azureuser/llama-3.1-70b-nim/local_cache/70b-trt-engine/ --resource-group rg-azeltov-aml --workspace-name aml_east
```

```
azureuser@nc48a100:~/dev$ ll /home/azureuser/llama-3.1-70b-nim/local_cache/70b-trt-engine/trtllm_engine/
total 140145228
drwxr-xr-x 2 azureuser ubuntu        4096 Jan 15 00:55 ./
drwxr-xr-x 3 azureuser ubuntu        4096 Jan 15 01:03 ../
-rw-r--r-- 1 azureuser ubuntu        3640 Jan 15 00:59 config.json
-rw-r--r-- 1 azureuser ubuntu 71754332004 Jan 15 01:02 rank0.engine
-rw-r--r-- 1 azureuser ubuntu 71754331260 Jan 15 00:59 rank1.engine
```

In [18]:

cached_model_name = "llama-3-1-70b-nim-tensorrt-bf16-tp2"
model_version = "1"

In [19]:
run_command([
    "az", "ml", "model", "create",
    "--name", cached_model_name,
    "--version", model_version,
    "--path", model_cache_path,
    "--resource-group", resource_group,
    "--workspace-name", workspace
])

Your file exceeds 100 MB. If you experience low speeds, latency, or broken connections, we recommend using the AzCopyv10 tool for this file transfer.

Example: azcopy copy '/home/azureuser/llama-3.1-70b-nim/local_cache/70b-trt-engine' 'https://amleast2001436580.blob.core.windows.net/azureml-blobstore-9a114f90-b35e-41f7-9dfa-61f6b44a10e2/LocalUpload/8a09a9c976478ca28bc8e53cf1918364/70b-trt-engine' 

See https://docs.microsoft.com/azure/storage/common/storage-use-azcopy-v10 for more information.
Uploading 70b-trt-engine (143517.87 MBs): 100%|██████████| 143517866268/143517866268 [04:25<00:00, 541135290.51it/s]




{
  "creation_context": {
    "created_at": "2025-01-15T01:17:34.369716+00:00",
    "created_by": "Alexander Zeltov",
    "created_by_type": "User",
    "last_modified_at": "2025-01-15T01:17:34.369716+00:00",
    "last_modified_by": "Alexander Zeltov",
    "last_modified_by_type": "User"
  },
  "id": "azureml:/subscriptions/7d5234cf-1437-4626-94d3-47372cdfc8f5/resourceGroups/rg-azeltov-aml/providers/Microsoft.MachineLearningServices/workspaces/aml_east/models/llama-3-1-70b-nim-tensorrt-bf16-tp2/versions/1",
  "name": "llama-3-1-70b-nim-tensorrt-bf16-tp2",
  "path": "azureml://subscriptions/7d5234cf-1437-4626-94d3-47372cdfc8f5/resourceGroups/rg-azeltov-aml/workspaces/aml_east/datastores/workspaceblobstore/paths/LocalUpload/8a09a9c976478ca28bc8e53cf1918364/70b-trt-engine",
  "properties": {},
  "resourceGroup": "rg-azeltov-aml",
  "stage": "Development",
  "tags": {},
  "type": "custom_model",
  "version": "1"
}
[92mSuccess: [0m 
 Command: az ml model create --name llama-3-1-70b-nim-te

### Push container to Azure Container Registry

In [24]:
nim_acr_name = ngc_container.replace("nvcr.io", f"{acr_registry_name}.azurecr.io")
print("NIM image name as saved in ACR: ", nim_acr_name)

NIM image name as saved in ACR:  amlacrazeltov.azurecr.io/nim/meta/llama-3.1-70b-instruct:1.3.3


In [19]:
# Create Dockerfile content for NIM container
dockerfile_content = f"""FROM {ngc_container}
EXPOSE 8000
USER root
CMD bash -c "echo 'Displaying the NGC API Key:' && echo $NGC_API_KEY && \
             echo 'Displaying the NIM Model Name:' && echo $NIM_MODEL_NAME && \
             echo 'Listing the contents of /model-repo:' && ls -al /model-repo && \
             /opt/nim/start-server.sh"
"""

# Write Dockerfile to disk
with open("Dockerfile", "w") as dockerfile:
    dockerfile.write(dockerfile_content)
print("NIM Dockerfile has been created.")

# Login to Azure Container Registry
print("Logging into Azure Container Registry")
run_command(["az", "acr", "login", "-n", acr_registry_name])

# Build and tag the Docker image
print("Building the new Docker image and tagging it")
run_command(["docker", "build", "-t", nim_acr_name, "-f", "Dockerfile", "."], "Building azure NIM image")

# Clean up Dockerfile after build
#os.remove("Dockerfile")

# Push the image to ACR
print("Pushing the image to ACR")
run_command(["docker", "push", nim_acr_name], "Pushing NIM image to Azure container registry")

NIM Dockerfile has been created.
Logging into Azure Container Registry
Login Succeeded
[92mSuccess: [0m 
 Command: az acr login -n amlacrazeltov
Building the new Docker image and tagging it


#0 building with "default" instance using docker driver

#1 [internal] load build definition from Dockerfile
#1 transferring dockerfile: 383B done
#1 WARN: JSONArgsRecommended: JSON arguments recommended for CMD to prevent unintended behavior related to OS signals (line 4)
#1 DONE 0.0s

#2 [internal] load metadata for nvcr.io/nim/meta/llama-3.1-70b-instruct:1.3.3
#2 DONE 0.0s

#3 [internal] load .dockerignore
#3 transferring context: 2B done
#3 DONE 0.0s

#4 [1/1] FROM nvcr.io/nim/meta/llama-3.1-70b-instruct:1.3.3
#4 CACHED

#5 exporting to image
#5 exporting layers done
#5 writing image sha256:6223c028dcbd66e4a98a7b6bb6cc325fd45aaffa94d5227b937e150f6d6069e2 done
#5 naming to amlacrazeltov.azurecr.io/nim/meta/llama-3.1-70b-instruct:1.3.3
#5 naming to amlacrazeltov.azurecr.io/nim/meta/llama-3.1-70b-instruct:1.3.3 0.0s done
#5 DONE 0.1s

[0m - JSONArgsRecommended: JSON arguments recommended for CMD to prevent unintended behavior related to OS signals (line 4)


[92mSuccess: Building azure NIM image[0m 
 Command: docker build -t amlacrazeltov.azurecr.io/nim/meta/llama-3.1-70b-instruct:1.3.3 -f Dockerfile .
Pushing the image to ACR
The push refers to repository [amlacrazeltov.azurecr.io/nim/meta/llama-3.1-70b-instruct]
f687685aa27f: Preparing
1b0c89cec5aa: Preparing
62088d11c7da: Preparing
a208910323e1: Preparing
e8636a8b2156: Preparing
f049acf4da08: Preparing
11c46f95fe3a: Preparing
5f70bf18a086: Preparing
5f70bf18a086: Preparing
5f70bf18a086: Preparing
5f70bf18a086: Preparing
5f70bf18a086: Preparing
5f70bf18a086: Preparing
5f70bf18a086: Preparing
5f70bf18a086: Preparing
394dc07662b7: Preparing
9f4780cade95: Preparing
b5e5b52f77e3: Preparing
947d21d5d746: Preparing
20e30de06f95: Preparing
556972573b13: Preparing
4fe921117ad8: Preparing
970aef7eea04: Preparing
fa8af7c7f5aa: Preparing
4bc0813a6f2a: Preparing
0b846862bcf8: Preparing
800b524a69be: Preparing
f0d9a97f6f4d: Preparing
4d34e736e0f6: Preparing
c0cea0db07e0: Preparing
665b4101b8c9: Pre

### Create AzureML managed endpoint

In [None]:
# Create the endpoint YAML configuration
endpoint_yaml_content = f"""$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json
name: {endpoint_name}
auth_mode: key
properties:
    enforce_access_to_default_secret_stores: enabled  # default: disabled
"""

# Write endpoint YAML configuration to file
with open("actual_endpoint_aml.yml", "w") as endpoint_yaml:
    endpoint_yaml.write(endpoint_yaml_content)
print("Endpoint YAML configuration created.")

# Deploy the endpoint using the Azure CLI
print(f"Creating Online Endpoint {endpoint_name}")
output = run_command([
    "az", "ml", "online-endpoint", "create", "-f", "actual_endpoint_aml.yml",
    "--resource-group", resource_group, "--workspace-name", workspace
], return_output=True)

endpointidentityid = ""
if output:
    endpoint_data = json.loads(output)
    endpointidentityid = endpoint_data.get("identity", {}).get("principal_id")
    if endpointidentityid:
        print(f"Principal ID: {endpointidentityid}")
    else:
        print("Principal ID not found.")
else:
    print("No output received.")

# Clean up the generated YAML file
#os.remove("actual_endpoint_aml.yml")
print("Cleaned up temporary endpoint YAML file.")

Endpoint YAML configuration created.
Creating Online Endpoint llama31-70b-nim-trt
Principal ID: 5ebd626e-907e-463c-ab5a-cdd89702dcfa
Cleaned up temporary endpoint YAML file.


In [22]:
# Provide access to endpoint to read secrets from key vault
endpointidentityid = "5ebd626e-907e-463c-ab5a-cdd89702dcfa"
run_command([
    "az", "role", "assignment", "create",
    "--assignee", endpointidentityid,
    "--role", "Key Vault Secrets User",
    "--scope", f"/subscriptions/{subscription_id}/resourcegroups/{resource_group}/providers/Microsoft.KeyVault/vaults/{keyvault_name}"
], "Providing permissions for endpoint to access NGC key from key vault")

{
  "condition": null,
  "conditionVersion": null,
  "createdBy": null,
  "createdOn": "2025-01-14T22:18:49.096848+00:00",
  "delegatedManagedIdentityResourceId": null,
  "description": null,
  "id": "/subscriptions/7d5234cf-1437-4626-94d3-47372cdfc8f5/resourcegroups/rg-azeltov-aml/providers/Microsoft.KeyVault/vaults/amleast5828881351/providers/Microsoft.Authorization/roleAssignments/ca97cb79-324a-43a9-8acb-3ff2179e9161",
  "name": "ca97cb79-324a-43a9-8acb-3ff2179e9161",
  "principalId": "5ebd626e-907e-463c-ab5a-cdd89702dcfa",
  "principalType": "ServicePrincipal",
  "resourceGroup": "rg-azeltov-aml",
  "roleDefinitionId": "/subscriptions/7d5234cf-1437-4626-94d3-47372cdfc8f5/providers/Microsoft.Authorization/roleDefinitions/4633458b-17de-408a-b874-0445c86b69e6",
  "scope": "/subscriptions/7d5234cf-1437-4626-94d3-47372cdfc8f5/resourcegroups/rg-azeltov-aml/providers/Microsoft.KeyVault/vaults/amleast5828881351",
  "type": "Microsoft.Authorization/roleAssignments",
  "updatedBy": "e3ab9be0

### Create NIM deployment

#### Airgapped deployment yaml file

Run this cell to generate a deployment YAML file for deploying NIMs in an airgapped environment. This deployment requires mounting a cached NIM model and setting the NIM container’s NIM_MODEL_NAME environment variable, which instructs NIM to avoid downloading from the internet and instead utilize the model hosted on the instance. The NIM model is sourced by mounting the previously pushed model from our AzureML workspace’s model registry.

In [25]:
deployment_yaml_content = f"""$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
name: {deployment_name}
endpoint_name: {endpoint_name}
model: azureml:{cached_model_name}:{model_version}
model_mount_path: /model-repo
environment: 
  name: llama31-70b-nim-trt-tp2-env
  image: {nim_acr_name}
  inference_config:
      liveness_route:
          path: /v1/health/ready
          port: 8000
      readiness_route:
          path: /v1/health/ready
          port: 8000
      scoring_route:
          path: /
          port: 8000
instance_type: {instance_type}
instance_count: 1
request_settings:
    max_concurrent_requests_per_instance: 256
    request_timeout_ms: 180000
environment_variables:
    NIM_MODEL_NAME: /model-repo/70b-trt-engine/
    NGC_API_KEY: {ngc_api_key}
    shm-size: 16GB
    gpus: 0,1
    OMPI_ALLOW_RUN_AS_ROOT: 1
    OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1
    # NIM_LOW_MEMORY_MODE: 1
    NIM_MANIFEST_ALLOW_UNSAFE: 1
    NIM_MODEL_PROFILE: trt-engine    
liveness_probe:
    timeout: 300
    period: 300
    failure_threshold: 100
readiness_probe:
    timeout: 300
    period: 300
    failure_threshold: 100    
"""

In [None]:
# display content of deployment YAML file
import yaml
yaml_content = yaml.safe_load(deployment_yaml_content)
print(yaml.dump(yaml_content, sort_keys=False, default_flow_style=False, indent=2))

In [None]:
# Write deployment YAML configuration to file
with open("actual_deployment_aml.yml", "w") as deployment_yaml:
    deployment_yaml.write(deployment_yaml_content)
print("Deployment YAML configuration created.")

# Display the modified YAML file for confirmation
with open("actual_deployment_aml.yml", "r") as file:
    print(file.read())

# Create the online deployment using the Azure CLI
print(f"Creating Online Deployment {deployment_name}")
run_command([
    "az", "ml", "online-deployment", "create", "-f", "actual_deployment_aml.yml",
    "--resource-group", resource_group, "--workspace-name", workspace
])

# Clean up the generated YAML file
#os.remove("actual_deployment_aml.yml")
print("Cleaned up temporary deployment YAML file.")

### Test your connection

Verify your deployment using the code below. Modify the code to add your endpoint URL, Endpoint authorization token and AzureML deployment name obtained from "Consume" tab under your AzureML endpoint page as shown below.

![Endpoint details](./endpoint_details.png)

Look at your deployment logs for available serving endpoints and example CURL request

![Serving endpoints](./serving_endpoints.png)

![Example request](./example_request.png)

In [None]:
base_url = 'https://llama31-70b-nim-trt.eastus.inference.ml.azure.com/' # modify this URL
token = 'xxxxxxxxxxxxxxxx'
model_name = "meta/llama-3.1-70b-instruct"

In [None]:
import requests
import json
from urllib.parse import urljoin

url = urljoin(base_url, "v1/chat/completions")
headers = {
    'accept': 'application/json',
    'Authorization': f'Bearer {token}', # modify this token
    'Content-Type': 'application/json',
    'azureml-model-deployment': deployment_name
}
data = {
    "messages": [
        {
            "content": "You are a polite and respectful chatbot helping people plan a vacation.",
            "role": "system"
        },
        {
            "content": "What should I do for a 4 day vacation in Spain?",
            "role": "user"
        }
    ],
    "model": model_name,
    "max_tokens": 500,
    "top_p": 1,
    "n": 1,
    "stream": False,
    #"stop": "\n",
    "frequency_penalty": 0.0
}

response = requests.post(url, headers=headers, json=data)
# Pretty print the JSON response
print(json.dumps(response.json(), indent=4))

#### (Optional) Launch a Gradio interface

In [None]:
# ! pip install gradio

In [None]:
import gradio as gr
import requests
import json

# Function to handle the chat with LLM
def chat_with_llm(user_message, history):
    # Format messages for the LLM with initial system prompt
    formatted_messages = [
        {"content": "You are a polite and respectful AI assistant.", "role": "system"}
    ]
    
    # Add previous messages from the history
    for user_msg, assistant_msg in history:
        formatted_messages.append({"content": user_msg, "role": "user"})
        formatted_messages.append({"content": assistant_msg, "role": "assistant"})

    # Add the user's latest message
    formatted_messages.append({"content": user_message, "role": "user"})

    data = {
        "messages": formatted_messages,
        "model": model_name,
        "max_tokens": 2000,
        "top_p": 1,
        "n": 1,
        "stream": False,  # Set to True if streaming responses are supported by your API
        "stop": "\n",
        "frequency_penalty": 0.1
    }

    response = requests.post(url, headers=headers, json=data)

    if response.status_code == 200:
        response_data = response.json()
        assistant_reply = response_data.get("choices", [{}])[0].get("message", {}).get("content", "No response")
    else:
        assistant_reply = f"Error: {response.status_code} - {response.text}"

    # Return the assistant's reply as a string
    return assistant_reply

# Create a ChatInterface using Gradio
chat_interface = gr.ChatInterface(
    fn=chat_with_llm,
    title="Multi-Turn LLM Chatbot with gr.ChatInterface",
    description="A chatbot interface that interacts with your LLM endpoint for multi-turn conversations.",
    examples=[["What should I do for a 4-day vacation in Spain?"]]
)

# Launch the interface
chat_interface.launch()