In [None]:
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Deploy NVIDIA NeMo Retriever NIM to GCP Vertex AI

### Objective

NVIDIA NeMo Text Retriever NIM APIs provide easy access to state-of-the-art models that are foundational building blocks for enterprise semantic search applications, delivering accurate answers quickly at scale. Developers can use these APIs to create robust copilots, chatbots, and AI assistants from start to finish. Text Retriever NIM models are built on the NVIDIA software platform, incorporating CUDA, TensorRT, and Triton to offer out-of-the-box GPU acceleration.

- NeMo Retriever Text Embedding NIM - Boosts text question-answering retrieval performance, providing high quality embeddings for many downstream NLP tasks.
- NeMo Retriever Text Reranking NIM - Enhances the retrieval performance further with a fine-tuned reranker, finding the most relevant passages to provide as context when querying an LLM.

In this notebook, you learn how to run the NVIDIA NeMo Retriever Text Embedding NIM (NREM NIM) container on Google Cloud Vertex AI, run inference to get model responses, and deploy the model to a Vertex AI endpoint.

This tutorial uses the following NVIDIA NREM NIM and Vertex AI services:

- NVIDIA NREM NIM Container
- Vertex AI Model Resource
- Vertex AI Model Registry
- Vertex AI Endpoint Resource
- Vertex AI Prediction
- Google Cloud Artifact Registry
- Google Cloud Storage

The steps performed include:

- Pull NVIDIA NREM NIM container from NGC.
- Push NVIDIA NREM NIM container to Artifact Registry.
- Run NREM NIM container and run inference within the notebook interface.
- Upload NREM NIM container as a Vertex AI Model Resource.
- Create a Vertex AI Endpoint Resource.
- Deploy the Model Resource to the Endpoint Resource.
- Generate prediction responses from Endpoint Resource.


### Install and Import packages

In [None]:
! pip3 install -r requirements.txt

Restart kernel after installs so that the environment can access the new packages

In [None]:
import IPython

# Restart the kernel so the freshly installed packages become importable.
IPython.Application.instance().kernel.do_shutdown(True)

In [None]:
import google.cloud.aiplatform_v1beta1 as aip_beta
from google.cloud.aiplatform import Endpoint, Model
from google.api_core.exceptions import InvalidArgument
import requests

### Authenticate to Google Cloud
Please run the following commands in a separate **Terminal** window.

In [None]:
# Interactive Google Cloud logins; as noted above, run these in a separate Terminal window.
# The first authenticates the gcloud CLI, the second sets up application-default credentials.
! gcloud auth login
! gcloud auth application-default login

### Set Up

The example provided is NVIDIA Retrieval QA E5 Embedding v5 NIM (`nv-embedqa-e5-v5` container from NGC), on Vertex AI Workbench Notebook `g2-standard-12` instance with NVIDIA L4 GPU.
The solution is also applicable to other NeMo Retriever models, including `llama-3.2-nv-embedqa-1b-v2`, `nv-yolox-page-elements-v1`, and `llama-3.2-nv-rerankqa-1b-v2`.


IAM role requirements:
* Vertex AI User `(roles/aiplatform.user)` 
* Artifact Registry Repository Administrator `(roles/artifactregistry.repoAdmin)` 
* Storage Admin `(roles/storage.admin)`

Get account name

In [None]:
import requests
gcloud_token = !gcloud auth print-access-token
gcloud_tokeninfo = requests.get('https://www.googleapis.com/oauth2/v3/tokeninfo?access_token=' + gcloud_token[0]).json()
account_email = gcloud_tokeninfo['email']
account_name = gcloud_tokeninfo['email'].split('@')[0]
print(account_email)
print(account_name)

Please set the value of the following variables

In [None]:
# User settings: replace each None with your own value before running the cells below.
region = None # GCP region to deploy into, e.g. us-central1
project_id = None # your GCP project ID
public_repository = None # any name for the (optional) public Artifact Registry repository

In [None]:
# Per-user resources are named after the account name derived from the gcloud login.
private_repository = account_name
bucket_url = f"gs://{account_name}"

# NIM image in NGC
nim_model = "nrem:embedqa-e5-v5-1.1.1"
ngc_nim_image = "nvcr.io/nim/nvidia/nv-embedqa-e5-v5:1.1.1"
container_name = "nv-embedqa-e5-v5"

# NIM image locations in Artifact Registry (AR); both repositories share the same regional host/project prefix.
ar_project_path = f"{region}-docker.pkg.dev/{project_id}"
public_nim_image = f"{ar_project_path}/{public_repository}/{nim_model}"
private_nim_image = f"{ar_project_path}/{private_repository}/{nim_model}"

# Vertex AI model resource name
va_model_name = "nrem-embedqa-e5-v5"

# Serving hardware used for the endpoint deployment
machine_type = "g2-standard-12"
accelerator_type = "NVIDIA_L4"
accelerator_count = 1

# Endpoint display name and the model identifier sent in request payloads
endpoint_name = f"{va_model_name}_endpoint"
payload_model = "nvidia/nv-embedqa-e5-v5"

Grant required IAM roles to the service account

*Note: If "Use default Compute Engine service account" is selected when creating the workbench instance, Vertex AI service account is the same as Compute Engine, as example below.*

In [None]:
project_number = !gcloud projects describe {project_id} --format="value(projectNumber)"
service_account = "serviceAccount:" + project_number[0] + "-compute@developer.gserviceaccount.com"
role1 = "roles/aiplatform.user"
role2 = "roles/artifactregistry.repoAdmin"
role3 = "roles/storage.admin"

! gcloud projects add-iam-policy-binding {project_id} --member={service_account} --role={role1}
! gcloud projects add-iam-policy-binding {project_id} --member={service_account} --role={role2}
! gcloud projects add-iam-policy-binding {project_id} --member={service_account} --role={role3}

If Cloud Storage Bucket or Artifact Registry repository doesn't already exist: Run the following cell to create your bucket or repository.

- Private Artifact Registry is to securely store NIM containers with minimum user access, for testing, validation, and maintaining a version-controlled, auditable copy.

- Public Artifact Registry is optional,  enabling more selected users to access the NIM containers, while adhering to the Principle of Least Privilege.

In [None]:
# Create the Cloud Storage bucket in the chosen region/project.
! gsutil mb -l {region} -p  {project_id} {bucket_url}
# Create the Docker-format Artifact Registry repositories (public and private) in the chosen region.
! gcloud artifacts repositories create {public_repository} --repository-format=docker --location={region}
! gcloud artifacts repositories create {private_repository} --repository-format=docker --location={region}

In [None]:
# (Optional) Grant read access on the public AR repository to an additional member.
# Set `user` to the member that should receive roles/artifactregistry.reader, e.g.
#   user:test-user@gmail.com, group:admins@example.com,
#   serviceAccount:test123@example.domain.com, or domain:example.domain.com
user = 'serviceAccount:test123@example.iam.gserviceaccount.com'
! gcloud artifacts repositories add-iam-policy-binding {public_repository} --location={region} --member={user} --role=roles/artifactregistry.reader

Initialize Vertex AI SDK for Python

In [None]:
from google.cloud import aiplatform

# Bind the SDK to the project, region and staging bucket configured above.
aiplatform.init(
    project=project_id,
    location=region,
    staging_bucket=bucket_url,
)

GCP Configuration

In [None]:
def run_bash_cmd(cmd):
    """
    Run a command, print its combined stdout/stderr, and fail loudly on error.

    Args:
        cmd: Either a string, which is executed through the shell (so multi-line
            scripts work), or a list/tuple of arguments, which is executed
            directly without a shell.
    Raises:
        TypeError: If `cmd` is neither a string nor a list/tuple.
        RuntimeError: If the command exits with a non-zero return code. For a
            multi-line shell script this is the status of the last command.
    """
    import subprocess

    if isinstance(cmd, str):
        use_shell = True
    elif isinstance(cmd, (list, tuple)):
        use_shell = False
    else:
        raise TypeError(f"cmd must be a str or a list of arguments, got {type(cmd).__name__}")

    process = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        shell=use_shell,
        text=True,
    )
    # stderr is merged into stdout, so the second value from communicate() is always
    # None; success has to be judged from the return code instead.
    output, _ = process.communicate()
    print(output)

    if process.returncode != 0:
        # The command text is deliberately left out of the message: it may embed credentials.
        raise RuntimeError(f"Command failed with exit code {process.returncode}")

In [None]:
# Point gcloud at the chosen region/project and configure Docker authentication for the
# regional Artifact Registry host. The script runs in a child process started by
# run_bash_cmd, so the `export` does not carry over to later cells.
bash_cmd = f"""
    export region={region}
    gcloud config set ai_platform/region {region}
    gcloud config set project {project_id}
    gcloud auth configure-docker {region}-docker.pkg.dev
    """
run_bash_cmd(bash_cmd)

### NIM Container

* **NGC_API_KEY**

To access NIM container from NGC catalog, `NGC_API_KEY` is required.

The credential will be used in Vertex AI as an environment variable during model uploading, and will show on Model Registry Version Details UI. **Attention: the credential will be visible for all Vertex AI users in the same project.**

Please upload a JSON file to the Cloud Storage bucket to use the `read_key()` function below; the file format is `{"NGC_API_KEY": "<your key>"}`.

Reference: [NGC User Guide](https://docs.nvidia.com/ngc/gpu-cloud/ngc-user-guide/index.html)

* **Artifact Registry**

We will pull NIM container from NGC, then push to a private AR repository. 

(Optional) Then we pull the NIM container from the private AR and push it to a public AR repository, which allows more users in the project to access the NIM. 


#### Set NGC API KEY

In [None]:
import os
from getpass import getpass

# Do not hardcode the key in the notebook (it would be saved and shared with the file).
# Take it from the NGC_API_KEY environment variable when set, otherwise prompt for it
# without echoing the input.
NGC_API_KEY = os.environ.get("NGC_API_KEY") or getpass("NGC API key: ")

#### Pull NIM from NGC and Push to GCP AR

In [None]:
# Login to NGC
from pathlib import Path
local_nim_cache=str(Path(".cache/nim").absolute())

# Install the NVIDIA Docker runtime, log in to nvcr.io with the NGC API key and create
# the local model cache directory.
# NOTE: this script also appends the key to ~/.bashrc in plain text.
bash_cmd = f"""
    sudo apt-get install -y nvidia-docker2
    export NGC_API_KEY={NGC_API_KEY}
    echo "export NGC_API_KEY={NGC_API_KEY}" >> ~/.bashrc
    echo "$NGC_API_KEY" | docker login nvcr.io --username '$oauthtoken' --password-stdin

    export LOCAL_NIM_CACHE={local_nim_cache}
    mkdir -p "$LOCAL_NIM_CACHE"
    echo "Local NIM cache created"
    """

run_bash_cmd(bash_cmd)

# Pull NIM container from NGC and run container
docker_cmd = [
    "docker", "run", "-d", "--rm",
    f"--name={container_name}",
    "--gpus", "all",
    # `-e` needs NAME=value; passing only the key value would make Docker treat the
    # secret as a variable name and the container would start without NGC_API_KEY.
    "-e", f"NGC_API_KEY={NGC_API_KEY}",
    "-v", f"{local_nim_cache}:/opt/nim/.cache",
    "-p", "8000:8000",
    ngc_nim_image
]

# The label is printed first; `docker run -d` then prints the ID of the started container.
print(f"NIM container {ngc_nim_image} pulled from NGC successfully, running container is")
run_bash_cmd(docker_cmd)

# Push NIM container to private AR repository
bash_cmd = f"""
    docker tag {ngc_nim_image} {private_nim_image}

    docker push {private_nim_image}
    """

run_bash_cmd(bash_cmd)
print(f"NIM container {ngc_nim_image} pushed to Artifact Registry {private_nim_image} successfully")

In [None]:
# Optional
# Push NIM container to public AR repository:
# re-tag the private AR image under the public repository path, then push that tag.
bash_cmd = f"""
    docker tag {private_nim_image} {public_nim_image}

    docker push {public_nim_image}
    """

run_bash_cmd(bash_cmd)
print(f"NIM container {private_nim_image} pushed to Artifact Registry {public_nim_image} successfully")

### Run NIM Container Within Interface

Run NREM NIM container locally in **Terminal** or **Another notebook**, keep the container active, then inference with Python OpenAI API or CLI command to get model responses in the Notebook interface.

Terminal

In [None]:
# Run this cell here: the printed image path is the value to use for `private_nim_image`
# in the Terminal / Notebook commands below.
print(private_nim_image)

In [None]:
# If Terminal, use $variable_name, add export commands.
# This cell is shell, not Python: paste it into a Terminal session.
export container_name=nv-embedqa-e5-v5
export NGC_API_KEY=None                     # please set here
export local_nim_cache=~/.cache/nim
export private_nim_image=None               # please set here

# Create the cache directory before mounting it. The container runs as the current user
# (-u), so the directory presumably must already exist and be writable by that user.
mkdir -p "$local_nim_cache"

docker run -it --rm --name="$container_name" \
  --runtime=nvidia \
  --gpus all \
  --shm-size=16GB \
  -e NGC_API_KEY="$NGC_API_KEY" \
  -v "$local_nim_cache:/opt/nim/.cache" \
  -u "$(id -u)" \
  -p 8000:8000 \
  "$private_nim_image"

Notebook

In [None]:
# If Notebook, use {variable_name}, add variables definition
# NOTE: intended for a separate notebook. Running it in this notebook overwrites
# NGC_API_KEY, local_nim_cache and private_nim_image defined earlier.
container_name = "nv-embedqa-e5-v5"
NGC_API_KEY = None                      # please set here
local_nim_cache = "~/.cache/nim"        
private_nim_image = None                # please set here

# Runs the container in the foreground; this cell keeps running while the container is active.
! docker run -it --rm --name={container_name} \
  --runtime=nvidia \
  --gpus all \
  --shm-size=16GB \
  -e NGC_API_KEY={NGC_API_KEY} \
  -v {local_nim_cache}":/opt/nim/.cache" \
  -u $(id -u) \
  -p 8000:8000 \
  {private_nim_image}

Run below commands in the current notebook interface.

In [None]:
# List the local Docker images whose listing line contains "nrem".
! docker images | grep nrem

In [None]:
# Show the running containers, then collect their IDs (first column, header row skipped).
! docker ps 
! echo ""
CONTAINER_ID = !docker ps | awk 'NR>1 {print $1}'
# NOTE(review): takes the first listed container; if other containers are running this
# may not be the NIM container, and it raises IndexError when none is running.
CONTAINER_ID = CONTAINER_ID[0]
! echo 'Running Container is' $CONTAINER_ID
! echo 'IP Address'
# ! docker inspect $CONTAINER_ID
# Ask the container for its own IP address (useful when localhost is not reachable).
IPAddress= !docker exec $CONTAINER_ID sh -c "hostname --ip-address" 
IPAddress=IPAddress[0]
! echo $IPAddress
! echo ""
! echo "NIM Model and Profile"
# Filter the container metadata for lines mentioning "model" (case-insensitive).
! docker inspect $CONTAINER_ID |grep -i model

#### Make Inference within Interface
After running NREM NIM container and keeping it active, we could make inference to model and get response. NREM NIM on Vertex AI Workbench supports both OpenAI Python API and CLI.

With the `embeddings` endpoint, `input` could be set as input text to be transformed into vectors by the model. `input_type` could be adjusted for Embedding models such as NV-Embed-QA, E5, as they operate in `passage` or `query` mode, where `passage` is used when generating embeddings during indexing, `query` is used when generating embeddings during querying.

Since the OpenAI API does not accept `input_type` as a parameter, it is possible to add the `-query` or `-passage` suffix to the model parameter like `nv-embedqa-e5-v5-query` and not use the `input_type` field at all for OpenAI API compliance.

*Note: You may need to change the IP address in the URL when making a request (e.g. http://172.18.0.2:8000/v1/embeddings).*

Reference: [NVIDIA Embedding API](https://docs.api.nvidia.com/nim/reference/nvidia-nv-embedqa-e5-v5), [Text Embedding NIM](https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/reference.html) 

CLI

In [None]:
# Readiness probe: confirm the local NIM service (port 8000) is ready to handle inference requests
! curl -X 'GET' 'http://localhost:8000/v1/health/ready'

In [None]:
# Generate Embeddings
! curl -X "POST" \
  "http://localhost:8000/v1/embeddings" \ 
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{ "input": ["Hello world"],
        "model": "nvidia/nv-embedqa-e5-v5",
        "input_type": "query"
        }'

Python

In [None]:
# Generate Embeddings
from openai import OpenAI

# The OpenAI client talks to the local NIM; the API key is a placeholder.
client = OpenAI(api_key="not-used", base_url="http://localhost:8000/v1")

inputs = ["Hello World", "Once upon a time"]
# Suffix appended to the model name in place of an `input_type` field: "-query" or "-passage".
input_type = "-query"

response = client.embeddings.create(
    input=inputs,
    model=f"{payload_model}{input_type}",
)

# Print the dimensionality of the first returned embedding.
embeddings = response.data[0].embedding
print(len(embeddings))

Stop NIM container

In [None]:
# Stop the container whose ID was captured in CONTAINER_ID above.
! docker stop $CONTAINER_ID

### Endpoint Deployment

Then we can proceed to endpoint deployment, which makes the model endpoint available on Vertex AI Online Prediction.

Steps are as follows:

* Upload NIM container as a Vertex AI Model resource.
* Create a Vertex AI Endpoint resource.
* Deploy the Model resource to the Endpoint resource.
* Generate raw prediction requests and get responses.

#### Upload NIM as a Vertex AI Model Resource

First, we upload the NREM NIM image as a Vertex AI model resource using the `upload()` method, with the following parameters:

*  `display_name`: The human readable name for the Model resource.
*  `artifact_uri`: The Cloud Storage location of the model artifacts. If the container image includes the model artifacts that you need to serve predictions, there is no need to load files from Cloud Storage.
*  `parent_model`: The parent resource name of an existing model.
*  `model_version_aliases`: The aliases of the model version to create.
*  `model_version_description`: The description of the model version.
*  `is_default_version`: Whether the model version is the default version.

*  `serving_container_image_uri`: The serving container image to use when the model is deployed to a Vertex AI endpoint.

*  `serving_container_command`: The serving binary (HTTP Server) to start up.

*  `serving_container_shared_memory_size_mb`: The shared memory is an Inter-process communication (IPC) mechanism that allows multiple processes to access and manipulate a common block of memory. The default shared memory size is 64MB. Model servers such as vLLM or NVIDIA Triton, use shared memory to cache internal data during model inferences. Also, because shared memory can be used for cross GPU communication, using more shared memory can improve performance for accelerators without NVLink capabilities (for example, L4), if the model container requires communication across GPUs. NIM generally requires a larger shared memory size than default. 

*  `serving_container_environment_variables`: The environment variables specify container required settings such as authentication key. 

*  `serving_container_args`: The arguments to pass to the serving binary. For example:

      -- `model_name`: The human readable name to assign to the model.

      -- `model_base_name`: Where to store the model artifacts in the container. The Vertex service sets the variable `AIP_STORAGE_URI` to where the service installed the model artifacts in the container.

      -- `rest_api_port`: The port to which to send REST based prediction requests. NREM NIM uses `8000`.

      -- `port`: The port to which to send gRPC based prediction requests. NREM NIM uses `8000`.

*  `serving_container_health_route`: The URL for the service to periodically ping for a response to verify that the serving binary is running. For NREM NIM, this will be `/v1/health/ready`.

*  `serving_container_predict_route`: The URL for the service to route REST-based prediction requests to. For NREM NIM, this will be `/v1/embeddings`.

*  `serving_container_ports`: A list of ports for the HTTP server to listen for requests. 

*  `sync`: Whether to wait for the process to complete, or return immediately (async).

Uploading a model into a Vertex Model resource may take a few moments. After completion, model will show up in Vertex AI Model Registry.

In [None]:
from google.api_core.future.polling import DEFAULT_POLLING
from google.cloud.aiplatform import Endpoint, Model

# Raise the default polling timeout (presumably in seconds) so long-running operations
# are not abandoned by the client while they are still in progress.
DEFAULT_POLLING._timeout = 360000

print("NREM NIM Container:",private_nim_image)

# Reuse an already-registered model with the same display name instead of uploading a duplicate.
models = Model.list(filter=f'displayName="{va_model_name}"')

if models:
    model = models[0]
else:
    # To register a new version of an existing model instead, additionally pass
    # parent_model=<model id>, version_aliases=[...], version_description="...".
    model = aiplatform.Model.upload(
        display_name=va_model_name,
        is_default_version=True,
        serving_container_image_uri=private_nim_image,
        serving_container_predict_route="/v1/embeddings",
        serving_container_health_route="/v1/health/ready",
        # Only real environment variables belong here. "shm-size" is a `docker run` flag,
        # not a valid variable name; shared memory is configured by
        # serving_container_shared_memory_size_mb below.
        serving_container_environment_variables={"NGC_API_KEY": NGC_API_KEY, "PORT": "8000"},
        serving_container_shared_memory_size_mb=16000,
        serving_container_ports=[8000],
        sync=True,
    )
model.wait()

print("Model:")
print(f"\tDisplay name: {model.display_name}")
print(f"\tResource name: {model.resource_name}")

In [None]:
# List the Vertex AI models in the region whose display name matches the regex .*nrem.*
! gcloud ai models list --region=$region --filter="DISPLAY_NAME ~ .*nrem.*"

In [None]:
# Capture the first column of the model listing (header row skipped via awk NR>1).
MODEL_ID = !gcloud ai models list --region=$region --filter="DISPLAY_NAME ~ .*nrem.*" | awk 'NR>1 {print $1}'
# NOTE(review): index 1 (not 0) is selected -- presumably captured line 0 is not a model ID
# (e.g. a gcloud status line); confirm against the actual captured output.
MODEL_ID = MODEL_ID[1]
MODEL_ID

In [None]:
def list_model_version(model_id: str, project: str, location: str):
    """
    Return every registered version of a Vertex AI model.

    Args:
        model_id: ID of the model to inspect. The full parent resource name,
            'projects/<your-project-id>/locations/<your-region>/models/<your-model-id>',
            is also accepted.
        project: The project ID.
        location: The region name.
    Returns:
        versions: List of model versions reported by the Model Registry.
    """
    # Point the SDK at the requested project/region before querying the registry.
    aiplatform.init(project=project, location=location)
    return aiplatform.models.ModelRegistry(model=model_id).list_versions()

list_model_version(MODEL_ID, project_id, region)

#### Create a Vertex AI Endpoint Resource

In [None]:
# Look the endpoint up by display name and create it only when no match exists.
endpoints = Endpoint.list(filter=f'displayName="{endpoint_name}"')
if not endpoints:
    print(f"Endpoint {endpoint_name} doesn't exist, creating...")
    endpoint = aiplatform.Endpoint.create(display_name=endpoint_name)
else:
    endpoint = endpoints[0]

print("Endpoint:")
for label, value in (("Display name", endpoint.display_name), ("Resource name", endpoint.resource_name)):
    print(f"\t{label}: {value}")

In [None]:
# List the Vertex AI endpoints in the region whose display name matches the regex .*nrem.*
! gcloud ai endpoints list --region=$region --filter="DISPLAY_NAME ~ .*nrem.*"

In [None]:
# Capture the first column of the endpoint listing (header row skipped via awk NR>1).
ENDPOINT_ID = !gcloud ai endpoints list --region=$region --filter="DISPLAY_NAME ~ .*nrem.*" | awk 'NR>1 {print $1}'
# NOTE(review): index 1 (not 0) is selected -- presumably captured line 0 is not an endpoint ID
# (e.g. a gcloud status line); confirm against the actual captured output.
ENDPOINT_ID = ENDPOINT_ID[1]
ENDPOINT_ID

#### Deploy Model Resource to Endpoint Resource

Next, deploy the Vertex AI model resource to the endpoint resource with the following parameters:

* `deployed_model_display_name`: The human-readable name for the deployed model.

* `traffic_split`: Percent of traffic at the endpoint that goes to this model, which is specified as a dictionary of one or more key/value pairs.
    * If only one model, then specify `{ "0": 100 }`, where "0" refers to this model being uploaded and 100 means 100% of the traffic.
    * If there are existing models on the endpoint, for which the traffic is split, then use model_id to specify `{ "0": percent, model_id: percent, ... }`, where model_id is the ID of an existing deployed model on the endpoint. The percentages must add up to 100.

* `machine_type`: The machine type for each VM node instance.

* `min_replica_count`: The minimum number of nodes to provision for auto-scaling.

* `max_replica_count`: The maximum number of nodes to provision for auto-scaling.

* `accelerator_type`: The type, if any, of GPU accelerators per provisioned node.

* `accelerator_count`: The number, if any, of GPU accelerators per provisioned node.

After successful deployment, the endpoint and associated deployed model will be available on Vertex AI Online Prediction.

In [None]:
# Deployment settings: a single replica on the configured machine/accelerator,
# receiving all of the endpoint's traffic, with access logging enabled.
deploy_options = {
    "endpoint": endpoint,
    "deployed_model_display_name": va_model_name,
    "traffic_percentage": 100,
    "machine_type": machine_type,
    "min_replica_count": 1,
    "max_replica_count": 1,
    "accelerator_type": accelerator_type,
    "accelerator_count": accelerator_count,
    "enable_access_logging": True,
    "sync": True,
}
model.deploy(**deploy_options)

print(f"Model {model.display_name} deployed at endpoint {endpoint.display_name}.")

In [None]:
print(endpoint.gca_resource)
# Keep the resource name in its own variable. Overwriting `endpoint_name` (the display
# name) would make the displayName lookup in the endpoint-creation cell miss on a re-run
# and create a duplicate endpoint.
endpoint_resource_name = endpoint.resource_name
print(endpoint_resource_name)
print(endpoint.list_models())

#### Endpoint Inference

Use the Endpoint object's `rawPredict` function to get responses from the deployed model, which accepts request that matches directly the input format of the model.

If you use the alternative `Predict` function, it takes the following parameters:

* `instances`: A list of messages or prompts instances. Each instance should be an array of strings. 
* `parameters`: A list of LLM model parameters, e.g. temperature, max_tokens, top_p, stream.

NREM NIM on Vertex AI Workbench supports both OpenAI Python API and CLI. 

In [None]:
# `json` is imported here because this cell runs before the SDK cell that imports it;
# without it a fresh "Restart & Run All" fails with NameError.
import json

# Create payload request
inputs = ["Hello world"]

payload = {
  "model": payload_model,
  "input": inputs,
  "input_type": "query"
}

# Persist the payload so the CLI example can send it with --data "@request_nrem.json".
with open("request_nrem.json", "w") as outfile:
    json.dump(payload, outfile)

Python SDK

In [None]:
import json
from pprint import pprint
from google.api import httpbody_pb2
from google.cloud import aiplatform_v1

# Wrap the JSON payload in an HttpBody and address it to the deployed endpoint.
http_body = httpbody_pb2.HttpBody(
    content_type="application/json",
    data=json.dumps(payload).encode("utf-8"),
)
req = aiplatform_v1.RawPredictRequest(
    endpoint=endpoint.resource_name,
    http_body=http_body,
)

print("Request")
print(req)
pprint(json.loads(req.http_body.data))
print()

# The prediction client must target the regional API host.
API_ENDPOINT = f"{region}-aiplatform.googleapis.com"
client_options = {"api_endpoint": API_ENDPOINT}
pred_client = aiplatform.gapic.PredictionServiceClient(client_options=client_options)

response = pred_client.raw_predict(req)
response_json = json.loads(response.data)

print("--------------------------------------------------------------------------------------")
print("Response")
print("Length of Embeddings:", len(response_json['data'][0]['embedding']))
pprint(response_json)

CLI

In [None]:
# Send the saved payload (request_nrem.json) to the endpoint's rawPredict REST method.
# $region, $project_id and $ENDPOINT_ID are expanded from the notebook's variables;
# $(gcloud auth print-access-token) is evaluated by the shell to obtain a bearer token.
! curl \
    --request POST \
    --header "Authorization: Bearer $(gcloud auth print-access-token)" \
    --header "Content-Type: application/json" \
    https://$region-prediction-aiplatform.googleapis.com/v1/projects/$project_id/locations/$region/endpoints/$ENDPOINT_ID:rawPredict \
    --data "@request_nrem.json"

### Clean Up

In [None]:
delete_endpoint = True
delete_model = True
delete_image = True
delete_art_repo = False
delete_bucket = False

# Undeploy model and delete endpoint
try:
    if delete_endpoint:
        endpoint.undeploy_all(sync=True)
        endpoint.delete()
        print(f"Deleted endpoint {endpoint.display_name}")
except Exception as e:
    print(e)

# Delete the model resource
try:
    if delete_model:
        model.delete()
        print(f"Deleted model {model.display_name}")
except Exception as e:
    print(e)

# Delete the container image from Artifact Registry
if delete_image:
    !gcloud artifacts docker images delete --quiet --delete-tags {private_nim_image}

# Delete the Artifact Repository
if delete_art_repo:
    ! gcloud artifacts repositories delete {private_repository} --location={region} -q

# Delete the Cloud Storage bucket
if delete_bucket:
    ! gsutil rm -rf {bucket_url}