## Deploy multi-LLM NIM (Hugging Face) to GCP Vertex AI


This notebook demonstrates deploying the multi-LLM compatible NVIDIA NIM on Vertex AI using a Hugging Face model.

We will:
- Pull multi-LLM NIM from NGC and push to Artifact Registry
- Optionally run locally for validation
- Upload the container as a Vertex AI Model
- Create a Vertex AI Endpoint and deploy the Model
- Send inference requests to the Endpoint

Reference: [Get Started with NVIDIA NIM for LLMs](https://docs.nvidia.com/nim/large-language-models/latest/getting-started.html)


### Install and import packages


In [None]:
! pip3 install -r requirements.txt

Restart the kernel after installation so that the newly installed packages are available.


In [None]:
import IPython

# Ask the running kernel to shut down and start again, so packages installed
# by the previous cell can be imported.
app = IPython.Application.instance()
app.kernel.do_shutdown(restart=True)


In [1]:
import google.cloud.aiplatform_v1beta1 as aip_beta
from google.cloud.aiplatform import Endpoint, Model
from google.api_core.exceptions import InvalidArgument
import requests


### Authenticate to Google Cloud


Please run the following commands **in a separate Terminal window.**


In [None]:
# Shell commands: run them in a terminal, not in the notebook kernel.
# The first logs in the gcloud CLI; the second sets up application-default
# credentials.
gcloud auth login
gcloud auth application-default login


### Set up variables


In [None]:
import os

region = ""  # e.g. "us-central1"
project_id = ""  # your GCP project id
public_repository = ""  # optional: name for Artifact Registry that will get created below

# Secrets default to the values of the NGC_API_KEY / HF_TOKEN environment
# variables (export them before starting Jupyter) so that they do not have to
# be typed into, and saved with, the notebook. If you paste a value here for a
# quick local run, clear it again before sharing or committing the notebook.
NGC_API_KEY = os.environ.get("NGC_API_KEY", "")  # your NGC API key
HF_TOKEN = os.environ.get("HF_TOKEN", "")  # your Hugging Face access token

In [None]:
import re
import subprocess
from subprocess import getoutput  # kept so the name stays available to other cells

# Resolve the active gcloud account. Only stdout is captured: getoutput() also
# folds stderr into its result, so any notice or error printed by gcloud would
# end up inside the derived names below.
_account_lookup = subprocess.run(
    "gcloud config get-value account",
    shell=True,
    stdout=subprocess.PIPE,
    stderr=subprocess.DEVNULL,
    text=True,
)
account_email = _account_lookup.stdout.strip()
account_name = account_email.split('@')[0] if '@' in account_email else "user"

# Artifact Registry repository IDs may contain only lowercase letters, digits
# and hyphens and must start with a letter, whereas the local part of an e-mail
# address may contain dots, underscores, capitals, ... Names that are already
# valid pass through unchanged.
private_repository = re.sub(r"[^a-z0-9-]+", "-", account_name.lower()).strip("-") or "user"
if not private_repository[0].isalpha():
    private_repository = f"repo-{private_repository}"
bucket_url = f"gs://{account_name}"

# NIM (multi-LLM) + Hugging Face model
nim_image_ngc = "nvcr.io/nim/nvidia/llm-nim:1.13.0"
image_name = "llm-nim"
image_tag = "1.13.0"

NIM_MODEL_NAME = "hf://meta-llama/Meta-Llama-3-8B"
NIM_SERVED_MODEL_NAME = "meta/llama3-8b-instruct"

# Artifact Registry targets (repo/image:tag)
private_nim_image = f"{region}-docker.pkg.dev/{project_id}/{private_repository}/{image_name}:{image_tag}"
public_nim_image = (
    f"{region}-docker.pkg.dev/{project_id}/{public_repository}/{image_name}:{image_tag}"
    if public_repository else None
)

# Vertex AI names
va_model_name = "nim-multi-llm"
endpoint_name = va_model_name + "-endpoint"

# Local cache for optional local run
local_nim_cache = "~/.cache/nim"

### GCP Configuration


In [3]:
def run_bash_cmd(cmd, check=False):
    """Run a command, print its combined stdout/stderr, and report failures.

    Args:
        cmd: Either a command line given as a ``str`` (executed through the
            shell, so it may span several lines) or an argument vector given
            as a ``list`` (executed directly, without a shell).
        check: When true, raise ``subprocess.CalledProcessError`` if the
            command exits with a non-zero status. Defaults to false, in which
            case a failing command only prints a warning and the caller
            continues.

    Raises:
        TypeError: If ``cmd`` is neither a ``str`` nor a ``list``.
        subprocess.CalledProcessError: If ``check`` is true and the command
            exits with a non-zero status.
    """
    import subprocess

    if not isinstance(cmd, (str, list)):
        raise TypeError(f"cmd must be a str or a list, got {type(cmd).__name__}")

    # stderr is merged into stdout so the output is printed in the order it
    # was produced. That means there is never a separate error stream to
    # inspect; the exit status is the only reliable failure signal.
    completed = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        shell=isinstance(cmd, str),
        text=True,
    )
    print(completed.stdout)

    if completed.returncode != 0:
        if check:
            raise subprocess.CalledProcessError(
                completed.returncode, cmd, output=completed.stdout
            )
        print(f"WARNING: command exited with status {completed.returncode}")


In [None]:
# Fail fast on unset configuration: with an empty region or project id the
# commands below would run against malformed values such as
# "-docker.pkg.dev".
if not region or not project_id:
    raise ValueError("Set `region` and `project_id` in the variables cell before running this cell.")

# Point gcloud at the chosen region/project and register the regional
# Artifact Registry host as a Docker credential helper. --quiet answers the
# confirmation prompt of `configure-docker`, which cannot be answered from a
# notebook cell.
bash_cmd = f"""
    export region={region}
    gcloud config set ai_platform/region {region}
    gcloud config set project {project_id}
    gcloud auth configure-docker {region}-docker.pkg.dev --quiet
    """
run_bash_cmd(bash_cmd)


Grant the required IAM roles to the project's Compute Engine default service account:

- Vertex AI User `(roles/aiplatform.user)`
- Artifact Registry Repository Administrator `(roles/artifactregistry.repoAdmin)`
- Storage Admin `(roles/storage.admin)`


In [None]:
import subprocess
from subprocess import getoutput  # kept so the name stays available to other cells

# Look up the numeric project number. stdout and stderr are captured
# separately so that a gcloud warning or error message can never be mistaken
# for the project number and embedded in the service-account name.
_describe = subprocess.run(
    f"gcloud projects describe {project_id} --format='value(projectNumber)'",
    shell=True,
    capture_output=True,
    text=True,
)
project_number = _describe.stdout.strip()
if not project_number.isdigit():
    raise RuntimeError(
        f"Could not determine the project number for project {project_id!r}: "
        f"{_describe.stderr.strip() or project_number or 'no output from gcloud'}"
    )

# The roles are granted to the project's Compute Engine default service account.
service_account = f"serviceAccount:{project_number}-compute@developer.gserviceaccount.com"

for role in (
    "roles/aiplatform.user",
    "roles/artifactregistry.repoAdmin",
    "roles/storage.admin",
):
    run_bash_cmd(f"gcloud projects add-iam-policy-binding {project_id} --member={service_account} --role={role}")


If the Cloud Storage bucket or the Artifact Registry repositories don't exist yet, create them.


In [None]:
# Provision the staging bucket and the Docker repositories. The public
# repository is only requested when a name was configured for it.
setup_cmds = [
    f"gsutil mb -l {region} -p {project_id} {bucket_url}",
    f"gcloud artifacts repositories create {private_repository} --repository-format=docker --location={region}",
]
if public_repository:
    setup_cmds.append(
        f"gcloud artifacts repositories create {public_repository} --repository-format=docker --location={region}"
    )

for setup_cmd in setup_cmds:
    run_bash_cmd(setup_cmd)


(Optional) Grant Artifact Registry read access to a user/group/service account for public repo.


In [None]:
# Principal that should receive read access to the public repository.
# Leave as None to skip this step.
member = None  # e.g. 'user:test-user@gmail.com' or 'serviceAccount:xyz@project.iam.gserviceaccount.com'

# The binding is only attempted when both a public repository and a member are set.
grant_read_access = bool(public_repository and member)
if grant_read_access:
    run_bash_cmd(
        f"gcloud artifacts repositories add-iam-policy-binding {public_repository} --location={region} --member={member} --role=roles/artifactregistry.reader"
    )


### Pull multi-LLM NIM from NGC and push to Artifact Registry


In [None]:
import shlex
from pathlib import Path

# Without a key the nvcr.io login below cannot succeed, and a blank
# `export NGC_API_KEY=` line would be written to ~/.bashrc.
if not NGC_API_KEY:
    raise ValueError("NGC_API_KEY is empty; set it before pulling the NIM image from nvcr.io.")

container_name = "LLM-NIM"
local_cache_dir = str(Path(local_nim_cache).expanduser())

# Quote the key before it is spliced into the shell script so that special
# characters cannot be interpreted by the shell. A key made only of
# shell-safe characters is left exactly as it was.
ngc_key_arg = shlex.quote(NGC_API_KEY)
bashrc_line = shlex.quote(f"export NGC_API_KEY={ngc_key_arg}")

# The export line is appended to ~/.bashrc only if that exact line is not
# already there, so re-running this cell does not pile up duplicates.
bash_cmd = f"""
    export NGC_API_KEY={ngc_key_arg}
    grep -qxF -- {bashrc_line} ~/.bashrc 2>/dev/null || printf '%s\\n' {bashrc_line} >> ~/.bashrc
    echo "$NGC_API_KEY" | docker login nvcr.io --username '$oauthtoken' --password-stdin
    mkdir -p "{local_cache_dir}"
    chmod -R a+w "{local_cache_dir}"
    echo "Local NIM cache created at {local_cache_dir}"
    docker pull {nim_image_ngc}
    """
run_bash_cmd(bash_cmd)

# Tag and push to private Artifact Registry
run_bash_cmd(f"docker tag {nim_image_ngc} {private_nim_image}")
run_bash_cmd(f"docker push {private_nim_image}")

# Optional: also push to public AR
if public_repository:
    run_bash_cmd(f"docker tag {private_nim_image} {public_nim_image}")
    run_bash_cmd(f"docker push {public_nim_image}")


### Optional: Run multi-LLM NIM locally (validate model startup) **in terminal**


In [None]:
# Run in terminal
export HF_TOKEN=""
export CONTAINER_NAME="LLM-NIM"
export IMG_NAME="nvcr.io/nim/nvidia/llm-nim:1.13.0"
# No spaces around "=": `export NAME = "value"` is rejected by the shell
# ("not a valid identifier") and would leave the variable unset.
export NIM_MODEL_NAME="hf://meta-llama/Meta-Llama-3-8B"
export NIM_SERVED_MODEL_NAME="meta/llama3-8b-instruct"
export LOCAL_NIM_CACHE="$HOME/.cache/nim"

# Create cache directory
mkdir -p "$LOCAL_NIM_CACHE"
chmod -R a+w "$LOCAL_NIM_CACHE"

# Run Docker container
docker run -it --rm --name="$CONTAINER_NAME" \
  --gpus all \
  --shm-size="16GB" \
  -e HF_TOKEN="$HF_TOKEN" \
  -e NIM_MODEL_NAME="$NIM_MODEL_NAME" \
  -e NIM_SERVED_MODEL_NAME="$NIM_SERVED_MODEL_NAME" \
  -v "$LOCAL_NIM_CACHE:/opt/nim/.cache" \
  -u "$(id -u)" \
  -p 8000:8000 \
  "$IMG_NAME"

### Upload multi-LLM NIM as a Vertex AI Model resource


In [6]:
from google.api_core.future.polling import DEFAULT_POLLING
from google.cloud import aiplatform

# Raise the timeout on the shared default polling object.
# NOTE(review): the value is presumably in seconds; confirm against google-api-core.
POLLING_TIMEOUT = 360000
DEFAULT_POLLING._timeout = POLLING_TIMEOUT

# Point the SDK at the project, region and staging bucket configured earlier.
aiplatform.init(
    project=project_id,
    location=region,
    staging_bucket=bucket_url,
)

In [None]:
# multi-LLM NIM needs the Hugging Face token and model selection via env vars.
# Shared memory is not configured here: "shm-size" is a `docker run` flag, not
# an environment variable (and not a valid environment-variable name). On
# Vertex AI it is set with `serving_container_shared_memory_size_mb` below.
serving_env = {
    "PORT": "8000",
    "HF_TOKEN": HF_TOKEN or "",
    "NIM_MODEL_NAME": NIM_MODEL_NAME,
    "NIM_SERVED_MODEL_NAME": NIM_SERVED_MODEL_NAME,
}

# Reuse an existing Model with the same display name instead of uploading a
# duplicate. A reused Model keeps the container settings it was uploaded with.
models = aiplatform.Model.list(filter=f'displayName="{va_model_name}"')

if models:
    model = models[0]
else:
    model = aiplatform.Model.upload(
        display_name=va_model_name,
        serving_container_image_uri=private_nim_image,
        serving_container_predict_route="/v1/chat/completions",
        serving_container_health_route="/v1/health/ready",
        serving_container_environment_variables=serving_env,
        serving_container_shared_memory_size_mb=16000,
        serving_container_ports=[8000],
        sync=True,
    )
model.wait()

print("Model:")
print(f"\tDisplay name: {model.display_name}")
print(f"\tResource name: {model.resource_name}")


### Create Vertex AI Endpoint and deploy the model


In [None]:
from google.cloud.aiplatform import Endpoint

# Look for an Endpoint with the expected display name; create one only when
# the lookup comes back empty.
endpoints = Endpoint.list(filter=f'displayName="{endpoint_name}"')
if not endpoints:
    print(f"Endpoint {endpoint_name} doesn't exist, creating...")
    endpoint = aiplatform.Endpoint.create(display_name=endpoint_name)
else:
    endpoint = endpoints[0]

print(
    "Endpoint:",
    f"\tDisplay name: {endpoint.display_name}",
    f"\tResource name: {endpoint.resource_name}",
    sep="\n",
)



In [None]:
# Deployment settings, collected in one place: a single fixed replica on a
# g2-standard-24 machine with two NVIDIA L4 accelerators, taking all traffic.
deploy_options = dict(
    endpoint=endpoint,
    deployed_model_display_name=va_model_name,
    traffic_percentage=100,
    machine_type="g2-standard-24",
    min_replica_count=1,
    max_replica_count=1,
    accelerator_type="NVIDIA_L4",
    accelerator_count=2,
    enable_access_logging=True,
    sync=True,
)

model.deploy(**deploy_options)
print(f"Model {model.display_name} deployed at endpoint {endpoint.display_name}.")


### Endpoint inference


In [12]:
# Build the chat-completions request bodies and save them as JSON files.
import json

# Conversation sent with both requests.
messages = [
    {"role": "user", "content": "Hello! How are you?"},
    {"role": "assistant", "content": "Hi! I am quite well, how can I help you today?"},
    {"role": "user", "content": "Write a short limerick about the wonders of GPU computing."},
]

# Jinja chat template, assembled from its fragments in order.
llama3_chat_template = "".join([
    "{% for message in messages %}",
    "{% if message['role'] == 'system' %}",
    "{{'<|begin_of_text|>' + message['content']}}",
    "{% elif message['role'] == 'user' %}",
    "{{'<|start_header_id|>user<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>'}}",
    "{% elif message['role'] == 'assistant' %}",
    "{{'<|start_header_id|>assistant<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>'}}",
    "{% endif %}",
    "{% endfor %}",
    "{% if add_generation_prompt %}",
    "{{'<|start_header_id|>assistant<|end_header_id|>\n\n'}}",
    "{% endif %}",
])

# Non-streaming request: explicit sampling settings plus the chat template.
payload = dict(
    model=NIM_SERVED_MODEL_NAME,
    messages=messages,
    temperature=0.2,
    max_tokens=512,
    top_p=0.8,
    chat_template=llama3_chat_template,
)

# Streaming request.
payload_s = dict(
    model=NIM_SERVED_MODEL_NAME,
    messages=messages,
    max_tokens=512,
    stream=True,
)

# Write each request body to its own JSON file.
for request_path, request_body in (
    ("request.json", payload),
    ("request_stream.json", payload_s),
):
    with open(request_path, "w") as outfile:
        json.dump(request_body, outfile)


In [None]:
# Show the non-streaming request body built in the previous cell.
print(payload)

In [None]:
# Python SDK rawPredict
from pprint import pprint
from google.api import httpbody_pb2
from google.cloud import aiplatform_v1

http_body = httpbody_pb2.HttpBody(
    data=json.dumps(payload).encode("utf-8"),
    content_type="application/json",
)

req = aiplatform_v1.RawPredictRequest(
    http_body=http_body, endpoint=endpoint.resource_name
)

API_ENDPOINT = f"{region}-aiplatform.googleapis.com"
client_options = {"api_endpoint": API_ENDPOINT}

pred_client = aiplatform.gapic.PredictionServiceClient(client_options=client_options)
response = pred_client.raw_predict(req)
print("Response:")
try:
    print(json.loads(response.data))
except Exception:
    print(response.data.decode("utf-8"))


### Clean up


In [None]:
# Switches selecting which resources this cell removes.
delete_endpoint = True
delete_model = True
delete_image = True
delete_art_repo = False
delete_bucket = False

# Endpoint: undeploy everything first, then delete it. Errors are printed
# and the rest of the clean-up still runs.
if delete_endpoint:
    try:
        endpoint.undeploy_all(sync=True)
        endpoint.delete()
        print(f"Deleted endpoint {endpoint.display_name}")
    except Exception as err:
        print(err)

# Model resource: same error handling as the endpoint.
if delete_model:
    try:
        model.delete()
        print(f"Deleted model {model.display_name}")
    except Exception as err:
        print(err)

# Remaining resources are removed with CLI commands, each behind its own switch.
if delete_image:
    # Container image (and its tags) in the private Artifact Registry repository.
    run_bash_cmd(f"gcloud artifacts docker images delete --quiet --delete-tags {private_nim_image}")

if delete_art_repo:
    # The private Artifact Registry repository itself.
    run_bash_cmd(f"gcloud artifacts repositories delete {private_repository} --location={region} -q")

if delete_bucket:
    # The Cloud Storage bucket and everything in it.
    run_bash_cmd(f"gsutil rm -rf {bucket_url}")


In [None]:
# NOTE(review): this cell repeats the preceding clean-up cell verbatim, so
# running both attempts every deletion a second time. Consider removing one
# of the two copies.

# Switches selecting which resources get removed.
delete_endpoint = True
delete_model = True
delete_image = True
delete_art_repo = False
delete_bucket = False

# Undeploy model and delete endpoint
# (any exception is printed and the remaining steps still run)
try:
    if delete_endpoint:
        endpoint.undeploy_all(sync=True)
        endpoint.delete()
        print(f"Deleted endpoint {endpoint.display_name}")
except Exception as e:
    print(e)

# Delete the model resource
# (any exception is printed and the remaining steps still run)
try:
    if delete_model:
        model.delete()
        print(f"Deleted model {model.display_name}")
except Exception as e:
    print(e)

# Delete the container image from Artifact Registry
if delete_image:
    run_bash_cmd(f"gcloud artifacts docker images delete --quiet --delete-tags {private_nim_image}")

# Optionally delete repositories and bucket
if delete_art_repo:
    run_bash_cmd(f"gcloud artifacts repositories delete {private_repository} --location={region} -q")

if delete_bucket:
    run_bash_cmd(f"gsutil rm -rf {bucket_url}")
