In [None]:
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Getting Started with `Llama Nemotron` Models


<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/generative_ai/llama_nemotron_intro.ipynb">
      <img src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Fgenerative_ai%2Fllama_nemotron_intro.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/generative_ai/llama_nemotron_intro.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/generative_ai/llama_nemotron_intro.ipynb">
      <img width="32px" src="https://raw.githubusercontent.com/primer/octicons/refs/heads/main/icons/mark-github-24.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

## Overview

This notebook demonstrates how to deploy `Llama Nemotron` NVIDIA Inference Microservices (NIM) to Google Cloud Platform (GCP) Vertex AI.

You’ll learn how to run an NVIDIA NIM container on a Vertex AI compute instance, register it as a model in the Vertex AI Model Registry, deploy it to an Endpoint, and perform real-time predictions — all within a managed and scalable GCP environment.

The steps performed include:

- Set up Vertex AI environment and authentication.
- Upload the NVIDIA NIM model to the Vertex AI Model Registry.
- Create a Vertex AI Endpoint.
- Deploy the Model to the Endpoint.
- Send a prediction request via API and Python SDK.

### `Llama Nemotron` on Vertex AI

You can deploy the `Llama Nemotron` models in your own endpoint.

### Available `Llama Nemotron` models

#### `Llama-3.3-Nemotron-Super-49B-v1.5`
**Llama-3.3-Nemotron-Super-49B-v1.5** is a reasoning model that is post trained for reasoning, human chat preferences, and agentic tasks, such as Retrieval-Augmented Generation (RAG) and tool calling. The model supports a context length of 128K tokens. Llama-3.3-Nemotron-Super-49B-v1.5 is a model which offers a great tradeoff between model accuracy and efficiency. Efficiency (throughput) directly translates to savings. Using a novel Neural Architecture Search (NAS) approach, we greatly reduce the model’s memory footprint, enabling larger workloads, as well as fitting the model on a single GPU at high workloads (H200). This NAS approach enables the selection of a desired point in the accuracy-efficiency tradeoff. For more information on the NAS approach, please refer to this [paper](https://arxiv.org/abs/2411.19146).

## Objective

This notebook shows how to use **Vertex AI API** to deploy the `Llama Nemotron` models.

For more information, see the [NIM documentation](https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1_5/modelcard).


## Get Started

### Install Vertex AI SDK for Python and other required packages


In [None]:
! pip3 install --upgrade --quiet google-cloud-aiplatform

In [None]:
! pip3 install -U -q httpx

### Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
import sys

# Only Google Colab needs the kernel shut down here so that the freshly
# installed packages are picked up; other environments skip this cell's body.
if "google.colab" in sys.modules:
    from IPython import Application

    Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.


In [None]:
import sys

# The interactive Google sign-in helper is only used when running on Colab.
if "google.colab" in sys.modules:
    from google.colab import auth as colab_auth

    colab_auth.authenticate_user()

### Select one of the `Llama Nemotron` models

In [2]:
PUBLISHER_NAME = "nvidia"  # @param {type:"string"}
PUBLISHER_MODEL_NAME = (
    "llama-3.3-nemotron-super-49b-v1.5"  # @param ["llama-3.3-nemotron-super-49b-v1.5"]
)

# Regions offered in the location dropdown, keyed by publisher model name.
SUPPORTED_REGIONS_BY_MODEL = {
    "llama-3.3-nemotron-super-49b-v1.5": [
        "asia-northeast1",
        "asia-northeast3",
        "asia-south1",
        "asia-south2",
        "asia-southeast1",
        "australia-southeast1",
        "europe-north1",
        "europe-west1",
        "europe-west2",
        "europe-west3",
        "europe-west4",
        "me-west1",
        "northamerica-northeast2",
        "us-central1",
        "us-east1",
        "us-east4",
        "us-east5",
        "us-south1",
        "us-west1",
        "us-west2",
        "us-west3",
        "us-west4",
    ],
}

# Fail here with a clear message instead of leaving `available_regions`
# undefined, which would surface later as a NameError in the dropdown cell.
if PUBLISHER_MODEL_NAME not in SUPPORTED_REGIONS_BY_MODEL:
    raise ValueError(
        f"Unsupported PUBLISHER_MODEL_NAME {PUBLISHER_MODEL_NAME!r}; "
        f"expected one of {sorted(SUPPORTED_REGIONS_BY_MODEL)}"
    )

available_regions = SUPPORTED_REGIONS_BY_MODEL[PUBLISHER_MODEL_NAME]

### Select a location from the dropdown

In [None]:
import ipywidgets as widgets
from IPython.display import display


def dropdown_loc_eventhandler(change):
    """Copy a newly selected dropdown value into the global LOCATION and echo it."""
    global LOCATION
    is_value_change = change["type"] == "change" and change["name"] == "value"
    if not is_value_change:
        return
    LOCATION = change.new
    print("Selected:", change.new)


# NOTE(review): `font_weight` does not look like a Dropdown option in current
# ipywidgets releases and may have no effect -- confirm before removing it.
dropdown_loc = widgets.Dropdown(
    options=available_regions,
    description="Select a location:",
    font_weight="bold",
    style={"description_width": "initial"},
)

# Seed LOCATION from the dropdown's current value so later cells work even if
# the selection is never changed.
LOCATION = dropdown_loc.value
dropdown_loc.observe(dropdown_loc_eventhandler, names="value")
display(dropdown_loc)

### Set Google Cloud project information and initialize Vertex AI SDK for Python

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com). Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
# Regional Vertex AI REST base URL, derived from the selected LOCATION.
ENDPOINT = f"https://{LOCATION}-aiplatform.googleapis.com"

# Stop early when the placeholder (or an empty value) was left in place.
project_id_missing = (not PROJECT_ID) or (PROJECT_ID == "[your-project-id]")
if project_id_missing:
    raise ValueError("Please set your PROJECT_ID")

### Import required libraries

In [None]:
import json
import shlex
import time

## Using Vertex AI API

### Upload Model

In [None]:
UPLOAD_MODEL_PAYLOAD = {
    "model": {
        "displayName": "ModelGarden_LaunchPad_Model_" + time.strftime("%Y%m%d-%H%M%S"),
        "baseModelSource": {
            "modelGardenSource": {
                "publicModelName": f"publishers/{PUBLISHER_NAME}/models/{PUBLISHER_MODEL_NAME}",
            }
        },
    }
}

request = json.dumps(UPLOAD_MODEL_PAYLOAD)

! curl -X POST -H "Authorization: Bearer $(gcloud auth print-access-token)" -H "Content-Type: application/json" {ENDPOINT}/v1beta1/projects/{PROJECT_ID}/locations/{LOCATION}/models:upload -d '{request}'

### Get Model

After uploading the model to the Vertex AI Model Registry, extract its Model ID by specifying the target region and matching the display name pattern.

In [None]:
# Adjust filter to include exact timestamp if needed
MODEL_ID = !gcloud ai models list --region=$LOCATION --filter="display_name~'.*ModelGarden_LaunchPad_Model.*'" --format="value(name)" | head -n 1
MODEL_ID = MODEL_ID[1]
print(MODEL_ID)

In [None]:
MODEL_ID = "YOUR_MODEL_ID"  # @param {type: "string"}

! curl -X GET -H "Authorization: Bearer $(gcloud auth print-access-token)" -H "Content-Type: application/json" {ENDPOINT}/v1/projects/{PROJECT_ID}/locations/{LOCATION}/models/{MODEL_ID}

### Create Endpoint

In [None]:
CREATE_ENDPOINT_PAYLOAD = {
    "displayName": "ModelGarden_LaunchPad_Endpoint_" + time.strftime("%Y%m%d-%H%M%S"),
}

request = json.dumps(CREATE_ENDPOINT_PAYLOAD)

! curl -X POST -H "Authorization: Bearer $(gcloud auth print-access-token)" -H "Content-Type: application/json" {ENDPOINT}/v1/projects/{PROJECT_ID}/locations/{LOCATION}/endpoints -d '{request}'

### Get Endpoint

After creating the Endpoint, extract its Endpoint ID by specifying the target region and matching the display name pattern.

In [None]:
# Adjust filter to include exact timestamp if needed
ENDPOINT_ID = !gcloud ai endpoints list --region=$LOCATION --filter="DISPLAY_NAME ~ .*ModelGarden_LaunchPad_Endpoint.*" --format="value(name)" | head -n 1
ENDPOINT_ID = ENDPOINT_ID[1]
print(ENDPOINT_ID)

In [None]:
ENDPOINT_ID = "YOUR_ENDPOINT_ID"  # @param {type: "string"}

! curl -X GET -H "Authorization: Bearer $(gcloud auth print-access-token)" -H "Content-Type: application/json" {ENDPOINT}/v1/projects/{PROJECT_ID}/locations/{LOCATION}/endpoints/{ENDPOINT_ID}

### Deploy Model

Deploy the model to the endpoint. The deployment process will take a few minutes.

In [None]:
import ipywidgets as widgets
from IPython.display import display

# Machine types offered in the dropdown, each mapped to the accelerator type
# and count sent with the deployment request (please change if needed).
MACHINE_CONFIG = {
    "a2-highgpu-8g": {"type": "NVIDIA_TESLA_A100", "count": 8},
    "a2-ultragpu-8g": {"type": "NVIDIA_A100_80GB", "count": 8},
    "a3-highgpu-4g": {"type": "NVIDIA_H100_80GB", "count": 4},
    "g4-standard-384": {"type": "NVIDIA_RTX_PRO_6000", "count": 8},
}
COMPATIBLE_MACHINES = list(MACHINE_CONFIG.keys())

# Widgets: one dropdown plus two read-only HTML labels.
dropdown_machine = widgets.Dropdown(
    options=COMPATIBLE_MACHINES,
    description="Machine type:",
    style={"description_width": "initial"},
)
label_accelerator_type = widgets.HTML()
label_accelerator_count = widgets.HTML()


def update_labels(machine_type: str):
    """Refresh both accelerator labels for ``machine_type`` and return its config.

    A machine type missing from MACHINE_CONFIG falls back to
    ``{"type": "Unknown", "count": 0}``.
    """
    fallback = {"type": "Unknown", "count": 0}
    info = MACHINE_CONFIG.get(machine_type, fallback)
    label_accelerator_type.value = f"<b>Accelerator type:</b> {info['type']}"
    label_accelerator_count.value = f"<b>Accelerator count:</b> {info['count']}"
    return info


def on_machine_change(change):
    """Sync the machine/accelerator globals and labels with a new dropdown value."""
    global MACHINE_TYPE, ACCELERATOR_TYPE, ACCELERATOR_COUNT
    if change["name"] != "value" or change["type"] != "change":
        return
    MACHINE_TYPE = change.new
    selected = update_labels(MACHINE_TYPE)
    ACCELERATOR_TYPE, ACCELERATOR_COUNT = selected["type"], selected["count"]


# Seed the globals from the dropdown's current value so the deploy cell works
# even when the selection is never changed.
MACHINE_TYPE = dropdown_machine.value
initial_info = update_labels(MACHINE_TYPE)
ACCELERATOR_TYPE, ACCELERATOR_COUNT = initial_info["type"], initial_info["count"]

dropdown_machine.observe(on_machine_change, names="value")

selector_rows = [dropdown_machine, label_accelerator_type, label_accelerator_count]
display(widgets.VBox(selector_rows))

In [None]:
DEPLOY_PAYLOAD = {
    "deployedModel": {
        "model": f"projects/{PROJECT_ID}/locations/{LOCATION}/models/{MODEL_ID}",
        "displayName": "ModelGarden_LaunchPad_DeployedModel_"
        + time.strftime("%Y%m%d-%H%M%S"),
        "dedicatedResources": {
            "machineSpec": {
                "machineType": MACHINE_TYPE,
                "acceleratorType": ACCELERATOR_TYPE,
                "acceleratorCount": ACCELERATOR_COUNT,
            },
            "minReplicaCount": 1,
            "maxReplicaCount": 1,
        },
    },
    "trafficSplit": {"0": 100},
}

request = json.dumps(DEPLOY_PAYLOAD)
print("Request payload to Deploy Model:")
print(json.dumps(DEPLOY_PAYLOAD, indent=2))
print("\nResult:")
! curl -X POST -H "Authorization: Bearer $(gcloud auth print-access-token)" -H "Content-Type: application/json" {ENDPOINT}/v1/projects/{PROJECT_ID}/locations/{LOCATION}/endpoints/{ENDPOINT_ID}:deployModel -d '{request}'

### Prediction

#### Unary call

Sends a POST request to the specified API endpoint to get a response from the model for a letter-counting question using the provided payload, with reasoning on / off.

In [None]:
# Reasoning off 

PAYLOAD = {
    "model": PUBLISHER_NAME + "/" + PUBLISHER_MODEL_NAME,
    "messages": [
       {"role": "system", "content": "/no_think"}, 
       {"role":"user", "content":"How many 'r's are in 'strawberry'?"}
    ],
    "temperature": 0.6,
    "top_p": 0.95,
    "max_tokens": 1024,
    "stream": False
}

request = json.dumps(PAYLOAD)

!curl -X POST \
  -H "Authorization: Bearer $(gcloud auth print-access-token)" \
  -H "Content-Type: application/json" {ENDPOINT}/v1/projects/{PROJECT_ID}/locations/{LOCATION}/endpoints/{ENDPOINT_ID}:rawPredict \
  -d '{request}'

In [None]:
# Reasoning on

PAYLOAD = {
    "model": PUBLISHER_NAME + "/" + PUBLISHER_MODEL_NAME,
    "messages": [
       {"role": "system", "content": "/think"}, 
       {"role":"user", "content":"How many 'r's are in 'strawberry'?"}
    ],
    "temperature": 0.6,
    "top_p": 0.95,
    "max_tokens": 1024,
    "stream": False
}

request = json.dumps(PAYLOAD)

!curl -X POST \
  -H "Authorization: Bearer $(gcloud auth print-access-token)" \
  -H "Content-Type: application/json" {ENDPOINT}/v1/projects/{PROJECT_ID}/locations/{LOCATION}/endpoints/{ENDPOINT_ID}:rawPredict \
  -d '{request}'

#### Streaming call

Sends a POST request to the specified API endpoint to stream a response from the model for a letter-counting question using the provided payload, with reasoning on / off.

In [None]:
# Reasoning off 

PAYLOAD = {
    "model": PUBLISHER_NAME + "/" + PUBLISHER_MODEL_NAME,
    "messages": [
       {"role": "system", "content": "/no_think"}, 
       {"role":"user", "content":"How many 'r's are in 'strawberry'?"}
    ],
    "temperature": 0.6,
    "top_p": 0.95,
    "max_tokens": 1024,
    "stream": True
}

request = json.dumps(PAYLOAD)
!curl -X POST \
  -H "Authorization: Bearer $(gcloud auth print-access-token)" \
  -H "Content-Type: application/json" {ENDPOINT}/v1/projects/{PROJECT_ID}/locations/{LOCATION}/endpoints/{ENDPOINT_ID}:streamRawPredict \
  -d '{request}'

In [None]:
# Reasoning on

PAYLOAD = {
    "model": PUBLISHER_NAME + "/" + PUBLISHER_MODEL_NAME,
    "messages": [
       {"role": "system", "content": "/think"}, 
       {"role":"user", "content":"How many 'r's are in 'strawberry'?"}
    ],
    "temperature": 0.6,
    "top_p": 0.95,
    "max_tokens": 1024,
    "stream": True
}

request = json.dumps(PAYLOAD)
!curl -X POST \
  -H "Authorization: Bearer $(gcloud auth print-access-token)" \
  -H "Content-Type: application/json" {ENDPOINT}/v1/projects/{PROJECT_ID}/locations/{LOCATION}/endpoints/{ENDPOINT_ID}:streamRawPredict \
  -d '{request}'

## Using Vertex AI SDK for *Python*

In [None]:
from google.cloud import aiplatform

In [None]:
# Bind the SDK to the project and region selected earlier in the notebook.
aiplatform.init(
    project=PROJECT_ID,
    location=LOCATION,
)

### Upload Model

In [None]:
# Register the Model Garden publisher model in this project's Model Registry.
# The display name uses the "Model" prefix (not "Endpoint") so the uploaded
# model is named consistently with the REST upload cell and is not mistaken
# for an endpoint when listing resources.
model = aiplatform.Model.upload(
    display_name="ModelGarden_LaunchPad_Model_" + time.strftime("%Y%m%d-%H%M%S"),
    model_garden_source_model_name=f"publishers/{PUBLISHER_NAME}/models/{PUBLISHER_MODEL_NAME}",
)

### Create Endpoint

In [None]:
# Create an endpoint with a timestamped display name to deploy the model to.
endpoint_display_name = "ModelGarden_LaunchPad_Endpoint_" + time.strftime(
    "%Y%m%d-%H%M%S"
)
my_endpoint = aiplatform.Endpoint.create(display_name=endpoint_display_name)

### Deploy Model

Deploy the model to the endpoint. The deployment process will take a few minutes.

In [None]:
import ipywidgets as widgets
from IPython.display import display

# Machine types offered in the dropdown, each mapped to the accelerator type
# and count passed to the deployment call (please change if needed).
MACHINE_CONFIG = {
    "a2-highgpu-8g": {"type": "NVIDIA_TESLA_A100", "count": 8},
    "a2-ultragpu-8g": {"type": "NVIDIA_A100_80GB", "count": 8},
    "a3-highgpu-4g": {"type": "NVIDIA_H100_80GB", "count": 4},
    "g4-standard-384": {"type": "NVIDIA_RTX_PRO_6000", "count": 8},
}
COMPATIBLE_MACHINES = list(MACHINE_CONFIG.keys())

# Widgets: one dropdown plus two read-only HTML labels.
dropdown_machine = widgets.Dropdown(
    options=COMPATIBLE_MACHINES,
    description="Machine type:",
    style={"description_width": "initial"},
)
label_accelerator_type = widgets.HTML()
label_accelerator_count = widgets.HTML()


def update_labels(machine_type: str):
    """Refresh both accelerator labels for ``machine_type`` and return its config.

    A machine type missing from MACHINE_CONFIG falls back to
    ``{"type": "Unknown", "count": 0}``.
    """
    fallback = {"type": "Unknown", "count": 0}
    info = MACHINE_CONFIG.get(machine_type, fallback)
    label_accelerator_type.value = f"<b>Accelerator type:</b> {info['type']}"
    label_accelerator_count.value = f"<b>Accelerator count:</b> {info['count']}"
    return info


def on_machine_change(change):
    """Sync the machine/accelerator globals and labels with a new dropdown value."""
    global MACHINE_TYPE, ACCELERATOR_TYPE, ACCELERATOR_COUNT
    if change["name"] != "value" or change["type"] != "change":
        return
    MACHINE_TYPE = change.new
    selected = update_labels(MACHINE_TYPE)
    ACCELERATOR_TYPE, ACCELERATOR_COUNT = selected["type"], selected["count"]


# Seed the globals from the dropdown's current value so the deploy cell works
# even when the selection is never changed.
MACHINE_TYPE = dropdown_machine.value
initial_info = update_labels(MACHINE_TYPE)
ACCELERATOR_TYPE, ACCELERATOR_COUNT = initial_info["type"], initial_info["count"]

dropdown_machine.observe(on_machine_change, names="value")

selector_rows = [dropdown_machine, label_accelerator_type, label_accelerator_count]
display(widgets.VBox(selector_rows))

In [None]:
# Deploy the uploaded model to the endpoint on the machine and accelerator
# selected above, with a single replica and all traffic routed to it.
deployed_model_name = "ModelGarden_LaunchPad_DeployedModel_" + time.strftime(
    "%Y%m%d-%H%M%S"
)
model.deploy(
    endpoint=my_endpoint,
    deployed_model_display_name=deployed_model_name,
    traffic_split={"0": 100},
    machine_type=MACHINE_TYPE,
    accelerator_type=ACCELERATOR_TYPE,
    accelerator_count=ACCELERATOR_COUNT,
    min_replica_count=1,
    max_replica_count=1,
)

### Prediction

#### Unary call

Sends a POST request to the specified API endpoint to get a response from the model for a letter-counting question using the provided payload, with reasoning on / off.

In [None]:
# Reasoning off

# Chat messages: the system message switches reasoning off, the user message
# carries the question.
chat_messages = [
    {"role": "system", "content": "/no_think"},
    {"role": "user", "content": "How many 'r's are in 'strawberry'?"},
]
PAYLOAD = {
    "model": f"{PUBLISHER_NAME}/{PUBLISHER_MODEL_NAME}",
    "messages": chat_messages,
    "temperature": 0.6,
    "top_p": 0.95,
    "max_tokens": 1024,
    "stream": False,
}

request = json.dumps(PAYLOAD)

json_headers = {"Content-Type": "application/json"}
response = my_endpoint.raw_predict(body=request, headers=json_headers)

# Print the content of the first returned choice, JSON-encoded.
result = json.loads(response.text)
first_choice = result["choices"][0]
print(json.dumps(first_choice["message"]["content"]))

In [None]:
# Reasoning on

# Chat messages: the system message switches reasoning on, the user message
# carries the question.
chat_messages = [
    {"role": "system", "content": "/think"},
    {"role": "user", "content": "How many 'r's are in 'strawberry'?"},
]
PAYLOAD = {
    "model": f"{PUBLISHER_NAME}/{PUBLISHER_MODEL_NAME}",
    "messages": chat_messages,
    "temperature": 0.6,
    "top_p": 0.95,
    "max_tokens": 1024,
    "stream": False,
}

request = json.dumps(PAYLOAD)

json_headers = {"Content-Type": "application/json"}
response = my_endpoint.raw_predict(body=request, headers=json_headers)

# Print the content of the first returned choice, JSON-encoded.
result = json.loads(response.text)
first_choice = result["choices"][0]
print(json.dumps(first_choice["message"]["content"]))

#### Streaming call

Sends a POST request to the specified API endpoint to stream a response from the model for a letter-counting question using the provided payload, with reasoning on / off.

In [None]:
# Reasoning off

# Chat messages: the system message switches reasoning off, the user message
# carries the question.
chat_messages = [
    {"role": "system", "content": "/no_think"},
    {"role": "user", "content": "How many 'r's are in 'strawberry'?"},
]
PAYLOAD = {
    "model": f"{PUBLISHER_NAME}/{PUBLISHER_MODEL_NAME}",
    "messages": chat_messages,
    "temperature": 0.6,
    "top_p": 0.95,
    "max_tokens": 1024,
    "stream": True,
}

request = json.dumps(PAYLOAD)

# Print each streamed item as it is yielded by the endpoint.
json_headers = {"Content-Type": "application/json"}
response_stream = my_endpoint.stream_raw_predict(body=request, headers=json_headers)
for stream_response in response_stream:
    print(stream_response)

In [None]:
# Reasoning on

# Chat messages: the system message switches reasoning on, the user message
# carries the question.
chat_messages = [
    {"role": "system", "content": "/think"},
    {"role": "user", "content": "How many 'r's are in 'strawberry'?"},
]
PAYLOAD = {
    "model": f"{PUBLISHER_NAME}/{PUBLISHER_MODEL_NAME}",
    "messages": chat_messages,
    "temperature": 0.6,
    "top_p": 0.95,
    "max_tokens": 1024,
    "stream": True,
}

request = json.dumps(PAYLOAD)

# Print each streamed item as it is yielded by the endpoint.
json_headers = {"Content-Type": "application/json"}
response_stream = my_endpoint.stream_raw_predict(body=request, headers=json_headers)
for stream_response in response_stream:
    print(stream_response)