In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Llama 3.3 (Deployment)

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/model_garden/model_garden_pytorch_llama3_3_deployment.ipynb">
      <img alt="Workbench logo" src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" width="32px"><br> Run in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_pytorch_llama3_3_deployment.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_llama3_3_deployment.ipynb">
      <img alt="GitHub logo" src="https://github.githubassets.com/assets/GitHub-Mark-ea2971cee799.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview

This notebook demonstrates how to deploy a **Llama 3.3** open model on Google Cloud Vertex AI.

### Objectives

- Deploy Llama 3.3 on GPU using containerized serving backends such as [vLLM](https://github.com/vllm-project/vllm) and TensorRT-LLM.
- Use the deployed model to serve raw prediction and chat completion requests for text inputs.

### File a Bug

If you encounter issues with this notebook, report them on [GitHub](https://github.com/GoogleCloudPlatform/vertex-ai-samples/issues/new).

### Costs

This tutorial uses billable components of Google Cloud:

- Vertex AI
- Cloud Storage

Refer to the [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage pricing](https://cloud.google.com/storage/pricing) pages for more information. Use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to estimate your projected costs.

## Get Started

### Install Vertex AI SDK and other required packages

In [None]:
# Install the Vertex AI SDK plus the client libraries used by the inference
# cells below (OpenAI SDK, google-auth, requests). google-auth and requests are
# pinned to exact versions; openai is left unpinned.
%pip install --upgrade --force-reinstall --quiet 'google-cloud-aiplatform>=1.106.0' 'openai' 'google-auth==2.27.0' 'requests==2.32.3'

### Authenticate the Notebook Environment (Colab only)

If you're running this notebook in Google Colab, run the following cell to authenticate.

In [None]:
import sys

# The cell is a no-op unless the `google.colab` package is already loaded.
# Note that the `auth` name bound here is `google.colab.auth`, which is a
# different module from the `google.auth` package used for API credentials.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud Project Information

To get started with Vertex AI, ensure you have an existing Google Cloud project and that the [Vertex AI API is enabled](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

See the guide on [setting up your project and development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment). Also confirm that [billing is enabled](https://cloud.google.com/billing/docs/how-to/modify-project).


In [None]:
# Use the environment variable if the user doesn't provide Project ID.
import os

import vertexai

PROJECT_ID = ""  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}

# Fall back to GOOGLE_CLOUD_PROJECT; PROJECT_ID ends up as None when that
# variable is unset as well.
if not PROJECT_ID:
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")

REGION = ""  # @param {type: "string", placeholder: "[your-region]", isTemplate: true}

# Fall back to GOOGLE_CLOUD_REGION, and to "us-central1" when that is unset.
if not REGION:
    REGION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

# Initialize the Vertex AI SDK with the resolved project and region.
vertexai.init(project=PROJECT_ID, location=REGION)

### Import libraries

In [None]:
import json
import time
from typing import Tuple

import google.auth
import google.auth.transport.requests
import requests
from google.cloud import aiplatform
from vertexai import model_garden

## Deploy model

### Choose model variant

You can proceed with the default model variant or select a different one.

In [None]:
# Variant chosen from the form dropdown.
model_version = "llama-3.3-70b-instruct"  # @param ["llama-3.3-70b-instruct", "llama-3.3-70b-instruct-fp8"] {isTemplate:true}
# Model identifier in the form `meta/llama3-3@<variant>`, passed to `OpenModel` below.
MODEL_NAME = f"meta/llama3-3@{model_version}"

To see all deployable model variants available in Model Garden, use:

In [None]:
# Query Model Garden for deployable models matching "llama3-3", excluding
# Hugging Face models.
all_model_versions = model_garden.list_deployable_models(
    model_filter="llama3-3", list_hf_models=False
)
# End the cell with the value so the list is actually displayed; assigning it
# alone produced no output.
all_model_versions

Once you've selected a model variant, initialize it:

In [None]:
# Handle for the selected open model; the following cells call
# `list_deploy_options()` and `deploy()` on it.
model = model_garden.OpenModel(MODEL_NAME)

### Check the Deployment Configuration

Use the `list_deploy_options()` method to view the verified deployment configurations for your selected model. This helps ensure you have sufficient resources (e.g., GPU quota) available to deploy it.

In [None]:
# Fetch the deployment configurations for the selected model in concise form
# and print them for review before deploying.
deploy_options = model.list_deploy_options(concise=True)
print(deploy_options)

### Deploy the Model

Now that you’ve reviewed the deployment options, use the `deploy()` method to serve the selected open model to a Vertex AI endpoint. Deployment time may vary depending on the model size and infrastructure requirements.

> **Note**: If the model requires accepting a license agreement (EULA), set the `accept_eula=True` flag in the deploy call. Set `use_dedicated_endpoint` to `False` if you don't want to use a [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint).

In [None]:
# Shared by the deploy cells and the inference cells below, so requests are sent
# with the same endpoint type the model was deployed with.
use_dedicated_endpoint = True

In [None]:
# Deploy without explicit machine or container settings. The returned
# `endpoint` is used by the inference cells and the clean-up cell.
endpoint = model.deploy(
    accept_eula=True,
    use_dedicated_endpoint=use_dedicated_endpoint,
)

Alternatively, you can select one of the verified deployment configurations listed above.

In [None]:
# Running this cell in addition to the default deploy cell above would create a
# second endpoint and rebind `endpoint`, leaving the first deployment running
# (and billed) with no handle left for the clean-up cell. It is therefore
# opt-in: set the flag to True and skip the default deploy cell to use this
# explicit configuration instead.
use_custom_deploy_config = False  # @param {type:"boolean"}

if use_custom_deploy_config:
    endpoint = model.deploy(
        accept_eula=True,
        use_dedicated_endpoint=use_dedicated_endpoint,
        serving_container_image_uri="us-docker.pkg.dev/deeplearning-platform-release/vertex-model-garden/sglang-serve.cu124.0-4.ubuntu2204.py310:model-garden.sglang-0-4-release_20250831.00_p0",
        machine_type="a3-ultragpu-8g",
        accelerator_type="NVIDIA_H200_141GB",
        accelerator_count=8,
    )
else:
    print(
        "Skipping the custom deployment; set `use_custom_deploy_config = True` to run it."
    )

To further customize your deployment, you can configure:

- **Compute Resources**: Machine type, replica count (min/max), accelerator type and quantity.
- **Infrastructure**: Use Spot VMs, reservation affinity, or dedicated endpoints.
- **Serving Container**: Customize container image, ports, health checks, and environment variables.

See the [Model Garden SDK README](https://github.com/googleapis/python-aiplatform/blob/main/vertexai/model_garden/README.md) for advanced configuration options.

In [None]:
# @title Raw predict

# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts. Sampling parameters supported by vLLM can be found [here](https://docs.vllm.ai/en/latest/dev/sampling_params.html).

# @markdown Example:

# @markdown ```
# @markdown Human: What is a car?
# @markdown Assistant:  A car, or a motor car, is a road-connected human-transportation system used to move people or goods from one place to another. The term also encompasses a wide range of vehicles, including motorboats, trains, and aircrafts. Cars typically have four wheels, a cabin for passengers, and an engine or motor. They have been around since the early 19th century and are now one of the most popular forms of transportation, used for daily commuting, shopping, and other purposes.
# @markdown ```
# @markdown Additionally, you can moderate the generated text with Vertex AI. See [Moderate text documentation](https://cloud.google.com/natural-language/docs/moderating-text) for more details.

# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoint.name` allows us to get the
#   endpoint name of the endpoint `endpoint` created in the cell
#   above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint.

# endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoint = aiplatform.Endpoint(aip_endpoint_name)

prompt = "What is a car?"  # @param {type: "string"}
# @markdown If you encounter an issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, by lowering `max_tokens`.
max_tokens = 50  # @param {type:"integer"}
temperature = 1.0  # @param {type:"number"}
top_p = 1.0  # @param {type:"number"}
top_k = 1  # @param {type:"integer"}
# @markdown Set `raw_response` to `True` to obtain the raw model output. Set `raw_response` to `False` to apply additional formatting in the structure of `"Prompt:\n{prompt.strip()}\nOutput:\n{output}"`.
raw_response = False  # @param {type:"boolean"}

# Overrides parameters for inferences.
# A single instance is sent; it carries the prompt together with the sampling
# parameters chosen in the form above.
instances = [
    {
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "raw_response": raw_response,
    },
]
# The same `use_dedicated_endpoint` flag that was passed to `deploy()` is passed
# to `predict()`.
response = endpoint.predict(
    instances=instances, use_dedicated_endpoint=use_dedicated_endpoint
)

# Print each returned prediction on its own line.
for prediction in response.predictions:
    print(prediction)

# @markdown Click "Show Code" to see more details.

In [None]:
# @title Chat completion

if use_dedicated_endpoint:
    DEDICATED_ENDPOINT_DNS = endpoint.gca_resource.dedicated_endpoint_dns
ENDPOINT_RESOURCE_NAME = endpoint.resource_name

# @title Chat Completions Inference

# @markdown Once deployment succeeds, you can send requests to the endpoint using the OpenAI SDK.

# @markdown First you will need to install the SDK and some auth-related dependencies.

! pip install -qU openai google-auth requests

# @markdown Next fill out some request parameters:

user_message = "How is your day going?"  # @param {type: "string"}
# @markdown If you encounter the issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, such as set `max_tokens` as 20.
max_tokens = 50  # @param {type: "integer"}
temperature = 1.0  # @param {type: "number"}
stream = False  # @param {type: "boolean"}

# @markdown Now we can send a request.

import google.auth
import openai

creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

BASE_URL = (
    f"https://{REGION}-aiplatform.googleapis.com/v1beta1/{ENDPOINT_RESOURCE_NAME}"
)
try:
    if use_dedicated_endpoint:
        BASE_URL = f"https://{DEDICATED_ENDPOINT_DNS}/v1beta1/{ENDPOINT_RESOURCE_NAME}"
except NameError:
    pass

client = openai.OpenAI(base_url=BASE_URL, api_key=creds.token)

model_response = client.chat.completions.create(
    model="",
    messages=[{"role": "user", "content": user_message}],
    temperature=temperature,
    max_tokens=max_tokens,
    stream=stream,
)

if stream:
    usage = None
    contents = []
    for chunk in model_response:
        if chunk.usage is not None:
            usage = chunk.usage
            continue
        print(chunk.choices[0].delta.content, end="")
        contents.append(chunk.choices[0].delta.content)
    print(f"\n\n{usage}")
else:
    print(model_response)

# @markdown Click "Show Code" to see more details.

## Deploy Llama 3.3 70B Instruct FP8 with TensorRT-LLM

In [None]:
# @title Deploy

# @markdown This section uploads Llama 3.3 70B Instruct FP8 model to Model Registry and deploys it to a Vertex Prediction Endpoint. It takes ~1 hour to finish.

# @markdown The [Llama 3.3 70B Instruct FP8 model](https://huggingface.co/nvidia/Llama-3.3-70B-Instruct-FP8) is provided by Nvidia.

# @markdown It's recommended to use the region selected by the deployment button on the model card. If the deployment button is not available, it's recommended to stay with the default region of the notebook.

# @markdown Set the model to deploy.

base_model_name = "Llama-3.3-70B-Instruct-FP8"  # @param ["Llama-3.3-70B-Instruct-FP8"] {isTemplate:true}
# NOTE(review): `VERTEX_AI_MODEL_GARDEN_LLAMA_3_3` is not assigned in any cell
# visible in this notebook — confirm which setup step defines it.
model_id = os.path.join(VERTEX_AI_MODEL_GARDEN_LLAMA_3_3, base_model_name)

# The pre-built serving docker images.
TRTLLM_DOCKER_URI = "us-docker.pkg.dev/deeplearning-platform-release/vertex-model-garden/tensorrt-llm.cu128.0-18.ubuntu2404.py312:20250514-1800-rc0"

# @markdown Choose whether to use a [Spot VM](https://cloud.google.com/compute/docs/instances/spot) for the deployment.
is_spot = False  # @param {type:"boolean"}

# @markdown Set use_dedicated_endpoint to False if you don't want to use [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint). Note that [dedicated endpoint does not support VPC Service Controls](https://cloud.google.com/vertex-ai/docs/predictions/choose-endpoint-type), uncheck the box if you are using VPC-SC.
# This rebinds the notebook-level `use_dedicated_endpoint` that the earlier
# deploy and inference cells also read.
use_dedicated_endpoint = True  # @param {type:"boolean"}

# Fixed hardware configuration for this deployment: a single a3-highgpu-8g
# node with 8 H100 80GB GPUs, in the notebook's region.
accelerator_type = "NVIDIA_H100_80GB"
accelerator_count = 8
machine_type = "a3-highgpu-8g"
multihost_gpu_node_count = 1
trtllm_region = REGION

# NOTE(review): `common_util` is not imported in any cell visible in this
# notebook — confirm which setup step provides it before running.
common_util.check_quota(
    project_id=PROJECT_ID,
    region=trtllm_region,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    is_for_training=False,
)

# 128K context length.
MAX_MODEL_LEN = 131072


def poll_operation(op_name: str, trtllm_region: str) -> bool:  # noqa: F811
    """Checks once whether a long-running Vertex AI operation has finished.

    Args:
        op_name: Operation resource name; appended to the regional API URL.
        trtllm_region: Region whose `aiplatform.googleapis.com` host is queried.

    Returns:
        The `done` field of the operation payload, or False when it is absent.

    Raises:
        ValueError: If the operation payload contains an `error` entry.
    """
    # Use the fully qualified `google.auth` package. The bare name `auth` is
    # only ever bound by the Colab authentication cell, where it refers to
    # `google.colab.auth`, a different module from `google.auth`.
    creds, _ = google.auth.default()
    auth_req = google.auth.transport.requests.Request()
    creds.refresh(auth_req)
    headers = {
        "Authorization": f"Bearer {creds.token}",
    }
    get_resp = requests.get(
        f"https://{trtllm_region}-aiplatform.googleapis.com/ui/{op_name}",
        headers=headers,
    )
    opjs = get_resp.json()
    if "error" in opjs:
        raise ValueError(f"Operation failed: {opjs['error']}")
    return opjs.get("done", False)


def poll_and_wait_trtllm(
    op_name: str, total_wait: int, trtllm_region: str, interval: int = 60
):  # noqa: F811
    """Blocks until the given operation reports completion.

    The operation is polled via `poll_operation`; between polls the function
    prints a progress line and sleeps for `interval` seconds.

    Args:
        op_name: Operation resource name to poll.
        total_wait: Wait budget in seconds.
        trtllm_region: Region passed through to `poll_operation`.
        interval: Seconds to sleep between polls.

    Raises:
        TimeoutError: If the operation is still unfinished once the accumulated
            wait exceeds `total_wait`.
    """
    elapsed = 0
    while True:
        if poll_operation(op_name, trtllm_region):
            return
        if elapsed > total_wait:
            raise TimeoutError("Operation timed out")
        # `\r` with end="" rewrites the same console line on every poll.
        progress = f"\rStill waiting for operation... Waited time in second: {elapsed:<6}"
        print(progress, end="", flush=True)
        elapsed += interval
        time.sleep(interval)


def deploy_model_tensorrt_llm_multihost(
    model_name: str,
    model_id: str,
    publisher: str,
    publisher_model_id: str,
    service_account: str = "",
    base_model_id: str = "",
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    multihost_gpu_node_count: int = 1,
    gpu_memory_utilization: float | None = None,
    max_input_len: int | None = None,
    max_model_len: int | None = None,
    max_num_seqs: int | None = None,
    enable_trust_remote_code: bool = False,
    enable_chunked_prefill: bool = False,
    use_dedicated_endpoint: bool = False,
    is_spot: bool = True,
    trtllm_region: str = REGION,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a model served by TensorRT-LLM and deploys it to a new endpoint.

    Creates the endpoint, uploads the model with the `TRTLLM_DOCKER_URI`
    serving container, sends a `deployModel` REST request and then blocks
    (up to 7200 seconds) until the resulting operation finishes.

    Args:
        model_name: Display name of the uploaded model; the endpoint is named
            `{model_name}-endpoint`.
        model_id: Value passed to the serving container as `--model`.
        publisher: Publisher segment of the Model Garden source model name.
        publisher_model_id: Model segment of the Model Garden source model name.
        service_account: Optional service account set on the deployed model.
        base_model_id: Value of the `MODEL_ID` container environment variable;
            falls back to `model_id` when empty.
        machine_type: Machine type of the dedicated resources.
        accelerator_type: Accelerator type of the dedicated resources.
        accelerator_count: Accelerators per node.
        multihost_gpu_node_count: Number of GPU nodes; multiplied with
            `accelerator_count` to obtain `--tensor-parallel-size`.
        gpu_memory_utilization: Optional `--gpu-memory-utilization` value.
        max_input_len: Optional `--max-input-len` value.
        max_model_len: Optional `--max-model-len` value.
        max_num_seqs: Optional `--max-num-seqs` value.
        enable_trust_remote_code: Adds `--trust-remote-code=True` when set.
        enable_chunked_prefill: Adds `--enable-chunked-prefill=True` when set.
        use_dedicated_endpoint: Whether the endpoint is created as dedicated.
        is_spot: Whether to request Spot capacity for the dedicated resources.
        trtllm_region: Region used for the endpoint, the model and the REST
            calls.

    Returns:
        The uploaded model and the endpoint it was deployed to.

    Raises:
        ValueError: If the deploy request is rejected, returns a body without
            an operation name, or the operation reports an error.
        TimeoutError: If the deploy operation does not finish in time.
    """
    endpoint = aiplatform.Endpoint.create(
        display_name=f"{model_name}-endpoint",
        location=trtllm_region,
        dedicated_endpoint_enabled=use_dedicated_endpoint,
    )

    if not base_model_id:
        base_model_id = model_id

    total_gpu_count = accelerator_count * multihost_gpu_node_count
    container_args = [
        "python",
        "api_server.py",
        "--host=0.0.0.0",
        "--port=7080",
        f"--model={model_id}",
        f"--tensor-parallel-size={total_gpu_count}",
    ]

    # Optional value flags are appended only when a truthy value was supplied;
    # dict order keeps the flags in the same order as before.
    optional_value_flags = {
        "--gpu-memory-utilization": gpu_memory_utilization,
        "--max-input-len": max_input_len,
        "--max-model-len": max_model_len,
        "--max-num-seqs": max_num_seqs,
    }
    container_args.extend(
        f"{flag}={value}" for flag, value in optional_value_flags.items() if value
    )

    if enable_trust_remote_code:
        container_args.append("--trust-remote-code=True")

    if enable_chunked_prefill:
        container_args.append("--enable-chunked-prefill=True")

    env_vars = {
        "MODEL_ID": base_model_id,
        "DEPLOY_SOURCE": "notebook",
    }

    # HF_TOKEN is not a compulsory field and may not be defined, so look it up
    # in the notebook namespace instead of referencing the name directly.
    hf_token = globals().get("HF_TOKEN")
    if hf_token:
        env_vars["HF_TOKEN"] = hf_token

    model = aiplatform.Model.upload(
        display_name=model_name,
        location=trtllm_region,
        serving_container_image_uri=TRTLLM_DOCKER_URI,
        serving_container_args=container_args,
        serving_container_ports=[7080],
        serving_container_predict_route="/v1/chat/completions",
        serving_container_health_route="/health",
        serving_container_environment_variables=env_vars,
        serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
        serving_container_deployment_timeout=7200,
        model_garden_source_model_name=(
            f"publishers/{publisher}/models/{publisher_model_id}"
        ),
    )
    print(
        f"Deploying {model_name} on {machine_type} with {int(total_gpu_count)} {accelerator_type} GPU(s)."
    )

    # Use the fully qualified `google.auth` package. The bare name `auth` is
    # only ever bound by the Colab authentication cell, where it refers to
    # `google.colab.auth`, a different module from `google.auth`.
    creds, _ = google.auth.default()
    auth_req = google.auth.transport.requests.Request()
    creds.refresh(auth_req)

    url = f"https://{trtllm_region}-aiplatform.googleapis.com/ui/projects/{PROJECT_ID}/locations/{trtllm_region}/endpoints/{endpoint.name}:deployModel"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {creds.token}",
    }
    data = {
        "deployedModel": {
            "model": model.resource_name,
            "displayName": model_name,
            "dedicatedResources": {
                "machineSpec": {
                    "machineType": machine_type,
                    "multihostGpuNodeCount": multihost_gpu_node_count,
                    "acceleratorType": accelerator_type,
                    "acceleratorCount": accelerator_count,
                },
                "minReplicaCount": 1,
                "maxReplicaCount": 1,
            },
            "system_labels": {
                "NOTEBOOK_NAME": "model_garden_pytorch_llama3_3_deployment.ipynb",
                # NOTE(review): `common_util` is not imported in any cell
                # visible in this notebook — confirm where it comes from.
                "NOTEBOOK_ENVIRONMENT": common_util.get_deploy_source(),
            },
        },
    }
    if service_account:
        data["deployedModel"]["serviceAccount"] = service_account
    if is_spot:
        data["deployedModel"]["dedicatedResources"]["spot"] = True
    response = requests.post(url, headers=headers, json=data)
    # Decode the body once, and surface a non-JSON error body with the same
    # message as any other failed deploy request instead of a decode error.
    try:
        deploy_operation = response.json()
    except ValueError as err:
        raise ValueError(f"Failed to deploy model: {response.text}") from err
    print(f"Deploy Model response: {deploy_operation}")
    if response.status_code != 200 or "name" not in deploy_operation:
        raise ValueError(f"Failed to deploy model: {response.text}")
    poll_and_wait_trtllm(deploy_operation["name"], 7200, trtllm_region)
    print("endpoint_name:", endpoint.name)

    return model, endpoint


# No visible cell creates the `models` / `endpoints` registries that are
# subscripted below, so create them on first use without discarding entries
# that may already exist.
if "models" not in globals():
    models = {}
if "endpoints" not in globals():
    endpoints = {}

# NOTE(review): `common_util` is not imported in any cell visible in this
# notebook — confirm which setup step provides it before running.
models["trtllm_gpu"], endpoints["trtllm_gpu"] = deploy_model_tensorrt_llm_multihost(
    model_name=common_util.get_job_name_with_datetime(prefix="llama3-3-serve"),
    model_id=model_id,
    publisher="meta-llama",
    publisher_model_id="llama-3.3-70b-instruct",
    base_model_id="meta-llama/Llama-3.3-70B-Instruct",
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    multihost_gpu_node_count=multihost_gpu_node_count,
    max_model_len=MAX_MODEL_LEN,
    enable_chunked_prefill=True,
    use_dedicated_endpoint=use_dedicated_endpoint,
    is_spot=is_spot,
    trtllm_region=trtllm_region,
)
# @markdown Click "Show Code" to see more details.

In [None]:
# @title Raw predict
# @markdown Once deployment succeeds, you can send requests to the endpoint using `:rawPredict`. The OpenAI Client chat completions support is coming soon.
# Re-create the endpoint handle from its resource name before sending requests.
endpoints["trtllm_gpu"] = aiplatform.Endpoint(endpoints["trtllm_gpu"].resource_name)

# @markdown Fill out some request parameters:
user_message = "How is your day going?"  # @param {type: "string"}
# @markdown If you encounter the issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, such as set `max_tokens` as 20.
max_tokens = 50  # @param {type: "integer"}
temperature = 1.0  # @param {type: "number"}

# @markdown Now we can send a request.

# The JSON body is a single-turn chat request (`messages`, `max_tokens`,
# `temperature`); the model was uploaded with "/v1/chat/completions" as its
# predict route.
response = endpoints["trtllm_gpu"].raw_predict(
    body=json.dumps(
        {
            "model": "",
            "messages": [
                {
                    "role": "user",
                    "content": user_message,
                }
            ],
            "max_tokens": max_tokens,
            "temperature": temperature,
        }
    ),
    headers={"Content-Type": "application/json"},
    use_dedicated_endpoint=use_dedicated_endpoint,
)
# Print only the message content of the first returned choice.
print(response.json()["choices"][0]["message"]["content"])

# @markdown Click "Show Code" to see more details.

## Clean up resources

In [None]:
# @title Delete the endpoints

# @markdown Delete every endpoint created by this notebook, then the models uploaded for the TensorRT-LLM deployment, to avoid ongoing charges.

# Names are looked up through `globals()` so clean-up still works when only
# some of the deployment sections were run.
notebook_vars = globals()

# The TensorRT-LLM section stores its endpoint in `endpoints`; the Model Garden
# SDK section stores its endpoint in `endpoint`. Both must be removed.
endpoints_to_delete = list(notebook_vars.get("endpoints", {}).values())
if notebook_vars.get("endpoint"):
    endpoints_to_delete.append(notebook_vars["endpoint"])

for deployed_endpoint in endpoints_to_delete:
    # force=True undeploys any deployed models before deleting the endpoint.
    deployed_endpoint.delete(force=True)

# Uploaded models can only be deleted once they are no longer deployed, so
# this runs after the endpoints are gone.
for uploaded_model in notebook_vars.get("models", {}).values():
    uploaded_model.delete()