In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Llama 3.2 (Deployment)

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/model_garden/model_garden_pytorch_llama3_2_deployment.ipynb">
      <img alt="Workbench logo" src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" width="32px"><br> Run in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_pytorch_llama3_2_deployment.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_llama3_2_deployment.ipynb">
      <img alt="GitHub logo" src="https://github.githubassets.com/assets/GitHub-Mark-ea2971cee799.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview

This notebook demonstrates downloading, deploying, and serving prebuilt Llama 3.2 models on GPU and TPU.

### Objective

- Deploy Llama 3.2 1B and 3B with [Hex-LLM](https://cloud.google.com/vertex-ai/generative-ai/docs/open-models/use-hex-llm) on TPU.
- Deploy Llama 3.2 1B and 3B Instruct with the Fast Deployment feature.
- Deploy Llama 3.2 11B-vision and 90B-vision with [vLLM](https://github.com/vllm-project/vllm) on GPU.

### File a bug

File a bug on [GitHub](https://github.com/GoogleCloudPlatform/vertex-ai-samples/issues/new) if you encounter any issue with the notebook.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin

In [None]:
# @title Request for quota

# @markdown By default, the quota for TPU deployment `Custom model serving TPU v5e cores per region` is 4, which is sufficient for serving the Llama 3.2 1B and 3B models. You can request additional TPU quota by following the instructions at ["Request a higher quota"](https://cloud.google.com/docs/quota/view-manage#requesting_higher_quota).

# @markdown By default, the quota for A100_80GB and H100 deployment `Custom model serving per region` is 0. You need to request quota following the instructions at ["Request a higher quota"](https://cloud.google.com/docs/quota/view-manage#requesting_higher_quota).

# @markdown For a better chance of getting resources, we recommend requesting A100_80GB quota in the regions `us-central1, us-east1`, and H100 quota in the regions `us-central1, us-west1`.

In [None]:
# @title Setup Google Cloud project

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 2. **[Optional]** Set region. If not set, the region will be set automatically according to the Colab Enterprise environment.

# Region used for the Vertex AI API calls in this notebook. When left empty,
# it is read from the GOOGLE_CLOUD_REGION environment variable later in this
# cell.
REGION = ""  # @param {type:"string"}

# @markdown 3. If you want to run predictions with A100 80GB or H100 GPUs, we recommend using the regions listed below. **NOTE:** Make sure you have associated quota in selected regions. Click the links to see your current quota for each GPU type: [Nvidia A100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_a100_80gb_gpus), [Nvidia H100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_h100_gpus). You can request quota by following the instructions at ["Request a higher quota"](https://cloud.google.com/docs/quota/view-manage#requesting_higher_quota).

# @markdown > | Machine Type | Accelerator Type | Recommended Regions |
# @markdown | ----------- | ----------- | ----------- |
# @markdown | a2-ultragpu-1g | 1 NVIDIA_A100_80GB | us-central1, us-east4, europe-west4, asia-southeast1 |
# @markdown | a3-highgpu-2g | 2 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |
# @markdown | a3-highgpu-4g | 4 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |
# @markdown | a3-highgpu-8g | 8 NVIDIA_H100_80GB | us-central1, europe-west4, us-west1, asia-southeast1 |

# Import the necessary packages

# Upgrade Vertex AI SDK.
! pip3 install --upgrade --quiet 'google-cloud-aiplatform==1.97.0'
! git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git

import importlib
import os
import re
from typing import Tuple

import requests
from google import auth
from google.cloud import aiplatform

if os.environ.get("VERTEX_PRODUCT") != "COLAB_ENTERPRISE":
    ! pip install --upgrade tensorflow
! git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git

common_util = importlib.import_module(
    "vertex-ai-samples.community-content.vertex_model_garden.model_oss.notebook_util.common_util"
)

models, endpoints = {}, {}


# Get the default cloud project id.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# Get the default region for launching jobs.
if not REGION:
    REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Initialize Vertex AI API.
print("Initializing Vertex AI API.")
aiplatform.init(project=PROJECT_ID, location=REGION)

! gcloud config set project $PROJECT_ID
import vertexai

vertexai.init(
    project=PROJECT_ID,
    location=REGION,
)

# @markdown # Access Llama 3.2 models on Vertex AI for serving
# @markdown The original models from Meta are converted into the Hugging Face format for serving in Vertex AI.
# @markdown Accept the model agreement to access the models:
# @markdown 1. Open the [Llama 3.2 model card](https://console.cloud.google.com/vertex-ai/publishers/meta/model-garden/llama3-2) from [Vertex AI Model Garden](https://cloud.google.com/model-garden).
# @markdown 2. Review and accept the agreement in the pop-up window on the model card page. If you have previously accepted the model agreement, there will not be a pop-up window on the model card page and this step is not needed.
# @markdown 3. After accepting the agreement of Llama 3.2, a `gs://` URI containing Llama 3.2 pretrained and finetuned models will be shared.
# @markdown 4. Paste the URI in the `VERTEX_AI_MODEL_GARDEN_LLAMA_3_2` field below.


VERTEX_AI_MODEL_GARDEN_LLAMA_3_2 = ""  # @param {type:"string", isTemplate:true}
# Fail fast when the field has been left empty.
assert (
    VERTEX_AI_MODEL_GARDEN_LLAMA_3_2
), "Click the agreement of Llama 3.2 in Vertex AI Model Garden, and get the GCS path of Llama 3.2 model artifacts."
# Keep only the first `gs://...` token (everything up to the first space or the
# end of the string), in case extra text was pasted together with the URI.
parsed_gcs_url = re.search("gs://.*?(?=[ ]|$)", VERTEX_AI_MODEL_GARDEN_LLAMA_3_2)
if parsed_gcs_url:
    VERTEX_AI_MODEL_GARDEN_LLAMA_3_2 = parsed_gcs_url.group()
# If no `gs://` token was found, the value is unchanged and is rejected here.
assert VERTEX_AI_MODEL_GARDEN_LLAMA_3_2.startswith(
    "gs://"
), "VERTEX_AI_MODEL_GARDEN_LLAMA_3_2 is expected to be a GCS URI and must start with `gs://`."

## Deploy prebuilt Llama 3.2 1B and 3B with Hex-LLM

**Hex-LLM** is a **H**igh-**E**fficiency **L**arge **L**anguage **M**odel (LLM) TPU serving solution built with **XLA**, which is being developed by Google Cloud.

Refer to the "Request for quota" section for TPU quota.

In [None]:
# @title Select the model variants

# @markdown Select one of the four model variations. More model variants will be supported by Hex-LLM in the future.

MODEL_ID = "Llama-3.2-1B-Instruct"  # @param ["Llama-3.2-1B", "Llama-3.2-1B-Instruct", "Llama-3.2-3B", "Llama-3.2-3B-Instruct"] {allow-input: true, isTemplate: true}
TPU_DEPLOYMENT_REGION = "us-west1"  # @param ["us-west1"] {isTemplate:true}
# Hugging Face style model name, e.g. "meta-llama/Llama-3.2-1B-Instruct".
hf_model_id = "meta-llama/" + MODEL_ID
# Lower-cased model name, used as the version suffix of the publisher model.
version_id = MODEL_ID.lower()
# Location of the selected model's artifacts under the shared `gs://` URI.
model_id = os.path.join(VERTEX_AI_MODEL_GARDEN_LLAMA_3_2, MODEL_ID)
PUBLISHER_MODEL_NAME = f"publishers/meta/models/llama3-2@{version_id}"

# The pre-built serving docker images.
HEXLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai-restricted/vertex-vision-model-garden-dockers/hex-llm-serve:20241210_2323_RC00"

# @markdown Find Vertex AI prediction TPUv5e machine types in
# @markdown https://cloud.google.com/vertex-ai/docs/predictions/use-tpu#deploy_a_model.

# Sets ct5lp-hightpu-1t (1 TPU chip) to deploy models.
machine_type = "ct5lp-hightpu-1t"
# Note: 1 TPU V5 chip has only one core.
tpu_type = "TPU_V5e"
tpu_count = 1
tpu_topo = "1x1"

# Quota is checked in the TPU deployment region, which may differ from REGION.
# NOTE(review): presumably this raises or warns when the project's serving
# quota is insufficient - confirm against `common_util.check_quota`.
common_util.check_quota(
    project_id=PROJECT_ID,
    region=TPU_DEPLOYMENT_REGION,
    accelerator_type=tpu_type,
    accelerator_count=tpu_count,
    is_for_training=False,
)

In [None]:
# @title Deploy

# @markdown This section uploads prebuilt Llama 3.2 models to Model Registry and deploys it to a Vertex AI Endpoint. It takes 15 minutes to 1 hour to finish depending on the size of the model.

# @markdown Set enable_prefix_cache_hbm to False if you don't want to use [prefix caching](https://cloud.google.com/vertex-ai/generative-ai/docs/open-models/use-hex-llm#prefix-caching).
enable_prefix_cache_hbm = True  # @param {type:"boolean"}

# Server parameters.
# Forwarded to Hex-LLM as `--tensor_parallel_size`; follows the TPU count
# selected in the previous cell.
tensor_parallel_size = tpu_count

# Fraction of HBM memory allocated for KV cache after model loading. A larger value improves throughput but gives higher risk of TPU out-of-memory errors with long prompts.
hbm_utilization_factor = 0.8  # @param
# Maximum number of running sequences in a continuous batch.
max_running_seqs = 256  # @param
# Maximum context length for a request.
max_model_len = 4096  # @param

# Endpoint configurations.
# Equal minimum and maximum replica counts keep the deployment at one replica.
min_replica_count = 1
max_replica_count = 1

# @markdown Set use_dedicated_endpoint to False if you don't want to use [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint). Note that [dedicated endpoint does not support VPC Service Controls](https://cloud.google.com/vertex-ai/docs/predictions/choose-endpoint-type), uncheck the box if you are using VPC-SC.
use_dedicated_endpoint = True  # @param {type:"boolean"}

def deploy_model_hexllm(
    model_name: str,
    model_id: str,
    publisher: str,
    publisher_model_id: str,
    base_model_id: str = None,
    data_parallel_size: int = 1,
    tensor_parallel_size: int = 1,
    machine_type: str = "ct5lp-hightpu-1t",
    tpu_topology: str = "1x1",
    disagg_topology: str = None,
    hbm_utilization_factor: float = 0.6,
    max_running_seqs: int = 256,
    max_model_len: int = 4096,
    enable_prefix_cache_hbm: bool = False,
    endpoint_id: str = "",
    min_replica_count: int = 1,
    max_replica_count: int = 1,
    use_dedicated_endpoint: bool = False,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Deploys models with Hex-LLM on TPU in Vertex AI.

    The model is uploaded to Model Registry with the Hex-LLM serving container
    and then deployed to an endpoint, both in `TPU_DEPLOYMENT_REGION`.

    Args:
        model_name: Display name of the uploaded model; also used as the prefix
            of the display name of a newly created endpoint.
        model_id: Value passed to the server as `--model`.
        publisher: Publisher part of the Model Garden source model name.
        publisher_model_id: Model part of the Model Garden source model name.
        base_model_id: Value of the `MODEL_ID` container environment variable.
            Defaults to `model_id` when empty.
        data_parallel_size: Value passed as `--data_parallel_size`.
        tensor_parallel_size: Value passed as `--tensor_parallel_size`. When
            falsy, it is derived from the second-to-last character of
            `machine_type` (e.g. "ct5lp-hightpu-4t" gives 4).
        machine_type: Machine type used for the deployment.
        tpu_topology: TPU topology such as "1x1". The part before the first "x"
            is passed as `--num_hosts`, and the topology is forwarded to the
            deployment only when that number is greater than 1.
        disagg_topology: When set, passed as `--disagg_topo`; prefix caching is
            then not enabled.
        hbm_utilization_factor: Value passed as `--hbm_utilization_factor`.
        max_running_seqs: Value passed as `--max_running_seqs`.
        max_model_len: Value passed as `--max_model_len`.
        enable_prefix_cache_hbm: Adds `--enable_prefix_cache_hbm` unless
            `disagg_topology` is set.
        endpoint_id: ID of an existing endpoint in `TPU_DEPLOYMENT_REGION` to
            deploy to. A new endpoint is created when empty.
        min_replica_count: Minimum number of replicas of the deployment.
        max_replica_count: Maximum number of replicas of the deployment.
        use_dedicated_endpoint: Whether a newly created endpoint is a dedicated
            endpoint.

    Returns:
        The uploaded model and the endpoint it was deployed to.
    """
    if endpoint_id:
        # The model is uploaded to TPU_DEPLOYMENT_REGION, so an existing
        # endpoint has to be looked up in that same region rather than in the
        # notebook-wide REGION.
        aip_endpoint_name = (
            f"projects/{PROJECT_ID}/locations/{TPU_DEPLOYMENT_REGION}"
            f"/endpoints/{endpoint_id}"
        )
        endpoint = aiplatform.Endpoint(aip_endpoint_name)
    else:
        endpoint = aiplatform.Endpoint.create(
            display_name=f"{model_name}-endpoint",
            location=TPU_DEPLOYMENT_REGION,
            dedicated_endpoint_enabled=use_dedicated_endpoint,
        )

    if not base_model_id:
        base_model_id = model_id

    if not tensor_parallel_size:
        # Machine types end in "<chips>t", e.g. "ct5lp-hightpu-4t".
        tensor_parallel_size = int(machine_type[-2])

    num_hosts = int(tpu_topology.split("x")[0])

    # Learn more about the supported arguments and environment variables at https://cloud.google.com/vertex-ai/generative-ai/docs/open-models/use-hex-llm#config-server.
    hexllm_args = [
        "--host=0.0.0.0",
        "--port=7080",
        f"--model={model_id}",
        f"--data_parallel_size={data_parallel_size}",
        f"--tensor_parallel_size={tensor_parallel_size}",
        f"--num_hosts={num_hosts}",
        f"--hbm_utilization_factor={hbm_utilization_factor}",
        f"--max_running_seqs={max_running_seqs}",
        f"--max_model_len={max_model_len}",
    ]
    if disagg_topology:
        hexllm_args.append(f"--disagg_topo={disagg_topology}")
    if enable_prefix_cache_hbm and not disagg_topology:
        hexllm_args.append("--enable_prefix_cache_hbm")

    env_vars = {
        "MODEL_ID": base_model_id,
        "HEX_LLM_LOG_LEVEL": "info",
        "DEPLOY_SOURCE": "notebook",
    }

    # HF_TOKEN is not a compulsory field and may not be defined. Only the
    # "name is not defined" case is tolerated, so unrelated errors still
    # surface instead of being silently swallowed.
    try:
        hf_token = HF_TOKEN
    except NameError:
        hf_token = None
    if hf_token:
        env_vars["HF_TOKEN"] = hf_token

    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=HEXLLM_DOCKER_URI,
        serving_container_command=["python", "-m", "hex_llm.server.api_server"],
        serving_container_args=hexllm_args,
        serving_container_ports=[7080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables=env_vars,
        serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
        serving_container_deployment_timeout=7200,
        location=TPU_DEPLOYMENT_REGION,
        model_garden_source_model_name=(
            f"publishers/{publisher}/models/{publisher_model_id}"
        ),
    )

    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        # The topology is only forwarded for multi-host deployments.
        tpu_topology=tpu_topology if num_hosts > 1 else None,
        deploy_request_timeout=1800,
        min_replica_count=min_replica_count,
        max_replica_count=max_replica_count,
        system_labels={
            "NOTEBOOK_NAME": "model_garden_pytorch_llama3_2_deployment.ipynb",
            "NOTEBOOK_ENVIRONMENT": common_util.get_deploy_source(),
        },
    )
    return model, endpoint


# Deploy the selected model with the parameters chosen above and register the
# resulting model and endpoint under the "hexllm_tpu" label, which the
# prediction cells below use to look them up. No `endpoint_id` is passed, so a
# new endpoint is created.
models["hexllm_tpu"], endpoints["hexllm_tpu"] = deploy_model_hexllm(
    model_name=common_util.get_job_name_with_datetime(prefix="llama3_2-serve-hexllm"),
    model_id=model_id,
    publisher="meta",
    publisher_model_id="llama3-2",
    base_model_id=hf_model_id,
    tensor_parallel_size=tensor_parallel_size,
    machine_type=machine_type,
    tpu_topology=tpu_topo,
    hbm_utilization_factor=hbm_utilization_factor,
    max_running_seqs=max_running_seqs,
    max_model_len=max_model_len,
    enable_prefix_cache_hbm=enable_prefix_cache_hbm,
    min_replica_count=min_replica_count,
    max_replica_count=max_replica_count,
    use_dedicated_endpoint=use_dedicated_endpoint,
)

In [None]:
# @title Raw predict

# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts based on your `template`. Note that the first few prompts will take longer to execute.

# @markdown Example:

# @markdown ```
# @markdown > What is a car?
# @markdown > A car is a four-wheeled vehicle designed for the transportation of passengers and their belongings.
# @markdown ```

# @markdown Additionally, you can moderate the generated text with Vertex AI. See [Moderate text documentation](https://cloud.google.com/natural-language/docs/moderating-text) for more details.

# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoints["hexllm_tpu"].name` allows us to get the
#   endpoint name of the endpoint created in the cell above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# The endpoint is looked up in TPU_DEPLOYMENT_REGION and stored under the
# "hexllm_tpu" label, because that is the entry the predict call below reads.
# You may uncomment the code below to load an existing endpoint:
# endpoint_name = endpoints["hexllm_tpu"].name
# # endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{TPU_DEPLOYMENT_REGION}/endpoints/{endpoint_name}"
# )
# endpoints["hexllm_tpu"] = aiplatform.Endpoint(aip_endpoint_name)

prompt = "What is a car?"  # @param {type: "string"}
# @markdown If you encounter the issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, such as set `max_tokens` as 20.
max_tokens = 50  # @param {type: "integer"}
temperature = 1.0  # @param {type: "number"}
top_p = 1.0  # @param {type: "number"}
top_k = 1  # @param {type: "integer"}

# Overrides parameters for inferences.
# A single instance is sent; the sampling parameters travel with the prompt.
instances = [
    {
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
    },
]
# `use_dedicated_endpoint` must match how the endpoint was created in the
# Deploy cell.
response = endpoints["hexllm_tpu"].predict(
    instances=instances, use_dedicated_endpoint=use_dedicated_endpoint
)

for prediction in response.predictions:
    print(prediction)

In [None]:
# @title Chat completion

temp_region = REGION
REGION = TPU_DEPLOYMENT_REGION

if use_dedicated_endpoint:
    DEDICATED_ENDPOINT_DNS = endpoints["hexllm_tpu"].gca_resource.dedicated_endpoint_dns
ENDPOINT_RESOURCE_NAME = "projects/{}/locations/{}/endpoints/{}".format(
    PROJECT_ID, REGION, endpoints["hexllm_tpu"].name
)

# @title Chat Completions Inference

# @markdown Once deployment succeeds, you can send requests to the endpoint using the OpenAI SDK.

# @markdown First you will need to install the SDK and some auth-related dependencies.

! pip install -qU openai google-auth requests

# @markdown Next fill out some request parameters:

user_message = "How is your day going?"  # @param {type: "string"}
# @markdown If you encounter the issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, such as set `max_tokens` as 20.
max_tokens = 50  # @param {type: "integer"}
temperature = 1.0  # @param {type: "number"}
stream = False  # @param {type: "boolean"}

# @markdown Now we can send a request.

import google.auth
import openai

creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

BASE_URL = (
    f"https://{REGION}-aiplatform.googleapis.com/v1beta1/{ENDPOINT_RESOURCE_NAME}"
)
try:
    if use_dedicated_endpoint:
        BASE_URL = f"https://{DEDICATED_ENDPOINT_DNS}/v1beta1/{ENDPOINT_RESOURCE_NAME}"
except NameError:
    pass

client = openai.OpenAI(base_url=BASE_URL, api_key=creds.token)

model_response = client.chat.completions.create(
    model="",
    messages=[{"role": "user", "content": user_message}],
    temperature=temperature,
    max_tokens=max_tokens,
    stream=stream,
)

if stream:
    usage = None
    contents = []
    for chunk in model_response:
        if chunk.usage is not None:
            usage = chunk.usage
            continue
        print(chunk.choices[0].delta.content, end="")
        contents.append(chunk.choices[0].delta.content)
    print(f"\n\n{usage}")
else:
    print(model_response)

REGION = temp_region

# @markdown Click "Show Code" to see more details.

## Deploy prebuilt Llama 3.2 1B / 3B Instruct with standard vLLM and Fast Deployment

In [None]:
# @title Fast Deployment

# @markdown The Llama 3.2 1B/3B Instruct model will be deployed to a dedicated endpoint on an `a2-ultragpu-1g` machine with Fast Deployment.
# @markdown **Currently, the Fast Deployment is only supported in the `us-central1` region.**

# @markdown This section demonstrates how to use the Fast Deployment feature.

# @markdown The Fast Deployment feature prioritizes speed for model exploration, making it ideal for initial testing and experimentation. For sensitive data or production workloads, use the Standard environment for enhanced security and stability.

# @markdown Note that only a subset of the models support the Fast Deployment feature.

# Region in which the Fast Deployment model and endpoint are created; it is
# independent of the notebook-wide REGION.
FAST_DEPLOYMENT_REGION = "us-central1"  # @param ["us-central1"] {isTemplate:true}

# Regional Vertex AI host queried for the publisher model's deploy
# configurations.
API_ENDPOINT = f"{FAST_DEPLOYMENT_REGION}-aiplatform.googleapis.com"

def fast_deploy(
    publisher: str, publisher_model_id: str, version_id: str
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    url = f"https://{API_ENDPOINT}/v1/publishers/{publisher}/models/{publisher_model_id}@{version_id}"
    access_token = ! gcloud auth print-access-token
    access_token = access_token[0]

    response = requests.get(
        url,
        headers={
            "Authorization": f"Bearer {access_token}",
            "Content-Type": "application/json",
        },
    )
    response.raise_for_status()
    response = response.json()
    if (
        len(
            response.get("supportedActions", {})
            .get("multiDeployVertex", {})
            .get("multiDeployVertex", {})
        )
        == 0
    ):
        raise ValueError(
            f"No supportedActions.multiDeployVertex found in {FAST_DEPLOYMENT_REGION}. You can skip"
            " this section or try a different region."
        )
    deploy_configs = response["supportedActions"]["multiDeployVertex"][
        "multiDeployVertex"
    ]
    fast_deploy_config = [
        config
        for config in deploy_configs
        if config["deployMetadata"]
        .get("labels", {})
        .get("show-faster-deployment-option")
        == "true"
    ]
    if fast_deploy_config:
        fast_deploy_config = fast_deploy_config[0]
    else:
        raise ValueError(
            f"No Fast Deployment config found in {FAST_DEPLOYMENT_REGION}. You can skip this"
            " section or try a different region."
        )

    container_spec = fast_deploy_config["containerSpec"]
    machine_spec = fast_deploy_config["dedicatedResources"]["machineSpec"]
    machine_type = machine_spec["machineType"]
    accelerator_type = machine_spec["acceleratorType"]
    accelerator_count = machine_spec["acceleratorCount"]
    env = {item["name"]: item["value"] for item in container_spec.get("env", [])}
    if "DEPLOY_SOURCE" in env:
        del env["DEPLOY_SOURCE"]
    port = container_spec.get("ports", [{}])[0].get("containerPort")

    model = aiplatform.Model.upload(
        display_name=fast_deploy_config.get("modelDisplayName"),
        serving_container_image_uri=container_spec.get("imageUri"),
        serving_container_args=container_spec.get("args"),
        serving_container_environment_variables=env,
        serving_container_ports=[port],
        serving_container_predict_route=container_spec.get("predictRoute"),
        serving_container_health_route=container_spec.get("healthRoute"),
        serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
        serving_container_deployment_timeout=7200,
        location=FAST_DEPLOYMENT_REGION,
        model_garden_source_model_name=(
            f"publishers/{publisher}/models/{publisher_model_id}"
        ),
    )
    endpoint = aiplatform.Endpoint.create(
        display_name=model.name + "-endpoint",
        location=FAST_DEPLOYMENT_REGION,
        dedicated_endpoint_enabled=True,
    )
    print(
        f"Deploying {model.name} on {machine_type} with"
        f" {accelerator_count} {accelerator_type} GPU(s)."
    )
    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        disable_container_logging=True,
        fast_tryout_enabled=True,
        system_labels={
            "DEPLOY_SOURCE": "notebook",
            "NOTEBOOK_NAME": "model_garden_pytorch_llama3_2_deployment.ipynb",
        },
    )
    print("endpoint_name:", endpoint.name)
    return model, endpoint


# Fast Deployment only supports dedicated endpoints.
# This overrides the notebook-wide flag, which the prediction cells below read.
use_dedicated_endpoint = True

base_model_name = "Llama-3.2-1B-Instruct"  # @param ["Llama-3.2-1B-Instruct", "Llama-3.2-3B-Instruct"] {isTemplate:true}
# The lower-cased model name is passed as the publisher model version. The
# resulting model and endpoint are registered under the "vllm_fast" label.
models["vllm_fast"], endpoints["vllm_fast"] = fast_deploy(
    "meta", "llama3-2", base_model_name.lower()
)

In [None]:
# @title Raw predict

# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts. Sampling parameters supported by vLLM can be found [here](https://docs.vllm.ai/en/latest/dev/sampling_params.html).

# @markdown Example:

# @markdown ```
# @markdown Human: What is a car?
# @markdown Assistant:  A car, or a motor car, is a road-connected human-transportation system used to move people or goods from one place to another. The term also encompasses a wide range of vehicles, including motorboats, trains, and aircrafts. Cars typically have four wheels, a cabin for passengers, and an engine or motor. They have been around since the early 19th century and are now one of the most popular forms of transportation, used for daily commuting, shopping, and other purposes.
# @markdown ```
# @markdown Additionally, you can moderate the generated text with Vertex AI. See [Moderate text documentation](https://cloud.google.com/natural-language/docs/moderating-text) for more details.

# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoints["vllm_fast"].name` allows us to get the
#   endpoint name of the endpoint created in the cell above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# The endpoint is looked up in FAST_DEPLOYMENT_REGION and stored under the
# "vllm_fast" label, because that is the entry the predict call below reads.
# You may uncomment the code below to load an existing endpoint.

# endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{FAST_DEPLOYMENT_REGION}/endpoints/{endpoint_name}"
# )
# endpoints["vllm_fast"] = aiplatform.Endpoint(aip_endpoint_name)

prompt = "What is a car?"  # @param {type: "string"}
# @markdown If you encounter an issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, by lowering `max_tokens`.
max_tokens = 50  # @param {type:"integer"}
temperature = 1.0  # @param {type:"number"}
top_p = 1.0  # @param {type:"number"}
top_k = 1  # @param {type:"integer"}
# @markdown Set `raw_response` to `True` to obtain the raw model output. Set `raw_response` to `False` to apply additional formatting in the structure of `"Prompt:\n{prompt.strip()}\nOutput:\n{output}"`.
raw_response = False  # @param {type:"boolean"}

# Overrides parameters for inferences.
# A single instance is sent; the sampling parameters travel with the prompt.
instances = [
    {
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "raw_response": raw_response,
    },
]
# The Fast Deployment cell set `use_dedicated_endpoint` to True, matching the
# dedicated endpoint it created.
response = endpoints["vllm_fast"].predict(
    instances=instances, use_dedicated_endpoint=use_dedicated_endpoint
)

for prediction in response.predictions:
    print(prediction)

# @markdown Click "Show Code" to see more details.

In [None]:
# @title Chat completion

temp_region = REGION
REGION = FAST_DEPLOYMENT_REGION

if use_dedicated_endpoint:
    DEDICATED_ENDPOINT_DNS = endpoints["vllm_fast"].gca_resource.dedicated_endpoint_dns
ENDPOINT_RESOURCE_NAME = endpoints["vllm_fast"].resource_name

# @title Chat Completions Inference

# @markdown Once deployment succeeds, you can send requests to the endpoint using the OpenAI SDK.

# @markdown First you will need to install the SDK and some auth-related dependencies.

! pip install -qU openai google-auth requests

# @markdown Next fill out some request parameters:

user_message = "How is your day going?"  # @param {type: "string"}
# @markdown If you encounter the issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, such as set `max_tokens` as 20.
max_tokens = 50  # @param {type: "integer"}
temperature = 1.0  # @param {type: "number"}
stream = False  # @param {type: "boolean"}

# @markdown Now we can send a request.

import google.auth
import openai

creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

BASE_URL = (
    f"https://{REGION}-aiplatform.googleapis.com/v1beta1/{ENDPOINT_RESOURCE_NAME}"
)
try:
    if use_dedicated_endpoint:
        BASE_URL = f"https://{DEDICATED_ENDPOINT_DNS}/v1beta1/{ENDPOINT_RESOURCE_NAME}"
except NameError:
    pass

client = openai.OpenAI(base_url=BASE_URL, api_key=creds.token)

model_response = client.chat.completions.create(
    model="",
    messages=[{"role": "user", "content": user_message}],
    temperature=temperature,
    max_tokens=max_tokens,
    stream=stream,
)

if stream:
    usage = None
    contents = []
    for chunk in model_response:
        if chunk.usage is not None:
            usage = chunk.usage
            continue
        print(chunk.choices[0].delta.content, end="")
        contents.append(chunk.choices[0].delta.content)
    print(f"\n\n{usage}")
else:
    print(model_response)

REGION = temp_region

# @markdown Click "Show Code" to see more details.

## Deploy prebuilt Llama 3.2 11B-Vision and 90B-Vision with vLLM

In [None]:
# @title Select the model variants
# Key under which the vLLM GPU endpoint is stored in `endpoints`.
LABEL = "vllm_gpu"
base_model_name = "Llama-3.2-11B-Vision-Instruct"  # @param ["Llama-3.2-11B-Vision", "Llama-3.2-11B-Vision-Instruct", "Llama-3.2-90B-Vision", "Llama-3.2-90B-Vision-Instruct"] {isTemplate:true}
# Hugging Face style model name, e.g. "meta-llama/Llama-3.2-11B-Vision".
hf_model_id = "meta-llama/" + base_model_name
# Location of the selected model's artifacts under the shared `gs://` URI.
model_id = os.path.join(VERTEX_AI_MODEL_GARDEN_LLAMA_3_2, base_model_name)
# Lower-cased model name with "3.2" rewritten as "3-2", used as the version
# suffix of the publisher model name.
version_id = base_model_name.replace("3.2", "3-2").lower()
PUBLISHER_MODEL_NAME = f"publishers/meta/models/llama3-2@{version_id}"

# The pre-built serving docker images.
VLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20241202_0916_RC00_maas"

# @markdown Set use_dedicated_endpoint to False if you don't want to use [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint). Note that [dedicated endpoint does not support VPC Service Controls](https://cloud.google.com/vertex-ai/docs/predictions/choose-endpoint-type), uncheck the box if you are using VPC-SC.
use_dedicated_endpoint = True  # @param {type:"boolean"}

# @markdown Find Vertex AI prediction supported accelerators and regions at https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.
# Pick the accelerator, machine type and accelerator count from the model size
# embedded in the model name.
# NOTE(review): the 1B/3B branch cannot be reached from the dropdown options
# above, which list only the vision models - presumably kept for manual edits.
if "3.2-1B" in base_model_name or "3.2-3B" in base_model_name:
    accelerator_type = "NVIDIA_L4"
    machine_type = "g2-standard-8"
    accelerator_count = 1
elif "3.2-11B" in base_model_name:
    accelerator_type = "NVIDIA_TESLA_A100"
    machine_type = "a2-highgpu-1g"
    accelerator_count = 1
elif "3.2-90B" in base_model_name:
    accelerator_type = "NVIDIA_H100_80GB"
    machine_type = "a3-highgpu-8g"
    accelerator_count = 8
else:
    raise ValueError(f"Recommended GPU setting not found for: {base_model_name}.")

# Unlike the TPU and Fast Deployment sections, this section checks quota in the
# notebook-wide REGION.
common_util.check_quota(
    project_id=PROJECT_ID,
    region=REGION,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    is_for_training=False,
)

# @markdown Click "Show Code" to see more details.

In [None]:
# @title [Option 1] Deploy with Model Garden SDK

# @markdown Deploy with Gen AI model-centric SDK. This section uploads the prebuilt model to Model Registry and deploys it to a Vertex AI Endpoint. It takes 15 minutes to 1 hour to finish depending on the size of the model. See [use open models with Vertex AI](https://cloud.google.com/vertex-ai/generative-ai/docs/open-models/use-open-models) for documentation on other use cases.
from vertexai import model_garden

# Fast tryout is requested only when the notebook region is us-central1.
fast_tryout_enabled = REGION == "us-central1"

model = model_garden.OpenModel(PUBLISHER_MODEL_NAME)
endpoint = model.deploy(
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    fast_tryout_enabled=fast_tryout_enabled,
    use_dedicated_endpoint=use_dedicated_endpoint,
    # Accept the End User License Agreement (EULA) on the model card before
    # deploy. Otherwise, the deployment will be forbidden.
    accept_eula=True,
)

# Register the endpoint under the section label for the cells that follow.
endpoints[LABEL] = endpoint

In [None]:
# @title [Option 2] Deploy with customized configs
# @markdown This section uploads prebuilt Llama 3.2 models to Model Registry and deploys it to a Vertex AI Endpoint. It takes 15 minutes to 1 hour to finish depending on the size of the model.

# @markdown Currently vLLM can be used with limited inputs for multi-modality models, which are "text only" format and "single leading image + text" format. More input formats will be supported later.

# vLLM engine settings; this cell forwards them to deploy_model_vllm below.
# Passed to the vLLM server as --gpu-memory-utilization.
gpu_memory_utilization = 0.9
# Passed to the vLLM server as --max-model-len.
max_model_len = 4096
# Passed to the vLLM server as --max-num-seqs.
max_num_seqs = 12

# @markdown Choose whether to use a [Spot VM](https://cloud.google.com/compute/docs/instances/spot) for the deployment.
is_spot = False  # @param {type:"boolean"}

# @markdown To enable the auto-scaling in deployment, you can set the following options:

# Sent in the deployModel request as minReplicaCount, maxReplicaCount and
# requiredReplicaCount respectively.
min_replica_count = 1  # @param {type:"integer"}
max_replica_count = 1  # @param {type:"integer"}
required_replica_count = 1  # @param {type:"integer"}

# @markdown Set the target of GPU duty cycle or CPU usage between 1 and 100 for auto-scaling.
# A target of 0 leaves that metric out of the request: deploy_model_vllm only
# adds an autoscaling metric spec for targets greater than 0.
autoscale_by_gpu_duty_cycle_target = 0  # @param {type:"integer"}
autoscale_by_cpu_usage_target = 0  # @param {type:"integer"}

# @markdown Note: GPU duty cycle is not the most accurate metric for scaling workloads. More advanced auto-scaling metrics are coming soon. See [the public doc](https://cloud.google.com/vertex-ai/docs/predictions/configure-compute#autoscaling) for more details.


def deploy_model_vllm(
    model_name: str,
    model_id: str,
    publisher: str,
    publisher_model_id: str,
    base_model_id: str = None,
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    gpu_memory_utilization: float = 0.9,
    max_model_len: int = 4096,
    dtype: str = "auto",
    enable_trust_remote_code: bool = False,
    enforce_eager: bool = False,
    enable_lora: bool = False,
    enable_chunked_prefill: bool = False,
    enable_prefix_cache: bool = False,
    host_prefix_kv_cache_utilization_target: float = 0.0,
    max_loras: int = 1,
    max_cpu_loras: int = 8,
    use_dedicated_endpoint: bool = False,
    max_num_seqs: int = 256,
    model_type: str = None,
    enable_llama_tool_parser: bool = False,
    min_replica_count: int = 1,
    max_replica_count: int = 1,
    required_replica_count: int = 1,
    autoscale_by_gpu_duty_cycle_target: int = 0,
    autoscale_by_cpu_usage_target: int = 0,
    is_spot: bool = False,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a vLLM-served model and deploys it to a new Vertex AI endpoint.

    The function creates an endpoint, uploads a model whose serving container
    runs ``vllm.entrypoints.api_server`` from ``VLLM_DOCKER_URI``, posts a
    ``deployModel`` REST request and waits on the returned operation with
    ``common_util.poll_and_wait``. It reads the notebook globals ``REGION``,
    ``PROJECT_ID``, ``VLLM_DOCKER_URI`` and, when it is defined and non-empty,
    ``HF_TOKEN``.

    Args:
        model_name: Display name of the uploaded and deployed model. The
            endpoint is named ``{model_name}-endpoint``.
        model_id: Model identifier passed to vLLM as ``--model``.
        publisher: Publisher segment of the Model Garden source model name.
        publisher_model_id: Model segment of the Model Garden source model
            name.
        base_model_id: Value of the ``MODEL_ID`` container environment
            variable. Falls back to ``model_id`` when empty.
        machine_type: Machine type placed in the request's ``machineSpec``.
        accelerator_type: Accelerator type placed in the ``machineSpec``.
        accelerator_count: Accelerator count placed in the ``machineSpec``;
            also used as ``--tensor-parallel-size``.
        gpu_memory_utilization: Passed as ``--gpu-memory-utilization``.
        max_model_len: Passed as ``--max-model-len``.
        dtype: Passed as ``--dtype``.
        enable_trust_remote_code: Adds ``--trust-remote-code``.
        enforce_eager: Adds ``--enforce-eager``.
        enable_lora: Adds ``--enable-lora``.
        enable_chunked_prefill: Adds ``--enable-chunked-prefill``.
        enable_prefix_cache: Adds ``--enable-prefix-caching``.
        host_prefix_kv_cache_utilization_target: Passed as
            ``--host-prefix-kv-cache-utilization-target`` only when strictly
            between 0 and 1.
        max_loras: Passed as ``--max-loras``.
        max_cpu_loras: Passed as ``--max-cpu-loras``.
        use_dedicated_endpoint: Passed as ``dedicated_endpoint_enabled`` when
            creating the endpoint.
        max_num_seqs: Passed as ``--max-num-seqs``.
        model_type: Passed as ``--model-type`` when set.
        enable_llama_tool_parser: Adds ``--enable-auto-tool-choice`` and a
            ``--tool-call-parser``: ``llama3_json`` when ``model_id`` contains
            ``"Llama-4"``, otherwise ``vertex-llama-3``.
        min_replica_count: ``minReplicaCount`` of the deployment.
        max_replica_count: ``maxReplicaCount`` of the deployment.
        required_replica_count: ``requiredReplicaCount`` of the deployment.
        autoscale_by_gpu_duty_cycle_target: Target for the accelerator
            duty-cycle autoscaling metric; values <= 0 omit the metric.
        autoscale_by_cpu_usage_target: Target for the CPU utilization
            autoscaling metric; values <= 0 omit the metric.
        is_spot: When true, sets ``spot`` on the dedicated resources.

    Returns:
        A ``(model, endpoint)`` tuple with the uploaded model and the endpoint
        it was deployed to.

    Raises:
        ValueError: If the ``deployModel`` request does not return HTTP 200
            with a JSON object containing an operation ``name``.
    """
    endpoint = aiplatform.Endpoint.create(
        display_name=f"{model_name}-endpoint",
        dedicated_endpoint_enabled=use_dedicated_endpoint,
    )

    if not base_model_id:
        base_model_id = model_id

    # See https://docs.vllm.ai/en/latest/models/engine_args.html for a list of possible arguments with descriptions.
    vllm_args = [
        "python",
        "-m",
        "vllm.entrypoints.api_server",
        "--host=0.0.0.0",
        "--port=8080",
        f"--model={model_id}",
        f"--tensor-parallel-size={accelerator_count}",
        "--swap-space=16",
        f"--gpu-memory-utilization={gpu_memory_utilization}",
        f"--max-model-len={max_model_len}",
        f"--dtype={dtype}",
        f"--max-loras={max_loras}",
        f"--max-cpu-loras={max_cpu_loras}",
        f"--max-num-seqs={max_num_seqs}",
        "--disable-log-stats",
    ]

    if enable_trust_remote_code:
        vllm_args.append("--trust-remote-code")

    if enforce_eager:
        vllm_args.append("--enforce-eager")

    if enable_lora:
        vllm_args.append("--enable-lora")

    if enable_chunked_prefill:
        vllm_args.append("--enable-chunked-prefill")

    if enable_prefix_cache:
        vllm_args.append("--enable-prefix-caching")

    if 0 < host_prefix_kv_cache_utilization_target < 1:
        vllm_args.append(
            f"--host-prefix-kv-cache-utilization-target={host_prefix_kv_cache_utilization_target}"
        )

    if model_type:
        vllm_args.append(f"--model-type={model_type}")

    if enable_llama_tool_parser:
        # Both model families need auto tool choice; only the parser differs.
        tool_call_parser = "llama3_json" if "Llama-4" in model_id else "vertex-llama-3"
        vllm_args.append("--enable-auto-tool-choice")
        vllm_args.append(f"--tool-call-parser={tool_call_parser}")

    env_vars = {
        "MODEL_ID": base_model_id,
        "DEPLOY_SOURCE": "notebook",
    }

    # HF_TOKEN is not a compulsory field and may not be defined.
    try:
        if HF_TOKEN:
            env_vars["HF_TOKEN"] = HF_TOKEN
    except NameError:
        pass

    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=VLLM_DOCKER_URI,
        serving_container_args=vllm_args,
        serving_container_ports=[8080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables=env_vars,
        serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
        serving_container_deployment_timeout=7200,
        model_garden_source_model_name=(
            f"publishers/{publisher}/models/{publisher_model_id}"
        ),
    )
    print(
        f"Deploying {model_name} on {machine_type} with {accelerator_count} {accelerator_type} GPU(s)."
    )

    # Refresh the default credentials to get a bearer token for the REST call.
    creds, _ = auth.default()
    auth_req = auth.transport.requests.Request()
    creds.refresh(auth_req)

    url = f"https://{REGION}-aiplatform.googleapis.com/ui/projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint.name}:deployModel"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {creds.token}",
    }
    dedicated_resources = {
        "machineSpec": {
            "machineType": machine_type,
            "acceleratorType": accelerator_type,
            "acceleratorCount": accelerator_count,
        },
        "minReplicaCount": min_replica_count,
        "requiredReplicaCount": required_replica_count,
        "maxReplicaCount": max_replica_count,
    }
    if is_spot:
        dedicated_resources["spot"] = True

    # Only metrics with a positive target are sent; the key is omitted
    # entirely when neither metric is requested.
    autoscaling_metric_specs = []
    if autoscale_by_gpu_duty_cycle_target > 0:
        autoscaling_metric_specs.append(
            {
                "metricName": "aiplatform.googleapis.com/prediction/online/accelerator/duty_cycle",
                "target": autoscale_by_gpu_duty_cycle_target,
            }
        )
    if autoscale_by_cpu_usage_target > 0:
        autoscaling_metric_specs.append(
            {
                "metricName": "aiplatform.googleapis.com/prediction/online/cpu/utilization",
                "target": autoscale_by_cpu_usage_target,
            }
        )
    if autoscaling_metric_specs:
        dedicated_resources["autoscalingMetricSpecs"] = autoscaling_metric_specs

    data = {
        "deployedModel": {
            "model": model.resource_name,
            "displayName": model_name,
            "dedicatedResources": dedicated_resources,
            "system_labels": {
                "NOTEBOOK_NAME": "model_garden_pytorch_llama3_2_deployment.ipynb",
                "NOTEBOOK_ENVIRONMENT": common_util.get_deploy_source(),
            },
        },
    }

    response = requests.post(url, headers=headers, json=data)
    # Parse the body once. An error response may not be JSON at all (e.g. an
    # HTML error page); in that case report the raw text instead of letting a
    # JSON decode error hide the actual failure.
    try:
        response_json = response.json()
    except ValueError:
        response_json = None
    print(
        f"Deploy Model response: {response_json if response_json is not None else response.text}"
    )
    if (
        response.status_code != 200
        or not isinstance(response_json, dict)
        or "name" not in response_json
    ):
        raise ValueError(f"Failed to deploy model: {response.text}")
    # Wait for the deployment operation named in the response to finish.
    common_util.poll_and_wait(response_json["name"], REGION, 7200)
    print("endpoint_name:", endpoint.name)

    return model, endpoint


# Display name for this deployment, built by common_util from the given prefix.
serving_job_name = common_util.get_job_name_with_datetime(prefix="llama3_2-serve-vllm")

# Options gathered from the form fields and the model-selection cell.
vllm_deployment_options = {
    "model_id": model_id,
    "publisher": "meta",
    "publisher_model_id": "llama3-2",
    "base_model_id": hf_model_id,
    "machine_type": machine_type,
    "accelerator_type": accelerator_type,
    "accelerator_count": accelerator_count,
    "gpu_memory_utilization": gpu_memory_utilization,
    "max_model_len": max_model_len,
    "enforce_eager": True,
    "use_dedicated_endpoint": use_dedicated_endpoint,
    "max_num_seqs": max_num_seqs,
    "enable_llama_tool_parser": True,
    "min_replica_count": min_replica_count,
    "max_replica_count": max_replica_count,
    "required_replica_count": required_replica_count,
    "autoscale_by_gpu_duty_cycle_target": autoscale_by_gpu_duty_cycle_target,
    "autoscale_by_cpu_usage_target": autoscale_by_cpu_usage_target,
    "is_spot": is_spot,
}

# Register the uploaded model and its endpoint under LABEL for later cells.
models[LABEL], endpoints[LABEL] = deploy_model_vllm(
    model_name=serving_job_name, **vllm_deployment_options
)
# @markdown Click "Show Code" to see more details.

In [None]:
# @title Chat completion with text-only requests

if use_dedicated_endpoint:
    DEDICATED_ENDPOINT_DNS = endpoints[LABEL].gca_resource.dedicated_endpoint_dns
ENDPOINT_RESOURCE_NAME = endpoints[LABEL].resource_name

# @title Chat Completions Inference

# @markdown Once deployment succeeds, you can send requests to the endpoint using the OpenAI SDK.

# @markdown First you will need to install the SDK and some auth-related dependencies.

! pip install -qU openai google-auth requests

# @markdown Next fill out some request parameters:

user_message = "How is your day going?"  # @param {type: "string"}
# @markdown If you encounter the issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, such as set `max_tokens` as 20.
max_tokens = 50  # @param {type: "integer"}
temperature = 1.0  # @param {type: "number"}
stream = False  # @param {type: "boolean"}

# @markdown Now we can send a request.

import google.auth
import openai

creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

BASE_URL = (
    f"https://{REGION}-aiplatform.googleapis.com/v1beta1/{ENDPOINT_RESOURCE_NAME}"
)
try:
    if use_dedicated_endpoint:
        BASE_URL = f"https://{DEDICATED_ENDPOINT_DNS}/v1beta1/{ENDPOINT_RESOURCE_NAME}"
except NameError:
    pass

client = openai.OpenAI(base_url=BASE_URL, api_key=creds.token)

model_response = client.chat.completions.create(
    model="",
    messages=[{"role": "user", "content": user_message}],
    temperature=temperature,
    max_tokens=max_tokens,
    stream=stream,
)

if stream:
    usage = None
    contents = []
    for chunk in model_response:
        if chunk.usage is not None:
            usage = chunk.usage
            continue
        print(chunk.choices[0].delta.content, end="")
        contents.append(chunk.choices[0].delta.content)
    print(f"\n\n{usage}")
else:
    print(model_response)

# @markdown Click "Show Code" to see more details.

In [None]:
# @title Chat completion for vision models

if use_dedicated_endpoint:
    DEDICATED_ENDPOINT_DNS = endpoints[LABEL].gca_resource.dedicated_endpoint_dns
ENDPOINT_RESOURCE_NAME = endpoints[LABEL].resource_name

# @title Chat Completions Inference

# @markdown Once deployment succeeds, you can send requests to the endpoint using the OpenAI SDK.

# @markdown First you will need to install the SDK and some auth-related dependencies.

! pip install -qU openai google-auth requests

# @markdown Next fill out some request parameters:

user_image = "https://upload.wikimedia.org/wikipedia/commons/thumb/c/cb/The_Blue_Marble_%28remastered%29.jpg/580px-The_Blue_Marble_%28remastered%29.jpg"  # @param {type: "string"}
user_message = "What is in the image?"  # @param {type: "string"}
# @markdown If you encounter the issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, such as set `max_tokens` as 20.
max_tokens = 50  # @param {type: "integer"}
temperature = 1.0  # @param {type: "number"}

# @markdown Now we can send a request.

import google.auth
import openai

creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

BASE_URL = (
    f"https://{REGION}-aiplatform.googleapis.com/v1beta1/{ENDPOINT_RESOURCE_NAME}"
)
try:
    if use_dedicated_endpoint:
        BASE_URL = f"https://{DEDICATED_ENDPOINT_DNS}/v1beta1/{ENDPOINT_RESOURCE_NAME}"
except NameError:
    pass

client = openai.OpenAI(base_url=BASE_URL, api_key=creds.token)

model_response = client.chat.completions.create(
    model="",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": user_image}},
                {"type": "text", "text": user_message},
            ],
        }
    ],
    temperature=temperature,
    max_tokens=max_tokens,
)
print(model_response)

# @markdown Click "Show Code" to see more details.

## Use the Llama Guard model


You can use the Llama Guard model together with the Llama 3.2 Instruct API. See the [Llama Guard model](https://console.cloud.google.com/vertex-ai/publishers/meta/model-garden/llama-guard) for details.

## Clean up resources

In [None]:
# @title Delete the models and endpoints

# @markdown  Delete the experiment models and endpoints to recycle the resources
# @markdown  and avoid unnecessary continuous charges that may incur.

# Undeploy model and delete endpoint.
# Each entry is dropped from the dict once its delete call returns, so that
# re-running this cell after a partial failure only retries what is left
# instead of calling delete again on resources that are already gone.
for resource_label, endpoint in list(endpoints.items()):
    endpoint.delete(force=True)
    del endpoints[resource_label]

# Delete models.
for resource_label, model in list(models.items()):
    model.delete()
    del models[resource_label]