In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Llama 3.1 and Qwen2.5 Models Deployment

## Overview

This notebook demonstrates deploying and serving prebuilt [Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f) and
[Qwen2.5](https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e) models with [vLLM](https://github.com/vllm-project/vllm) on TPU v5e.


### Objective

- Deploy Llama 3.1 8B with vLLM on 4 TPU v5es.

- Deploy Qwen2.5 1.5B with vLLM on 1 TPU v5e.

vLLM is used as the serving framework for both models to improve serving throughput. Note that [vLLM on TPU](https://docs.vllm.ai/en/latest/getting_started/tpu-installation.html) is in experimental status.


### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin

In [None]:
# @title Request for quota

# @markdown By default, the quota for TPU deployment `Custom model serving TPU v5e cores per region` is 4, which is sufficient for serving Llama 3.1 8B and Qwen2.5 1.5B models. We need 1 TPU v5e to deploy [Qwen2.5 1.5B](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) model and 4 TPU v5es to deploy [Llama 3.1 8B](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct).

# @markdown > | Model | Accelerator Type |
# @markdown | ----------- | ----------- |
# @markdown | Llama 3.1 8B |4 TPU v5e (ct5lp-hightpu-4t)|
# @markdown | Qwen2.5 1.5B|1 TPU v5e (ct5lp-hightpu-1t)|

In [None]:
# @title Setup Google Cloud project

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 2. **[Optional]** [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs. Set the BUCKET_URI for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the same region as where the notebook was launched. Note that a multi-region bucket (e.g. "us") is not considered a match for a single region covered by the multi-region range (e.g. "us-central1"). If not set, a unique GCS bucket will be created instead.

# Leaving the "gs://" placeholder (or an empty value) makes the code further
# down in this cell create a new bucket.
BUCKET_URI = "gs://"  # @param {type:"string"}

# @markdown 3. **[Optional]** Set region. If not set, the region will be set automatically according to Colab Enterprise environment.

# When left empty, the region is read from the GOOGLE_CLOUD_REGION environment
# variable further down in this cell.
REGION = ""  # @param {type:"string"}

# Upgrade Vertex AI SDK.
! pip3 install --upgrade --quiet 'google-cloud-aiplatform>=1.64.0'

# Import the necessary packages
import datetime
import importlib
import os
import uuid
from typing import Tuple

from google.cloud import aiplatform

! git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git

models, endpoints = {}, {}

common_util = importlib.import_module(
    "vertex-ai-samples.community-content.vertex_model_garden.model_oss.notebook_util.common_util"
)

# Get the default cloud project id.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# Get the default region for launching jobs.
if not REGION:
    if not os.environ.get("GOOGLE_CLOUD_REGION"):
        raise ValueError(
            "REGION must be set. See"
            " https://cloud.google.com/vertex-ai/docs/general/locations for"
            " available cloud locations."
        )
    REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Enable the Vertex AI API and Compute Engine API, if not already.
print("Enabling Vertex AI API and Compute Engine API.")
! gcloud services enable aiplatform.googleapis.com compute.googleapis.com

# Cloud Storage bucket for storing the experiment artifacts.
# A unique GCS bucket will be created for the purpose of this notebook. If you
# prefer using your own GCS bucket, change the value yourself below.
now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])

if BUCKET_URI is None or BUCKET_URI.strip() == "" or BUCKET_URI == "gs://":
    BUCKET_URI = f"gs://{PROJECT_ID}-tmp-{now}-{str(uuid.uuid4())[:4]}"
    BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])
    ! gsutil mb -l {REGION} {BUCKET_URI}
else:
    assert BUCKET_URI.startswith("gs://"), "BUCKET_URI must start with `gs://`."
    shell_output = ! gsutil ls -Lb {BUCKET_NAME} | grep "Location constraint:" | sed "s/Location constraint://"
    bucket_region = shell_output[0].strip().lower()
    if bucket_region != REGION:
        raise ValueError(
            "Bucket region %s is different from notebook region %s"
            % (bucket_region, REGION)
        )
print(f"Using this GCS Bucket: {BUCKET_URI}")

STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")
MODEL_BUCKET = os.path.join(BUCKET_URI, "vllm_tpu")


# Initialize Vertex AI API.
print("Initializing Vertex AI API.")
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

# Gets the default SERVICE_ACCOUNT.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)


# Provision permissions to the SERVICE_ACCOUNT with the GCS bucket
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.admin $BUCKET_NAME

! gcloud config set project $PROJECT_ID
! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role="roles/storage.admin"
! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role="roles/aiplatform.user"

In [None]:
# @title Access the models
# @markdown ### Access Llama 3.1 and Qwen2.5 models on Vertex AI for serving
# @markdown Models hosted on Hugging Face can be used for serving in Vertex AI.
# @markdown 1. Open the [Llama 3.1 model](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) and [Qwen2.5](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) pages on [Hugging Face](https://huggingface.co/).
# @markdown 2. Review and accept the agreement.
# @markdown 3. After accepting the agreement, Llama 3.1 and Qwen2.5 models will be available for serving.
# @markdown 4. You must provide a Hugging Face User Access Token (read) to access the models. You can follow the [Hugging Face documentation](https://huggingface.co/docs/hub/en/security-tokens) to create a **read** access token and put it in the `HF_TOKEN` field below.

# The deployment cell forwards this value to the serving container as the
# HF_TOKEN environment variable.
HF_TOKEN = ""  # @param {type:"string", isTemplate:true}
# Stop here when no token was entered.
assert HF_TOKEN, "Provide a read HF_TOKEN to load models from Hugging Face"

In [None]:
# @title Prepare

# @markdown In this section you can choose a desired model and the region for TPU deployment.
# @markdown Learn about [TPU v5e machine types](https://cloud.google.com/vertex-ai/docs/predictions/use-tpu#deploy_a_model) for Vertex AI prediction.

# @markdown Select the model to deploy:

MODEL_ID = "Llama-3.1-8B-Instruct"  # @param ["Llama-3.1-8B","Llama-3.1-8B-Instruct", "Qwen2.5-1.5B", "Qwen2.5-1.5B-Instruct"] {isTemplate: true}

# @markdown Region for the TPU deployment. If left empty, the `REGION` resolved in the setup cell is used.
TPU_DEPLOYMENT_REGION = ""  # @param {type:"string"}
# An empty value would otherwise be handed to the quota check unchanged, so
# fall back to the notebook region.
TPU_DEPLOYMENT_REGION = TPU_DEPLOYMENT_REGION.strip() or REGION

tpu_type = "TPU_V5e"

# Derive the Hugging Face repository id and the TPU shape from the model family.
if "Llama" in MODEL_ID:
    model_path_prefix = "meta-llama/"
    model_id = os.path.join(model_path_prefix, MODEL_ID)
    machine_type = "ct5lp-hightpu-4t"
    tpu_count = 4
    tpu_topo = "2x2"
    print(MODEL_ID, "will run on", tpu_count, "tpus")
elif "Qwen2.5" in MODEL_ID:
    model_path_prefix = "Qwen/"
    model_id = os.path.join(model_path_prefix, MODEL_ID)
    machine_type = "ct5lp-hightpu-1t"
    tpu_count = 1
    tpu_topo = "1x1"
    print(MODEL_ID, "will run on", tpu_count, "tpu")
else:
    raise ValueError(f"Unsupported MODEL_ID: {MODEL_ID}")

# The pre-built serving docker image for vLLM on TPU
vLLM_TPU_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20241107_0917_tpu_experimental_RC01"

# @markdown Set `use_dedicated_endpoint` to False if you don't want to use [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint).
use_dedicated_endpoint = True  # @param {type:"boolean"}

# Verify the accelerator quota in the deployment region before deploying.
common_util.check_quota(
    project_id=PROJECT_ID,
    region=TPU_DEPLOYMENT_REGION,
    accelerator_type=tpu_type,
    accelerator_count=tpu_count,
    is_for_training=False,
)

# Server parameters.
# One tensor-parallel shard per TPU chip.
tensor_parallel_size = tpu_count

# NOTE: the fraction of HBM reserved for the KV cache is not configurable from
# this cell; no such setting is defined here.

# Maximum number of running sequences in a continuous batch.
# NOTE(review): this value is not referenced by the deployment code visible in
# this notebook -- confirm whether it should be forwarded to the server.
max_running_seqs = 256  # @param
# Maximum context length for a request.
max_model_len = 4096  # @param

# Endpoint configurations.
min_replica_count = 1
max_replica_count = 1

run_name = ""  # @param {type:"string"}

# @markdown Note: The vLLM-TPU container used in this notebook is in experimental status.

## Deploy prebuilt Llama 3.1 8B or Qwen2.5 1.5B models with vLLM on TPUs
This section downloads the prebuilt model chosen in the previous section and deploys it to a Vertex AI Endpoint. It takes 15 minutes to 1 hour to finish, depending on the size of the model.

In [None]:
# @title Deploy
def deploy_model_vllm_tpu(
    model_name: str,
    model_id: str,
    service_account: str,
    base_model_id: str = None,
    tensor_parallel_size: int = 1,
    machine_type: str = "ct5lp-hightpu-1t",
    tpu_topology: str = "1x1",
    max_model_len: int = 4096,
    endpoint_id: str = "",
    min_replica_count: int = 1,
    max_replica_count: int = 1,
    use_dedicated_endpoint: bool = False,
    model_type: str = None,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Deploys models with vLLM on TPU in Vertex AI.

    Uploads a model backed by the vLLM TPU serving container and deploys it
    to a new or an existing Vertex AI endpoint.

    Args:
        model_name: Display name of the uploaded model; also used as the
            prefix of the display name of a newly created endpoint.
        model_id: Model identifier passed to the vLLM server via `--model`.
        service_account: Service account the deployment runs as.
        base_model_id: Value of the `MODEL_ID` container environment
            variable. Defaults to `model_id` when empty.
        tensor_parallel_size: Value of `--tensor_parallel_size`. When falsy,
            it is derived from the chip count encoded in `machine_type`.
        machine_type: Machine type used for the deployment.
        tpu_topology: TPU topology such as "2x2". It is only forwarded to the
            deployment when its first dimension is greater than 1.
        max_model_len: Value of `--max_model_len`.
        endpoint_id: ID of an existing endpoint to deploy to. A new endpoint
            is created when empty.
        min_replica_count: Minimum number of replicas.
        max_replica_count: Maximum number of replicas.
        use_dedicated_endpoint: Whether a newly created endpoint is a
            dedicated endpoint.
        model_type: Currently unused; kept so existing callers keep working.

    Returns:
        The uploaded model and the endpoint it was deployed to.

    Raises:
        ValueError: If `tensor_parallel_size` is falsy and no chip count can
            be parsed from `machine_type`.
    """
    # The model, a newly created endpoint and an existing endpoint must all
    # live in the same region. Fall back to the notebook region when no
    # dedicated TPU region was configured.
    deployment_region = TPU_DEPLOYMENT_REGION or REGION

    if endpoint_id:
        aip_endpoint_name = (
            f"projects/{PROJECT_ID}/locations/{deployment_region}"
            f"/endpoints/{endpoint_id}"
        )
        endpoint = aiplatform.Endpoint(aip_endpoint_name)
    else:
        endpoint = aiplatform.Endpoint.create(
            display_name=f"{model_name}-endpoint",
            location=deployment_region,
            dedicated_endpoint_enabled=use_dedicated_endpoint,
        )

    if not base_model_id:
        base_model_id = model_id

    if not tensor_parallel_size:
        # Machine types end in "-<chips>t" (e.g. "ct5lp-hightpu-4t"); parse the
        # whole number so multi-digit chip counts are handled as well.
        chip_suffix = machine_type.rsplit("-", 1)[-1]
        if chip_suffix.endswith("t"):
            chip_suffix = chip_suffix[:-1]
        if not chip_suffix.isdigit():
            raise ValueError(
                f"Cannot derive the TPU chip count from machine type {machine_type!r}."
            )
        tensor_parallel_size = int(chip_suffix)

    # NOTE(review): the first topology dimension is used to decide whether the
    # topology has to be passed explicitly -- confirm that this matches the
    # intended notion of "hosts".
    num_hosts = int(tpu_topology.split("x")[0])

    vllmtpu_args = [
        "python",
        "-m",
        "vllm.entrypoints.api_server",
        "--host=0.0.0.0",
        "--port=8080",
        f"--model={model_id}",
        f"--tensor_parallel_size={tensor_parallel_size}",
        f"--max_model_len={max_model_len}",
    ]

    env_vars = {
        "MODEL_ID": base_model_id,
        "DEPLOY_SOURCE": "notebook",
    }

    # HF_TOKEN is not a compulsory field and may not be defined.
    try:
        if HF_TOKEN:
            env_vars["HF_TOKEN"] = HF_TOKEN
    except NameError:
        pass

    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=vLLM_TPU_DOCKER_URI,
        serving_container_args=vllmtpu_args,
        serving_container_ports=[8080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables=env_vars,
        serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
        serving_container_deployment_timeout=7200,
        location=deployment_region,
    )

    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        tpu_topology=tpu_topology if num_hosts > 1 else None,
        deploy_request_timeout=1800,
        service_account=service_account,
        min_replica_count=min_replica_count,
        max_replica_count=max_replica_count,
        system_labels={
            "NOTEBOOK_NAME": "model_garden_pytorch_llama3_1_qwen2_5_deployment_tpu.ipynb"
        },
    )
    return model, endpoint


# Deploy the model selected in the previous cell and record the resulting
# model and endpoint under the "vllmtpu" key for the cells that follow.
models["vllmtpu"], endpoints["vllmtpu"] = deploy_model_vllm_tpu(
    # What to serve.
    model_id=model_id,
    model_name=common_util.get_job_name_with_datetime(prefix=run_name),
    # Hardware shape.
    machine_type=machine_type,
    tpu_topology=tpu_topo,
    # vLLM server settings.
    tensor_parallel_size=tensor_parallel_size,
    max_model_len=max_model_len,
    # Deployment identity and endpoint flavour.
    service_account=SERVICE_ACCOUNT,
    use_dedicated_endpoint=use_dedicated_endpoint,
)

In [None]:
# @title Predict

# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts. Sampling parameters supported by vLLM can be found [here](https://docs.vllm.ai/en/latest/dev/sampling_params.html).
# @markdown Note: Top-k sampling is not currently enabled for vLLM on TPU.

# @markdown Example:

# @markdown ```
# @markdown Human: What is a car?
# @markdown Assistant:  A car, or a motor car, is a road-connected human-transportation system used to move people or goods from one place to another. The term also encompasses a wide range of vehicles, including motorboats, trains, and aircrafts. Cars typically have four wheels, a cabin for passengers, and an engine or motor. They have been around since the early 19th century and are now one of the most popular forms of transportation, used for daily commuting, shopping, and other purposes.
# @markdown ```
# @markdown Additionally, you can moderate the generated text with Vertex AI. See [Moderate text documentation](https://cloud.google.com/natural-language/docs/moderating-text) for more details.

# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoint.name` allows us to get the
#   endpoint name of the endpoint `endpoint` created in the cell
#   above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint.
# NOTE: the Deploy cell creates the endpoint in TPU_DEPLOYMENT_REGION; use that
# region in the resource name below if it differs from REGION.
# NOTE: the request below is sent to `endpoints["vllmtpu"]`, so the loaded
# endpoint has to be stored under that key (last commented line) to be used.

# endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoint = aiplatform.Endpoint(aip_endpoint_name)
# endpoints["vllmtpu"] = endpoint


prompt = "what is a car?"  # @param {type: "string"}
# @markdown If you encounter the issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, by lowering `max_tokens`.
max_tokens = 50  # @param {type:"integer"}
temperature = 1.0  # @param {type:"number"}

# Overrides parameters for inferences.
instance = {
    "prompt": prompt,
    "max_tokens": max_tokens,
    "temperature": temperature,
}

# The request carries a list of instances; a single prompt is sent here.
instances = [instance]

# Send the request to the endpoint registered by the Deploy cell.
response = endpoints["vllmtpu"].predict(
    instances=instances, use_dedicated_endpoint=use_dedicated_endpoint
)

# Print each returned prediction on its own line.
for prediction in response.predictions:
    print(prediction)

## Clean up resources


In [None]:
# @title Delete the models and endpoints
# @markdown  Delete the experiment models and endpoints to recycle the resources
# @markdown  and avoid unnecessary continuous charges that may incur.

# Undeploy model and delete endpoint.
for endpoint in endpoints.values():
    endpoint.delete(force=True)

# Delete models.
for model in models.values():
    model.delete()

delete_bucket = False  # @param {type:"boolean"}
if delete_bucket:
    ! gsutil -m rm -r $BUCKET_NAME