In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Gemma (Deployment)

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_gemma_deployment_on_vertex.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_gemma_deployment_on_vertex.ipynb">
      <img alt="GitHub logo" src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview

This notebook demonstrates deploying Gemma models
 * on TPU using **Hex-LLM**, a **H**igh-**E**fficiency **L**arge **L**anguage **M**odel serving solution built with **XLA** that is being developed by Google Cloud, and
 * on GPU using [vLLM](https://github.com/vllm-project/vllm), the state-of-the-art open source LLM serving solution on GPU.


### Objective

- Deploy Gemma with Hex-LLM on TPU
- Deploy Gemma with [vLLM](https://github.com/vllm-project/vllm) on GPU

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin

In [None]:
# @title Setup Google Cloud project

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 2. [Optional] [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs. Set the BUCKET_URI for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the same region as where the notebook was launched. Note that a multi-region bucket (eg. "us") is not considered a match for a single region covered by the multi-region range (eg. "us-central1"). If not set, a unique GCS bucket will be created instead.

# @markdown 3. By default, the quota for TPU deployment `Custom model serving TPU v5e cores per region` is 4. TPU quota is only available in `us-west1`. You can request a higher TPU quota by following the instructions at ["Request a higher quota"](https://cloud.google.com/docs/quota/view-manage#requesting_higher_quota).

# Import the necessary packages.

# Fetch the samples repository into the working directory; the `common_util`
# helper module loaded below is imported from this checkout.
! git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git

import importlib
import os
import uuid
from datetime import datetime
from typing import Tuple

from google.cloud import aiplatform

# The checkout path contains hyphens ("vertex-ai-samples",
# "community-content"), which cannot appear in a regular `import` statement,
# so the module is loaded from its dotted path string via importlib instead.
common_util = importlib.import_module(
    "vertex-ai-samples.community-content.vertex_model_garden.model_oss.notebook_util.common_util"
)

# Registries of the models and endpoints created by this notebook, keyed by
# deployment name (e.g. "hexllm_tpu", "vllm_gpu"). The clean-up cell iterates
# over them to delete everything that was deployed.
models, endpoints = {}, {}

# Get the default cloud project id.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# Get the default region for launching jobs.
REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Enable the Vertex AI API and Compute Engine API, if not already.
print("Enabling Vertex AI API and Compute Engine API.")
! gcloud services enable aiplatform.googleapis.com compute.googleapis.com

# Cloud Storage bucket for storing the experiment artifacts.
# A unique GCS bucket will be created for the purpose of this notebook. If you
# prefer using your own GCS bucket, change the value yourself below.
now = datetime.now().strftime("%Y%m%d%H%M%S")
BUCKET_URI = "gs://"  # @param {type:"string"}
BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])

if BUCKET_URI is None or BUCKET_URI.strip() == "" or BUCKET_URI == "gs://":
    BUCKET_URI = f"gs://{PROJECT_ID}-tmp-{now}-{str(uuid.uuid4())[:4]}"
    BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])
    ! gsutil mb -l {REGION} {BUCKET_URI}
else:
    assert BUCKET_URI.startswith("gs://"), "BUCKET_URI must start with `gs://`."
    shell_output = ! gsutil ls -Lb {BUCKET_NAME} | grep "Location constraint:" | sed "s/Location constraint://"
    bucket_region = shell_output[0].strip().lower()
    if bucket_region != REGION:
        raise ValueError(
            "Bucket region %s is different from notebook region %s"
            % (bucket_region, REGION)
        )
print(f"Using this GCS Bucket: {BUCKET_URI}")

STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")
MODEL_BUCKET = os.path.join(BUCKET_URI, "gemma")


# Initialize Vertex AI API.
print("Initializing Vertex AI API.")
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

# Gets the default SERVICE_ACCOUNT.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)


# Provision permissions to the SERVICE_ACCOUNT with the GCS bucket
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.admin $BUCKET_NAME

! gcloud config set project $PROJECT_ID

# @markdown ## Access Gemma Models
# @markdown Choose between accessing Gemma models on [Hugging Face](https://huggingface.co/)
# @markdown or Vertex AI as described below.

# @markdown If you already obtained access to Gemma models on [Hugging Face](https://huggingface.co/), you can load models from there.
# @markdown Alternatively, you can also load the original Gemma models for serving from Vertex AI after accepting the agreement.

# @markdown **Select and fill one of the two following sections.**
LOAD_MODEL_FROM = "Hugging Face"  # @param ["Hugging Face", "Google Cloud"] {isTemplate:true}

# @markdown ---

# @markdown ### Access Gemma models on Hugging Face
# @markdown You must provide a Hugging Face User Access Token (read) to access the Gemma models. You can follow the [Hugging Face documentation](https://huggingface.co/docs/hub/en/security-tokens) to create a **read** access token and put it in the `HF_TOKEN` field below.

HF_TOKEN = ""  # @param {type:"string", isTemplate:true}
if LOAD_MODEL_FROM == "Hugging Face":
    assert (
        HF_TOKEN
    ), "Provide a read HF_TOKEN to load models from Hugging Face, or select a different model source."

# @markdown *--- Or ---*
# @markdown ### Access Gemma models on Vertex AI
# @markdown Accept the model agreement to access the models:
# @markdown 1. Open the [Gemma model card](https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/335) from [Vertex AI Model Garden](https://cloud.google.com/model-garden).
# @markdown 1. Review and accept the agreement in the pop-up window on the model card page. If you have previously accepted the model agreement, there will not be a pop-up window on the model card page and this step is not needed.
# @markdown 1. After accepting the agreement of Gemma, a `https://` link containing Gemma pretrained and finetuned models will be shared.
# @markdown 1. Paste the link in the `VERTEX_AI_MODEL_GARDEN_GEMMA` field below.
# @markdown **Note:** This will unzip and copy the Gemma model artifacts to your Cloud Storage bucket, which will take around 1 hour.

VERTEX_AI_MODEL_GARDEN_GEMMA = ""  # @param {type:"string", isTemplate:true}


if LOAD_MODEL_FROM == "Google Cloud":
    assert (
        VERTEX_AI_MODEL_GARDEN_GEMMA
    ), "Accept the agreement of Gemma in Vertex AI Model Garden and get the URL to Gemma model artifacts, or select a different model source."

    # Only use the last part in case a full command is pasted.
    signed_url = VERTEX_AI_MODEL_GARDEN_GEMMA.split(" ")[-1].strip('"')

    ! mkdir -p ./gemma
    ! curl -X GET "{signed_url}" | tar -xzvf - -C ./gemma/
    ! gsutil -m cp -R ./gemma/* {MODEL_BUCKET}

    model_path_prefix = MODEL_BUCKET
    HF_TOKEN = ""
else:
    model_path_prefix = "google/"

# @markdown ---

# The pre-built serving docker images.
HEXLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai-restricted/vertex-vision-model-garden-dockers/hex-llm-serve:deploy"
VLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240721_0916_RC00"


def deploy_model_hexllm(
    model_name: str,
    model_id: str,
    service_account: str,
    base_model_id: str = None,
    tensor_parallel_size: int = 1,
    machine_type: str = "ct5lp-hightpu-1t",
    hbm_utilization_factor: float = 0.6,
    max_running_seqs: int = 256,
    endpoint_id: str = "",
    min_replica_count: int = 1,
    max_replica_count: int = 1,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Deploys models with Hex-LLM on TPU in Vertex AI.

    The model is uploaded to, and the endpoint is created in (or loaded from),
    the notebook-level `TPU_DEPLOYMENT_REGION`.

    Args:
        model_name: Display name of the uploaded model; also used as the
            prefix of the display name of a newly created endpoint.
        model_id: Model identifier passed to the server as `--model`.
        service_account: Service account used for the deployment.
        base_model_id: Value of the `MODEL_ID` environment variable of the
            serving container. Defaults to `model_id` when empty.
        tensor_parallel_size: Value passed as `--tensor_parallel_size`. When
            falsy (0 or None), it is derived from `machine_type`.
        machine_type: Vertex AI machine type to deploy on.
        hbm_utilization_factor: Value passed as `--hbm_utilization_factor`.
        max_running_seqs: Value passed as `--max_running_seqs`.
        endpoint_id: ID of an existing endpoint in `TPU_DEPLOYMENT_REGION` to
            deploy to. A new endpoint is created when empty.
        min_replica_count: Minimum number of replicas for the deployment.
        max_replica_count: Maximum number of replicas for the deployment.

    Returns:
        The uploaded model and the endpoint it was deployed to.
    """
    if endpoint_id:
        # The model below is uploaded to TPU_DEPLOYMENT_REGION, so an existing
        # endpoint has to be looked up in that same region, not in REGION.
        aip_endpoint_name = f"projects/{PROJECT_ID}/locations/{TPU_DEPLOYMENT_REGION}/endpoints/{endpoint_id}"
        endpoint = aiplatform.Endpoint(aip_endpoint_name)
    else:
        endpoint = aiplatform.Endpoint.create(
            display_name=f"{model_name}-endpoint",
            location=TPU_DEPLOYMENT_REGION,
        )

    if not base_model_id:
        base_model_id = model_id

    if not tensor_parallel_size:
        # Machine types are named like "ct5lp-hightpu-4t"; the character
        # before the trailing "t" is read as the number of TPU chips.
        tensor_parallel_size = int(machine_type[-2])

    hexllm_args = [
        "--host=0.0.0.0",
        "--port=7080",
        "--log_level=INFO",
        f"--model={model_id}",
        f"--tensor_parallel_size={tensor_parallel_size}",
        "--enable_jit",
        "--load_format=auto",
        f"--hbm_utilization_factor={hbm_utilization_factor}",
        f"--max_running_seqs={max_running_seqs}",
    ]

    env_vars = {
        "MODEL_ID": base_model_id,
        "PJRT_DEVICE": "TPU",
        "RAY_DEDUP_LOGS": "0",
        "RAY_USAGE_STATS_ENABLED": "0",
        "DEPLOY_SOURCE": "notebook",
    }

    # HF_TOKEN is not a compulsory field and may not be defined. Only the
    # "name is missing" case is tolerated; other errors are not swallowed.
    try:
        if HF_TOKEN:
            env_vars["HF_TOKEN"] = HF_TOKEN
    except NameError:
        pass

    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=HEXLLM_DOCKER_URI,
        serving_container_command=["python", "-m", "hex_llm.server.api_server"],
        serving_container_args=hexllm_args,
        serving_container_ports=[7080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables=env_vars,
        serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
        serving_container_deployment_timeout=7200,
        location=TPU_DEPLOYMENT_REGION,
    )

    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        deploy_request_timeout=1800,
        service_account=service_account,
        min_replica_count=min_replica_count,
        max_replica_count=max_replica_count,
    )
    return model, endpoint


def deploy_model_vllm(
    model_name: str,
    model_id: str,
    service_account: str,
    base_model_id: str = None,
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    gpu_memory_utilization: float = 0.9,
    max_model_len: int = 4096,
    dtype: str = "auto",
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a vLLM-served model and deploys it to a new Vertex AI endpoint.

    Args:
        model_name: Display name of the uploaded model; the endpoint is named
            `<model_name>-endpoint`.
        model_id: Model identifier passed to the server as `--model`.
        service_account: Service account used for the deployment.
        base_model_id: Value of the `MODEL_ID` environment variable of the
            serving container. Falls back to `model_id` when empty.
        machine_type: Vertex AI machine type to deploy on.
        accelerator_type: Accelerator type attached to the machine.
        accelerator_count: Number of accelerators; also passed to the server
            as `--tensor-parallel-size`.
        gpu_memory_utilization: Value passed as `--gpu-memory-utilization`.
        max_model_len: Value passed as `--max-model-len`.
        dtype: Value passed as `--dtype`.

    Returns:
        The uploaded model and the endpoint it was deployed to.
    """
    vllm_endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")

    # The container is started through its args: the launch command followed
    # by the server flags.
    launch_command = ["python", "-m", "vllm.entrypoints.api_server"]
    server_flags = [
        "--host=0.0.0.0",
        "--port=7080",
        f"--model={model_id}",
        f"--tensor-parallel-size={accelerator_count}",
        "--swap-space=16",
        f"--gpu-memory-utilization={gpu_memory_utilization}",
        f"--max-model-len={max_model_len}",
        f"--dtype={dtype}",
        "--disable-log-stats",
    ]
    container_args = launch_command + server_flags

    container_env = {
        "MODEL_ID": base_model_id or model_id,
        "DEPLOY_SOURCE": "notebook",
    }

    # The notebook-level HF_TOKEN is optional and might not exist at all.
    try:
        hf_token = HF_TOKEN
    except NameError:
        hf_token = None
    if hf_token:
        container_env["HF_TOKEN"] = hf_token

    uploaded_model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=VLLM_DOCKER_URI,
        serving_container_args=container_args,
        serving_container_ports=[7080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables=container_env,
        serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
        serving_container_deployment_timeout=7200,
    )
    print(
        f"Deploying {model_name} on {machine_type} with {accelerator_count} {accelerator_type} GPU(s)."
    )
    uploaded_model.deploy(
        endpoint=vllm_endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
    )
    print("endpoint_name:", vllm_endpoint.name)

    return uploaded_model, vllm_endpoint

## Deploy Gemma models with Hex-LLM on TPU

**Hex-LLM** is a **H**igh-**E**fficiency **L**arge **L**anguage **M**odel (LLM) TPU serving solution built with **XLA**, which is being developed by Google Cloud.

Refer to step 3 of the "Setup Google Cloud project" cell above for information on TPU quota.

In [None]:
# @title Deploy
# @markdown Set the model ID. Model weights can be loaded from HuggingFace or from a GCS bucket.

# @markdown Select one of the six model variations.
MODEL_ID = "gemma-1.1-2b-it"  # @param ["gemma-2b", "gemma-2b-it", "gemma-7b", "gemma-7b-it", "gemma-1.1-2b-it", "gemma-1.1-7b-it"] {allow-input: true, isTemplate: true}
TPU_DEPLOYMENT_REGION = "us-west1"  # @param ["us-west1"] {isTemplate:true}
model_id = os.path.join(model_path_prefix, MODEL_ID)

# @markdown Find Vertex AI prediction TPUv5e machine types in
# @markdown https://cloud.google.com/vertex-ai/docs/predictions/use-tpu#deploy_a_model.

# Both configurations run on TPU v5e. Note: 1 TPU V5 chip has only one core.
accelerator_type = "TPU_V5e"
# The model size is read from MODEL_ID rather than from the full model path,
# because the path prefix (e.g. a bucket name) may itself contain "2b".
if "2b" in MODEL_ID:
    # Sets ct5lp-hightpu-1t (1 TPU chip) to deploy Gemma 2B models.
    machine_type = "ct5lp-hightpu-1t"
    accelerator_count = 1
else:
    # Sets ct5lp-hightpu-4t (4 TPU chips) to deploy Gemma 7B models.
    machine_type = "ct5lp-hightpu-4t"
    accelerator_count = 4

# The quota that matters is the one in the region the TPUs are deployed to,
# which can differ from the notebook region.
common_util.check_quota(
    project_id=PROJECT_ID,
    region=TPU_DEPLOYMENT_REGION,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    is_for_training=False,
)

# Server parameters.
hbm_utilization_factor = 0.6  # A larger value improves throughput but gives higher risk of TPU out-of-memory errors with long prompts.
max_running_seqs = 256

# Endpoint configurations.
min_replica_count = 1
max_replica_count = 1

# NOTE(review): `tensor_parallel_size` is not passed, so the function default
# (1) is used even on the 4-chip machine type. Confirm whether it should be
# set to `accelerator_count`.
models["hexllm_tpu"], endpoints["hexllm_tpu"] = deploy_model_hexllm(
    model_name=common_util.get_job_name_with_datetime(prefix=MODEL_ID),
    model_id=model_id,
    service_account=SERVICE_ACCOUNT,
    machine_type=machine_type,
    hbm_utilization_factor=hbm_utilization_factor,
    max_running_seqs=max_running_seqs,
    min_replica_count=min_replica_count,
    max_replica_count=max_replica_count,
)

In [None]:
# @title Predict

# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts based on your `template`. Note that the first few prompts will take longer to execute.

# @markdown Additionally, you can moderate the generated text with Vertex AI. See [Moderate text documentation](https://cloud.google.com/natural-language/docs/moderating-text) for more details.

# @markdown Example:

# @markdown ```
# @markdown > What is a car?
# @markdown > A car is a four-wheeled vehicle designed for the transportation of passengers and their belongings.
# @markdown ```

# @markdown Additionally, you can moderate the generated text with Vertex AI. See [Moderate text documentation](https://cloud.google.com/natural-language/docs/moderating-text) for more details.

# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoints["hexllm_tpu"].name` allows us to get the
#   endpoint name of the Hex-LLM endpoint created in the cell above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint:
# endpoint_name = endpoints["hexllm_tpu"].name
# # endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{TPU_DEPLOYMENT_REGION}/endpoints/{endpoint_name}"
# )
# endpoints["hexllm_tpu"] = aiplatform.Endpoint(aip_endpoint_name)

prompt = "What is a car?"  # @param {type: "string"}
# @markdown If you encounter the issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, such as set `max_tokens` as 20.
max_tokens = 50  # @param {type: "integer"}
temperature = 1.0  # @param {type: "number"}
top_p = 1.0  # @param {type: "number"}
top_k = 1  # @param {type: "integer"}

# A single request instance carrying the prompt and its sampling parameters.
instances = [
    dict(
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
    )
]
hexllm_endpoint = endpoints["hexllm_tpu"]
response = hexllm_endpoint.predict(instances=instances)

# Print each returned prediction on its own line.
for prediction in response.predictions:
    print(prediction)

### Build chat applications with Gemma

You can build chat applications with the instruction finetuned Gemma models.

The instruction tuned Gemma models were trained with a specific formatter that annotates instruction tuning examples with extra information, both during training and inference. The annotations (1) indicate roles in a conversation, and (2) delineate turns in a conversation. Below we show a sample code snippet for formatting the model prompt using the user and model chat templates for a multi-turn conversation. The relevant tokens are:
- `user`: user turn
- `model`: model turn
- `<start_of_turn>`: beginning of dialogue turn
- `<end_of_turn>`: end of dialogue turn

An example set of dialogues is:
```
<start_of_turn>user
knock knock<end_of_turn>
<start_of_turn>model
who is there<end_of_turn>
<start_of_turn>user
LaMDA<end_of_turn>
<start_of_turn>model
LaMDA who?<end_of_turn>
```
where `<end_of_turn>\n` is the turn separator and `<start_of_turn>model\n` is the prompt prefix. This means if we would like to prompt the model with a question like, `What is Cramer's Rule?`, we should use:
```
<start_of_turn>user
What is Cramer's Rule?<end_of_turn>
<start_of_turn>model
```

In [None]:
# Chat templates: each wraps one turn of the conversation in the
# `<start_of_turn>` / `<end_of_turn>` markers for the corresponding role.
USER_CHAT_TEMPLATE = "<start_of_turn>user\n{prompt}<end_of_turn>\n"
MODEL_CHAT_TEMPLATE = "<start_of_turn>model\n{prompt}<end_of_turn>\n"

# Sample formatted prompt: two completed turns, a new user question, and the
# opening of the model turn that the model is expected to continue.
prompt = (
    USER_CHAT_TEMPLATE.format(prompt="What is a good place for travel in the US?")
    + MODEL_CHAT_TEMPLATE.format(prompt="California.")
    + USER_CHAT_TEMPLATE.format(prompt="What can I do in California?")
    + "<start_of_turn>model\n"
)
# Print the prompt exactly as it is sent. Passing it as a second argument to
# print() would insert a separator space before its first line.
print(f"Chat prompt:\n{prompt}")

instances = [
    {
        "prompt": prompt,
        "max_tokens": 50,
        "temperature": 1.0,
        "top_p": 1.0,
        "top_k": 1,
    },
]
response = endpoints["hexllm_tpu"].predict(instances=instances)

# Only one instance was sent, so only the first prediction is shown.
prediction = response.predictions[0]
print(prediction)

## Deploy Gemma models with vLLM on GPU

[vLLM](https://github.com/vllm-project/vllm) is a high-throughput GPU Large Language Model (LLM) serving library which implements a number of optimizations including paged attention and continuous batching.

Note that V100 GPUs generally offer better throughput and latency performance than L4 GPUs, while L4 GPUs are generally more cost efficient than V100 GPUs. The serving efficiency of L4, V100 and T4 GPUs is inferior to that of A100 GPUs, but L4, V100 and T4 GPUs are nevertheless good serving solutions if you do not have A100 quota.

Gemma model weights are stored in bfloat16 precision. L4 and A100 GPUs are needed for vLLM serving at bfloat16 precision. V100 and T4 GPUs can support vLLM serving at float32 and float16 precision, and they are also meaningful deployment configurations.

In [None]:
# @title Deploy
MODEL_ID = "gemma-1.1-2b-it"  # @param ["gemma-2b", "gemma-2b-it", "gemma-7b", "gemma-7b-it", "gemma-1.1-2b-it", "gemma-1.1-7b-it"] {isTemplate: true}
model_id = os.path.join(model_path_prefix, MODEL_ID)

# @markdown Find Vertex AI prediction supported accelerators and regions in
# @markdown https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.

accelerator_type = "NVIDIA_L4"  # @param ["NVIDIA_L4", "NVIDIA_TESLA_V100", "NVIDIA_TESLA_T4", "NVIDIA_TESLA_A100"] {isTemplate: true}

# Pick the machine type, accelerator count and vLLM dtype recommended for the
# selected model size and accelerator. Unsupported combinations are rejected
# with a ValueError before anything is deployed.
if "2b" in MODEL_ID:
    if accelerator_type == "NVIDIA_L4":
        # Sets 1 L4 (24G) to deploy Gemma 2B models.
        machine_type = "g2-standard-8"
        accelerator_count = 1
        vllm_dtype = "bfloat16"
    elif accelerator_type == "NVIDIA_TESLA_V100":
        # Sets 1 V100 (16G) to deploy Gemma 2B models.
        machine_type = "n1-standard-8"
        accelerator_count = 1
        vllm_dtype = "float32"
    elif accelerator_type == "NVIDIA_TESLA_T4":
        # Sets 1 T4 (16G) to deploy Gemma 2B models.
        machine_type = "n1-standard-8"
        accelerator_count = 1
        vllm_dtype = "float32"
    elif accelerator_type == "NVIDIA_TESLA_A100":
        # Sets 1 A100 (40G) to deploy Gemma 2B models.
        machine_type = "a2-highgpu-1g"
        accelerator_count = 1
        vllm_dtype = "bfloat16"
    else:
        raise ValueError(
            "Recommended machine settings not found for accelerator type: %s"
            % accelerator_type
        )
elif "7b" in MODEL_ID:
    if accelerator_type == "NVIDIA_L4":
        # Sets 1 L4 (24G) to deploy Gemma 7B models.
        machine_type = "g2-standard-12"
        accelerator_count = 1
        vllm_dtype = "bfloat16"
    elif accelerator_type == "NVIDIA_TESLA_A100":
        # Sets 1 A100 (40G) to deploy Gemma 7B models.
        machine_type = "a2-highgpu-1g"
        accelerator_count = 1
        vllm_dtype = "bfloat16"
    else:
        raise ValueError(
            "Recommended machine settings not found for accelerator type: %s"
            % accelerator_type
        )
else:
    # Neither "2b" nor "7b" appears in the model ID, so the problem is the
    # model, not the accelerator; report the value that is actually at fault.
    raise ValueError(
        "Recommended machine settings not found for model: %s" % MODEL_ID
    )

common_util.check_quota(
    project_id=PROJECT_ID,
    region=REGION,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    is_for_training=False,
)

# Note that a larger max_model_len will require more GPU memory.
max_model_len = 2048

models["vllm_gpu"], endpoints["vllm_gpu"] = deploy_model_vllm(
    model_name=common_util.get_job_name_with_datetime(prefix="gemma-serve-vllm"),
    model_id=model_id,
    service_account=SERVICE_ACCOUNT,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    max_model_len=max_model_len,
    dtype=vllm_dtype,
)

Once deployment succeeds, you can send requests to the endpoint with text prompts. Sampling parameters supported by vLLM can be found [here](https://github.com/vllm-project/vllm/blob/2e8e49fce3775e7704d413b2f02da6d7c99525c9/vllm/sampling_params.py#L23-L64). Setting `raw_response` to `True` allows you to obtain raw outputs.

In [None]:
# @title Predict

# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts. Sampling parameters supported by vLLM can be found [here](https://docs.vllm.ai/en/latest/dev/sampling_params.html).

# @markdown Example:

# @markdown ```
# @markdown Human: What is a car?
# @markdown Assistant:  A car, or a motor car, is a road-connected human-transportation system used to move people or goods from one place to another. The term also encompasses a wide range of vehicles, including motorboats, trains, and aircrafts. Cars typically have four wheels, a cabin for passengers, and an engine or motor. They have been around since the early 19th century and are now one of the most popular forms of transportation, used for daily commuting, shopping, and other purposes.
# @markdown ```
# @markdown Additionally, you can moderate the generated text with Vertex AI. See [Moderate text documentation](https://cloud.google.com/natural-language/docs/moderating-text) for more details.

# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoints["vllm_gpu"].name` allows us to get the
#   endpoint name of the vLLM endpoint created in the cell above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint.

# endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoints["vllm_gpu"] = aiplatform.Endpoint(aip_endpoint_name)

prompt = "What is a car?"  # @param {type: "string"}
# @markdown If you encounter the issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, such as set `max_tokens` as 20.
max_tokens = 50  # @param {type:"integer"}
temperature = 1.0  # @param {type:"number"}
top_p = 1.0  # @param {type:"number"}
top_k = 1  # @param {type:"integer"}
raw_response = False  # @param {type:"boolean"}

# A single request instance carrying the prompt, its sampling parameters and
# the raw-output switch.
instances = [
    dict(
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        raw_response=raw_response,
    )
]
vllm_endpoint = endpoints["vllm_gpu"]
response = vllm_endpoint.predict(instances=instances)

# Print each returned prediction on its own line.
for prediction in response.predictions:
    print(prediction)

### Apply chat templates

Chat templates can be applied to model predictions generated by the vLLM endpoint as well. You may use the same code snippets as for the Hex-LLM endpoint. They are not repeated here for brevity.

## Clean up resources

In [None]:
# @markdown  Delete the experiment models and endpoints to recycle the resources
# @markdown  and avoid unnecessary continuous charges that may be incurred.

# Undeploy model and delete endpoint.
for endpoint in endpoints.values():
    endpoint.delete(force=True)

# Delete models.
for model in models.values():
    model.delete()

delete_bucket = False  # @param {type:"boolean"}
if delete_bucket:
    ! gsutil -m rm -r $BUCKET_NAME