In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Gemma 2 (Deployment)

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_gemma2_deployment_on_vertex.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_gemma2_deployment_on_vertex.ipynb">
      <img alt="GitHub logo" src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview

This notebook demonstrates deploying Gemma 2 models
 * on TPU using **Hex-LLM**, a **H**igh-**E**fficiency **L**arge **L**anguage **M**odel serving solution built with **XLA** that is being developed by Google Cloud, and
 * on GPU using **TGI** ([text-generation-inference](https://github.com/huggingface/text-generation-inference)), the state-of-the-art open source LLM serving solution on GPU.


### Objective

- Deploy Gemma 2 with Hex-LLM on TPU
- Deploy Gemma 2 with [TGI](https://github.com/huggingface/text-generation-inference) on GPU

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin

In [None]:
# @title Request for TPU quota

# @markdown By default, the quota for TPU deployment `Custom model serving TPU v5e cores per region` is 4. TPU quota is only available in `us-west1`. You can request a higher TPU quota by following the instructions at ["Request a higher quota"](https://cloud.google.com/docs/quota/view-manage#requesting_higher_quota).

In [None]:
# @title Setup Google Cloud project

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown **[Optional]** Set the GCS BUCKET_URI to store the experiment artifacts, if you want to use your own bucket. **If not set, a unique GCS bucket will be created automatically on your behalf**.

import json
import os
from datetime import datetime
from typing import Tuple

from google.cloud import aiplatform

# Get the default cloud project id.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# Get the default region for launching jobs.
REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Cloud Storage bucket for storing the experiment artifacts.
# A unique GCS bucket will be created for the purpose of this notebook. If you
# prefer using your own GCS bucket, please change the value yourself below.
now = datetime.now().strftime("%Y%m%d%H%M%S")
BUCKET_URI = "gs://"  # @param {type:"string"}
if BUCKET_URI is None or BUCKET_URI.strip() == "" or BUCKET_URI == "gs://":
    # Create a unique GCS bucket for this notebook if not specified
    BUCKET_URI = f"gs://{PROJECT_ID}-tmp-{now}"
    ! gsutil mb -l {REGION} {BUCKET_URI}
else:
    assert BUCKET_URI.startswith("gs://"), "BUCKET_URI must start with `gs://`."
    BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])
    shell_output = ! gsutil ls -Lb {BUCKET_NAME} | grep "Location constraint:" | sed "s/Location constraint://"
    bucket_region = shell_output[0].strip().lower()
    if bucket_region != REGION:
        raise ValueError(
            "Bucket region %s is different from notebook region %s"
            % (bucket_region, REGION)
        )
print(f"Using this GCS Bucket: {BUCKET_URI}")

STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")
MODEL_BUCKET = os.path.join(BUCKET_URI, "gemma2")
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

# Gets the default SERVICE_ACCOUNT.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)

# Provision permissions to the SERVICE_ACCOUNT with the GCS bucket
BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.admin $BUCKET_NAME

# Enable Vertex AI and Cloud Compute APIs.
! gcloud config set project $PROJECT_ID
! gcloud services enable aiplatform.googleapis.com compute.googleapis.com

# @markdown ## Access Gemma 2 Models

# @markdown You must provide a Hugging Face User Access Token (read) to access the Gemma 2 models. You can follow the [Hugging Face documentation](https://huggingface.co/docs/hub/en/security-tokens) to create a **read** access token and put it in the `HF_TOKEN` field below.

# Filled in through the notebook form; the deploy helpers below forward it to
# the serving containers as the `HF_TOKEN` environment variable.
HF_TOKEN = ""  # @param {type:"string", isTemplate:true}
# Stop early when the token is empty.
assert (
    HF_TOKEN
), "Please provide a read HF_TOKEN to load models from Hugging Face, or select a different model source."

# Prefix joined with the selected MODEL_ID to form the full model id
# (e.g. "google/gemma-2-9b") in the deployment cells.
model_path_prefix = "google/"

# The pre-built serving docker images.
# HEXLLM_DOCKER_URI is used by `deploy_model_hexllm` (TPU) and TGI_DOCKER_URI
# by `deploy_model_tgi` (GPU).
HEXLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai-restricted/vertex-vision-model-garden-dockers/hex-llm-serve:gemma2"
TGI_DOCKER_URI = "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-1.ubuntu2204.py310"

# Service name used by `get_quota` in its `gcloud ... quota list` query.
SERVICE_ENDPOINT = "aiplatform.googleapis.com"


def get_job_name_with_datetime(prefix: str) -> str:
    """Returns `prefix` followed by `_YYYYMMDD_HHMMSS` for the current local time.

    Used to give each deployment job a distinct, time-stamped name.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return "_".join([prefix, timestamp])


def deploy_model_hexllm(
    model_name: str,
    model_id: str,
    service_account: str,
    machine_type: str = "ct5lp-hightpu-1t",
    tensor_parallel_size: int = 1,
    hbm_utilization_factor: float = 0.6,
    max_running_seqs: int = 256,
    endpoint_id: str = "",
    min_replica_count: int = 1,
    max_replica_count: int = 1,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a model served by the Hex-LLM container and deploys it.

    Args:
        model_name: Display name of the uploaded model. A newly created
            endpoint is named `<model_name>-endpoint`.
        model_id: Model identifier handed to the server through the `--model`
            flag and the `MODEL_ID` environment variable.
        service_account: Service account passed to `Model.deploy`.
        machine_type: Machine type passed to `Model.deploy`.
        tensor_parallel_size: Value of the `--tensor_parallel_size` flag.
        hbm_utilization_factor: Value of the `--hbm_utilization_factor` flag.
        max_running_seqs: Value of the `--max_running_seqs` flag.
        endpoint_id: ID of an existing endpoint in PROJECT_ID/REGION to deploy
            to. When empty, a new endpoint is created.
        min_replica_count: Minimum replica count passed to `Model.deploy`.
        max_replica_count: Maximum replica count passed to `Model.deploy`.

    Returns:
        The uploaded model and the endpoint it was deployed to.
    """
    # Resolve the target endpoint first: create one unless an ID was given.
    if not endpoint_id:
        endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")
    else:
        endpoint = aiplatform.Endpoint(
            f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_id}"
        )

    # Environment of the serving container.
    container_env = {
        "PJRT_DEVICE": "TPU",
        "RAY_DEDUP_LOGS": "0",
        "RAY_USAGE_STATS_ENABLED": "0",
        "MODEL_ID": model_id,
        "DEPLOY_SOURCE": "notebook",
    }
    if HF_TOKEN:
        # Forward the Hugging Face token set in the notebook form.
        container_env["HF_TOKEN"] = HF_TOKEN

    # Command-line flags of the Hex-LLM API server; the port matches
    # `serving_container_ports` below.
    server_args = [
        "--host=0.0.0.0",
        "--port=7080",
        "--log_level=INFO",
        "--enable_jit",
        f"--model={model_id}",
        "--load_format=auto",
        f"--tensor_parallel_size={tensor_parallel_size}",
        f"--hbm_utilization_factor={hbm_utilization_factor}",
        f"--max_running_seqs={max_running_seqs}",
    ]

    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=HEXLLM_DOCKER_URI,
        serving_container_command=["python", "-m", "hex_llm.server.api_server"],
        serving_container_args=server_args,
        serving_container_ports=[7080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables=container_env,
        serving_container_shared_memory_size_mb=16 * 1024,  # 16 GB
        serving_container_deployment_timeout=7200,
    )

    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        deploy_request_timeout=1800,
        service_account=service_account,
        min_replica_count=min_replica_count,
        max_replica_count=max_replica_count,
    )
    return model, endpoint


def deploy_model_tgi(
    model_name: str,
    model_id: str,
    service_account: str,
    machine_type: str = "g2-standard-24",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 2,
    max_input_length: int = 1562,
    max_total_tokens: int = 2048,
    max_batch_prefill_tokens: int = 2048,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Deploys models with TGI on GPU in Vertex AI.

    Creates a new endpoint named `<model_name>-endpoint`, uploads a model that
    uses the TGI serving image and deploys it to that endpoint.

    Args:
        model_name: Display name of the uploaded model.
        model_id: Model identifier passed to the container as `MODEL_ID`.
        service_account: Service account passed to `Model.deploy`.
        machine_type: Machine type passed to `Model.deploy`.
        accelerator_type: Accelerator type passed to `Model.deploy`.
        accelerator_count: Number of accelerators; also passed to the container
            as `NUM_SHARD`.
        max_input_length: Passed to the container as `MAX_INPUT_LENGTH`.
        max_total_tokens: Passed to the container as `MAX_TOTAL_TOKENS`.
        max_batch_prefill_tokens: Passed to the container as
            `MAX_BATCH_PREFILL_TOKENS`.

    Returns:
        The uploaded model and the endpoint it was deployed to.
    """
    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")

    # `serving_container_environment_variables` is documented as a
    # Dict[str, str], so every value is given as a string instead of relying
    # on implicit conversion of ints/floats.
    env_vars = {
        "AIP_HTTP_PORT": "7080",
        "MODEL_ID": model_id,
        "NUM_SHARD": f"{accelerator_count}",
        "MAX_INPUT_LENGTH": f"{max_input_length}",
        "MAX_TOTAL_TOKENS": f"{max_total_tokens}",
        "MAX_BATCH_PREFILL_TOKENS": f"{max_batch_prefill_tokens}",
        "CUDA_MEMORY_FRACTION": "0.93",
        "DEPLOY_SOURCE": "notebook",
    }

    if HF_TOKEN:
        # Forward the Hugging Face token set in the notebook form.
        env_vars["HF_TOKEN"] = HF_TOKEN

    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=TGI_DOCKER_URI,
        serving_container_ports=[7080],
        serving_container_environment_variables=env_vars,
        serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
    )

    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
    )
    return model, endpoint


def get_quota(project_id: str, region: str, resource_id: str) -> int:
    """Returns the quota for a resource in a region. Returns -1 if can not figure out the quota."""
    quota_list_output = !gcloud alpha services quota list --service=$SERVICE_ENDPOINT  --consumer=projects/$project_id --filter="$SERVICE_ENDPOINT/$resource_id" --format=json
    # Use '.s' on the command output because it is an SList type.
    quota_data = json.loads(quota_list_output.s)
    if len(quota_data) == 0 or "consumerQuotaLimits" not in quota_data[0]:
        return -1
    if (
        len(quota_data[0]["consumerQuotaLimits"]) == 0
        or "quotaBuckets" not in quota_data[0]["consumerQuotaLimits"][0]
    ):
        return -1
    all_regions_data = quota_data[0]["consumerQuotaLimits"][0]["quotaBuckets"]
    for region_data in all_regions_data:
        if (
            region_data.get("dimensions")
            and region_data["dimensions"]["region"] == region
        ):
            if "effectiveLimit" in region_data:
                return int(region_data["effectiveLimit"])
            else:
                return 0
    return -1


def get_resource_id(accelerator_type: str, is_for_training: bool) -> str:
    """Maps an accelerator type to its quota resource id for one use case.

    Args:
      accelerator_type: The accelerator type, e.g. "NVIDIA_L4" or "TPU_V5e".
      is_for_training: True to look up the training resource id, False to
        look up the serving one.

    Returns:
      The resource id.

    Raises:
      ValueError: If the accelerator type is unknown for the chosen use case.
    """
    if is_for_training:
        use_case = "training"
        resource_ids = {
            "NVIDIA_TESLA_V100": "custom_model_training_nvidia_v100_gpus",
            "NVIDIA_L4": "custom_model_training_nvidia_l4_gpus",
            "NVIDIA_TESLA_A100": "custom_model_training_nvidia_a100_gpus",
            "NVIDIA_TESLA_T4": "custom_model_training_nvidia_t4_gpus",
            "TPU_V5e": "custom_model_training_tpu_v5e",
            "TPU_V3": "custom_model_training_tpu_v3",
        }
    else:
        use_case = "serving"
        resource_ids = {
            "NVIDIA_TESLA_V100": "custom_model_serving_nvidia_v100_gpus",
            "NVIDIA_L4": "custom_model_serving_nvidia_l4_gpus",
            "NVIDIA_TESLA_A100": "custom_model_serving_nvidia_a100_gpus",
            "NVIDIA_TESLA_T4": "custom_model_serving_nvidia_t4_gpus",
            "TPU_V5e": "custom_model_serving_tpu_v5e",
        }

    if accelerator_type not in resource_ids:
        raise ValueError(
            f"Could not find accelerator type: {accelerator_type} for {use_case}."
        )
    return resource_ids[accelerator_type]


def check_quota(
    project_id: str,
    region: str,
    accelerator_type: str,
    accelerator_count: int,
    is_for_training: bool,
):
    """Verifies that the project has enough accelerator quota in the region.

    Args:
        project_id: Project whose quota is checked.
        region: Region in which the accelerators are needed.
        accelerator_type: Accelerator type, resolved through `get_resource_id`.
        accelerator_count: Number of accelerators required.
        is_for_training: True for training quota, False for serving quota.

    Raises:
        ValueError: If `get_quota` reports -1 (quota not found) or a quota
            smaller than `accelerator_count`.
    """
    resource_id = get_resource_id(accelerator_type, is_for_training)
    quota = get_quota(project_id, region, resource_id)

    # Shared tail of both error messages.
    how_to_fix = (
        "Either use a different region or request additional quota. "
        "Follow instructions here "
        "https://cloud.google.com/docs/quotas/view-manage#requesting_higher_quota "
        "to check quota in a region or request additional quota for your project."
    )

    if quota == -1:
        problem = f"""Quota not found for: {resource_id} in {region}.
            {how_to_fix}"""
    elif quota < accelerator_count:
        problem = f"""Quota not enough for {resource_id} in {region}:
            {quota} < {accelerator_count}.
            {how_to_fix}"""
    else:
        # Enough quota: nothing to report.
        return
    raise ValueError(problem)

## Deploy Gemma 2 models with Hex-LLM on TPU

**Hex-LLM** is a **H**igh-**E**fficiency **L**arge **L**anguage **M**odel (LLM) TPU serving solution built with **XLA**, which is being developed by Google Cloud.

Refer to the "Request for TPU quota" section for TPU quota.

In [None]:
# @title Deploy
# @markdown Set the model ID. Model weights can be loaded from HuggingFace or from a GCS bucket.

# @markdown Select one of the four model variations.
MODEL_ID = "gemma-2-9b"  # @param ["gemma-2-9b", "gemma-2-9b-it", "gemma-2-27b", "gemma-2-27b-it"] {allow-input: true, isTemplate: true}
model_id = os.path.join(model_path_prefix, MODEL_ID)

# @markdown Find Vertex AI prediction TPUv5e machine types in
# @markdown https://cloud.google.com/vertex-ai/docs/predictions/use-tpu#deploy_a_model.

# Every model in this cell is served on TPU v5e; only the machine size differs.
# Note: 1 TPU V5 chip has only one core.
accelerator_type = "TPU_V5e"
if "9b" in model_id:
    # ct5lp-hightpu-4t (4 TPU chips) for Gemma 2 9B models.
    machine_type, accelerator_count = "ct5lp-hightpu-4t", 4
else:
    # ct5lp-hightpu-8t (8 TPU chips) for Gemma 2 27B models.
    machine_type, accelerator_count = "ct5lp-hightpu-8t", 8

# Fail before deploying if the serving quota in REGION is too small.
check_quota(
    PROJECT_ID,
    REGION,
    accelerator_type,
    accelerator_count,
    is_for_training=False,
)

# Server parameters.
# One tensor-parallel shard per TPU chip of the selected machine.
tensor_parallel_size = accelerator_count
# Fraction of HBM memory allocated for KV cache after model loading. A larger
# value improves throughput but gives higher risk of TPU out-of-memory errors
# with long prompts.
hbm_utilization_factor = 0.6
# Maximum number of running sequences in a continuous batch.
max_running_seqs = 256

# Endpoint configurations.
min_replica_count = 1
max_replica_count = 1

model_hexllm, endpoint_hexllm = deploy_model_hexllm(
    get_job_name_with_datetime(prefix=MODEL_ID),
    model_id,
    SERVICE_ACCOUNT,
    machine_type=machine_type,
    tensor_parallel_size=tensor_parallel_size,
    hbm_utilization_factor=hbm_utilization_factor,
    max_running_seqs=max_running_seqs,
    min_replica_count=min_replica_count,
    max_replica_count=max_replica_count,
)

In [None]:
# @title Predict

# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts. The first few requests may have high latency. This is because the server needs to warm up with the initial requests. The following requests should not have the same delay.

# @markdown Example:

# @markdown ```
# @markdown > What is a car?
# @markdown > A car is a four-wheeled vehicle designed for the transportation of passengers and their belongings.
# @markdown ```

# @markdown Additionally, you can moderate the generated text with Vertex AI. See [Moderate text documentation](https://cloud.google.com/natural-language/docs/moderating-text) for more details.

# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoint_hexllm.name` allows us to get the endpoint
#   name of the endpoint `endpoint_hexllm` created in the cell above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint:
# endpoint_name = endpoint_hexllm.name
# # endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoint_hexllm = aiplatform.Endpoint(aip_endpoint_name)

# Request settings exposed as notebook form fields.
prompt = "What is a car?"  # @param {type: "string"}
max_tokens = 50  # @param {type: "integer"}
temperature = 1.0  # @param {type: "number"}
top_p = 1.0  # @param {type: "number"}
top_k = 1  # @param {type: "integer"}
# A single request instance: the prompt and the sampling settings are sibling
# keys of the same dictionary.
instances = [
    {
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
    },
]
response = endpoint_hexllm.predict(instances=instances)

# Only one instance was sent, so print the first prediction.
prediction = response.predictions[0]
print(prediction)

## Deploy Gemma 2 models with TGI on GPU

[TGI](https://github.com/huggingface/text-generation-inference) stands for Text Generation Inference. It's a powerful library designed specifically for running large language models on GPUs efficiently. TGI utilizes techniques like "paged attention" and "continuous batching" to improve the speed and throughput of LLMs.

Currently, only L4 GPUs are demonstrated in this notebook. Functionality on other GPU types will be confirmed and added in the future.

Gemma 2 9B models require at least 2 L4 GPUs for deployment. Gemma 2 27B models require at least 4 L4 GPUs for deployment.

In [None]:
# @title Deploy
MODEL_ID = "gemma-2-9b"  # @param ["gemma-2-9b", "gemma-2-9b-it", "gemma-2-27b", "gemma-2-27b-it"] {allow-input: true, isTemplate: true}
model_id = os.path.join(model_path_prefix, MODEL_ID)

# @markdown Find Vertex AI prediction supported accelerators and regions in
# @markdown https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.

accelerator_type = "NVIDIA_L4"  # @param ["NVIDIA_L4"] {isTemplate: true}

# Reject unsupported combinations up front: the model size is checked first,
# then the accelerator type.
if "9b" not in MODEL_ID and "27b" not in MODEL_ID:
    raise ValueError("Recommended machine settings not found for model: %s" % MODEL_ID)
if accelerator_type != "NVIDIA_L4":
    raise ValueError(
        "Recommended machine settings not found for accelerator type: %s"
        % accelerator_type
    )

if "9b" in MODEL_ID:
    # 2 L4 (24G) to deploy Gemma 9B models.
    machine_type, accelerator_count = "g2-standard-24", 2
else:
    # 4 L4 (24G) to deploy Gemma 27B models.
    machine_type, accelerator_count = "g2-standard-48", 4

# Fail before deploying if the serving quota in REGION is too small.
check_quota(
    PROJECT_ID,
    REGION,
    accelerator_type,
    accelerator_count,
    is_for_training=False,
)

# Note that larger token counts will require more GPU memory. For example, if you'd
# like to increase the `max_total_tokens` and `max_batch_prefill_tokens` to 8192,
# you may need 4 L4s for the 9b model, and 8 L4s for the 27b model.
max_input_length = 1562
max_total_tokens = 2048
max_batch_prefill_tokens = 2048

model_tgi, endpoint_tgi = deploy_model_tgi(
    get_job_name_with_datetime(prefix=MODEL_ID),
    model_id,
    SERVICE_ACCOUNT,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    max_input_length=max_input_length,
    max_total_tokens=max_total_tokens,
    max_batch_prefill_tokens=max_batch_prefill_tokens,
)

In [None]:
# @title Predict
# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts.

# @markdown Example:

# @markdown ```
# @markdown > What is a car?
# @markdown > A car is a four-wheeled vehicle designed for the transportation of passengers and their belongings.
# @markdown ```

# @markdown Additionally, you can moderate the generated text with Vertex AI. See [Moderate text documentation](https://cloud.google.com/natural-language/docs/moderating-text) for more details.


# @markdown Please click "Show Code" to see more details.

# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoint_tgi.name` allows us to get the
#   endpoint name of the endpoint `endpoint_tgi` created in the cell
#   above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint.

# endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoint_tgi = aiplatform.Endpoint(aip_endpoint_name)

# Request settings exposed as notebook form fields.
prompt = "What is a car?"  # @param {type: "string"}
max_new_tokens = 128  # @param {type:"integer"}
temperature = 1.0  # @param {type:"number"}
top_p = 0.9  # @param {type:"number"}
top_k = 1  # @param {type:"integer"}

# Overrides max_new_tokens and top_k parameters during inferences.
# If you encounter the issue like `ServiceUnavailable: 503 Took too long to respond when processing`,
# you can reduce the max length, such as set max_new_tokens as 20.
# A single request instance: the prompt goes under "inputs" and the sampling
# settings under the nested "parameters" dictionary.
instances = [
    {
        "inputs": f"### Human: {prompt}### Assistant: ",
        "parameters": {
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "top_p": top_p,
            "top_k": top_k,
        },
    },
]

response = endpoint_tgi.predict(instances=instances)

# Print every returned prediction.
for prediction in response.predictions:
    print(prediction)

## Clean up resources

In [None]:
# @title Delete the models and endpoints

# @markdown  Delete the experiment models and endpoints to recycle the resources
# @markdown  and avoid unnecessary continouous charges that may incur.
# Undeploy models and delete endpoints.
endpoint_hexllm.delete(force=True)
endpoint_tgi.delete(force=True)

# Delete models.
model_hexllm.delete()
model_tgi.delete()

# Delete Cloud Storage objects.
delete_bucket = False  # @param {type:"boolean", isTemplate: true}
if delete_bucket:
    ! gsutil -m rm -r $BUCKET_URI