In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Gemma 2 (Deployment)

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/model_garden/model_garden_gemma2_deployment_on_vertex.ipynb">
      <img alt="Workbench logo" src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" width="32px"><br> Run in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_gemma2_deployment_on_vertex.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_gemma2_deployment_on_vertex.ipynb">
      <img alt="GitHub logo" src="https://github.githubassets.com/assets/GitHub-Mark-ea2971cee799.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview

This notebook demonstrates deploying Gemma 2 models
 * on TPU using **Hex-LLM**, a **H**igh-**E**fficiency **L**arge **L**anguage **M**odel serving solution built with **XLA** that is being developed by Google Cloud, and
 * on GPU using **TGI** ([text-generation-inference](https://github.com/huggingface/text-generation-inference)), the state-of-the-art open source LLM serving solution on GPU.


### Objective

- Deploy Gemma 2 with Hex-LLM on TPU
- Deploy Gemma 2 with [TGI](https://github.com/huggingface/text-generation-inference) on GPU

### File a bug

File a bug on [GitHub](https://github.com/GoogleCloudPlatform/vertex-ai-samples/issues/new) if you encounter any issue with the notebook.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin

In [None]:
# @title Request for TPU quota

# @markdown By default, the quota for TPU deployment `Custom model serving TPU v5e cores per region` is 4. TPU quota is only available in `us-west1`. You can request a higher TPU quota by following the instructions at ["Request a higher quota"](https://cloud.google.com/docs/quota/view-manage#requesting_higher_quota).

In [None]:
# @title Setup Google Cloud project

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 2. **[Optional]** Set region. If not set, the region will be set automatically according to Colab Enterprise environment.

REGION = ""  # @param {type:"string"}

# @markdown 3. If you want to run predictions with A100 80GB or H100 GPUs, we recommend using the regions listed below. **NOTE:** Make sure you have associated quota in selected regions. Click the links to see your current quota for each GPU type: [Nvidia A100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_a100_80gb_gpus), [Nvidia H100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_h100_gpus). You can request for quota following the instructions at ["Request a higher quota"](https://cloud.google.com/docs/quota/view-manage#requesting_higher_quota).

# @markdown > | Machine Type | Accelerator Type | Recommended Regions |
# @markdown | ----------- | ----------- | ----------- |
# @markdown | a2-ultragpu-1g | 1 NVIDIA_A100_80GB | us-central1, us-east4, europe-west4, asia-southeast1 |
# @markdown | a3-highgpu-2g | 2 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |
# @markdown | a3-highgpu-4g | 4 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |
# @markdown | a3-highgpu-8g | 8 NVIDIA_H100_80GB | us-central1, europe-west4, us-west1, asia-southeast1 |

# Upgrade Vertex AI SDK.
! pip3 install --upgrade --quiet 'google-cloud-aiplatform==1.97.0'

import importlib
import os
from typing import Tuple

from google.cloud import aiplatform

# Upgrade TensorFlow, but only when not running in Colab Enterprise.
if os.environ.get("VERTEX_PRODUCT") != "COLAB_ENTERPRISE":
    ! pip install --upgrade tensorflow
# Clone the samples repository; the notebook utilities imported below live in it.
! git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git

# The package path contains hyphens, so it cannot be written as a plain
# `import` statement; load it by its dotted name instead.
common_util = importlib.import_module(
    "vertex-ai-samples.community-content.vertex_model_garden.model_oss.notebook_util.common_util"
)

# Key under which the TGI deployment is stored in `models` / `endpoints`.
LABEL = "tgi"
# Deployed resources keyed by label; the clean-up cell iterates over both.
models, endpoints = {}, {}

# Get the default cloud project id.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# Get the default region for launching jobs.
# Only used when the REGION form field above was left empty.
if not REGION:
    REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Initialize Vertex AI API.
print("Initializing Vertex AI API.")
aiplatform.init(project=PROJECT_ID, location=REGION)

# Point the gcloud CLI at the same project.
! gcloud config set project $PROJECT_ID

import vertexai

# Initialize the `vertexai` package with the same project and region.
vertexai.init(
    project=PROJECT_ID,
    location=REGION,
)

# @markdown Set use_dedicated_endpoint to False if you don't want to use [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint). Note that [dedicated endpoint does not support VPC Service Controls](https://cloud.google.com/vertex-ai/docs/predictions/choose-endpoint-type), uncheck the box if you are using VPC-SC.
use_dedicated_endpoint = True  # @param {type:"boolean"}
# @markdown ## Access Gemma 2 Models

# @markdown You must provide a Hugging Face User Access Token (with read access) to access the Gemma 2 models. You can follow the [Hugging Face documentation](https://huggingface.co/docs/hub/en/security-tokens) to create a **read** access token and put it in the `HF_TOKEN` field below.

HF_TOKEN = ""  # @param {type:"string", isTemplate:true}
# Stop here if the token field was left empty.
assert (
    HF_TOKEN
), "Provide a read HF_TOKEN to load models from Hugging Face, or select a different model source."

# Prefix joined with MODEL_ID by the model selection cells to form the model path.
model_path_prefix = "google/"

## Deploy Gemma 2 models with Hex-LLM on TPU

**Hex-LLM** is a **H**igh-**E**fficiency **L**arge **L**anguage **M**odel (LLM) TPU serving solution built with **XLA**, which is being developed by Google Cloud. Learn more about Hex-LLM [here](https://cloud.google.com/vertex-ai/generative-ai/docs/open-models/use-hex-llm).

Refer to the "Request for TPU quota" section for TPU quota.

In [None]:
# @title Select the model variants

# @markdown Select one of the six model variants.
MODEL_ID = "gemma-2-2b-it"  # @param ["gemma-2-2b", "gemma-2-2b-it", "gemma-2-9b", "gemma-2-9b-it", "gemma-2-27b", "gemma-2-27b-it"] {allow-input: true, isTemplate: true}
# Publisher model name in the `publishers/<publisher>/models/<model>@<version>`
# form, matching PUBLISHER_MODEL_NAME in the TGI section (no `/` before `@`).
version_id = f"publishers/google/models/gemma2@{MODEL_ID}"

TPU_DEPLOYMENT_REGION = "us-west1"  # @param ["us-west1"] {isTemplate:true}
# Model path passed to the serving container, e.g. "google/gemma-2-2b-it".
model_id = os.path.join(model_path_prefix, MODEL_ID)

# @markdown Set use_dedicated_endpoint to False if you don't want to use [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint). Note that [dedicated endpoint does not support VPC Service Controls](https://cloud.google.com/vertex-ai/docs/predictions/choose-endpoint-type), uncheck the box if you are using VPC-SC.
use_dedicated_endpoint = True  # @param {type:"boolean"}

# @markdown Find Vertex AI prediction TPUv5e machine types in
# @markdown https://cloud.google.com/vertex-ai/docs/predictions/use-tpu#deploy_a_model.

# Every branch below deploys on TPU v5e; only the chip count differs.
# Note: 1 TPU V5 chip has only one core.
accelerator_type = "TPU_V5e"
if "2b" in model_id:
    # Sets ct5lp-hightpu-1t (1 TPU chip) to deploy Gemma 2 2B models.
    machine_type = "ct5lp-hightpu-1t"
    accelerator_count = 1
elif "9b" in model_id:
    # Sets ct5lp-hightpu-4t (4 TPU chips) to deploy Gemma 2 9B models.
    machine_type = "ct5lp-hightpu-4t"
    accelerator_count = 4
else:
    # Sets ct5lp-hightpu-8t (8 TPU chips) to deploy Gemma 2 27B models.
    # This is also the fallback for any custom MODEL_ID that contains neither
    # "2b" nor "9b".
    machine_type = "ct5lp-hightpu-8t"
    accelerator_count = 8

# Check the TPU quota in the TPU deployment region for the selected chip count.
common_util.check_quota(
    project_id=PROJECT_ID,
    region=TPU_DEPLOYMENT_REGION,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    is_for_training=False,
)

In [None]:
# @title Deploy Gemma2 models with Hex-LLM on TPU
# @markdown Set the model ID. Model weights can be loaded from HuggingFace or from a GCS bucket.

# The pre-built serving docker images.
# Used as the serving container image when the model is uploaded.
HEXLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai-restricted/vertex-vision-model-garden-dockers/hex-llm-serve:20241210_2323_RC00"

# Server parameters.
# One tensor-parallel shard per TPU chip of the machine type selected above.
tensor_parallel_size = accelerator_count
hbm_utilization_factor = 0.6  # Fraction of HBM memory allocated for KV cache after model loading. A larger value improves throughput but gives higher risk of TPU out-of-memory errors with long prompts.
max_running_seqs = 256  # Maximum number of running sequences in a continuous batch.

# Endpoint configurations.
# Both values are forwarded to `model.deploy`; minimum and maximum are both 1.
min_replica_count = 1
max_replica_count = 1


def deploy_model_hexllm(
    model_name: str,
    model_id: str,
    publisher: str,
    publisher_model_id: str,
    base_model_id: str = None,
    data_parallel_size: int = 1,
    tensor_parallel_size: int = 1,
    machine_type: str = "ct5lp-hightpu-1t",
    tpu_topology: str = "1x1",
    disagg_topology: str = None,
    hbm_utilization_factor: float = 0.6,
    max_running_seqs: int = 256,
    max_model_len: int = 4096,
    enable_prefix_cache_hbm: bool = False,
    endpoint_id: str = "",
    min_replica_count: int = 1,
    max_replica_count: int = 1,
    use_dedicated_endpoint: bool = False,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Deploys models with Hex-LLM on TPU in Vertex AI.

    Uploads a model backed by the Hex-LLM serving container and deploys it to
    an endpoint. Both the model and the endpoint live in
    `TPU_DEPLOYMENT_REGION`. Reads the notebook globals `PROJECT_ID`,
    `TPU_DEPLOYMENT_REGION`, `HEXLLM_DOCKER_URI` and, when defined, `HF_TOKEN`.

    Args:
        model_name: Display name of the uploaded model; a newly created
            endpoint is named `<model_name>-endpoint`.
        model_id: Value passed to the server's `--model` flag.
        publisher: Publisher part of the Model Garden source model name.
        publisher_model_id: Model part of the Model Garden source model name.
        base_model_id: Value of the `MODEL_ID` container environment variable.
            Falls back to `model_id` when empty.
        data_parallel_size: Value of `--data_parallel_size`.
        tensor_parallel_size: Value of `--tensor_parallel_size`. When falsy,
            it is read from the second-to-last character of `machine_type`.
        machine_type: Machine type used for the deployment.
        tpu_topology: Topology string such as "1x1". Its first dimension is
            passed as `--num_hosts`, and the topology is forwarded to the
            deployment only when that number is greater than 1.
        disagg_topology: Optional value of `--disagg_topo`.
        hbm_utilization_factor: Value of `--hbm_utilization_factor`.
        max_running_seqs: Value of `--max_running_seqs`.
        max_model_len: Value of `--max_model_len`.
        enable_prefix_cache_hbm: Adds `--enable_prefix_cache_hbm`, but only
            when `disagg_topology` is not set.
        endpoint_id: ID of an existing endpoint in `TPU_DEPLOYMENT_REGION` to
            deploy to. When empty, a new endpoint is created.
        min_replica_count: Minimum replica count of the deployment.
        max_replica_count: Maximum replica count of the deployment.
        use_dedicated_endpoint: Whether a newly created endpoint is a
            dedicated endpoint. Not applied to an existing endpoint.

    Returns:
        The uploaded model and the endpoint it was deployed to.
    """
    if endpoint_id:
        # The model is uploaded to TPU_DEPLOYMENT_REGION below, so an existing
        # endpoint has to be looked up in that same region.
        aip_endpoint_name = f"projects/{PROJECT_ID}/locations/{TPU_DEPLOYMENT_REGION}/endpoints/{endpoint_id}"
        endpoint = aiplatform.Endpoint(aip_endpoint_name)
    else:
        endpoint = aiplatform.Endpoint.create(
            display_name=f"{model_name}-endpoint",
            location=TPU_DEPLOYMENT_REGION,
            dedicated_endpoint_enabled=use_dedicated_endpoint,
        )

    if not base_model_id:
        base_model_id = model_id

    if not tensor_parallel_size:
        # Machine types end in "<chips>t" (e.g. "ct5lp-hightpu-4t"); the
        # second-to-last character is the chip count.
        tensor_parallel_size = int(machine_type[-2])

    # First dimension of the topology string, e.g. "1x1" -> 1.
    num_hosts = int(tpu_topology.split("x")[0])

    # Learn more about the supported arguments and environment variables at https://cloud.google.com/vertex-ai/generative-ai/docs/open-models/use-hex-llm#config-server.
    hexllm_args = [
        "--host=0.0.0.0",
        "--port=7080",
        f"--model={model_id}",
        f"--data_parallel_size={data_parallel_size}",
        f"--tensor_parallel_size={tensor_parallel_size}",
        f"--num_hosts={num_hosts}",
        f"--hbm_utilization_factor={hbm_utilization_factor}",
        f"--max_running_seqs={max_running_seqs}",
        f"--max_model_len={max_model_len}",
    ]
    if disagg_topology:
        hexllm_args.append(f"--disagg_topo={disagg_topology}")
    if enable_prefix_cache_hbm and not disagg_topology:
        hexllm_args.append("--enable_prefix_cache_hbm")

    env_vars = {
        "MODEL_ID": base_model_id,
        "HEX_LLM_LOG_LEVEL": "info",
        "DEPLOY_SOURCE": "notebook",
    }

    # HF_TOKEN is not a compulsory field and may not be defined.
    # Only the "name is not defined" case is tolerated; any other error is
    # raised instead of being silently swallowed.
    try:
        if HF_TOKEN:
            env_vars["HF_TOKEN"] = HF_TOKEN
    except NameError:
        pass

    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=HEXLLM_DOCKER_URI,
        serving_container_command=["python", "-m", "hex_llm.server.api_server"],
        serving_container_args=hexllm_args,
        serving_container_ports=[7080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables=env_vars,
        serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
        serving_container_deployment_timeout=7200,
        location=TPU_DEPLOYMENT_REGION,
        model_garden_source_model_name=(
            f"publishers/{publisher}/models/{publisher_model_id}"
        ),
    )

    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        # The topology is only forwarded for multi-host deployments.
        tpu_topology=tpu_topology if num_hosts > 1 else None,
        deploy_request_timeout=1800,
        min_replica_count=min_replica_count,
        max_replica_count=max_replica_count,
        system_labels={
            "NOTEBOOK_NAME": "model_garden_gemma2_deployment_on_vertex.ipynb",
            "NOTEBOOK_ENVIRONMENT": common_util.get_deploy_source(),
        },
    )
    return model, endpoint


# Deploy the selected variant and record the model and endpoint under the
# "hexllm_tpu" key so the predict and clean-up cells can find them.
# `tpu_topology` is not passed, so the function default ("1x1") applies.
models["hexllm_tpu"], endpoints["hexllm_tpu"] = deploy_model_hexllm(
    model_name=common_util.get_job_name_with_datetime(prefix=MODEL_ID),
    model_id=model_id,
    publisher="google",
    publisher_model_id="gemma2",
    machine_type=machine_type,
    tensor_parallel_size=tensor_parallel_size,
    hbm_utilization_factor=hbm_utilization_factor,
    max_running_seqs=max_running_seqs,
    min_replica_count=min_replica_count,
    max_replica_count=max_replica_count,
    use_dedicated_endpoint=use_dedicated_endpoint,
)

In [None]:
# @title Predict

# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts based on your `template`. Note that the first few prompts will take longer to execute.

# @markdown Additionally, you can moderate the generated text with Vertex AI. See [Moderate text documentation](https://cloud.google.com/natural-language/docs/moderating-text) for more details.

# @markdown Example:

# @markdown ```
# @markdown > What is a car?
# @markdown > A car is a four-wheeled vehicle designed for the transportation of passengers and their belongings.
# @markdown ```

# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoints["hexllm_tpu"].name` allows us to get the
#   endpoint name of the endpoint created in the cell above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint:
# endpoint_name = endpoints["hexllm_tpu"].name
# # endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{TPU_DEPLOYMENT_REGION}/endpoints/{endpoint_name}"
# )
# endpoints["hexllm_tpu"] = aiplatform.Endpoint(aip_endpoint_name)

prompt = "What is a car?"  # @param {type: "string"}
# @markdown If you encounter the issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, such as set `max_tokens` as 20.
max_tokens = 50  # @param {type: "integer"}
temperature = 1.0  # @param {type: "number"}
top_p = 1.0  # @param {type: "number"}
top_k = 1  # @param {type: "integer"}

# One request instance: the prompt together with the sampling settings above.
instances = [
    dict(
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
    )
]
# Send the request to the Hex-LLM endpoint deployed earlier.
response = endpoints["hexllm_tpu"].predict(
    instances=instances,
    use_dedicated_endpoint=use_dedicated_endpoint,
)

# Print each returned prediction on its own line.
for prediction in response.predictions:
    print(prediction)

## Deploy Gemma models with TGI on GPU

[TGI](https://github.com/huggingface/text-generation-inference) stands for Text Generation Inference. It's a powerful library designed specifically for running large language models on GPUs efficiently. TGI utilizes techniques like "paged attention" and "continuous batching" to improve the speed and throughput of LLMs.

Currently, only L4 GPUs are demonstrated in this notebook. Functionality on other GPU types will be confirmed and added in the future.

Gemma2 2B, 9B and 27B models require at least 1, 2, and 4 L4 GPUs respectively for deployment.

In [None]:
# @title Select the model variants

# The pre-built serving docker image.
TGI_DOCKER_URI = "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-1.ubuntu2204.py310"

MODEL_ID = "gemma-2-2b"  # @param ["gemma-2-2b", "gemma-2-2b-it", "gemma-2-9b", "gemma-2-9b-it", "gemma-2-27b", "gemma-2-27b-it"] {allow-input: true, isTemplate: true}
model_id = os.path.join(model_path_prefix, MODEL_ID)
PUBLISHER_MODEL_NAME = f"publishers/google/models/gemma2@{MODEL_ID}"

# @markdown Finds Vertex AI prediction supported accelerators and regions in
# @markdown https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.

accelerator_type = "NVIDIA_L4"  # @param ["NVIDIA_L4"] {isTemplate: true}

# @markdown Set use_dedicated_endpoint to False if you don't want to use [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint). Note that [dedicated endpoint does not support VPC Service Controls](https://cloud.google.com/vertex-ai/docs/predictions/choose-endpoint-type), uncheck the box if you are using VPC-SC.
use_dedicated_endpoint = True  # @param {type:"boolean"}


# Recommended L4 (24G) settings per model size, checked in this order:
# size tag -> (machine type, number of L4 GPUs).
_L4_SETTINGS_BY_SIZE = {
    "2b": ("g2-standard-12", 1),
    "9b": ("g2-standard-24", 2),
    "27b": ("g2-standard-48", 4),
}

for _size_tag, _l4_settings in _L4_SETTINGS_BY_SIZE.items():
    if _size_tag not in MODEL_ID:
        continue
    # The model size is known; only L4 has recommended settings.
    if accelerator_type != "NVIDIA_L4":
        raise ValueError(
            "Recommended machine settings not found for accelerator type: %s"
            % accelerator_type
        )
    machine_type, accelerator_count = _l4_settings
    break
else:
    # No size tag matched the model ID.
    raise ValueError("Recommended machine settings not found for model: %s" % MODEL_ID)

# Check the GPU quota in the notebook region for the selected accelerator count.
common_util.check_quota(
    project_id=PROJECT_ID,
    region=REGION,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    is_for_training=False,
)

# @markdown Click "Show Code" to see more details.

In [None]:
# @title [Option 1] Deploy with Model Garden SDK

# @markdown Deploy with Gen AI model-centric SDK. This section uploads the prebuilt model to Model Registry and deploys it to a Vertex AI Endpoint. It takes 15 minutes to 1 hour to finish depending on the size of the model. See [use open models with Vertex AI](https://cloud.google.com/vertex-ai/generative-ai/docs/open-models/use-open-models) for documentation on other use cases.
from vertexai import model_garden

# Look up the publisher model selected in the previous cell.
model = model_garden.OpenModel(PUBLISHER_MODEL_NAME)

# Deploy it with the machine settings chosen above.
endpoint = model.deploy(
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    use_dedicated_endpoint=use_dedicated_endpoint,
    # Accept the End User License Agreement (EULA) on the model card before
    # deploy. Otherwise, the deployment will be forbidden.
    accept_eula=True,
)

# Register the endpoint under LABEL for the predict and clean-up cells.
endpoints[LABEL] = endpoint

In [None]:
# @title [Option 2] Deploy Gemma models with TGI on GPU

# Note that larger token counts will require more GPU memory. For example, if you'd
# like to increase the `max_total_tokens` and `max_batch_prefill_tokens` to 8192,
# you may need 1 L4 for 2b model, 4 L4s for the 9b model, and 8 L4s for the 27b model.
# These three values are passed to the TGI container as the MAX_INPUT_LENGTH,
# MAX_TOTAL_TOKENS and MAX_BATCH_PREFILL_TOKENS environment variables.
max_input_length = 1562
max_total_tokens = 2048
max_batch_prefill_tokens = 2048


def deploy_model_tgi(
    model_name: str,
    model_id: str,
    publisher: str,
    publisher_model_id: str,
    service_account: str = None,
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    max_input_length: int = 2047,
    max_total_tokens: int = 2048,
    max_batch_prefill_tokens: int = 2048,
    use_dedicated_endpoint: bool = False,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a TGI-served model and deploys it to a new GPU endpoint.

    A new endpoint named `<model_name>-endpoint` is always created. The TGI
    container is configured through environment variables only. Reads the
    notebook globals `TGI_DOCKER_URI` and, when defined, `HF_TOKEN`.

    Args:
        model_name: Display name of the uploaded model.
        model_id: Value of the `MODEL_ID` container environment variable.
        publisher: Publisher part of the Model Garden source model name.
        publisher_model_id: Model part of the Model Garden source model name.
        service_account: Optional service account. When set, it is exported
            as `SERVICE_ACCOUNT` and also passed to the deployment.
        machine_type: Machine type used for the deployment.
        accelerator_type: Accelerator type used for the deployment.
        accelerator_count: Number of accelerators; also exported as
            `NUM_SHARD`.
        max_input_length: Exported as `MAX_INPUT_LENGTH`.
        max_total_tokens: Exported as `MAX_TOTAL_TOKENS`.
        max_batch_prefill_tokens: Exported as `MAX_BATCH_PREFILL_TOKENS`.
        use_dedicated_endpoint: Whether the endpoint is a dedicated endpoint.

    Returns:
        The uploaded model and the endpoint it was deployed to.
    """
    tgi_endpoint = aiplatform.Endpoint.create(
        display_name=f"{model_name}-endpoint",
        dedicated_endpoint_enabled=use_dedicated_endpoint,
    )

    # Container settings; environment variable values must be strings.
    container_env = dict(
        MODEL_ID=model_id,
        NUM_SHARD=str(accelerator_count),
        MAX_INPUT_LENGTH=str(max_input_length),
        MAX_TOTAL_TOKENS=str(max_total_tokens),
        MAX_BATCH_PREFILL_TOKENS=str(max_batch_prefill_tokens),
        DEPLOY_SOURCE="notebook",
    )

    # The token is optional: the global may be empty or not defined at all.
    try:
        if HF_TOKEN:
            container_env.update({"HF_TOKEN": HF_TOKEN})
    except NameError:
        pass

    if service_account:
        container_env.update({"SERVICE_ACCOUNT": service_account})

    source_model_name = f"publishers/{publisher}/models/{publisher_model_id}"
    tgi_model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=TGI_DOCKER_URI,
        serving_container_ports=[8080],
        serving_container_environment_variables=container_env,
        serving_container_shared_memory_size_mb=16 * 1024,  # 16 GB
        model_garden_source_model_name=source_model_name,
    )

    tgi_model.deploy(
        endpoint=tgi_endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
        system_labels={
            "NOTEBOOK_NAME": "model_garden_gemma2_deployment_on_vertex.ipynb",
            "NOTEBOOK_ENVIRONMENT": common_util.get_deploy_source(),
        },
    )
    return tgi_model, tgi_endpoint


# Deploy the selected variant and record the model and endpoint under LABEL
# so the predict and clean-up cells can find them.
models[LABEL], endpoints[LABEL] = deploy_model_tgi(
    model_name=common_util.get_job_name_with_datetime(prefix=MODEL_ID),
    model_id=model_id,
    publisher="google",
    publisher_model_id="gemma2",
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    max_input_length=max_input_length,
    max_total_tokens=max_total_tokens,
    max_batch_prefill_tokens=max_batch_prefill_tokens,
    use_dedicated_endpoint=use_dedicated_endpoint,
)

# @markdown Click "Show Code" to see more details.

In [None]:
# @title Predict

# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts.

# @markdown Here we use an example from the [timdettmers/openassistant-guanaco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco) dataset to illustrate the prompt and response format:

# @markdown ```
# @markdown ### Human: How would the Future of AI in 10 Years look?### Assistant: Predicting the future is always a challenging task, but here are some possible ways that AI could evolve over the next 10 years: Continued advancements in deep learning: Deep learning has been one of the main drivers of recent AI breakthroughs, and we can expect continued advancements in this area. This may include improvements to existing algorithms, as well as the development of new architectures that are better suited to specific types of data and tasks. Increased use of AI in healthcare: AI has the potential to revolutionize healthcare, by improving the accuracy of diagnoses, developing new treatments, and personalizing patient care. We can expect to see continued investment in this area, with more healthcare providers and researchers using AI to improve patient outcomes. Greater automation in the workplace: Automation is already transforming many industries, and AI is likely to play an increasingly important role in this process. We can expect to see more jobs being automated, as well as the development of new types of jobs that require a combination of human and machine skills. More natural and intuitive interactions with technology: As AI becomes more advanced, we can expect to see more natural and intuitive ways of interacting with technology. This may include voice and gesture recognition, as well as more sophisticated chatbots and virtual assistants. Increased focus on ethical considerations: As AI becomes more powerful, there will be a growing need to consider its ethical implications. This may include issues such as bias in AI algorithms, the impact of automation on employment, and the use of AI in surveillance and policing. Overall, the future of AI in 10 years is likely to be shaped by a combination of technological advancements, societal changes, and ethical considerations. 
# @markdown While there are many exciting possibilities for AI in the future, it will be important to carefully consider its potential impact on society and to work towards ensuring that its benefits are shared fairly and equitably.
# @markdown ```

# @markdown Click "Show Code" to see more details.

# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoints["tgi"].name` allows us to get the
#   endpoint name of the endpoint created in the cell above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint.

# endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoints["tgi"] = aiplatform.Endpoint(aip_endpoint_name)

prompt = "How would the Future of AI in 10 Years look?"  # @param {type: "string"}
# @markdown If you encounter the issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, such as set `max_tokens` as 20.
max_tokens = 128  # @param {type:"integer"}
temperature = 1.0  # @param {type:"number"}
top_p = 0.9  # @param {type:"number"}
top_k = 1  # @param {type:"integer"}

# One request instance: the prompt wrapped in the Human/Assistant template,
# plus the generation parameters chosen above.
instances = [
    dict(
        inputs=f"### Human: {prompt}### Assistant: ",
        parameters=dict(
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
        ),
    )
]
# Send the request to the TGI endpoint deployed earlier.
response = endpoints["tgi"].predict(
    instances=instances,
    use_dedicated_endpoint=use_dedicated_endpoint,
)

# Print each returned prediction on its own line.
for prediction in response.predictions:
    print(prediction)

## Clean up resources

In [None]:
# @title Delete the models and endpoints

# @markdown  Delete the experiment models and endpoints to release the resources
# @markdown  and avoid incurring unnecessary ongoing charges.

# Undeploy model and delete endpoint.
# Endpoints are removed first; `force=True` is passed so that, per the comment
# above, deployed models are undeployed as part of the deletion.
for endpoint in endpoints.values():
    endpoint.delete(force=True)

# Delete models.
# NOTE(review): the Model Garden SDK cell stores only an endpoint, not a model,
# so no model from that path is deleted here; confirm whether manual clean-up
# is needed.
for model in models.values():
    model.delete()