In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Gemma Finetuning

<table align="left">
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_gemma_finetuning_on_vertex.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab Enterprise
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_gemma_finetuning_on_vertex.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
</table>

## Overview

This notebook demonstrates finetuning and deploying Gemma models with [Vertex AI Custom Training Job](https://cloud.google.com/vertex-ai/docs/training/create-custom-job). All of the examples in this notebook use the parameter-efficient finetuning method [PEFT (LoRA)](https://github.com/huggingface/peft) to reduce training and storage costs. LoRA (Low-Rank Adaptation) is one approach to Parameter-Efficient Fine-Tuning (PEFT), in which the pretrained model weights are frozen and rank decomposition matrices representing the change in model weights are trained during finetuning. Read more about LoRA in the following publication: [Hu, E.J., Shen, Y., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., Wang, L. and Chen, W., 2021. LoRA: Low-rank adaptation of large language models. *arXiv preprint arXiv:2106.09685*](https://arxiv.org/abs/2106.09685).


After tuning, we can deploy models on Vertex with GPU or TPU.


### Objective

- Finetune and deploy Gemma models with Vertex AI Custom Training Jobs.
- Send prediction requests to your finetuned Gemma model.


### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin

In [None]:
# @title Setup Google Cloud project

# @markdown 1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

# @markdown 2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 3. [Enable the Vertex AI API and Compute Engine API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,compute.googleapis.com).

# @markdown 4. **If using KerasNLP PEFT and HexLLM deployment (only 2b version supported right now)** which uses TPUs, request TPU quota for serving and optionally training:
# @markdown     - For training,
# @markdown     you already have TPU V3 8 cores available in
# @markdown     three regions: `us-central1`, `europe-west4`, and `asia-east1` which is sufficient. Load the notebook in one of these three regions.
# @markdown     For other regions, check your [accelerator quota](https://console.cloud.google.com/iam-admin/quotas) and use the **Filter** search box to enter `Custom model training TPU V3 cores per region` to display the current quota for the `Vertex AI API service` in different regions
# @markdown     and then follow the instructions at [Request a higher quota](https://cloud.google.com/docs/quota/view-manage#requesting_higher_quota) to request 8 cores for your region.
# @markdown     - For serving, check your [accelerator quota](https://console.cloud.google.com/iam-admin/quotas) and use the **Filter** search box to enter `Custom model serving TPU V5e cores per region` to display the current quota for the `Vertex AI API service` in different regions
# @markdown     and then follow the instructions at [Request a higher quota](https://cloud.google.com/docs/quota/view-manage#requesting_higher_quota) to request 8 cores for your region.


# @markdown 5. [Optional] [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs.

# @markdown **[Optional]** Set the BUCKET_URI for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the specified region (`REGION`). Note that a multi-region bucket (e.g. "us") is not considered a match for a single region covered by the multi-region range (e.g. "us-central1"). If not set, a unique GCS bucket will be created instead.

import os
import sys
from datetime import datetime
from typing import Tuple

from google.cloud import aiplatform

# Get the default cloud project id.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# Get the default region for launching jobs.
REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Cloud Storage bucket for storing the experiment artifacts.
# A unique GCS bucket will be created for the purpose of this notebook. If you
# prefer using your own GCS bucket, please change the value yourself below.
now = datetime.now().strftime("%Y%m%d%H%M%S")
BUCKET_URI = f"gs://{PROJECT_ID}-tmp-{now}"  # @param

# Gets the default BUCKET_URI and SERVICE_ACCOUNT if they were not specified by the user.

SERVICE_ACCOUNT = None
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"

print("Using this default Service Account:", SERVICE_ACCOUNT)

# Create a unique GCS bucket for this notebook, if not specified by the user
if BUCKET_URI.strip().startswith(f"gs://{PROJECT_ID}-tmp-"):
    ! gsutil mb -l {REGION} {BUCKET_URI}

print(f"Using this GCS Bucket: {BUCKET_URI}")

# Provision permissions to the SERVICE_ACCOUNT with the GCS bucket
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.admin $BUCKET_URI

# @markdown ## Access Gemma Models
# @markdown For GPU based finetuning and serving, choose between accessing Gemma models on [Hugging Face](https://huggingface.co/)
# @markdown or Vertex AI as described below.

# @markdown If you already obtained access to Gemma models on [Hugging Face](https://huggingface.co/), you can load models from there.
# @markdown Alternatively, you can also load the original Gemma models for finetuning and serving from Vertex AI after accepting the agreement.

# @markdown For TPU based finetuning and serving with KerasNLP, choose the Kaggle option.

# @markdown **Please only select and fill one of the three following sections.**
# Where the Gemma weights come from. Later in this cell only the value
# "Google Cloud" is branched on; every other value sets
# `base_model_path_prefix` to "google/".
LOAD_MODEL_FROM = "Hugging Face"  # @param ["Hugging Face", "Google Cloud", "Kaggle"]

# @markdown ---

# @markdown ### Access Gemma models on Hugging Face for GPU based finetuning and serving
# @markdown You must provide a Hugging Face User Access Token (read) to access the Gemma models. You can follow the [Hugging Face documentation](https://huggingface.co/docs/hub/en/security-tokens) to create a **read** access token and put it in the `HF_TOKEN` field below.

# Hugging Face read token. It is passed to the training job as
# `--huggingface_access_token` and, when non-empty, to the vLLM serving
# container as the `HF_TOKEN` environment variable. It is reset to "" later in
# this cell when LOAD_MODEL_FROM is "Google Cloud".
HF_TOKEN = ""  # @param {type:"string"}

# @markdown *--- Or ---*
# @markdown ### Access Gemma models on Vertex AI for GPU based finetuning and serving
# @markdown Accept the model agreement to access the models:
# @markdown 1. Open the [Gemma model card](https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/335) from [Vertex AI Model Garden](https://cloud.google.com/model-garden).
# @markdown 2. Review the agreement on the model card page.
# @markdown 3. After accepting the agreement of Gemma, a `https://` link containing Gemma pretrained and finetuned models will be shared.
# @markdown 4. Paste the link in the `VERTEX_MODEL_GARDEN_GEMMA` field below.
# @markdown **Note:** This will unzip and copy the Gemma model artifacts to your Cloud Storage bucket, which will take around 1 hour.

# Link to the Gemma artifacts archive. Only the last space-separated token is
# used (with surrounding double quotes stripped), so pasting a whole command
# that ends with the URL also works. Required only when LOAD_MODEL_FROM is
# "Google Cloud".
VERTEX_MODEL_GARDEN_GEMMA = ""  # @param {type:"string"}

! gcloud config set project $PROJECT_ID

STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")
MODEL_BUCKET = os.path.join(BUCKET_URI, "gemma")
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

if LOAD_MODEL_FROM == "Google Cloud":
    assert (
        VERTEX_MODEL_GARDEN_GEMMA
    ), "Please click the agreement of Gemma in Vertex AI Model Garden, and get the URL to Gemma model artifacts."

    # Only use the last part in case a full command is pasted.
    signed_url = VERTEX_MODEL_GARDEN_GEMMA.split(" ")[-1].strip('"')

    ! mkdir -p ./gemma
    ! curl -X GET "{signed_url}" | tar -xzvf - -C ./gemma/
    ! gsutil -m cp -R ./gemma/* {MODEL_BUCKET}

    base_model_path_prefix = MODEL_BUCKET
    HF_TOKEN = ""
else:
    base_model_path_prefix = "google/"


# The pre-built training and serving docker images.
# All image URIs in this block are pinned to the same tag
# (20240220_0936_RC01); update them together when moving to a newer tag.
TRAIN_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-peft-train:20240220_0936_RC01"
VLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240220_0936_RC01"

# The pre-built training and serving docker images for KerasNLP training
# and Hex-LLM serving.
# Note that the Hex-LLM image is hosted under `vertex-ai-restricted`, unlike the
# other images, which are under `vertex-ai`.
KERAS_TRAIN_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/jax-keras-train-tpu:20240220_0936_RC01"
KERAS_MODEL_CONVERSION_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/jax-keras-model-conversion:20240220_0936_RC01"
HEXLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai-restricted/vertex-vision-model-garden-dockers/hex-llm-serve:20240220_0936_RC01"
# Placeholder initialised to None; presumably assigned by a later model
# conversion step that is not visible here - TODO confirm.
conversion_job = None

# @markdown *--- Or ---*
# @markdown ### Access Gemma models from Kaggle for TPU based finetuning and serving
# @markdown Kaggle credentials are required for KerasNLP training and Hex-LLM deployment with TPUs.
# @markdown Generate the Kaggle username and key by following [these instructions](https://github.com/Kaggle/kaggle-api?tab=readme-ov-file#api-credentials).
# @markdown You will need to review and accept the model license.
# Both values are forwarded as environment variables to the Hex-LLM serving
# container by `deploy_model_hexllm`.
KAGGLE_USERNAME = ""  # @param {type:"string"}
KAGGLE_KEY = ""  # @param {type:"string"}
# @markdown ---

# Authenticate the user only when the `google.colab` module is loaded.
# NOTE(review): this runs after the gcloud/gsutil commands earlier in this
# cell; confirm the runtime is already authenticated when those commands run.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user(project_id=PROJECT_ID)


def get_job_name_with_datetime(prefix: str) -> str:
    """Returns `prefix` followed by a timestamp suffix.

    The suffix is the current local time formatted as `_YYYYmmdd_HHMMSS`. It is
    used to name training and deployment jobs in Vertex AI.

    Args:
        prefix: Text placed in front of the timestamp.

    Returns:
        The prefix with the timestamp suffix appended.
    """
    timestamp_suffix = datetime.now().strftime("_%Y%m%d_%H%M%S")
    return prefix + timestamp_suffix


def deploy_model_vllm(
    model_name: str,
    model_id: str,
    service_account: str,
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a model served by vLLM and deploys it to a new endpoint.

    Args:
        model_name: Display name of the uploaded model. The endpoint is named
            `<model_name>-endpoint`.
        model_id: Passed as the `artifact_uri` of the uploaded model.
        service_account: Service account used for the deployment.
        machine_type: Machine type of the serving nodes.
        accelerator_type: Accelerator type attached to the serving nodes.
        accelerator_count: Number of accelerators. It is also used as the vLLM
            tensor parallel size.

    Returns:
        A tuple of the uploaded model and the endpoint it is deployed to.
    """
    # The endpoint is created first, exactly one per call.
    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")

    # Forward the Hugging Face token to the container only when one is set.
    container_env = {"HF_TOKEN": HF_TOKEN} if HF_TOKEN else {}

    server_args = [
        "--host=0.0.0.0",
        "--port=7080",
        f"--tensor-parallel-size={accelerator_count}",
        "--swap-space=16",
        "--gpu-memory-utilization=0.95",
        "--disable-log-stats",
    ]

    uploaded_model = aiplatform.Model.upload(
        display_name=model_name,
        artifact_uri=model_id,
        serving_container_image_uri=VLLM_DOCKER_URI,
        serving_container_command=["python", "-m", "vllm.entrypoints.api_server"],
        serving_container_args=server_args,
        serving_container_ports=[7080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables=container_env,
    )

    uploaded_model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        service_account=service_account,
        deploy_request_timeout=1800,
    )
    return uploaded_model, endpoint


def deploy_model_hexllm(
    model_name: str,
    model_id: str,
    service_account: str,
    machine_type: str = "ct5lp-hightpu-1t",
    max_num_batched_tokens: int = 11264,
    tokens_pad_multiple: int = 1024,
    seqs_pad_multiple: int = 32,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Deploys models with Hex-LLM on TPU in Vertex AI.

    Args:
        model_name: Display name of the uploaded model. The endpoint is named
            `<model_name>-endpoint`.
        model_id: Passed as the `artifact_uri` of the uploaded model.
        service_account: Service account used for the deployment.
        machine_type: TPU machine type whose name ends in `-<N>t`, where N is
            the number of TPU chips (for example `ct5lp-hightpu-4t`). N is used
            as the Hex-LLM tensor parallel size.
        max_num_batched_tokens: Value of `--max_num_batched_tokens`.
        tokens_pad_multiple: Value of `--tokens_pad_multiple`.
        seqs_pad_multiple: Value of `--seqs_pad_multiple`.

    Returns:
        A tuple of the uploaded model and the endpoint it is deployed to.

    Raises:
        ValueError: If the TPU chip count cannot be read from `machine_type`.
    """
    # Parse the chip count before creating anything, so that a malformed machine
    # type does not leave an unused endpoint behind. The whole numeric part of
    # the "<N>t" suffix is read, so multi-digit counts are handled as well.
    chip_suffix = machine_type.rsplit("-", 1)[-1]
    if not (chip_suffix.endswith("t") and chip_suffix[:-1].isdecimal()):
        raise ValueError(
            f"Cannot infer the TPU chip count from machine type {machine_type!r}; "
            "expected a name ending in '-<N>t', e.g. 'ct5lp-hightpu-4t'."
        )
    num_tpu_chips = int(chip_suffix[:-1])

    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")

    hexllm_args = [
        "--host=0.0.0.0",
        "--port=7080",
        "--log_level=INFO",
        f"--tensor_parallel_size={num_tpu_chips}",
        "--num_nodes=1",
        "--use_ray",
        "--batch_mode=continuous",
        f"--max_num_batched_tokens={max_num_batched_tokens}",
        f"--tokens_pad_multiple={tokens_pad_multiple}",
        f"--seqs_pad_multiple={seqs_pad_multiple}",
    ]

    model = aiplatform.Model.upload(
        display_name=model_name,
        artifact_uri=model_id,
        serving_container_image_uri=HEXLLM_DOCKER_URI,
        serving_container_command=["python", "-m", "hex_llm.entrypoints.api_server"],
        serving_container_args=hexllm_args,
        serving_container_ports=[7080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        # The Kaggle credentials come from the notebook-level form fields.
        serving_container_environment_variables={
            "PJRT_DEVICE": "TPU",
            "RAY_DEDUP_LOGS": "0",
            "RAY_USAGE_STATS_ENABLED": "0",
            "KAGGLE_USERNAME": KAGGLE_USERNAME,
            "KAGGLE_KEY": KAGGLE_KEY,
        },
        serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
        serving_container_deployment_timeout=7200,
    )

    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        deploy_request_timeout=1800,
        service_account=service_account,
    )
    return model, endpoint

## Finetune with HuggingFace PEFT and Deploy with vLLM on GPUs

In [None]:
# @title Set Dataset

# @markdown Use the Vertex AI SDK to create and run the custom training jobs.

# @markdown This notebook uses [timdettmers/openassistant-guanaco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco) dataset as an example.
# @markdown You can set `dataset_name` to any existing [Hugging Face dataset](https://huggingface.co/datasets) name, and set `instruct_column_in_dataset` to the name of the dataset column containing training data. The [timdettmers/openassistant-guanaco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco) has only one column `text`, and therefore we set `instruct_column_in_dataset` to `text` in this notebook.

# @markdown ### (Optional) Prepare a custom JSONL dataset for finetuning

# @markdown You can prepare a JSONL file where each line is a valid JSON string as your custom training dataset. For example, here is one line from the [timdettmers/openassistant-guanaco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco) dataset:
# @markdown ```
# @markdown {"text": "### Human: Hola### Assistant: \u00a1Hola! \u00bfEn qu\u00e9 puedo ayudarte hoy?"}
# @markdown ```

# @markdown The JSON object has a key `text`, which should match `instruct_column_in_dataset`; the value should be one training data point, i.e. a string. After you have prepared your JSONL file, you can upload it to either [Hugging Face datasets](https://huggingface.co/datasets) or [Google Cloud Storage](https://cloud.google.com/storage).

# @markdown - To upload a JSONL dataset to [Hugging Face datasets](https://huggingface.co/datasets), follow the instructions on [Uploading Datasets](https://huggingface.co/docs/hub/en/datasets-adding). Then, set `dataset_name` to the name of your newly created dataset on Hugging Face.

# @markdown - To upload a JSONL dataset to [Google Cloud Storage](https://cloud.google.com/storage), follow the instructions on [Upload objects from a filesystem](https://cloud.google.com/storage/docs/uploading-objects). Then, set `dataset_name` to the `gs://` URI to your JSONL file. For example: `gs://cloud-samples-data/vertex-ai/model-evaluation/peft_train_sample.jsonl`.

# @markdown Optionally update the `instruct_column_in_dataset` field below if your JSON objects use a key other than the default `text`.

# @markdown ### (Optional) Format your data with custom JSON template

# @markdown Sometimes, your dataset might have multiple text columns and you want to construct the training data with a template. You can prepare a JSON template in the following format:

# @markdown ```
# @markdown {
# @markdown   "description": "A short template for vertex sample dataset.",
# @markdown   "prompt_input": "{input_text}{output_text}",
# @markdown   "prompt_no_input": "{input_text}{output_text}"
# @markdown }
# @markdown ```

# @markdown As an example, the template above can be used to format the following training data (this line comes from `gs://cloud-samples-data/vertex-ai/model-evaluation/peft_train_sample.jsonl`):

# @markdown ```
# @markdown {"input_text":"TRANSCRIPT: \nREASON FOR EVALUATION:,\n\n LABEL:","output_text":"Chiropractic"}
# @markdown ```

# @markdown This example template simply concatenates `input_text` with `output_text`. You can set `template` to `vertex_sample` to try out this built-in template, or build more complicated JSON templates such as [the alpaca example](https://github.com/tloen/alpaca-lora/blob/main/templates/alpaca.json). To use your own JSON template, please [upload it to Google Cloud Storage](https://cloud.google.com/storage/docs/uploading-objects) and put the `gs://` URI in the `template` field below. Leave `instruct_column_in_dataset` as `text`.

# Training dataset: either a Hugging Face dataset name or the `gs://` URI of a
# JSONL file. Passed to the training job as `--dataset_name`.
dataset_name = "timdettmers/openassistant-guanaco"  # @param {type:"string"}

# Name of the dataset column containing training text input.
# Passed to the training job as `--instruct_column_in_dataset`.
instruct_column_in_dataset = "text"  # @param {type:"string"}

# Cloud Storage URI to the template JSON file.
# May also be the built-in template name `vertex_sample`. Passed to the
# training job as `--template`, including when left empty.
template = ""  # @param {type:"string"}

In [None]:
# @title Finetune
# @markdown Use the Vertex AI SDK to create and run the custom training jobs.

# @markdown **Note**: To finetune the Gemma 7B models, we recommend setting `finetuning_precision_mode` to `4bit` and using NVIDIA_L4 instead of NVIDIA_TESLA_V100.
# @markdown Please click "Show Code" to see more details.

# The Gemma base model.
base_model = "gemma-2b"  # @param["gemma-2b", "gemma-2b-it", "gemma-7b", "gemma-7b-it"]
# Either "google/<model>" or "<MODEL_BUCKET>/<model>", depending on the prefix
# chosen in the setup cell.
base_model_id = os.path.join(base_model_path_prefix, base_model)

# Batch size for finetuning.
per_device_train_batch_size = 1  # @param{type:"integer"}
# Runs 10 training steps as a minimal example.
max_steps = 10  # @param {type:"integer"}
# Precision mode for finetuning.
finetuning_precision_mode = "float16"  # @param["4bit", "8bit", "float16"]
# Learning rate.
learning_rate = 2e-4
# LoRA parameters.
lora_rank = 16  # @param{type:"integer"}
lora_alpha = 64  # @param{type:"integer"}
lora_dropout = 0.1  # @param{type:"number"}

# Worker pool spec.

# Finetunes Gemma with 1 L4 (24G).
# machine_type = "g2-standard-8"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 1
# Finetunes Gemma with 1 V100 (16G).
machine_type = "n1-standard-8"
accelerator_type = "NVIDIA_TESLA_V100"
accelerator_count = 1

replica_count = 1

# Setup training job.
job_name = get_job_name_with_datetime("gemma-lora-train")

# Pass training arguments and launch job.
train_job = aiplatform.CustomContainerTrainingJob(
    display_name=job_name,
    container_uri=TRAIN_DOCKER_URI,
)

# Create a GCS folder to store the LORA adapter.
lora_adapter_dir = get_job_name_with_datetime("gemma-lora-adapter")
lora_output_dir = os.path.join(STAGING_BUCKET, lora_adapter_dir)

# Create a GCS folder to store the merged model with the base model and the
# finetuned LORA adapter.
merged_model_dir = get_job_name_with_datetime("gemma-merged-model")
merged_model_output_dir = os.path.join(STAGING_BUCKET, merged_model_dir)

# `dataset_name`, `instruct_column_in_dataset` and `template` come from the
# "Set Dataset" cell of this section; `HF_TOKEN` comes from the setup cell.
train_job.run(
    args=[
        "--task=instruct-lora",
        f"--pretrained_model_id={base_model_id}",
        f"--dataset_name={dataset_name}",
        f"--instruct_column_in_dataset={instruct_column_in_dataset}",
        f"--output_dir={lora_output_dir}",
        f"--merge_base_and_lora_output_dir={merged_model_output_dir}",
        f"--per_device_train_batch_size={per_device_train_batch_size}",
        f"--lora_rank={lora_rank}",
        f"--lora_alpha={lora_alpha}",
        f"--lora_dropout={lora_dropout}",
        f"--max_steps={max_steps}",
        "--max_seq_length=512",
        f"--learning_rate={learning_rate}",
        f"--precision_mode={finetuning_precision_mode}",
        f"--template={template}",
        f"--huggingface_access_token={HF_TOKEN}",
    ],
    # Environment variable values must be strings, so the flag is passed as the
    # string "true" rather than the Python boolean True.
    environment_variables={"WANDB_DISABLED": "true"},
    replica_count=replica_count,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    boot_disk_size_gb=500,
    service_account=SERVICE_ACCOUNT,
)

print("LoRA adapter was saved in: ", lora_output_dir)
print("Trained and merged models were saved in: ", merged_model_output_dir)

In [None]:
# @title Deploy
# @markdown This section uploads the model to Model Registry and deploys it on the Endpoint. It takes 15 minutes to 1 hour to finish.
# @markdown Please click "Show Code" to see more details.

# After the deployment succeeds, the finetuned model will be downloaded from the GCS bucket used in training above. Thus, an additional ~10 minutes (depending on the model sizes) of waiting time is needed **after** the model deployment step above succeeds and before you run the next step below. Otherwise you might see a `ServiceUnavailable: 503 502:Bad Gateway` error when you send requests to the endpoint.

# `merged_model_output_dir` is set by the Finetune cell; it is used as the
# artifact location of the deployed model.
print("Deploying models in: ", merged_model_output_dir)

# Find the accelerators and regions supported by Vertex AI prediction here:
# https://cloud.google.com/vertex-ai/docs/predictions/configure-compute
# Sets 1 L4 (24G) to deploy Gemma models.
# These names are also assigned in the Finetune cell, so after this cell runs
# they hold the serving values rather than the training ones.
machine_type = "g2-standard-8"
accelerator_type = "NVIDIA_L4"
accelerator_count = 1

model, endpoint = deploy_model_vllm(
    model_name=get_job_name_with_datetime(prefix="gemma-vllm-serve"),
    model_id=merged_model_output_dir,
    service_account=SERVICE_ACCOUNT,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
)
print("endpoint_name:", endpoint.name)

In [None]:
# @title Predict
# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts.

# @markdown Example:

# @markdown ```
# @markdown Human: What is a car?
# @markdown Assistant:  A car, or a motor car, is a road-connected human-transportation system used to move people or goods from one place to another. The term also encompasses a wide range of vehicles, including motorboats, trains, and aircrafts. Cars typically have four wheels, a cabin for passengers, and an engine or motor. They have been around since the early 19th century and are now one of the most popular forms of transportation, used for daily commuting, shopping, and other purposes.
# @markdown ```

# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoint.name` allows us to get the
#   endpoint name of the endpoint `endpoint` created in the cell
#   above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint.

# endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoint = aiplatform.Endpoint(aip_endpoint_name)


# Request parameters; each one is sent unchanged in the request instance below.
prompt = "What is a car?"  # @param {type: "string"}
max_tokens = 50  # @param {type:"integer"}
temperature = 1.0  # @param {type:"number"}
top_p = 1.0  # @param {type:"number"}
top_k = 1.0  # @param {type:"number"}


# Overrides the max_tokens and top_k parameters during inference.
# If you encounter an issue like `ServiceUnavailable: 503 Took too long to respond when processing`,
# you can reduce the max length, for example by setting max_tokens to 20.
instances = [
    {
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
    },
]
# `endpoint` is the one created in the Deploy cell unless the block above is
# uncommented to load an existing endpoint.
response = endpoint.predict(instances=instances)

# Print one prediction per request instance.
for prediction in response.predictions:
    print(prediction)

## Finetune with KerasNLP PEFT and Deploy with HexLLM on TPUs

In [None]:
# @title Set Dataset

# @markdown For the training dataset, you can
# @markdown  - either use a pre-built TensorFlow
# @markdown    dataset such as this [imdb_reviews](https://www.tensorflow.org/datasets/catalog/imdb_reviews) dataset
# @markdown  - or your own dataset in [JSONL format](https://jsonlines.org/examples/) such as this [databricks-dolly-15K](https://huggingface.co/datasets/databricks/databricks-dolly-15k/blob/main/databricks-dolly-15k.jsonl) JSONL file. [See license information here.](https://huggingface.co/datasets/databricks/databricks-dolly-15k#licenseattribution)

# @markdown Whether you use a TensorFlow dataset or a JSONL dataset, each data-item
# @markdown will be in the form of a dictionary containing multiple key-value pairs. For example,
# @markdown [the `imdb_reviews` data-item dictionary](https://www.tensorflow.org/datasets/catalog/imdb_reviews#imdb_reviewsplain_text_default_config)
# @markdown contains keys `text` and `label` and [the `databricks-dolly-15k` data-item dictionary](https://huggingface.co/datasets/databricks/databricks-dolly-15k) contains
# @markdown keys `instruction`, `context`, `response`, and  `category`.
# @markdown However, the Gemma model only takes a single string as a training example. To arbitrarily select and combine
# @markdown multiple key-values into a single training string, you can set a `template` variable in the next section.
# @markdown For example, for the [`databricks-dolly-15k`](https://huggingface.co/datasets/databricks/databricks-dolly-15k), you can set the
# @markdown `template` as `Instruction: {instruction} Response: {response}`
# @markdown which will then automatically fill each `instruction` and `response` key-values into this string template
# @markdown and generate a single training string which will look like:

# @markdown ```
# @markdown  Instruction: Why can camels survive for long without water? Response: Camels use the fat in their humps to keep them filled with energy and hydration for long periods of time.
# @markdown  ```

# @markdown And for the [imdb_reviews](https://www.tensorflow.org/datasets/catalog/imdb_reviews#imdb_reviewsplain_text_default_config) you can set the `template` as `{text}`
# @markdown which will then select each `text` key-value as a single training string which will look like:

# @markdown ```
# @markdown I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot development was constant. Constantly slow and boring. Things seemed to happen, but with no explanation of what was causing them or why. I admit, I may have missed part of the film, but i watched the majority of it and everything just seemed to happen of its own accord without any real concern for anything else. I cant recommend this film at all.
# @markdown ```

# @markdown Set a template suitable for the selected dataset whether TensorFlow Dataset or JSONL format. The following value is
# @markdown set for the `databricks-dolly-15k` dataset.
template = "Instruction: {instruction} Response: {response}"  # @param {type:"string"}

# @markdown ### Fill only one of the sections below:
# @markdown ---

# @markdown For example `imdb_reviews`.

# The TensorFlow dataset name.
tfds_dataset_name = ""  # @param {type:"string"}
# The dataset split to use.
tfds_dataset_split = "train"  # @param {type:"string"}
# @markdown ---

# @markdown or

# @markdown ---

# @markdown `jsonl_dataset_file` can be your Cloud Storage path
# @markdown such as `<BUCKET_URI>/<path-to-your-jsonl-file>` or a link to an online JSONL file
# @markdown in which case the code here will download and then copy the file to `BUCKET_URI`.
# @markdown If you want to upload a JSONL dataset to [Google Cloud Storage](https://cloud.google.com/storage) by yourself, then follow the instructions on [Upload objects from a filesystem](https://cloud.google.com/storage/docs/uploading-objects). Then, set `jsonl_dataset_file` to the `gs://` URI to your JSONL file such as `gs://cloud-samples-data/vertex-ai/model-evaluation/peft_train_sample.jsonl`.

jsonl_dataset_file = "https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl"  # @param {type:"string"}
# @markdown ---

# @markdown Click `Show code` to see more details.

assert (
    tfds_dataset_name or jsonl_dataset_file
), "Please fill in either `tfds_dataset_name` or `jsonl_dataset_file`."
assert not (
    tfds_dataset_name and jsonl_dataset_file
), "Please fill in only one of `tfds_dataset_name` or `jsonl_dataset_file`."

# Download the JSONL dataset.
jsonl_dataset_uri_gcsfuse = ""
if jsonl_dataset_file:
    if jsonl_dataset_file.startswith("gs://"):
        # Using cloud storage location.
        jsonl_dataset_uri = jsonl_dataset_file
    else:
        # Download the file and copy to cloud storage.
        !wget -O dataset.jsonl $jsonl_dataset_file
        jsonl_dataset_uri = f"{BUCKET_URI}/dataset.jsonl"
        print("Copying dataset.jsonl to ", jsonl_dataset_uri)
        !gsutil cp dataset.jsonl $jsonl_dataset_uri
        print("JSONL url copied to: ", jsonl_dataset_uri)
    jsonl_dataset_uri_gcsfuse = jsonl_dataset_uri.replace("gs://", "/gcs/")

In [None]:
# @title Finetune
# @markdown Use the Vertex AI SDK to create and run the custom training jobs.
# @markdown The training job uses TPU V3 8 cores and takes around 10 mins to
# @markdown finish once it starts running.
# @markdown Click `View backing custom job` link in the output of this cell to follow training job progress.
# @markdown **Note that to make the training run faster, only a subset of the dataset (2000 examples) is used here during finetuning and the finetuning runs for just one epoch. To improve the performance of the model, use more training samples, finetune for more epochs and experiment with increasing the LoRA rank.**
# @markdown Click `Show code` to see more details.

# Base Gemma model to finetune.
model_type = "gemma_2b_en"  # @param["gemma_2b_en"]
# @markdown Set `num_train_subset_samples` as `-1` to use all the training samples.
num_train_subset_samples = 2000  # @param {type:"integer"}
# How many epochs to train for.
num_epochs = 1  # @param{type:"integer"}
# Learning rate passed to the training job.
learning_rate = 5e-5  # @param{type:"number"}
# Weight decay passed to the training job.
weight_decay = 0.01  # @param{type:"number"}
# Length of the input sequences; it drives the memory required by the model.
input_sequence_length = 512  # @param{type:"integer"}
# Rank used for LoRA.
lora_rank = 4  # @param{type:"integer"}
# Training batch size.
train_batch_size = 2  # @param{type:"integer"}
# File name of the KerasNLP checkpoint.
# Note: This is a bare file name; do not include a folder.
checkpoint_filename = "fine_tuned.weights.h5"  # @param{type:"string"}

# Worker pool spec: one replica with 8 TPU V3 cores.
# NOTE: The models have been tested only with 8 cores.
replica_count = 1
machine_type = "cloud-tpu"
accelerator_type = "TPU_V3"
accelerator_count = 8
# Model parallelism parameters chosen for 8 cores.
model_parallel_batch_dim, model_parallel_model_dim = 1, 8

# Name the job and derive from it the GCS folder that receives the finetuned
# model, together with the `/gcs/` form of that folder handed to the job.
job_name = get_job_name_with_datetime("gemma-keras-lora-train")
output_folder = os.path.join(BUCKET_URI, job_name)
output_folder_gcsfuse = output_folder.replace("gs://", "/gcs/")

# Create the custom container training job.
train_job = aiplatform.CustomContainerTrainingJob(
    display_name=job_name,
    container_uri=KERAS_TRAIN_DOCKER_URI,
)

# Launch the job. Every mapping entry below is rendered as one
# `--<flag>=<value>` argument, in the order listed.
train_job.run(
    args=[
        f"--{flag}={value}"
        for flag, value in {
            "model_type": model_type,
            "num_epochs": num_epochs,
            "learning_rate": learning_rate,
            "weight_decay": weight_decay,
            "input_sequence_length": input_sequence_length,
            "lora_rank": lora_rank,
            "model_parallel_batch_dim": model_parallel_batch_dim,
            "model_parallel_model_dim": model_parallel_model_dim,
            "tfds_dataset_name": tfds_dataset_name,
            "tfds_dataset_split": tfds_dataset_split,
            "jsonl_dataset_file": jsonl_dataset_uri_gcsfuse,
            "template": template,
            "train_batch_size": train_batch_size,
            "num_train_subset_samples": num_train_subset_samples,
            "output_folder": output_folder_gcsfuse,
            "checkpoint_filename": checkpoint_filename,
        }.items()
    ],
    environment_variables=dict(
        KAGGLE_USERNAME=KAGGLE_USERNAME,
        KAGGLE_KEY=KAGGLE_KEY,
    ),
    replica_count=replica_count,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    service_account=SERVICE_ACCOUNT,
)

print("Trained model is saved in: ", output_folder)

In [None]:
# @title Convert model
# @markdown Convert the KerasNLP model checkpoint to Hex-LLM format.
# @markdown  Use the Vertex AI SDK to create and run the custom job.
# @markdown Click `View backing custom job` link in the output of this cell to follow job progress.
# @markdown  The job takes around 6 mins to finish.
# @markdown  Click `Show code` to see more details.

# Size tag for each model type; the selected one is passed to the conversion
# job as `--size`.
model_type_to_size = dict(gemma_2b_en="2b", gemma_7b_en="7b")
size = model_type_to_size[model_type]
# NOTE: The Hex-LLM serving code looks for the model type tag in
# the checkpoint filename, so the suffix is chosen per model type.
model_type_to_file_suffix = dict(
    gemma_2b_en="_gemma-2b.ckpt",
    gemma_7b_en="_gemma-7b.ckpt",
)
hexllm_checkpoint_file = f"finetuned_hexllm{model_type_to_file_suffix[model_type]}"

# Worker pool spec: a single replica; no accelerator is requested for this job.
replica_count = 1
machine_type = "n1-highmem-16"

# Name and create the conversion job.
job_name = get_job_name_with_datetime("gemma-keras-model-conversion")
conversion_job = aiplatform.CustomContainerTrainingJob(
    display_name=job_name,
    container_uri=KERAS_MODEL_CONVERSION_DOCKER_URI,
)

# Launch the conversion. It reads the KerasNLP weights from the finetuning
# output folder and writes the Hex-LLM checkpoint into the same folder.
conversion_job.run(
    args=[
        f"--weights_file={output_folder_gcsfuse}/{checkpoint_filename}",
        f"--size={size}",
        f"--output_file={output_folder_gcsfuse}/{hexllm_checkpoint_file}",
    ],
    environment_variables=dict(
        KAGGLE_USERNAME=KAGGLE_USERNAME,
        KAGGLE_KEY=KAGGLE_KEY,
    ),
    replica_count=replica_count,
    machine_type=machine_type,
    service_account=SERVICE_ACCOUNT,
)

print(
    "Converted Hexllm checkpoint is saved in: ",
    f"{output_folder}/{hexllm_checkpoint_file}",
)

In [None]:
# @title Deploy
# @markdown This section uploads the model to Model Registry and deploys it on the Endpoint. It takes 15 minutes to 1 hour to finish.
# @markdown   **Hex-LLM** is a **H**igh-**E**fficiency **L**arge **L**anguage **M**odel (LLM) TPU serving solution built with **XLA**. This notebook uses TPU v5e 8 cores. Click `Show code` to see more details.

# Choose the serving machine from the model size:
# - ct5lp-hightpu-1t (1 TPU chip) to deploy Gemma 2B models.
# - ct5lp-hightpu-4t (4 TPU chips) to deploy Gemma 7B models.
machine_type = "ct5lp-hightpu-1t" if "2b" in model_type else "ct5lp-hightpu-4t"

# Raising max_num_batched_tokens increases the TPU memory required.
max_num_batched_tokens = 11264
# Padding alignment, as a multiple of tokens. Higher values can reduce
# re-compilation at the price of more wasted computation.
tokens_pad_multiple = 1024
# Padding alignment, as a multiple of sequences. Higher values can reduce
# re-compilation at the price of more wasted computation.
seqs_pad_multiple = 32

print("Using model from: ", output_folder)
# Upload the model from the finetuning output folder and deploy it to an endpoint.
model, endpoint = deploy_model_hexllm(
    model_name=get_job_name_with_datetime(prefix="gemma-serve-hexllm"),
    model_id=output_folder,
    service_account=SERVICE_ACCOUNT,
    machine_type=machine_type,
    max_num_batched_tokens=max_num_batched_tokens,
    tokens_pad_multiple=tokens_pad_multiple,
    seqs_pad_multiple=seqs_pad_multiple,
)
print("endpoint_name:", endpoint.name)

In [None]:
# @title Predict
# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts based on your `template`.

# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoint.name` allows us to get the
#   endpoint name of the endpoint `endpoint` created in the cell
#   above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint.

# endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoint = aiplatform.Endpoint(aip_endpoint_name)

# If you encounter an issue like `ServiceUnavailable: 503 Took too long to respond when processing`,
# you can reduce the maximum output length, for example by setting `max_tokens` to 20.

# @markdown **Note:** The following input corresponds to the default `template` set for the `databricks-dolly-15k` which uses `instruction` and `response` keys.
# @markdown   If you modify the `template` or use another dataset, then modify the `prompt` accordingly. For example for the `imdb_reviews` dataset  where `template = "{text}"`, set `prompt = "Inception is "`.
prompt = "Instruction: What should I do on a trip to Europe? Response: "  # @param {type: "string"}
max_tokens = 50  # @param {type:"integer"}

# @markdown **Note that the first few prompts will take longer to execute.**

# A single request instance carrying the prompt and the generation length cap.
instances = [dict(prompt=prompt, max_tokens=max_tokens)]
prediction_response = endpoint.predict(instances=instances)

# Print each returned prediction on its own line.
for prediction in prediction_response.predictions:
    print(prediction)

## Clean up resources

In [None]:
# Tear down the resources created by the cells above. The statements run in
# order and nothing is caught, so a failure stops the clean-up at that point.

# Delete the train job.
train_job.delete()

# Delete the conversion job.
# The truthiness check only skips a falsy `conversion_job`; the name must
# already exist (the "Convert model" cell assigns it).
if conversion_job:
    conversion_job.delete()

# Undeploy model and delete endpoint.
# This runs before the model deletion below.
endpoint.delete(force=True)

# Delete models.
model.delete()

# Delete Cloud Storage objects that were created
# Opt-in: set `delete_bucket = True` to run the recursive removal below.
# NOTE(review): `STAGING_BUCKET` is not defined in the cells of this section,
# which use `BUCKET_URI` - confirm it is set earlier in the notebook.
delete_bucket = False
if delete_bucket:
    ! gsutil -m rm -r $STAGING_BUCKET