In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - OpenLLaMA (PEFT)
<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_openllama_peft.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_openllama_peft.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/model_garden/model_garden_pytorch_openllama_peft.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
Open in Vertex AI Workbench
    </a> (A Python-3 GPU notebook is recommended)
  </td>
</table>

## Overview

This notebook demonstrates running local inference with prebuilt OpenLLaMA, deploying prebuilt OpenLLaMA, deploying prebuilt OpenLLaMA with [vLLM](https://github.com/vllm-project/vllm), finetuning and deploying OpenLLaMA with parameter-efficient finetuning libraries ([PEFT](https://github.com/huggingface/peft)), quantizing and deploying OpenLLaMA with AWQ or GPTQ, and evaluating PEFT-finetuned OpenLLaMA in Vertex AI.

### Objective

- Run local inference with prebuilt OpenLLaMA
- Deploy prebuilt OpenLLaMA
- Deploy prebuilt OpenLLaMA with [vLLM](https://github.com/vllm-project/vllm) to improve serving throughput
- Finetune and deploy OpenLLaMA with PEFT
- Quantize and deploy OpenLLaMA models with AWQ or GPTQ
- Evaluate finetuned OpenLLaMA with PEFT

| Models | LoRA |
| :- | :- |
| [openlm-research/open_llama_3b](https://huggingface.co/openlm-research/open_llama_3b) | Y |
| [openlm-research/open_llama_7b](https://huggingface.co/openlm-research/open_llama_7b) | Y |
| [openlm-research/open_llama_13b](https://huggingface.co/openlm-research/open_llama_13b) | Y |

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin

**NOTE**: Jupyter runs lines prefixed with `!` as shell commands, and it interpolates Python variables prefixed with `$` into these commands.

Running local inference with OpenLLaMA requires a GPU.

### Colab only
Run the following commands for Colab and skip this section if you are using Workbench.

In [None]:
import sys

if "google.colab" in sys.modules:
    ! pip3 install --upgrade google-cloud-aiplatform
    from google.colab import auth as google_auth

    google_auth.authenticate_user()

    # Restart the notebook kernel after installs.
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)
! pip3 install transformers==4.31.0
! pip3 install sentencepiece==0.1.99
! pip3 install accelerate==0.21.0

### Setup Google Cloud project

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

1. [Enable the Vertex AI API and Compute Engine API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,compute_component).

1. [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs.

1. [Create a service account](https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console) with `Vertex AI User` and `Storage Object Admin` roles for deploying fine tuned model to Vertex AI endpoint.

Set the following variables for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the specified region (`REGION`). Note that a multi-region bucket (eg. "us") is not considered a match for a single region covered by the multi-region range (eg. "us-central1").

In [None]:
# Cloud project id.
PROJECT_ID = ""  # @param {type:"string"}

# The region you want to launch jobs in.
REGION = ""  # @param {type:"string"}

# The Cloud Storage bucket for storing experiments output.
# Start with gs:// prefix, e.g. gs://foo_bucket.
BUCKET_URI = "gs://"  # @param {type:"string"}

! gcloud config set project $PROJECT_ID

import os

STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")
EXPERIMENT_BUCKET = os.path.join(BUCKET_URI, "peft")
DATA_BUCKET = os.path.join(EXPERIMENT_BUCKET, "data")
MODEL_BUCKET = os.path.join(EXPERIMENT_BUCKET, "model")

# The service account looks like:
# '@.iam.gserviceaccount.com'
# Please go to https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console
# and create service account with `Vertex AI User` and `Storage Object Admin` roles.
# The service account for deploying fine tuned model.
SERVICE_ACCOUNT = ""  # @param {type:"string"}

### Initialize Vertex AI API

In [None]:
from google.cloud import aiplatform

# Point the Vertex AI SDK at the project, region, and staging bucket set in
# the configuration cell.
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=STAGING_BUCKET,
)

### Define constants

In [None]:
# The pre-built training, serving, and evaluation docker images. They all live
# in the same repository; only the image name and tag differ.
_MODEL_GARDEN_DOCKER_REPO = (
    "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers"
)

TRAIN_DOCKER_URI = f"{_MODEL_GARDEN_DOCKER_REPO}/pytorch-peft-train:20231130_0936_RC00"
PREDICTION_DOCKER_URI = f"{_MODEL_GARDEN_DOCKER_REPO}/pytorch-peft-serve:20231130_0948_RC00"
VLLM_DOCKER_URI = f"{_MODEL_GARDEN_DOCKER_REPO}/pytorch-vllm-serve:20231127_0916_RC00"
VLLM_GPTQ_DOCKER_URI = f"{_MODEL_GARDEN_DOCKER_REPO}/pytorch-vllm-serve:gptq"
EVAL_DOCKER_URI = f"{_MODEL_GARDEN_DOCKER_REPO}/pytorch-lm-evaluation-harness:20231011_0934_RC00"

### Define common functions

In [None]:
from datetime import datetime
from typing import Tuple

from google.cloud import aiplatform


def create_name_with_datetime(prefix: str) -> str:
    """Returns `prefix` followed by the current local time as `_YYYYMMDD_HHMMSS`.

    Used to name training and deployment jobs in Vertex AI so that each run
    gets a distinguishable name.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return "_".join((prefix, timestamp))


def deploy_model(
    model_name: str,
    base_model_id: str,
    finetuned_lora_model_path: str,
    service_account: str,
    task: str,
    precision_loading_mode: str = "float16",
    machine_type: str = "n1-standard-8",
    accelerator_type: str = "NVIDIA_TESLA_V100",
    accelerator_count: int = 1,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a model served by the PEFT container and deploys it to a new endpoint.

    Args:
        model_name: Display name of the uploaded model; the endpoint is named
            `<model_name>-endpoint`.
        base_model_id: Passed to the container as `BASE_MODEL_ID`.
        finetuned_lora_model_path: Passed to the container as
            `FINETUNED_LORA_MODEL_PATH`. Omitted from the environment when
            empty.
        service_account: Service account the deployment runs as.
        task: Passed to the container as `TASK`.
        precision_loading_mode: Passed to the container as
            `PRECISION_LOADING_MODE`.
        machine_type: Machine type of the deployment.
        accelerator_type: Accelerator type of the deployment.
        accelerator_count: Number of accelerators of the deployment.

    Returns:
        The uploaded model and the endpoint it was deployed to.
    """
    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")

    container_env = dict(
        BASE_MODEL_ID=base_model_id,
        PRECISION_LOADING_MODE=precision_loading_mode,
        TASK=task,
    )
    # The adapter path is optional: only expose it to the container when set.
    if finetuned_lora_model_path:
        container_env.update(FINETUNED_LORA_MODEL_PATH=finetuned_lora_model_path)

    upload_kwargs = dict(
        display_name=model_name,
        serving_container_image_uri=PREDICTION_DOCKER_URI,
        serving_container_ports=[7080],
        serving_container_predict_route="/predictions/peft_serving",
        serving_container_health_route="/ping",
        serving_container_environment_variables=container_env,
    )
    model = aiplatform.Model.upload(**upload_kwargs)

    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
    )
    return model, endpoint


def deploy_model_vllm(
    model_name: str,
    model_id: str,
    service_account: str,
    machine_type: str = "n1-standard-8",
    accelerator_type: str = "NVIDIA_TESLA_V100",
    accelerator_count: int = 1,
    quantization_method: str = "",
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a model served by the vLLM container and deploys it to a new endpoint.

    Args:
        model_name: Display name of the uploaded model; the endpoint is named
            `<model_name>-endpoint`.
        model_id: Value of the vLLM `--model` argument.
        service_account: Service account the deployment runs as.
        machine_type: Machine type of the deployment.
        accelerator_type: Accelerator type of the deployment.
        accelerator_count: Number of accelerators of the deployment; also used
            as the vLLM `--tensor-parallel-size`.
        quantization_method: Optional value of the vLLM `--quantization`
            argument. `"gptq"` additionally selects the GPTQ serving image.

    Returns:
        The uploaded model and the endpoint it was deployed to.
    """
    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")

    server_args = [
        "--host=0.0.0.0",
        "--port=7080",
        f"--model={model_id}",
        f"--tensor-parallel-size={accelerator_count}",
        "--swap-space=16",
        "--gpu-memory-utilization=0.9",
        "--max-num-batched-tokens=4096",
        "--disable-log-stats",
    ]
    if quantization_method:
        server_args += [f"--quantization={quantization_method}"]

    # GPTQ models are served from a dedicated image; everything else uses the
    # default vLLM image.
    serving_image_uri = (
        VLLM_GPTQ_DOCKER_URI if quantization_method == "gptq" else VLLM_DOCKER_URI
    )

    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=serving_image_uri,
        serving_container_command=["python", "-m", "vllm.entrypoints.api_server"],
        serving_container_args=server_args,
        serving_container_ports=[7080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
    )

    deploy_kwargs = dict(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
    )
    model.deploy(**deploy_kwargs)
    return model, endpoint

## Run inferences locally with prebuilt OpenLLaMA

In [None]:
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

model_path = "openlm-research/open_llama_3b"

# Load the tokenizer and the half-precision model weights.
tokenizer = LlamaTokenizer.from_pretrained(model_path)
model = LlamaForCausalLM.from_pretrained(
    model_path, torch_dtype=torch.float16, device_map="auto"
)

# Tokenize the prompt, move it to the GPU, and generate up to 32 new tokens.
prompt = "Q: What is the largest animal?\nA:"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
generation_output = model.generate(input_ids=input_ids, max_new_tokens=32)

# Decode the first (and only) generated sequence back to text.
print(tokenizer.decode(generation_output[0]))

## Deploy Prebuilt OpenLLaMA with vLLM

This section deploys prebuilt OpenLLaMA models with [vLLM](https://github.com/vllm-project/vllm) on the Endpoint. The model deployment step will take ~15 minutes to complete.

vLLM is a highly optimized LLM serving framework that can significantly increase serving throughput. The higher QPS you have, the more performance benefits you get from using vLLM.

Set the prebuilt model id.

In [None]:
# Hugging Face id of the prebuilt OpenLLaMA model deployed in this section.
prebuilt_model_id = "openlm-research/open_llama_7b"  # @param ["openlm-research/open_llama_3b", "openlm-research/open_llama_7b", "openlm-research/open_llama_13b"]

In [None]:
# Finds Vertex AI prediction supported accelerators and regions in
# https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.

# Sets V100 to deploy open_llama_3b and open_llama_7b.
# V100 serving has better throughput and latency performance than L4 serving.
machine_type = "n1-standard-8"
accelerator_type = "NVIDIA_TESLA_V100"
accelerator_count = 1

# Sets L4 to deploy open_llama_3b and open_llama_7b.
# L4 serving is more cost efficient than V100 serving.
# machine_type = "g2-standard-8"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 1

# Sets 2 V100 to deploy open_llama_13b.
# V100 serving has better throughput and latency performance than L4 serving.
# machine_type = "n1-standard-16"
# accelerator_type = "NVIDIA_TESLA_V100"
# accelerator_count = 2

# Sets 2 L4 to deploy open_llama_13b.
# L4 serving is more cost efficient than V100 serving.
# machine_type = "g2-standard-24"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 2

if prebuilt_model_id == "openlm-research/open_llama_3b":
    # vLLM currently does not support OpenLLaMA 3B, so fall back to the
    # regular PEFT serving container for this model.
    precision_loading_mode = "float16"
    model_without_peft, endpoint_without_peft = deploy_model(
        model_name=create_name_with_datetime(prefix="openllama-serve"),
        # Deploy the prebuilt model selected for this section. `base_model_id`
        # belongs to the finetuning section and is not defined yet on a fresh
        # top-to-bottom run.
        base_model_id=prebuilt_model_id,
        finetuned_lora_model_path="",  # Empty: serve the base model without a LoRA adapter.
        service_account=SERVICE_ACCOUNT,
        task="causal-language-modeling-lora",
        precision_loading_mode=precision_loading_mode,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
    )
else:
    model_without_peft, endpoint_without_peft = deploy_model_vllm(
        model_name=create_name_with_datetime(prefix="openllama-serve-vllm"),
        model_id=prebuilt_model_id,
        service_account=SERVICE_ACCOUNT,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
    )

NOTE: The prebuilt model weights will be downloaded on the fly from the original location after the deployment succeeds. Thus, an additional 5 minutes of waiting time is needed **after** the above model deployment step succeeds and before you can run the next step below. Otherwise you might see a `ServiceUnavailable: 503 502:Bad Gateway` error when you send requests to the endpoint.

Once deployment succeeds, you can send requests to the endpoint with text prompts. If you are interested in additional serving parameters, please refer to the vLLM GitHub [examples/api_client.py](https://github.com/vllm-project/vllm/blob/main/examples/api_client.py) for more details.

In [None]:
# One request instance: the prompt plus the sampling parameters sent with it.
instance = dict(
    prompt="Hi, Google. How are you doing?",
    n=1,
    max_tokens=32,
    temperature=1.0,
    top_p=1.0,
    top_k=10,
)

# Send the single instance to the prebuilt-model endpoint and show its first
# prediction.
response = endpoint_without_peft.predict(instances=[instance])
print(response.predictions[0])

## Finetune and deploy OpenLLaMA with PEFT

This section demonstrates how to finetune the OpenLLaMA-7b model, merge the finetuned LoRA adapter with the base model, and serve using vLLM.

Set the base model id.

In [None]:
# Hugging Face id of the OpenLLaMA model to finetune in this section.
base_model_id = "openlm-research/open_llama_7b"  # @param ["openlm-research/open_llama_3b", "openlm-research/open_llama_7b", "openlm-research/open_llama_13b"]

### Finetune

Use the Vertex AI SDK to create and run the custom training jobs with Vertex AI Model Garden training images.

This example uses the dataset [Abirate/english_quotes](https://huggingface.co/datasets/Abirate/english_quotes). You can either use a [dataset from huggingface](https://huggingface.co/datasets) or a custom JSONL dataset in [Vertex text model dataset format](https://cloud.google.com/vertex-ai/docs/generative-ai/models/tune-text-models-supervised#dataset-format) stored in Cloud Storage. The `template` parameter is optional.

To make finetuning efficient, we enable quantization when loading pretrained models for finetuning LoRA models. Precision options include `"4bit"`, `"8bit"`, `"float16"` (default) and `"float32"`, and the precision can be set via `"--precision_mode"`. The peak GPU memory usages are ~7G, ~10G and ~16G for finetuning LoRA models for [openlm-research/open_llama_3b](https://huggingface.co/openlm-research/open_llama_3b), [openlm-research/open_llama_7b](https://huggingface.co/openlm-research/open_llama_7b), and [openlm-research/open_llama_13b](https://huggingface.co/openlm-research/open_llama_13b) respectively with default training parameters and the example dataset. `open_llama_3b` and `open_llama_7b` can be finetuned on **1 V100 (16G)** and **1 L4 (24G)**, and `open_llama_13b` can be finetuned on **1 L4 (24G)**.

In this section, the finetuned LoRA adapter is saved to the GCS path given by the variable `lora_output_dir` below. The LoRA adapter is then merged with the base model, and the merged model is saved to a separate GCS path given by `merged_model_output_dir` below.


#### [Optional] Finetune with a custom dataset

To use a custom dataset, you should supply a `gs://` URI to a JSONL file in [Vertex text model dataset format](https://cloud.google.com/vertex-ai/docs/generative-ai/models/tune-text-models-supervised#dataset-format) in the `dataset_name` below.

For example, here is one data point from the sample dataset `gs://cloud-samples-data/vertex-ai/model-evaluation/peft_train_sample.jsonl`:

```json
{"input_text":"TRANSCRIPT: \nREASON FOR EVALUATION:,\n\n LABEL:","output_text":"Chiropractic"}
```

To use this sample dataset that contains `input_text` and `output_text` fields, set `dataset_name` to `gs://cloud-samples-data/vertex-ai/model-evaluation/peft_train_sample.jsonl` and `template` to `vertex_sample`. For advanced usage with custom dataset fields, see [the template example](https://github.com/tloen/alpaca-lora/blob/main/templates/alpaca.json) and supply your own JSON template as `gs://` URIs.

In [None]:
# Huggingface dataset name or gs:// URI to a custom JSONL dataset.
dataset_name = "Abirate/english_quotes"  # @param {type:"string"}
# Optional. Template name or gs:// URI to a custom template.
template = ""  # @param {type:"string"}

# Worker pool spec. Keep exactly one of the configurations below active.

# 1 V100 (16G): finetunes open_llama_3b and open_llama_7b.
machine_type = "n1-standard-8"
accelerator_type = "NVIDIA_TESLA_V100"
accelerator_count = 1

# 1 L4 (24G): finetunes open_llama_3b and open_llama_7b.
# machine_type = "g2-standard-8"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 1

# 1 L4 (24G): finetunes open_llama_13b.
# machine_type = "g2-standard-8"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 1

# 1 A100 (40G): finetunes open_llama_13b.
# machine_type = "a2-highgpu-1g"
# accelerator_type = "NVIDIA_TESLA_A100"
# accelerator_count = 1

replica_count = 1


# Setup training job.
job_name = create_name_with_datetime("openllama-lora-train")
train_job = aiplatform.CustomContainerTrainingJob(
    display_name=job_name, container_uri=TRAIN_DOCKER_URI
)

# GCS folder that receives the LoRA adapter. The training arguments use the
# same location with the gs:// prefix swapped for /gcs/.
lora_adapter_dir = create_name_with_datetime("openllama-lora-adapter")
lora_output_dir = os.path.join(MODEL_BUCKET, lora_adapter_dir)
lora_output_dir_gcsfuse = lora_output_dir.replace("gs://", "/gcs/")

# GCS folder that receives the base model merged with the finetuned LoRA
# adapter, again with a /gcs/ counterpart for the training arguments.
merged_model_dir = create_name_with_datetime("openllama-merged-model")
merged_model_output_dir = os.path.join(MODEL_BUCKET, merged_model_dir)
merged_model_output_dir_gcsfuse = merged_model_output_dir.replace("gs://", "/gcs/")

# Command-line arguments handed to the training container.
train_args = [
    "--task=causal-language-modeling-lora",
    f"--pretrained_model_id={base_model_id}",
    f"--dataset_name={dataset_name}",
    f"--output_dir={lora_output_dir_gcsfuse}",
    f"--merge_base_and_lora_output_dir={merged_model_output_dir_gcsfuse}",
    "--lora_rank=16",
    "--lora_alpha=32",
    "--lora_dropout=0.05",
    "--warmup_steps=10",
    "--max_steps=10",
    "--learning_rate=2e-4",
    f"--template={template}",
]

# Launch the job on the selected worker pool.
train_job.run(
    args=train_args,
    replica_count=replica_count,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    boot_disk_size_gb=500,
)

print("The finetuned Lora adapter can be found at: ", lora_output_dir)
print(
    "The finetuned Lora adapter merged with the base model can be found at: ",
    merged_model_output_dir,
)

### [Optional] Hyperparameter tuning

You can use the Vertex AI SDK to create and run the [hyperparameter tuning job](https://cloud.google.com/vertex-ai/docs/training/hyperparameter-tuning-overview) to obtain a better performance by experimenting with different hyperparameters such as learning rates.

Define the following specifications:

- `worker_pool_specs`: Dictionary specifying the machine type and Docker image.

- `parameter_spec`: Dictionary specifying the parameters to optimize. The dictionary key is the string assigned to the command line argument for each hyperparameter in your training application code, and the dictionary value is the parameter specification. The parameter specification includes the type, min/max values, and scale for the hyperparameter.

- `metric_spec`: Dictionary specifying the metric to optimize. The dictionary key is the hyperparameter_metric_tag that you set in your training application code, and the value is the optimization goal.

The following 4bit QLoRA experiment results show the effectiveness of hyperparameter tuning evaluated on the ARC Challenge dataset (for reference only):

| Model         | Training time | Trials | Parallel Trials | GPU  | ∆arc challenge | ∆hellaswag | ∆truthfulqa_mc | cost     |
|---------------|---------------|--------|-----------------|------|----------------|------------|----------------|----------|
| Openllama-3b  | 2d 10hrs      | 8      | 1               | L4x1 | +1.62          | +7.32      | +3.34          | \$29.0232 |
| Openllama-7b  | 1d 4hrs       | 8      | 2               | L4x1 | +2.82          | +3.55      | +6.68          | \$47.8016 |
| Openllama-13b | 6d 10hrs      | 8      | 2               | L4x1 | +1.01          | +3.67      | +6.19          | \$87.9208 |

The following example runs 8 trials on `timdettmers/openassistant-guanaco` with different learning rates, and evaluates the model on `arc_challenge` dataset. You can customize the search space by extending the range of learning rates, adding other parameters such as LoRA rank, etc. Please refer to the [hyperparameter tuning documentation](https://cloud.google.com/vertex-ai/docs/training/hyperparameter-tuning-overview) for more information.

In [None]:
# Settings for the optional hyperparameter tuning run. This cell reassigns
# `dataset_name`, `template`, and the worker pool variables that the
# finetuning cell also sets.

# Huggingface dataset name or gs:// URI to a custom JSONL dataset.
dataset_name = "timdettmers/openassistant-guanaco"  # @param {type:"string"}
# Optional. Template name or gs:// URI to a custom template.
template = ""  # @param {type:"string"}

# Precision mode passed to the trainer as `precision_mode` in the tuning job.
hpt_precision_mode = "4bit"

# Worker pool spec for 4bit finetuning.

# Finetunes Openllama 3B / 7B / 13B with 1 L4 (24G).
machine_type = "g2-standard-8"
accelerator_type = "NVIDIA_L4"
accelerator_count = 1

### [Optional] Custom evaluation dataset

To obtain a model with better performance on some specific tasks, you might want to run hyperparameter tuning with a custom evaluation dataset. The hyperparameter tuning service will pick the model according to the evaluation dataset and the metrics you selected. You can use any of the following tasks as the `eval_task` in the code cell below:

1. The name of a [lm-evaluation-harness task](https://github.com/EleutherAI/lm-evaluation-harness/tree/big-refactor/lm_eval/tasks).

2. `custom_likelihood`. Then, add a flag `--eval_dataset_path=<Cloud Storage URI to your JSONL dataset>`. The JSONL file must be in the format in Vertex AI language model's [prepare evaluation dataset](https://cloud.google.com/vertex-ai/docs/generative-ai/models/evaluate-models#classification) page.

3. `builtin_eval`. The built-in evaluation loop of the trainer will be used to evaluate the model instead of the [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) library. You can supply any eval dataset in the same format as the training dataset by specifying `--eval_dataset_path`, `--eval_split`, `--eval_template`, and `--eval_column`.

In [None]:
from google.cloud.aiplatform import hyperparameter_tuning as hpt

eval_task = "arc_challenge"  # @param {type:"string"}
eval_metric_name = "acc_norm"  # @param {type:"string"}

# Runs 10 training steps as a minimal example. Use 1000 to reproduce the experiment results.
max_steps = 10  # @param {type:"integer"}
# Evaluates the model on 10 examples. Use 10000 to reproduce the experiment results.
eval_limit = 10  # @param {type:"integer"}

# Trainer flags shared by every trial. `learning_rate` is also listed in
# `parameter_spec` below as the hyperparameter to tune.
flags = dict(
    learning_rate=1e-5,
    precision_mode=hpt_precision_mode,
    task="instruct-lora",
    pretrained_model_id=base_model_id,
    output_dir=lora_output_dir_gcsfuse,
    warmup_steps=10,
    max_steps=max_steps,
    lora_rank=32,
    lora_alpha=64,
    lora_dropout=0.05,
    dataset_name=dataset_name,
    eval_steps=max_steps + 1,  # Only evaluates in the end.
    eval_tasks=eval_task,
    eval_limit=eval_limit,
    eval_metric_name=eval_metric_name,
    merge_base_and_lora_output_dir=merged_model_output_dir_gcsfuse,
)

# A single worker pool that runs the training image with one `--name=value`
# argument per flag.
machine_spec = {
    "machine_type": machine_type,
    "accelerator_type": accelerator_type,
    "accelerator_count": accelerator_count,
}
container_spec = {
    "image_uri": TRAIN_DOCKER_URI,
    "args": [f"--{name}={value}" for name, value in flags.items()],
}
worker_pool_specs = [
    {
        "machine_spec": machine_spec,
        "replica_count": replica_count,
        "container_spec": container_spec,
    }
]

# Maximize the reported `model_performance` metric while searching the
# learning rate on a linear scale.
metric_spec = {"model_performance": "maximize"}
parameter_spec = {
    "learning_rate": hpt.DoubleParameterSpec(min=1e-5, max=1e-4, scale="linear"),
}

train_job = aiplatform.CustomJob(
    display_name=job_name,
    worker_pool_specs=worker_pool_specs,
    staging_bucket=STAGING_BUCKET,
)

# 8 trials in total, at most 2 running at the same time.
train_hpt_job = aiplatform.HyperparameterTuningJob(
    display_name=f"{job_name}_hpt",
    custom_job=train_job,
    metric_spec=metric_spec,
    parameter_spec=parameter_spec,
    max_trial_count=8,
    parallel_trial_count=2,
)

train_hpt_job.run()

print("Trained models were saved in: ", lora_output_dir)

Then, find the best trial from the hyperparameter tuning job.

In [None]:
# Only trials that reported a final measurement can be ranked. A trial without
# metrics (e.g. one that failed) would make `metrics[0]` raise an IndexError.
# NOTE(review): assumes `final_measurement.metrics` is an empty sequence for
# such trials -- confirm against the SDK.
completed_trials = [
    trial for trial in train_hpt_job.trials if trial.final_measurement.metrics
]
if not completed_trials:
    raise ValueError(
        "No hyperparameter tuning trial reported a final metric; "
        "check the tuning job before selecting a best trial."
    )

# The tuning job maximizes its metric, so the best trial is the one with the
# largest final value.
best_trial_id = max(
    completed_trials, key=lambda trial: trial.final_measurement.metrics[0].value
).id

# Point `lora_output_dir` at the best trial's sub-folder. The guard keeps the
# cell idempotent: re-running it must not append `trial_<id>` a second time.
best_trial_dirname = f"trial_{best_trial_id}"
if os.path.basename(lora_output_dir.rstrip("/")) != best_trial_dirname:
    lora_output_dir = os.path.join(lora_output_dir, best_trial_dirname)
lora_output_dir_gcsfuse = lora_output_dir.replace("gs://", "/gcs/")
print(f"Best trial {best_trial_id} saved model in:", lora_output_dir)

### Deploy with vLLM
This section uploads the model to Model Registry and deploys it on the Endpoint. vLLM currently does not support serving finetuned [openlm-research/open_llama_3b](https://huggingface.co/openlm-research/open_llama_3b).

The model deployment step will take ~15 minutes to complete.

The peak GPU memory usages for [openlm-research/open_llama_3b](https://huggingface.co/openlm-research/open_llama_3b), [openlm-research/open_llama_7b](https://huggingface.co/openlm-research/open_llama_7b), and [openlm-research/open_llama_13b](https://huggingface.co/openlm-research/open_llama_13b) with LoRA weights are ~5.3G, ~8.7G and ~15.2G respectively with the default settings.

NOTE: vLLM requires a merged model that combines the base model and the finetuned LoRA adapter. If you need the base model and the finetuned LoRA weights to be served separately, consider using the regular Vertex serving instead.


In [None]:
# Finds Vertex AI prediction supported accelerators and regions in
# https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.

# Sets V100 to deploy open_llama_3b and open_llama_7b.
# V100 serving has better throughput and latency performance than L4 serving.
machine_type = "n1-standard-8"
accelerator_type = "NVIDIA_TESLA_V100"
accelerator_count = 1

# Sets L4 to deploy open_llama_3b and open_llama_7b.
# L4 serving is more cost efficient than V100 serving.
# machine_type = "g2-standard-8"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 1

# Sets 2 V100 to deploy open_llama_13b.
# V100 serving has better throughput and latency performance than L4 serving.
# machine_type = "n1-standard-16"
# accelerator_type = "NVIDIA_TESLA_V100"
# accelerator_count = 2

# Sets 2 L4 to deploy open_llama_13b.
# L4 serving is more cost efficient than V100 serving.
# machine_type = "g2-standard-24"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 2

# Branch on the model that was finetuned (`base_model_id`), not on the
# prebuilt model chosen in the earlier deployment section.
if base_model_id == "openlm-research/open_llama_3b":
    # vLLM currently does not support OpenLLaMA 3B, so serve the base model
    # together with the LoRA adapter through the regular PEFT container.
    precision_loading_mode = "float16"
    model_with_peft, endpoint_with_peft = deploy_model(
        model_name=create_name_with_datetime(prefix="openllama-peft-serve"),
        base_model_id=base_model_id,
        finetuned_lora_model_path=lora_output_dir,  # The finetuned LoRA adapter to serve.
        service_account=SERVICE_ACCOUNT,
        task="causal-language-modeling-lora",
        precision_loading_mode=precision_loading_mode,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
    )
else:
    # vLLM serves the merged model (base model + LoRA adapter).
    model_with_peft, endpoint_with_peft = deploy_model_vllm(
        model_name=create_name_with_datetime(prefix="openllama-peft-serve-vllm"),
        model_id=merged_model_output_dir,
        service_account=SERVICE_ACCOUNT,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
    )

print("endpoint_name:", endpoint_with_peft.name)

NOTE: After the deployment succeeds, the base model weights will be downloaded on the fly from the original location and LoRA model weights will be downloaded from the GCS bucket used in training above. Thus, an additional 5 minutes of waiting time is needed **after** the above model deployment step succeeds and before you can run the next step below. Otherwise you might see a `ServiceUnavailable: 503 502:Bad Gateway` error when you send requests to the endpoint.

Once deployment succeeds, you can send requests to the endpoint with text prompts. Parameters supported by vLLM can be found [here](https://github.com/vllm-project/vllm/blob/2e8e49fce3775e7704d413b2f02da6d7c99525c9/vllm/sampling_params.py#L23-L64).

In [None]:
# One request instance: the prompt plus the sampling parameters sent with it.
instance = dict(
    prompt="Hi, Google. How are you doing?",
    n=1,
    max_tokens=32,
    temperature=1.0,
    top_p=1.0,
    top_k=10,
)

# Send the single instance to the finetuned-model endpoint and show its first
# prediction.
response = endpoint_with_peft.predict(instances=[instance])
print(response.predictions[0])

### [Optional] Merge a previously trained LoRA adapter with the base model

This section demonstrates how to merge a previously trained LoRA adapter with a base model, and save the merged model to a GCS bucket. Please be aware that the LoRA adapter should be trained on the same base model.

In [None]:
merge_job_name = create_name_with_datetime(prefix="openllama-peft-merge")

# The base model to be merged upon. It can be a huggingface model id, or a GCS
# path where the base model was stored.
base_model_dir = "gs://"  # @param {type:"string"}
# The previously trained LoRA adapter. It needs to be stored in a GCS path.
finetuned_lora_adapter_dir = ""  # @param {type:"string"}

# The GCS path to save the merged model, plus the same location with the
# gs:// prefix swapped for /gcs/ for the job arguments.
merged_model_output_dir = os.path.join(MODEL_BUCKET, merge_job_name)
merged_model_output_dir_gcsfuse = merged_model_output_dir.replace("gs://", "/gcs/")

# Worker pool spec. Keep exactly one of the configurations below active.

# 1 V100 (16G): merges open_llama_3b and open_llama_7b.
machine_type = "n1-standard-8"
accelerator_type = "NVIDIA_TESLA_V100"

# 1 L4 (24G): merges open_llama_3b and open_llama_7b.
# machine_type = "g2-standard-8"
# accelerator_type = "NVIDIA_L4"

# 1 L4 (24G): merges open_llama_13b.
# machine_type = "g2-standard-8"
# accelerator_type = "NVIDIA_L4"

# 1 A100 (40G): merges open_llama_13b.
# machine_type = "a2-highgpu-1g"
# accelerator_type = "NVIDIA_TESLA_A100"

# Arguments for the training image's merge task.
merge_args = [
    "--task=merge-causal-language-model-lora",
    "--merge_model_precision_mode=float16",
    f"--pretrained_model_id={base_model_dir}",
    f"--finetuned_lora_model_dir={finetuned_lora_adapter_dir}",
    f"--merge_base_and_lora_output_dir={merged_model_output_dir_gcsfuse}",
]

# A single replica with one accelerator runs the merge.
worker_pool_specs = [
    {
        "machine_spec": {
            "machine_type": machine_type,
            "accelerator_type": accelerator_type,
            "accelerator_count": 1,
        },
        "replica_count": 1,
        "container_spec": {
            "image_uri": TRAIN_DOCKER_URI,
            "command": [],
            "args": merge_args,
        },
    }
]

merge_custom_job = aiplatform.CustomJob(
    display_name=merge_job_name,
    project=PROJECT_ID,
    worker_pool_specs=worker_pool_specs,
    staging_bucket=STAGING_BUCKET,
)

merge_custom_job.run()

print("The merged model is stored at: ", merged_model_output_dir)

## Quantize and deploy OpenLLaMA models

This section demonstrates post-training quantization of OpenLLaMA models with Vertex Custom Job. Quantization reduces the memory required by a model while attempting to retain the same performance. Two algorithms for doing so are AWQ and GPTQ. Read more about AWQ in the following publication: [AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration](https://arxiv.org/abs/2306.00978). Read more about GPTQ in the following publication: [GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers](https://arxiv.org/abs/2210.17323).

### Quantize OpenLLaMA models

Quantization reduces the amount of GPU memory required to serve a model by reducing the bit precision of the weights while minimizing the drop in performance. Serving quantized models on vLLM requires models to be quantized to 4 bits. It is recommended to first check whether a model has already been quantized and made publicly available: [AWQ](https://huggingface.co/TheBloke?search_models=-awq) and [GPTQ](https://huggingface.co/TheBloke?search_models=-gptq).

Quantizing models with AWQ with 1 NVIDIA_L4 GPU will take around
20 minutes for OpenLLaMA 3B, 30 minutes for OpenLLaMA 7B, and 1 hour for OpenLLaMA 13B.

Quantizing models with GPTQ with 1 NVIDIA_L4 GPU will take around 30 minutes for OpenLLaMA 3B, 45 minutes for OpenLLaMA 7B, and 1.5 hours for OpenLLaMA 13B. Finetuned models can also be quantized, so long as the LoRA weights are merged with the base model.

In [None]:
# Setup quantization job.

# Set `finetuned_model_path` to `merged_model_output_dir` from the previous
# section above to quantize the finetuned model. If it is left empty, the base
# model is quantized instead.
finetuned_model_path = ""  # @param {type:"string"}
prequantized_model_path = finetuned_model_path or base_model_id

quantization_method = "awq"  # @param ["awq", "gptq"]
# NOTE(review): `create_name_with_datetime` is the naming helper used by the
# merge and evaluation cells; it replaces `get_job_name_with_datetime` here so
# the notebook relies on a single helper. Confirm it is defined in the setup
# section.
quantization_job_name = create_name_with_datetime(
    prefix=f"openllama-{quantization_method}-quantize"
)

# GCS destination of the quantized model, and the same path in its `/gcs/`
# form, which is the one passed to the container.
quantization_output_dir = os.path.join(MODEL_BUCKET, quantization_job_name)
quantization_output_dir_gcsfuse = quantization_output_dir.replace("gs://", "/gcs/")

# Worker pool spec.

# Sets 1 L4 (24G) to quantize OpenLLaMA model.
machine_type = "g2-standard-16"
accelerator_type = "NVIDIA_L4"
accelerator_count = 1


# Quantization parameters.
quantization_precision_mode = "4bit"
# Arguments shared by both quantization methods. The model that is quantized
# is `prequantized_model_path`, so that a finetuned model is actually used
# when `finetuned_model_path` is set (previously the base model was always
# passed here).
quantization_args = [
    "--task=quantize-model",
    f"--quantization_method={quantization_method}",
    f"--pretrained_model_id={prequantized_model_path}",
    f"--quantization_precision_mode={quantization_precision_mode}",
    f"--quantization_output_dir={quantization_output_dir_gcsfuse}",
]
if quantization_method == "awq":
    awq_dataset_name = "pileval"
    group_size = 64
    quantization_args += [
        f"--quantization_dataset_name={awq_dataset_name}",
        f"--group_size={group_size}",
    ]
else:
    # The original datasets used in GPTQ paper ["wikitext2","c4","c4-new","ptb","ptb-new"].
    gptq_dataset_name = "c4"  # @param {type:"string"}
    # Not passed to the job; `quantization_precision_mode` above is the value
    # that is sent.
    gptq_precision_mode = "4bit"
    group_size = -1
    damp_percent = 0.1
    desc_act = True
    quantization_args += [
        f"--quantization_dataset_name={gptq_dataset_name}",
        f"--group_size={group_size}",
        f"--damp_percent={damp_percent}",
        f"--desc_act={desc_act}",
    ]

# Pass quantization arguments and launch job.
worker_pool_specs = [
    {
        "machine_spec": {
            "machine_type": machine_type,
            "accelerator_type": accelerator_type,
            "accelerator_count": accelerator_count,
        },
        "replica_count": 1,
        "disk_spec": {
            "boot_disk_type": "pd-ssd",
            "boot_disk_size_gb": 500,
        },
        "container_spec": {
            "image_uri": TRAIN_DOCKER_URI,
            "env": [
                {
                    "name": "PYTORCH_CUDA_ALLOC_CONF",
                    "value": "max_split_size_mb:32",
                },
            ],
            "command": [],
            "args": quantization_args,
        },
    }
]

print(f"Quantizing {prequantized_model_path}.")
quantize_job = aiplatform.CustomJob(
    display_name=quantization_job_name,
    project=PROJECT_ID,
    worker_pool_specs=worker_pool_specs,
    staging_bucket=STAGING_BUCKET,
)
quantize_job.run()

print("Quantized models were saved in: ", quantization_output_dir)

### Deploy quantized models
This section uploads the model to Model Registry and deploys it on the Endpoint.

The model deployment step will take 15 minutes to 1 hour to complete, depending on the model sizes.

In [None]:
# Find the accelerators and regions supported by Vertex AI prediction in
# https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.

# Sets 1 L4 (24G) to deploy OpenLLaMA models.
machine_type = "g2-standard-8"
accelerator_type = "NVIDIA_L4"
accelerator_count = 1


# NOTE(review): the serving path is chosen from `prebuilt_model_id`, while the
# quantization cell reads `base_model_id`; confirm both name the same model.
if prebuilt_model_id == "openlm-research/open_llama_3b":
    # vLLM currently does not support OpenLLaMA 3B.
    precision_loading_mode = "float16"
    model_quantized_vllm, endpoint_quantized_vllm = deploy_model(
        # Uses the same naming helper as the vLLM branch below (this call
        # previously used `get_job_name_with_datetime`).
        model_name=create_name_with_datetime(prefix="openllama-quantized-serve"),
        base_model_id=quantization_output_dir,
        # Left empty so that no finetuned LoRA model overrides the quantized
        # model.
        finetuned_lora_model_path="",
        service_account=SERVICE_ACCOUNT,
        task="causal-language-modeling-lora",
        precision_loading_mode=precision_loading_mode,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
    )
else:
    model_quantized_vllm, endpoint_quantized_vllm = deploy_model_vllm(
        model_name=create_name_with_datetime(prefix="openllama-quantized-serve-vllm"),
        model_id=quantization_output_dir,
        service_account=SERVICE_ACCOUNT,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
    )

NOTE: After the deployment succeeds, the model weights will be downloaded on the fly. Thus an additional 10 to 40 minutes (depending on the model size) of waiting time is needed **after** the above model deployment step succeeds and before you run the next step below. Otherwise you might see a `ServiceUnavailable: 503 502:Bad Gateway` error when you send requests to the endpoint.

Once deployment succeeds, you can send requests to the endpoint with text prompts.

Example:

```
Human: What is a car?
Assistant:  A car, or a motor car, is a road-connected human-transportation system used to move people or goods from one place to another. The term also encompasses a wide range of vehicles, including motorboats, trains, and aircrafts. Cars typically have four wheels, a cabin for passengers, and an engine or motor. They have been around since the early 19th century and are now one of the most popular forms of transportation, used for daily commuting, shopping, and other purposes.
```

In [None]:
# To reuse an endpoint that already exists, uncomment the block below:
# - `endpoint_name = endpoint_quantized_vllm.name` takes the name of the
#   endpoint `endpoint_quantized_vllm` created in the cell above.
# - Alternatively, set `endpoint_name = "1234567890123456789"` to load an
#   existing endpoint whose ID is 1234567890123456789.

# endpoint_name = endpoint_quantized_vllm.name
# # endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoint_quantized_vllm = aiplatform.Endpoint(aip_endpoint_name)


# Each instance carries its own sampling parameters.
# If you run into an error such as
# `ServiceUnavailable: 503 Took too long to respond when processing`,
# shorten the generation, for example by setting `max_tokens` to 20.
instances = [
    dict(
        prompt="What is a car?",
        max_tokens=50,
        temperature=1.0,
        top_p=1.0,
        top_k=10,
    ),
]
response = endpoint_quantized_vllm.predict(instances=instances)

# Print one prediction per line.
for prediction in response.predictions:
    print(prediction)

## Evaluate PEFT-finetuned OpenLLaMA

This section demonstrates how to evaluate the OpenLLaMA model finetuned with PEFT LoRA using EleutherAI's [Language Model Evaluation Harness (lm-evaluation-harness)](https://github.com/EleutherAI/lm-evaluation-harness) with Vertex CustomJob.

This example uses the dataset [TruthfulQA](https://arxiv.org/abs/2109.07958). All supported tasks are listed in [this task table](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md).

In [None]:
eval_dataset = "truthfulqa_mc"  # @param {type:"string"}

# Name of the evaluation job and the location of its output: the GCS path and
# the same path in its `/gcs/` form.
job_name = create_name_with_datetime(prefix="openllama-peft-eval")
eval_output_dir = os.path.join(MODEL_BUCKET, job_name)
eval_output_dir_gcsfuse = eval_output_dir.replace("gs://", "/gcs/")

# Worker pool spec. Uncomment exactly one of the alternatives below to change
# the hardware.
replica_count = 1

# Sets 1 V100 to evaluate open_llama_3b and open_llama_7b.
machine_type = "n1-standard-8"
accelerator_type = "NVIDIA_TESLA_V100"
accelerator_count = 1

# Sets 1 L4 to evaluate open_llama_3b and open_llama_7b.
# machine_type = "g2-standard-8"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 1

# Sets 2 V100 to evaluate open_llama_13b.
# machine_type = "n1-standard-8"
# accelerator_type = "NVIDIA_TESLA_V100"
# accelerator_count = 2

# Sets 2 L4 to evaluate open_llama_13b.
# machine_type = "g2-standard-24"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 2

In [None]:
# Prepare evaluation script that runs the evaluation harness.
script_path = "./eval_script.py"  # @param {type:"string"}

eval_command = f"""import subprocess


subprocess.call([
    'python',
    'main.py',
    '--model',
    'hf-causal-experimental',
    '--model_args',
    'pretrained={merged_model_output_dir_gcsfuse},use_accelerate=True',
    '--tasks',
    '{eval_dataset}',
    '--output_path',
    '{eval_output_dir_gcsfuse}',
])
"""

with open(script_path, "w") as fp:
    fp.write(eval_command)

### Submit evaluation CustomJob

In [None]:
# Pass evaluation arguments and launch job.
eval_job = aiplatform.CustomJob.from_local_script(
    display_name=job_name,
    script_path=script_path,
    container_uri=EVAL_DOCKER_URI,
    replica_count=replica_count,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    base_output_dir=eval_output_dir,
)

eval_job.run()

print("Evaluation results were saved in:", eval_output_dir)

### Fetch and print evaluation results

In [None]:
import json

from google.cloud import storage

# Fetch evaluation results.
storage_client = storage.Client()
# Split `eval_output_dir` ("gs://<bucket>/<object path>") itself into bucket
# name and object path. Slicing relative to BUCKET_URI gives a wrong bucket
# name or object path when BUCKET_URI has a trailing slash or a sub-path.
if not eval_output_dir.startswith("gs://"):
    raise ValueError(f"Expected a gs:// path, got: {eval_output_dir}")
BUCKET_NAME, RESULT_FILE_PATH = eval_output_dir[len("gs://") :].split("/", 1)
bucket = storage_client.get_bucket(BUCKET_NAME)
blob = bucket.blob(RESULT_FILE_PATH)
# `download_as_string` is a deprecated alias of `download_as_bytes`.
raw_result = blob.download_as_bytes()

# Print evaluation results.
result = json.loads(raw_result)
result_formatted = json.dumps(result, indent=2)
print(f"Evaluation result:\n{result_formatted}")

## Clean up resources

In [None]:
# Resources are looked up by name instead of being referenced directly, so
# that skipping an optional section (for example the LoRA merge or the
# quantization) does not abort the cleanup with a NameError and leave the
# remaining endpoints and models behind.
_cleanup_plan = [
    # Delete custom train, evaluation, quantization and merge jobs.
    (("train_job", "eval_job", "quantize_job", "merge_custom_job"), {}),
    # Undeploy models and delete endpoints.
    (
        ("endpoint_without_peft", "endpoint_with_peft", "endpoint_quantized_vllm"),
        {"force": True},
    ),
    # Delete models; this comes last, after their endpoints are gone.
    (("model_without_peft", "model_with_peft", "model_quantized_vllm"), {}),
]

for _resource_names, _delete_kwargs in _cleanup_plan:
    for _resource_name in _resource_names:
        _resource = globals().get(_resource_name)
        if _resource is None:
            print(f"Skipping {_resource_name}: it was not created in this session.")
            continue
        _resource.delete(**_delete_kwargs)