In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - LLaMA2 (Evaluation)

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_llama2_evaluation.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_llama2_evaluation.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/model_garden/model_garden_pytorch_llama2_evaluation.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
Open in Vertex AI Workbench
    </a> (A Python-3 CPU notebook is recommended)
  </td>
</table>

## Overview

This notebook demonstrates downloading prebuilt [LLaMA2 models](https://huggingface.co/meta-llama) and evaluating LLaMA2 models with popular benchmark datasets through Vertex CustomJobs using [EleutherAI's evaluation harness](https://github.com/EleutherAI/lm-evaluation-harness).

### Objective

- Download prebuilt LLaMA2 models
- Evaluate the LLaMA2 models on any of the benchmark datasets

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin

### Colab only
Run the following commands for Colab and skip this section if you are using Workbench.

In [None]:
import sys

# Colab-only bootstrap: the body runs only when the `google.colab` module has
# already been loaded into this interpreter.
if "google.colab" in sys.modules:
    # Install/upgrade the Vertex AI SDK and the display dependencies.
    ! pip3 install --upgrade google-cloud-aiplatform
    ! pip3 install ipython pandas[output_formatting]
    from google.colab import auth as google_auth

    # Authenticate the Colab user so later Google Cloud calls can use their credentials.
    google_auth.authenticate_user()

    # Restart the notebook kernel after installs.
    import IPython

    app = IPython.Application.instance()
    # Passing True asks for a restart (per the comment above) rather than a plain shutdown.
    app.kernel.do_shutdown(True)

### Workbench only
If you are using Workbench, you should find that the necessary dependencies are already pre-installed. If this is not the case or if you have previously modified the existing libraries, you may install the dependencies using the following commands:
```
! pip3 install --upgrade google-cloud-aiplatform
! pip3 install ipython pandas[output_formatting]
```

### Setup Google Cloud project

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

1. [Enable the Vertex AI API and Compute Engine API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,compute_component).

1. [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs.

1. [Create a service account](https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console) with the `Vertex AI User` and `Storage Object Admin` roles for deploying fine-tuned models to a Vertex AI endpoint.

### Import the necessary packages

In [None]:
import json
import os
import sys
from datetime import datetime

from google.cloud import aiplatform, storage

Set the following variables for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the specified region (`REGION`). Note that a multi-region bucket (e.g. "us") is not considered a match for a single region covered by the multi-region range (e.g. "us-central1").

In [None]:
# Cloud project id.
PROJECT_ID = ""  # @param {type:"string"}

# Region for launching jobs.
REGION = ""  # @param {type:"string"}

# Cloud Storage bucket for storing experiments output.
# Start with gs:// prefix, e.g. gs://foo_bucket.
BUCKET_URI = "gs://"  # @param {type:"string"}

! gcloud config set project $PROJECT_ID


STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")
EXPERIMENT_BUCKET = os.path.join(BUCKET_URI, "peft")
BASE_MODEL_BUCKET = os.path.join(EXPERIMENT_BUCKET, "base_model")
MODEL_BUCKET = os.path.join(EXPERIMENT_BUCKET, "model")

# The service account looks like:
# '@.iam.gserviceaccount.com'
# Please go to https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console
# and create service account with `Vertex AI User` and `Storage Object Admin` roles.
# The service account for deploying fine tuned model.
SERVICE_ACCOUNT = ""  # @param {type:"string"}

if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user(project_id=PROJECT_ID)

### Initialize Vertex AI API

In [None]:
# Point the Vertex AI SDK at the configured project, region and staging bucket.
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=STAGING_BUCKET,
)

### Define constants

In [None]:
# Container image used to run the evaluation job (registry path + pinned tag).
EVAL_DOCKER_URI = (
    "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/"
    "pytorch-lm-evaluation-harness:20231011_0934_RC00"
)

### Define common functions

In [None]:
def get_job_name_with_datetime(prefix: str) -> str:
    """Builds a job name by appending the current local timestamp to `prefix`.

    Args:
        prefix: Leading part of the job name.

    Returns:
        `prefix` followed by `_YYYYMMDD_HHMMSS` taken from `datetime.now()`.
    """
    timestamp_suffix = datetime.now().strftime("_%Y%m%d_%H%M%S")
    return "".join((prefix, timestamp_suffix))

## Access LLaMA2 pretrained and finetuned models
The original models from Meta are converted into the Hugging Face format for finetuning and serving in Vertex AI.

Accept the model agreement to access the models:
1. Navigate to the Vertex AI > Model Garden page in the Google Cloud console
2. Find the LLaMA2 model card and click on "VIEW DETAILS"
3. Review the agreement on the model card page
4. After you accept the LLaMA2 agreement, a Cloud Storage bucket containing the LLaMA2 pretrained and finetuned models will be shared with you
5. Paste the Cloud Storage bucket link below and assign it to `VERTEX_AI_MODEL_GARDEN_LLAMA2`

In [None]:
# Cloud Storage path of the shared LLaMA2 model artifacts. Paste it here once
# you have accepted the LLaMA2 agreement in Vertex AI Model Garden.
VERTEX_AI_MODEL_GARDEN_LLAMA2 = ""  # Shared after you accept the LLaMA2 agreement in Vertex AI Model Garden.
# Stop here if the path is still empty; the copy below cannot work without it.
assert (
    VERTEX_AI_MODEL_GARDEN_LLAMA2
), "Please click the agreement of LLaMA2 in Vertex AI Model Garden, and get the GCS path of LLaMA2 model artifacts."
print(
    "Copy LLaMA2 model artifacts from",
    VERTEX_AI_MODEL_GARDEN_LLAMA2,
    "to ",
    BASE_MODEL_BUCKET,
)
# Recursively copy everything under the shared path into BASE_MODEL_BUCKET.
! gsutil -m cp -R $VERTEX_AI_MODEL_GARDEN_LLAMA2/* $BASE_MODEL_BUCKET

Set the base model id.

In [None]:
# Name of the LLaMA2 variant to evaluate (Colab renders the options as a dropdown).
base_model_name = "llama2-7b-chat-hf"  # @param ["llama2-7b-hf", "llama2-7b-chat-hf", "llama2-13b-hf", "llama2-13b-chat-hf", "llama2-70b-hf", "llama2-70b-chat-hf"]
# Location of the selected variant under BASE_MODEL_BUCKET.
base_model_id = os.path.join(BASE_MODEL_BUCKET, base_model_name)

## Evaluate PEFT-finetuned LLaMA 2 models

This section demonstrates evaluation of LLaMA2 models using EleutherAI's [Language Model Evaluation Harness (lm-evaluation-harness)](https://github.com/EleutherAI/lm-evaluation-harness) with Vertex Custom Job. Please refer to the peak GPU memory usages for serving and adjust the machine type, accelerator type and accelerator count accordingly.

This example uses the dataset [TruthfulQA](https://arxiv.org/abs/2109.07958).
All the supported tasks are listed in [this task table](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md).

In [None]:
# Set the machine_type, accelerator_type, accelerator_count and benchmark dataset.
# `eval_dataset` is the task name handed to the evaluation harness.
eval_dataset = "truthfulqa_mc"  # @param ["truthfulqa_mc", "boolq", "gsm8k", "hellaswag", "natural_questions", "openai_humaneval", "openbookqa", "quac", "trivia_qa", "winogrande"]

# Worker pool spec.
# Find Vertex AI supported accelerators and regions in:
#  https://cloud.google.com/vertex-ai/docs/training/configure-compute
# Exactly one of the presets below should be active; uncomment the one that
# matches the size of the selected model and comment out the others.

# Sets V100 (16G) to evaluate LLaMA2 7B models.
machine_type = "n1-standard-8"
accelerator_type = "NVIDIA_TESLA_V100"
accelerator_count = 1

# Sets 1 L4 (24G) to evaluate LLaMA2 7B models.
# machine_type = "g2-standard-8"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 1

# Sets 2 V100 (16G) to evaluate LLaMA2 13B models.
# machine_type = "n1-standard-8"
# accelerator_type = "NVIDIA_TESLA_V100"
# accelerator_count = 2

# Sets 2 L4 (24G) to evaluate LLaMA2 13B models.
# machine_type = "g2-standard-24"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 2

# Sets A100 (40G) to evaluate LLaMA2 13B models.
# machine_type = "a2-highgpu-1g"
# accelerator_type = "NVIDIA_TESLA_A100"
# accelerator_count = 1

# Sets 8 L4 (24G) to evaluate LLaMA2 70B models.
# machine_type = "g2-standard-96"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 8

# Sets 4 A100 (40G) to evaluate LLaMA2 70B models.
# machine_type = "a2-highgpu-4g"
# accelerator_type = "NVIDIA_TESLA_A100"
# accelerator_count = 4

# Sets 2 A100 (80G) to evaluate LLaMA2 70B models for faster inferences.
# The machine type must match the accelerator count: a2-ultragpu-2g carries 2 GPUs.
# machine_type = "a2-ultragpu-2g"
# accelerator_type = "NVIDIA_TESLA_A100_80GB"
# accelerator_count = 2

# Number of worker replicas for the evaluation job.
replica_count = 1

In [None]:
# Setup evaluation job: a timestamped name and a matching output location
# under MODEL_BUCKET.
job_name = get_job_name_with_datetime(prefix="llama2-eval")
eval_output_dir = os.path.join(MODEL_BUCKET, job_name)

# The same locations rewritten with the `gs://` scheme replaced by `/gcs/`
# (presumably the Cloud Storage FUSE mount point inside the job container,
# as the `_gcsfuse` suffix suggests).
base_model_id_gcsfuse, eval_output_dir_gcsfuse = (
    uri.replace("gs://", "/gcs/") for uri in (base_model_id, eval_output_dir)
)

In [None]:
# Prepare evaluation script that runs the evaluation harness.
# Set `trust_remote_code = True` because evaluating the model requires
# executing code from the model repository.
# Set `use_accelerate = True` to enable evaluation across multiple GPUs.
script_path = "./eval_script.py"  # @param {type:"string"}


# This notebook evaluates the base model only, so no `peft=` adapter path is
# passed in `--model_args` (no adapter directory is defined in this notebook).
# The generated script exits with the harness' return code so that a failed
# evaluation makes the job fail instead of being reported as a success.
eval_command = f"""import subprocess
import sys


sys.exit(subprocess.call([
    'python',
    'main.py',
    '--model',
    'hf-causal-experimental',
    '--model_args',
    'pretrained={base_model_id_gcsfuse},trust_remote_code=True,use_accelerate=True,device_map_option=auto',
    '--tasks',
    '{eval_dataset}',
    '--output_path',
    '{eval_output_dir_gcsfuse}',
]))
"""

# Write the generated script to disk so it can be submitted as a local script.
with open(script_path, "w") as fp:
    fp.write(eval_command)

In [None]:
# Run the evaluation job: wrap the generated script in a Vertex AI custom job
# that uses the evaluation container and the hardware chosen earlier.
eval_job = aiplatform.CustomJob.from_local_script(
    # What to run and where its outputs go.
    display_name=job_name,
    script_path=script_path,
    container_uri=EVAL_DOCKER_URI,
    base_output_dir=eval_output_dir,
    # Worker pool hardware.
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    replica_count=replica_count,
    boot_disk_size_gb=500,
)

# Launch the job.
eval_job.run()

print("Evaluation results were saved in:", eval_output_dir)

In [None]:
# Fetch evaluation results.
storage_client = storage.Client()
# Split the result URI itself into bucket name and object path. Deriving both
# from `eval_output_dir` stays correct when BUCKET_URI carries a trailing slash
# or a sub-path, where using BUCKET_URI as the bucket name would fail.
BUCKET_NAME, RESULT_FILE_PATH = eval_output_dir[len("gs://") :].split("/", 1)
bucket = storage_client.get_bucket(BUCKET_NAME)
blob = bucket.blob(RESULT_FILE_PATH)
# `download_as_bytes` replaces the deprecated `download_as_string` alias.
raw_result = blob.download_as_bytes()

# Print evaluation results.
result = json.loads(raw_result)
result_formatted = json.dumps(result, indent=2)
print(f"Evaluation result:\n{result_formatted}")

## Clean up resources

In [None]:
# Delete the evaluation job.
eval_job.delete()

# Delete Cloud Storage objects that were created
# Deletion is opt-in: set `delete_bucket` to True to recursively remove
# everything under EXPERIMENT_BUCKET.
delete_bucket = False
if delete_bucket:
    ! gsutil -m rm -r $EXPERIMENT_BUCKET