In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Dolly V2

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_dolly_v2.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_dolly_v2.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/model_garden/model_garden_pytorch_dolly_v2.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
Open in Vertex AI Workbench
    </a>
    (a Python-3 CPU notebook is recommended)
  </td>
</table>

## Overview

This notebook demonstrates downloading and deploying the pre-trained [Dolly-V2-7b](https://huggingface.co/databricks/dolly-v2-7b) model on Vertex AI for online prediction. It also demonstrates the evaluation of popular benchmark datasets through Vertex CustomJobs using [EleutherAI’s evaluation harness](https://github.com/EleutherAI/lm-evaluation-harness).


### Objective

- Upload the model to [Model Registry](https://cloud.google.com/vertex-ai/docs/model-registry/introduction).
- Deploy the model on [Endpoint](https://cloud.google.com/vertex-ai/docs/predictions/using-private-endpoints).
- Run predictions by giving instructions or by QnA.
- Run the evaluation on any of the benchmark datasets (hellaswag, openbookqa, etc.) of your choice and get the evaluation results.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage pricing](https://cloud.google.com/storage/pricing).

Use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Setup environment

**NOTE**: Jupyter runs lines prefixed with `!` as shell commands, and it interpolates Python variables prefixed with `$` into these commands.

### Import the required packages

In [None]:
import json
import os
from datetime import datetime

from google.cloud import storage

### Colab only

In [None]:
# Colab only: upgrade the google-cloud-aiplatform package via a shell command.
!pip3 install --upgrade google-cloud-aiplatform

In [None]:
import sys

# Only go through Colab's authentication when the notebook is actually running
# inside Colab. On other runtimes (e.g. Vertex AI Workbench) importing
# `google.colab` would fail and break a "Run All" execution, so the step is
# skipped there.
if "google.colab" in sys.modules:
    from google.colab import auth as google_auth

    google_auth.authenticate_user()

### Setup Google Cloud project

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

1. [Enable the Vertex AI API and Compute Engine API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,compute_component).

1. [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs.

Fill in the following variables for the experiment environment:

In [None]:
# ID of the Google Cloud project used by this notebook.
PROJECT_ID = ""  # @param {type:"string"}

# Region in which jobs are launched.
REGION = ""  # @param {type:"string"}

# Name of the Cloud Storage bucket that receives the experiment outputs.
# Enter the bare bucket name, i.e. without the 'gs://' prefix.
GCS_BUCKET = ""  # @param {type:"string"}

# Full Cloud Storage URI of the bucket above.
BUCKET_URI = f"gs://{GCS_BUCKET}"

Initialize Vertex AI API:

In [None]:
from google.cloud import aiplatform

# `staging_bucket` is documented as a "gs://..." URI, so pass the full bucket
# URI rather than the bare bucket name.
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

### Define constants

In [None]:
# Pre-built docker images used for serving and for evaluation.
# The model artifacts are embedded within the container, except for model
# weights which will be downloaded during deployment.
SERVE_DOCKER_URI = (
    "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/"
    "pytorch-dolly-v2-serve"
)
EVAL_DOCKER_URI = (
    "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/"
    "pytorch-lm-evaluation-harness"
)

### Define common functions

In [None]:
def deploy_model(
    model_id: str,
    machine_type: str = "a2-highgpu-1g",
    accelerator_type: str = "NVIDIA_TESLA_A100",
    accelerator_count: int = 1,
) -> tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads the model to Vertex AI and deploys it to a new endpoint.

    Args:
        model_id: Model identifier handed to the serving container through the
            `MODEL_ID` environment variable. When it starts with "gs://" it is
            additionally passed as the uploaded model's `artifact_uri`.
        machine_type: Machine type used for the deployment. Defaults to the
            value this notebook has always used.
        accelerator_type: Accelerator type attached to the deployment.
        accelerator_count: Number of accelerators attached to the deployment.

    Returns:
        A `(model, endpoint)` tuple: the uploaded model and the endpoint it was
        deployed to.
    """
    model_name = "dolly_v2"
    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")
    serving_env = {
        "MODEL_ID": model_id,
    }
    # If the model_id is a GCS path, use artifact_uri to pass it to serving docker.
    artifact_uri = model_id if model_id.startswith("gs://") else None
    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=SERVE_DOCKER_URI,
        serving_container_ports=[7080],
        serving_container_predict_route="/predictions/transformers_serving",
        serving_container_health_route="/ping",
        serving_container_environment_variables=serving_env,
        artifact_uri=artifact_uri,
    )
    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
    )
    return model, endpoint

In [None]:
def get_job_name_with_datetime(prefix: str) -> str:
    """Builds a job name by appending the current date and time to `prefix`.

    The appended suffix has the form `_YYYYMMDD_HHMMSS`.
    """
    timestamp_suffix = datetime.now().strftime("_%Y%m%d_%H%M%S")
    return "".join((prefix, timestamp_suffix))

## Upload and deploy models

This section uploads the pre-trained model to Model Registry and deploys it on the Endpoint with 1 A100 GPU.

The model deployment step will take ~15 minutes to complete.

In [None]:
# Dolly V2 variant to upload and serve.
MODEL_ID = "databricks/dolly-v2-3b"  # @param ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"]
# Keep both handles: the endpoint is used for prediction, and both are needed for clean-up.
model, endpoint = deploy_model(MODEL_ID)

NOTE: The model weights will be downloaded after the deployment succeeds. Thus, an additional ~10 minutes of waiting time is needed **after** the above model deployment step succeeds and before you run the next step below. Otherwise, you might see a `ServiceUnavailable: 503 502:Bad Gateway` error when you send requests to the endpoint.

## Send prediction request

In [None]:
# Prompt sent to the deployed model.
input_text = "Explain to me the difference between nuclear fission and fusion."

# Each instance is a dict carrying the prompt under the "text" key.
instances = [{"text": input_text}]

# Query the endpoint and show the returned predictions.
response = endpoint.predict(instances=instances)
preds = response.predictions
print(preds)

## Evaluation

This section demonstrates how to evaluate the Dolly V2 models using EleutherAI's [Language Model Evaluation Harness (lm-evaluation-harness)](https://github.com/EleutherAI/lm-evaluation-harness) with Vertex Custom Job. Please refer to the peak GPU memory usage for serving and adjust the machine type, accelerator type and accelerator count accordingly.

This example uses the dataset [HellaSwag](https://allenai.org/data/hellaswag). All supported tasks are listed in [this task table](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md).

### Define the variables

In [None]:
# Benchmark dataset and worker pool spec (machine_type, accelerator_type,
# accelerator_count, replica_count) for the evaluation job.

# Benchmark task to evaluate on.
eval_dataset = "hellaswag"  # @param ["arc_challenge", "arc_easy", "boolq", "hellaswag", "openbookqa", "piqa", "winogrande"]

# Vertex AI supported accelerators and regions are listed in:
# https://cloud.google.com/vertex-ai/docs/training/configure-compute
replica_count = 1

# A V100 is used to evaluate Dolly V2 as an example.
machine_type = "n1-standard-8"
accelerator_type = "NVIDIA_TESLA_V100"
# Use 1 accelerator for dolly-v2-3b & dolly-v2-7b; set this to 2 for dolly-v2-12b.
accelerator_count = 1

In [None]:
# Setup evaluation job.
job_name = get_job_name_with_datetime(prefix="dolly-v2")
eval_output_dir = os.path.join(BUCKET_URI, job_name)
eval_output_dir_gcsfuse = eval_output_dir.replace("gs://", "/gcs/")

### Prepare the evaluation script

In [None]:
# Prepare evaluation script that runs the evaluation harness.
# We set `trust_remote_code = True` because evaluating the model requires
# executing code from the model repository.
# We set `use_accelerate = True` to enable evaluation across multiple GPUs.
script_path = "./eval_script.py"  # @param {type:"string"}


# The generated script launches the harness with `subprocess.check_call` so
# that a non-zero exit status raises and makes the script fail, instead of the
# failure being silently ignored.
eval_command = f"""import subprocess


subprocess.check_call([
    'python',
    'main.py',
    '--model',
    'hf-causal-experimental',
    '--model_args',
    'pretrained={MODEL_ID},trust_remote_code=True,use_accelerate=True,device_map_option=auto',
    '--tasks',
    '{eval_dataset}',
    '--output_path',
    '{eval_output_dir_gcsfuse}',
])
"""

# Write the generated script to disk so it can be submitted as a local script.
with open(script_path, "w") as fp:
    fp.write(eval_command)

### Run the evaluation

In [None]:
# Wrap the local evaluation script in a Vertex AI CustomJob that uses the
# evaluation container and the worker pool settings defined earlier.
eval_job = aiplatform.CustomJob.from_local_script(
    display_name=job_name,
    container_uri=EVAL_DOCKER_URI,
    script_path=script_path,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    replica_count=replica_count,
    boot_disk_size_gb=500,
    base_output_dir=eval_output_dir,
)

# Launch the evaluation job.
eval_job.run()

print("Evaluation results were saved in:", eval_output_dir)

In [None]:
# Fetch evaluation results.
storage_client = storage.Client()
BUCKET_NAME = GCS_BUCKET
bucket = storage_client.get_bucket(BUCKET_NAME)
RESULT_FILE_PATH = eval_output_dir[len(BUCKET_URI) + 1 :]
blob = bucket.blob(RESULT_FILE_PATH)
raw_result = blob.download_as_string()

# Print evaluation results.
result = json.loads(raw_result)
result_formatted = json.dumps(result, indent=2)
print(f"Evaluation result:\n{result_formatted}")

## Clean up

In [None]:
# Release the resources created by this notebook.

# Undeploy model and delete endpoint.
# NOTE(review): keep this before model.delete() — presumably a model cannot be
# deleted while it is still deployed to an endpoint; confirm.
endpoint.delete(force=True)

# Delete models.
model.delete()

# Delete evaluation jobs.
eval_job.delete()