In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Get started with Gemma on Ray on Vertex AI

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_gemma_fine_tuning_batch_deployment_on_rov.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_gemma_fine_tuning_batch_deployment_on_rov.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/model_garden/model_garden_gemma_fine_tuning_batch_deployment_on_rov.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_gemma_fine_tuning_batch_deployment_on_rov.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

## Overview

This tutorial demonstrates how to use Ray on Vertex AI for fine-tuning and serving Gemma on Vertex AI.

Learn more about [Ray on Vertex AI](https://cloud.google.com/vertex-ai/docs/open-source/ray-on-vertex-ai/overview).

### Objective

In this tutorial, you'll learn how to distribute Gemma Supervised tuning on Ray on Vertex AI. Furthermore, you'll learn how to deploy the trained model seamlessly for offline predictions using Ray Data on Ray on Vertex AI.

This tutorial uses the following Google Cloud ML services and resources:

- Ray on Vertex AI

The steps performed include:

- Create a Ray cluster on Vertex AI
- Tune Gemma with Ray Train on Ray on Vertex AI
- Serve Gemma with Ray Data for offline predictions

### Dataset

The [Extreme Summarization (XSum) dataset](https://huggingface.co/datasets/EdinburghNLP/xsum) is a dataset about abstractive single-document summarization systems.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing),
and [Cloud Storage pricing](https://cloud.google.com/storage/pricing),
and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

<b>Note</b>: This tutorial uses the Ray Jobs API via public Ray Dashboard. The Ray dashboard address is accessible from outside the VPC, including the public internet. To learn more about  private versus public connectivity, see the [Private and public connectivity](https://cloud.google.com/vertex-ai/docs/open-source/ray-on-vertex-ai/create-cluster#private_and_public_connectivity) section in the [Create a Ray cluster on Vertex AI](https://cloud.google.com/vertex-ai/docs/open-source/ray-on-vertex-ai/create-cluster) documentation.

## Before you begin

### Set up your Google Cloud project

**The following steps are required, regardless of your notebook environment.**

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager).

2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

3. [Enable APIs](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,artifactregistry.googleapis.com,cloudbuild.googleapis.com).

4. If you are running this notebook locally, you need to install the [Cloud SDK](https://cloud.google.com/sdk).

### Installation

Install the following packages required to execute this notebook.

In [None]:
# Install the packages
import os

# Install into the user site-packages (`--user`) unless the IS_TESTING
# environment variable is set, in which case no extra flag is passed to pip.
if not os.getenv("IS_TESTING"):
    USER = "--user"
else:
    USER = ""

# Every package is pinned to an exact version; `-q` keeps pip quiet and
# `--no-warn-conflicts` silences dependency-conflict warnings.
! pip3 install {USER} google-cloud-aiplatform[ray]==1.48.0 -q --no-warn-conflicts
! pip3 install {USER} google-cloud-aiplatform[tensorboard]==1.48.0 -q --no-warn-conflicts
! pip3 install {USER} torch==2.2.1 datasets==2.17.0 transformers==4.38.1 evaluate==0.4.1 rouge-score==0.1.2 nltk==3.8.1 bitsandbytes==0.42.0 peft==0.8.2 accelerate==0.27.1 -q --no-warn-conflicts
! pip3 install {USER} tensorflow==2.15.0 -q --no-warn-conflicts
! pip3 install {USER} etils==1.5.0 fsspec==2023.10.0 gcsfs==2023.10.0 -q --no-warn-conflicts

### Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
import sys

# Restart the kernel only when running inside Google Colab, so the packages
# installed above are picked up by a fresh interpreter.
if "google.colab" in sys.modules:
    import IPython

    IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it is finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.


In [None]:
import sys

# Runs only when the notebook is executing inside Google Colab; other
# environments skip the interactive authentication step.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information


#### Project ID


In [None]:
# Replace the placeholder with your own Google Cloud project ID.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

# Set the project id
! gcloud config set project {PROJECT_ID}

#### Region

You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations).

In [None]:
# Region used below for the bucket, the Artifact Registry repository,
# Cloud Build and the Vertex AI SDK.
REGION = "us-central1"  # @param {type: "string"}

#### Timestamp

You create a timestamp to make resources you create unique in this tutorial.

In [None]:
from datetime import datetime

# Second-resolution local timestamp, later used as a suffix for the
# TensorBoard display name.
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

#### Cloud Storage bucket

Create a storage bucket to store intermediate artifacts such as datasets.

In [None]:
# Bucket name; the project ID is embedded in the default value.
BUCKET_NAME = f"your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

# gs:// form of the bucket, used by gsutil and as the Vertex AI staging bucket.
BUCKET_URI = f"gs://{BUCKET_NAME}"

In [None]:
# Create the bucket in REGION under PROJECT_ID.
! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}

#### Service Account

Set service account and grant the service account access to Vertex AI TensorBoard.

In [None]:
# Email of the service account that receives the IAM roles granted in the next cell.
SERVICE_ACCOUNT = "[your-service-account]"  # @param {type:"string"}

In [None]:
# Grant the service account the Storage Admin role on the project.
! gcloud projects add-iam-policy-binding {PROJECT_ID} \
   --member=serviceAccount:{SERVICE_ACCOUNT} \
   --role="roles/storage.admin"

# Grant the service account the Vertex AI User role on the project.
! gcloud projects add-iam-policy-binding {PROJECT_ID} \
   --member=serviceAccount:{SERVICE_ACCOUNT} \
   --role="roles/aiplatform.user"

### Set tutorial folder

Set up the folder to use in this tutorial.

In [None]:
from pathlib import Path as path

# Everything the tutorial writes locally lives under <cwd>/tutorial.
root_path = path.cwd()
tutorial_path = root_path / "tutorial"

# One sub-folder per kind of artifact.
data_path, src_path, experiments_path, models_path, build_path, tests_path = (
    tutorial_path / folder_name
    for folder_name in ("data", "src", "experiments", "models", "build", "tests")
)

# Create every folder (and missing parents); existing folders are left alone.
for _folder in (
    data_path,
    src_path,
    experiments_path,
    models_path,
    build_path,
    tests_path,
):
    _folder.mkdir(parents=True, exist_ok=True)

### Set a Ray cluster on Vertex AI

Before running the code below, make sure to [set up](https://cloud.google.com/vertex-ai/docs/open-source/ray-on-vertex-ai/set-up) Ray on Vertex AI.

In [None]:
import vertex_ray
from google.cloud import aiplatform as vertex_ai
from vertex_ray import NodeImages, Resources

#### Initialize Vertex AI SDK for Python

Initialize the Vertex AI SDK for Python for your project.

In [None]:
# Point the Vertex AI SDK at the project, region and staging bucket defined above.
vertex_ai.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

#### Build the custom cluster image

 It's necessary to utilize Ray Custom cluster image support since certain dependencies are required.

 To use a custom cluster image, you first need to build the image. The steps are:

*  Prepare the requirements file
*  Create the Dockerfile for the custom image
*  Create the Docker image repository
*  Build the Ray cluster custom image


##### Prepare the requirements file

Prepare the `requirements` file that includes the dependencies your Ray application needs to run.

In [None]:
# Pinned dependencies installed on top of the Ray base image for every cluster node.
requirements = """
ipython==8.22.2
torch==2.2.1
ray==2.10.0
ray[data]==2.10.0
ray[train]==2.10.0
ray[tune]==2.10.0
datasets==2.17.0
transformers==4.38.1
evaluate==0.4.1
rouge-score==0.1.2
nltk==3.8.1
accelerate==0.27.1
bitsandbytes==0.42.0
peft==0.8.2
trl==0.7.10
# flash-attn==2.5.5
pyarrow==15.0.2
fsspec==2023.10.0
gcsfs==2023.10.0
etils==1.7.0
importlib-resources==6.1.2
"""

# The `with` block closes the file on exit, so no explicit close() is needed.
with open(build_path / "requirements.txt", "w") as rfile:
    rfile.write(requirements)

##### Create the Dockerfile

Create the Dockerfile for the custom image by leveraging one of the prebuilt Ray on Vertex AI base images.


In [None]:
# Prebuilt Ray on Vertex AI image used as the FROM line of the custom Dockerfile.
# NOTE(review): the default tag is a Ray 2.9 image while requirements.txt pins
# ray==2.10.0 — confirm the version override is intended.
CUSTOM_BASE_IMAGE = "us-docker.pkg.dev/vertex-ai/training/ray-gpu.2-9.py310:latest"  # @param ["us-docker.pkg.dev/vertex-ai/training/ray-cpu.2-4.py310:latest", "us-docker.pkg.dev/vertex-ai/training/ray-cpu.2-9.py310:latest", "us-docker.pkg.dev/vertex-ai/training/ray-gpu.2-4.py310:latest", "us-docker.pkg.dev/vertex-ai/training/ray-gpu.2-9.py310:latest", "europe-docker.pkg.dev/vertex-ai/training/ray-cpu.2-4.py310:latest", "europe-docker.pkg.dev/vertex-ai/training/ray-cpu.2-9.py310:latest", "europe-docker.pkg.dev/vertex-ai/training/ray-gpu.2-4.py310:latest", "europe-docker.pkg.dev/vertex-ai/training/ray-gpu.2-9.py310:latest", "asia-docker.pkg.dev/vertex-ai/training/ray-cpu.2-4.py310:latest", "asia-docker.pkg.dev/vertex-ai/training/ray-cpu.2-9.py310:latest", "asia-docker.pkg.dev/vertex-ai/training/ray-gpu.2-4.py310:latest", "asia-docker.pkg.dev/vertex-ai/training/ray-gpu.2-9.py310:latest"] {allow-input: true}

In [None]:
# Dockerfile for the cluster image: start from the selected Ray base image and
# install the pinned requirements written to the build folder.
dockerfile = f"""
FROM {CUSTOM_BASE_IMAGE}

# Install training libraries.
ENV PIP_ROOT_USER_ACTION=ignore
COPY requirements.txt .
RUN pip install -r requirements.txt
"""

# The `with` block closes the file on exit, so no explicit close() is needed.
with open(build_path / "Dockerfile", "w") as image_file:
    image_file.write(dockerfile)

##### Create the Docker image repository

To store the custom cluster image, create a Docker repository in the Artifact Registry.

In [None]:
# Artifact Registry repository that stores the custom cluster image.
REPO_NAME = f"your-repo-name-{PROJECT_ID}-unique"  # @param {type:"string"}

In [None]:
# Create a Docker-format repository in REGION.
! gcloud artifacts repositories create {REPO_NAME} --repository-format=docker \
    --location={REGION} --description="Tutorial repository"

##### Build the Ray cluster custom image

Finally, build the Ray cluster custom image using Cloud Build.

In [None]:
# Full image path in the repository above; used as the Cloud Build tag and as
# the head/worker image of the Ray cluster.
NODE_TRAIN_IMAGE = f"{REGION}-docker.pkg.dev/{PROJECT_ID}/{REPO_NAME}/train"
# Cloud Build machine type passed to `gcloud builds submit`.
BUILD_MACHINE_TYPE = "E2_HIGHCPU_32"

In [None]:
# Build the image from the local build folder and tag it as NODE_TRAIN_IMAGE;
# the build is allowed up to 3600 seconds.
! gcloud builds submit --region={REGION} --tag={NODE_TRAIN_IMAGE} \
    --machine-type={BUILD_MACHINE_TYPE} --timeout=3600 {build_path}

#### Create the Ray cluster

Create the Ray cluster from the custom image using the Ray on Vertex AI SDK for Python.

In [None]:
# Name passed as `cluster_name` when the Ray cluster is created.
CLUSTER_NAME = f"your-cluster-name-{PROJECT_ID}-unique"  # @param {type:"string"}

##### Set the Ray cluster configuration

Use the Vertex AI Python SDK for Ray on Vertex AI to set the cluster configuration.

To know more about the cluster configuration, see the [documentation](https://cloud.google.com/vertex-ai/docs/open-source/ray-on-vertex-ai/create-cluster#ray-on-vertex-ai-sdk).

In [None]:
# Head node: machine type and number of nodes.
HEAD_NODE_MACHINE_TYPE = "n1-standard-16"  # @param {type:"string"}
HEAD_NODE_COUNT = 1  # @param {type:"integer"}

# Worker pool: machine type, number of nodes, and the accelerator type/count
# attached to each worker. WORKER_NODE_COUNT is also passed to the training
# job as `--num-workers`.
WORKER_NODE_MACHINE_TYPE = "a2-highgpu-1g"  # @param {type:"string"}
WORKER_NODE_COUNT = 1  # @param {type:"integer"}
WORKER_ACCELERATION_TYPE = "NVIDIA_TESLA_A100"  # @param {type:"string"}
WORKER_ACCELERATION_COUNT = 1  # @param {type:"integer"}

In [None]:
# Resource spec of the head node.
HEAD_NODE_TYPE = Resources(
    machine_type=HEAD_NODE_MACHINE_TYPE,
    node_count=HEAD_NODE_COUNT,
)

# Resource specs of the worker pools; a single GPU-backed pool is configured.
WORKER_NODE_TYPES = [
    Resources(
        machine_type=WORKER_NODE_MACHINE_TYPE,
        node_count=WORKER_NODE_COUNT,
        accelerator_type=WORKER_ACCELERATION_TYPE,
        accelerator_count=WORKER_ACCELERATION_COUNT,
    )
]

# Head and workers both run the custom image built above.
CUSTOM_IMAGES = NodeImages(
    head=NODE_TRAIN_IMAGE,
    worker=NODE_TRAIN_IMAGE,
)

##### Create the Ray cluster

Create the Ray cluster with the predefined custom configuration. Creating a cluster can take several minutes, depending on its configuration.

In [None]:
# Create the cluster from the node specs and custom images defined above; the
# value returned by the SDK is kept in `ray_cluster_name`.
ray_cluster_name = vertex_ray.create_ray_cluster(
    head_node_type=HEAD_NODE_TYPE,
    worker_node_types=WORKER_NODE_TYPES,
    custom_images=CUSTOM_IMAGES,
    cluster_name=CLUSTER_NAME,
)

##### Get the Ray cluster

Use the Ray on Vertex AI SDK for Python to get the Ray cluster.

In [None]:
# All Ray clusters visible in the project/region (kept for inspection).
ray_clusters = vertex_ray.list_ray_clusters()

# Look the cluster up by the resource name returned from `create_ray_cluster`
# rather than taking the last entry of the listing: the project may contain
# other clusters and the listing order is not something to rely on.
ray_cluster_resource_name = ray_cluster_name
ray_cluster = vertex_ray.get_ray_cluster(ray_cluster_resource_name)

In [None]:
# Show which cluster the rest of the notebook talks to.
print("Ray cluster on Vertex AI:", ray_cluster_resource_name)

### Import libraries

Import required libraries.

In [None]:
# General
import io
import logging
import os
import random
import shutil
import string
import time

import datasets
import evaluate
import pandas as pd
# Ray - Training
import ray
import torch
import transformers
from etils import epath
from google.cloud import storage
from huggingface_hub import login
from peft import PeftModel
from ray.job_submission import JobStatus, JobSubmissionClient
# Ray - Batch Serving
from ray.tune import ExperimentAnalysis
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [None]:
# Version of the Ray package imported in this notebook kernel.
print("Ray version: ", ray.__version__)

### Set variables

Initiate some tutorial variables.

In [None]:
# Training
HF_TOKEN = "[your-hugging-face-token]"  # @param {type:"string"}
EXPERIMENTS_FOLDER_URI = epath.Path(BUCKET_URI) / "experiments"
TENSORBOARD_NAME = f"rov-xsum-gemma-tb-{TIMESTAMP}"

# Serving
MODELS_PATH = epath.Path(BUCKET_URI) / "models"
PREDICTIONS_FOLDER_URI = epath.Path(BUCKET_URI) / "predictions"

### Define helpers

Define helper functions to monitor the status of a Ray job using the Ray Dashboard API and to read JSON files from Cloud Storage into a pandas DataFrame.

In [None]:
def monitor_job(client, job_id, poll_interval=60):
    """Poll a Ray job through the Ray Jobs API until it reaches a terminal state.

    Args:
        client: Job submission client exposing `get_job_status(job_id)`.
        job_id: Identifier of the submitted job; it is embedded in every log line.
        poll_interval: Seconds to wait between two status checks.

    Returns:
        The last job status observed: SUCCEEDED, FAILED or STOPPED.
    """

    # `force=True` replaces handlers installed by a previous call, so the log
    # prefix always carries the id of the job currently being monitored.
    logging.basicConfig(
        level=logging.INFO,
        format=f"%(asctime)s.%(msecs)03d %(levelname)s {job_id} -- %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        force=True,
    )

    while True:
        job_status = client.get_job_status(job_id)

        if job_status == JobStatus.SUCCEEDED:
            logging.info("Job succeeded!")
            break

        elif job_status == JobStatus.FAILED:
            logging.info("Job failed!")
            break

        elif job_status == JobStatus.STOPPED:
            # A stopped job never changes state again; without this branch the
            # loop would keep polling forever.
            logging.info("Job stopped!")
            break

        else:
            logging.info("Job is running...")
            time.sleep(poll_interval)

    return job_status


def read_json_files(bucket_name, prefix=None):
    """Load every JSON Lines blob under a Cloud Storage prefix into one DataFrame.

    Args:
        bucket_name: Name of the bucket to list.
        prefix: Optional object-name prefix restricting which blobs are listed.

    Returns:
        A DataFrame with the rows of all blobs whose name ends with ".json",
        re-indexed from 0. An empty DataFrame is returned when no blob matches.
    """

    # Set up storage client
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    frames = []
    for blob in bucket.list_blobs(prefix=prefix):
        if not blob.name.endswith(".json"):
            continue
        payload = blob.download_as_bytes().decode("utf-8")
        # Each blob holds one JSON document per line.
        with io.StringIO(payload) as json_file:
            frames.append(pd.read_json(json_file, lines=True))

    # `pd.concat` raises ValueError on an empty list, so handle "nothing found" explicitly.
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

### Libraries settings

Initiate some libraries settings.

In [None]:
# Authenticate against the Hugging Face Hub with the token set above.
login(token=HF_TOKEN)
# Silence the `datasets` progress bars in notebook output.
datasets.disable_progress_bar()
# Fix the random seed used by transformers for reproducibility.
transformers.set_seed(8)

### Create a Vertex AI TensorBoard instance

Create a Vertex AI TensorBoard instance for tracking and monitoring your tuning jobs.

In [None]:
# Create the TensorBoard instance that will receive the training logs.
tensorboard = vertex_ai.Tensorboard.create(
    display_name=TENSORBOARD_NAME, project=PROJECT_ID, location=REGION
)

# Re-initialize the SDK so the new TensorBoard instance is attached to it.
vertex_ai.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=BUCKET_URI,
    experiment_tensorboard=tensorboard,
)

## Fine-Tune Gemma with Ray Train

In this tutorial, you fine-tune Gemma 2B (`gemma-2b-it`) for summarizing newspaper articles using HuggingFace Transformer on Ray on Vertex AI. In an effort to make this notebook easily reproducible, you write a simple Python `trainer.py` script and submit it to the Ray cluster on Vertex AI using the Ray Jobs API via the public Ray Dashboard.

As mentioned at the beginning, **consider this option for experimentation only.**

### Initialize the Ray package

Create an `__init__.py` file.

In [None]:
# Opening in append mode creates `__init__.py` when it is missing and leaves
# any existing content untouched; nothing is written.
with open(src_path / "__init__.py", "a") as init_file:
    pass

### Prepare the train script

Create the `src/train.py` file which is the Python script for initializing Gemma fine-tuning using HuggingFace TRL library.

In [None]:
# Source of `src/train.py`: the per-worker training function executed by Ray Train.
# NOTE(review): inside `formatting_func`, `range(len(example))` is bounded by
# `len()` of the batch object rather than by the number of rows in
# `example['document']`. If the trainer passes a column-keyed batch, only a few
# rows per batch get formatted — confirm. It is left unchanged here because
# correcting it also changes how many examples are tokenized and evaluated.
train_script = '''
# training libraries
import os
import numpy as np
import torch
from huggingface_hub import login
import datasets
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, Seq2SeqTrainingArguments
from peft import LoraConfig
from trl import SFTTrainer
import evaluate
import ray
import ray.train.huggingface.transformers

def train_func(config):
    # Helpers
    def formatting_func(example):
        """Helper function for formatting data for instruction tuning according to Gemma documentation."""
        output_texts = []
        for i in range(len(example)):
          messages = [
            {"role": "user",
             "content": f"Summarize the following ARTICLE in one sentence.\\n###ARTICLE: {example['document'][i]}"},
            {"role": "assistant",
             "content": f"{example['summary'][i]}<eos>"} # Make minor gemma fixes #2029
             ]
          output_texts.append(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False))
        return output_texts

    def compute_metrics(eval_preds):
        """Helper function for computing metrics"""
        preds, labels = eval_preds
        preds = preds[0]

        preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        metrics = rouge.compute(predictions=decoded_preds,
                                references=decoded_labels,
                                rouge_types=['rouge1', 'rouge2', 'rougeL', 'rougeLsum'],
                                use_aggregator=True, use_stemmer=True)
        metrics = {k: round(v * 100, 4) for k, v in metrics.items()}
        return metrics

    def preprocess_logits_for_metrics(logits, labels):
        """Helper function for logits preprocessing for metrics"""
        preds = torch.argmax(logits, dim=-1)
        return preds, labels

    # Setting training
    login(token=os.environ['HF_TOKEN'], add_to_git_credential=True)
    transformers.set_seed(8)

    # Load dataset
    dataset_id = "xsum"
    dataset = datasets.load_dataset(dataset_id, trust_remote_code=True)
    train_dataset = dataset["train"]
    eval_dataset = dataset["test"]

    # Preprocess dataset
    model_id = "google/gemma-2b-it"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.padding_side = 'right'

    # Prepare model
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model = AutoModelForCausalLM.from_pretrained(model_id,
                                                 quantization_config=bnb_config,
                                                 device_map={'': torch.cuda.current_device()},
                                                 torch_dtype=torch.bfloat16,
                                                 # attn_implementation="flash_attention_2"
                                                 )
    lora_config = LoraConfig(
        r=32,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules="all-linear",
        bias="none",
        task_type="CAUSAL_LM"
    )

    # model.gradient_checkpointing_enable()
    rouge = evaluate.load("rouge")

    training_args = Seq2SeqTrainingArguments(
        output_dir="checkpoints",
        per_device_train_batch_size=config.get("per_device_train_batch_size"),
        per_device_eval_batch_size=config.get("per_device_eval_batch_size"),
        gradient_accumulation_steps=config.get("gradient_accumulation_steps"),
        logging_strategy="steps",
        save_strategy="steps",
        evaluation_strategy="steps",
        max_steps=config.get("max_steps"),
        save_steps=config.get("save_steps"),
        logging_steps=config.get("logging_steps"),
        learning_rate=config.get("learning_rate"),
        optim="paged_adamw_8bit",
        bf16=False,
        fp16=True,
        report_to="none",
        predict_with_generate=True,
        ddp_find_unused_parameters=False,
        gradient_checkpointing=True,
        push_to_hub=False,
        disable_tqdm=False,
        load_best_model_at_end=False
    )

    max_seq_length = 512
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        max_seq_length=max_seq_length,
        compute_metrics=compute_metrics,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
        peft_config=lora_config,
        formatting_func=formatting_func
    )
    # model.config.use_cache = False

    callback = ray.train.huggingface.transformers.RayTrainReportCallback()
    trainer.add_callback(callback)
    trainer = ray.train.huggingface.transformers.prepare_trainer(trainer)
    trainer.train()
'''

# The `with` block closes the file on exit, so no explicit close() is needed.
with open(src_path / "train.py", "w") as f:
    f.write(train_script)

### Prepare the distributed training script

Create `src/trainer.py` file which is the Python script for executing the Ray distributed training job.

In [None]:
# Source of `src/trainer.py`: the entrypoint submitted to the cluster. It parses
# the command-line arguments and runs `train_func` through a Ray `TorchTrainer`.
trainer_script = """
# libraries
import argparse

# training libraries
from train import train_func

# ray libraries
import ray
import ray.train.huggingface.transformers
from ray.train import ScalingConfig, RunConfig, CheckpointConfig
from ray.train.torch import TorchTrainer


# helpers
def get_args():
    parser = argparse.ArgumentParser(description='Supervised tuning Gemma on Ray on Vertex AI')

    # some gemma parameters
    parser.add_argument("--train_batch_size", type=int, default=1, help="train batch size")
    parser.add_argument("--eval_batch_size", type=int, default=1, help="eval batch size")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=4, help="gradient accumulation steps")
    parser.add_argument("--learning_rate", type=float, default=2e-4, help="learning rate")
    parser.add_argument("--max_steps", type=int, default=100, help="max steps")
    parser.add_argument("--save_steps", type=int, default=10, help="save steps")
    parser.add_argument("--logging_steps", type=int, default=10, help="logging steps")

    # ray parameters
    parser.add_argument('--num-workers', dest='num_workers', type=int, default=1, help='Number of workers')
    parser.add_argument('--use-gpu', dest='use_gpu', action='store_true', default=False, help='Use GPU')
    parser.add_argument('--experiment-name', dest='experiment_name', type=str, default='gemma-on-rov', help='Experiment name')
    parser.add_argument('--logging-dir', dest='logging_dir', type=str, help='Logging directory')
    args = parser.parse_args()
    return args


def main():

    args = get_args()
    config = vars(args)

    # initialize ray session
    ray.shutdown()
    ray.init()

    # training config
    train_loop_config = {
        "per_device_train_batch_size": config['train_batch_size'],
        "per_device_eval_batch_size": config['eval_batch_size'],
        "gradient_accumulation_steps": config['gradient_accumulation_steps'],
        "learning_rate": config['learning_rate'],
        "max_steps": config['max_steps'],
        "save_steps": config['save_steps'],
        "logging_steps": config['logging_steps'],
    }
    scaling_config = ScalingConfig(num_workers=config['num_workers'], use_gpu=config['use_gpu'])
    run_config = RunConfig(checkpoint_config=CheckpointConfig(num_to_keep=5,
                          checkpoint_score_attribute="loss",
                          checkpoint_score_order="min"),
                           storage_path=config['logging_dir'],
                           name=config['experiment_name'])
    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=train_loop_config,
        run_config=run_config,
        scaling_config=scaling_config
    )
    # train
    result = trainer.fit()

    ray.shutdown()


if __name__ == "__main__":
    main()
"""

# The `with` block closes the file on exit, so no explicit close() is needed.
with open(src_path / "trainer.py", "w") as f:
    f.write(trainer_script)

### Submit a Ray job using the Ray Jobs API

Submit the script to the Ray cluster on Vertex AI using the Ray Jobs API with  the public Ray dashboard address.

Initiate the client to submit the job.

In [None]:
# Job client bound to the cluster dashboard; the `vertex_ray://` scheme is
# prepended to the dashboard address of the cluster retrieved above.
client = JobSubmissionClient(address=f"vertex_ray://{ray_cluster.dashboard_address}")

Set some job configuration including experiment name, job id, training entrypoint and more.

In [None]:
# `transformers.set_seed(8)` seeded the global `random` generator earlier in the
# notebook, so drawing the id from it would give the same value on every fresh
# run and collide with the submission id / experiment folder of a previous run.
# `SystemRandom` draws from the OS entropy source and ignores that seed.
train_id = "".join(
    random.SystemRandom().choices(string.ascii_lowercase + string.digits, k=3)
)
train_experiment_name = f"rov-dialog-gemma-tune-{train_id}"
train_submission_id = f"ray-job-{train_id}"
# Command executed on the cluster from the uploaded working directory.
train_entrypoint = f"python3 trainer.py --experiment-name={train_experiment_name} --logging-dir={EXPERIMENTS_FOLDER_URI} --num-workers={WORKER_NODE_COUNT} --use-gpu"
# Cloud Storage location where this experiment's outputs are written.
train_experiment_uri = EXPERIMENTS_FOLDER_URI / train_experiment_name
# Ship the local `src` folder with the job and expose the Hugging Face token to it.
train_runtime_env = {
    "working_dir": str(src_path),
    "env_vars": {"HF_TOKEN": HF_TOKEN, "TORCH_NCCL_ASYNC_ERROR_HANDLING": "3"},
}

Submit the job.

In [None]:
# Submit the training entrypoint with the runtime environment defined above.
train_job_id = client.submit_job(
    submission_id=train_submission_id,
    entrypoint=train_entrypoint,
    runtime_env=train_runtime_env,
)

Check the status of the job while it is running using the `monitor_job` function.

In [None]:
# Blocks until the job reaches a state that `monitor_job` treats as final.
train_job_status = monitor_job(client, train_job_id)

### Check training artifacts

After the Ray training job has completed, see the model artifacts in the Cloud Storage location.


In [None]:
# List the objects written under the experiment folder.
! gsutil ls -l {train_experiment_uri}

### Log metrics in Vertex AI TensorBoard

Use Vertex AI TensorBoard for validating your training job by logging resulting metrics.

In [None]:
# Upload the logs found under the experiment folder to the TensorBoard
# instance created earlier, under an experiment named after the training run.
vertex_ai.upload_tb_log(
    tensorboard_id=tensorboard.name,
    tensorboard_experiment_name=train_experiment_name,
    logdir=str(train_experiment_uri),
)

## Serving tuned Gemma model with Ray Data for offline predictions

Using Ray on Vertex AI for developing AI/ML applications offers various benefits. In this scenario, you can use Cloud Storage to conveniently store model checkpoints, metrics and more. This allows you to quickly consume the model for downstream AI/ML tasks, including generating batch predictions using Ray Data.


### Generate predictions (locally)

Generate predictions locally to validate the tuned model.


#### Download Ray training checkpoints

Download all resulting checkpoints from Ray job.

In [None]:
# Recursively copy everything under the experiment folder into the local
# experiments folder (`-q` suppresses progress output).
! gsutil -q cp -r {train_experiment_uri}/* {experiments_path}

#### Get the best checkpoint

Use the `ExperimentAnalysis` class to retrieve the best checkpoint according to the relevant metric and mode.

In [None]:
# Load the experiment results downloaded into the local experiments folder.
experiment_analysis = ExperimentAnalysis(experiments_path)
# NOTE: despite its name, `log_path` holds whatever `get_best_trial` returns
# and is passed on as the trial argument of `get_best_checkpoint`.
log_path = experiment_analysis.get_best_trial(metric="eval_rougeLsum", mode="max")
# Pick the checkpoint of that trial with the highest `eval_rougeLsum`.
best_checkpoint = experiment_analysis.get_best_checkpoint(
    log_path, metric="eval_rougeLsum", mode="max"
)

#### Load the model after training

After training the model, load the model as described in the Hugging Face [documentation](https://huggingface.co/docs/trl/use_model#use-adapters-peft).

Set the model and adapters path. Also set the path to store the resulting tuned model.

In [None]:
# Hugging Face id of the base model the adapters were trained on.
base_model_path = "google/gemma-2b-it"
# Adapter files are read from the "checkpoint" sub-folder of the best checkpoint.
peft_model_path = epath.Path(best_checkpoint.path) / "checkpoint"
# Local folder where the merged model is saved.
tuned_model_path = models_path / "xsum-tuned-gemma-it"

Initiate the associated Gemma tokenizer and base model. Also initiate the resulting adapters.

In [None]:
# Tokenizer of the base model, configured to pad on the right.
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
tokenizer.padding_side = "right"

# NOTE(review): the base model is loaded in float16 while the adapter call
# below passes bfloat16 — confirm which dtype is intended.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path, device_map="auto", torch_dtype=torch.float16
)
# Attach the trained adapters to the base model for inference only.
peft_model = PeftModel.from_pretrained(
    base_model,
    peft_model_path,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    is_trainable=False,
)

Merge the base model and adapters to save the tuned model.

In [None]:
# Fold the adapter weights into the model and drop the PEFT wrapper.
# NOTE(review): merging presumably rewrites the layers of `base_model` in
# place — confirm before reusing `base_model` as an untuned baseline.
tuned_model = peft_model.merge_and_unload()
# Persist the merged model to the local models folder.
tuned_model.save_pretrained(tuned_model_path)

#### Generate summaries

Generate summaries with the tuned model. Load the validation set of the tutorial dataset.

In [None]:
# Validation split of XSum, cached under the local data folder.
dataset = datasets.load_dataset(
    "xsum", split="validation", cache_dir=data_path, trust_remote_code=True
)

Sample one article to summarize.

In [None]:
# Pick one random row and keep its article text and reference summary.
sample = dataset.select([random.randint(0, len(dataset) - 1)])
document = sample["document"][0]
reference_summary = sample["summary"][0]

Prepare the associated prompt following the [Gemma documentation](https://ai.google.dev/gemma/docs/formatting).

In [None]:
# Single user turn asking for a one-sentence summary of the sampled article.
messages = [
    {
        "role": "user",
        # Use a real newline here. The training prompt written to `src/train.py`
        # contains a newline at this position, whereas "\\n" in notebook code
        # would insert a literal backslash followed by "n".
        "content": f"Summarize the following ARTICLE in one sentence.\n###ARTICLE: {document}",
    },
]

# Render the chat template as text and append the generation prompt so the
# model continues with its own turn.
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

Initiate the text-generation pipeline for generating summaries.

In [None]:
# Text-generation pipeline around the merged model; at most 50 new tokens per call.
tuned_gemma_pipeline = pipeline(
    "text-generation", model=tuned_model, tokenizer=tokenizer, max_new_tokens=50
)

Generate the associated summary.

In [None]:
# Sample at low temperature, then drop the first `len(prompt)` characters of
# the returned text so only the continuation is kept.
generated_tuned_gemma_summary = tuned_gemma_pipeline(
    prompt, do_sample=True, temperature=0.1, add_special_tokens=True
)[0]["generated_text"][len(prompt) :]

Print the generated summary.

In [None]:
# Show the reference and the generated summary one after the other.
print(f"Reference summary: {reference_summary}")
print("-" * 100)
print(f"Tuned generated summary: {generated_tuned_gemma_summary}")

#### Evaluate models

As an additional step, you can evaluate the tuned model. To evaluate the model you compare models qualitatively and quantitatively.

In one case, you compare responses generated by the base Gemma model with the ones generated by the tuned Gemma model. In the other case, you calculate ROUGE metrics and their improvements, which gives you an idea of how well the tuned model is able to reproduce the reference summaries compared with the base model.

Evaluate models by comparing generated summaries.

In [None]:
# `base_model` was wrapped by `PeftModel.from_pretrained` and the adapters were
# then merged into it by `merge_and_unload()`, so it no longer holds the original
# weights. Load a fresh copy of the base checkpoint to get a genuine baseline.
untuned_base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path, device_map="auto", torch_dtype=torch.float16
)

gemma_pipeline = pipeline(
    "text-generation",
    model=untuned_base_model,
    tokenizer=tokenizer,
    max_new_tokens=50,
)

# Same generation settings as for the tuned model; keep only the continuation.
generated_gemma_summary = gemma_pipeline(
    prompt, do_sample=True, temperature=0.1, add_special_tokens=True
)[0]["generated_text"][len(prompt) :]

print(f"Reference summary: {reference_summary}")
print("-" * 100)
print(f"Base generated summary: {generated_gemma_summary}")
print("-" * 100)
print(f"Tuned generated summary: {generated_tuned_gemma_summary}")

Evaluate models by computing ROUGE metrics and its improvements.

In [None]:
# ROUGE metric used to score both generated summaries.
rouge = evaluate.load("rouge")

In [None]:
# ROUGE scores of the base model's summary against the reference summary.
gemma_results = rouge.compute(
    predictions=[generated_gemma_summary],
    references=[reference_summary],
    rouge_types=["rouge1", "rouge2", "rougeL", "rougeLsum"],
    use_aggregator=True,
    use_stemmer=True,
)

In [None]:
# ROUGE scores of the tuned model's summary, computed with the same settings.
tuned_gemma_results = rouge.compute(
    predictions=[generated_tuned_gemma_summary],
    references=[reference_summary],
    rouge_types=["rouge1", "rouge2", "rougeL", "rougeLsum"],
    use_aggregator=True,
    use_stemmer=True,
)

In [None]:
# Relative change, in percent, of each ROUGE score from the base to the tuned
# model. The value is None when the base score is 0, because the ratio is
# undefined in that case.
improvements = {}
for rouge_metric, gemma_rouge in gemma_results.items():
    tuned_gemma_rouge = tuned_gemma_results[rouge_metric]
    if gemma_rouge != 0:
        improvement = ((tuned_gemma_rouge - gemma_rouge) / gemma_rouge) * 100
    else:
        improvement = None
    improvements[rouge_metric] = improvement

print("Base Gemma vs Tuned Gemma - ROUGE improvements")
for rouge_metric, improvement in improvements.items():
    # None cannot be rendered with a float format spec; report it explicitly.
    if improvement is None:
        print(f"{rouge_metric}: n/a (base score is 0)")
    else:
        print(f"{rouge_metric}: {improvement:.3f}%")

### Batch prediction with Ray Data

To generate batch prediction with the tuned model using Ray Data on Ray on Vertex AI, you need a dataset to generate predictions and the tuned model stored in the Cloud bucket.

Then, you can leverage Ray Data which provides an easy-to-use API for offline batch inference.

#### Upload the tuned model

Upload the tuned model on the Cloud storage.

In [None]:
# Recursively copy the local models folder to the models location in the bucket.
! gsutil -q cp -r {models_path} {MODELS_PATH}

#### Prepare the batch prediction script

Prepare `src/batch_predict.py` file which is the Python script for executing the Ray batch prediction job.

In [None]:
# Source code of the batch prediction job, held as a string so it can be written
# to a .py file. The "\\n" inside the prompt is double-escaped on purpose: the
# generated file must contain the two characters "\n" inside its own f-string.
batch_predictor_script = """
# General
import argparse
import os
from typing import Dict
from huggingface_hub import login

# Serving
import datasets
import transformers
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.pipelines import pipeline

# Ray
import ray

# Settings
datasets.disable_progress_bar()

# Variables
base_model_path = "google/gemma-2b-it"


# helpers
def get_args():
    parser = argparse.ArgumentParser(description='Batch prediction with Gemma on Ray on Vertex AI')
    # The model path and the output directory have no usable default, so fail fast when missing.
    parser.add_argument('--tuned_model_path', type=str, required=True, help='path of adapter model')
    parser.add_argument('--num_gpus', type=int, default=1, help='number of gpus')
    parser.add_argument('--batch_size', type=int, default=8, help='batch size')
    parser.add_argument('--sample_size', type=int, default=20, help='number of articles to summarize')
    parser.add_argument('--temperature', type=float, default=0.1, help='temperature for generating summaries')
    parser.add_argument('--max_new_tokens', type=int, default=50, help='max new token for generating summaries')
    parser.add_argument('--output_dir', type=str, required=True, help='output directory for predictions')
    args = parser.parse_args()
    return args

def main():

    # Set configuration
    args = get_args()
    config = vars(args)

    # Authenticate to Hugging Face and seed the random generators
    login(token=os.environ['HF_TOKEN'], add_to_git_credential=True)
    transformers.set_seed(8)

    # Load dataset
    dataset_id = "xsum"
    sample_size = config["sample_size"]
    input_data = datasets.load_dataset(dataset_id, split="validation", trust_remote_code=True)
    input_data = input_data.select(range(sample_size))
    ray_input_data = ray.data.from_huggingface(input_data)

    # Generate predictions

    class Summarizer:

      def __init__(self):
          self.tokenizer = AutoTokenizer.from_pretrained(base_model_path)
          self.tokenizer.padding_side = "right"

          self.tuned_model = AutoModelForCausalLM.from_pretrained(config["tuned_model_path"],
                                                                  device_map='auto',
                                                                  torch_dtype=torch.float16)

          self.pipeline = pipeline("text-generation",
                                    model=self.tuned_model,
                                    tokenizer=self.tokenizer,
                                    max_new_tokens=config["max_new_tokens"])

      # batch is indexed by column name below, so it is a mapping of columns, not a bare array
      def __call__(self, batch: Dict[str, np.ndarray]):

          # prepare dataset
          messages = [{"role": "user",
                      "content": f"Summarize the following ARTICLE in one sentence.\\n###ARTICLE: {document}"}
                      for document in batch["document"]]

          batch['prompt'] = [self.tokenizer.apply_chat_template([message], tokenize=False, add_generation_prompt=True)
                             for message in messages]

          # generate
          batch['generated_summary'] = [self.pipeline(prompt,
                                                    do_sample=True,
                                                    temperature=config["temperature"],
                                                    add_special_tokens=True)[0]["generated_text"][len(prompt):]
                                                    for prompt in batch['prompt']]

          return batch


    predictions_data = ray_input_data.map_batches(
        Summarizer,
        concurrency=config["num_gpus"],
        num_gpus=1,
        batch_size=config['batch_size'])

    # Store resulting predictions
    predictions_data.write_json(config["output_dir"], try_create_dir=True)


if __name__ == "__main__":
    main()
"""

# Write the script to `src_path / "batch_predictor.py"`. The `with` block closes
# the file on exit, so no explicit close() call is needed afterwards.
with open(src_path / "batch_predictor.py", "w") as f:
    f.write(batch_predictor_script)

#### Submit a Ray job using the Ray Jobs API

Submit the script to the Ray on Vertex AI cluster using the Ray Jobs API via the public Ray dashboard address.

Initiate the client to submit the job.

In [None]:
# Create the Ray Jobs client, addressing the cluster dashboard through the
# vertex_ray:// scheme.
client = JobSubmissionClient(address=f"vertex_ray://{ray_cluster.dashboard_address}")

Set some job configuration including model path, job id, prediction entrypoint and more.

In [None]:
# Random 4-character suffix (lowercase letters and digits) for the submission id.
id_alphabet = string.ascii_lowercase + string.digits
batch_predict_id = "".join(random.choices(id_alphabet, k=4))
batch_predict_submission_id = f"ray-job-{batch_predict_id}"

# Rewrite the gs:// prefix of the tuned model location to /gcs/ for the job.
tuned_model_gcs_uri = str(MODELS_PATH / "xsum-tuned-gemma-it")
tuned_model_uri_path = tuned_model_gcs_uri.replace("gs://", "/gcs/")

# Command line executed by the job, one argument per source line.
batch_predict_entrypoint = (
    "python3 batch_predictor.py "
    f"--tuned_model_path={tuned_model_uri_path} "
    "--num_gpus=2 "
    f"--output_dir={PREDICTIONS_FOLDER_URI}"
)

# Runtime environment: the working directory holding the script, plus the
# Hugging Face token exposed as an environment variable.
batch_predict_runtime_env = dict(
    working_dir=str(src_path),
    env_vars={"HF_TOKEN": HF_TOKEN},
)

Submit the job.

In [None]:
# Submit the batch prediction job and keep the id returned by the client.
batch_predict_job_id = client.submit_job(
    entrypoint=batch_predict_entrypoint,
    runtime_env=batch_predict_runtime_env,
    submission_id=batch_predict_submission_id,
)

Check the status of the job using the `monitor_job` helper function.

In [None]:
# Pass the client and the submitted job id to the `monitor_job` helper and keep
# whatever it returns as the job status.
batch_predict_job_status = monitor_job(client, batch_predict_job_id)

#### Get generated summaries

Take a quick look at the generated summaries using a pandas DataFrame.

In [None]:
# Read the prediction files under the "predictions/" prefix of the bucket, then
# keep only the columns needed to compare reference and generated summaries.
columns_to_show = ["id", "document", "prompt", "summary", "generated_summary"]
predictions_df = read_json_files(prefix="predictions/", bucket_name=BUCKET_NAME)
predictions_df = predictions_df[columns_to_show]
predictions_df.head()

## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) that you used for the tutorial.

Otherwise, you can delete the individual resources that you created in this tutorial.

In [None]:
import os

delete_tensorboards = False
delete_experiments = False
delete_ray_clusters = False
delete_image_repo = False
delete_bucket = False
delete_tutorial = False

# Delete tensorboard
if delete_tensorboards:
    tensorboard_list = vertex_ai.Tensorboard.list()
    for tensorboard in tensorboard_list:
        tensorboard.delete()

# Delete experiments
if delete_experiments:
    experiment_list = vertex_ai.Experiment.list()
    for experiment in experiment_list:
        experiment.delete()

# Delete ray on vertex cluster
if delete_ray_clusters:
    ray_cluster_list = vertex_ray.list_ray_clusters()
    for ray_cluster in ray_cluster_list:
        vertex_ray.delete_ray_cluster(ray_cluster.cluster_resource_name)

if delete_image_repo:
    ! gcloud artifacts repositories delete {REPO_NAME}

# Delete Cloud Storage objects that were created
if delete_bucket:
    ! gsutil -q -m rm -r {BUCKET_URI}

# Delete tutorial folder
if delete_tutorial:
    shutil.rmtree(tutorial_path)