In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Llama 3.1 Finetuning with customized container

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/instances">
      <img alt="Workbench logo" src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" width="32px"><br> Run in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_llama3_1_finetuning_with_workbench.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_llama3_1_finetuning_with_workbench.ipynb">
      <img alt="GitHub logo" src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview

This notebook demonstrates -
- Making changes to existing finetuning code.
- Finetuning Llama 3.1 models with this modified code.
- Deploying Llama 3.1 models.

All of the examples in this notebook use parameter efficient finetuning methods [PEFT (LoRA)](https://github.com/huggingface/peft) to reduce training and storage costs. LoRA (Low-Rank Adaptation) is one approach of Parameter Efficient FineTuning (PEFT), where pretrained model weights are frozen and rank decomposition matrices representing the change in model weights are trained during finetuning. Read more about LoRA in the following publication: [Hu, E.J., Shen, Y., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., Wang, L. and Chen, W., 2021. Lora: Low-rank adaptation of large language models. *arXiv preprint arXiv:2106.09685*](https://arxiv.org/abs/2106.09685).

After finetuning, we can deploy models on Vertex with GPU.

**It is advised to use the Workbench for this notebook.**


### Objective

- Customize docker container code and rebuild the container.
- Finetune Llama 3.1 models in local environment using docker run.
- Finetune Llama 3.1 models with Vertex AI Custom Training Jobs.
- Run local predictions for finetuned Llama 3.1 models.
- Deploy finetuned Llama 3.1 models on Vertex AI Prediction and send prediction requests.

### File a bug

File a bug on [GitHub](https://github.com/GoogleCloudPlatform/vertex-ai-samples/issues/new) if you encounter any issue with the notebook.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin

### Create workbench local environment.

For finetuning using workbench local environment, **[click here](https://cloud.google.com/vertex-ai/docs/workbench/instances/create#console)** to create a workbench instance. We need 4 Nvidia A100 GPUs or 8 Nvidia A100 80 GB GPUs or 8 Nvidia H100 80 GB GPUs for training the 8b model. **Note that for 70b and 405b models 8 Nvidia A100 80 GB GPUs or 8 Nvidia H100 80 GB GPUs are required.**
Follow [this](https://cloud.google.com/vertex-ai/docs/workbench/instances/create-euc-instance#create-instance) to link service account with workbench instance.

Check following links to see if there is enough quota available to create workbench instance: [A100 GPU Quota](https://console.cloud.google.com/iam-admin/quotas?location=us-central1&metric=compute.googleapis.com%2Fnvidia_a100_gpus), [A100 80GB GPU Quota](https://console.cloud.google.com/iam-admin/quotas?location=us-central1&metric=compute.googleapis.com%2Fnvidia_a100_80gb_gpus), [H100 80GB GPU Quota](https://console.cloud.google.com/iam-admin/quotas?location=us-central1&metric=compute.googleapis.com%2Fgpus_per_gpu_family)

Refer to [Vertex AI Workbench instances locations](https://cloud.google.com/vertex-ai/docs/general/locations#instances) for the workbench instance availability.

### Install Python Packages for Finetuning

In [None]:
! pip install --quiet google-cloud-aiplatform
! pip install --quiet gcsfs==2024.3.1
! pip install --quiet accelerate==0.31.0
! pip install --quiet transformers==4.43.1
! pip install --quiet datasets==2.19.2
! pip install --quiet tensorflow==2.18.0

### Import the necessary packages

In [None]:
# Fetch a fresh copy of the sample repository and pin it to a fixed commit so
# the helper module and the training code modified below are a known version.
! rm -rf vertex-ai-samples && git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git
! cd vertex-ai-samples && git reset --hard dd333b8fdd7dd22e8902a963fb8269885eac49ee

import datetime
import importlib
import os
import uuid
from typing import Tuple

from google.cloud import aiplatform
from google.cloud.aiplatform.compat.types import \
    custom_job as gca_custom_job_compat

# The checkout directories contain hyphens ("vertex-ai-samples",
# "community-content"), which cannot appear in an `import` statement, so the
# helper module is loaded from its dotted path string instead.
common_util = importlib.import_module(
    "vertex-ai-samples.community-content.vertex_model_garden.model_oss.notebook_util.common_util"
)

### Setup Google Cloud project

1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

2. For finetuning using Vertex AI training, schedule your job via Dynamic Workload Scheduler using [these instructions](https://cloud.google.com/vertex-ai/docs/training/schedule-jobs-dws). For Dynamic Workload Scheduler, check the Nvidia Tesla A100 quota in [us-central1](https://console.cloud.google.com/iam-admin/quotas?location=us-central1&metric=aiplatform.googleapis.com%2Fcustom_model_training_preemptible_nvidia_a100_gpus) or [europe-west4](https://console.cloud.google.com/iam-admin/quotas?location=europe-west4&metric=aiplatform.googleapis.com%2Fcustom_model_training_preemptible_nvidia_a100_gpus). Check the [us-central1](https://console.cloud.google.com/iam-admin/quotas?location=us-central1&metric=aiplatform.googleapis.com%2Fcustom_model_training_preemptible_nvidia_a100_80gb_gpus) or [europe-west4](https://console.cloud.google.com/iam-admin/quotas?location=europe-west4&metric=aiplatform.googleapis.com%2Fcustom_model_training_preemptible_nvidia_a100_80gb_gpus) quota for Nvidia A100 80GB GPUs. Check the [us-central1](https://console.cloud.google.com/iam-admin/quotas?location=us-central1&metric=aiplatform.googleapis.com%2Fcustom_model_training_preemptible_nvidia_h100_gpus) or [europe-west4](https://console.cloud.google.com/iam-admin/quotas?location=europe-west4&metric=aiplatform.googleapis.com%2Fcustom_model_training_preemptible_nvidia_h100_gpus) quota for Nvidia H100 GPUs. If you do not have enough GPUs, then you can follow [these instructions](https://cloud.google.com/docs/quotas/view-manage#viewing_your_quota_console) to request quota. **Note: 8 Nvidia Tesla A100 GPUs can only be used to run 8b and 70b parameter models. For the 405b parameter model, 8 Nvidia A100 80 GB or 8 Nvidia H100 80 GB GPUs are required**.

3. For serving, **[click here](https://console.cloud.google.com/iam-admin/quotas?location=us-central1&metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_l4_gpus)** to check if your project already has the required 1 L4 GPU in the us-central1 region.  If yes, then run this notebook in the us-central1 region. If you need more L4 GPUs for your project, then you can follow [these instructions](https://cloud.google.com/docs/quotas/view-manage#viewing_your_quota_console) to request more. Alternatively, if you want to run predictions with A100 80GB or H100 GPUs, we recommend using the regions listed below. **NOTE:** Make sure you have associated quota in selected regions. Click the links to see your current quota for each GPU type: [Nvidia A100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_a100_80gb_gpus), [Nvidia H100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_h100_gpus).

> | Machine Type | Accelerator Type | Recommended Regions |
| ----------- | ----------- | ----------- |
| a2-ultragpu-1g | 1 NVIDIA_A100_80GB | us-central1, us-east4, europe-west4, asia-southeast1 |
| a3-highgpu-2g | 2 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |
| a3-highgpu-4g | 4 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |
| a3-highgpu-8g | 8 NVIDIA_H100_80GB | us-central1, us-east5, europe-west4, us-west1, asia-southeast1 |

Set region.

In [None]:
# Region used for the Cloud Storage bucket, the Artifact Registry repository
# and Vertex AI, e.g. "us-central1". It must be filled in before running.
REGION = ""  # @param {type:"string"}
assert REGION, "Region must be specified."

**[Optional]** [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs. Set the BUCKET_URI for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the same region as where the notebook was launched. Note that a multi-region bucket (eg. "us") is not considered a match for a single region covered by the multi-region range (eg. "us-central1"). If not set, a unique GCS bucket will be created instead.

In [None]:
# Optional existing bucket, given either as "gs://name" or as a bare "name".
# Leave it empty to have the setup cell create a temporary bucket instead.
BUCKET_URI = ""  # @param {type:"string"}
# Trim stray whitespace so a blank-looking value is treated as "not set"
# rather than being turned into an invalid "gs:// " URI.
BUCKET_URI = BUCKET_URI.strip()
if BUCKET_URI and not BUCKET_URI.startswith("gs://"):
    BUCKET_URI = "gs://" + BUCKET_URI

In [None]:
train_job = None
models, endpoints = {}, {}


# Get the default cloud project id.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# Get the default region for launching jobs.
if not REGION:
    if not os.environ.get("GOOGLE_CLOUD_REGION"):
        raise ValueError(
            "REGION must be set. See"
            " https://cloud.google.com/vertex-ai/docs/general/locations for"
            " available cloud locations."
        )
    REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Enable the Vertex AI API and Compute Engine API, if not already.
print("Enabling Vertex AI API and Compute Engine API.")
! gcloud services enable aiplatform.googleapis.com compute.googleapis.com

# Cloud Storage bucket for storing the experiment artifacts.
# A unique GCS bucket will be created for the purpose of this notebook. If you
# prefer using your own GCS bucket, change the value yourself below.
now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])

if BUCKET_URI is None or BUCKET_URI.strip() == "" or BUCKET_URI == "gs://":
    BUCKET_URI = f"gs://{PROJECT_ID}-tmp-{now}-{str(uuid.uuid4())[:4]}"
    BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])
    ! gsutil mb -l {REGION} {BUCKET_URI}
else:
    assert BUCKET_URI.startswith("gs://"), "BUCKET_URI must start with `gs://`."
    shell_output = ! gsutil ls -Lb {BUCKET_NAME} | grep "Location constraint:" | sed "s/Location constraint://"
    bucket_region = shell_output[0].strip().lower()
    if bucket_region != REGION:
        raise ValueError(
            "Bucket region %s is different from notebook region %s"
            % (bucket_region, REGION)
        )
print(f"Using this GCS Bucket: {BUCKET_URI}")

STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")
MODEL_BUCKET = os.path.join(BUCKET_URI, "llama3_1")


# Initialize Vertex AI API.
print("Initializing Vertex AI API.")
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

# Gets the default SERVICE_ACCOUNT.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)


# Provision permissions to the SERVICE_ACCOUNT with the GCS bucket
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.admin $BUCKET_NAME

! gcloud config set project $PROJECT_ID
! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role="roles/storage.admin"
! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role="roles/aiplatform.user"

### Access Llama 3.1 models

For GPU based finetuning and serving, choose between accessing Llama 3.1 models on [Hugging Face](https://huggingface.co/) or Vertex AI as described below.

If you already obtained access to Llama 3.1 models on [Hugging Face](https://huggingface.co/), you can load models from there.
Alternatively, you can also load the original Llama 3.1 models for finetuning and serving from Vertex AI after accepting the agreement.

It is recommended to use "Google Cloud" for 405B model since it can be downloaded faster.

In [None]:
# Where the Llama 3.1 weights are loaded from; pick the source you have access to.
LOAD_MODEL_FROM = "Google Cloud"  # @param ["Google Cloud", "Hugging Face"]
# Start both source-specific settings empty; the cell for the selected source
# further down asks for the one that is actually required.
MODEL_BUCKET = ""
HF_TOKEN = ""

#### Access Model from Google Cloud

The original models from Meta are converted into the Hugging Face format for serving in Vertex AI.
Accept the model agreement to access the models:
1. Open the [Llama 3.1 model card](https://console.cloud.google.com/vertex-ai/publishers/meta/model-garden/llama3_1) from [Vertex AI Model Garden](https://cloud.google.com/model-garden).
2. Review and accept the agreement in the pop-up window on the model card page. If you have previously accepted the model agreement, there will not be a pop-up window on the model card page and this step is not needed.
3. After accepting the agreement of Llama 3.1, a `gs://` URI containing Llama 3.1 pretrained and finetuned models will be shared.
4. Paste the URI in the `MODEL_BUCKET` field below.

In [None]:
# gs:// URI shared after accepting the Llama 3.1 agreement in Model Garden.
MODEL_BUCKET = ""  # @param {type:"string"}
# The bucket is mandatory only when the weights are loaded from Google Cloud.
assert (
    LOAD_MODEL_FROM != "Google Cloud" or MODEL_BUCKET
), "Click the agreement of Llama3.1 in Vertex AI Model Garden, and get the GCS path of the model artifacts."

#### Access Llama 3.1 models on Hugging Face

You must provide a Hugging Face User Access Token (read) to access the Llama 3.1 models. You can follow the [Hugging Face documentation](https://huggingface.co/docs/hub/en/security-tokens) to create a **read** access token and put it in the `HF_TOKEN` field below.

In [None]:
# Hugging Face user access token with read permission.
HF_TOKEN = ""  # @param {type:"string"}
# The token is mandatory only when the weights are loaded from Hugging Face.
assert (
    LOAD_MODEL_FROM != "Hugging Face" or HF_TOKEN
), "Provide a read HF_TOKEN to load models from Hugging Face, or select a different model source."

## Finetune with HuggingFace PEFT

### Set Dataset

Use the Vertex AI SDK to create and run the custom training jobs.

This notebook uses [timdettmers/openassistant-guanaco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco) dataset as an example.
You can set `train_dataset_name` and `eval_dataset_name` to any existing [Hugging Face dataset](https://huggingface.co/datasets) name, and set `instruct_column_in_dataset` to the name of the dataset column containing training data. The [timdettmers/openassistant-guanaco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco) dataset has only one column, `text`, and therefore we set `instruct_column_in_dataset` to `text` in this notebook.


#### (Optional) Prepare a custom JSONL dataset for finetuning
You can prepare a JSONL file where each line is a valid JSON string as your custom training dataset. For example, here is one line from the [timdettmers/openassistant-guanaco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco) dataset:
```
{"text": "### Human: Hola### Assistant: \u00a1Hola! \u00bfEn qu\u00e9 puedo ayudarte hoy?"}
```

The JSON object has a key `text`, which should match `instruct_column_in_dataset`; the value should be one training data point, i.e. a string. After you have prepared your JSONL file, you can upload it to either [Hugging Face datasets](https://huggingface.co/datasets) or [Google Cloud Storage](https://cloud.google.com/storage).
- To upload a JSONL dataset to [Hugging Face datasets](https://huggingface.co/datasets), follow the instructions on [Uploading Datasets](https://huggingface.co/docs/hub/en/datasets-adding). Then, set `train_dataset_name` (and/or `eval_dataset_name`) to the name of your newly created dataset on Hugging Face.
- To upload a JSONL dataset to [Google Cloud Storage](https://cloud.google.com/storage), follow the instructions on [Upload objects from a filesystem](https://cloud.google.com/storage/docs/uploading-objects). Then, set `train_dataset_name` (and/or `eval_dataset_name`) to the `gs://` URI of your JSONL file. For example: `gs://cloud-samples-data/vertex-ai/model-evaluation/peft_train_sample.jsonl`.

Optionally update the `instruct_column_in_dataset` field below if your JSON objects use a key other than the default `text`.

#### (Optional) Format your data with custom JSON template

Sometimes, your dataset might have multiple text columns and you want to construct the training data with a template. You can prepare a JSON template in the following format:
```
{
  "description": "Template used by Llama 3.1, accepting text-bison format.",
  "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/models/tune-text-models-supervised#dataset-format",
  "prompt_input": "<|start_header_id|>user<|end_header_id|>\n\n{input_text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{output_text}<|eot_id|>",
  "instruction_separator": "<|start_header_id|>user<|end_header_id|>\n\n",
  "response_separator": "<|start_header_id|>assistant<|end_header_id|>\n\n"
}
```

As an example, the template above can be used to format the following training data (this line comes from `gs://cloud-samples-data/vertex-ai/model-evaluation/peft_train_sample.jsonl`):

```
{"input_text":"TRANSCRIPT: \nREASON FOR EVALUATION:,\n\n LABEL:","output_text":"Chiropractic"}
```

This example template simply concatenates `input_text` with `output_text` with some special tokens in between.

To try such custom dataset, you can make the following changes:
1. Set `template` to `llama3-text-bison`
2. Set `train_dataset_name` to `gs://cloud-samples-data/vertex-ai/model-evaluation/peft_train_sample.jsonl`
3. Set `train_split_name` to `train`
4. Set `eval_dataset_name` to `gs://cloud-samples-data/vertex-ai/model-evaluation/peft_eval_sample.jsonl`
5. Set `eval_split_name` to `train` (**NOT** `test`)
6. Set `instruct_column_in_dataset` as `input_text`.

In [None]:
# Template name or gs:// URI to a custom template.
# The same template is used for both training and evaluation.
template = "openassistant-guanaco"  # @param {type:"string"}

# Hugging Face dataset name or gs:// URI to a custom JSONL dataset.
# Training and evaluation each take their own dataset name and split name.
train_dataset_name = "timdettmers/openassistant-guanaco"  # @param {type:"string"}
train_split_name = "train"  # @param {type:"string"}
eval_dataset_name = "timdettmers/openassistant-guanaco"  # @param {type:"string"}
eval_split_name = "test"  # @param {type:"string"}

# Name of the dataset column containing training text input.
# The same column name is also used for the evaluation dataset.
instruct_column_in_dataset = "text"  # @param {type:"string"}

### Set model

Select a model variant of Llama 3.1.

In [None]:
# Valid base model ids: every Llama 3.1 size exists as a base model and as an
# instruction-tuned ("-Instruct") variant.
supported_base_model_ids = [
    f"meta-llama/Meta-Llama-3.1-{size}{variant}"
    for size in ("8B", "70B", "405B")
    for variant in ("", "-Instruct")
]

base_model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
assert base_model_id in supported_base_model_ids, "Provide a valid base model id."

# Models loaded from Google Cloud are addressed as <MODEL_BUCKET>/<model name>,
# where the model name is the id without its "meta-llama/" prefix; models
# loaded from Hugging Face use the id unchanged.
pretrained_model_id = (
    os.path.join(MODEL_BUCKET, base_model_id.rsplit("/", 1)[-1])
    if LOAD_MODEL_FROM == "Google Cloud"
    else base_model_id
)

### Modify Finetuning docker
Here we will demonstrate how we can make changes to [existing finetuning code from GitHub](https://github.com/GoogleCloudPlatform/vertex-ai-samples/tree/dd333b8fdd7dd22e8902a963fb8269885eac49ee/community-content/vertex_model_garden/model_oss/peft). One can follow a similar process to customize finetuning code.

#### Modify Trainer stats

The original code for `callbacks.py` is [here](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/f641e5d2213f27acb203af02e02ff66b2ef8b9ba/community-content/vertex_model_garden/model_oss/peft/train/vmg/callbacks.py). The `callbacks.py` file defines `TrainerStatsCallback`, whose callbacks are executed at the end of each training step. Currently it reports GPU usage, GPU memory usage and training throughput stats.
Here we will modify `callbacks.py` to add TFLOPS stats to the trainer stats. TFLOPS is a unit of measurement for a GPU's performance that indicates how many trillion floating-point operations a processor can perform per second.

In [None]:
%%writefile vertex-ai-samples/community-content/vertex_model_garden/model_oss/peft/train/vmg/callbacks.py
"""Different trainer callbacks for PEFT Trainer."""

import time

from absl import logging
import accelerate
from transformers import TrainingArguments
from transformers.trainer_callback import TrainerCallback
from transformers.trainer_callback import TrainerControl
from transformers.trainer_callback import TrainerState

from vertex_vision_model_garden_peft.train.vmg import utils


class TrainerStatsCallback(TrainerCallback):
  """Trainer callback that reports GPU, step-time and TFLOPS statistics.

  All reporting happens on the main process only. Step time and TFLOPS/s are
  measured between two consecutive `on_step_end` calls, so the first step
  observed in a training run only records a baseline (it logs `nan` s/it and
  0.00 tflops/s) and is excluded from the running averages.

  The value logged and stored as "throughput" is the step duration in seconds
  per iteration.

  Args:
    max_seq_length: Maximum sequence length of the run; only used in the
      summary line appended to `filename`.
    filename: Optional path of a file to which one summary line is appended
      when training ends. Nothing is written when it is None or empty.
  """

  def __init__(self, max_seq_length, filename=None):
    self._max_seq_length = max_seq_length
    self._filename = filename

    self._partial_state = accelerate.PartialState()
    self._start_time = float('nan')
    self._prev_time = float('nan')
    self._prev_tflops = 0.0
    self._peak_mem = 0.0
    self._avg_throughput = 0.0
    self._avg_tflops_per_sec = 0.0
    # Whether a previous step has been observed in the current run, and how
    # many step-to-step intervals have been folded into the running averages.
    self._has_baseline = False
    self._num_intervals = 0

  def on_step_end(
      self,
      args: TrainingArguments,
      state: TrainerState,
      control: TrainerControl,
      **kwargs,
  ):
    """Logs GPU stats, step duration and TFLOPS/s for the finished step.

    Args:
      args: Training arguments (unused).
      state: Trainer state; `total_flos` is read to derive TFLOPS/s.
      control: Trainer control object (unused).
      **kwargs: Extra callback arguments (unused).
    """
    if not self._partial_state.is_main_process:
      return

    cur_time = time.time()
    # NOTE(review): `total_flos` is maintained by the Trainer; depending on the
    # transformers version it may only be refreshed on logging/saving steps, in
    # which case the per-step value reads 0 in between -- TODO confirm.
    cur_tflops = state.total_flos / 1e12

    if not self._has_baseline:
      # The baseline is keyed on the first observed step rather than on
      # `state.global_step == 1`, so a run resumed from a checkpoint (whose
      # first step is > 1) also gets a valid reference point.
      self._has_baseline = True
      delta_t = float('nan')
      tflops_per_sec = 0.0
    else:
      delta_t = cur_time - self._prev_time
      # Guard against a zero time delta (coarse clock resolution).
      tflops_per_sec = (
          (cur_tflops - self._prev_tflops) / delta_t if delta_t > 0 else 0.0
      )
      # Incremental mean over the intervals measured so far.
      self._num_intervals += 1
      self._avg_tflops_per_sec += (
          tflops_per_sec - self._avg_tflops_per_sec
      ) / self._num_intervals
      self._avg_throughput += (
          delta_t - self._avg_throughput
      ) / self._num_intervals

    self._prev_time = cur_time
    self._prev_tflops = cur_tflops

    gpu_stats = utils.gpu_stats()
    self._peak_mem = max(gpu_stats.total_mem, self._peak_mem)
    logging.info(
        'on_step_end: %s, throughput: %.2f s/it, flops: %.2f tflops/s',
        utils.gpu_stats_str(gpu_stats),
        delta_t,
        tflops_per_sec
    )

  def on_train_begin(
      self,
      args: TrainingArguments,
      state: TrainerState,
      control: TrainerControl,
      **kwargs,
  ):
    """Records the training start time and logs the initial GPU stats."""
    if self._partial_state.is_main_process:
      self._start_time = time.time()
      # Start a fresh measurement window so that the gap before this run is
      # never counted as a step interval.
      self._has_baseline = False
      self._num_intervals = 0
      logging.info('on_train_begin: %s', utils.gpu_stats_str())

  def on_train_end(
      self,
      args: TrainingArguments,
      state: TrainerState,
      control: TrainerControl,
      **kwargs,
  ):
    """Logs the run summary and appends it to `filename` when one was given."""
    if self._partial_state.is_main_process:
      train_time = time.time() - self._start_time
      logging.info(
          'training time %.2f s, throughput: %.2f s/it, peak_mem: %.2f GB',
          train_time,
          self._avg_throughput,
          self._peak_mem,
      )
      if self._filename:
        # One row per run: sequence length (in k tokens) | peak memory |
        # average step time | average TFLOPS/s.
        with open(self._filename, 'a') as out_f:
          out_f.write(
              f'{self._max_seq_length/1024.0:.1f}k | {self._peak_mem:.2f} |'
              f' {self._avg_throughput:.2f} | {self._avg_tflops_per_sec:.2f}\n'
          )

#### Build docker using gcloud build
Here we will add `cloudbuild.yaml` file to build and push docker container using gcloud builds.
**Note: gcloud docker build takes at least 15 mins to finish.**

In [None]:
%%writefile vertex-ai-samples/community-content/vertex_model_garden/cloudbuild.yaml
# Cloud Build config for the PEFT training image. Every $_* value is a user
# substitution supplied through `gcloud builds submit --substitutions=...`.
steps:
- name: 'gcr.io/cloud-builders/docker'
  # Build the image from train.Dockerfile, using the directory that holds this
  # file as the build context, and tag it with its Artifact Registry path.
  script: |
    docker build -t $_LOCATION-docker.pkg.dev/$_PROJECT_ID/$_REPO_NAME/$_DOCKER_IMAGE_NAME:$_TAG_NAME  -f model_oss/peft/train/vmg/dockerfile/train.Dockerfile .
  # Presumably exposes the substitutions to the script above as environment
  # variables -- confirm against the Cloud Build documentation.
  automapSubstitutions: true
# Image to publish; the tag must match the one used by `docker build` above.
images:
- '$_LOCATION-docker.pkg.dev/$_PROJECT_ID/$_REPO_NAME/$_DOCKER_IMAGE_NAME:$_TAG_NAME'

In [None]:
# Artifact Registry repository that will hold the customized training image.
REPOSITORY = "vmg-llama-repo"

# Image tag; names the customization made to the trainer (TFLOPS stats).
TAG_NAME = "tflops"

# Image name inside the repository.
DOCKER_IMAGE_NAME = "peft"

# 1. Create a repository.
# The repository is created in REGION, then all repositories are listed.

! gcloud artifacts repositories create {REPOSITORY} --repository-format=docker --location={REGION} --description="Docker repository" --quiet

! gcloud artifacts repositories list

# 2. Configure authentication to your private repo.

! gcloud auth configure-docker {REGION}-docker.pkg.dev --quiet

# 3. Build the docker image.
# The build is submitted from the directory containing cloudbuild.yaml, and the
# values above are passed in as the $_* substitutions used by that file.
# NOTE(review): the build itself always runs in us-central1, while the image is
# pushed to the repository in REGION -- confirm the hard-coded build region is
# intended.

! cd vertex-ai-samples/community-content/vertex_model_garden && gcloud builds submit --region=us-central1 \
--substitutions=_LOCATION={REGION},_PROJECT_ID={PROJECT_ID},_REPO_NAME={REPOSITORY},_TAG_NAME={TAG_NAME},_DOCKER_IMAGE_NAME={DOCKER_IMAGE_NAME} --config cloudbuild.yaml

### Set Finetuning Parameters

**Note**:
1. We recommend setting `finetuning_precision_mode` to `4bit` because it enables using fewer hardware resources for finetuning.
2. If `max_steps > 0`, it takes precedence over `epochs`. One can set a small `max_steps` value to quickly check the pipeline.

In [None]:
# Full Artifact Registry path of the customized training image.
TRAIN_DOCKER_URI = (
    f"{REGION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY}/{DOCKER_IMAGE_NAME}:{TAG_NAME}"
)

# Batch size for finetuning.
per_device_train_batch_size = 1  # @param{type:"integer"}
# Number of update steps to accumulate the gradients for, before performing a backward/update pass.
gradient_accumulation_steps = 4  # @param{type:"integer"}
# Maximum sequence length.
max_seq_length = 4096  # @param{type:"integer"}
# Setting a positive `max_steps` here will override `num_epochs`.
max_steps = -1  # @param{type:"integer"}
num_epochs = 1.0  # @param{type:"number"}
# Learning rate.
learning_rate = 5e-5  # @param{type:"number"}
# The scheduler type to use.
lr_scheduler_type = "cosine"  # @param{type:"string"}
# LoRA parameters.
lora_rank = 16  # @param{type:"integer"}
lora_alpha = 32  # @param{type:"integer"}
lora_dropout = 0.05  # @param{type:"number"}
# gradient checkpointing for the current model (may be referred to as activation checkpointing or checkpoint activations in other frameworks).
enable_gradient_checkpointing = True
# Attention implementation to use in the model.
attn_implementation = "flash_attention_2"
# The optimizer for which to schedule the learning rate.
optimizer = "adamw_torch"
# Define the proportion of training to be dedicated to a linear warmup where learning rate gradually increases.
# Kept as a string; it is only interpolated into the `--warmup_ratio` flag below.
warmup_ratio = "0.01"
# The list or string of integrations to report the results and logs to.
report_to = "tensorboard"
# Number of update steps between two checkpoint saves.
save_steps = 10
# Number of update steps between two logs.
logging_steps = save_steps
# Precision to use for training.
train_precision = "float16"


# All training outputs are placed under this folder of the staging bucket.
base_output_dir = os.path.join(STAGING_BUCKET, "modified_peft")
# GCS folder used to store the LoRA adapter.
lora_output_dir = os.path.join(base_output_dir, "adapter")
# GCS folder used to store the merged model with the base model and the
# finetuned LoRA adapter.
merged_model_output_dir = os.path.join(base_output_dir, "merged-model")
# Path of the "checkpoint-final" folder inside the LoRA adapter folder.
final_checkpoint = os.path.join(lora_output_dir, "checkpoint-final")

# Evaluation flags. Evaluation reuses the training template and column name and
# is given the same step interval as checkpoint saving.
eval_args = [
    f"--eval_dataset_path={eval_dataset_name}",
    f"--eval_column={instruct_column_in_dataset}",
    f"--eval_template={template}",
    f"--eval_split={eval_split_name}",
    f"--eval_steps={save_steps}",
    "--eval_tasks=builtin_eval",
    "--eval_metric_name=loss",
]

# Command-line flags passed to the training container.
# NOTE: HF_TOKEN is embedded in this list as plain text, so it shows up wherever
# the arguments are printed or logged.
training_args = [
    "--task=instruct-lora",
    "--completion_only=True",
    f"--pretrained_model_id={pretrained_model_id}",
    f"--dataset_name={train_dataset_name}",
    f"--train_split_name={train_split_name}",
    f"--instruct_column_in_dataset={instruct_column_in_dataset}",
    f"--output_dir={lora_output_dir}",
    f"--merge_base_and_lora_output_dir={merged_model_output_dir}",
    f"--per_device_train_batch_size={per_device_train_batch_size}",
    f"--gradient_accumulation_steps={gradient_accumulation_steps}",
    f"--lora_rank={lora_rank}",
    f"--lora_alpha={lora_alpha}",
    f"--lora_dropout={lora_dropout}",
    f"--max_steps={max_steps}",
    f"--max_seq_length={max_seq_length}",
    f"--learning_rate={learning_rate}",
    f"--lr_scheduler_type={lr_scheduler_type}",
    f"--train_precision={train_precision}",
    f"--enable_gradient_checkpointing={enable_gradient_checkpointing}",
    f"--num_epochs={num_epochs}",
    f"--attn_implementation={attn_implementation}",
    f"--optimizer={optimizer}",
    f"--warmup_ratio={warmup_ratio}",
    f"--report_to={report_to}",
    f"--logging_output_dir={base_output_dir}",
    f"--save_steps={save_steps}",
    f"--logging_steps={logging_steps}",
    f"--template={template}",
    f"--huggingface_access_token={HF_TOKEN}",
] + eval_args

### Local Finetuning

This section demonstrates how to finetune a Llama 3.1 model with the modified peft docker using a local run. The cell below prints a `docker run` command, which needs to be run in a local terminal.
A local environment with 8 Nvidia A100 80 GB GPUs or 8 Nvidia H100 80 GB GPUs is needed to run the command successfully.

In [None]:
import shlex

import torch

# Number of CUDA devices visible to this kernel.
num_of_gpus = torch.cuda.device_count()

# Pick the distributed-training config file that matches the local GPU count.
if num_of_gpus == 4:
    local_training_args = training_args + [
        "--config_file=vertex_vision_model_garden_peft/deepspeed_zero2_4gpu.yaml"
    ]
elif num_of_gpus == 8:
    local_training_args = training_args + [
        "--config_file=vertex_vision_model_garden_peft/llama_fsdp_8gpu.yaml"
    ]
else:
    raise ValueError(f"Unsupported number of GPUs for local training: {num_of_gpus}.")

# Shell-quote every argument so that values containing spaces or shell
# metacharacters survive being pasted into a terminal. Arguments made only of
# shell-safe characters are left unchanged by shlex.quote.
args = " ".join(shlex.quote(arg) for arg in local_training_args)
print("Run peft training with the following command:\n")

print(
    f"docker run --gpus=all --net=host --rm --shm-size=128gb {TRAIN_DOCKER_URI} {args}\n"
)

print("after running the command, check the following files for the training results:")
print("LoRA adapter will be saved in:", lora_output_dir)
print("Trained and merged models will be saved in:", merged_model_output_dir)

#### Verify added trainer stats
Once training starts, you should see log lines produced by the changes above, similar to the following:
"on_step_end: GPU memory: 26.50(occupied=15.17, unused=7.83, smi_diff=3.50) GB. Utilization: 48.00%, throughput: 12.08 s/it, flops: 463.39 tflops/s"

### Vertex AI finetuning

This section demonstrates how to finetune a Llama 3.1 model with the modified peft docker using Vertex AI run. **This section is expected to take at least 30 mins to finish.**

#### Set machine configuration

In [None]:
# Set accelerator type for training job. Accelerator type must be one of the following: NVIDIA_TESLA_A100, NVIDIA_A100_80GB, NVIDIA_H100_80GB
accelerator_type = "NVIDIA_A100_80GB"  # @param ["NVIDIA_TESLA_A100", "NVIDIA_A100_80GB", "NVIDIA_H100_80GB"]
# Set number of replicas to use for training.
replica_count = 1

# Worker pool spec: machine type and boot disk size for each supported
# accelerator. Every supported machine type carries 8 accelerators per node.
if accelerator_type == "NVIDIA_TESLA_A100":
    per_node_accelerator_count = 8
    machine_type = "a2-highgpu-8g"
    boot_disk_size_gb = 500
elif accelerator_type == "NVIDIA_A100_80GB":
    per_node_accelerator_count = 8
    machine_type = "a2-ultragpu-8g"
    boot_disk_size_gb = 500
elif accelerator_type == "NVIDIA_H100_80GB":
    per_node_accelerator_count = 8
    machine_type = "a3-highgpu-8g"
    boot_disk_size_gb = 2000
else:
    raise ValueError(
        f"Recommended machine settings not found for: {accelerator_type}. To use another accelerator type, edit this code block to set an appropriate `machine_type`, `accelerator_type`, and `per_node_accelerator_count` for the training job by clicking `Show Code` and then modifying the code."
    )

# Scheduling settings forwarded to the training job; they are the same for
# every supported accelerator type.
dws_kwargs = {
    "max_wait_duration": 1800,  # 30 minutes
    "scheduling_strategy": gca_custom_job_compat.Scheduling.Strategy.FLEX_START,
}

# Select the distributed-training config file from the replica count. A single
# replica uses the 8-GPU FSDP config; 2 to 4 replicas use an HSDP config named
# after the total GPU count. Any other value (including 0 or a negative number)
# has no matching config and is rejected.
if replica_count == 1:
    config_file = "vertex_vision_model_garden_peft/llama_fsdp_8gpu.yaml"
elif 1 < replica_count <= 4:
    config_file = (
        "vertex_vision_model_garden_peft/"
        f"llama_hsdp_{replica_count * per_node_accelerator_count}gpu.yaml"
    )
else:
    raise ValueError(
        f"Recommended config settings not found for replica_count: {replica_count}."
    )

# Check the accelerator quota for one training node in the selected project
# and region.
common_util.check_quota(
    project_id=PROJECT_ID,
    region=REGION,
    accelerator_type=accelerator_type,
    accelerator_count=per_node_accelerator_count,
    is_for_training=True,
)

#### Run training job

In [None]:
# Arguments for the Vertex AI run: the shared training arguments plus the
# distributed-training config file selected for this machine configuration.
vertex_training_args = training_args + [f"--config_file={config_file}"]
job_name = common_util.get_job_name_with_datetime("llama3_1-lora-train")

# Add labels for the finetuning job.
labels = {
    "mg-source": "notebook",
    "mg-notebook-name": "model_garden_llama3_1_finetuning_with_workbench.ipynb".split(
        "."
    )[0],
}
labels["mg-tune"] = "publishers-meta-models-llama3-1"
versioned_model_id = base_model_id.split("/")[1].lower().replace(".", "-")
labels["versioned-mg-tune"] = f"{labels['mg-tune']}-{versioned_model_id}"

# Pass training arguments and launch job.
train_job = aiplatform.CustomContainerTrainingJob(
    display_name=job_name,
    container_uri=TRAIN_DOCKER_URI,
    labels=labels,
)

# Print the arguments that are actually sent to the job (including
# --config_file). The Hugging Face token is masked so that it does not end up
# stored in the notebook output.
printable_training_args = [
    "--huggingface_access_token=<redacted>"
    if arg.startswith("--huggingface_access_token=")
    else arg
    for arg in vertex_training_args
]
print("Running training job with args:")
print(" \\\n".join(printable_training_args))
train_job.run(
    args=vertex_training_args,
    replica_count=replica_count,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=per_node_accelerator_count,
    boot_disk_size_gb=boot_disk_size_gb,
    service_account=SERVICE_ACCOUNT,
    base_output_dir=base_output_dir,
    **dws_kwargs,
)

#### Verify added trainer stats
Once training starts, you can open the training job link above and view the logs. In the logs, you should see lines produced by the changes above, similar to the following:
"on_step_end: GPU memory: 26.50(occupied=15.17, unused=7.83, smi_diff=3.50) GB. Utilization: 48.00%, throughput: 12.08 s/it, flops: 463.39 tflops/s"

## Deploy with vLLM on GPUs

In [None]:
# Wait for the training job if it has not recorded an end time yet.
if train_job.end_time is None:
    print("Waiting for the training job to finish...")
    train_job.wait()
    print("The training job has finished.")

# vLLM serving image used by the local docker command and the Vertex AI deployment.
VLLM_DOCKER_URI = (
    "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/"
    "pytorch-vllm-serve:20240819_0916_RC00"
)

# Prompt and sampling settings shared by the prediction requests below.
# If a request fails with `ServiceUnavailable: 503 Took too long to respond when processing`,
# lower the maximum number of output tokens, for example `max_tokens = 20`.
prompt, max_tokens = "What is a car?", 50
temperature, top_p = 1.0, 1.0
top_k = 1
raw_response = False

### Run Local Predictions
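This section prints a `docker run` command that can be run in a local terminal.
The local environment needs GPUs with enough memory for the chosen Llama 3.1 model for the command to run successfully. For reference, the Vertex AI deployment section below uses 1 L4 GPU for 8B models, 8 L4 GPUs for 70B models, and 8 H100 80 GB GPUs for 405B models.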

In [None]:
# Set Docker Arguments
GPU_DEVICES = 0
SOURCE_DIR = "/home/jupyter"
CODE_DIR = "/home/jupyter"

# Set vllm arguments
model_id = pretrained_model_id
accelerator_count = 1
gpu_memory_utilization = 0.95
max_model_len = 8192
dtype = "auto"
max_loras = 1
max_cpu_loras = 8
max_num_seqs = 256
enable_trust_remote_code = False
enforce_eager = False
enable_lora = True
model_type = None

vllm_args = [
    "python",
    "-m",
    "vllm.entrypoints.api_server",
    "--host=0.0.0.0",
    "--port=8080",
    f"--model={model_id}",
    f"--tensor-parallel-size={accelerator_count}",
    "--swap-space=16",
    f"--gpu-memory-utilization={gpu_memory_utilization}",
    f"--max-model-len={max_model_len}",
    f"--dtype={dtype}",
    f"--max-loras={max_loras}",
    f"--max-cpu-loras={max_cpu_loras}",
    f"--max-num-seqs={max_num_seqs}",
    "--disable-log-stats",
]

if enable_trust_remote_code:
    vllm_args.append("--trust-remote-code")

if enforce_eager:
    vllm_args.append("--enforce-eager")

if enable_lora:
    vllm_args.append("--enable-lora")

if model_type:
    vllm_args.append(f"--model-type={model_type}")

docker_cmd_part = f"docker run -t --rm --gpus=all --net=host --shm-size 32gb  --volume {SOURCE_DIR}:{CODE_DIR}  -p 7080:7080  -e NVIDIA_VISIBLE_DEVICES={GPU_DEVICES}"
if HF_TOKEN:
    docker_cmd_part += f" -e HF_TOKEN={HF_TOKEN}"
docker_args = " ".join(vllm_args)
cmd = f"{docker_cmd_part} {VLLM_DOCKER_URI} {docker_args}"

print(f"run below command in local terminal to start the container:\n {cmd}")

Once the above command has run successfully, you will see the server running on port 7080.
You can use the cell below to run predictions on the vLLM server.

In [None]:
import json
import shlex

# Request body for the local server's /generate route, built from the prompt
# and sampling settings defined earlier in the notebook.
vllm_request_data = {
    "prompt": prompt,
    "max_tokens": max_tokens,
    "temperature": temperature,
    "top_p": top_p,
    "top_k": top_k,
    "raw_response": raw_response,
}

curl_data = json.dumps(vllm_request_data)
# shlex.quote wraps the JSON in single quotes and escapes any single quote
# inside it, so prompts containing apostrophes still yield a valid command.
quoted_curl_data = shlex.quote(curl_data)
cmd = f"curl --header 'Content-Type: application/json'  --request POST  --data {quoted_curl_data}  http://localhost:7080/generate"

print(f"run below command in local terminal to send request to the container:\n {cmd}")

### Deploy with Vertex AI
This section uploads the model to Model Registry and deploys it to an endpoint. It takes 15 minutes to 1 hour to finish, depending on the size of the model.

In [None]:
# Wait for the training job if it has not recorded an end time yet.
if train_job.end_time is None:
    print("Waiting for the training job to finish...")
    train_job.wait()
    print("The training job has finished.")

# @markdown Set `use_dedicated_endpoint` to False if you don't want to use [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint).
use_dedicated_endpoint = True  # @param {type:"boolean"}

# Find Vertex AI prediction supported accelerators and regions [here](https://cloud.google.com/vertex-ai/docs/predictions/configure-compute).
# The serving hardware is chosen from the model size tag found in `base_model_id`
# (matched case-insensitively); each branch sets machine type, accelerator type
# and accelerator count together.
if "8b" in base_model_id.lower():
    machine_type, accelerator_type, per_node_accelerator_count = (
        "g2-standard-12",
        "NVIDIA_L4",
        1,
    )
elif "70b" in base_model_id.lower():
    machine_type, accelerator_type, per_node_accelerator_count = (
        "g2-standard-96",
        "NVIDIA_L4",
        8,
    )
elif "405b" in base_model_id.lower():
    machine_type, accelerator_type, per_node_accelerator_count = (
        "a3-highgpu-8g",
        "NVIDIA_H100_80GB",
        8,
    )
else:
    raise ValueError(f"Unsupported model ID or GCS path: {base_model_id}.")


def get_deploy_source() -> str:
    """Maps the `VERTEX_PRODUCT` environment variable to a deploy-source label.

    Returns:
        "notebook_colab_enterprise" when the variable is "COLAB_ENTERPRISE",
        "notebook_workbench" when it is "WORKBENCH_INSTANCE", and
        "notebook_environment_unspecified" for any other value or when the
        variable is not set.
    """
    sources_by_product = {
        "COLAB_ENTERPRISE": "notebook_colab_enterprise",
        "WORKBENCH_INSTANCE": "notebook_workbench",
    }
    # Legacy workbench, legacy colab, or other custom environments fall back to
    # the unspecified label.
    return sources_by_product.get(
        os.environ.get("VERTEX_PRODUCT", ""),
        "notebook_environment_unspecified",
    )


def deploy_model_vllm(
    model_name: str,
    model_id: str,
    publisher: str,
    publisher_model_id: str,
    service_account: str,
    base_model_id: str = None,
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    gpu_memory_utilization: float = 0.9,
    max_model_len: int = 4096,
    dtype: str = "auto",
    enable_trust_remote_code: bool = False,
    enforce_eager: bool = False,
    enable_lora: bool = False,
    enable_chunked_prefill: bool = False,
    enable_prefix_cache: bool = False,
    host_prefix_kv_cache_utilization_target: float = 0.0,
    max_loras: int = 1,
    max_cpu_loras: int = 8,
    use_dedicated_endpoint: bool = False,
    max_num_seqs: int = 256,
    model_type: str = None,
    enable_llama_tool_parser: bool = False,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a vLLM-served model to Vertex AI and deploys it to a new endpoint.

    Args:
        model_name: Display name of the uploaded model; the endpoint is named
            `<model_name>-endpoint`.
        model_id: Value passed to the vLLM server as `--model`.
        publisher: Publisher part of the Model Garden source model name.
        publisher_model_id: Model part of the Model Garden source model name.
        service_account: Service account used for the deployment.
        base_model_id: Value of the `MODEL_ID` environment variable in the
            serving container; falls back to `model_id` when empty.
        machine_type: Machine type for the deployment.
        accelerator_type: Accelerator type for the deployment.
        accelerator_count: Accelerator count for the deployment; also used as
            `--tensor-parallel-size`.
        gpu_memory_utilization: Value of `--gpu-memory-utilization`.
        max_model_len: Value of `--max-model-len`.
        dtype: Value of `--dtype`.
        enable_trust_remote_code: Adds `--trust-remote-code` when true.
        enforce_eager: Adds `--enforce-eager` when true.
        enable_lora: Adds `--enable-lora` when true.
        enable_chunked_prefill: Adds `--enable-chunked-prefill` when true.
        enable_prefix_cache: Adds `--enable-prefix-caching` when true.
        host_prefix_kv_cache_utilization_target: Passed as
            `--host-prefix-kv-cache-utilization-target` only when strictly
            between 0 and 1.
        max_loras: Value of `--max-loras`.
        max_cpu_loras: Value of `--max-cpu-loras`.
        use_dedicated_endpoint: Whether the endpoint is created as a dedicated
            endpoint.
        max_num_seqs: Value of `--max-num-seqs`.
        model_type: Passed as `--model-type` when set.
        enable_llama_tool_parser: Adds `--enable-auto-tool-choice` and
            `--tool-call-parser=vertex-llama-3` when true.

    Returns:
        The uploaded model and the endpoint it was deployed to.
    """
    # The endpoint is created first, before the model is uploaded.
    endpoint = aiplatform.Endpoint.create(
        display_name=f"{model_name}-endpoint",
        dedicated_endpoint_enabled=use_dedicated_endpoint,
    )

    # Fall back to the served model ID when no base model ID is given.
    base_model_id = base_model_id or model_id

    # See https://docs.vllm.ai/en/latest/models/engine_args.html for a list of possible arguments with descriptions.
    vllm_args = [
        "python",
        "-m",
        "vllm.entrypoints.api_server",
        "--host=0.0.0.0",
        "--port=8080",
        f"--model={model_id}",
        f"--tensor-parallel-size={accelerator_count}",
        "--swap-space=16",
        f"--gpu-memory-utilization={gpu_memory_utilization}",
        f"--max-model-len={max_model_len}",
        f"--dtype={dtype}",
        f"--max-loras={max_loras}",
        f"--max-cpu-loras={max_cpu_loras}",
        f"--max-num-seqs={max_num_seqs}",
        "--disable-log-stats",
    ]

    # Simple on/off switches, appended in this order when enabled.
    switch_flags = (
        (enable_trust_remote_code, "--trust-remote-code"),
        (enforce_eager, "--enforce-eager"),
        (enable_lora, "--enable-lora"),
        (enable_chunked_prefill, "--enable-chunked-prefill"),
        (enable_prefix_cache, "--enable-prefix-caching"),
    )
    vllm_args.extend(flag for enabled, flag in switch_flags if enabled)

    # Only values strictly between 0 and 1 are forwarded.
    if 0 < host_prefix_kv_cache_utilization_target < 1:
        vllm_args.append(
            f"--host-prefix-kv-cache-utilization-target={host_prefix_kv_cache_utilization_target}"
        )

    if model_type:
        vllm_args.append(f"--model-type={model_type}")

    if enable_llama_tool_parser:
        vllm_args.extend(
            ["--enable-auto-tool-choice", "--tool-call-parser=vertex-llama-3"]
        )

    env_vars = {"MODEL_ID": base_model_id, "DEPLOY_SOURCE": "notebook"}

    # HF_TOKEN is not a compulsory field and may not be defined.
    try:
        hf_token = HF_TOKEN
    except NameError:
        hf_token = None
    if hf_token:
        env_vars["HF_TOKEN"] = hf_token

    source_model_name = f"publishers/{publisher}/models/{publisher_model_id}"
    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=VLLM_DOCKER_URI,
        serving_container_args=vllm_args,
        serving_container_ports=[8080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables=env_vars,
        serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
        serving_container_deployment_timeout=7200,
        model_garden_source_model_name=source_model_name,
    )
    print(
        f"Deploying {model_name} on {machine_type} with {accelerator_count} {accelerator_type} GPU(s)."
    )
    deployment_labels = {
        "NOTEBOOK_NAME": "model_garden_llama3_1_finetuning_with_workbench.ipynb",
        "NOTEBOOK_ENVIRONMENT": get_deploy_source(),
    }
    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
        system_labels=deployment_labels,
    )
    print("endpoint_name:", endpoint.name)

    return model, endpoint


# Use FP8 base model for 405B since original model does not fit.
# The suffix is only appended when the ID does not already end with it, so an
# ID that already names the FP8 variant is left unchanged.
deploy_pretrained_model_id = pretrained_model_id
if (
    "Meta-Llama-3.1-405B" in deploy_pretrained_model_id
    and not deploy_pretrained_model_id.endswith("-FP8")
):
    deploy_pretrained_model_id += "-FP8"
print("Deploying model in:", deploy_pretrained_model_id)

# Check the serving accelerator quota in the selected project and region
# before deploying.
common_util.check_quota(
    project_id=PROJECT_ID,
    region=REGION,
    accelerator_type=accelerator_type,
    accelerator_count=per_node_accelerator_count,
    is_for_training=False,
)

# Deploy with LoRA support enabled and keep the resulting model and endpoint
# under the "vllm_gpu" key. `gpu_memory_utilization` and `max_model_len` are
# the values set in the local prediction cell above.
models["vllm_gpu"], endpoints["vllm_gpu"] = deploy_model_vllm(
    model_name=common_util.get_job_name_with_datetime(prefix="llama3_1-vllm-serve"),
    model_id=deploy_pretrained_model_id,
    publisher="meta",
    publisher_model_id="llama3_1",
    service_account=SERVICE_ACCOUNT,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=per_node_accelerator_count,
    gpu_memory_utilization=gpu_memory_utilization,
    max_model_len=max_model_len,
    enable_lora=True,
    use_dedicated_endpoint=use_dedicated_endpoint,
)

Once deployment succeeds, you can send requests to the endpoint with text prompts. Sampling parameters supported by vLLM can be found [here](https://docs.vllm.ai/en/latest/dev/sampling_params.html).
Additionally, you can moderate the generated text with Vertex AI. See [Moderate text documentation](https://cloud.google.com/natural-language/docs/moderating-text) for more details.

In [None]:
def predict_vllm(
    prompt: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
    raw_response: bool,
    lora_weight: str = "",
):
    """Sends one prompt to the `vllm_gpu` endpoint and prints each prediction.

    Args:
        prompt: Text prompt placed in the request.
        max_tokens: `max_tokens` value placed in the request.
        temperature: `temperature` value placed in the request.
        top_p: `top_p` value placed in the request.
        top_k: `top_k` value placed in the request.
        raw_response: `raw_response` value placed in the request.
        lora_weight: When non-empty, added to the request under the
            `dynamic-lora` key.
    """
    # Parameters for inference.
    request = dict(
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        raw_response=raw_response,
    )
    if lora_weight:
        request["dynamic-lora"] = lora_weight

    # The endpoint and the dedicated-endpoint switch are notebook-level globals.
    result = endpoints["vllm_gpu"].predict(
        instances=[request], use_dedicated_endpoint=use_dedicated_endpoint
    )

    for generated in result.predictions:
        print(generated)


# Send the sample prompt with the sampling settings defined earlier in the
# notebook. No `lora_weight` is passed, so the request carries no
# `dynamic-lora` entry.
predict_vllm(
    prompt=prompt,
    max_tokens=max_tokens,
    temperature=temperature,
    top_p=top_p,
    top_k=top_k,
    raw_response=raw_response,
)

## Clean up resources

In [None]:
# Delete the training job launched earlier in this notebook.
if train_job:
    train_job.delete()

# @markdown  Delete the experiment models and endpoints to recycle the resources
# @markdown  and avoid unnecessary continuous charges that may incur.

# Undeploy model and delete endpoint.
# `force=True` is passed so that an endpoint with deployed models can be deleted.
for endpoint in endpoints.values():
    endpoint.delete(force=True)

# Delete models.
for model in models.values():
    model.delete()

# Bucket removal is off by default; set `delete_bucket` to True to also run the
# recursive `gsutil rm` on `BUCKET_NAME`.
delete_bucket = False  # @param {type:"boolean"}
if delete_bucket:
    ! gsutil -m rm -r $BUCKET_NAME