In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Fine-tuning with Axolotl

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_axolotl_finetuning.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_axolotl_finetuning.ipynb">
      <img alt="GitHub logo" src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview
This notebook demonstrates fine-tuning using [Axolotl](https://github.com/axolotl-ai-cloud/axolotl). Axolotl streamlines AI model fine-tuning by providing a wide range of training recipes and supporting multiple configurations and architectures.

You can fine-tune with Axolotl using either a Colab Enterprise runtime or Vertex AI training.
The Colab Enterprise runtime has the following advantages:
- **Sanity check for flags**: Use the Colab Enterprise runtime to sanity-check Axolotl flags before running the job on Vertex AI training.
- **Quick experimentation**: Use the Colab Enterprise runtime to experiment quickly with Axolotl flags. The [max-steps](https://github.com/axolotl-ai-cloud/axolotl/blob/8fb72cbc0b94129141bae5fa4d84edd23b648af6/docs/config.qmd#L360) flag is useful to limit the training time.
- **Debugging**: Use the Colab Enterprise runtime to debug Axolotl fine-tuning. This can be more efficient because debugging on Vertex AI training involves waiting for resources to be provisioned, which adds delays. It is also easier to add debug statements on the Colab Enterprise runtime than on Vertex AI training.

Once the local fine-tuning is verified, the Vertex AI training is the recommended way to run the fine-tuning. Vertex AI training has several advantages, including:
- **Running multiple training jobs in parallel**: This can be useful for hyperparameter tuning or running experiments with different datasets etc.
- **High-end GPUs**: Vertex AI training provides access to higher-end GPUs like the H100, which can be crucial if you encounter out-of-memory (OOM) errors.
- **[DWS support](https://cloud.google.com/vertex-ai/docs/training/schedule-jobs-dws)**: Dynamic Workload Scheduler (DWS) makes Vertex AI training more cost-effective and easier to manage, especially in scenarios where GPU availability is a concern.

Refer to [this documentation](https://cloud.google.com/vertex-ai/docs/training/overview#vertexi-ai-operationalizes-training-at-scale) for more details on Vertex AI training advantages.

### Objective
- Train a model using Axolotl in a local Colab Enterprise runtime.
- Train a model using Axolotl with Vertex AI Training.

## Setup Colab Runtime
**You need to set up the Colab runtime with an L4 or A100 GPU if you want to run local fine-tuning. The following sections perform the setup for an L4 GPU.**
To learn more about creating runtime, you can optionally read [this](https://cloud.google.com/colab/docs/create-runtime).

**Note: Make sure to create a runtime with an appropriate machine type and GPU type to avoid out-of-memory issues. [Refer to this tool](https://huggingface.co/spaces/hf-accelerate/model-memory-usage) to decide which machine type and GPU type to select.**

In [None]:
# @title Create runtime
# @markdown This cell creates a runtime template and then creates a runtime using that template.
# @markdown **If you have already created a runtime, you can skip this cell.**
# @markdown This cell can take up to 5 minutes to run.
# @markdown After the cell execution finishes, you have to connect manually to the runtime by following [the instructions here](https://cloud.google.com/colab/docs/connect-to-runtime).

import os
import uuid
import re

RUNTIME_PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]
RUNTIME_REGION = os.environ["GOOGLE_CLOUD_REGION"]

RUNTIME_ACCELERATOR_TYPE = "NVIDIA_L4"  # @param ["NVIDIA_L4", "NVIDIA_TESLA_A100", "NVIDIA_A100_80GB"]
RUNTIME_ACCELERATOR_COUNT = "1"  # @param [1, 2, 4, 8, 16]
RUNTIME_ACCELERATOR_COUNT = int(RUNTIME_ACCELERATOR_COUNT)

if RUNTIME_ACCELERATOR_TYPE == "NVIDIA_L4" and RUNTIME_ACCELERATOR_COUNT == 1:
  RUNTIME_MACHINE_TYPE = "g2-standard-8"
elif RUNTIME_ACCELERATOR_TYPE == "NVIDIA_L4" and RUNTIME_ACCELERATOR_COUNT == 2:
  RUNTIME_MACHINE_TYPE = "g2-standard-24"
elif RUNTIME_ACCELERATOR_TYPE == "NVIDIA_L4" and RUNTIME_ACCELERATOR_COUNT == 4:
  RUNTIME_MACHINE_TYPE = "g2-standard-48"
elif RUNTIME_ACCELERATOR_TYPE == "NVIDIA_L4" and RUNTIME_ACCELERATOR_COUNT == 8:
  RUNTIME_MACHINE_TYPE = "g2-standard-96"
elif RUNTIME_ACCELERATOR_TYPE == "NVIDIA_TESLA_A100" and RUNTIME_ACCELERATOR_COUNT != 16:
  RUNTIME_MACHINE_TYPE = f"a2-highgpu-{RUNTIME_ACCELERATOR_COUNT}g"
elif RUNTIME_ACCELERATOR_TYPE == "NVIDIA_TESLA_A100" and RUNTIME_ACCELERATOR_COUNT == 16:
  RUNTIME_MACHINE_TYPE = "a2-megagpu-16g"
elif RUNTIME_ACCELERATOR_TYPE == "NVIDIA_A100_80GB":
  assert RUNTIME_ACCELERATOR_COUNT in [1, 2, 4, 8], "Only 1, 2, 4, 8 A100-80GB are supported."
  RUNTIME_MACHINE_TYPE = f"a2-ultragpu-{RUNTIME_ACCELERATOR_COUNT}g"

uuid = uuid.uuid4()
RUNTIME_DISPLAY_NAME = f"axolotl-{RUNTIME_ACCELERATOR_TYPE}-{RUNTIME_ACCELERATOR_COUNT}-{uuid}"

# create runtime template
shell_output = ! gcloud colab runtime-templates create --display-name=$RUNTIME_DISPLAY_NAME \
  --project=$RUNTIME_PROJECT_ID --region=$RUNTIME_REGION \
  --machine-type=$RUNTIME_MACHINE_TYPE --accelerator-type=$RUNTIME_ACCELERATOR_TYPE \
  --accelerator-count=$RUNTIME_ACCELERATOR_COUNT --disk-type=PD_BALANCED
shell_output = "\n".join(shell_output)
print(shell_output)
RUNTIME_TEMPLATE_ID = re.search(r"projects/.*/locations/.*/notebookRuntimeTemplates/(\d+)", shell_output).group(1)

# create runtime
shell_output = ! gcloud colab runtimes create --display-name=$RUNTIME_DISPLAY_NAME \
  --runtime-template=$RUNTIME_TEMPLATE_ID --project=$RUNTIME_PROJECT_ID \
  --region=$RUNTIME_REGION
shell_output = "\n".join(shell_output)
print(shell_output)
RUNTIME_ID = re.search(r"projects/.*/locations/.*/notebookRuntimes/(\d+)", shell_output).group(1)

# start runtime
! gcloud colab runtimes start $RUNTIME_ID --project=$RUNTIME_PROJECT_ID --region=$RUNTIME_REGION

## Before you begin

In [None]:
# @title Import utility packages for fine-tuning

# Import the necessary packages.
! rm -rf vertex-ai-samples && git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git
! cd vertex-ai-samples

# Import the necessary packages.

import datetime
import importlib
import os
import pathlib
import uuid
from typing import Tuple

import requests
import yaml
from google.cloud import aiplatform

# Load the shared notebook helpers from the repo cloned above. importlib is
# used because the dotted path contains hyphens ("vertex-ai-samples",
# "community-content"), which a plain `import` statement cannot express.
common_util = importlib.import_module(
    "vertex-ai-samples.community-content.vertex_model_garden.model_oss.notebook_util.common_util"
)

# Notebook-level placeholders initialised to empty values.
# NOTE(review): presumably populated by the Vertex AI training/deployment
# cells further down — confirm against the rest of the notebook.
train_job = None
models, endpoints = {}, {}

In [None]:
# @title Setup Google Cloud project

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 2. For finetuning, follow [these instructions](https://cloud.google.com/vertex-ai/docs/training/schedule-jobs-dws) to use Dynamic Workload Scheduler. For Dynamic Workload Scheduler, check the [us-central1](https://console.cloud.google.com/iam-admin/quotas?location=us-central1&metric=aiplatform.googleapis.com%2Fcustom_model_training_preemptible_nvidia_h100_gpus) or [europe-west4](https://console.cloud.google.com/iam-admin/quotas?location=europe-west4&metric=aiplatform.googleapis.com%2Fcustom_model_training_preemptible_nvidia_h100_gpus) quota for Nvidia H100 GPUs, and [us-central1](https://console.cloud.google.com/iam-admin/quotas?location=us-central1&metric=aiplatform.googleapis.com%2Fcustom_model_training_preemptible_nvidia_a100_gpus) quota for Nvidia Tesla A100 GPUs. To train using L4 gpus with default quota, check [us-central1](https://console.cloud.google.com/iam-admin/quotas?location=us-central1&metric=aiplatform.googleapis.com%2Fcustom_model_training_preemptible_nvidia_l4_gpus) quota for Nvidia L4 GPUs. If you do not have enough GPUs, then you can follow [these instructions](https://cloud.google.com/docs/quotas/view-manage#viewing_your_quota_console) to request quota.

# @markdown 3. For serving, **[click here](https://console.cloud.google.com/iam-admin/quotas?location=us-central1&metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_l4_gpus)** to check if your project already has the required 1 L4 GPU in the us-central1 region.  If yes, then run this notebook in the us-central1 region. If you need more L4 GPUs for your project, then you can follow [these instructions](https://cloud.google.com/docs/quotas/view-manage#viewing_your_quota_console) to request more. Alternatively, if you want to run predictions with A100 80GB or H100 GPUs, we recommend using the regions listed below. **NOTE:** Make sure you have associated quota in selected regions. Click the links to see your current quota for each GPU type: [Nvidia A100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_a100_80gb_gpus), [Nvidia H100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_h100_gpus).

# @markdown > | Machine Type | Accelerator Type | Recommended Regions |
# @markdown | ----------- | ----------- | ----------- |
# @markdown | a2-ultragpu-1g | 1 NVIDIA_A100_80GB | us-central1, us-east4, europe-west4, asia-southeast1 |
# @markdown | a3-highgpu-2g | 2 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |
# @markdown | a3-highgpu-4g | 4 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |
# @markdown | a3-highgpu-8g | 8 NVIDIA_H100_80GB | us-central1, us-east5, europe-west4, us-west1, asia-southeast1 |

# @markdown 4. **[Optional]** [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs. Set the BUCKET_URI for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the same region as where the notebook was launched. Note that a multi-region bucket (eg. "us") is not considered a match for a single region covered by the multi-region range (eg. "us-central1"). If not set, a unique GCS bucket will be created instead.

BUCKET_URI = "gs://"  # @param {type:"string"}

# @markdown 5. **[Optional]** Set region. If not set, the region will be set automatically according to Colab Enterprise environment.

REGION = ""  # @param {type:"string"}


# Get the default cloud project id.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# Get the default region for launching jobs.
if not REGION:
    if not os.environ.get("GOOGLE_CLOUD_REGION"):
        raise ValueError(
            "REGION must be set. See"
            " https://cloud.google.com/vertex-ai/docs/general/locations for"
            " available cloud locations."
        )
    REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Enable the Vertex AI API and Compute Engine API, if not already.
print("Enabling Vertex AI API and Compute Engine API.")
! gcloud services enable aiplatform.googleapis.com compute.googleapis.com

# Cloud Storage bucket for storing the experiment artifacts.
# A unique GCS bucket will be created for the purpose of this notebook. If you
# prefer using your own GCS bucket, change the value yourself below.
now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])

if BUCKET_URI is None or BUCKET_URI.strip() == "" or BUCKET_URI == "gs://":
    BUCKET_URI = f"gs://{PROJECT_ID}-tmp-{now}-{str(uuid.uuid4())[:4]}"
    BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])
    ! gsutil mb -l {REGION} {BUCKET_URI}
else:
    assert BUCKET_URI.startswith("gs://"), "BUCKET_URI must start with `gs://`."
    shell_output = ! gsutil ls -Lb {BUCKET_NAME} | grep "Location constraint:" | sed "s/Location constraint://"
    bucket_region = shell_output[0].strip().lower()
    if bucket_region != REGION:
        raise ValueError(
            "Bucket region %s is different from notebook region %s"
            % (bucket_region, REGION)
        )
print(f"Using this GCS Bucket: {BUCKET_URI}")

STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")
MODEL_BUCKET = os.path.join(BUCKET_URI, "axolotl")


# Initialize Vertex AI API.
print("Initializing Vertex AI API.")
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

# Gets the default SERVICE_ACCOUNT.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)


# Provision permissions to the SERVICE_ACCOUNT with the GCS bucket
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.admin $BUCKET_NAME

! gcloud config set project $PROJECT_ID
! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role="roles/storage.admin"
! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role="roles/aiplatform.user"

## Finetune with Axolotl

In [None]:
# @title Set Axolotl config

# @markdown You can use below axolotl configs taken from [examples directory](https://github.com/axolotl-ai-cloud/axolotl/tree/8fb72cbc0b94129141bae5fa4d84edd23b648af6/examples), which have been verified by model garden team through internal testing. Note that we have used A100 80GB and H100 80GB GPU for testing.
# @markdown > | Model Name | Base Model | Axolotl Config |
# @markdown | ----------- | ----------- | ----------- |
# @markdown | code-llama | codellama/CodeLlama-7b-hf | examples/code-llama/7b/lora.yml |
# @markdown | code-llama | codellama/CodeLlama-7b-hf | examples/code-llama/7b/qlora.yml |
# @markdown | code-llama | codellama/CodeLlama-13b-hf | examples/code-llama/13b/lora.yml |
# @markdown | code-llama | codellama/CodeLlama-13b-hf | examples/code-llama/13b/qlora.yml |
# @markdown | code-llama | codellama/CodeLlama-34b-hf | examples/code-llama/34b/lora.yml |
# @markdown | code-llama | codellama/CodeLlama-34b-hf | examples/code-llama/34b/qlora.yml |
# @markdown | falcon | tiiuae/falcon-7b | examples/falcon/config-7b-lora.yml |
# @markdown | falcon | tiiuae/falcon-7b | examples/falcon/config-7b.yml |
# @markdown | gemma | google/gemma-7b | examples/gemma/qlora.yml |
# @markdown | llama-2 | NousResearch/Llama-2-7b-hf | examples/llama-2/fft_optimized.yml |
# @markdown | llama-2 | NousResearch/Llama-2-7b-hf | examples/llama-2/loftq.yml |
# @markdown | llama-2 | NousResearch/Llama-2-7b-hf | examples/llama-2/lora.yml |
# @markdown | llama-2 | NousResearch/Llama-2-7b-hf | examples/llama-2/qlora-fsdp.yml |
# @markdown | llama-2 | NousResearch/Llama-2-7b-hf | examples/llama-2/qlora.yml |
# @markdown | llama-3 | NousResearch/Meta-Llama-3.1-8B | examples/llama-3/fft-8b.yaml |
# @markdown | llama-3 | NousResearch/Meta-Llama-3-8B-Instruct | examples/llama-3/instruct-lora-8b.yml |
# @markdown | llama-3 | NousResearch/Meta-Llama-3-8B | examples/llama-3/lora-8b.yml |
# @markdown | llama-3 | NousResearch/Llama-3.2-1B | examples/llama-3/qlora-1b.yml |
# @markdown | llama-3 | casperhansen/llama-3-70b-fp16 | examples/llama-3/qlora-fsdp-70b.yaml |
# @markdown | mistral | mistralai/Mistral-7B-v0.1 | examples/mistral/config.yml |
# @markdown | mistral | mistralai/Mistral-7B-v0.1 | examples/mistral/lora-mps.yml |
# @markdown | mistral | mistralai/Mistral-7B-v0.1 | examples/mistral/lora.yml |
# @markdown | mistral | mistralai/Mistral-7B-v0.1 | examples/mistral/mistral-qlora-orpo.yml |
# @markdown | mistral | mistral-community/Mixtral-8x22B-v0.1 | examples/mistral/mixtral-8x22b-qlora-fsdp.yml |
# @markdown | mistral | mistralai/Mixtral-8x7B-v0.1 | examples/mistral/mixtral-qlora-fsdp.yml |
# @markdown | mistral | mistralai/Mistral-7B-v0.1 | examples/mistral/qlora.yml |
# @markdown | openllama-3b | openlm-research/open_llama_3b_v2 | examples/openllama-3b/config.yml |
# @markdown | openllama-3b | openlm-research/open_llama_3b_v2 | examples/openllama-3b/lora.yml |
# @markdown | openllama-3b | openlm-research/open_llama_3b_v2 | examples/openllama-3b/qlora.yml |
# @markdown | phi | microsoft/Phi-3.5-mini-instruct | examples/phi/lora-3.5.yaml |
# @markdown | phi | microsoft/phi-1_5 | examples/phi/phi-ft.yml |
# @markdown | phi | microsoft/phi-1_5 | examples/phi/phi-qlora.yml |
# @markdown | phi | microsoft/phi-2 | examples/phi/phi2-ft.yml |
# @markdown | phi | microsoft/Phi-3-mini-4k-instruct | examples/phi/phi3-ft.yml |
# @markdown | qwen | Qwen/Qwen1.5-MoE-A2.7B | examples/qwen/qwen2-moe-lora.yaml |
# @markdown | qwen | Qwen/Qwen1.5-MoE-A2.7B | examples/qwen/qwen2-moe-qlora.yaml |
# @markdown | qwen2 | Qwen/Qwen2.5-0.5B | examples/qwen2/dpo.yaml |
# @markdown | qwen2 | Qwen/Qwen2-7B | examples/qwen2/qlora-fsdp.yaml |
# @markdown | tiny-llama | TinyLlama/TinyLlama_v1.1 | examples/tiny-llama/lora-mps.yml |
# @markdown | tiny-llama | TinyLlama/TinyLlama_v1.1 | examples/tiny-llama/lora.yml |
# @markdown | tiny-llama | TinyLlama/TinyLlama-1.1B-Chat-v1.0 | examples/tiny-llama/pretrain.yml |
# @markdown | tiny-llama | TinyLlama/TinyLlama_v1.1 | examples/tiny-llama/qlora.yml |

# @markdown 1. Set Axolotl config source.<br>
# @markdown For `GITHUB` as source, you can explore different Axolotl configurations in the [examples directory](https://github.com/axolotl-ai-cloud/axolotl/tree/8fb72cbc0b94129141bae5fa4d84edd23b648af6/examples). For `GITHUB` source, `AXOLOTL_CONFIG_PATH` should start with `examples/`. e.g. examples/tiny-llama/lora.yml.<br>
# @markdown For `LOCAL` as source, create Axolotl config yaml file and specify correct path below. Note that, the local file will be copied to GCS bucket before running Vertex AI training job. For `LOCAL` source, `AXOLOTL_CONFIG_PATH` should be a complete path of the config file. e.g. /content/lora.yml.<br>
# @markdown For `GCS` as source, specify the GCS URI to the Axolotl config file. Make sure the file is accessible to service account used in the notebook. For `GCS` source, `AXOLOTL_CONFIG_PATH` should be a complete GCS URI of the config file. e.g. gs://bucket/path/to/config/file.yml.

AXOLOTL_SOURCE = "GITHUB"  # @param ["GITHUB", "LOCAL", "GCS"]

# @markdown 2. Set the Axolotl config file path.
AXOLOTL_CONFIG_PATH = "examples/tiny-llama/lora.yml"  # @param {type:"string"}

assert AXOLOTL_CONFIG_PATH, "AXOLOTL_CONFIG_PATH must be set."

# Load the selected config into `axolotl_config` so later cells can inspect it
# (for example, to check which adapter type is configured).
if AXOLOTL_SOURCE == "GITHUB":
    assert AXOLOTL_CONFIG_PATH.startswith(
        "examples/"
    ), "AXOLOTL_CONFIG_PATH must start with examples/ for GITHUB source."
    # Pinned to the same commit that the `Install Axolotl` cell checks out.
    github_url = f"https://github.com/axolotl-ai-cloud/axolotl/raw/8fb72cbc0b94129141bae5fa4d84edd23b648af6/{AXOLOTL_CONFIG_PATH}"
    response = requests.get(github_url, timeout=60)
    # Fail loudly on 404 and similar errors; otherwise the error page body
    # would be parsed as if it were the YAML config.
    response.raise_for_status()
    axolotl_config = yaml.safe_load(response.content.decode("utf-8"))
elif AXOLOTL_SOURCE == "LOCAL":
    config_path = pathlib.Path(AXOLOTL_CONFIG_PATH)
    assert config_path.exists(), "AXOLOTL_CONFIG_PATH must exist for LOCAL source."
    axolotl_config = yaml.safe_load(config_path.read_text())
elif AXOLOTL_SOURCE == "GCS":
    local_path = pathlib.Path("/content/tmp/axolotl_config.yml")
    # Make sure the download target directory exists.
    # NOTE(review): the helper may already create it — harmless either way.
    local_path.parent.mkdir(parents=True, exist_ok=True)
    common_util.download_gcs_file_to_local(AXOLOTL_CONFIG_PATH, local_path.absolute())
    axolotl_config = yaml.safe_load(local_path.read_text())
    # From here on the config is referenced through its GCS FUSE path.
    AXOLOTL_CONFIG_PATH = common_util.gcs_fuse_path(AXOLOTL_CONFIG_PATH)
else:
    raise ValueError(f"Unsupported AXOLOTL_SOURCE: {AXOLOTL_SOURCE}")

# An empty or malformed file parses to None or a scalar; reject it here rather
# than in a later cell.
if not isinstance(axolotl_config, dict):
    raise ValueError(
        f"The Axolotl config at {AXOLOTL_CONFIG_PATH} is not a valid YAML mapping."
    )

OUTPUT_GCS_URI = MODEL_BUCKET

if not OUTPUT_GCS_URI.startswith("gs://"):
    OUTPUT_GCS_URI = f"gs://{OUTPUT_GCS_URI}"

In [None]:
# @title **[Optional]** Setup HF token
# @markdown Some models like Gemma2, Mistral, Llama3 etc require a token to access with [gated access from huggingface](https://huggingface.co/docs/hub/en/models-gated).
# Hugging Face access token; leave empty for models that are not gated.
# Avoid saving or sharing the notebook with a real token filled in.
HF_TOKEN = ""  # @param {type:"string"}

In [None]:
# @title **[Optional]** Setup dataset

# @markdown This section configures the dataset used for fine-tuning. **Note: If you don't fill any of the dataset options given below, then the dataset used will be the one defined in the Axolotl config file.** You have two options to configure the dataset:

# @markdown **1. Use a Hugging Face Dataset**
# @markdown   - Requires specifying the dataset name and type.

# @markdown **2. Load from Google Cloud Storage (GCS)**
# @markdown   - Requires specifying the bucket name, dataset type, file type, and paths to training/test splits.

# @markdown **Choose ONE of the following options:**

# @markdown ---
# @markdown **Option 1: Hugging Face**

# @markdown **Hugging Face Dataset Name:**
HF_DATASET = ""  # @param {type:"string", placeholder: "e.g. timdettmers/openassistant-guanaco"}
# @markdown **Set the dataset type:** Refer to [Axolotl config file](https://github.com/axolotl-ai-cloud/axolotl/blob/8fb72cbc0b94129141bae5fa4d84edd23b648af6/docs/config.qmd#L87) for more details.
HF_DATASET_TYPE = ""  # @param {type:"string", placeholder: "e.g. completion"}
if HF_DATASET:
    assert HF_DATASET_TYPE, "HF_DATASET_TYPE must be set if HF_DATASET is set."

# @markdown ---
# @markdown **Option 2: GCS**

# @markdown **Bucket Name:**
DATASET_BUCKET_NAME = ""  # @param {type:"string"}
# @markdown **Dataset Type:** Refer to the [Axolotl config file](https://github.com/axolotl-ai-cloud/axolotl/blob/8fb72cbc0b94129141bae5fa4d84edd23b648af6/docs/config.qmd#L181) for more details.
DATASET_TYPE = ""  # @param {type:"string"}
# @markdown **File Type**. Refer to the [Axolotl config file](https://github.com/axolotl-ai-cloud/axolotl/blob/8fb72cbc0b94129141bae5fa4d84edd23b648af6/docs/config.qmd#L178).
FILE_TYPE = ""  # @param {type:"string"}

# @markdown **Path to Training Data (relative to bucket):**
TRAIN_DATAFILES_PATH = ""  # @param {type:"string"}
# @markdown **[Optional] Path to Test Data (relative to bucket):**
# @markdown To use a dedicated validation set, provide the file path. Otherwise, the training data will be split to create a validation set.
TEST_DATAFILES_PATH = ""  # @param {type:"string"}

if DATASET_BUCKET_NAME:
    assert (
        TRAIN_DATAFILES_PATH
    ), "TRAIN_DATAFILES_PATH must be set if DATASET_BUCKET_NAME is set."
    assert DATASET_TYPE, "DATASET_TYPE must be set if DATASET_BUCKET_NAME is set."
    assert FILE_TYPE, "FILE_TYPE must be set if DATASET_BUCKET_NAME is set."

assert not (
    HF_DATASET and DATASET_BUCKET_NAME
), "Only one of HF_DATASET or DATASET_BUCKET_NAME can be set."

datasets = []
if DATASET_BUCKET_NAME:
    paths = TRAIN_DATAFILES_PATH.split(",")
    dataset = {
        "path": f"/gcs/{DATASET_BUCKET_NAME}/",
        "type": DATASET_TYPE,
        "data_files": [],
        "ds_type": FILE_TYPE,
    }
    for path in paths:
        if path.startswith("/"):
            path = path[1:]
        dataset["data_files"].append(f"/gcs/{DATASET_BUCKET_NAME}/{path}")
        dataset["split"] = "train"
    datasets.append(dataset)

test_datasets = []
if TEST_DATAFILES_PATH:
    paths = TEST_DATAFILES_PATH.split(",")
    dataset = {
        "path": f"/gcs/{DATASET_BUCKET_NAME}/",
        "type": DATASET_TYPE,
        "data_files": [],
        "ds_type": FILE_TYPE,
    }
    for path in paths:
        if path.startswith("/"):
            path = path[1:]
        dataset["data_files"].append(f"/gcs/{DATASET_BUCKET_NAME}/{path}")
        dataset["split"] = "train"
    test_datasets.append(dataset)

if HF_DATASET:
    datasets.append({"path": HF_DATASET, "type": HF_DATASET_TYPE})

In [None]:
# @title Setup Axolotl Flags
# @markdown This section configures additional Axolotl flags. You can explore different Axolotl flags in the [Axolotl config file](https://github.com/axolotl-ai-cloud/axolotl/blob/8fb72cbc0b94129141bae5fa4d84edd23b648af6/docs/config.qmd).

# @markdown **To avoid OOM, you can reduce sequence length.** This can be done by setting `sequence_len` flag to some smaller value. But reducing sequence length will also reduce the model performance.
# @markdown **Another alternative to avoid OOM is to use higher memory gpu.** It is recommended to use vertex ai training for Higher memory gpu like A100 and H100. Vertex AI training offers greater availability of high-end GPUs.

# @markdown **Training can take a long time (20+ hours) to complete depending on the model, dataset and Axolotl config.** You can reduce the training time by reducing the max training steps. This can be done by setting `max_steps` flag to some smaller value. Note that this will also reduce the model performance.

# Extra command-line flags for Axolotl, one flag per list entry.
axolotl_flag_overrides = ["--use-tensorboard=True"]  # @param {type:"raw"}
assert isinstance(
    axolotl_flag_overrides, list
), "axolotl_flag_overrides must be a list."
# The flags are later joined with spaces, which only works for strings.
assert all(
    isinstance(flag, str) for flag in axolotl_flag_overrides
), "Every entry in axolotl_flag_overrides must be a string."

### Finetune with Local Run

In [None]:
# @title Install Axolotl
# Start from a clean checkout and pin it to a fixed commit; this is the same
# commit hash that the example-config links elsewhere in this notebook use.
! rm -rf axolotl
! git clone https://github.com/axolotl-ai-cloud/axolotl.git
! cd axolotl && git reset --hard 8fb72cbc0b94129141bae5fa4d84edd23b648af6
# Build helpers are installed first because the editable install below runs
# with --no-build-isolation and therefore uses the current environment.
! pip3 install packaging ninja
! cd axolotl && pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'

# This is needed because of this issue: https://github.com/bitsandbytes-foundation/bitsandbytes/issues/1492
! pip3 install bitsandbytes==0.45.1

# gcsfuse is used by the local fine-tuning cell to mount GCS under /gcs.
# @title Install GCSFUSE
! apt-get install gcsfuse -y

In [None]:
# @title Run Local fine-tuning
# @markdown This section runs the Axolotl training locally (i.e. colab runtime).
# @markdown **Note: This section can take a long time to run. You can reduce the training time by reducing the max training steps as mentioned in `Setup Axolotl Flags` section.**
# @markdown Model trained using Axolotl will be saved in the GCS bucket with the help of GCSFUSE.

assert OUTPUT_GCS_URI, "OUTPUT_GCS_URI must be set for local fine-tuning."

# @markdown 1. Run GCSFUSE so that axolotl can store the training output in the GCS bucket.
! mkdir -p /gcs/
! gcsfuse /gcs

# @markdown 2. Run Axolotl training.
AXOLOTL_OUTPUT_GCS_URI = f"{OUTPUT_GCS_URI}/axolotl_output"
AXOLOTL_OUTPUT_DIR = common_util.gcs_fuse_path(AXOLOTL_OUTPUT_GCS_URI)

axolotl_args = f" --output-dir={AXOLOTL_OUTPUT_DIR}"
if len(datasets) > 0:
    axolotl_args += f' --datasets="{datasets}"'
if len(test_datasets) > 0:
    axolotl_args += f' --test-datasets="{test_datasets}"'
    axolotl_args += " --val-set-size=0"
additional_flags = " ".join(axolotl_flag_overrides)
axolotl_args += f" {additional_flags}"
! accelerate launch -m axolotl.cli.train $axolotl_args /content/axolotl/$AXOLOTL_CONFIG_PATH

# @markdown 3. Check the output in the bucket.
! gsutil ls $AXOLOTL_OUTPUT_GCS_URI

In [None]:
# @title Run Local inference
# @markdown This section performs inference using the finetuned model.

# @markdown 1. Copy the finetuned model from GCS to local.
! mkdir -p /tmp/axolotl_output
! gsutil -m cp -r $AXOLOTL_OUTPUT_GCS_URI/* /tmp/axolotl_output/

# @markdown 2. Run Axolotl inference using gradio on local finetuned model.
! cd axolotl && axolotl inference  examples/tiny-llama/lora.yml --output-dir=/tmp/axolotl_output/ --gradio

# @markdown 3. After running the cell, a public URL (["https://*.gradio.live"](#)) will appear in the cell output. The playground is available in a separate browser tab when you click the URL.

In [None]:
# @markdown This section merges the finetuned adapter with the base model.
# @markdown **Note: This is only needed for lora and qlora. In case of full finetuning you can skip this cell.**

if (
    "adapter" in axolotl_config
    and axolotl_config["adapter"] != "lora"
    and axolotl_config["adapter"] != "qlora"
):
    raise ValueError("This cell is only needed for lora and qlora.")

# @markdown 1. Copy the finetuned model from GCS to local.
! mkdir -p /tmp/axolotl_output
! gsutil -m cp -r $AXOLOTL_OUTPUT_GCS_URI/* /tmp/axolotl_output/

# @markdown 2. Run Axolotl merge.
! cd axolotl && python3 -m axolotl.cli.merge_lora $AXOLOTL_CONFIG_PATH --output-dir=/tmp/axolotl_output/

# @markdown 3. Copy the merged model to GCS.
! gsutil -m cp -r /tmp/axolotl_output/merged /* $AXOLOTL_OUTPUT_GCS_URI/merged/

### Finetune with Vertex AI Training

In [None]:
# @title Vertex AI fine-tuning job
# @markdown This section runs the Axolotl training using Vertex AI training job.
# @markdown **Note: This section can take a long time to run. You can reduce the training time by reducing the max training steps as mentioned in `Setup Axolotl Flags` section.**
# @markdown Refer to [Axolotl config](https://axolotl-ai-cloud.github.io/axolotl/docs/config.html) to override additional Axolotl flags.

from google.cloud.aiplatform.compat.types import \
    custom_job as gca_custom_job_compat

# @markdown Accelerator type to use for training.
training_accelerator_type = "NVIDIA_L4"  # @param ["NVIDIA_L4", "NVIDIA_TESLA_A100", "NVIDIA_H100_80GB"]


replica_count = 1
repo = "us-docker.pkg.dev/vertex-ai"

# Only the H100 option uses 8 accelerators per node and a 2000 GB boot disk;
# everything else uses 1 accelerator and 500 GB.
uses_h100 = training_accelerator_type == "NVIDIA_H100_80GB"
per_node_accelerator_count = 8 if uses_h100 else 1
boot_disk_size_gb = 2000 if uses_h100 else 500

# Every option except L4 is submitted with the FLEX_START scheduling strategy.
flex_start_kwargs = {
    "max_wait_duration": 1800,  # 30 minutes
    "scheduling_strategy": gca_custom_job_compat.Scheduling.Strategy.FLEX_START,
}
is_dynamic_workload_scheduler = training_accelerator_type != "NVIDIA_L4"
dws_kwargs = flex_start_kwargs if is_dynamic_workload_scheduler else {}

# Machine type paired with each selectable accelerator.
machine_type_by_accelerator = {
    "NVIDIA_L4": "g2-standard-8",
    "NVIDIA_TESLA_A100": "a2-highgpu-1g",
    "NVIDIA_H100_80GB": "a3-highgpu-8g",
}
if training_accelerator_type not in machine_type_by_accelerator:
    raise ValueError(f"Unsupported accelerator type: {training_accelerator_type}")
training_machine_type = machine_type_by_accelerator[training_accelerator_type]

TRAIN_DOCKER_URI = (
    f"{repo}/vertex-vision-model-garden-dockers/axolotl-train:20250225-1800-rc0"
)

# Quota is checked for the total accelerator count across all replicas.
common_util.check_quota(
    project_id=PROJECT_ID,
    region=REGION,
    accelerator_type=training_accelerator_type,
    accelerator_count=per_node_accelerator_count * replica_count,
    is_for_training=True,
    is_restricted_image=False,
    is_dynamic_workload_scheduler=is_dynamic_workload_scheduler,
)

# @markdown Run Vertex AI job.

# Copy the config file to the bucket.
if AXOLOTL_SOURCE == "LOCAL":
    ! gsutil -m cp $AXOLOTL_CONFIG_PATH $MODEL_BUCKET/config/
    AXOLOTL_CONFIG_PATH = f"{common_util.gcs_fuse_path(MODEL_BUCKET)}/config/{pathlib.Path(AXOLOTL_CONFIG_PATH).name}"

# Set axolotl flags.
def _build_gcs_dataset(data_files_csv: str) -> dict:
    """Builds one Axolotl dataset entry for files stored in the dataset bucket.

    Args:
        data_files_csv: Comma-separated file paths relative to the bucket root.
            Whitespace around each entry and leading slashes are ignored.

    Returns:
        A dataset dict whose `data_files` are `/gcs/<DATASET_BUCKET_NAME>/...`
        paths. Key order is kept stable because the dict is later rendered with
        `str()` into a command-line flag.
    """
    bucket_root = f"/gcs/{DATASET_BUCKET_NAME}/"
    data_files = [
        f"{bucket_root}{entry.strip().lstrip('/')}"
        for entry in data_files_csv.split(",")
    ]
    return {
        "path": bucket_root,
        "type": DATASET_TYPE,
        "data_files": data_files,
        "ds_type": FILE_TYPE,
        "split": "train",
    }


# Training data from the dataset bucket (optional).
datasets = []
if DATASET_BUCKET_NAME:
    datasets.append(_build_gcs_dataset(TRAIN_DATAFILES_PATH))

# Evaluation data from the dataset bucket (optional).
test_datasets = []
if TEST_DATAFILES_PATH:
    test_datasets.append(_build_gcs_dataset(TEST_DATAFILES_PATH))

# Hugging Face dataset (optional), appended after any bucket-based training data.
if HF_DATASET:
    datasets.append({"path": HF_DATASET, "type": HF_DATASET_TYPE})

# Fall back to the model bucket when no output location was given.
OUTPUT_GCS_URI = OUTPUT_GCS_URI or MODEL_BUCKET
AXOLOTL_OUTPUT_GCS_URI = f"{OUTPUT_GCS_URI}/axolotl_output"
AXOLOTL_OUTPUT_DIR = common_util.gcs_fuse_path(AXOLOTL_OUTPUT_GCS_URI)
TRAINING_JOB_OUTPUT_DIR = f"{AXOLOTL_OUTPUT_GCS_URI}/training_job_output"


# Flags that override values from the Axolotl config file. User-supplied
# overrides come last.
axolotl_config_overwrites = [f"--output_dir={AXOLOTL_OUTPUT_DIR}"]
if datasets:
    axolotl_config_overwrites.append(f'--datasets="{datasets}"')
if test_datasets:
    # An explicit test set is paired with a zero validation split size.
    axolotl_config_overwrites.extend(
        [f'--test_datasets="{test_datasets}"', "--val_set_size=0"]
    )
axolotl_config_overwrites.extend(axolotl_flag_overrides)

# Full argument list for the training container: config path first, then overrides.
train_job_args = [
    f"--axolotl_config_path={AXOLOTL_CONFIG_PATH}",
    *axolotl_config_overwrites,
]

# Environment variables for the training job; the Hugging Face token is included
# only when one is set.
train_job_envs = {"HF_TOKEN": HF_TOKEN} if HF_TOKEN else {}

job_name = common_util.get_job_name_with_datetime("axolotl-train")

# Label inputs: the second path component of the config path, plus the
# "<publisher>/<model>" parts of the configured base model.
model_name = AXOLOTL_CONFIG_PATH.split("/")[1]
base_model_parts = axolotl_config["base_model"].split("/")
publisher = base_model_parts[0]
model_id = base_model_parts[1].replace(".", "-")

mg_tune_label = f"publishers-{publisher}-models-{model_name}".lower()

# Add labels for the finetuning job.
labels = {
    "mg-source": "notebook",
    "mg-notebook-name": "model_garden_axolotl_finetuning.ipynb".partition(".")[0],
    "mg-tune": mg_tune_label,
    "versioned-mg-tune": f"{mg_tune_label}-{model_id}".lower(),
}


# Pass training arguments and launch job.
train_job = aiplatform.CustomContainerTrainingJob(
    display_name=job_name,
    container_uri=TRAIN_DOCKER_URI,
    labels=labels,
)

# Only the arguments are printed; environment variables may contain the HF token.
print("Running training job with args:")
print(" \\\n".join(train_job_args))
train_job.run(
    args=train_job_args,
    # Forward the collected environment variables (e.g. HF_TOKEN) to the training
    # container; without this the token never reaches the job.
    environment_variables=train_job_envs or None,
    replica_count=replica_count,
    machine_type=training_machine_type,
    accelerator_type=training_accelerator_type,
    accelerator_count=per_node_accelerator_count,
    boot_disk_size_gb=boot_disk_size_gb,
    service_account=SERVICE_ACCOUNT,
    base_output_dir=TRAINING_JOB_OUTPUT_DIR,
    sync=False,  # Non-blocking call to run.
    **dws_kwargs,
)

# Wait until resource has been created.
train_job.wait_for_resource_creation()

### Run TensorBoard

In [None]:
# The command below is meant to be pasted into Cloud Shell, so it must use the
# `gs://` output URI. AXOLOTL_OUTPUT_DIR is the FUSE-style path derived from this
# URI for the training job and is not expected to exist in Cloud Shell.
base_output_dir = AXOLOTL_OUTPUT_GCS_URI

# @markdown This section shows how to launch TensorBoard in a [Cloud Shell](https://cloud.google.com/shell/docs).
# @markdown 1. Click the Cloud Shell icon(![terminal](https://github.com/google/material-design-icons/blob/master/png/action/terminal/materialicons/24dp/1x/baseline_terminal_black_24dp.png?raw=true)) on the top right to open the Cloud Shell.
# @markdown 2. Copy the `tensorboard` command shown below by running this cell.
# @markdown 3. Paste and run the command in the Cloud Shell to launch TensorBoard.
# @markdown 4. Once the command runs (You may have to click `Authorize` if prompted), click the link starting with `http://localhost`.

# @markdown Note: You may need to wait around 10 minutes after the job starts in order for the TensorBoard logs to be written to the GCS bucket.
print(f"Command to copy: tensorboard --logdir {base_output_dir}/logs")

## Deploy using vLLM

In [None]:
# @markdown 1. Wait for the training job to finish.
# A job with no end time is treated as still running, so block until it completes.
if train_job and train_job.end_time is None:
    print("Waiting for the training job to finish...")
    train_job.wait()
    print("The training job has finished.")

# @markdown 2. Set up VLLM docker URI and model gcs uri.

VLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20241001_0916_RC00"

# Adapter runs (lora/qlora) are served from the `merged` sub-directory of the
# training output; any other run is served from the output root.
uses_lora_adapter = "adapter" in axolotl_config and axolotl_config["adapter"] in (
    "lora",
    "qlora",
)
VLLM_MODEL_GCS_URI = (
    f"{AXOLOTL_OUTPUT_GCS_URI}/merged" if uses_lora_adapter else AXOLOTL_OUTPUT_GCS_URI
)

### Create model endpoint

In [None]:
# @markdown This section uploads the model to Model Registry and deploys it on the Endpoint. It takes 15 minutes to 1 hour to finish.
# @markdown 1. Set the machine type and accelerator type.
# @markdown Find Vertex AI prediction supported accelerators and regions [here](https://cloud.google.com/vertex-ai/docs/predictions/configure-compute).
# Serving hardware, user-editable through the Colab form.
# NOTE: `per_node_accelerator_count` reuses (and overwrites) the name set by the
# training cell; from here on it holds the serving accelerator count.
machine_type = "g2-standard-12"  # @param {type:"string"}
accelerator_type = "NVIDIA_L4"  # @param {type:"string"}
per_node_accelerator_count = 1  # @param {type:"integer"}

# Quota check for the serving accelerators (is_for_training=False).
# NOTE(review): presumably raises when quota is insufficient; confirm in common_util.
common_util.check_quota(
    project_id=PROJECT_ID,
    region=REGION,
    accelerator_type=accelerator_type,
    accelerator_count=per_node_accelerator_count,
    is_for_training=False,
)

# @markdown Set `use_dedicated_endpoint` to False if you don't want to use [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint).
use_dedicated_endpoint = True  # @param {type:"boolean"}

# Passed to deploy_model_vllm below, where they become the vLLM
# `--gpu-memory-utilization` and `--max-model-len` arguments.
gpu_memory_utilization = 0.95
max_model_len = 2048


def deploy_model_vllm(
    model_name: str,
    model_id: str,
    publisher: str,
    publisher_model_id: str,
    service_account: str,
    base_model_id: str = None,
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    gpu_memory_utilization: float = 0.9,
    max_model_len: int = 4096,
    dtype: str = "auto",
    enable_trust_remote_code: bool = False,
    enforce_eager: bool = False,
    enable_lora: bool = False,
    enable_chunked_prefill: bool = False,
    enable_prefix_cache: bool = False,
    host_prefix_kv_cache_utilization_target: float = 0.0,
    max_loras: int = 1,
    max_cpu_loras: int = 8,
    use_dedicated_endpoint: bool = False,
    max_num_seqs: int = 256,
    model_type: str = None,
    enable_llama_tool_parser: bool = False,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a model served by vLLM and deploys it to a new Vertex AI endpoint.

    An endpoint is created first. The model is then uploaded with a vLLM
    serving-container command line built from the arguments, and finally
    deployed onto that endpoint.

    Args:
        model_name: Display name of the uploaded model; the endpoint is named
            `<model_name>-endpoint`.
        model_id: Value passed to vLLM as `--model`.
        publisher: Publisher part of the Model Garden source model name.
        publisher_model_id: Model part of the Model Garden source model name.
        service_account: Service account used for the deployment.
        base_model_id: Value of the `MODEL_ID` container environment variable;
            falls back to `model_id` when empty.
        machine_type: Machine type used for the deployment.
        accelerator_type: Accelerator type used for the deployment.
        accelerator_count: Accelerator count for the deployment; also used as
            vLLM's `--tensor-parallel-size`.
        gpu_memory_utilization: Value for `--gpu-memory-utilization`.
        max_model_len: Value for `--max-model-len`.
        dtype: Value for `--dtype`.
        enable_trust_remote_code: Adds `--trust-remote-code` when true.
        enforce_eager: Adds `--enforce-eager` when true.
        enable_lora: Adds `--enable-lora` when true.
        enable_chunked_prefill: Adds `--enable-chunked-prefill` when true.
        enable_prefix_cache: Adds `--enable-prefix-caching` when true.
        host_prefix_kv_cache_utilization_target: Added as
            `--host-prefix-kv-cache-utilization-target` only when strictly
            between 0 and 1.
        max_loras: Value for `--max-loras`.
        max_cpu_loras: Value for `--max-cpu-loras`.
        use_dedicated_endpoint: Whether the endpoint is created as a dedicated
            endpoint.
        max_num_seqs: Value for `--max-num-seqs`.
        model_type: Added as `--model-type` when set.
        enable_llama_tool_parser: Adds `--enable-auto-tool-choice` and
            `--tool-call-parser=vertex-llama-3` when true.

    Returns:
        The uploaded model and the endpoint it was deployed to.
    """
    endpoint = aiplatform.Endpoint.create(
        display_name=f"{model_name}-endpoint",
        dedicated_endpoint_enabled=use_dedicated_endpoint,
    )

    base_model_id = base_model_id or model_id

    # See https://docs.vllm.ai/en/latest/models/engine_args.html for a list of possible arguments with descriptions.
    vllm_args = [
        "python",
        "-m",
        "vllm.entrypoints.api_server",
        "--host=0.0.0.0",
        "--port=8080",
        f"--model={model_id}",
        f"--tensor-parallel-size={accelerator_count}",
        "--swap-space=16",
        f"--gpu-memory-utilization={gpu_memory_utilization}",
        f"--max-model-len={max_model_len}",
        f"--dtype={dtype}",
        f"--max-loras={max_loras}",
        f"--max-cpu-loras={max_cpu_loras}",
        f"--max-num-seqs={max_num_seqs}",
        "--disable-log-stats",
    ]

    # Optional flags as (condition, flags) pairs, appended in this order.
    optional_flags = [
        (enable_trust_remote_code, ["--trust-remote-code"]),
        (enforce_eager, ["--enforce-eager"]),
        (enable_lora, ["--enable-lora"]),
        (enable_chunked_prefill, ["--enable-chunked-prefill"]),
        (enable_prefix_cache, ["--enable-prefix-caching"]),
        (
            0 < host_prefix_kv_cache_utilization_target < 1,
            [
                f"--host-prefix-kv-cache-utilization-target={host_prefix_kv_cache_utilization_target}"
            ],
        ),
        (model_type, [f"--model-type={model_type}"]),
        (
            enable_llama_tool_parser,
            ["--enable-auto-tool-choice", "--tool-call-parser=vertex-llama-3"],
        ),
    ]
    for is_enabled, flags in optional_flags:
        if is_enabled:
            vllm_args.extend(flags)

    env_vars = {
        "MODEL_ID": base_model_id,
        "DEPLOY_SOURCE": "notebook",
    }

    # HF_TOKEN is not a compulsory field and may not be defined.
    try:
        if HF_TOKEN:
            env_vars["HF_TOKEN"] = HF_TOKEN
    except NameError:
        pass

    source_model_name = f"publishers/{publisher}/models/{publisher_model_id}"
    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=VLLM_DOCKER_URI,
        serving_container_args=vllm_args,
        serving_container_ports=[8080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables=env_vars,
        serving_container_shared_memory_size_mb=(16 * 1024),  # 16 GB
        serving_container_deployment_timeout=7200,
        model_garden_source_model_name=source_model_name,
    )
    print(
        f"Deploying {model_name} on {machine_type} with {accelerator_count} {accelerator_type} GPU(s)."
    )
    system_labels = {
        "NOTEBOOK_NAME": "model_garden_axolotl_finetuning.ipynb",
        "NOTEBOOK_ENVIRONMENT": common_util.get_deploy_source(),
    }
    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
        system_labels=system_labels,
    )
    print("endpoint_name:", endpoint.name)

    return model, endpoint


# Deploy the weights at VLLM_MODEL_GCS_URI with the serving settings chosen above,
# then record the model and endpoint under the "vllm_gpu" key.
vllm_model, vllm_endpoint = deploy_model_vllm(
    model_name=common_util.get_job_name_with_datetime(prefix="axolotl-vllm-serve"),
    model_id=VLLM_MODEL_GCS_URI,
    publisher=publisher,
    publisher_model_id=model_id,
    service_account=SERVICE_ACCOUNT,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=per_node_accelerator_count,
    gpu_memory_utilization=gpu_memory_utilization,
    max_model_len=max_model_len,
    enable_lora=True,
    use_dedicated_endpoint=use_dedicated_endpoint,
)
models["vllm_gpu"] = vllm_model
endpoints["vllm_gpu"] = vllm_endpoint

### Perform Prediction

In [None]:
def predict_vllm(
    prompt: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
    raw_response: bool,
    lora_weight: str = "",
):
    """Sends one prompt to the `vllm_gpu` endpoint and prints every prediction.

    Args:
        prompt: Text prompt to send.
        max_tokens: `max_tokens` field of the request.
        temperature: `temperature` field of the request.
        top_p: `top_p` field of the request.
        top_k: `top_k` field of the request.
        raw_response: `raw_response` field of the request.
        lora_weight: When non-empty, sent as the `dynamic-lora` field.
    """
    # Parameters for inference.
    request = dict(
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        raw_response=raw_response,
    )
    if lora_weight:
        request["dynamic-lora"] = lora_weight

    response = endpoints["vllm_gpu"].predict(
        instances=[request], use_dedicated_endpoint=use_dedicated_endpoint
    )

    for prediction in response.predictions:
        print(prediction)


# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts. Sampling parameters supported by vLLM can be found [here](https://docs.vllm.ai/en/latest/dev/sampling_params.html).

# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoint.name` allows us to get the
#   endpoint name of the endpoint `endpoint` created in the cell
#   above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint.

# endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoints["vllm_gpu"] = aiplatform.Endpoint(aip_endpoint_name)

prompt = "Write a function to list n Fibonacci numbers in Python."  # @param {type: "string"}
max_tokens = 500  # @param {type:"integer"}
temperature = 1.0  # @param {type:"number"}
top_p = 1.0  # @param {type:"number"}
top_k = 1  # @param {type:"integer"}
# @markdown Set `raw_response` to `True` to obtain the raw model output. Set `raw_response` to `False` to apply additional formatting in the structure of `"Prompt:\n{prompt.strip()}\nOutput:\n{output}"`.
raw_response = True  # @param {type:"boolean"}

# Single request built from the form values above.
request = dict(
    prompt=prompt,
    max_tokens=max_tokens,
    temperature=temperature,
    top_p=top_p,
    top_k=top_k,
    raw_response=raw_response,
)
instances = [request]
response = endpoints["vllm_gpu"].predict(
    instances=instances, use_dedicated_endpoint=use_dedicated_endpoint
)

# "<|file_separator|>" is the end of the file token.
# Print only the text that precedes the first occurrence of that token.
for prediction in response.predictions:
    print(prediction.partition("<|file_separator|>")[0])

## Clean up resources

In [None]:
# @markdown Delete the training job.

if train_job:
    train_job.delete()

# @markdown  Delete the experiment models and endpoints to recycle the resources
# @markdown  and avoid unnecessary continuous charges that may incur.

# Undeploy model and delete endpoint.
for endpoint in endpoints.values():
    endpoint.delete(force=True)

# Delete models.
for model in models.values():
    model.delete()

delete_bucket = False  # @param {type:"boolean"}
if delete_bucket:
    ! gsutil -m rm -r $BUCKET_NAME