In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Code LLaMA

<table align="left">
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_pytorch_codellama.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_codellama.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
</table>

## Overview

This notebook showcases deploying pretrained Code LLaMA models using a HuggingFace transformers based serving container and using [vLLM](https://github.com/vllm-project/vllm). This notebook also demonstrates how to evaluate the Code LLaMA models using EleutherAI's Language Model Evaluation Harness (lm-evaluation-harness) with Vertex CustomJob.

### Objective

- Deploy pre-trained Code LLaMA models using a standard HuggingFace serving solution
- Deploy pre-trained Code LLaMA models with [vLLM](https://github.com/vllm-project/vllm) for higher serving throughput

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Run the notebook

In [None]:
# @title Setup Google Cloud project

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 2. [Optional] [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs. Set the BUCKET_URI for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the same region as where the notebook was launched. Note that a multi-region bucket (eg. "us") is not considered a match for a single region covered by the multi-region range (eg. "us-central1"). If not set, a unique GCS bucket will be created instead.

import os
import sys
from datetime import datetime

from google.cloud import aiplatform

# Get the default cloud project id.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# Get the default region for launching jobs.
REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Enable the Vertex AI API and Compute Engine API, if not already.
print("Enabling Vertex AI API and Compute Engine API.")
! gcloud services enable aiplatform.googleapis.com compute.googleapis.com

# Cloud Storage bucket for storing the experiment artifacts.
# A unique GCS bucket will be created for the purpose of this notebook. If you
# prefer using your own GCS bucket, please change the value yourself below.
now = datetime.now().strftime("%Y%m%d%H%M%S")
BUCKET_URI = "gs://"  # @param {type:"string"}
assert BUCKET_URI.startswith("gs://"), "BUCKET_URI must start with `gs://`."

# Gets the default BUCKET_URI and SERVICE_ACCOUNT if they were not specified by the user.
SERVICE_ACCOUNT = None
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"

print("Using this default Service Account:", SERVICE_ACCOUNT)

# Create a unique GCS bucket for this notebook, if not specified by the user.
if BUCKET_URI is None or BUCKET_URI.strip() == "" or BUCKET_URI == "gs://":
    BUCKET_URI = f"gs://{PROJECT_ID}-tmp-{now}"
    ! gsutil mb -l {REGION} {BUCKET_URI}
else:
    shell_output = ! gsutil ls -Lb {BUCKET_URI} | grep "Location constraint:" | sed "s/Location constraint://"
    bucket_region = shell_output[0].strip().lower()
    if bucket_region != REGION:
        raise ValueError(
            "Bucket region %s is different from notebook region %s"
            % (bucket_region, REGION)
        )

print(f"Using this GCS Bucket: {BUCKET_URI}")

# Provision permissions to the SERVICE_ACCOUNT with the GCS bucket
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.admin $BUCKET_URI


! gcloud config set project $PROJECT_ID
! gcloud services enable language.googleapis.com

STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")
MODEL_BUCKET = os.path.join(STAGING_BUCKET, "model")
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user(project_id=PROJECT_ID)


def get_job_name_with_datetime(prefix: str) -> str:
    """Returns `prefix` followed by the current local time.

    The appended suffix has the form `_YYYYMMDD_HHMMSS`; it is used to tell
    apart training or deployment jobs triggered in Vertex AI at different
    times.
    """
    timestamp_suffix = datetime.now().strftime("_%Y%m%d_%H%M%S")
    return prefix + timestamp_suffix


# The pre-built serving docker images.
# Serving container image that `deploy_model` passes to `aiplatform.Model.upload`.
PREDICTION_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-peft-serve:20231026_1907_RC00"
# Serving container image that `deploy_model_vllm` passes to `aiplatform.Model.upload`.
VLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20231002_0916_RC00"
# NOTE(review): not referenced by any cell visible in this notebook; presumably
# intended for the lm-evaluation-harness evaluation mentioned in the overview — confirm.
EVAL_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-lm-evaluation-harness:20231011_0934_RC00"


def deploy_model(
    model_name: str,
    model_id: str,
    finetuned_lora_model_path: str,
    service_account: str,
    task: str,
    precision_loading_mode: str = "float16",
    machine_type: str = "n1-standard-8",
    accelerator_type: str = "NVIDIA_TESLA_V100",
    accelerator_count: int = 1,
) -> tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a model served by the PEFT container and deploys it.

    Args:
        model_name: Display name of the uploaded model; the endpoint is named
            `{model_name}-endpoint`.
        model_id: Value of the `MODEL_ID` environment variable given to the
            serving container.
        finetuned_lora_model_path: Value of `FINETUNED_LORA_MODEL_PATH` for
            the serving container. The variable is omitted when this is empty.
        service_account: Service account used for the deployment.
        task: Value of the `TASK` environment variable.
        precision_loading_mode: Value of `PRECISION_LOADING_MODE`.
        machine_type: Machine type of the deployed model.
        accelerator_type: Accelerator type attached to the machine.
        accelerator_count: Number of accelerators attached to the machine.

    Returns:
        The uploaded model and the endpoint it was deployed to.
    """
    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")

    # Environment consumed by the serving container.
    container_env = dict(
        MODEL_ID=model_id,
        PRECISION_LOADING_MODE=precision_loading_mode,
        TASK=task,
    )
    if finetuned_lora_model_path:
        container_env.update(FINETUNED_LORA_MODEL_PATH=finetuned_lora_model_path)

    upload_options = {
        "display_name": model_name,
        "serving_container_image_uri": PREDICTION_DOCKER_URI,
        "serving_container_ports": [7080],
        "serving_container_predict_route": "/predictions/peft_serving",
        "serving_container_health_route": "/ping",
        "serving_container_environment_variables": container_env,
    }
    model = aiplatform.Model.upload(**upload_options)

    deploy_options = {
        "endpoint": endpoint,
        "machine_type": machine_type,
        "accelerator_type": accelerator_type,
        "accelerator_count": accelerator_count,
        "deploy_request_timeout": 1800,
        "service_account": service_account,
    }
    model.deploy(**deploy_options)
    return model, endpoint


def deploy_model_vllm(
    model_name: str,
    model_id: str,
    service_account: str,
    precision: str = "float16",
    swap_space: int = 16,
    machine_type: str = "n1-standard-8",
    accelerator_type: str = "NVIDIA_TESLA_V100",
    accelerator_count: int = 1,
) -> tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a model served by the vLLM container and deploys it.

    Args:
        model_name: Display name of the uploaded model; the endpoint is named
            `{model_name}-endpoint`.
        model_id: Passed to the vLLM API server as `--model`.
        service_account: Service account used for the deployment.
        precision: Passed to the vLLM API server as `--dtype`.
        swap_space: Passed to the vLLM API server as `--swap-space`.
        machine_type: Machine type of the deployed model.
        accelerator_type: Accelerator type attached to the machine.
        accelerator_count: Number of accelerators attached to the machine;
            also passed as `--tensor-parallel-size`.

    Returns:
        The uploaded model and the endpoint it was deployed to.
    """
    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")

    # Command-line flags for `vllm.entrypoints.api_server`.
    server_flags = []
    server_flags.append("--host=0.0.0.0")
    server_flags.append("--port=7080")
    server_flags.append(f"--model={model_id}")
    server_flags.append(f"--tensor-parallel-size={accelerator_count}")
    server_flags.append(f"--swap-space={swap_space}")
    server_flags.append("--gpu-memory-utilization=0.9")
    server_flags.append("--max-num-batched-tokens=16385")
    server_flags.append(f"--dtype={precision}")
    server_flags.append("--disable-log-stats")

    # The container environment carries a fixed identifier, not `model_id`.
    container_env = dict(MODEL_ID="meta/code-llama@001")

    upload_options = {
        "display_name": model_name,
        "serving_container_image_uri": VLLM_DOCKER_URI,
        "serving_container_command": ["python", "-m", "vllm.entrypoints.api_server"],
        "serving_container_args": server_flags,
        "serving_container_ports": [7080],
        "serving_container_predict_route": "/generate",
        "serving_container_health_route": "/ping",
        "serving_container_environment_variables": container_env,
    }
    model = aiplatform.Model.upload(**upload_options)

    deploy_options = {
        "endpoint": endpoint,
        "machine_type": machine_type,
        "accelerator_type": accelerator_type,
        "accelerator_count": accelerator_count,
        "deploy_request_timeout": 1800,
        "service_account": service_account,
    }
    model.deploy(**deploy_options)
    return model, endpoint

In [None]:
# @title Access pretrained Code LLaMA models

# @markdown The original models from Meta are converted into the HuggingFace format for serving in Vertex AI.

# @markdown Accept the model agreement to access the models:
# @markdown 1. Open the [Code LLaMA model card](https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/137).
# @markdown 2. Review and accept the agreement in the pop-up window on the model card page. If you have previously accepted the model agreement, there will not be a pop-up window on the model card page and this step is not needed.
# @markdown 3. A Cloud Storage bucket (starting with ‘gs://’) containing Code LLaMA pretrained and finetuned models will be shared under the “Documentation” section and its “Get started” subsection.

# This path will be shared once click the agreement in Code LLaMA model card
# as described in the `Access pretrained Code LLaMA models` section.
VERTEX_AI_MODEL_GARDEN_CODE_LLAMA = "gs://"  # @param {type: "string"}
assert (
    VERTEX_AI_MODEL_GARDEN_CODE_LLAMA
), "Please click the agreement of Code LLaMA in Vertex AI Model Garden, and get the GCS path of Code LLaMA model artifacts."
print(
    "Copying Code LLaMA model artifacts from",
    VERTEX_AI_MODEL_GARDEN_CODE_LLAMA,
    "to ",
    MODEL_BUCKET,
)
! gsutil -m cp -R $VERTEX_AI_MODEL_GARDEN_CODE_LLAMA/* $MODEL_BUCKET

In [None]:
# @title Deploy pretrained Code LLaMA (PEFT)
# @markdown This section deploys prebuilt Code LLaMA models on Vertex AI. V100 GPUs are used for demonstration. Note that V100 serving generally offers better throughput and latency performance than L4 serving, while L4 serving is generally more cost efficient than V100 serving. The serving efficiency of V100 and L4 GPUs is inferior to that of A100 GPUs, but V100 and L4 GPUs are nevertheless good serving solutions if you do not have A100 quota.

# @markdown We use the PEFT serving image to deploy prebuilt Code LLaMA models by leaving the fine-tuned LoRA model path empty. The model deployment step will take 15 minutes to 1 hour to complete, depending on the model size.

# @markdown Set the model name.
model_name = "CodeLlama-7b-Instruct-hf"  # @param ["CodeLlama-7b-hf", "CodeLlama-7b-Python-hf", "CodeLlama-7b-Instruct-hf", "CodeLlama-13b-hf", "CodeLlama-13b-Python-hf", "CodeLlama-13b-Instruct-hf", "CodeLlama-34b-hf", "CodeLlama-34b-Python-hf", "CodeLlama-34b-Instruct-hf", "CodeLlama-70b-hf", "CodeLlama-70b-Python-hf", "CodeLlama-70b-Instruct-hf"]
model_id = os.path.join(MODEL_BUCKET, model_name)
print(model_id)

# @markdown Find Vertex AI prediction supported accelerators and regions at https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.

accelerator_type = "NVIDIA_TESLA_V100"  # @param ["NVIDIA_L4", "NVIDIA_TESLA_V100", "NVIDIA_TESLA_A100"]

# Recommended (machine_type, accelerator_count) for each model size and
# accelerator type. Sizes are matched as substrings of `model_name`, in the
# order listed. Combinations that are absent (e.g. 70B on V100) have no
# recommended setting.
PEFT_SERVING_CONFIGS = {
    "7b": {
        "NVIDIA_TESLA_A100": ("a2-highgpu-1g", 1),  # 1 A100 (40G)
        "NVIDIA_TESLA_V100": ("n1-standard-8", 1),  # 1 V100 (16G)
        "NVIDIA_L4": ("g2-standard-8", 1),  # 1 L4 (24G)
    },
    "13b": {
        "NVIDIA_TESLA_A100": ("a2-highgpu-1g", 1),  # 1 A100 (40G)
        "NVIDIA_TESLA_V100": ("n1-standard-16", 2),  # 2 V100 (16G)
        "NVIDIA_L4": ("g2-standard-24", 2),  # 2 L4 (24G)
    },
    "34b": {
        "NVIDIA_TESLA_A100": ("a2-highgpu-2g", 2),  # 2 A100 (40G)
        "NVIDIA_TESLA_V100": ("n1-standard-32", 8),  # 8 V100 (16G)
        "NVIDIA_L4": ("g2-standard-48", 4),  # 4 L4 (24G)
    },
    "70b": {
        "NVIDIA_TESLA_A100": ("a2-highgpu-4g", 4),  # 4 A100 (40G)
        "NVIDIA_L4": ("g2-standard-96", 8),  # 8 L4 (24G)
    },
}

# Pick the first size tag contained in the model name, then the entry for the
# chosen accelerator. An unknown size or an unsupported accelerator both end
# in the same explicit error instead of leaving `machine_type` undefined.
peft_model_size = next(
    (size for size in PEFT_SERVING_CONFIGS if size in model_name), None
)
peft_serving_config = PEFT_SERVING_CONFIGS.get(peft_model_size, {}).get(
    accelerator_type
)
if peft_serving_config is None:
    raise ValueError(
        f"Recommended GPU setting not found for: {accelerator_type} and {model_name}."
    )
machine_type, accelerator_count = peft_serving_config

precision_loading_mode = "float16"

# Arguments for the PEFT deployment. The LoRA path is left empty so that the
# base model is served without any fine-tuned adapter.
peft_deploy_kwargs = dict(
    model_name=get_job_name_with_datetime(prefix="code-llama-serve-peft"),
    model_id=model_id,
    finetuned_lora_model_path="",
    service_account=SERVICE_ACCOUNT,
    task="causal-language-modeling-lora",
    precision_loading_mode=precision_loading_mode,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
)
model, endpoint = deploy_model(**peft_deploy_kwargs)
print(f"Endpoint name: {endpoint.name}")

# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts.

# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoint.name` allows us to get the endpoint name of
#   the endpoint `endpoint` created in the cell above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint.

# endpoint_name = endpoint.name
# # endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoint = aiplatform.Endpoint(aip_endpoint_name)

# One completion request: the prompt plus the sampling parameters sent with it.
peft_request = {
    "prompt": "import argparse",
    "max_tokens": 200,
    "temperature": 1.0,
    "top_p": 1.0,
    "top_k": 10,
}
instances = [peft_request]
response = endpoint.predict(instances=instances)
# Show the prediction returned for the single request above.
print(response.predictions[0])

In [None]:
# @title Deploy pretrained Code LLaMA (vLLM)
# @markdown This section deploys prebuilt Code LLaMA models with [vLLM](https://github.com/vllm-project/vllm) on the Endpoint. Code LLaMA model weights are stored in bfloat16 precision. L4 or A100 GPUs are needed for vLLM serving at bfloat16 precision. V100 GPUs can be used with vLLM serving at float16 precision. Changing the precision from bfloat16 to float16 can change model performance, either as an increase or as a decrease. However, the performance change should be small (within 5%).

# @markdown V100 GPUs are used for demonstration. Note that V100 serving generally offers better throughput and latency performance than L4 serving, while L4 serving is generally more cost efficient than V100 serving. The serving efficiency of V100 and L4 GPUs is inferior to that of A100 GPUs, but V100 and L4 GPUs are nevertheless good serving solutions if you do not have A100 quota. The model deployment step will take 15 minutes to 1 hour to complete.

# @markdown The vLLM project is a highly optimized LLM serving framework that can substantially increase serving throughput. The higher your QPS, the more you benefit from using vLLM.


# Finds Vertex AI prediction supported accelerators and regions in
#  https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.


# @markdown Set the model name.
model_name = "CodeLlama-7b-Instruct-hf"  # @param ["CodeLlama-7b-hf", "CodeLlama-7b-Python-hf", "CodeLlama-7b-Instruct-hf", "CodeLlama-13b-hf", "CodeLlama-13b-Python-hf", "CodeLlama-13b-Instruct-hf", "CodeLlama-34b-hf", "CodeLlama-34b-Python-hf", "CodeLlama-34b-Instruct-hf", "CodeLlama-70b-hf", "CodeLlama-70b-Python-hf", "CodeLlama-70b-Instruct-hf"]
model_id = os.path.join(MODEL_BUCKET, model_name)
print(model_id)

# @markdown Find Vertex AI prediction supported accelerators and regions at https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.

accelerator_type = "NVIDIA_TESLA_V100"  # @param ["NVIDIA_L4", "NVIDIA_TESLA_V100", "NVIDIA_TESLA_A100"]

# Recommended (machine_type, accelerator_count, vllm_precision, vllm_swap_space)
# for each model size and accelerator type. Sizes are matched as substrings of
# `model_name`, in the order listed. V100 entries use float16 while A100 and L4
# entries use bfloat16. Combinations that are absent (e.g. 70B on V100) have
# no recommended setting.
VLLM_SERVING_CONFIGS = {
    "7b": {
        "NVIDIA_TESLA_A100": ("a2-highgpu-1g", 1, "bfloat16", 16),  # 1 A100 (40G)
        "NVIDIA_TESLA_V100": ("n1-standard-8", 1, "float16", 16),  # 1 V100 (16G)
        "NVIDIA_L4": ("g2-standard-8", 1, "bfloat16", 16),  # 1 L4 (24G)
    },
    "13b": {
        "NVIDIA_TESLA_A100": ("a2-highgpu-1g", 1, "bfloat16", 16),  # 1 A100 (40G)
        "NVIDIA_TESLA_V100": ("n1-standard-16", 2, "float16", 16),  # 2 V100 (16G)
        "NVIDIA_L4": ("g2-standard-24", 2, "bfloat16", 16),  # 2 L4 (24G)
    },
    "34b": {
        "NVIDIA_TESLA_A100": ("a2-highgpu-2g", 2, "bfloat16", 16),  # 2 A100 (40G)
        "NVIDIA_TESLA_V100": ("n1-standard-32", 8, "float16", 12),  # 8 V100 (16G)
        "NVIDIA_L4": ("g2-standard-48", 4, "bfloat16", 16),  # 4 L4 (24G)
    },
    "70b": {
        "NVIDIA_TESLA_A100": ("a2-highgpu-4g", 4, "bfloat16", 16),  # 4 A100 (40G)
        "NVIDIA_L4": ("g2-standard-96", 8, "bfloat16", 16),  # 8 L4 (24G)
    },
}

# Pick the first size tag contained in the model name, then the entry for the
# chosen accelerator. An unknown size or an unsupported accelerator both end
# in the same explicit error instead of leaving the settings undefined.
vllm_model_size = next(
    (size for size in VLLM_SERVING_CONFIGS if size in model_name), None
)
vllm_serving_config = VLLM_SERVING_CONFIGS.get(vllm_model_size, {}).get(
    accelerator_type
)
if vllm_serving_config is None:
    raise ValueError(
        f"Recommended GPU setting not found for: {accelerator_type} and {model_name}."
    )
machine_type, accelerator_count, vllm_precision, vllm_swap_space = vllm_serving_config

# Arguments for the vLLM deployment, using the hardware and precision settings
# selected above.
vllm_deploy_kwargs = dict(
    model_name=get_job_name_with_datetime(prefix="code-llama-serve-vllm"),
    model_id=model_id,
    service_account=SERVICE_ACCOUNT,
    precision=vllm_precision,
    swap_space=vllm_swap_space,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
)
model_vllm, endpoint_vllm = deploy_model_vllm(**vllm_deploy_kwargs)
print(f"Endpoint name: {endpoint_vllm.name}")

In [None]:
# @title Prediction with endpoint
# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts. Parameters supported by vLLM can be found [here](https://github.com/vllm-project/vllm/blob/2e8e49fce3775e7704d413b2f02da6d7c99525c9/vllm/sampling_params.py#L23-L64).

# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoint_vllm.name` allows us to get the endpoint
#   name of the endpoint `endpoint_vllm` created in the cell above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint.

# endpoint_name = endpoint_vllm.name
# # endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoint_vllm = aiplatform.Endpoint(aip_endpoint_name)

# Request parameters, editable through the notebook form.
prompt = "import argparse"  # @param {type:"string"}
n = 1  # @param {type:"integer"}
max_tokens = 200  # @param {type:"integer"}
temperature = 1.0  # @param {type:"number"}
top_p = 1.0  # @param {type:"number"}
# `top_k` counts tokens, so it is exposed as an integer field.
top_k = 10  # @param {type:"integer"}

# Build the request from the form values above so that edits made in the form
# actually reach the endpoint.
instances = [
    {
        "prompt": prompt,
        "n": n,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
    },
]
response = endpoint_vllm.predict(instances=instances)
print(response.predictions[0])

In [None]:
# @title Clean up resources
# @markdown Delete the experiment models and endpoints to recycle the resources
# @markdown and avoid unnecessary ongoing charges.

delete_bucket = False  # @param {type:"boolean"}
if delete_bucket:
    ! gsutil -m rm -r $BUCKET_URI
    # Uncomment below to delete all artifacts
    # !gsutil -m rm -r $STAGING_BUCKET $MODEL_BUCKET

# Undeploy models and delete endpoints.
endpoint.delete(force=True)
endpoint_vllm.delete(force=True)

# Delete models.
model.delete()
model_vllm.delete()