In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - LLaMA2 (Quantization)

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_llama2_quantization.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_llama2_quantization.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/model_garden/model_garden_pytorch_llama2_quantization.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
Open in Vertex AI Workbench
    </a> (A Python-3 CPU notebook is recommended)
  </td>
</table>

## Overview

This notebook demonstrates downloading prebuilt [LLaMA2 models](https://huggingface.co/meta-llama), deploying prequantized LLaMA2 models with [vLLM](https://github.com/vllm-project/vllm), quantizing LLaMA2 models yourself with either AWQ or GPTQ to reduce the GPU memory requirements, and then deploying these quantized models with vLLM as well. This notebook uses [Text moderation APIs](https://cloud.google.com/natural-language/docs/moderating-text) to analyze predictions against a list of safety attributes.

### Objective

- Download prebuilt LLaMA2 models
- Deploy prequantized LLaMA2 models on vLLM
- Quantize LLaMA2 models with AWQ or GPTQ and deploy on vLLM

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage
* Cloud NL APIs

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), [Cloud NL API pricing](https://cloud.google.com/natural-language/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin

### Colab only
Run the following commands for Colab and skip this section if you are using Workbench.

In [None]:
import sys

# Only run the install and authentication steps when executing inside Colab,
# detected by the `google.colab` module already being loaded.
if "google.colab" in sys.modules:
    ! pip3 install --upgrade google-cloud-aiplatform
    ! pip3 install ipython pandas[output_formatting] google-cloud-language==2.10.0
    from google.colab import auth as google_auth

    # Authenticate the Colab user for Google Cloud access.
    google_auth.authenticate_user()

    # Restart the notebook kernel after installs.
    # NOTE: every cell below must be run again after the kernel comes back.
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

### Workbench only
If you are using Workbench, you should find that the necessary dependencies are already pre-installed. If this is not the case or if you have previously modified the existing libraries, you may install the dependencies using the following commands:
```
! pip3 install --upgrade google-cloud-aiplatform
! pip3 install ipython pandas[output_formatting] google-cloud-language==2.10.0
```

### Setup Google Cloud project

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

1. [Enable the Vertex AI API, Compute Engine API and Cloud Natural Language API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,compute_component,language.googleapis.com).

1. [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs.

1. [Create a service account](https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console) with the `Vertex AI User` and `Storage Object Admin` roles for deploying models to a Vertex AI endpoint.

### Import the necessary packages

In [None]:
import os
import sys
from datetime import datetime
from typing import Tuple

from google.cloud import aiplatform, language

Set the following variables for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the specified region (`REGION`). Note that a multi-region bucket (e.g. "us") is not considered a match for a single region covered by the multi-region range (e.g. "us-central1").

In [None]:
# Cloud project id.
PROJECT_ID = ""  # @param {type:"string"}

# Region for launching jobs.
REGION = ""  # @param {type:"string"}

# Cloud Storage bucket for storing experiments output.
# Start with gs:// prefix, e.g. gs://foo_bucket.
BUCKET_URI = "gs://"  # @param {type:"string"}

# Point gcloud at the project and enable the Cloud Natural Language API, which
# the text moderation helper in this notebook calls.
! gcloud config set project $PROJECT_ID
! gcloud services enable language.googleapis.com


# Staging location handed to `aiplatform.init` and to the quantization job.
STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")
# Root folder for this notebook's artifacts; the optional clean-up step
# removes it.
EXPERIMENT_BUCKET = os.path.join(BUCKET_URI, "peft")
# Destination of the LLaMA2 model artifacts copied from Model Garden.
BASE_MODEL_BUCKET = os.path.join(EXPERIMENT_BUCKET, "base_model")

# The service account used for deploying models to Vertex AI endpoints.
# A service account email looks like:
# '<service-account-name>@<project-id>.iam.gserviceaccount.com'
# Please go to https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console
# and create service account with `Vertex AI User` and `Storage Object Admin` roles.
SERVICE_ACCOUNT = ""  # @param {type:"string"}

# In Colab, authenticate again now that the project id is known.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user(project_id=PROJECT_ID)

### Initialize Vertex AI API

In [None]:
# Initialize the Vertex AI SDK with the project, region and staging bucket
# configured in the cell above.
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

### Define constants

In [None]:
# The pre-built training, serving and evaluation docker images.
TRAIN_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-peft-train:20231211_0936_RC00"
VLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20231127_0916_RC00"
VLLM_GPTQ_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:gptq"

### Define common functions

In [None]:
def get_job_name_with_datetime(prefix: str) -> str:
    """Builds a job name by appending the current local date and time.

    Args:
        prefix: Text placed in front of the timestamp.

    Returns:
        The prefix followed by `_YYYYMMDD_HHMMSS`, suitable for naming
        training or deployment jobs in Vertex AI.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"{prefix}_{timestamp}"


def deploy_model_vllm(
    model_name: str,
    model_id: str,
    service_account: str,
    machine_type: str = "n1-standard-8",
    accelerator_type: str = "NVIDIA_TESLA_V100",
    accelerator_count: int = 1,
    quantization_method: str = "",
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a vLLM-served model and deploys it to a new Vertex AI endpoint.

    Args:
        model_name: Display name of the uploaded model; the endpoint is named
            `{model_name}-endpoint`.
        model_id: Value passed to the vLLM server's `--model` flag.
        service_account: Service account used for the deployment.
        machine_type: Machine type of the serving nodes.
        accelerator_type: Accelerator attached to the serving nodes.
        accelerator_count: Number of accelerators; also used as the vLLM
            tensor parallel size.
        quantization_method: Optional value for the vLLM `--quantization`
            flag. `"gptq"` additionally selects the GPTQ serving image.

    Returns:
        A `(model, endpoint)` tuple of the uploaded model and the endpoint it
        was deployed to.
    """
    # The endpoint is created first, before the model is uploaded.
    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")

    server_args = [
        "--host=0.0.0.0",
        "--port=7080",
        f"--model={model_id}",
        f"--tensor-parallel-size={accelerator_count}",
        "--swap-space=16",
        "--gpu-memory-utilization=0.9",
        "--max-num-batched-tokens=4096",
        "--disable-log-stats",
    ]
    if quantization_method:
        server_args += [f"--quantization={quantization_method}"]

    # GPTQ models are served from a dedicated image; everything else uses the
    # default vLLM image.
    serving_image_uri = (
        VLLM_GPTQ_DOCKER_URI if quantization_method == "gptq" else VLLM_DOCKER_URI
    )

    uploaded_model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=serving_image_uri,
        serving_container_command=["python", "-m", "vllm.entrypoints.api_server"],
        serving_container_args=server_args,
        serving_container_ports=[7080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
    )

    uploaded_model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
    )
    return uploaded_model, endpoint


def moderate_text(text: str) -> language.ModerateTextResponse:
    """Runs text moderation on `text` through the Cloud Natural Language client.

    Args:
        text: The text to analyze; it is sent as a plain-text document.

    Returns:
        The response returned by the client's `moderate_text` call.
    """
    language_client = language.LanguageServiceClient()
    plain_text_document = language.Document(
        content=text,
        type_=language.Document.Type.PLAIN_TEXT,
    )
    return language_client.moderate_text(document=plain_text_document)


def show_text_moderation(text: str, response: language.ModerateTextResponse) -> None:
    """Prints the analyzed text and a table of its moderation categories.

    Args:
        text: The text that was analyzed.
        response: The moderation response whose `moderation_categories` are
            listed, highest confidence first.
    """
    import pandas as pd

    ranked_categories = sorted(
        response.moderation_categories,
        key=lambda category: category.confidence,
        reverse=True,
    )
    rows = [(category.name, category.confidence) for category in ranked_categories]
    table = pd.DataFrame(data=rows, columns=["category", "confidence"])

    print(f"Text analyzed:\n{text}")
    # Confidence values are rendered as whole percentages.
    print(table.to_markdown(index=False, tablefmt="presto", floatfmt=".0%"))

## Access LLaMA2 pretrained and finetuned models
The original models from Meta are converted into the Hugging Face format for finetuning and serving in Vertex AI.

Accept the model agreement to access the models:
1. Navigate to the Vertex AI > Model Garden page in the Google Cloud console
2. Find the LLaMA2 model card and click on "VIEW DETAILS"
3. Review the agreement on the model card page
4. After you accept the LLaMA2 agreement, a Cloud Storage bucket containing LLaMA2 pretrained and finetuned models will be shared
5. Paste the Cloud Storage bucket link below and assign it to `VERTEX_AI_MODEL_GARDEN_LLAMA2`

In [None]:
# Cloud Storage path of the LLaMA2 model artifacts. The assert below stops the
# cell early when it has not been filled in.
VERTEX_AI_MODEL_GARDEN_LLAMA2 = ""  # Shared once you accept the LLaMA2 agreement in Vertex AI Model Garden.
assert (
    VERTEX_AI_MODEL_GARDEN_LLAMA2
), "Please click the agreement of LLaMA2 in Vertex AI Model Garden, and get the GCS path of LLaMA2 model artifacts."
print(
    "Copy LLaMA2 model artifacts from",
    VERTEX_AI_MODEL_GARDEN_LLAMA2,
    "to ",
    BASE_MODEL_BUCKET,
)
# Recursively copy every artifact into this project's base model folder.
! gsutil -m cp -R $VERTEX_AI_MODEL_GARDEN_LLAMA2/* $BASE_MODEL_BUCKET

Set the base model id.

In [None]:
# Name of the base model folder inside BASE_MODEL_BUCKET.
base_model_name = "llama2-7b-chat-hf"  # @param ["llama2-7b-hf", "llama2-7b-chat-hf", "llama2-13b-hf", "llama2-13b-chat-hf", "llama2-70b-hf", "llama2-70b-chat-hf"]
# Full Cloud Storage path of the selected base model; the quantization job
# uses it when no finetuned model path is given.
base_model_id = os.path.join(BASE_MODEL_BUCKET, base_model_name)

## Quantize and deploy LLaMA 2 models

This section demonstrates post-training quantization of LLaMA2 models with Vertex Custom Job. Quantization reduces the memory required by a model while attempting to retain the same performance. Two algorithms for doing so are AWQ and GPTQ. Read more about AWQ in the following publication: [AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration](https://arxiv.org/abs/2306.00978). Read more about GPTQ in the following publication: [GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers](https://arxiv.org/abs/2210.17323).

### Deploy pre-quantized models with Google Cloud Text Moderation
Many AWQ-quantized models are provided by TheBloke [here](https://huggingface.co/TheBloke?search_models=-awq), and GPTQ-quantized models are provided [here](https://huggingface.co/TheBloke?search_models=-gptq).

This section uploads the model to Model Registry and deploys it on the Endpoint.

The model deployment step will take 15 minutes to 1 hour to complete, depending on the model sizes.

Notice that deploying a quantized model requires far fewer GPUs.
We can deploy a quantized 13B model with only one L4 instead of four, and
we can deploy a quantized 70B model with only two L4s instead of eight.

In [None]:
# Pre-quantized model to deploy.
quantized_model_id = "TheBloke/Llama-2-7B-chat-AWQ"  # @param ["TheBloke/Llama-2-7B-chat-AWQ", "TheBloke/Llama-2-13B-chat-AWQ", "TheBloke/Llama-2-70B-chat-AWQ", "TheBloke/Llama-2-7B-chat-GPTQ", "TheBloke/Llama-2-13B-chat-GPTQ", "TheBloke/Llama-2-70B-chat-GPTQ"]

# The quantization method is the last dash-separated token of the model id,
# lowercased: "awq" or "gptq" for the options above.
quantization_method = quantized_model_id.split("-")[-1].lower()

# Finds Vertex AI prediction supported accelerators and regions in
# https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.

# Sets 1 L4 (24G) to deploy LLaMA2 7B and 13B models.
machine_type = "g2-standard-8"
accelerator_type = "NVIDIA_L4"
accelerator_count = 1

# Sets 2 L4's (24G) to deploy LLaMA2 70B models.
# machine_type = "g2-standard-24"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 2

# Upload the model and deploy it to a new endpoint. Both handles are kept so
# the clean-up cell can delete them.
model_prequantized_vllm, endpoint_prequantized_vllm = deploy_model_vllm(
    model_name=get_job_name_with_datetime(prefix="llama2-serve-vllm-prequantized"),
    model_id=quantized_model_id,
    service_account=SERVICE_ACCOUNT,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    quantization_method=quantization_method,
)

NOTE: After the deployment succeeds, the model weights will be downloaded on the fly. Thus an additional 10 to 40 minutes of waiting time (depending on the model size) is needed **after** the above model deployment step succeeds and before you run the next step below. Otherwise you might see a `ServiceUnavailable: 503 502:Bad Gateway` error when you send requests to the endpoint.

Once deployment succeeds, you can send requests to the endpoint with text prompts.

Example:

```
Human: What is a car?
Assistant:  A car, or a motor car, is a road-connected human-transportation system used to move people or goods from one place to another. The term also encompasses a wide range of vehicles, including motorboats, trains, and aircrafts. Cars typically have four wheels, a cabin for passengers, and an engine or motor. They have been around since the early 19th century and are now one of the most popular forms of transportation, used for daily commuting, shopping, and other purposes.
```

In [None]:
# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoint_prequantized_vllm.name` allows us to get the
#   endpoint name of the endpoint `endpoint_prequantized_vllm` created in the cell
#   above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint.

# endpoint_name = endpoint_prequantized_vllm.name
# # endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoint_prequantized_vllm = aiplatform.Endpoint(aip_endpoint_name)


# Overrides the `max_tokens` and `top_k` parameters for this request.
# If you encounter an issue like `ServiceUnavailable: 503 Took too long to respond when processing`,
# you can reduce `max_tokens`, for example by setting it to 20.
# If you are using L4 GPUs to serve LLaMA2 70B models, you should set
# `max_tokens` to around 1,000 tokens or fewer. If you need longer generated
# sequences, please file a request with Vertex to allowlist your project for a
# longer timeout threshold with Vertex endpoints.
instances = [
    {
        "prompt": "What is a car?",
        "max_tokens": 50,
        "temperature": 1.0,
        "top_p": 1.0,
        "top_k": 10,
    },
]
# `response` is read again by the text moderation cell that follows.
response = endpoint_prequantized_vllm.predict(instances=instances)

for prediction in response.predictions:
    print(prediction)

Text moderation analyzes a document against a list of safety attributes, which include "harmful categories" and topics that may be considered sensitive.

In [None]:
# Run text moderation on every prediction returned by the endpoint.
for generated_text in response.predictions:
    # Send a request to the API. The result is stored under its own name so
    # the prediction `response` is not overwritten and this cell can be re-run.
    moderation_response = moderate_text(generated_text)
    # Show the results.
    show_text_moderation(generated_text, moderation_response)

### Quantize LLaMA2 models

Quantization reduces the amount of GPU memory required to serve a model by reducing the bit precision of the weights while minimizing the drop in performance. Serving quantized models on vLLM requires models to be quantized to 4 bits. It is recommended to first check whether a model has already been quantized and made publicly available: [AWQ](https://huggingface.co/TheBloke?search_models=-awq) and [GPTQ](https://huggingface.co/TheBloke?search_models=-gptq). Quantizing models with AWQ will take around 0.5 hours for LLaMA2 7B, 1.5 hours for LLaMA2 13B, and 4.5 hours for LLaMA2 70B, using 1 NVIDIA_L4 GPU for 7B and 13B models and 8 NVIDIA_L4 GPUs for the 70B model. Quantizing models with GPTQ will take around 1.5 hours for LLaMA2 7B, 2.5 to 3 hours for LLaMA2 13B, and 6 hours for LLaMA2 70B, using 1 NVIDIA_L4 GPU for 7B and 13B models and 8 NVIDIA_L4 GPUs for the 70B model. Finetuned LLaMA2 models can also be quantized, so long as the LoRA weights are merged with the base model.

In [None]:
# Setup quantization job.

# Set `finetuned_model_path` to a finetuned LLaMA2 model stored in GCS to
# quantize it. If left empty, the base model will be quantized.
finetuned_model_path = ""  # @param {type:"string"}
prequantized_model_path = finetuned_model_path or base_model_id

quantization_method = "awq"  # @param ["awq", "gptq"]
quantization_job_name = get_job_name_with_datetime(
    f"llama2-{quantization_method}-quantize"
)

# Store the quantized model under EXPERIMENT_BUCKET, the folder that the
# optional clean-up step removes.
quantization_output_dir = os.path.join(EXPERIMENT_BUCKET, quantization_job_name)
# The job container receives the output location in its `/gcs/` path form,
# while the deployment cell uses the `gs://` URI.
quantization_output_dir_gcsfuse = quantization_output_dir.replace("gs://", "/gcs/")

# Worker pool spec.

# Sets 1 L4 (24G) to quantize 7B and 13B models.
machine_type = "g2-standard-16"
accelerator_type = "NVIDIA_L4"
accelerator_count = 1

# Sets 8 L4 (24G) to quantize 70B models.
# machine_type = "g2-standard-96"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 8


# Quantization parameters.
quantization_precision_mode = "4bit"

# Arguments shared by both quantization methods. The model to quantize is
# `prequantized_model_path`, so a finetuned model path is honoured when set.
quantization_args = [
    "--task=quantize-model",
    f"--quantization_method={quantization_method}",
    f"--pretrained_model_id={prequantized_model_path}",
    f"--quantization_precision_mode={quantization_precision_mode}",
    f"--quantization_output_dir={quantization_output_dir_gcsfuse}",
]
if quantization_method == "awq":
    awq_dataset_name = "pileval"
    group_size = 128
    quantization_args += [
        f"--quantization_dataset_name={awq_dataset_name}",
        f"--group_size={group_size}",
    ]
else:
    # The original datasets used in GPTQ paper ["wikitext2","c4","c4-new","ptb","ptb-new"].
    gptq_dataset_name = "c4"  # @param {type:"string"}
    group_size = -1
    damp_percent = 0.1
    desc_act = True
    quantization_args += [
        f"--quantization_dataset_name={gptq_dataset_name}",
        f"--group_size={group_size}",
        f"--damp_percent={damp_percent}",
        f"--desc_act={desc_act}",
        "--cache_examples_on_gpu=False",
    ]

# Pass quantization arguments and launch job.
worker_pool_specs = [
    {
        "machine_spec": {
            "machine_type": machine_type,
            "accelerator_type": accelerator_type,
            "accelerator_count": accelerator_count,
        },
        "replica_count": 1,
        "disk_spec": {
            "boot_disk_type": "pd-ssd",
            "boot_disk_size_gb": 500,
        },
        "container_spec": {
            "image_uri": TRAIN_DOCKER_URI,
            "env": [
                {
                    "name": "PYTORCH_CUDA_ALLOC_CONF",
                    "value": "max_split_size_mb:32",
                },
            ],
            "command": [],
            "args": quantization_args,
        },
    }
]

print(f"Quantizing {prequantized_model_path}.")
quantize_job = aiplatform.CustomJob(
    display_name=quantization_job_name,
    project=PROJECT_ID,
    worker_pool_specs=worker_pool_specs,
    staging_bucket=STAGING_BUCKET,
)
quantize_job.run()

print("Quantized models were saved in: ", quantization_output_dir)

### Deploy quantized models with Google Cloud Text Moderation
This section uploads the model to Model Registry and deploys it on the Endpoint.

The model deployment step will take 15 minutes to 1 hour to complete, depending on the model sizes.

Notice that deploying a quantized model requires far fewer GPUs.
We can deploy a quantized 13B model with only one L4 instead of four, and
we can deploy a quantized 70B model with only two L4s instead of eight.

In [None]:
# Finds Vertex AI prediction supported accelerators and regions in
# https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.

# Sets 1 L4 (24G) to deploy LLaMA2 7B and 13B models.
machine_type = "g2-standard-8"
accelerator_type = "NVIDIA_L4"
accelerator_count = 1

# Sets 2 L4's (24G) to deploy LLaMA2 70B models.
# machine_type = "g2-standard-24"
# accelerator_type = "NVIDIA_L4"
# accelerator_count = 2

# Deploy the model written by the quantization job. `quantization_output_dir`
# and `quantization_method` are the values set in the quantization cell.
model_quantized_vllm, endpoint_quantized_vllm = deploy_model_vllm(
    model_name=get_job_name_with_datetime(prefix="llama2-serve-vllm-quantized"),
    model_id=quantization_output_dir,
    service_account=SERVICE_ACCOUNT,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    quantization_method=quantization_method,
)

NOTE: After the deployment succeeds, the model weights will be downloaded on the fly. Thus an additional 10 to 40 minutes of waiting time (depending on the model size) is needed **after** the above model deployment step succeeds and before you run the next step below. Otherwise you might see a `ServiceUnavailable: 503 502:Bad Gateway` error when you send requests to the endpoint.

Once deployment succeeds, you can send requests to the endpoint with text prompts.

Example:

```
Human: What is a car?
Assistant:  A car, or a motor car, is a road-connected human-transportation system used to move people or goods from one place to another. The term also encompasses a wide range of vehicles, including motorboats, trains, and aircrafts. Cars typically have four wheels, a cabin for passengers, and an engine or motor. They have been around since the early 19th century and are now one of the most popular forms of transportation, used for daily commuting, shopping, and other purposes.
```

In [None]:
# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoint_quantized_vllm.name` allows us to get the
#   endpoint name of the endpoint `endpoint_quantized_vllm` created in the cell
#   above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint.

# endpoint_name = endpoint_quantized_vllm.name
# # endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoint_quantized_vllm = aiplatform.Endpoint(aip_endpoint_name)


# Overrides the `max_tokens` and `top_k` parameters for this request.
# If you encounter an issue like `ServiceUnavailable: 503 Took too long to respond when processing`,
# you can reduce `max_tokens`, for example by setting it to 20.
# If you are using L4 GPUs to serve LLaMA2 70B models, you should set
# `max_tokens` to around 1,000 tokens or fewer. If you need longer generated
# sequences, please file a request with Vertex to allowlist your project for a
# longer timeout threshold with Vertex endpoints.
instances = [
    {
        "prompt": "What is a car?",
        "max_tokens": 50,
        "temperature": 1.0,
        "top_p": 1.0,
        "top_k": 10,
    },
]
# `response` is read again by the text moderation cell that follows.
response = endpoint_quantized_vllm.predict(instances=instances)

for prediction in response.predictions:
    print(prediction)

Text moderation analyzes a document against a list of safety attributes, which include "harmful categories" and topics that may be considered sensitive.

In [None]:
# Run text moderation on every prediction returned by the endpoint.
for generated_text in response.predictions:
    # Send a request to the API. The result is stored under its own name so
    # the prediction `response` is not overwritten and this cell can be re-run.
    moderation_response = moderate_text(generated_text)
    # Show the results.
    show_text_moderation(generated_text, moderation_response)

## Clean up resources

In [None]:
# NOTE: this cell expects both deployment sections and the quantization
# section to have been run; names from a skipped section are undefined here.

# Delete the quantization job.
quantize_job.delete()

# Undeploy model and delete endpoint.
endpoint_prequantized_vllm.delete(force=True)
endpoint_quantized_vllm.delete(force=True)

# Delete models.
model_prequantized_vllm.delete()
model_quantized_vllm.delete()

# Delete Cloud Storage objects that were created.
# Set `delete_bucket` to True to remove everything under EXPERIMENT_BUCKET,
# which includes the copied base model artifacts.
delete_bucket = False
if delete_bucket:
    ! gsutil -m rm -r $EXPERIMENT_BUCKET