In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - LLaMA2 (Quantization)

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_pytorch_llama2_quantization.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_llama2_quantization.ipynb">
      <img alt="GitHub logo" src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview

This notebook demonstrates downloading prebuilt [LLaMA2 models](https://huggingface.co/meta-llama), deploying prequantized LLaMA2 models with [vLLM](https://github.com/vllm-project/vllm), and quantizing LLaMA2 models using either AWQ or GPTQ to reduce the GPU memory requirements before deploying these quantized models with vLLM as well.

### Objective

- Download prebuilt LLaMA2 models
- Deploy prequantized LLaMA2 models on vLLM
- Quantize LLaMA2 models with AWQ or GPTQ and deploy on vLLM

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin

In [None]:
# @title Setup Google Cloud project

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 2. [Optional] [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs. Set the BUCKET_URI for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the specified region (`REGION`). Note that a multi-region bucket (eg. "us") is not considered a match for a single region covered by the multi-region range (eg. "us-central1"). If not set, a unique GCS bucket will be created instead.

# Import the necessary packages
import os
from datetime import datetime
from typing import Tuple

from google.cloud import aiplatform

# Get the default cloud project id.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# Get the default region for launching jobs.
REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Enable the Vertex AI API and Compute Engine API, if not already.
print("Enabling Vertex AI API and Compute Engine API.")
! gcloud services enable aiplatform.googleapis.com compute.googleapis.com

# Cloud Storage bucket for storing the experiment artifacts.
# A unique GCS bucket will be created for the purpose of this notebook. If you
# prefer using your own GCS bucket, please change the value yourself below.
now = datetime.now().strftime("%Y%m%d%H%M%S")
BUCKET_URI = "gs://"  # @param {type:"string"}

if BUCKET_URI is None or BUCKET_URI.strip() == "" or BUCKET_URI == "gs://":
    BUCKET_URI = f"gs://{PROJECT_ID}-tmp-{now}"
    ! gsutil mb -l {REGION} {BUCKET_URI}
else:
    assert BUCKET_URI.startswith("gs://"), "BUCKET_URI must start with `gs://`."
    BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])
    shell_output = ! gsutil ls -Lb {BUCKET_NAME} | grep "Location constraint:" | sed "s/Location constraint://"
    bucket_region = shell_output[0].strip().lower()
    if bucket_region != REGION:
        raise ValueError(
            "Bucket region %s is different from notebook region %s"
            % (bucket_region, REGION)
        )
print(f"Using this GCS Bucket: {BUCKET_URI}")

STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")
MODEL_BUCKET = os.path.join(BUCKET_URI, "llama2")


# Initialize Vertex AI API.
print("Initializing Vertex AI API.")
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

# Gets the default SERVICE_ACCOUNT.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)

# Provision permissions to the SERVICE_ACCOUNT with the GCS bucket
BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.admin $BUCKET_NAME

! gcloud config set project $PROJECT_ID

# @markdown # Access LLaMA2 models on Vertex AI for GPU based serving
# @markdown The original models from Meta are converted into the Hugging Face format for serving in Vertex AI.
# @markdown Accept the model agreement to access the models:
# @markdown 1. Open the [LLaMA2 model card](https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/139) from [Vertex AI Model Garden](https://cloud.google.com/model-garden).
# @markdown 2. Review the agreement on the model card page.
# @markdown 3. After accepting the agreement of LLaMA2, a `gs://` URI containing LLaMA2 pretrained and finetuned models will be shared.
# @markdown 4. Paste the link in the `VERTEX_AI_MODEL_GARDEN_LLAMA2` field below.
# @markdown 5. The LLaMA2 models will be copied into `BUCKET_URI`.


VERTEX_AI_MODEL_GARDEN_LLAMA2 = ""  # @param {type:"string", isTemplate:true}
assert (
    VERTEX_AI_MODEL_GARDEN_LLAMA2
), "Please click the agreement of LLaMA2 in Vertex AI Model Garden, and get the GCS path of LLaMA2 model artifacts."
print(
    "Copying LLaMA2 model artifacts from",
    VERTEX_AI_MODEL_GARDEN_LLAMA2,
    "to ",
    MODEL_BUCKET,
)

! gsutil -m cp -R $VERTEX_AI_MODEL_GARDEN_LLAMA2/* $MODEL_BUCKET
base_model_path_prefix = MODEL_BUCKET

# The pre-built serving docker images.
VLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240222_0916_RC00"
TRAIN_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-peft-train:20240213_1108_RC00"


def get_job_name_with_datetime(prefix: str) -> str:
    """Returns `prefix` followed by `_YYYYMMDD_HHMMSS` for the current local time.

    Used to give training and deployment jobs in Vertex AI distinct names.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return "_".join((prefix, timestamp))


def deploy_model(
    model_name: str,
    model_id: str,
    service_account: str,
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    quantization_method: str = "",
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a vLLM-served model to Vertex AI and deploys it to a new endpoint.

    Args:
        model_name: Display name of the uploaded model; the endpoint is named
            `<model_name>-endpoint`.
        model_id: Model identifier passed to the server as `--model` and exposed
            to the container as the `MODEL_ID` environment variable.
        service_account: Service account passed to the deployment.
        machine_type: Machine type passed to the deployment.
        accelerator_type: Accelerator type passed to the deployment.
        accelerator_count: Number of accelerators for the deployment; also used
            as the server's `--tensor-parallel-size`.
        quantization_method: When non-empty, forwarded to the server as
            `--quantization=<method>`.

    Returns:
        The `(model, endpoint)` pair that was created.
    """
    # The endpoint is created before the model is uploaded.
    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")

    # The quantization flag is optional and, when present, goes last.
    quantization_flags = (
        [f"--quantization={quantization_method}"] if quantization_method else []
    )
    server_args = [
        "--host=0.0.0.0",
        "--port=7080",
        f"--model={model_id}",
        f"--tensor-parallel-size={accelerator_count}",
        "--swap-space=16",
        "--gpu-memory-utilization=0.9",
        "--max-num-batched-tokens=4096",
        "--disable-log-stats",
        *quantization_flags,
    ]

    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=VLLM_DOCKER_URI,
        serving_container_command=["python", "-m", "vllm.entrypoints.api_server"],
        serving_container_args=server_args,
        serving_container_ports=[7080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables={"MODEL_ID": model_id},
    )

    print(
        f"Deploying {model_id} on {machine_type} with {accelerator_count} {accelerator_type} GPU(s)."
    )
    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
    )

    # Tell the user how to re-attach to this endpoint from another session.
    print("endpoint_name:", endpoint.name)
    print("To load this existing endpoint from a different session:")
    print("from google.cloud import aiplatform")
    print(
        f'endpoint = aiplatform.Endpoint("projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint.name}")'
    )
    return model, endpoint


# Placeholders for resources created by later cells. They start as None so
# that cells which test them for truthiness work even if a section was skipped.
quantize_job = endpoint_prequantized_vllm = endpoint_quantized_vllm = None
model_prequantized_vllm = model_quantized_vllm = None

## Quantize LLaMA2 models and deploy

Quantization reduces the amount of GPU memory required to serve a model by reducing the bit precision of the weights while minimizing the drop in model quality. Two algorithms for doing so are AWQ and GPTQ. Read more about AWQ in the following publication: [AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration](https://arxiv.org/abs/2306.00978). Read more about GPTQ in the following publication: [GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers](https://arxiv.org/abs/2210.17323).

Many AWQ-quantized models are provided by TheBloke [here](https://huggingface.co/TheBloke?search_models=-awq), and GPTQ-quantized models are provided [here](https://huggingface.co/TheBloke?search_models=-gptq), including LLaMA2. To save time and cost, you can skip the `Quantize` section and select a pre-quantized model from the dropdown to deploy in the `Deploy` section.

Quantizing models with AWQ will take around 0.5 hours for LLaMA2 7B, 1.5 hours for LLaMA2 13B, and 4.5 hours for LLaMA2 70B, using 1 NVIDIA_L4 GPU for the 7B and 13B models and 8 NVIDIA_L4 GPUs for the 70B model. Quantizing models with GPTQ will take around 1.5 hours for LLaMA2 7B, 2.5 to 3 hours for LLaMA2 13B, and 6 hours for LLaMA2 70B, using 1 NVIDIA_L4 GPU for the 7B and 13B models and 8 NVIDIA_L4 GPUs for the 70B model. Finetuned LLaMA2 models can also be quantized, so long as the LoRA weights are merged with the base model. Custom datasets can be used by specifying a text file in Cloud Storage.

To use your own dataset, please [upload it to Google Cloud Storage](https://cloud.google.com/storage/docs/uploading-objects) and put the `gs://` URI in either the `awq_dataset_name` field or the `gptq_dataset_name` field below. The dataset should be a text file with each sample on a new line.

For example, the following would be a custom calibration dataset with 3 examples from the [Penn Treebank Project: Release 2 CDROM](https://huggingface.co/datasets/ptb_text_only) dataset:
```
the plant will produce control devices used in motor vehicles and household appliances
cray research did not want to fund a project that did not include seymour
no price for the new shares has been set
```

In [None]:
# @title Quantize

# Setup quantization job.

# @markdown Set `finetuned_model_path` to a finetuned LLaMA2 model stored in GCS to quantize the finetuned model. If not, the base model will be quantized.

base_model_name = "llama2-7b-chat-hf"  # @param ["llama2-7b-hf", "llama2-7b-chat-hf", "llama2-13b-hf", "llama2-13b-chat-hf", "llama2-70b-hf", "llama2-70b-chat-hf"] {isTemplate:true}
base_model_path = os.path.join(base_model_path_prefix, base_model_name)

accelerator_type = "NVIDIA_L4"  # @param ["NVIDIA_L4"] {isTemplate: true}
finetuned_model_path = ""  # @param {type:"string"}
# Quantize the finetuned model when one is given, otherwise the base model.
if finetuned_model_path:
    model_path = finetuned_model_path
else:
    model_path = base_model_path

quantization_method = "awq"  # @param ["awq", "gptq"]
quantization_job_name = get_job_name_with_datetime(
    f"llama2-{quantization_method}-quantize"
)

# The job receives the output directory with the "gs://" prefix replaced by
# "/gcs/"; the "gs://" form is kept for messages and for the Deploy section.
quantization_output_dir = os.path.join(MODEL_BUCKET, quantization_job_name)
quantization_output_dir_gcsfuse = quantization_output_dir.replace("gs://", "/gcs/")
print("Quantized models will be saved in: ", quantization_output_dir)

# Worker pool spec.
# The machine size is chosen from `base_model_name`, also when a finetuned
# model path is supplied.
if accelerator_type == "NVIDIA_L4" and (
    "7b" in base_model_name or "13b" in base_model_name
):
    # Sets 1 L4 (24G) to quantize 7B and 13B models.
    machine_type = "g2-standard-16"
    accelerator_count = 1
elif accelerator_type == "NVIDIA_L4" and "70b" in base_model_name:
    # Sets 8 L4 (24G) to quantize 70B models.
    machine_type = "g2-standard-96"
    accelerator_count = 8
else:
    raise ValueError(
        f"Recommended GPU setting not found for: {accelerator_type} and {base_model_name}."
    )

# @markdown ---
# @markdown AWQ parameters only
# Quantization parameters.
quantization_precision_mode = "4bit"
# Arguments shared by both quantization methods; the method-specific ones are
# appended below in the same order as before.
quantization_args = [
    "--task=quantize-model",
    f"--quantization_method={quantization_method}",
    f"--pretrained_model_id={model_path}",
    f"--quantization_precision_mode={quantization_precision_mode}",
    f"--quantization_output_dir={quantization_output_dir_gcsfuse}",
]
if quantization_method == "awq":
    awq_dataset_name = "pileval"  # @param ["pileval"] {allow-input: true}
    group_size = 128  # @param {type: "number"}
    quantization_args += [
        f"--quantization_dataset_name={awq_dataset_name}",
        f"--group_size={group_size}",
        "--cache_examples_on_gpu=False",
    ]
else:
    # @markdown ---
    # @markdown GPTQ parameters only

    # @markdown Provided are the original datasets used in GPTQ paper
    gptq_dataset_name = "wikitext2"  # @param ["wikitext2","c4","c4-new","ptb","ptb-new"] {allow-input: true}
    # Not passed to the job: the arguments use `quantization_precision_mode`.
    gptq_precision_mode = "4bit"
    group_size = -1  # @param {type: "number"}
    damp_percent = 0.1  # @param {type: "number"}
    desc_act = True  # @param {type: "boolean"}
    quantization_args += [
        f"--quantization_dataset_name={gptq_dataset_name}",
        f"--group_size={group_size}",
        f"--damp_percent={damp_percent}",
        f"--desc_act={desc_act}",
    ]

# Pass quantization arguments and launch job.
worker_pool_specs = [
    {
        "machine_spec": {
            "machine_type": machine_type,
            "accelerator_type": accelerator_type,
            "accelerator_count": accelerator_count,
        },
        "replica_count": 1,
        "disk_spec": {
            "boot_disk_type": "pd-ssd",
            "boot_disk_size_gb": 500,
        },
        "container_spec": {
            "image_uri": TRAIN_DOCKER_URI,
            "env": [
                {
                    "name": "PYTORCH_CUDA_ALLOC_CONF",
                    "value": "max_split_size_mb:32",
                },
            ],
            "command": [],
            "args": quantization_args,
        },
    }
]

# Report the model actually being quantized (finetuned or base).
print(f"Quantizing {model_path}.")
quantize_job = aiplatform.CustomJob(
    display_name=quantization_job_name,
    project=PROJECT_ID,
    worker_pool_specs=worker_pool_specs,
    staging_bucket=STAGING_BUCKET,
)
quantize_job.run()

print("Quantized models have been saved in: ", quantization_output_dir)

In [None]:
# @title Deploy
# @markdown This section uploads the model to Model Registry and deploys it to an Endpoint.

# @markdown The model deployment step will take 15 minutes to 1 hour to complete, depending on the model sizes.

# @markdown Setting `quantized_model_id` to `Custom quantized model` will deploy the quantized model from the section above.

# @markdown To deploy a pre-quantized LLaMA2 model, select a model from the dropdown.

quantized_model_id = "Custom quantized model"  # @param ["TheBloke/Llama-2-7B-Chat-AWQ", "TheBloke/Llama-2-13B-chat-AWQ", "TheBloke/Llama-2-70B-Chat-AWQ", "TheBloke/Llama-2-7B-Chat-GPTQ", "TheBloke/Llama-2-13B-chat-GPTQ", "TheBloke/Llama-2-70B-Chat-GPTQ", "Custom quantized model"] {isTemplate: true, allow-input:true}

if quantized_model_id == "Custom quantized model":
    # Serve the output of the Quantize section, which must have been run first.
    model_id = quantization_output_dir
    # quantization_method and base_model_name are set in the Quantize section
else:
    model_id = quantized_model_id
    # The last "-"-separated token names the method, e.g.
    # "TheBloke/Llama-2-7B-Chat-AWQ" -> "awq".
    quantization_method = quantized_model_id.split("-")[-1].lower()
    # The repository name without that token, e.g. "llama-2-7b-chat"; it is
    # used below to pick the machine size.
    base_model_name = "-".join(
        quantized_model_id.split("/")[-1].split("-")[:-1]
    ).lower()


# @markdown Deploying a quantized model requires much less GPU.
# @markdown We can deploy a quantized 13B model with only one `NVIDIA_L4` GPU instead of four, and
# @markdown we can deploy a quantized 70B model with only two `NVIDIA_L4` GPUs instead of eight.

# @markdown Finds Vertex AI prediction supported accelerators and regions in
# @markdown https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.

# @markdown Note: AWQ-quantized models cannot be deployed to NVIDIA_TESLA_V100.

accelerator_type = "NVIDIA_L4"  # @param ["NVIDIA_L4", "NVIDIA_TESLA_V100", "NVIDIA_TESLA_A100"] {isTemplate: true}
assert not (
    accelerator_type == "NVIDIA_TESLA_V100" and quantization_method == "awq"
), "Serving AWQ models on vLLM is not supported for NVIDIA_TESLA_V100."


# Pick the recommended machine for the model size and accelerator.
if "7b" in base_model_name or "13b" in base_model_name:
    # Quantized 7B and 13B models share the same single-GPU settings.
    if accelerator_type == "NVIDIA_L4":
        machine_type = "g2-standard-8"
        accelerator_count = 1
    elif accelerator_type == "NVIDIA_TESLA_V100":
        machine_type = "n1-standard-8"
        accelerator_count = 1
    else:
        raise ValueError(
            f"Recommended GPU setting not found for: {accelerator_type} and quantized {base_model_name}."
        )
elif "70b" in base_model_name:
    if accelerator_type == "NVIDIA_L4":
        # Sets 2 L4's (24G) to deploy LLaMA2 70B models.
        machine_type = "g2-standard-24"
        accelerator_count = 2
    elif accelerator_type == "NVIDIA_TESLA_V100":
        machine_type = "n1-highmem-16"
        accelerator_count = 4
    elif accelerator_type == "NVIDIA_TESLA_A100":
        accelerator_count = 1
        machine_type = "a2-highgpu-1g"
    else:
        raise ValueError(
            f"Recommended GPU setting not found for: {accelerator_type} and quantized {base_model_name}."
        )
else:
    raise ValueError(
        f"Recommended GPU setting not found for: {accelerator_type} and quantized {base_model_name}."
    )

# Pass the resolved `model_id`, not the dropdown value: for the custom option
# the dropdown value is only a label, not a model location.
model_quantized_vllm, endpoint_quantized_vllm = deploy_model(
    model_name=get_job_name_with_datetime(prefix="llama2-serve-vllm-quantized"),
    model_id=model_id,
    service_account=SERVICE_ACCOUNT,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    quantization_method=quantization_method,
)

# @markdown Click "Show Code" to see more details.

In [None]:
# @title Predict

# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts. Sampling parameters supported by vLLM can be found [here](https://github.com/vllm-project/vllm/blob/2e8e49fce3775e7704d413b2f02da6d7c99525c9/vllm/sampling_params.py#L23-L64).

# @markdown Example:

# @markdown ```
# @markdown Prompt:
# @markdown What is a car?
# @markdown Output:
# @markdown A car is a type of vehicle that is designed to transport people and goods on roads. It is typically powered by an engine and has four wheels, although some cars may have three or five wheels.
# @markdown ```

# @markdown Additionally, you can moderate the generated text with Vertex AI. See [Moderate text documentation](https://cloud.google.com/natural-language/docs/moderating-text) for more details.

prompt = "What is a car?"  # @param {type: "string"}
max_tokens = 50  # @param {type:"integer"}
temperature = 1.0  # @param {type:"number"}
top_p = 1.0  # @param {type:"number"}
top_k = 1  # @param {type:"integer"}
raw_response = False  # @param {type:"boolean"}

# Overrides parameters for inferences.
# If you encounter the issue like `ServiceUnavailable: 503 Took too long to respond when processing`,
# you can reduce the max length, such as set max_tokens as 20.
generation_options = {
    "max_tokens": max_tokens,
    "temperature": temperature,
    "top_p": top_p,
    "top_k": top_k,
    "raw_response": raw_response,
}
# A single request instance: the prompt followed by the options above.
instances = [{"prompt": prompt, **generation_options}]
response = endpoint_quantized_vllm.predict(instances=instances)

# Print each returned prediction on its own line.
for prediction in response.predictions:
    print(prediction)

# @markdown Click "Show Code" to see more details.

## Clean up resources

In [None]:
# @markdown  Delete the experiment models and endpoints to recycle the resources
# @markdown  and avoid unnecessary continouous charges that may incur.

# Delete the quantization job.
if quantize_job:
    quantize_job.delete()

# Undeploy model and delete endpoint.
if endpoint_quantized_vllm:
    endpoint_quantized_vllm.delete(force=True)

# Delete models.
if model_quantized_vllm:
    model_quantized_vllm.delete()

# Delete Cloud Storage objects that were created
delete_bucket = False  # @param {type: "boolean", isTemplate:true}
if delete_bucket:
    ! gsutil -m rm -r $BUCKET_URI