In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Falcon Instruct Deployment

<table align="left">
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_pytorch_falcon_instruct_deployment.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_falcon_instruct_deployment.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
</table>

## Overview

This notebook demonstrates deploying prebuilt Falcon Instruct models in Vertex AI with vLLM and running online predictions against the deployed endpoint.

### Objective

- Deploy prebuilt Falcon Instruct models
- Run online predictions against the deployed endpoint
- Clean up the resources

| Models |
| :- |
| [tiiuae/falcon-7b-instruct](https://huggingface.co/tiiuae/falcon-7b-instruct) |
| [tiiuae/falcon-40b-instruct](https://huggingface.co/tiiuae/falcon-40b-instruct) |

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Run the notebook

In [None]:
# @title Setup Google Cloud project

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 2. [Optional] [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs. Set the BUCKET_URI for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the same region as the one where the notebook was launched. Note that a multi-region bucket (e.g. "us") is not considered a match for a single region covered by the multi-region range (e.g. "us-central1"). If not set, a unique GCS bucket will be created instead.

# Import the necessary packages
import os
from datetime import datetime
from typing import Tuple

from google.cloud import aiplatform, language

# Get the default cloud project id.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# Get the default region for launching jobs.
REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Enable the Vertex AI API and Compute Engine API, if not already.
print("Enabling Vertex AI API and Compute Engine API.")
! gcloud services enable aiplatform.googleapis.com compute.googleapis.com

# Cloud Storage bucket for storing the experiment artifacts.
# A unique GCS bucket will be created for the purpose of this notebook. If you
# prefer using your own GCS bucket, please change the value yourself below.
now = datetime.now().strftime("%Y%m%d%H%M%S")
BUCKET_URI = "gs://"  # @param {type:"string"}
if BUCKET_URI is None or BUCKET_URI.strip() == "" or BUCKET_URI == "gs://":
    # Create a unique GCS bucket for this notebook if not specified
    BUCKET_URI = f"gs://{PROJECT_ID}-tmp-{now}"
    ! gsutil mb -l {REGION} {BUCKET_URI}
else:
    assert BUCKET_URI.startswith("gs://"), "BUCKET_URI must start with `gs://`."
    BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])
    shell_output = ! gsutil ls -Lb {BUCKET_NAME} | grep "Location constraint:" | sed "s/Location constraint://"
    bucket_region = shell_output[0].strip().lower()
    if bucket_region != REGION:
        raise ValueError(
            "Bucket region %s is different from notebook region %s"
            % (bucket_region, REGION)
        )
print(f"Using this GCS Bucket: {BUCKET_URI}")

STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")

# Set up default SERVICE_ACCOUNT
SERVICE_ACCOUNT = None
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)

# Provision permissions to the SERVICE_ACCOUNT with the GCS bucket
BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.admin $BUCKET_NAME

aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

# The pre-built training and serving docker images.
TRAIN_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-peft-train:20240409_0936_RC00"
VLLM_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240410_0916_RC00"


def get_job_name_with_datetime(prefix: str) -> str:
    """Builds a job name by appending the current date and time to `prefix`.

    Args:
        prefix: Leading part of the job name.

    Returns:
        `prefix` followed by `_YYYYMMDD_HHMMSS` for the current local time.
    """
    timestamp_suffix = datetime.now().strftime("_%Y%m%d_%H%M%S")
    return "".join((prefix, timestamp_suffix))


def deploy_model(
    model_name: str,
    model_id: str,
    service_account: str,
    machine_type: str = "n1-standard-8",
    accelerator_type: str = "NVIDIA_TESLA_V100",
    accelerator_count: int = 1,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a vLLM-served model and deploys it to a new Vertex AI endpoint.

    Args:
        model_name: Display name of the uploaded model; the endpoint is named
            `<model_name>-endpoint`.
        model_id: Model identifier passed to vLLM via `--model` and exposed to
            the container as the `MODEL_ID` environment variable.
        service_account: Service account used for the deployment.
        machine_type: Machine type of the serving nodes.
        accelerator_type: Accelerator type attached to the serving nodes.
        accelerator_count: Number of accelerators; also used as the vLLM
            tensor parallel size.

    Returns:
        The uploaded model and the endpoint it was deployed to.
    """
    # Command-line flags for the vLLM API server inside the container.
    server_args = ["--host=0.0.0.0", "--port=7080"]
    server_args.append(f"--model={model_id}")
    server_args.append(f"--tensor-parallel-size={accelerator_count}")
    server_args.extend(
        [
            "--swap-space=16",
            "--gpu-memory-utilization=0.9",
            "--disable-log-stats",
            "--dtype=float16",
            "--trust-remote-code",
        ]
    )
    container_env = {"MODEL_ID": model_id}

    # The endpoint is created first, then the model is uploaded and deployed.
    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")

    upload_options = dict(
        display_name=model_name,
        serving_container_image_uri=VLLM_DOCKER_URI,
        serving_container_command=["python", "-m", "vllm.entrypoints.api_server"],
        serving_container_args=server_args,
        serving_container_ports=[7080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables=container_env,
    )
    uploaded_model = aiplatform.Model.upload(**upload_options)

    deploy_options = dict(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
    )
    uploaded_model.deploy(**deploy_options)

    return uploaded_model, endpoint

In [None]:
# @title Deploy prebuilt Falcon Instruct models

# @markdown This section deploys prebuilt Falcon Instruct models on the Endpoint. The model deployment step will take 15 minutes to 40 minutes to complete.

# @markdown The peak GPU memory usages for [tiiuae/falcon-7b-instruct](https://huggingface.co/tiiuae/falcon-7b-instruct), and [tiiuae/falcon-40b-instruct](https://huggingface.co/tiiuae/falcon-40b-instruct) are ~15.5G and ~84G respectively with the default settings. Please adjust the machine type, accelerator type and accelerator count accordingly. The default selection below uses L4. Note that V100 serving generally offers better throughput and latency performance than L4 serving, while L4 serving is generally more cost efficient than V100 serving. The serving efficiency of V100 and L4 GPUs is inferior to that of A100 GPUs, but V100 and L4 GPUs are nevertheless good serving solutions if you do not have A100 quota.

# @markdown We use the vLLM serving image to deploy prebuilt Falcon Instruct models.


# Find Vertex AI supported accelerators and regions in:
#  https://cloud.google.com/vertex-ai/docs/predictions/configure-compute


# @markdown Set the prebuilt model id.
prebuilt_model_id = "tiiuae/falcon-7b-instruct"  # @param ["tiiuae/falcon-7b-instruct", "tiiuae/falcon-40b-instruct"]

# The accelerator to use.
accelerator_type = (
    "NVIDIA_L4"  # @param["NVIDIA_TESLA_V100", "NVIDIA_L4", "NVIDIA_TESLA_A100"]
)

# Presets provided by this cell:
# - tiiuae/falcon-7b-instruct: 1 x A100, 1 x V100 (16G) or 1 x L4 (24G).
# - tiiuae/falcon-40b-instruct: 4 x A100 or 4 x L4 (24G).
#
# If A100 is not available, tiiuae/falcon-40b-instruct may also be served with
#  multiple V100s, but this cell has no preset for that combination: set
#  `machine_type` and `accelerator_count` yourself in that case. Please keep in
#  mind that the efficiency of serving with multiple V100s or multiple L4s is
#  inferior to that of serving with A100s.
# Compared with L4, V100 serving can have better throughput and latency.
# Compared with V100, L4 serving can be more cost efficient.

# The machine type must match the accelerator type and count, see the
# supported combinations in the "configure compute" page linked above.
if "7b" in prebuilt_model_id:
    if accelerator_type == "NVIDIA_TESLA_A100":
        machine_type = "a2-highgpu-1g"
        accelerator_count = 1
    elif accelerator_type == "NVIDIA_TESLA_V100":
        machine_type = "n1-standard-8"
        accelerator_count = 1
    elif accelerator_type == "NVIDIA_L4":
        machine_type = "g2-standard-8"
        accelerator_count = 1
    else:
        raise ValueError(
            f"Recommended GPU setting not found for: {accelerator_type} and {prebuilt_model_id}."
        )
elif "40b" in prebuilt_model_id:
    if accelerator_type == "NVIDIA_TESLA_A100":
        # Four A100s need the a2-highgpu-4g machine type; a2-highgpu-1g only
        # comes with a single A100 attached.
        machine_type = "a2-highgpu-4g"
        accelerator_count = 4
        # Sets A100 (80G) to deploy falcon-40b-instruct models for faster inferences.
        # A100 (80G) GPUs use the a2-ultragpu machine family together with the
        # NVIDIA_A100_80GB accelerator type.
        # machine_type = "a2-ultragpu-2g"
        # accelerator_type = "NVIDIA_A100_80GB"
        # accelerator_count = 2
    elif accelerator_type == "NVIDIA_L4":
        machine_type = "g2-standard-48"
        accelerator_count = 4
    else:
        raise ValueError(
            f"Recommended GPU setting not found for: {accelerator_type} and {prebuilt_model_id}."
        )
else:
    # Without this branch an unknown model id would only fail later with a
    # NameError on `machine_type`.
    raise ValueError(
        f"Recommended GPU setting not found for: {accelerator_type} and {prebuilt_model_id}."
    )

# Upload the model and deploy it to a new endpoint; `model` and `endpoint` are
# reused by the prediction and clean up cells.
model, endpoint = deploy_model(
    model_name=get_job_name_with_datetime(prefix="falcon-instruct-serve"),
    model_id=prebuilt_model_id,
    service_account=SERVICE_ACCOUNT,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
)
print("endpoint_name:", endpoint.name)

In [None]:
# @title Online Prediction

# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts. Sampling parameters supported by vLLM can be found [here](https://github.com/vllm-project/vllm/blob/2e8e49fce3775e7704d413b2f02da6d7c99525c9/vllm/sampling_params.py#L23-L64). Setting `raw_response` to `True` allows you to obtain raw outputs.
# @markdown Example:

# @markdown ```
# @markdown Human: What is a car?
# @markdown Assistant:  A car, or a motor car, is a road-connected human-transportation system used to move people or goods from one place to another. The term also encompasses a wide range of vehicles, including motorboats, trains, and aircrafts. Cars typically have four wheels, a cabin for passengers, and an engine or motor. They have been around since the early 19th century and are now one of the most popular forms of transportation, used for daily commuting, shopping, and other purposes.
# @markdown ```

# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoint.name` allows us to get the
#   endpoint created in the cell above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the commented-out `endpoint_name` block further down in
# this cell to load an existing endpoint.

# Prompt and sampling parameters sent with the request.
prompt = (
    "Distinguish between nuclear fusion and nuclear fission"  # @param {type: "string"}
)
max_tokens = 50  # @param {type:"integer"}
temperature = 1.0  # @param {type:"number"}
top_p = 1.0  # @param {type:"number"}
top_k = 10  # @param {type:"number"}

# endpoint_name = endpoint.name
# # endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoint = aiplatform.Endpoint(aip_endpoint_name)

# A single request instance carrying the prompt and the sampling parameters
# defined above.
instances = [
    {
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
    },
]
# Uses `endpoint` from the deployment cell unless it was reloaded above.
response = endpoint.predict(instances=instances)

# Print every prediction returned for the request.
for prediction in response.predictions:
    print(prediction)

In [None]:
# @title Clean up resources

# Removes the resources referenced by `endpoint`, `model` and, optionally,
# `BUCKET_URI`; these names are set by the setup and deployment cells.

# When enabled, recursively removes everything under BUCKET_URI.
delete_bucket = False  # @param {type:"boolean"}
if delete_bucket:
    ! gsutil -m rm -r $BUCKET_URI

# Undeploy models and delete endpoints.
endpoint.delete(force=True)

# Delete models.
model.delete()