In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden GenAI Workshop for Whisper Large v3

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_pytorch_whisper_large_v3_gradio.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_whisper_large_v3_gradio.ipynb">
      <img alt="GitHub logo" src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview

This notebook demonstrates starting a [Gradio UI](https://www.gradio.app/) based playground for the [openai/whisper-large-v3](https://huggingface.co/openai/whisper-large-v3) and [openai/whisper-large-v3-turbo](https://huggingface.co/openai/whisper-large-v3-turbo) models, which allows users to interact with these ASR (automatic speech recognition) models.

### Objective

- Deploy model to a [Vertex AI Endpoint resource](https://cloud.google.com/vertex-ai/docs/predictions/using-private-endpoints).
- Run online predictions for `audio-to-text` tasks from the UI.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Run the playground

In [None]:
# @title Setup Google Cloud project and prepare dependencies

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 2. **[Optional]** [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs. Set the BUCKET_URI for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the same region as where the notebook was launched. Note that a multi-region bucket (eg. "us") is not considered a match for a single region covered by the multi-region range (eg. "us-central1"). If not set, a unique GCS bucket will be created instead.

BUCKET_URI = "gs://"  # @param {type:"string"}

# @markdown 3. **[Optional]** Set region. If not set, the region will be set automatically according to Colab Enterprise environment.

REGION = ""  # @param {type:"string"}

# @markdown 4. If you want to run predictions with A100 80GB or H100 GPUs, we recommend using the regions listed below. **NOTE:** Make sure you have associated quota in selected regions. Click the links to see your current quota for each GPU type: [Nvidia A100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_a100_80gb_gpus), [Nvidia H100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_h100_gpus).

# @markdown > | Machine Type | Accelerator Type | Recommended Regions |
# @markdown | ----------- | ----------- | ----------- |
# @markdown | a2-ultragpu-1g | 1 NVIDIA_A100_80GB | us-central1, us-east4, europe-west4, asia-southeast1 |
# @markdown | a3-highgpu-8g | 8 NVIDIA_H100_80GB | us-central1, us-west1, europe-west4, asia-southeast1 |

# Install pinned versions of the packages the playground cell imports
# (gradio for the UI, scipy for audio resampling and WAV encoding).
! pip3 install --upgrade gradio==5.1.0
! pip3 install scipy==1.14.1

import datetime
import importlib
import os
import uuid

from google.cloud import aiplatform

# Clone the samples repository so its notebook utilities can be imported below.
! git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git

# The checkout path contains hyphens ("vertex-ai-samples", "community-content"),
# which are not valid in a regular `import` statement, so the module is loaded
# by its dotted path string instead.
common_util = importlib.import_module(
    "vertex-ai-samples.community-content.vertex_model_garden.model_oss.notebook_util.common_util"
)


# Get the default cloud project id.
# Raises KeyError if GOOGLE_CLOUD_PROJECT is not set in the environment.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# Get the default region for launching jobs.
# Only consulted when the REGION form field above was left empty.
if not REGION:
    REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Enable the Vertex AI API and Compute Engine API, if not already.
print("Enabling Vertex AI API and Compute Engine API.")
! gcloud services enable aiplatform.googleapis.com compute.googleapis.com

# Cloud Storage bucket for storing the experiment artifacts.
# A unique GCS bucket will be created for the purpose of this notebook. If you
# prefer using your own GCS bucket, change the value yourself below.
now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])

if BUCKET_URI is None or BUCKET_URI.strip() == "" or BUCKET_URI == "gs://":
    BUCKET_URI = f"gs://{PROJECT_ID}-tmp-{now}-{str(uuid.uuid4())[:4]}"
    BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])
    ! gsutil mb -l {REGION} {BUCKET_URI}
else:
    assert BUCKET_URI.startswith("gs://"), "BUCKET_URI must start with `gs://`."
    shell_output = ! gsutil ls -Lb {BUCKET_NAME} | grep "Location constraint:" | sed "s/Location constraint://"
    bucket_region = shell_output[0].strip().lower()
    if bucket_region != REGION:
        raise ValueError(
            "Bucket region %s is different from notebook region %s"
            % (bucket_region, REGION)
        )
print(f"Using this GCS Bucket: {BUCKET_URI}")

STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")
MODEL_BUCKET = os.path.join(BUCKET_URI, "whisper_large_v3")


# Initialize Vertex AI API.
print("Initializing Vertex AI API.")
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

# Gets the default SERVICE_ACCOUNT.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)


# Provision permissions to the SERVICE_ACCOUNT with the GCS bucket
# (bucket-level grant of roles/storage.admin on BUCKET_NAME).
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.admin $BUCKET_NAME

# Make PROJECT_ID the active gcloud project, then add project-level role
# bindings for the same service account: roles/storage.admin and
# roles/aiplatform.user. `--no-user-output-enabled` suppresses the command output.
# NOTE(review): the project-level roles/storage.admin binding is broader than the
# bucket-level grant above - confirm that it is actually required.
! gcloud config set project $PROJECT_ID
! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role="roles/storage.admin"
! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role="roles/aiplatform.user"

In [None]:
# @title Start the playground

# @markdown This is a playground for running Whisper Large V3 models.

# @markdown After running the cell, a public URL (["https://*.gradio.live"](#)) will appear in the cell output. The playground is available in a separate browser tab when you click the URL.

# @markdown **How to use:**
# @markdown 1. Select or deploy model
# @markdown   1. In the playground, select a previously deployed Whisper model.
# @markdown   2. If you don't have any deployed model, deploy a new model in the playground.
# @markdown   3. New deployment takes ~20 minutes. You can check the progress at [Vertex Online Prediction](https://console.cloud.google.com/vertex-ai/online-prediction/endpoints).
# @markdown   4. After the model deployment is complete, restart the playground in Colab to see the updated endpoint list.
# @markdown 1. Inference
# @markdown   1. In the "Inference" section, provide a GCS URI or an audio clip (microphone or upload) and set the parameters.
# @markdown   2. Click "submit" to transcribe the audio to text.

# @markdown **Important notes**
# @markdown 1. Rerunning this notebook cell creates a new public URL. Previous URLs will stop working.
# @markdown 2. After experiments, manually undeploy models to avoid continuous charges to the project.

import base64
import io
import math

import gradio as gr
import scipy.signal as sps
from google.cloud import aiplatform
from scipy.io.wavfile import write

# The pre-built serving docker image. It contains serving scripts and models.
SERVE_DOCKER_URI = "us-docker.pkg.dev/deeplearning-platform-release/vertex-model-garden/pytorch-inference.cu125.0-1.ubuntu2204.py310"
# Sampling rate that `resample` converts input audio to and that `get_bytes`
# writes into the WAV header (presumably in Hz - TODO confirm).
DEFAULT_SAMPLING_RATE = 16000


# Fetch an access token once, when this cell runs; `predict_gcs` forwards it in
# the prediction request together with the GCS URI.
# NOTE(review): the token is never refreshed while the playground is running -
# confirm its lifetime is long enough for the intended session.
bearer_token = ! gcloud auth print-access-token
bearer_token = bearer_token[0].strip()


def is_whisper_endpoint(endpoint: aiplatform.Endpoint) -> bool:
    """Tells whether an endpoint's display name marks it as Whisper Large v3.

    The check is a case-insensitive substring match for "whisper-large-v3" on
    `endpoint.display_name`.
    """
    display_name = endpoint.display_name.lower()
    return "whisper-large-v3" in display_name


def list_endpoints() -> list[str]:
    """Returns dropdown labels for the usable Whisper endpoints.

    Endpoints are listed newest first for the project and region configured via
    `aiplatform.init`. An endpoint is kept only when it has a non-empty traffic
    split (i.e. a model is deployed to it) and `is_whisper_endpoint` accepts it.

    Returns:
        One "<endpoint name> - <display name, first 40 chars>" string per
        endpoint that passed the filter.
    """
    labels = []
    for endpoint in aiplatform.Endpoint.list(order_by="create_time desc"):
        # Skip endpoints that have no deployed model.
        if not endpoint.traffic_split:
            continue
        # Skip endpoints that do not serve a Whisper model.
        if not is_whisper_endpoint(endpoint):
            continue
        labels.append(f"{endpoint.name} - {endpoint.display_name[:40]}")
    return labels


def get_endpoint(endpoint_name: str) -> aiplatform.Endpoint:
    """Builds the Vertex endpoint object for a label made by `list_endpoints`.

    Args:
        endpoint_name: A "<endpoint id> - <display name>" label; only the part
            before the first " - " is used.

    Returns:
        The `aiplatform.Endpoint` for that id in PROJECT_ID / REGION.
    """
    endpoint_id, _, _ = endpoint_name.partition(" - ")
    resource_name = f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_id}"
    return aiplatform.Endpoint(resource_name)


def deploy_model(model_id: str, task_id, accelerator_type: str) -> aiplatform.Endpoint:
    """Creates a new Vertex prediction endpoint and deploys a model to it.

    All input validation and the quota check run before any cloud resource is
    created, so a rejected request does not leave an empty endpoint behind.

    Args:
        model_id: Model identifier; passed to the serving container as MODEL_ID
            and used as the uploaded model's display name.
        task_id: Task name passed to the serving container as TASK.
        accelerator_type: One of the keys of the machine type map below.

    Returns:
        The endpoint the model is being deployed to. The deployment is started
        with `sync=False`, so it may still be in progress on return.

    Raises:
        gr.Error: If `model_id` is empty or `accelerator_type` is unsupported.
    """
    if not model_id:
        raise gr.Error("Select a valid model name for model list.")

    machine_type_map = {
        "NVIDIA_TESLA_A100": "a2-highgpu-1g",
        "NVIDIA_L4": "g2-standard-12",
    }
    if accelerator_type not in machine_type_map:
        raise gr.Error(
            f"Select a valid accelerator type from {list(machine_type_map.keys())}"
        )
    machine_type = machine_type_map[accelerator_type]
    accelerator_count = 1
    common_util.check_quota(
        project_id=PROJECT_ID,
        region=REGION,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        is_for_training=False,
    )

    gr.Info("Model deployment started.")

    # Generate the name once so the endpoint and the status message agree.
    display_name = common_util.create_job_name(model_id)
    endpoint = aiplatform.Endpoint.create(display_name=display_name)
    serving_env = {
        "MODEL_ID": model_id,
        "TASK": task_id,
        "DEPLOY_SOURCE": "notebook_gradio",
    }

    model = aiplatform.Model.upload(
        display_name=model_id,
        serving_container_image_uri=SERVE_DOCKER_URI,
        serving_container_ports=[7080],
        serving_container_predict_route="/predict",
        serving_container_health_route="/health",
        serving_container_environment_variables=serving_env,
    )

    # sync=False: return immediately and let the deployment finish in the
    # background.
    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=SERVICE_ACCOUNT,
        sync=False,
    )

    gr.Info(
        f"Model {display_name} is being deployed. It may take ~20 minutes to complete."
    )

    return endpoint


def format_output(prediction):
    """Formats one prediction dict as display text.

    Args:
        prediction: Dict with a "text" entry and, optionally, a "timestamps"
            list whose items are dicts with "timestamp" (a two-element
            sequence) and "text".

    Returns:
        The plain "text" value when there are no timestamp segments; otherwise
        one "<start> - <end> : <text>" line per usable segment. Segments with a
        missing or malformed "timestamp", or with missing/empty "text", are
        skipped.

    Raises:
        gr.Error: If "text" is missing from the prediction.
    """
    if "text" not in prediction:
        # gr.Error only reaches the UI when it is raised; merely constructing
        # it has no effect.
        raise gr.Error("text is missing from predictions")

    segments = prediction.get("timestamps")
    if not segments:
        return prediction.get("text")

    lines = []
    for segment in segments:
        span, text = segment.get("timestamp"), segment.get("text")
        if not span or len(span) != 2 or not text:
            continue
        lines.append(f"{span[0]} - {span[1]} : {text}\n")
    return "".join(lines)


def resample(sr, data):
    """Resamples `data` from rate `sr` to DEFAULT_SAMPLING_RATE.

    Args:
        sr: Sampling rate of `data`.
        data: Audio samples, passed to `scipy.signal.resample`.

    Returns:
        The resampled samples; the new length is `len(data)` scaled by
        DEFAULT_SAMPLING_RATE / sr and rounded.
    """
    target_length = round(len(data) * float(DEFAULT_SAMPLING_RATE) / sr)
    return sps.resample(data, target_length)


def get_bytes(audio):
    """Encodes `audio` as WAV at DEFAULT_SAMPLING_RATE and returns the bytes.

    The WAV data is written to an in-memory buffer with
    `scipy.io.wavfile.write`, and the buffer's full contents are returned.
    """
    buffer = io.BytesIO()
    write(buffer, DEFAULT_SAMPLING_RATE, audio)
    return buffer.getvalue()


def split_audio(audio, chunk_seconds=5):
    """Splits audio sampled at DEFAULT_SAMPLING_RATE into consecutive chunks.

    The chunk count is the audio duration, rounded to whole seconds, divided by
    `chunk_seconds` and rounded up. Every chunk except the last holds exactly
    `chunk_seconds` of samples; the last chunk holds all remaining samples.

    Args:
        audio: Sliceable sequence of samples with a `shape` attribute (used for
            the debug output).
        chunk_seconds: Chunk length in whole seconds. Defaults to 5.

    Returns:
        A list of chunks. Non-empty audio always yields at least one chunk,
        even when its duration rounds down to zero seconds; empty audio yields
        an empty list.
    """
    chunk_size = chunk_seconds * DEFAULT_SAMPLING_RATE
    length = round(float(len(audio)) / DEFAULT_SAMPLING_RATE)
    print(f"audio length: {length}")
    print(f"shape of audio: {audio.shape}")
    splits = math.ceil(length / chunk_seconds)
    # Clips shorter than half a second round to 0 seconds; without this guard
    # they would produce no chunks and be silently dropped.
    if splits == 0 and len(audio) > 0:
        splits = 1
    arr = [audio[i * chunk_size : (i + 1) * chunk_size] for i in range(splits - 1)]
    if splits > 0:
        # The final chunk absorbs whatever is left after the full-size chunks.
        arr.append(audio[(splits - 1) * chunk_size :])
    print(f"splits: {splits}")
    print(f"arr size: {len(arr)}")
    for i, chunk in enumerate(arr):
        print(f"arr[{i}]: {len(chunk)}")
    return arr


def predict(endpoint_name: str, instance: dict, language: str, timestamp: str):
    """Sends one instance to the selected endpoint and returns formatted text.

    Args:
        endpoint_name: Dropdown label produced by `list_endpoints`.
        instance: Request instance forwarded to the endpoint as-is.
        language: Sent as the "language" parameter.
        timestamp: UI timestamp choice; "Sentence Level" and "Word Level" map
            to "sentence" and "word", anything else to an empty string.

    Returns:
        `format_output` applied to the first prediction in the response.

    Raises:
        gr.Error: If no endpoint is selected, the response carries an error
            message, or the response contains no predictions.
    """
    if not endpoint_name:
        raise gr.Error("Select (or deploy) a model first.")
    timestamp_levels = {"Sentence Level": "sentence", "Word Level": "word"}
    parameters = {
        "language": language,
        "return_timestamps": timestamp_levels.get(timestamp, ""),
    }
    endpoint = get_endpoint(endpoint_name)
    response = endpoint.predict(instances=[instance], parameters=parameters)
    print(response)
    # NOTE(review): this membership/subscript check assumes a mapping-like
    # response - confirm it can actually match what `endpoint.predict` returns.
    if "details" in response and "msg" in response["details"]:
        # Interpolate the message explicitly: gr.Error does not apply
        # printf-style formatting to extra positional arguments.
        raise gr.Error(f"Please check inputs: {response['details']['msg']}")
    if len(response.predictions) != 0:
        return format_output(response.predictions[0])
    raise gr.Error(f"Invalid response: {response}")


def predict_audio(endpoint_name: str, audio: tuple, language: str, timestamp: str):
    """Transcribes recorded or uploaded audio chunk by chunk.

    The audio is resampled with `resample`, split with `split_audio`, and each
    chunk is WAV-encoded, base64-encoded and sent through `predict`. The
    per-chunk results are concatenated in order.

    Args:
        endpoint_name: Dropdown label produced by `list_endpoints`.
        audio: `(sampling_rate, samples)` pair, or None when no audio was
            provided.
        language: Forwarded to `predict`.
        timestamp: Forwarded to `predict`.

    Returns:
        The concatenated text of all chunks.

    Raises:
        gr.Error: If no audio was provided.
    """
    if audio is None:
        raise gr.Error("Record or upload an audio clip first.")
    sr, samples = audio
    samples = resample(sr, samples)
    text = ""
    for chunk in split_audio(samples):
        instance = {"audio": base64.b64encode(get_bytes(chunk)).decode("utf-8")}
        # `predict` already returns formatted text; it must not be passed
        # through `format_output` a second time.
        transcript = predict(endpoint_name, instance, language, timestamp)
        if transcript:
            text += transcript
    return text


def predict_gcs(endpoint_name: str, gcs_uri: str, language: str, timestamp: str):
    """Transcribes an audio file referenced by a GCS URI.

    The request instance carries the URI under "audio" and the module-level
    `bearer_token` under "bearer_token"; everything else is delegated to
    `predict`.
    """
    request = dict(audio=gcs_uri, bearer_token=bearer_token)
    return predict(endpoint_name, request, language, timestamp)


# Help text shown in the "How To Use" accordion of the playground.
tip_text = r"""
1. Select a previously deployed Whisper model.
2. If you don't have any deployed model, deploy a new model. The deployment takes ~20 minutes. You can check the progress at [Vertex Online Prediction](https://console.cloud.google.com/vertex-ai/online-prediction/endpoints). After the model deployment is complete, restart the playground in Colab to see the updated endpoint list.
3. In the "Inference" section, provide audio, then click "submit" to generate text from audio.
"""

# Custom stylesheet passed to gr.Blocks.
css = """
.gradio-container {
  width: 90% !important
}
"""

def _build_inference_tab(endpoint_dropdown, make_audio_input, predict_fn):
    """Adds one inference tab body: audio input, parameters, submit, output.

    Must be called inside an open `gr.Blocks` / `gr.Tab` context.

    Args:
        endpoint_dropdown: The endpoint selector component, used as the first
            input of `predict_fn`.
        make_audio_input: Zero-argument callable that creates the tab-specific
            audio input component.
        predict_fn: Handler called with (endpoint, audio input, language,
            timestamp) when "submit" is clicked; its result fills the
            prediction box.
    """
    with gr.Row(equal_height=True):
        with gr.Column(scale=2):
            audio_input = make_audio_input()
            gr.Markdown("### Parameters")
            language = gr.Textbox(
                lines=1,
                placeholder="e.g. french, english etc",
                label="[language (optional)]()",
            )
            timestamp = gr.Dropdown(
                label="Timestamps",
                choices=[
                    "No Timestamp",
                    "Sentence Level",
                    "Word Level",
                ],
                value="No Timestamp",
                interactive=True,
            )
            submit = gr.Button("submit", variant="primary")
        with gr.Column(scale=4):
            prediction = gr.Textbox(
                lines=1, placeholder="output of model", label="prediction"
            )
        submit.click(
            predict_fn,
            inputs=[endpoint_dropdown, audio_input, language, timestamp],
            outputs=[prediction],
        )


# Playground layout: usage tips, endpoint selection, model deployment, and two
# inference tabs (GCS URI input and recorded/uploaded audio).
with gr.Blocks(
    css=css, theme=gr.themes.Default(primary_hue="orange", secondary_hue="blue")
) as demo:
    gr.Markdown("# Model Garden Playground for Whisper Large V3")

    with gr.Accordion("How To Use", open=True):
        tip = gr.Markdown(tip_text)

    gr.Markdown("## Select or deploy model")
    gr.Markdown("### Select a previously deployed model")
    with gr.Row(equal_height=True):
        with gr.Column(scale=7):
            # Query the endpoints once and reuse the result for both the
            # choices and the default value.
            endpoints = list_endpoints()
            default_endpoint = None if len(endpoints) == 0 else endpoints[0]
            endpoint_name = gr.Dropdown(
                label="Select a model previously deployed on Vertex",
                choices=endpoints,
                value=default_endpoint,
                interactive=True,
            )
        with gr.Column(scale=1):
            refresh_endpoints_button = gr.Button(
                "Refresh", scale=1, variant="primary", min_width=10
            )
            refresh_endpoints_button.click(
                lambda: gr.update(choices=list_endpoints()),
                outputs=[endpoint_name],
            )
    gr.Markdown("### Deploy a new model to Vertex")
    model_id = gr.Dropdown(
        label="Select a model to deploy",
        choices=[
            "openai/whisper-large-v3",
            "openai/whisper-large-v3-turbo",
        ],
        value="openai/whisper-large-v3-turbo",
        interactive=True,
    )
    with gr.Row(equal_height=True):
        with gr.Column(scale=7):
            accelerator_type = gr.Dropdown(
                label="Select accelerator type for deployment",
                choices=[
                    "NVIDIA_TESLA_A100",
                    "NVIDIA_L4",
                ],
                value="NVIDIA_L4",
            )
        with gr.Column(scale=1):
            deploy_model_button = gr.Button(
                "Deploy", scale=1, variant="primary", min_width=10
            )

    gr.Markdown("## Inference")
    with gr.Tab("GCS"):
        _build_inference_tab(
            endpoint_name,
            lambda: gr.Textbox(
                lines=1,
                placeholder="e.g. gs://bucket/path/to/audio.wav",
                label="GCS uri",
            ),
            predict_gcs,
        )
    with gr.Tab("Audio"):
        _build_inference_tab(
            endpoint_name,
            lambda: gr.Audio(
                sources=["microphone", "upload"],
                waveform_options=gr.WaveformOptions(
                    skip_length=2,
                    show_controls=False,
                ),
            ),
            predict_audio,
        )

    # The task id is fixed to "audio2text"; only the model and accelerator come
    # from the UI.
    deploy_model_button.click(
        lambda model_id, accelerator_type: deploy_model(
            model_id, "audio2text", accelerator_type
        ),
        inputs=[model_id, accelerator_type],
        outputs=[],
    )


# Colab form toggle; its value is passed to `launch(debug=...)` below.
show_debug_logs = True  # @param {type: "boolean"}
# Turn on request queuing for the app before it is launched.
demo.queue()
# share=True makes the playground reachable through a public URL, so anyone
# with the link can use the deployed endpoints while this cell is running.
demo.launch(share=True, inline=False, debug=show_debug_logs, show_error=True)