In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - CSM-1B Deployment

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_pytorch_csm_deployment.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_csm_deployment.ipynb">
      <img alt="GitHub logo" src="https://github.githubassets.com/assets/GitHub-Mark-ea2971cee799.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview

This notebook demonstrates serving the [sesame/csm-1b](https://huggingface.co/sesame/csm-1b) model on Vertex AI. CSM (Conversational Speech Model) is a speech generation model from Sesame that generates RVQ audio codes from text and audio inputs. The model architecture employs a Llama backbone and a smaller audio decoder that produces Mimi audio codes.


### Objective

- Deploy [sesame/csm-1b](https://huggingface.co/sesame/csm-1b) on Vertex AI.
- Generate conversational audio with the deployed endpoint.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin

In [None]:
# @title Setup Google Cloud project

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 2. **[Optional]** Set region. If not set, the region will be set automatically according to Colab Enterprise environment.

REGION = ""  # @param {type:"string"}

# @markdown 3. If you want to run predictions with A100 80GB or H100 GPUs, we recommend using the regions listed below. **NOTE:** Make sure you have associated quota in selected regions. Click the links to see your current quota for each GPU type: [Nvidia A100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_a100_80gb_gpus), [Nvidia H100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_h100_gpus). You can request for quota following the instructions at ["Request a higher quota"](https://cloud.google.com/docs/quota/view-manage#requesting_higher_quota).

# @markdown > | Machine Type | Accelerator Type | Recommended Regions |
# @markdown | ----------- | ----------- | ----------- |
# @markdown | a2-ultragpu-1g | 1 NVIDIA_A100_80GB | us-central1, us-east4, europe-west4, asia-southeast1, us-east4 |
# @markdown | a3-highgpu-2g | 2 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |
# @markdown | a3-highgpu-4g | 4 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |
# @markdown | a3-highgpu-8g | 8 NVIDIA_H100_80GB | us-central1, europe-west4, us-west1, asia-southeast1 |

# @markdown 4. To use the model, you need to accept the agreement of [Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) and [CSM-1B](https://huggingface.co/sesame/CSM-1B) on Hugging Face.

# @markdown 5. Set Hugging Face access token in `HF_TOKEN` field. If you don't already have a "read" access token, follow the [Hugging Face documentation](https://huggingface.co/docs/hub/en/security-tokens) to create an access token with "read" permission. You can find your existing access tokens in the Hugging Face [Access Token](https://huggingface.co/settings/tokens) page.

# Import the necessary packages

! git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git

import base64
import importlib
import os
from typing import Tuple

from google.cloud import aiplatform
from IPython.core.display import display
from IPython.display import Audio

common_util = importlib.import_module(
    "vertex-ai-samples.community-content.vertex_model_garden.model_oss.notebook_util.common_util"
)

models, endpoints = {}, {}

# Get the default cloud project id.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# Get the default region for launching jobs.
if not REGION:
    REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Initialize Vertex AI API.
print("Initializing Vertex AI API.")
aiplatform.init(project=PROJECT_ID, location=REGION)

! gcloud config set project $PROJECT_ID

HF_TOKEN = ""  # @param {type:"string", isTemplate: true}
assert HF_TOKEN, "Set Hugging Face access token in `HF_TOKEN`."

## Deploy and predict with the CSM-1B model

In [None]:
# @title Deploy

# @markdown This section uploads the CSM-1B model to Model Registry and deploys it to a Vertex Prediction Endpoint with 1 `NVIDIA_L4` GPU and `g2-standard-8` machine type.
# @markdown It takes ~10 minutes to finish.

# @markdown It's recommended to use the region selected by the deployment button on the model card. If the deployment button is not available, it's recommended to stay with the default region of the notebook.

# Hugging Face model ID in "<publisher>/<model>" form. It is split into its two
# halves here; the split raises if the ID does not contain exactly one "/".
model_id = "sesame/csm-1b"
publisher, publisher_model_id = model_id.split("/")

# Serving container image that the uploaded model will run in.
PYTORCH_DOCKER_URI = (
    "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-csm-serve"
)

# @markdown Use a [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint) for the deployment.
use_dedicated_endpoint = True  # @param {type:"boolean"}

# @markdown Find Vertex AI prediction supported accelerators and regions at https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.
# Hardware used for serving; these match the configuration described in the
# form text at the top of this cell.
accelerator_type = "NVIDIA_L4"
accelerator_count = 1
machine_type = "g2-standard-8"

# Check accelerator quota for serving (is_for_training=False) before any
# resource is created.
# NOTE(review): presumably this raises when quota is insufficient — confirm
# against common_util.check_quota.
common_util.check_quota(
    project_id=PROJECT_ID,
    region=REGION,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    is_for_training=False,
)


def deploy_model_pytorch(
    model_name: str,
    model_id: str,
    publisher: str,
    publisher_model_id: str,
    task: str,
    handler: str = "",
    service_account: str | None = None,
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    use_dedicated_endpoint: bool = False,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Deploys models with Model Garden Pytorch Inference on GPU in Vertex AI.

    Creates an endpoint, uploads a model backed by the `PYTORCH_DOCKER_URI`
    serving container, and deploys that model to the endpoint. If the upload
    or the deployment fails (or is interrupted), the resources created so far
    are deleted on a best-effort basis before the error is re-raised, so a
    failed run does not leave a billable endpoint behind that the notebook has
    no handle to.

    Args:
        model_name: Display name of the uploaded model; the endpoint is named
            "<model_name>-endpoint".
        model_id: Passed to the serving container as the `MODEL_ID`
            environment variable.
        publisher: Publisher part of the Model Garden source model name.
        publisher_model_id: Model part of the Model Garden source model name.
        task: Passed to the serving container as the `TASK` environment
            variable.
        handler: If non-empty, passed to the serving container as the
            `HANDLER` environment variable.
        service_account: Service account forwarded to `Model.deploy`.
        machine_type: Machine type forwarded to `Model.deploy`.
        accelerator_type: Accelerator type forwarded to `Model.deploy`.
        accelerator_count: Accelerator count forwarded to `Model.deploy`.
        use_dedicated_endpoint: Whether the endpoint is created with
            `dedicated_endpoint_enabled`.

    Returns:
        A `(model, endpoint)` tuple: the uploaded model and the endpoint it
        was deployed to.

    Raises:
        BaseException: Whatever `Model.upload` or `Model.deploy` raised, after
            the partially created resources have been cleaned up.
    """
    # Container configuration is built first: it has no side effects, so doing
    # it before any cloud resource exists keeps the failure window small.
    env_vars = {
        "MODEL_ID": model_id,
        "TASK": task,
    }
    if handler:
        env_vars["HANDLER"] = handler

    # HF_TOKEN is not a compulsory notebook variable and may not be defined;
    # only forward it when it exists and is non-empty.
    hf_token = globals().get("HF_TOKEN")
    if hf_token:
        env_vars["HF_TOKEN"] = hf_token

    endpoint = aiplatform.Endpoint.create(
        display_name=f"{model_name}-endpoint",
        dedicated_endpoint_enabled=use_dedicated_endpoint,
    )

    model = None
    try:
        model = aiplatform.Model.upload(
            display_name=model_name,
            serving_container_image_uri=PYTORCH_DOCKER_URI,
            serving_container_ports=[8080],
            serving_container_predict_route="/predict",
            serving_container_health_route="/health",
            serving_container_environment_variables=env_vars,
            model_garden_source_model_name=(
                f"publishers/{publisher}/models/{publisher_model_id}"
            ),
        )

        model.deploy(
            endpoint=endpoint,
            machine_type=machine_type,
            accelerator_type=accelerator_type,
            accelerator_count=accelerator_count,
            deploy_request_timeout=3600,
            # Labels recording that this deployment came from this notebook.
            system_labels={
                "NOTEBOOK_NAME": "model_garden_pytorch_csm_deployment.ipynb",
                "DEPLOY_SOURCE": "notebook",
            },
            service_account=service_account,
        )
    except BaseException:
        # BaseException (not just Exception) so that interrupting the kernel
        # during the long-running deploy also triggers the clean-up.
        print("Deployment failed; deleting partially created resources.")
        # The endpoint goes first: force-deleting it undeploys the model,
        # which must happen before the model itself can be deleted.
        cleanup_steps = [("endpoint", lambda: endpoint.delete(force=True))]
        if model is not None:
            cleanup_steps.append(("model", model.delete))
        for label, delete in cleanup_steps:
            try:
                delete()
            except Exception as cleanup_error:
                # Best effort only: never let a clean-up failure hide the
                # original error.
                print(f"Could not delete {label}: {cleanup_error}")
        raise

    print("endpoint_name:", endpoint.name)

    return model, endpoint


# Deploy CSM-1B as a text-to-speech model on the hardware configured above and
# record the resulting model and endpoint under the "pytorch_gpu" key.
models["pytorch_gpu"], endpoints["pytorch_gpu"] = deploy_model_pytorch(
    task="text-to-speech",
    model_id=model_id,
    publisher=publisher,
    publisher_model_id=publisher_model_id,
    model_name=common_util.get_job_name_with_datetime(prefix="csm-1b-serve"),
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    use_dedicated_endpoint=use_dedicated_endpoint,
    service_account=None,
)
# @markdown Click "Show Code" to see more details.

In [None]:
# @title Predict

# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts. Note that the first few prompts will take longer to execute.

# @markdown Here, we use the following conversation example:
# @markdown - Speaker 0: I just won a million dollar lottery.
# @markdown - Speaker 1: You're kidding me!

# One instance per turn of the example conversation above.
instances = [
    dict(speaker=0, text="I just won a million dollar lottery."),
    dict(speaker=1, text="You're kidding me!"),
]

# Send both turns to the endpoint created by the deploy cell.
response = endpoints["pytorch_gpu"].predict(
    use_dedicated_endpoint=use_dedicated_endpoint,
    instances=instances,
)

# Every prediction holds its audio base64-encoded under the "audio" key;
# decode it and render an inline player for each one.
for prediction in response.predictions:
    audio_bytes = base64.b64decode(prediction["audio"])
    display(Audio(audio_bytes))

# @markdown Click "Show Code" to see more details.

## Clean up resources

In [None]:
# @title Delete the models and endpoints

# @markdown  Delete the experiment models and endpoints to recycle the resources
# @markdown  and avoid unnecessary continuous charges that may be incurred.

# Undeploy model and delete endpoint.
# Each entry is dropped from the registry as soon as its deletion succeeds, so
# this cell can be re-run (for example after a partial failure) without trying
# to delete resources that are already gone. Iterating over a list copy makes
# it safe to remove entries inside the loop.
for key, endpoint in list(endpoints.items()):
    endpoint.delete(force=True)
    del endpoints[key]

# Delete models.
for key, model in list(models.items()):
    model.delete()
    del models[key]