In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - Wan 2.1

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/model_garden/model_garden_xdit_wan2_1.ipynb">
      <img alt="Workbench logo" src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" width="32px"><br> Run in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_xdit_wan2_1.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_xdit_wan2_1.ipynb">
      <img alt="GitHub logo" src="https://github.githubassets.com/assets/GitHub-Mark-ea2971cee799.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview

This notebook demonstrates deploying the pre-trained [Wan2.1](https://huggingface.co/Wan-AI/models) T2V Diffusers variants on Vertex AI for online prediction.

### Objective

- Upload the model to [Model Registry](https://cloud.google.com/vertex-ai/docs/model-registry/introduction).
- Deploy the model on [Endpoint](https://cloud.google.com/vertex-ai/docs/predictions/using-private-endpoints).
- Run online predictions for text-to-video.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin

In [None]:
# @title Request for quota

# @markdown To deploy the largest variant of the Wan2.1 models, you need one host with 4 x H100 GPUs, or one host with 1 x A100-80GB GPU. To deploy the smaller variant, you need one host with 2 x H100 GPUs, or one host with 1 x A100-80GB GPU. Check that you have sufficient quota:
# @markdown - For Spot VM quota, check [`CustomModelServingPreemptibleH100GPUsPerProjectPerRegion`](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_preemptible_nvidia_h100_gpus).
# @markdown - For regular VM quota, check [`CustomModelServingH100GPUsPerProjectPerRegion`](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_h100_gpus).
#
# @markdown If you don't have sufficient quota, request more by following the instructions at ["Request a higher quota"](https://cloud.google.com/docs/quota/view-manage#requesting_higher_quota).
#
# @markdown Note: Utilizing 2 x H100 or 4 x H100 provides substantial speedup over 1 x H100 or 1 x A100-80GB. Utilizing 2 x H100 provides a ~2x speedup in inference and 4 x H100 provides a ~3x speedup in inference.

## Run the notebook

In [None]:
# @title Setup Google Cloud project

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 2. **[Optional]** Set region. If not set, the region will be set automatically according to Colab Enterprise environment.

REGION = ""  # @param {type:"string"}

# @markdown 3. If you want to run predictions with A100 80GB or H100 GPUs, we recommend using the regions listed below. **NOTE:** Make sure you have associated quota in selected regions. Click the links to see your current quota for each GPU type: [Nvidia A100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_a100_80gb_gpus), [Nvidia H100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_h100_gpus). You can request for quota following the instructions at ["Request a higher quota"](https://cloud.google.com/docs/quota/view-manage#requesting_higher_quota).

# @markdown > | Machine Type | Accelerator Type | Recommended Regions |
# @markdown | ----------- | ----------- | ----------- |
# @markdown | a2-ultragpu-1g | 1 NVIDIA_A100_80GB | us-central1, us-east4, europe-west4, asia-southeast1, us-east4 |
# @markdown | a3-highgpu-2g | 2 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |
# @markdown | a3-highgpu-4g | 4 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |
# @markdown | a3-highgpu-8g | 8 NVIDIA_H100_80GB | us-central1, europe-west4, us-west1, asia-southeast1 |

# Upgrade Vertex AI SDK.
! pip3 install --upgrade --quiet 'google-cloud-aiplatform==1.103.0'

# Import the necessary packages
import importlib
import os

from google.cloud import aiplatform

# Upgrade Vertex AI SDK.
if os.environ.get("VERTEX_PRODUCT") != "COLAB_ENTERPRISE":
    ! pip install --upgrade tensorflow
! git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git

common_util = importlib.import_module(
    "vertex-ai-samples.community-content.vertex_model_garden.model_oss.notebook_util.common_util"
)

LABEL = "xdit_gpu"
models, endpoints = {}, {}

# Get the default cloud project id.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# Get the default region for launching jobs.
if not REGION:
    REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Initialize Vertex AI API.
print("Initializing Vertex AI API.")
aiplatform.init(project=PROJECT_ID, location=REGION)

! gcloud config set project $PROJECT_ID

import vertexai

vertexai.init(
    project=PROJECT_ID,
    location=REGION,
)

## Deploy Wan 2.1 with xDiT

In [None]:
# @title Select the model variants

# @markdown Set the model to deploy.

base_model_name = "Wan2.1-T2V-1.3B-Diffusers"  # @param ["Wan2.1-T2V-1.3B-Diffusers", "Wan2.1-T2V-14B-Diffusers"] {isTemplate:true}
model_id = "Wan-AI/" + base_model_name
task = "text-to-video"
hf_model_id = model_id


# @markdown Choose whether to use a [Spot VM](https://cloud.google.com/compute/docs/instances/spot) for the deployment.
is_spot = False  # @param {type:"boolean"}

# @markdown Set use_dedicated_endpoint to False if you don't want to use [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint). Note that [dedicated endpoint does not support VPC Service Controls](https://cloud.google.com/vertex-ai/docs/predictions/choose-endpoint-type), uncheck the box if you are using VPC-SC.
use_dedicated_endpoint = True  # @param {type:"boolean"}

# @markdown Find Vertex AI prediction supported accelerators and regions at https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.
accelerator_type = "NVIDIA_H100_80GB"  # @param ["NVIDIA_H100_80GB", "NVIDIA_A100_80GB"] {isTemplate:true}

PUBLISHER_MODEL_NAME = f"publishers/wan-ai/models/wan@{base_model_name.lower()}"

# Resolve the quota metric to check plus the machine shape for the chosen
# accelerator / model pair. Spot deployments are checked against the
# preemptible quota metric instead of the regular one.
if accelerator_type == "NVIDIA_H100_80GB":
    if is_spot:
        resource_id = "custom_model_serving_preemptible_nvidia_h100_gpus"
    else:
        resource_id = "custom_model_serving_nvidia_h100_gpus"
    if base_model_name in ["Wan2.1-T2V-1.3B-Diffusers"]:
        machine_type = "a3-highgpu-2g"
        accelerator_count = 2
    elif base_model_name in ["Wan2.1-T2V-14B-Diffusers"]:
        machine_type = "a3-highgpu-4g"
        accelerator_count = 4
    else:
        raise ValueError(f"Recommended GPU setting not found for: {base_model_name}.")
elif accelerator_type == "NVIDIA_A100_80GB":
    # The A100 80GB quota metric carries an "a100_80gb" infix (see the quota
    # link in the setup cell); the plain "a100" name is a different metric.
    # NOTE(review): the preemptible 80GB metric name follows the same naming
    # pattern as the H100 pair above - confirm it in the quota console.
    if is_spot:
        resource_id = "custom_model_serving_preemptible_nvidia_a100_80gb_gpus"
    else:
        resource_id = "custom_model_serving_nvidia_a100_80gb_gpus"
    if base_model_name in ["Wan2.1-T2V-1.3B-Diffusers", "Wan2.1-T2V-14B-Diffusers"]:
        machine_type = "a2-ultragpu-1g"
        accelerator_count = 1
    else:
        raise ValueError(f"Recommended GPU setting not found for: {base_model_name}.")
else:
    # Here the accelerator, not the model, is the unsupported value, so report it.
    raise ValueError(
        f"Recommended GPU setting not found for accelerator type: {accelerator_type}."
    )

# Stop early if the project lacks quota for the selected accelerators.
common_util.check_quota(
    project_id=PROJECT_ID,
    region=REGION,
    resource_id=resource_id,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
)

# @markdown Click "Show Code" to see more details.

In [None]:
# @title [Option 1] Deploy with Model Garden SDK
# @markdown Deploy with Gen AI model-centric SDK. This section uploads the prebuilt model to Model Registry and deploys it to a Vertex AI Endpoint. It takes 15 minutes to 1 hour to finish depending on the size of the model. See [use open models with Vertex AI](https://cloud.google.com/vertex-ai/generative-ai/docs/open-models/use-open-models) for documentation on other use cases.
from vertexai import model_garden

deploy_request_timeout = 1800  # 30 minutes

# Resolve the publisher model selected in the previous cell.
model = model_garden.OpenModel(PUBLISHER_MODEL_NAME)

# Deploy with the machine shape, spot and endpoint settings chosen above.
endpoint = model.deploy(
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    use_dedicated_endpoint=use_dedicated_endpoint,
    spot=is_spot,
    deploy_request_timeout=deploy_request_timeout,
    accept_eula=False,
)

# Register the endpoint so the predict and cleanup cells can find it.
endpoints[LABEL] = endpoint

# @markdown Click "Show Code" to see more details.

In [None]:
# @title [Option 2] Deploy with customized configs

# @markdown This section uploads Wan2.1 models to Model Registry and deploys them to a Vertex Prediction Endpoint. It takes ~1 hour to finish.

# @markdown It's recommended to use the region selected by the deployment button on the model card. If the deployment button is not available, it's recommended to stay with the default region of the notebook.

# The pre-built serving docker image. It contains serving scripts and models.
SERVE_DOCKER_URI = "us-docker.pkg.dev/deeplearning-platform-release/vertex-model-garden/xdit-serve.cu125.0-2.ubuntu2204.py310"


def deploy_model(
    model_id: str,
    task: str,
    machine_type: str,
    accelerator_type: str,
    accelerator_count: int,
    is_spot: bool = False,
):
    """Create a Vertex AI Endpoint and deploy the specified model to the endpoint.

    Args:
        model_id: Model identifier; passed to the serving container as
            `MODEL_ID` and also used as the display name of the uploaded model
            and (with an "-endpoint" suffix) of the endpoint.
        task: Passed to the serving container as `TASK`.
        machine_type: Machine type of the deployment.
        accelerator_type: Accelerator type of the deployment.
        accelerator_count: Number of accelerators; also passed to the serving
            container as `N_GPUS`.
        is_spot: Whether to deploy on Spot VMs. Defaults to False, which keeps
            the previous behavior of this function.

    Returns:
        A `(model, endpoint)` tuple of the uploaded model and the endpoint it
        was deployed to.
    """
    # NOTE(review): the model-selection cell calls check_quota with a
    # `resource_id` keyword while this call passes `is_for_training`; confirm
    # both keyword sets are accepted by common_util.check_quota.
    common_util.check_quota(
        project_id=PROJECT_ID,
        region=REGION,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        is_for_training=False,
    )

    model_name = model_id

    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")
    serving_env = {
        "MODEL_ID": model_id,
        "TASK": task,
        "DEPLOY_SOURCE": "notebook",
    }

    # xDiT serving parameters. Environment variable values are strings, so the
    # GPU count is stringified like the Ulysses degree below.
    serving_env["N_GPUS"] = str(accelerator_count)
    if accelerator_count == 2:
        serving_env["ULYSSES_DEGREE"] = "2"
    elif accelerator_count == 4:
        serving_env["ULYSSES_DEGREE"] = "4"

    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=SERVE_DOCKER_URI,
        serving_container_ports=[7080],
        serving_container_predict_route="/predict",
        serving_container_health_route="/health",
        serving_container_environment_variables=serving_env,
        model_garden_source_model_name="publishers/wan-ai/models/wan",
    )

    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        # Honor the Spot VM choice so the deployment matches the quota metric
        # that was selected for it.
        spot=is_spot,
        deploy_request_timeout=1800,
        system_labels={"NOTEBOOK_NAME": "model_garden_xdit_wan2_1.ipynb"},
    )
    return model, endpoint


models["xdit_gpu_custom"], endpoints["xdit_gpu_custom"] = deploy_model(
    model_id=model_id,
    task=task,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    is_spot=is_spot,
)

print("endpoint_name:", endpoints["xdit_gpu_custom"].name)

# @markdown Click "Show Code" to see more details.

In [None]:
# @title Predict

# @markdown Once deployment succeeds, you can send requests to the endpoint with text prompts.
# @markdown Example:

# @markdown ```
# @markdown text: A cat waving a sign that says hello world
# @markdown ```

# @markdown You may adjust the parameters below to achieve best video quality.
from IPython.display import HTML

text = "A cat waving a sign that says hello world"  # @param {type: "string"}
seed = 42  # @param {type:"number"}

instances = [{"text": text, "seed": seed}]
parameters = {
    "seed": seed,
}

# Use whichever deployment exists. The Model Garden SDK deployment is stored
# under LABEL and was created with the `use_dedicated_endpoint` setting, so the
# same flag is passed when predicting. The customized deployment is stored
# under "xdit_gpu_custom" and its endpoint is created without enabling a
# dedicated endpoint, so it is called the regular way.
if LABEL in endpoints:
    predict_endpoint = endpoints[LABEL]
    predict_via_dedicated_endpoint = use_dedicated_endpoint
elif "xdit_gpu_custom" in endpoints:
    predict_endpoint = endpoints["xdit_gpu_custom"]
    predict_via_dedicated_endpoint = False
else:
    raise ValueError(
        "No deployed endpoint found. Run one of the deployment cells first."
    )

response = predict_endpoint.predict(
    instances=instances,
    parameters=parameters,
    use_dedicated_endpoint=predict_via_dedicated_endpoint,
)

# The value is embedded below as a base64 data URI.
video_bytes = response.predictions[0]["output"]

video_html = f"""
<video controls>
<source src="data:video/mp4;base64,{video_bytes}" type="video/mp4">
Your browser does not support the video tag.
</video>
"""  # Assumes MP4. Change type if needed (e.g., video/webm)

display(HTML(video_html))

In [None]:
# @title Delete the models and endpoints

# @markdown  Delete the experiment models and endpoints to recycle the resources
# @markdown  and avoid unnecessary continuous charges that may incur.

# Endpoints are removed first; force=True undeploys their models as part of
# the deletion.
for deployed_endpoint in endpoints.values():
    deployed_endpoint.delete(force=True)

# Then remove the uploaded models tracked in `models`.
for uploaded_model in models.values():
    uploaded_model.delete()