In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Import, Deploy, and Serve custom open models on Vertex AI using Vertex AI Model Garden SDK.

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/open-models/get_started_with_model_garden_sdk_custom_import.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fopen-models%2Fget_started_with_model_garden_sdk_custom_import.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/open-models/get_started_with_model_garden_sdk_custom_import.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/get_started_with_model_garden_sdk_custom_import.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/get_started_with_model_garden_sdk_custom_import.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/get_started_with_model_garden_sdk_custom_import.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/get_started_with_model_garden_sdk_custom_import.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/get_started_with_model_garden_sdk_custom_import.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/get_started_with_model_garden_sdk_custom_import.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

| Authors |
| --- |
| [Ivan Nardini](https://github.com/inardini) |
| [Eliza Huang](https://github.com/lizzij) |

## Overview

The Vertex AI Model Garden allows you to not only use Google's state-of-the-art models but also to bring your own. 

This tutorial shows how to take an open-source model from Hugging Face, import it into Vertex AI, and serve it on a scalable, production-ready endpoint using the Import Custom Model Weights feature of the Vertex AI Model Garden SDK.

This feature gives you the flexibility to use a vast ecosystem of open models while leveraging Google Cloud's enterprise-grade infrastructure on Vertex AI for serving models.

### What You'll Learn

  * How to efficiently transfer model artifacts from Hugging Face Hub directly to Google Cloud Storage (GCS).
  * How to register your model with the Vertex AI Model Registry using a GCS URI.
  * How to deploy the custom model to a dedicated Vertex AI Endpoint with a single command.
  * How to run inference on your newly deployed model.


## Get started

### Install required packages

Install the necessary Python packages for this tutorial.

*Note: The `hf_transfer` extra enables faster downloads using a Rust-based transfer library. This can speed up large model downloads by 2-5x.*


In [None]:
# Install or upgrade the packages this notebook uses: the Vertex AI SDK, the
# OpenAI client, google-auth, requests, and huggingface_hub with the
# hf_transfer extra.
# NOTE(review): --force-reinstall reinstalls the listed packages even when they
# are already current; a kernel restart may be needed afterwards — confirm.
%pip install --upgrade --force-reinstall --quiet 'google-cloud-aiplatform>=1.105.0' 'openai' 'google-auth' 'requests' 'huggingface_hub[hf_transfer]'

### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
# Authenticate the runtime when (and only when) the notebook runs on Google
# Colab. The guard makes this cell a no-op in every other environment, so it
# is safe to leave enabled.
import sys

if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Use the environment variable if the user doesn't provide Project ID.
import os

import vertexai

PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}

if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    PROJECT_ID = str(os.environ.get("GOOGLE_CLOUD_PROJECT"))

LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

BUCKET_NAME = "[your-bucket-name]"  # @param {type: "string", placeholder: "[your-bucket-name]", isTemplate: true}
BUCKET_URI = f"gs://{BUCKET_NAME}"

! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

vertexai.init(project=PROJECT_ID, location=LOCATION)

### Authenticate your Hugging Face account

To download models from Hugging Face, especially gated ones, you need to authenticate.

The `interpreter_login` function provides an easy way to do this within a notebook environment.


In [None]:
# Log in to Hugging Face so that later downloads can access gated repositories.
from huggingface_hub import interpreter_login

# NOTE(review): presumably prompts for an access token interactively — confirm
# this is acceptable if the notebook is ever run unattended.
interpreter_login()

### Import libraries

In [None]:
from pathlib import Path
import shutil

import google.auth
from google.cloud import storage
from google.cloud.storage import transfer_manager
from huggingface_hub import snapshot_download
import openai
from tqdm.auto import tqdm
from vertexai.preview import model_garden

### Helpers

Before Vertex AI can deploy a custom model, its artifacts must be accessible in a Google Cloud Storage bucket. This helper function, `transfer_model`, automates the entire process.

It handles:

1.  **Fast Downloads**: Enables `hf_transfer` for accelerated downloads from Hugging Face.
2.  **Local Staging**: Downloads the model files to a temporary local directory.
3.  **Efficient Uploads**: Uses the `transfer_manager` to upload files to GCS in parallel chunks, which is much faster for large model weights.
4.  **Automatic Cleanup**: Removes the temporary local files after the upload is complete.

In [None]:
def transfer_model(
    model_id: str,
    bucket_name: str,
    location: str,
    artifact_path: str = "",
    exclude_patterns: list | None = None,
    chunk_size: int = 32 * 1024 * 1024,  # 32 MiB
    workers: int = 8,
) -> str:
    """Download a model snapshot from Hugging Face and upload it to GCS.

    The snapshot is staged under ``tmp/<model_id with '/' replaced by '--'>``,
    uploaded file by file, and the staging directory is removed afterwards,
    whether or not the transfer succeeded.

    Args:
        model_id: Hugging Face repository ID, e.g. ``"org/model-name"``.
        bucket_name: Destination bucket name, without the ``gs://`` prefix.
            The bucket is created in ``location`` if it does not exist.
        location: Location used only when the bucket has to be created.
        artifact_path: Object prefix inside the bucket. Defaults to the last
            path component of ``model_id``. Leading and trailing slashes are
            stripped.
        exclude_patterns: Glob patterns of repository files to skip. ``None``
            selects the defaults (``*.bin``, ``*.pth``, ``*.gguf``,
            ``.gitattributes``); an empty list excludes nothing.
        chunk_size: Chunk size in bytes for concurrent uploads. Files larger
            than this are uploaded with ``upload_chunks_concurrently``;
            other files use a regular ``upload_from_filename`` call.
        workers: Maximum number of workers for each chunked upload.

    Returns:
        The ``gs://<bucket_name>/<artifact_path>`` URI of the uploaded model.

    Exceptions raised by the download or upload calls are not caught; they
    propagate to the caller after the staging directory has been removed.
    """
    # `None` means "use the defaults"; an explicit empty list must stay empty
    # so callers can opt out of every exclusion.
    if exclude_patterns is None:
        exclude_patterns = [
            "*.bin",
            "*.pth",
            "*.gguf",
            ".gitattributes",
        ]

    # Strip stray slashes so object names never contain empty path segments.
    artifact_path = (artifact_path or model_id.split("/")[-1]).strip("/")
    local_dir = Path(f"tmp/{model_id.replace('/', '--')}")

    # Request accelerated transfers.
    # NOTE(review): huggingface_hub appears to read this variable when it is
    # first imported, so setting it here may have no effect — confirm and, if
    # so, set it before the first `huggingface_hub` import.
    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

    try:
        # Download from Hugging Face. `resume_download` is deprecated
        # (interrupted downloads are resumed automatically), so it is no
        # longer passed.
        print(f"📥 Downloading {model_id}...")
        snapshot_download(
            repo_id=model_id,
            local_dir=str(local_dir),
            ignore_patterns=exclude_patterns,
        )

        # Set up GCS, creating the bucket on first use.
        client = storage.Client()
        bucket = client.bucket(bucket_name)

        if not bucket.exists():
            print(f"📦 Creating bucket {bucket_name}...")
            bucket = client.create_bucket(bucket_name, location=location)

        # Collect the files to upload. `snapshot_download(local_dir=...)`
        # keeps its own bookkeeping under `.cache/`; that metadata is not part
        # of the model and is skipped.
        files = [
            path
            for path in local_dir.rglob("*")
            if path.is_file() and ".cache" not in path.relative_to(local_dir).parts
        ]
        total_size = sum(path.stat().st_size for path in files) / (1024**3)

        print(f"☁️  Uploading {len(files)} files ({total_size:.2f} GB) to GCS...")

        for file_path in tqdm(files, desc="Files", unit="file"):
            # Object names always use "/" regardless of the local OS separator.
            relative_name = file_path.relative_to(local_dir).as_posix()
            blob = bucket.blob(f"{artifact_path}/{relative_name}")

            # Chunked concurrent upload only pays off for files larger than
            # one chunk; everything else goes through a regular upload.
            if file_path.stat().st_size > chunk_size:
                transfer_manager.upload_chunks_concurrently(
                    str(file_path), blob, chunk_size=chunk_size, max_workers=workers
                )
            else:
                blob.upload_from_filename(str(file_path))

        gcs_path = f"gs://{bucket_name}/{artifact_path}"
        print(f"✅ Done! Model available at {gcs_path}")
        return gcs_path

    finally:
        # Always remove the local staging copy, even after a failure.
        if local_dir.exists():
            shutil.rmtree(local_dir)

## Transfer Your Model from Hugging Face to GCS

Now, let's use the function to transfer a fine-tuned Gemma model from Hugging Face to our GCS bucket.


In [None]:
# Hugging Face repository to import into Vertex AI.
hf_model_id = "xsanskarx/thinkygemma-4b"

# Download the snapshot, copy it to the bucket, and keep the resulting gs://
# URI for the import step. Expect this to run for several minutes; the
# duration depends on the model size and your connection.
imported_custom_model_uri = transfer_model(
    model_id=hf_model_id,
    bucket_name=BUCKET_NAME,
    location=LOCATION,
)

## Import and Deploy the Model with Model Garden

With the model artifacts in GCS, we can now use the `model_garden` SDK to register and deploy it.

First, we create a `CustomModel` object, pointing it to the GCS URI where our model is stored.

In [None]:
# Point a Model Garden CustomModel at the artifacts uploaded to GCS.
model = model_garden.CustomModel(gcs_uri=imported_custom_model_uri)

Next, we define the serving infrastructure for our model's endpoint. You need to select a machine type and accelerator (GPU) that are appropriate for your model's size and performance requirements.

The real magic happens with the **`model.deploy()`** command. This single line of code abstracts away a complex series of operations:

  * It creates a new **Vertex AI Model** resource from your GCS artifacts.
  * It provisions the specified compute resources.
  * It creates a **Vertex AI Endpoint**.
  * It deploys the model to that endpoint, making it ready to serve inference requests.

In [None]:
# Serving hardware for the endpoint: a g2-standard-24 machine with two
# NVIDIA L4 GPUs, a configuration that suits many models of roughly 7B
# parameters.
MACHINE_TYPE = "g2-standard-24"
ACCELERATOR_TYPE = "NVIDIA_L4"
ACCELERATOR_COUNT = 2

# Create a Vertex AI Endpoint and deploy the model to it. Provisioning the
# hardware makes this step take about 15-20 minutes.
endpoint = model.deploy(
    accelerator_count=ACCELERATOR_COUNT,
    accelerator_type=ACCELERATOR_TYPE,
    machine_type=MACHINE_TYPE,
)

## Run Inference on Your Deployed Model

Once the deployment is complete, your model is live and ready to serve predictions. You can interact with the endpoint using the `predict()` method.


In [None]:
# Send a single prompt to the dedicated endpoint and show the raw predictions.
response = endpoint.predict(
    use_dedicated_endpoint=True,
    instances=[{"prompt": "how many r does strawberry have?"}],
)
print(response.predictions)

You can also get predictions through the Chat Completions API using the OpenAI SDK.

In [None]:
# Query the deployed model through the OpenAI SDK's Chat Completions client.

# Import the transport submodule explicitly: `import google.auth` alone does
# not guarantee that `google.auth.transport.requests` has been loaded.
import google.auth.transport.requests

# Obtain Application Default Credentials and refresh them to get an access
# token. The cloud-platform scope is requested explicitly because some
# credential types do not carry a default scope.
creds, project = google.auth.default(
    scopes=["https://www.googleapis.com/auth/cloud-platform"]
)
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

# Base URL built from the endpoint's dedicated DNS name and resource name.
endpoint_url = f"https://{endpoint.gca_resource.dedicated_endpoint_dns}/v1beta1/{endpoint.resource_name}"

# The access token is passed where the OpenAI SDK expects an API key.
client = openai.OpenAI(base_url=endpoint_url, api_key=creds.token)

# NOTE(review): the model name is left empty, presumably because the endpoint
# serves a single deployed model — confirm.
prediction = client.chat.completions.create(
    model="",
    messages=[{"role": "user", "content": "Tell me a joke"}],
    temperature=0.7,
)

print(prediction.choices[0].message.content)

## Cleaning up

To avoid incurring ongoing charges to your Google Cloud account, clean up the resources you created. The following cell undeploys the model and deletes the endpoint, then deletes the Cloud Storage bucket that holds the model artifacts. Set `delete_endpoint` or `delete_bucket` to `False` to keep either resource.


In [None]:
# Choose which resources to remove; set a flag to False to keep that resource.
delete_endpoint = True
delete_bucket = True

if delete_endpoint:
    # force=True: per the notebook text, the model is undeployed from the
    # endpoint before the endpoint itself is deleted.
    endpoint.delete(force=True)

if delete_bucket:
    # Recursively removes everything under BUCKET_URI, including the uploaded
    # model artifacts.
    !gsutil rm -r {BUCKET_URI}