In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Intro to Vertex AI Multimodal Datasets

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-dataset/intro_vertex_ai_multimodal_dataset.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fmultimodal-dataset%2Fintro_vertex_ai_multimodal_dataset.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/multimodal-dataset/intro_vertex_ai_multimodal_dataset.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/bigquery/import?url=https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-dataset/intro_vertex_ai_multimodal_dataset.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/bigquery/v1/32px.svg" alt="BigQuery Studio logo"><br> Open in BigQuery Studio
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-dataset/intro_vertex_ai_multimodal_dataset.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-dataset/intro_vertex_ai_multimodal_dataset.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-dataset/intro_vertex_ai_multimodal_dataset.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-dataset/intro_vertex_ai_multimodal_dataset.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-dataset/intro_vertex_ai_multimodal_dataset.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/multimodal-dataset/intro_vertex_ai_multimodal_dataset.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

| Author |
| --- |
| [Frances Thoma](https://github.com/diskontinuum) |

## Overview

This notebook demonstrates how to use Vertex AI Multimodal Datasets to assemble Gemini requests, to run a validation and resource estimation for supervised fine-tuning, and to create tuning and batch prediction jobs.

### Objectives

- Preview the new Vertex AI Multimodal Datasets SDK
- Demo upcoming integrations

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage
* BigQuery

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and [BigQuery pricing](https://cloud.google.com/bigquery/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

### Prerequisites
1. Make sure that [billing is enabled](https://cloud.google.com/billing/docs/how-to/modify-project) for your project.

2. You must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com). Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

### Questions or Feedback

You can reach out directly to the authors via `vertex-multimodal-dataset-external-feedback@google.com` for feedback or questions.

## Get Started

### Install Vertex AI SDK and other required packages

In [None]:
%pip install --quiet --upgrade google-cloud-aiplatform bigframes

### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
import sys

# The presence of `google.colab` in `sys.modules` is used as the signal that the
# kernel runs in Colab; everywhere else this cell does nothing.
if "google.colab" in sys.modules:
    # Imported inside the branch so the cell does not fail where the
    # `google.colab` package is not installed.
    from google.colab import auth

    # Authenticate the current user for this notebook session.
    auth.authenticate_user()

- If you are running this notebook in a local development environment:
  - Install the [Google Cloud SDK](https://cloud.google.com/sdk).
  - Obtain authentication credentials. Create local credentials by running the following command and following the oauth2 flow (read more about the command [here](https://cloud.google.com/sdk/gcloud/reference/beta/auth/application-default/login)):

    ```bash
    gcloud auth application-default login
    ```

### Import libraries

In [None]:
import io
import os

import bigframes.pandas as bpd
import pandas
import vertexai
from PIL import Image
from google.cloud import storage
from google.cloud.aiplatform.preview import datasets
from vertexai.generative_models import Content, Part

### Set Google Cloud project information

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Use the environment variable if the user doesn't provide Project ID.
PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Do not wrap the lookup in str(): str(None) would silently turn a missing
    # variable into the literal project ID "None".
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")

# Fail here with a clear message instead of much later inside an API call.
if not PROJECT_ID:
    raise ValueError(
        "No Google Cloud project configured. Set PROJECT_ID in this cell or "
        "define the GOOGLE_CLOUD_PROJECT environment variable."
    )

# Region shared by Vertex AI and the BigFrames session; falls back to us-central1.
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

vertexai.init(project=PROJECT_ID, location=LOCATION)

# BigFrames settings
# Any existing session is closed before the options are assigned
# (presumably so the cell can be re-run with different values).
bpd.close_session()
bpd.options.bigquery.project = PROJECT_ID
bpd.options.bigquery.location = LOCATION

### Data preparation

The image files and labels used in this tutorial are from the flower dataset used in this [TensorFlow blog post](https://cloud.google.com/blog/products/gcp/how-to-classify-images-with-tensorflow-using-google-cloud-machine-learning-and-cloud-dataflow).

The dataset contains 7338 images, each of which is annotated with one label across 5 different flower classes.

The input images are stored in a public Cloud Storage bucket. This publicly-accessible bucket also contains a CSV file used to create the Vertex AI multimodal dataset. This file has two columns: the first column lists an image's URI in Cloud Storage, and the second column contains the image's label.

In this notebook, we'll use subsets of the flower dataset and prepare them as DataFrames: a training set and a validation set, each with a fixed number of examples per category, and a prediction set of randomly chosen images.

**Tip:** Use the BigFrames library `bpd` instead of `pandas` for larger datasets.

In [None]:
# Get data from GCS
csv = "gs://cloud-samples-data/ai-platform/flowers/flowers.csv"
all_images = pandas.read_csv(csv, names=["image_uris", "labels"])

# Shuffle with a fixed seed so that re-running the notebook selects the same
# training, validation and prediction examples.
RANDOM_SEED = 42
all_images = all_images.sample(frac=1, random_state=RANDOM_SEED).reset_index(
    drop=True
)

# Prepare training and validation set
CATEGORIES = ["daisy", "dandelion", "roses", "sunflowers", "tulips"]
TRAINING_CASES_PER_CATEGORY = 100  # @param {type: 'integer'}
VALIDATION_CASES_PER_CATEGORY = 100  # @param {type: 'integer'}
PREDICTION_DATASET_SIZE = 100  # @param {type: 'integer'}

# Set up the prediction set: the first rows of the shuffled data, regardless of label
if len(all_images) < PREDICTION_DATASET_SIZE:
    raise ValueError(
        "Prediction dataset size is larger than the total number of images."
    )
prediction_set = all_images.iloc[:PREDICTION_DATASET_SIZE]
# The remaining rows are the pool for training and validation, so the three
# subsets never share an image.
all_images = all_images.iloc[PREDICTION_DATASET_SIZE:]

# Set up the training and validation set with evenly distributed labels.
# The per-category slices are collected first and concatenated once at the end,
# rather than growing a DataFrame inside the loop.
cases_per_category = TRAINING_CASES_PER_CATEGORY + VALIDATION_CASES_PER_CATEGORY
training_parts = []
validation_parts = []

for category in CATEGORIES:
    same_labels = all_images[all_images["labels"] == category]
    if len(same_labels) < cases_per_category:
        raise ValueError("Please reduce the number of cases per category.")
    # First slice goes to training, the directly following slice to validation.
    training_parts.append(same_labels.iloc[:TRAINING_CASES_PER_CATEGORY])
    validation_parts.append(
        same_labels.iloc[TRAINING_CASES_PER_CATEGORY:cases_per_category]
    )

training_set = pandas.concat(training_parts, ignore_index=True)
validation_set = pandas.concat(validation_parts, ignore_index=True)

In [None]:
# @title Common Functions

# Configure how pandas renders DataFrames so that nothing is cut off during inspection
pandas.options.display.max_columns = None  # no limit on the number of columns shown
pandas.options.display.expand_frame_repr = False  # do not wrap wide frames
pandas.options.display.max_colwidth = None  # do not truncate cell contents


def show_dataset_info(dataset):
    """Print the identifying attributes of a dataset, one per line.

    Args:
        dataset: Object exposing the attributes `resource_name`,
            `display_name`, `metadata_schema_uri` and `bigquery_table`.
    """
    # (label, attribute name) pairs in the order they are printed.
    labelled_attributes = (
        ("  Resource name: ", "resource_name"),
        ("  Display name: ", "display_name"),
        ("  Schema URI:   ", "metadata_schema_uri"),
        ("  BQ Table:     ", "bigquery_table"),
    )
    for label, attribute in labelled_attributes:
        print(label, getattr(dataset, attribute))


def get_gcs_image(gcs_uri):
    """Download an image from Cloud Storage and open it with PIL.

    Args:
        gcs_uri: URI of the image object, as accepted by `Blob.from_string`.

    Returns:
        The image object produced by `Image.open` from the downloaded bytes.
        The image is returned, not displayed.
    """
    client = storage.Client(project=PROJECT_ID)
    image_bytes = storage.blob.Blob.from_string(
        gcs_uri, client=client
    ).download_as_bytes()
    return Image.open(io.BytesIO(image_bytes))

## User Journey Demo

The user journey demonstrated here contains the following steps:

1. Create Dataset
2. Assemble the dataset with a template and inspect assembly
3. Run a validation for tuning
4. Estimate Resources for tuning
5. Run tuning
6. Run batch prediction

### 1. Create a dataset from a Pandas or BigFrames DataFrame

We prepared a DataFrame `training_set` with two columns:

*   `image_uris`: GCS URIs of flower images
*   `labels`: Flower label (five flower categories, one label per image)

In [None]:
# Take the URI and label of the first training example in one row lookup
flower_uri, flower_label = training_set[["image_uris", "labels"]].iloc[0]

# Show the image, then the values it was loaded from
display(get_gcs_image(flower_uri))
print(f"Image URI: {flower_uri}")
print(f"Flower label: {flower_label}")
# Last expression of the cell: rendered as the cell output
training_set.head()

Let's create a Vertex AI multimodal dataset from the prepared DataFrame.

In [None]:
# Create a Vertex AI multimodal dataset from the pandas DataFrame `training_set`
flowers = datasets.MultimodalDataset.from_pandas(dataframe=training_set)

# Inspect the dataset
show_dataset_info(flowers)
# Last expression of the cell: preview of the dataset content via BigFrames
flowers.to_bigframes().head()

**Other dataset creation options**

Create from a BigQuery table.

```py
my_dataset_from_bigquery = datasets.MultimodalDataset.from_bigquery(
    bigquery_uri=f"bq://projectId.datasetId.tableId"
)
```

Create from a BigFrames DataFrame.

```py
my_dataset_from_bigframes = datasets.MultimodalDataset.from_bigframes(
    dataframe=my_dataframe
)
```

Create from a GCS file in JSONL format for assembled input (the JSONL file contains Gemini requests, no assembly required).

```py
my_dataset = datasets.MultimodalDataset.from_gemini_request_jsonl(
    gcs_uri=gcs_uri_of_jsonl_file
)
```

List or load existing datasets.

```py
# Get the most recently created dataset
first_dataset = datasets.MultimodalDataset.list()[0]

# Load dataset based on dataset name
same_dataset = datasets.MultimodalDataset(first_dataset.name)
```

### 2. Assemble the dataset with a template and inspect assembly

To use our Flowers dataset with Gemini, let's assemble a full Gemini request referencing the images in our dataset.

We construct a template configuration by specifying the general prompt, response and system instructions and use placeholders in curly braces. During the assembly, the placeholders are replaced with the values of the dataset column that the placeholders denote.

The dataset columns referenced by the placeholders can contain, for example, GCS URIs of files of several data types and modalities:
- .pdf
- .png, .jpeg, .jpg, .webp
- .aac, .flac, .mp3, .m4a, .mpga, .opus, .pcm, .wav
- .flv, .mov, .mpegps, .mpg, .wmv, .3gp

In [None]:
# Single-turn template: the placeholders in curly braces name the dataset
# columns (`image_uris`, `labels`) whose values are inserted during assembly.
template_config = datasets.construct_single_turn_template(
    prompt="This is the image: {image_uris}",
    response="{labels}",
    # Adjacent string literals are joined without a separator, so each sentence
    # carries its own trailing space. The category names are plain double-quoted
    # words (no stray "/" characters) so the model sees the exact label strings.
    system_instruction="You are a botanical image classifier. Analyze the provided image "
    "and determine the most accurate classification of the flower. "
    'These are the only flower categories: ["daisy", "dandelion", "roses", "sunflowers", "tulips"]. '
    "Return only one category per image.",
)

Here, the template is constructed using the library function `construct_single_turn_template()`. Alternatively, it can be explicitly constructed from a Gemini example as below.

It is also possible to specify a custom field mapping for the placeholders used in the Gemini example. Then the placeholders can have any name, and not necessarily the column name of the dataset column with the values that are being inserted (here image_uris and labels):

```
from vertexai.generative_models import Content, Part

gemini_example = datasets.GeminiExample(
    contents=[
        Content(role="user", parts=[Part.from_text("This is the image: {uri}")]),
        Content(role="model", parts=[Part.from_text("{flower}")]),
    ],
    system_instruction=Content(
        parts=[
            Part.from_text(
                'You are a botanical image classifier. Analyze the provided image '
                'and determine the most accurate classification of the flower. '
                'These are the only flower categories: ["daisy", "dandelion", "roses", "sunflowers", "tulips"]. '
                'Return only one category per image.'
            )
        ]
    ),
)

template_config = datasets.GeminiTemplateConfig(
    gemini_example=gemini_example,
    field_mapping={"uri": "image_uris", "flower": "labels"},
)
```

**Assemble and inspect the dataset.**

The dataset assembly creates a BigQuery table with the assembled examples in a single `request` column. The assembly method below returns a tuple containing a table id (`str`) referencing the assembly BQ table, and a DataFrame (`bigframes.pandas.DataFrame`) for direct inspection.
The DataFrame and the BQ table referenced by the table id contain the assembled dataset in a single column `request`.

In [None]:
# Assemble the dataset with the given template. The call returns a pair that is
# unpacked into the id of the assembly table and a DataFrame of the assembled rows.
table_id, assembly = flowers.assemble(template_config=template_config)

# Inspect assembled dataset
assembly.head()

It is also possible to attach the template and run the assembly and the validation below without passing it:

In [None]:
# Attach the template to the dataset so that later calls can omit `template_config`
flowers.attach_template_config(template_config=template_config)
# Assemble without passing a template; both return values are discarded here
_, _ = flowers.assemble()

### 3. Run a validation for tuning

Validate a dataset for tuning.
Tuning dataset usages are: `SFT_VALIDATION`, `SFT_TRAINING`.

The `template_config` attached in the previous step is used implicitly for this and all further tasks.

In [None]:
# Assess whether the dataset can be used as supervised fine-tuning training data
# for the given model. No template is passed, so the call relies on the template
# attached to `flowers` earlier in the notebook.
validation = flowers.assess_tuning_validity(
    model_name="gemini-2.0-flash-001", dataset_usage="SFT_TRAINING"
)

# Check if there are validation errors
validation.errors

Let's validate a dataset with an incorrect `template_config`, e.g. using a `GeminiExample` that contains two consecutive `user` contents, instead of a `user` content followed by a `model` content.

In [None]:
# Build a deliberately invalid example: two `user` turns in a row and no `model` turn
invalid_gemini_example = datasets.GeminiExample(
    contents=[
        Content(role="user", parts=[Part.from_text("This is the image: {image_uris}")]),
        # Consecutive content turn with the same role
        Content(role="user", parts=[Part.from_text(".")]),
    ],
)
# Wrap the example in a template configuration (no custom field mapping is given)
invalid_configuration = datasets.GeminiTemplateConfig(
    gemini_example=invalid_gemini_example
)

# Pass the invalid template explicitly for this one call.
# NOTE: this rebinds `validation`, replacing the result of the previous check.
validation = flowers.assess_tuning_validity(
    model_name="gemini-2.0-flash-001",
    dataset_usage="SFT_TRAINING",
    template_config=invalid_configuration,
)

# Last expression of the cell: the errors reported for the invalid template
validation.errors

### 4. Estimate resources for tuning

In [None]:
# Estimate the resources needed for tuning on this dataset. The model is the same
# one used for the validation and for the tuning job in this notebook, so the
# estimate refers to the job that is actually started.
tuning_resources = flowers.assess_tuning_resources(model_name="gemini-2.0-flash-001")
print(tuning_resources)

### 5. Run tuning

Prerequisites:

- Your Vertex service account `service-{project_number}@gcp-sa-vertex-tune.iam.gserviceaccount.com` needs to have read permissions on the GCS buckets referenced in the dataset. If this is not automatically the case, you might have to assign the service account a role such as `Storage Object User`; see the screenshot below.

- The Vertex multimodal dataset needs to have an attached `template_config` (run `flowers.attach_template_config(template_config=template_config)`).

Let's also prepare and use the validation dataset.

In [None]:
# Optional: Create Vertex Multimodal dataset for the validation set
flowers_validation = datasets.MultimodalDataset.from_pandas(dataframe=validation_set)

# Attach the template config
flowers_validation.attach_template_config(template_config=template_config)

Here we use the training and validation set to start a tuning job:


In [None]:
from vertexai.preview.tuning import sft

# Start a supervised fine-tuning job; the dataset objects are passed directly.
# The returned job object is polled in the next cell.
tuning_job = sft.train(
    source_model="gemini-2.0-flash-001",
    train_dataset=flowers,
    validation_dataset=flowers_validation,  # optional
)

Let's monitor the job state and obtain the model ID of the tuned model once the tuning job has ended.

In [None]:
import time

print(f"Tuning job started: {tuning_job.resource_name}")

# Wait for the job to complete
while not tuning_job.has_ended:
    time.sleep(60)
    tuning_job.refresh()  # Refresh the job state
    print(f"Polling - Current job state: {tuning_job.state}")

# Compare the state by name. If `state` is an enum member (as it appears to be in
# the Vertex AI SDK - TODO confirm), comparing it directly to a string would
# never match; using `.name` works for both enum members and plain strings.
final_state = getattr(tuning_job.state, "name", str(tuning_job.state))

# Check the final state
if final_state == "JOB_STATE_FAILED":
    print(f"Tuning job failed: {tuning_job.error}")
else:
    print(f"Tuning job ended with state: {tuning_job.state}")
    if tuning_job.tuned_model_name:
        # Get model ID: the last path segment of the tuned model's resource name
        tuned_model_id = tuning_job.tuned_model_name.split("/")[-1]
    else:
        # E.g. a job that did not succeed: there is no model name to split.
        print("The tuning job did not produce a tuned model.")

The tuning job can also be started from the GenAI SDK:


```
from google import genai
from google.genai.types import HttpOptions, CreateTuningJobConfig

client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)

tuning_job = client.tunings.tune(
    base_model="gemini-2.5-flash",
    training_dataset = {
        "vertex_dataset_resource":flowers.resource_name
    },
    config=CreateTuningJobConfig(
        tuned_model_display_name="Example tuning job with Multimodal Dataset 'Flowers'",
        validation_dataset = {
        "vertex_dataset_resource":flowers_validation.resource_name
    },
    ),
)

```


### 6. Run batch prediction

You can use Vertex Multimodal datasets to run a batch prediction job with Gemini.
In the future, we will support passing Vertex Multimodal dataset objects directly. For now, you can pass the BigQuery URI of the assembly:

```py
# Currently

prediction_assembly_table_id, _ = flowers_prediction.assemble(template_config=template_config)

job = client.batches.create(
    model=model,
    src=f"bq://{prediction_assembly_table_id}",
)

# After the integration

job = client.batches.create(
    model=model,
    src=flowers_prediction,
)

```

See also: [Batch Prediction Documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/batch-prediction-gemini) - Note that Vertex Multimodal datasets will be added as another dataset source next to Google Cloud Storage and BigQuery in the documentation soon, when the integration has been completed.


The model specified can either be a Gemini base model, the model you tuned above, or any other tuned custom model.

By default the colab uses the model tuned above for the batch prediction. If you want to run a batch prediction job on a base model or on another custom model, you can provide the base model name or the custom model id, respectively, in the field below.

In [None]:
# @title Specify the Model used for Batch Prediction
model_for_batch_prediction = "tuned model"  # @param ["tuned model", "base model", "other custom model"] # fmt: skip
optional_base_model_name = ""  # @param {type:"string"}
optional_custom_model_id = ""  # @param {type:"string"}

# Set full model name
if model_for_batch_prediction == "tuned model":
    # `tuned_model_id` only exists when the tuning cells were run and produced a
    # model; look it up defensively so that a skipped or failed tuning job gives
    # a clear message instead of a NameError.
    if not globals().get("tuned_model_id"):
        raise ValueError(
            "No tuned model is available. Run the tuning cells above until the job "
            "succeeds, or select 'base model' or 'other custom model'."
        )
    model = f"projects/{PROJECT_ID}/locations/{LOCATION}/models/{tuned_model_id}"
elif model_for_batch_prediction == "base model":
    if not optional_base_model_name:
        raise ValueError(
            "Please provide a optional_base_model_name when 'base model' is selected."
        )
    # Base models are referenced by their plain name.
    model = optional_base_model_name
elif model_for_batch_prediction == "other custom model":
    if not optional_custom_model_id:
        raise ValueError(
            "Please provide a optional_custom_model_id when 'other custom model' is selected."
        )
    # Custom models are referenced by their full resource path.
    model = (
        f"projects/{PROJECT_ID}/locations/{LOCATION}/models/{optional_custom_model_id}"
    )
else:
    # Without this branch `model` would stay undefined and the print below
    # would fail with a NameError.
    raise ValueError(
        f"Unknown model selection: {model_for_batch_prediction!r}. Choose "
        "'tuned model', 'base model' or 'other custom model'."
    )


print(f"Using model: {model}")

Let's prepare the prediction set as Vertex Multimodal dataset and start a Batch Prediction job with the specified model:

In [None]:
# Get Vertex Multimodal Dataset
# (created from the pandas DataFrame `prediction_set`)
flowers_prediction = datasets.MultimodalDataset.from_pandas(dataframe=prediction_set)

# Assemble the dataset with a template config and get the assembly table id
# The second return value is discarded; only the table id is needed as the
# batch prediction source.
prediction_assembly_table_id, _ = flowers_prediction.assemble(
    template_config=template_config
)

In [None]:
# @title Run a Batch Prediction Job

# `time` is imported here as well so that this cell also works when the optional
# tuning cells (which import it) were skipped.
import time

from google import genai
from google.genai.types import JobState

# Initialize GenAI client for batch prediction library
client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)

job = client.batches.create(
    # use the model selected above
    model=model,
    # Use the assembly table id as source
    src=f"bq://{prediction_assembly_table_id}",
)

# States in which polling stops. A paused job is included, so the loop also
# ends when the job is paused rather than finished.
completed_states = {
    JobState.JOB_STATE_SUCCEEDED,
    JobState.JOB_STATE_FAILED,
    JobState.JOB_STATE_CANCELLED,
    JobState.JOB_STATE_PAUSED,
}

# Poll every 30 seconds; the job object is re-fetched to get the current state.
while job.state not in completed_states:
    time.sleep(30)
    job = client.batches.get(name=job.name)
    print(f"Job state: {job.state}")