In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Veo 3 Reference to Video

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/vision/getting-started/veo3_reference_to_video.ipynb">
      <img src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Run in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fvision%2Fgetting-started%2Fveo3_reference_to_video.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/vision/getting-started/veo3_reference_to_video.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/vision/getting-started/veo3_reference_to_video.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/vision/getting-started/veo3_reference_to_video.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/vision/getting-started/veo3_reference_to_video.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/vision/getting-started/veo3_reference_to_video.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/vision/getting-started/veo3_reference_to_video.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/vision/getting-started/veo3_reference_to_video.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>


| | |
|-|-|
|Author(s) | [Katie Nguyen](https://github.com/katiemn) |

## Overview

### Veo 3

Veo 3 on Vertex AI gives application developers access to Google's cutting-edge video generation. This model creates videos with stunning detail and realistic physics across a wide array of visual styles. Veo 3 enhances video quality from text and image prompts, and now includes dialogue and audio generation.

In this tutorial, you will learn how to use the Google Gen AI SDK for Python to interact with Veo 3.1 to:
- Generate a video from asset images, including subjects, objects and scenes


## Get started

### Install Google Gen AI SDK for Python

In [None]:
# Install or upgrade the Google Gen AI SDK; --quiet suppresses most of pip's output.
%pip install --upgrade --quiet google-genai

### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, run the following cell to authenticate your environment.

In [2]:
import sys

# Authenticate only when the google.colab module is already loaded;
# in any other environment this cell does nothing.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

### Import libraries

In [3]:
import time
import urllib.request

from IPython.display import Video, display
from PIL import Image as PIL_Image
from google import genai
from google.genai import types
import matplotlib.image as img
import matplotlib.pyplot as plt
import numpy as np

### Set Google Cloud project information and create client

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [10]:
import os

PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Fall back to the environment. A missing variable stays None instead of
    # becoming the literal string "None" (which str(None) would produce and
    # which would then be sent as a bogus project ID).
    # NOTE(review): with project=None the client is expected to resolve the
    # project from the default credentials or raise a clear error - confirm
    # against the SDK documentation.
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")

# Region for Vertex AI requests; defaults to us-central1 when the variable is unset.
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

# Gen AI client configured to use Vertex AI rather than an API key.
client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)

### Define helper functions

In [11]:
def show_video(video):
    if isinstance(video, str):
        file_name = video.split("/")[-1]
        !gsutil cp {video} {file_name}
        display(Video(file_name, embed=True, width=600))
    else:
        with open("sample.mp4", "wb") as out_file:
            out_file.write(video)
        display(Video("sample.mp4", embed=True, width=600))


def show_images(
    images: list[str],
):
    """Show the given image files side by side in a single row.

    Args:
        images: Paths of the image files to read and display, one panel each.
    """
    _, axes = plt.subplots(1, len(images), figsize=(12, 6))
    # For a single panel plt.subplots returns one Axes object rather than an
    # array, so wrap it to keep the loop below uniform.
    panels = np.array([axes]) if len(images) == 1 else axes
    for panel, path in zip(panels, images):
        panel.imshow(img.imread(path))
        panel.axis("off")
    plt.show()

### Load the video model

In [12]:
# Model ID passed as `model=` to every generate_videos request below.
video_model = "veo-3.1-generate-preview"

## Reference images to videos

With Reference-to-Video in Veo 3.1, you can use reference images to generate videos. The reference images are `asset` images of subjects, objects, or scenes that will be included in the final video output.

**NOTE:** You can include up to 3 `asset` images in a request.

### Asset references

Download and display the asset images that you'll use in the following requests. To use your own images, change the URLs in the `wget` commands (or skip the download if the files are already on disk) and update the `first_image`, `second_image`, and/or `third_image` variables accordingly.

#### Subject reference images

In this example, you'll use two subject reference images of different people. You'll generate a new scene for them based on a text prompt.

In [None]:
# Download the two subject images from Cloud Storage into the working directory
# (saved as man-in-field.png and woman.jpeg).
!wget https://storage.googleapis.com/cloud-samples-data/generative-ai/image/man-in-field.png

!wget https://storage.googleapis.com/cloud-samples-data/generative-ai/image/woman.jpeg

Set the `first_image` and `second_image` variables.

In [None]:
# Local file names of the two subject reference images.
first_image = "man-in-field.png"  # @param {type: 'string'}
second_image = "woman.jpeg"  # @param {type: 'string'}

# Preview both reference images side by side.
show_images([first_image, second_image])

Now, you'll send a request to generate a video. With Veo 3.1, you can generate videos with audio from a text prompt, input image(s), or both. In order to generate a video in the following sample, specify the following info:

  - **Prompt:** A description of the video you would like to see with the reference images.
  - **Reference images:** Up to three `asset` images.
  - **Aspect ratio:** 16:9
  - **Number of videos:** Set this value to 1, 2, 3, or 4.
  - **Video duration:** 8 seconds
  - **Resolution:** 720p
  - **Person generation:** Set to `allow_adult` or `dont_allow`.
  - **Generate audio:** Set to `True` if you'd like audio in your generated video.

In [None]:
prompt = """
a woman and a man drinking a cup of coffee in a cafe, chatting about the rainy weather
"""

# Submit the request. The call returns an operation object that is polled
# below until it reports completion.
operation = client.models.generate_videos(
    model=video_model,
    prompt=prompt,
    config=types.GenerateVideosConfig(
        # Both subject images are passed as `asset` references.
        reference_images=[
            types.VideoGenerationReferenceImage(
                image=types.Image.from_file(location=first_image),
                reference_type="asset",
            ),
            types.VideoGenerationReferenceImage(
                image=types.Image.from_file(location=second_image),
                reference_type="asset",
            ),
        ],
        aspect_ratio="16:9",
        number_of_videos=1,
        duration_seconds=8,
        resolution="720p",
        person_generation="allow_adult",
        generate_audio=True,
    ),
)

# Poll every 15 seconds, printing the refreshed operation each time.
while not operation.done:
    time.sleep(15)
    operation = client.operations.get(operation)
    print(operation)

# A finished operation can carry an error instead of a response, or a response
# without any generated videos. Report that explicitly rather than printing
# nothing or failing on the [0] index.
if operation.response and operation.result.generated_videos:
    show_video(operation.result.generated_videos[0].video.video_bytes)
else:
    print(f"No video was returned. Operation error: {operation.error}")
    print(f"Operation response: {operation.response}")

#### Setting reference image

Now, you'll use a single scenery reference image and a text prompt to generate a video with different subjects and actions.

In [None]:
# Download the scene reference image from Cloud Storage into the working directory
# (saved as room.png).
!wget https://storage.googleapis.com/cloud-samples-data/generative-ai/image/room.png

Set the `first_image` variable.

In [None]:
# Local file name of the scene reference image (replaces the earlier first_image value).
first_image = "room.png"  # @param {type: 'string'}

# Preview the reference image.
show_images([first_image])

Run the request. Update the `prompt` if you'd like to see different content within the scene.

In [None]:
prompt = """
a Corgi walks around in a living room, jumps on the couch and begins to bark
"""

# Submit the request. The call returns an operation object that is polled
# below until it reports completion.
operation = client.models.generate_videos(
    model=video_model,
    prompt=prompt,
    config=types.GenerateVideosConfig(
        # The single scene image is passed as an `asset` reference.
        reference_images=[
            types.VideoGenerationReferenceImage(
                image=types.Image.from_file(location=first_image),
                reference_type="asset",
            ),
        ],
        aspect_ratio="16:9",
        number_of_videos=1,
        duration_seconds=8,
        resolution="720p",
        person_generation="allow_adult",
        generate_audio=True,
    ),
)

# Poll every 15 seconds, printing the refreshed operation each time.
while not operation.done:
    time.sleep(15)
    operation = client.operations.get(operation)
    print(operation)

# A finished operation can carry an error instead of a response, or a response
# without any generated videos. Report that explicitly rather than printing
# nothing or failing on the [0] index.
if operation.response and operation.result.generated_videos:
    show_video(operation.result.generated_videos[0].video.video_bytes)
else:
    print(f"No video was returned. Operation error: {operation.error}")
    print(f"Operation response: {operation.response}")

#### Product reference image

Next, you'll use a product reference image and a text prompt to generate a video. This will demonstrate how Veo maintains product consistency while in motion.

In [None]:
# Download the product reference image from Cloud Storage into the working directory
# (saved as mug.png).
!wget https://storage.googleapis.com/cloud-samples-data/generative-ai/image/mug.png

Set the `first_image` variable.

In [None]:
# Local file name of the product reference image (replaces the earlier first_image value).
first_image = "mug.png"  # @param {type: 'string'}

# Preview the reference image.
show_images([first_image])

Run the request. Update the `prompt` if you'd like to visualize the product in a different manner.

In [None]:
prompt = """
slowly rotate this coffee mug in a 360 degree circle
"""

# Submit the request. The call returns an operation object that is polled
# below until it reports completion.
operation = client.models.generate_videos(
    model=video_model,
    prompt=prompt,
    config=types.GenerateVideosConfig(
        # The single product image is passed as an `asset` reference.
        reference_images=[
            types.VideoGenerationReferenceImage(
                image=types.Image.from_file(location=first_image),
                reference_type="asset",
            ),
        ],
        aspect_ratio="16:9",
        number_of_videos=1,
        duration_seconds=8,
        resolution="720p",
        person_generation="allow_adult",
        generate_audio=True,
    ),
)

# Poll every 15 seconds, printing the refreshed operation each time.
while not operation.done:
    time.sleep(15)
    operation = client.operations.get(operation)
    print(operation)

# A finished operation can carry an error instead of a response, or a response
# without any generated videos. Report that explicitly rather than printing
# nothing or failing on the [0] index.
if operation.response and operation.result.generated_videos:
    show_video(operation.result.generated_videos[0].video.video_bytes)
else:
    print(f"No video was returned. Operation error: {operation.error}")
    print(f"Operation response: {operation.response}")

#### Three distinct reference images

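In this example, you'll use three different reference images (a product, a subject, and a scene) stored in Google Cloud Storage. Instead of downloading them to local files, you'll pass their Cloud Storage URIs directly in the request; the images are fetched over HTTPS only to preview them here. To use your own images, replace the `first_image_gcs`, `second_image_gcs`, and `third_image_gcs` variables below.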

In [None]:
# Each image gets a gs:// URI (used by the generation request) and a PIL preview
# fetched from the matching public HTTPS URL, which is used only for display.
# NOTE: first_image and second_image held local file paths in the earlier
# sections; from here on they are PIL images.
first_image_gcs = "gs://cloud-samples-data/generative-ai/image/flowers.png"
first_image = PIL_Image.open(
    urllib.request.urlopen(
        "https://storage.googleapis.com/cloud-samples-data/generative-ai/image/flowers.png"
    )
)

second_image_gcs = "gs://cloud-samples-data/generative-ai/image/suitcase.png"
second_image = PIL_Image.open(
    urllib.request.urlopen(
        "https://storage.googleapis.com/cloud-samples-data/generative-ai/image/suitcase.png"
    )
)

third_image_gcs = "gs://cloud-samples-data/generative-ai/image/woman.jpg"
third_image = PIL_Image.open(
    urllib.request.urlopen(
        "https://storage.googleapis.com/cloud-samples-data/generative-ai/image/woman.jpg"
    )
)

# Show the three previews in one row with the axes hidden.
fig, axis = plt.subplots(1, 3, figsize=(18, 6))
for ax, preview in zip(axis, (first_image, second_image, third_image)):
    ax.imshow(preview)
    ax.axis("off")
plt.show()

Rather than returning the video as `video_bytes`, in this section you'll save the generated video to Cloud Storage. To do this, set `output_gcs` to a Cloud Storage location (`gs://...`) that you can write to.

**Safety:** All Veo videos include [SynthID](https://deepmind.google/science/synthid/), which embeds a digital watermark directly into the AI-generated video.

In [None]:
prompt = "a wide shot of a woman wheeling a blue suitcase through a flower field"  # @param {type: 'string'}
output_gcs = "gs://[your-bucket-path]"  # @param {type: 'string'}

# Stop early with a clear message if the placeholder was never replaced,
# instead of sending an unusable output location to the service.
if not output_gcs or output_gcs == "gs://[your-bucket-path]":
    raise ValueError(
        "Set `output_gcs` to a Cloud Storage location, for example gs://my-bucket/veo-output"
    )

# Submit the request. The call returns an operation object that is polled
# below until it reports completion.
operation = client.models.generate_videos(
    model=video_model,
    prompt=prompt,
    config=types.GenerateVideosConfig(
        # All three images are referenced by Cloud Storage URI, so each one
        # carries an explicit MIME type.
        reference_images=[
            types.VideoGenerationReferenceImage(
                image=types.Image(gcs_uri=first_image_gcs, mime_type="image/png"),
                reference_type="asset",
            ),
            types.VideoGenerationReferenceImage(
                image=types.Image(gcs_uri=second_image_gcs, mime_type="image/png"),
                reference_type="asset",
            ),
            types.VideoGenerationReferenceImage(
                image=types.Image(gcs_uri=third_image_gcs, mime_type="image/jpeg"),
                reference_type="asset",
            ),
        ],
        output_gcs_uri=output_gcs,
        aspect_ratio="16:9",
        number_of_videos=1,
        duration_seconds=8,
        resolution="720p",
        person_generation="allow_adult",
        generate_audio=True,
    ),
)

# Poll every 15 seconds, printing the refreshed operation each time.
while not operation.done:
    time.sleep(15)
    operation = client.operations.get(operation)
    print(operation)

# A finished operation can carry an error instead of a response, or a response
# without any generated videos. Report that explicitly rather than printing
# nothing or failing on the [0] index.
if operation.response and operation.result.generated_videos:
    # The video is read back through its URI here, not from inline bytes.
    show_video(operation.result.generated_videos[0].video.uri)
else:
    print(f"No video was returned. Operation error: {operation.error}")
    print(f"Operation response: {operation.response}")