In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Intro to Veo 2 Video Generation

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/vision/veo/intro_veo.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fvision%2Fveo%2Fintro_veo.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/vision/veo/intro_veo.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/vision/veo/intro_veo.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/vision/veo/intro_veo.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/vision/veo/intro_veo.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/vision/veo/intro_veo.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/53/X_logo_2023_original.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/vision/veo/intro_veo.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/vision/veo/intro_veo.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

| | |
|-|-|
| Author(s) | [Dave Wang](https://github.com/wadave/) |

## Overview

### Veo 2

Vertex AI now offers Veo 2, Google's cutting-edge video generation technology, empowering developers to create stunningly detailed videos.  Veo 2 simulates real-world physics and supports a diverse range of visual styles.

This tutorial demonstrates how to use the Vertex AI API to generate videos from both text prompts and input images.


## Get started

### Install libraries

In [24]:
%pip install -q --upgrade mediapy google-genai

### Restart runtime

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.

The restart might take a minute or longer. After it's restarted, continue to the next step.

In [None]:
import IPython

# Get the running IPython application so this cell can reach its kernel.
app = IPython.Application.instance()
# Shut the kernel down, passing True to request a restart (the notebook text
# above describes this cell as restarting the current kernel).
app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>

### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, run the following cell to authenticate your environment.

In [None]:
import sys

# The presence of "google.colab" in sys.modules is used as the "running on
# Colab" check; everywhere else this cell does nothing.
if "google.colab" in sys.modules:
    from google.colab import auth

    # Authenticate the Colab user for the Google Cloud calls made later on.
    auth.authenticate_user()

### Set Google Cloud project information

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [41]:
import os

# Placeholder for your Cloud Storage bucket URI; replace it with your own value.
# NOTE(review): no cell in this section reads BUCKET_URI — presumably it is the
# output location passed to the generation helpers later on; confirm.
BUCKET_URI = "[your-bucket-uri ]"
PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}

# Fall back to the environment when the placeholder above was left untouched.
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Do not wrap the lookup in str(): an unset variable would otherwise become
    # the literal project ID "None" and only fail later, at request time.
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")

if not PROJECT_ID:
    print(
        "PROJECT_ID is not set. Edit this cell or set the GOOGLE_CLOUD_PROJECT "
        "environment variable before calling the API."
    )

### Import libraries

In [43]:
import time

import google.auth
import google.auth.transport.requests
import mediapy as media
import requests
from google.cloud import storage

from google import genai
from google.genai import types
import base64
import json
import pprint

### Define helper functions

In [None]:
def send_request_to_google_api(api_endpoint, data=None, timeout_seconds=60):
    """Sends an authenticated HTTP POST request to a Google API endpoint.

    Args:
        api_endpoint: The URL of the Google API endpoint.
        data: (Optional) Dictionary of data to send as the JSON request body.
        timeout_seconds: (Optional) Seconds to wait for the server before giving
            up, so a stalled connection cannot block the notebook indefinitely.

    Returns:
        The response decoded from JSON, or None when the request fails. Every
        requests.exceptions.RequestException (connection error, timeout, or a
        4xx/5xx status surfaced by raise_for_status) is printed and turned
        into None rather than raised.

    Raises:
        Only requests.exceptions.RequestException is caught here; any other
        exception (for example one raised while obtaining credentials)
        propagates to the caller.
    """
    try:
        # The project returned alongside the credentials is not needed here.
        creds, _ = google.auth.default()
        creds.refresh(google.auth.transport.requests.Request())

        headers = {
            "Authorization": f"Bearer {creds.token}",
            "Content-Type": "application/json",
        }

        response = requests.post(
            api_endpoint, headers=headers, json=data, timeout=timeout_seconds
        )
        # HTTPError is a RequestException, so bad statuses end up in the handler below.
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error sending request: {e}")
        return None


def compose_videogen_request(
    prompt,
    image_uri,
    gcs_uri,
    seed,
    aspect_ratio,
    sample_count,
    negative_prompt,
    person_generation="allow_adult"
):
    """Composes the request body for the video generation API.

    Args:
        prompt: Text prompt describing the video.
        image_uri: Cloud Storage URI of an input image, or a falsy value for a
            text-only request.
        gcs_uri: Value sent as "storageUri".
        seed: Value sent as "seed".
        aspect_ratio: Value sent as "aspectRatio".
        sample_count: Value sent as "sampleCount".
        negative_prompt: Value sent as "negativePrompt".
        person_generation: Value sent as "personGeneration".

    Returns:
        A dict with one entry in "instances" and the generation "parameters".
    """
    instance = {"prompt": prompt}
    if image_uri:
        # Send a full MIME type rather than the bare extension "png". JPEG
        # inputs are recognised by file extension; everything else keeps the
        # previous PNG assumption.
        is_jpeg = image_uri.lower().endswith((".jpg", ".jpeg"))
        instance["image"] = {
            "gcsUri": image_uri,
            "mimeType": "image/jpeg" if is_jpeg else "image/png",
        }

    return {
        "instances": [instance],
        "parameters": {
            "storageUri": gcs_uri,
            "sampleCount": sample_count,
            "seed": seed,
            "aspectRatio": aspect_ratio,
            "negativePrompt": negative_prompt,
            "personGeneration": person_generation,
        },
    }

def fetch_operation(lro_name, timeout_seconds=300, poll_interval_seconds=10):
    """Polls a Long-Running Operation (LRO) until it reports completion.

    Args:
        lro_name: The name of the LRO.
        timeout_seconds: How long to keep polling before giving up.
        poll_interval_seconds: Pause between two consecutive polls.

    Returns:
        The LRO response once its "done" field is truthy. None when the
        deadline passes first, or when a poll request itself returns None.
    """
    payload = {"operationName": lro_name}
    deadline = time.time() + timeout_seconds

    while True:
        if time.time() >= deadline:
            print(f"LRO {lro_name} timed out after {timeout_seconds} seconds.")
            return None

        status = send_request_to_google_api(fetch_endpoint, payload)
        if status is None:
            # The request helper signalled a failure; stop polling.
            return None
        if status.get("done"):
            return status

        time.sleep(poll_interval_seconds)


def text_to_video(prompt, seed, aspect_ratio, sample_count, output_gcs, negative_prompt=""):
    """Starts a text-only video generation and waits for the operation.

    Returns:
        Whatever fetch_operation returns for the started operation, or None
        when the initial request returned None.
    """
    body = compose_videogen_request(
        prompt=prompt,
        image_uri=None,  # text-only: no input image
        gcs_uri=output_gcs,
        seed=seed,
        aspect_ratio=aspect_ratio,
        sample_count=sample_count,
        negative_prompt=negative_prompt,
    )
    operation = send_request_to_google_api(prediction_endpoint, body)
    if operation is not None:
        print(operation)
        return fetch_operation(operation["name"])
    return None


def image_to_video(
    prompt, image_gcs, seed, aspect_ratio, sample_count, output_gcs, negative_prompt=""
):
    """Starts an image-conditioned video generation and waits for the operation.

    Returns:
        Whatever fetch_operation returns for the started operation, or None
        when the initial request returned None.
    """
    body = compose_videogen_request(
        prompt=prompt,
        image_uri=image_gcs,
        gcs_uri=output_gcs,
        seed=seed,
        aspect_ratio=aspect_ratio,
        sample_count=sample_count,
        negative_prompt=negative_prompt,
    )
    operation = send_request_to_google_api(prediction_endpoint, body)
    if operation is not None:
        print(operation)
        return fetch_operation(operation["name"])
    return None

def show_video(op, bucket_name=None):
    """Downloads and displays every generated video referenced by an LRO response.

    Each video is downloaded into the current working directory, displayed,
    and the local copy is removed again. A failure on one video is printed and
    does not stop the remaining videos from being processed.

    Args:
        op: The LRO response dictionary. A None/empty value, or one without
            response.generatedSamples, only prints a notice.
        bucket_name: (Optional) Bucket to read from. When None, the bucket
            named in each video's own URI is used.
    """
    print(op)
    samples = ((op or {}).get("response") or {}).get("generatedSamples")
    if not samples:
        print("No videos to display in the LRO response.")
        return

    storage_client = storage.Client()

    for video in samples:
        gcs_uri = video["video"]["uri"]
        # "gs://bucket/path/to/file.mp4" -> ("bucket", "path/to/file.mp4")
        uri_bucket, _, blob_name = gcs_uri.replace("gs://", "").partition("/")

        # Resolve the bucket per video. Assigning to `bucket_name` here would
        # carry the first video's bucket over to every later video.
        source_bucket = uri_bucket if bucket_name is None else bucket_name
        blob = storage_client.bucket(source_bucket).blob(blob_name)

        # Keep only the file name; the file lands in the working directory.
        local_filepath = blob_name.split("/")[-1]

        try:
            blob.download_to_filename(local_filepath)
            print(f"Downloaded {gcs_uri} to {local_filepath}")
            media.show_video(media.read_video(local_filepath), height=500)
        except Exception as e:  # keep going with the remaining videos
            print(f"Error downloading or displaying {gcs_uri}: {e}")
        finally:
            # Remove the local copy whether or not the display succeeded.
            if os.path.exists(local_filepath):
                os.remove(local_filepath)
                print(f"Deleted local file: {local_filepath}")

In [44]:
def send_request_to_google_api(api_endpoint, data=None, timeout_seconds=60):
    """
    Sends an authenticated HTTP POST request to a Google API endpoint.

    NOTE: a helper with the same name is defined in the previous cell. When the
    cells run in order this definition replaces it; unlike that one, this
    version lets request errors propagate instead of returning None.

    Args:
        api_endpoint: The URL of the Google API endpoint.
        data: (Optional) Dictionary of data to send as the JSON request body.
        timeout_seconds: (Optional) Seconds to wait for the server before giving
            up, so a stalled connection cannot block the notebook indefinitely.

    Returns:
        The response from the Google API, decoded from JSON.

    Raises:
        requests.exceptions.HTTPError: For 4xx/5xx responses (raise_for_status).
        requests.exceptions.RequestException: For other request failures such
            as connection errors or timeouts.
    """
    # The project returned alongside the credentials is not needed here.
    creds, _ = google.auth.default()
    creds.refresh(google.auth.transport.requests.Request())

    headers = {
        "Authorization": f"Bearer {creds.token}",
        "Content-Type": "application/json",
    }

    response = requests.post(
        api_endpoint, headers=headers, json=data, timeout=timeout_seconds
    )
    response.raise_for_status()
    return response.json()


def compose_videogen_request(
    prompt,
    image_uri,
    gcs_uri,
    seed,
    aspect_ratio,
    sample_count,
    negative_prompt,
    person_generation="allow_adult"
):
    """Composes the request body for the video generation API.

    NOTE: a function with the same name is defined in the previous cell; when
    the cells run in order, this definition replaces it.

    Args:
        prompt: Text prompt describing the video.
        image_uri: Cloud Storage URI of an input image, or a falsy value for a
            text-only request.
        gcs_uri: Value sent as "storageUri".
        seed: Value sent as "seed".
        aspect_ratio: Value sent as "aspectRatio".
        sample_count: Value sent as "sampleCount".
        negative_prompt: Value sent as "negativePrompt".
        person_generation: Value sent as "personGeneration".

    Returns:
        A dict with one entry in "instances" and the generation "parameters".
    """
    instance = {"prompt": prompt}
    if image_uri:
        # Send a full MIME type rather than the bare extension "png". JPEG
        # inputs are recognised by file extension; everything else keeps the
        # previous PNG assumption.
        is_jpeg = image_uri.lower().endswith((".jpg", ".jpeg"))
        instance["image"] = {
            "gcsUri": image_uri,
            "mimeType": "image/jpeg" if is_jpeg else "image/png",
        }

    return {
        "instances": [instance],
        "parameters": {
            "storageUri": gcs_uri,
            "sampleCount": sample_count,
            "seed": seed,
            "aspectRatio": aspect_ratio,
            "negativePrompt": negative_prompt,
            "personGeneration": person_generation,
        },
    }


def fetch_operation(lro_name, max_polls=30, poll_interval_seconds=10):
    """Polls a Long-Running Operation (LRO) until it reports completion.

    NOTE: a function with the same name is defined in the previous cell; when
    the cells run in order, this definition replaces it.

    The defaults keep the original budget: generation usually takes about
    2 minutes, and 30 polls 10 seconds apart allow roughly 5 minutes.

    Args:
        lro_name: The name of the LRO.
        max_polls: Maximum number of status requests to send.
        poll_interval_seconds: Pause between two consecutive polls.

    Returns:
        The LRO response once its "done" field is truthy. None when the
        operation is still unfinished after max_polls polls, or when the
        request helper returns None.
    """
    request = {"operationName": lro_name}
    for attempt in range(max_polls):
        resp = send_request_to_google_api(fetch_endpoint, request)
        if resp is None:
            # Tolerate the helper variant that reports failures as None.
            return None
        if resp.get("done"):
            return resp
        # Sleeping after the final poll would only delay the timeout report.
        if attempt < max_polls - 1:
            time.sleep(poll_interval_seconds)

    print(f"LRO {lro_name} did not finish after {max_polls} polls.")
    return None


def text_to_video(prompt, seed, aspect_ratio, sample_count, output_gcs, negative_prompt=""):
    """Starts a text-only video generation and waits for the operation.

    Returns:
        Whatever fetch_operation returns for the started operation.
    """
    payload = compose_videogen_request(
        prompt=prompt,
        image_uri=None,  # text-only: no input image
        gcs_uri=output_gcs,
        seed=seed,
        aspect_ratio=aspect_ratio,
        sample_count=sample_count,
        negative_prompt=negative_prompt,
    )
    operation = send_request_to_google_api(prediction_endpoint, payload)
    print(operation)
    operation_name = operation["name"]
    return fetch_operation(operation_name)


def image_to_video(
    prompt, image_gcs, seed, aspect_ratio, sample_count, output_gcs, negative_prompt=""
):
    """Starts an image-conditioned video generation and waits for the operation.

    Returns:
        Whatever fetch_operation returns for the started operation.
    """
    payload = compose_videogen_request(
        prompt=prompt,
        image_uri=image_gcs,
        gcs_uri=output_gcs,
        seed=seed,
        aspect_ratio=aspect_ratio,
        sample_count=sample_count,
        negative_prompt=negative_prompt,
    )
    operation = send_request_to_google_api(prediction_endpoint, payload)
    print(operation)
    operation_name = operation["name"]
    return fetch_operation(operation_name)


def show_video(op):
    """Copies each generated video from Cloud Storage and displays it.

    NOTE: a function with the same name is defined in the previous cell; when
    the cells run in order, this definition replaces it.

    Args:
        op: The LRO response dictionary. A None value, or one without
            response.generatedSamples (for example a timed-out or failed
            operation), only prints a notice instead of raising.

    Raises:
        subprocess.CalledProcessError: If the `gsutil cp` command fails.
    """
    # Local import: subprocess is not among the notebook's top-level imports.
    import subprocess

    print(op)
    samples = ((op or {}).get("response") or {}).get("generatedSamples")
    if not samples:
        print("No videos to display in the LRO response.")
        return

    for video in samples:
        gcs_uri = video["video"]["uri"]
        file_name = gcs_uri.split("/")[-1]
        # Argument list instead of a `!gsutil` shell escape: the URI and file
        # name are passed verbatim and never re-parsed by a shell.
        subprocess.run(["gsutil", "cp", gcs_uri, file_name], check=True)
        media.show_video(media.read_video(file_name), height=500)


def show_sdk_video(op):
    """Copies each video of an SDK operation from Cloud Storage and displays it.

    Args:
        op: Operation object whose `generate_videos_response.videos` entries
            expose a `uri` attribute. A missing response or an empty video
            list only prints a notice instead of raising.
            NOTE(review): these attribute names are taken from the original
            code — confirm them against the installed google-genai version.

    Raises:
        subprocess.CalledProcessError: If the `gsutil cp` command fails.
    """
    # Local import: subprocess is not among the notebook's top-level imports.
    import subprocess

    print(op)
    sdk_response = getattr(op, "generate_videos_response", None)
    videos = getattr(sdk_response, "videos", None)
    if not videos:
        print("No videos to display in the operation.")
        return

    for video in videos:
        gcs_uri = video.uri
        file_name = gcs_uri.split("/")[-1]
        # Argument list instead of a `!gsutil` shell escape: the URI and file
        # name are passed verbatim and never re-parsed by a shell.
        subprocess.run(["gsutil", "cp", gcs_uri, file_name], check=True)
        media.show_video(media.read_video(file_name), height=500)

### Load the video model

In [45]:
# Base URL of the Veo 2 experimental model in us-central1 for the configured project.
video_model = (
    "https://us-central1-aiplatform.googleapis.com/v1beta1/"
    f"projects/{PROJECT_ID}/locations/us-central1/"
    "publishers/google/models/veo-2.0-generate-exp"
)
# Method-specific endpoints read by the request helpers defined earlier.
prediction_endpoint = video_model + ":predictLongRunning"
fetch_endpoint = video_model + ":fetchPredictOperation"

# 1. VEO prompt optimization (optional)

## This step is optional. If you already have a satisfactory prompt, go directly to step 2.

### 1.1 Create prompt 1: Create a prompt with Gemini (optional)

In [46]:
from google import genai
from google.genai import types
import base64
import json
import pprint

In [48]:
def create_veo_prompt(USER_QUERY, model="gemini-exp-1206"):
    """Asks Gemini to rewrite a short user query into detailed text-to-video prompts.

    Args:
        USER_QUERY: The user's description of the desired video.
        model: Name of the Gemini model used for the rewrite.

    Returns:
        The text of the model response. The request sets the response MIME
        type to "application/json", so the text is expected to be JSON.
    """
    # Use the notebook's configured project rather than a hard-coded one, so
    # the call runs against the reader's own Google Cloud project.
    client = genai.Client(vertexai=True, project=PROJECT_ID, location="us-central1")
    prompt = f"""
     Generate a high quality rewrite of USER_QUERY for a text-to-video service. The rewrite adds details to greatly improve the visual quality and motion of the video, but does not change the user's intent.

     Refrain from adding children or minors to the rewrite if not necessary to satisfy the USER_QUERY.

     Consider extra details to enhance creativity. Consider adding visual details IF it would support the user query:
     - camera angle and composition: wide angle, drone camera, low angle view, closeup, macro, view from below looking up, centered, fisheye
     - lighting: silhouette, backlit, dim ambient lighting, long shadows, natural light, sunrise / sunset, daylight
     - camera settings and motion: depth of field, in focus, long exposure, tracking shot, POV
     - general quality identifiers: professional, award winning, high-quality
     - styles: cinematic shot, street photography, fashion photography, architectural photography, dramatic, vintage, retro
     - background: blurred background, bokeh, pink background, solid light blue background
     - color scheme: high contrast, cold muted tones, muted orange warm tones, dark tones, pastel colors
     - subject actions: walking, running, ski, snowboarding, turning head
     - subject poses: rotation, flip, inversions

     Feel free to repeat the most important parts of the description! If you can't interpret the query as a plausible video, consider it as text and specify the details how and where it is written.

     Remember, it is important to include every word or a synonym from the USER_QUERY. Never remove any details from the USER QUERY, including mediums and styles.

     If USER_QUERY is long and detailed, either 1) add minor details in the variations, or 2) copy the USER_QUERY and only correct typos or misspellings.

     Absolutely make sure that EVERY detail of the USER_QUERY is well captured in each variation.
     Consider emphasizing the features of the USER_QUERY so that the video is rendered faithfully to the USER_QUERY.

     Please follow this style of text prompt, each line is a different prompt example:

     This close-up shot follows a happy queen as she ascends the steps of a candlelit throne room. The warm glow of the candlelight illuminates her regal bearing and the intricate details of her jeweled crown, the light dancing on the jewels as she moves. She turns her head, the happiness in her eyes becoming more prominent. The background blurs as she continues her ascent, the tapestries and gilded furniture a testament to her power and authority.

     Close-up portrait of a Black woman dancing in a vibrant carnival in Trinidad and Tobago. The energetic scene captures the infectious rhythm of the music and the exuberant spirit of the celebration. Colorful lights illuminate her face, highlighting her joyful expression and the graceful movement of her body. Her eyes, a sparkling brown, radiate pure happiness and the unbridled passion of Caribbean culture.

     Cinematic shot of a Caucasian man dressed in a weathered green trench coat, bathed in the eerie glow of a green neon sign. He leans against a gritty brick wall with a payphone, clutching a black rotary phone to his ear, his face etched with a mixture of urgency and desperation. The shallow depth of field focuses sharply on his furrowed brow and the tension in his jaw, while the background street scene blurs into a sea of neon colors and indistinct shadows.

     This underwater film scene features a close-up of a man in a dark business suit swimming through murky water. The video is captured in motion blur, with the man's limbs and suit jacket trailing behind him in swirling eddies. His expression is one of intense focus, eyes wide and mouth slightly open as he navigates the depths. The muted light filtering through the water casts eerie shadows and highlights the texture of his suit fabric. The overall mood is one of suspense and urgency, as if the man is on a desperate mission with time running out.

     Close-up shot of a quick cat briskly walking in the park, it’s crafted entirely of glass, illuminated by dramatic lighting. Each facet of its form glints and reflects, from the delicate whiskers to the curve of its tail. Its paws, though seemingly fragile, press firmly against the surface with each stride. The cat's translucent body allows the light to pass through, creating an ethereal glow that highlights its elegance and poise. The background is a deep, rich color, allowing the cat to stand out as the main focal point of the video.

     Cinematic shot of a lone surfer's silhouette, walking on a vast beach with surfboard in hand. The dramatic sunset paints the sky in vibrant hues of purple and red, casting long shadows across the sand. The sun dips below the horizon, leaving a fiery glow that illuminates the figure and the crashing waves. The wide shot captures the vastness of the scene, emphasizing the surfer's solitude and the awe-inspiring beauty of nature.

     Extreme close-up of a woman's eyes, bathed in the vibrant glow of neon lights. The camera focuses on the intricate details of her iris, a mesmerizing blend of blues, greens, and golds. Her long, dark lashes cast delicate shadows on her skin, and a single tear glistens at the corner of her eye. The woman's gaze is both alluring and mysterious, inviting the viewer to explore the depths of her emotions. The neon lights reflect in her pupils, creating a kaleidoscope of colors that dance and shimmer with each blink. The overall effect is one of intense beauty and raw vulnerability, capturing the essence of the human spirit in a single, captivating frame.

     A close-up shot of a man made entirely of glass riding the New York City subway. Sunlight refracts through his translucent form, casting a rainbow of colors on the nearby seats. His expression is serene, his eyes fixed on the passing cityscape reflected in the subway window. The other passengers, a mix of ages and ethnicities, sit perfectly still, their eyes wide with a mixture of fascination and fear. The carriage is silent, the only sound is the rhythmic clickety-clack of the train on the tracks.

     Close-up cinematic shot of an Indian man in a crisp white suit, bathed in the warm glow of an orange neon sign. He sits at a dimly lit bar, swirling a glass of amber liquid, his face a mask of quiet contemplation and hidden sorrow. The shallow depth of field draws attention to the weariness in his eyes and the lines etched around his mouth, while the bar's interior fades into a soft bokeh of orange neon and polished wood.

     A cinematic close-up frames the face of a young Asian woman in the heart of Tokyo's Shibuya Crossing. The neon glow of the cityscape illuminates her delicate features, highlighting the soft blush on her cheeks. Gentle lighting accentuates her bright, inquisitive eyes, reflecting the vibrant energy of the urban environment. A faint smile plays on her lips, hinting at a sense of anticipation and wonder. The blurred motion of pedestrians and vehicles in the background emphasizes her serene presence amidst the bustling metropolis. Her youthful expression captures a moment of fleeting beauty and the boundless possibilities that lie ahead.

     Medium close-up shot of a distinguished dog in a tailored business suit, engrossed in a newspaper on a moving train. Neon lights flicker through the window, casting high-contrast shadows on the dog's face and emphasizing the low vibrance of the scene. The dog's brow is furrowed in concentration, its eyes scanning the newsprint with an air of intelligence and determination. The train's rhythmic motion rocks the dog gently, creating a subtle blur in the background that accentuates the dog's stillness and focus.

     Tracking shot of a vibrant yellow convertible cruising through a scenic Nevada desert. An orange filter bathes the scene in warm, golden light, highlighting the dramatic rock formations and vast sandy expanse. The car speeds along a winding road, leaving a trail of dust in its wake. The open top allows the driver and passengers to fully experience the breathtaking landscape, their hair tousled by the wind. The low camera angle captures the car's sleek design and emphasizes the sense of freedom and adventure. The orange filter adds a touch of nostalgia and creates a visually stunning scene that evokes the spirit of the open road and the allure of the desert.

     This street style shot captures two chic women strolling through the fashionable streets of Paris. The first woman exudes elegance in a pair of crisp white pants, a pastel pink blazer cinched with a black belt and oversized black sunglasses. The second woman radiates confidence in her yellow wide leg trousers and an oversized hot pink blouson accessorized with a chunky gold necklace. Both women carry luxurious handbags adding to their effortless sophistication. The backdrop of Parisian architecture and bustling city life complements their stylish ensembles, creating a picture perfect moment of Parisian chic.

     Now, provide 4 different REWRITES for the following USER_QUERY in the style above using about 100 words each. Only produce the final four rewrites, one on each line, no intermediate thoughts. The rewrites should be distinct from each other, while following the user's intent.
     Here's User_Query: {USER_QUERY}

     * **Output Format:** Return your analysis as a JSON object with the following structure:
       'prompt1': 'Text prompt for the first rewrite',
       'prompt2': 'Text prompt for the second rewrite',

     """

    # `text` is passed by keyword: current google-genai releases declare it
    # keyword-only, and the keyword form is accepted by older releases too.
    contents = [types.Content(role="user", parts=[types.Part.from_text(text=prompt)])]

    # All four harm categories are sent with the threshold "OFF".
    harm_categories = (
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_HARASSMENT",
    )
    generate_content_config = types.GenerateContentConfig(
        temperature=1,
        top_p=0.95,
        max_output_tokens=8192,
        response_modalities=["TEXT"],
        response_mime_type="application/json",
        safety_settings=[
            types.SafetySetting(category=category, threshold="OFF")
            for category in harm_categories
        ],
    )

    response = client.models.generate_content(
        model=model,
        contents=contents,
        config=generate_content_config,
    )

    return response.text

In [49]:
# Short description of the desired video; the Gemini helper expands it into
# detailed Veo prompts.
USER_QUERY = (
    "An xgames snowboarder performs superpipe snowboarding Frontside 1080 trick at X Games Aspen.\n"
    "The snowboarder initiates a jump while facing downhill, then rotates their body and board a full three rotations (1080 degrees) in the air,\n"
    " at X Games Aspen"
)

In [None]:
# Ask Gemini for rewrites of USER_QUERY; the helper returns the response text.
resp=create_veo_prompt(USER_QUERY)

In [None]:
# Parse the returned text as JSON and pretty-print the result.
pprint.pprint(json.loads(resp))

In [None]:
# Pick the rewrite stored under the 'prompt2' key of the parsed response.
prompt =json.loads(resp)['prompt2']
# Bare expression: shows the selected prompt as the cell output.
prompt

In [4]:
# Detailed, scene-by-scene prompt. Assigning it here replaces whatever value
# `prompt` received from the Gemini rewrite cell above.
prompt ="""
 Generate a photorealistic 8-second video of Snowboard SuperPipe competition at X Games Aspen 2025. The video should capture the high-energy atmosphere and focus on a snowboarder successfully executing a complex Haakon Flip maneuver, followed by a roaring crowd reaction.
Video Structure & Key Moments with Veo-Specific Instructions:
(0-1s) Crowd Anticipation: Establishing Shot
Visual: Close-up shot of a diverse and energetic crowd at the base of the halfpipe. Faces should display expressions of excitement, anticipation, and awe. Include authentic winter attire, visible breath in the cold air, and prominent Monster Energy branding (banners, clothing, etc.).
Camera: Start with a tight close-up on individual faces, then gradually widen to reveal the larger crowd.
Veo Instructions: "Generate a close-up shot of a diverse crowd at a winter sporting event. Faces should show excitement and anticipation. Include details like winter clothing, visible breath, and Monster Energy branding. The shot should gradually widen to reveal a larger crowd scene. Resolution: 4K. Aspect ratio: 16:9. Lighting: Bright, natural winter daylight with potential for lens flares. Style: Photorealistic. Focus: Sharp focus on individual faces in the foreground with a gradual softening of focus towards the background. Crowd density: High. Duration: 1 second."
Keywords: close-up, diverse crowd, anticipation, excitement, X Games Aspen 2025, Monster Energy, winter attire, visible breath, cold weather, detailed textures, 4K, 16:9, natural lighting, lens flare
(1-2s) Dynamic Descent: The Drop-In
Visual: A snowboarder drops into the halfpipe, gaining speed.
Camera: Initiate a rapid camera movement that starts from a high, wide-angle, top-down perspective at the top of the pipe and quickly pans downwards, following the rider's descent into the halfpipe. The camera movement should create a sense of speed and dynamism.
Veo Instructions: "Generate a high, wide-angle, top-down view of a snowboarder at the top of a halfpipe. The snowboarder drops into the halfpipe. Initiate a fast camera pan that follows the snowboarder's descent. The camera movement should be smooth and dynamic. Duration: 1 second. Speed: Fast. Camera movement: Top-down to eye-level tracking pan. Perspective: Bird's-eye view transitioning to eye-level. Lighting: Natural winter daylight. Focus: Sharp focus on the snowboarder throughout the movement."
Keywords: snowboarder, halfpipe, drop-in, fast camera pan, top-down view, wide-angle, dynamic movement, bird's-eye view, tracking shot, high speed, winter sports, X Games, 4K, 16:9, natural lighting
(2-7s) The Haakon Flip: Slow-Motion Hero Moment
Visual: This is the core of the video. Generate a highly-detailed slow-motion sequence of the snowboarder performing a perfect Haakon Flip.
Haakon Flip Execution:
Rider ascends the halfpipe wall, gaining height.
At the peak, initiate a backside 180-degree rotation.
Simultaneously, the snowboard flips up and over the rider's head (invert).
The rider grabs the board with their hand during the mid-air rotation.
The rider rotates back, completing the flip.
Land smoothly on the opposite wall, facing forward.
Camera: The camera should be positioned at a side-profile perspective, slightly below the athlete (worm's-eye view) to emphasize the height and full rotation of the trick. Maintain a tight focus on the snowboarder throughout the maneuver.
Veo Instructions: "Generate a slow-motion sequence of a snowboarder performing a Haakon Flip. Camera angle: Side-profile, slightly lower angle (worm's-eye view). Focus: Maintain tight focus on the snowboarder. Detail: High level of detail on the snowboarder's form, the board's movement, and the snow spray. The snowboarder should ascend the halfpipe wall, perform a backside 180, flip the board over their head, grab the board mid-air, complete the rotation, and land smoothly on the opposite wall. Duration: 5 seconds. Speed: Slow-motion (approximately 1/8 speed). Resolution: 4K. Style: Photorealistic. Lighting: Natural winter daylight with emphasis on the contrast between light and shadow to highlight the trick."
Keywords: Haakon Flip, slow-motion, snowboard trick, mid-air, backside 180, invert, board grab, aerial maneuver, detailed, athletic, high-resolution, 4K, side-profile, worm's-eye view, snow spray, dynamic lighting, photorealistic, X Games
(7-8s) Triumphant Landing
Visual: The snowboarder lands the Haakon Flip perfectly, continuing down the halfpipe.
Camera: Continue the side-profile, slightly lower angle, tracking the snowboarder as they land.
Veo Instructions: "Generate the landing of a snowboarder after performing a Haakon Flip. Camera angle: side-profile, slightly lower angle, tracking the rider. Focus on the smooth landing and continuation down the halfpipe. Duration: 1 second. Resolution: 4K. Lighting: Natural winter daylight."
Keywords: snowboard landing, successful trick, smooth landing, halfpipe, side-profile, tracking shot, 4K, natural lighting
(Final Moment): Freeze Frame
Visual: Conclude with a freeze frame of the athlete at the apex of the Haakon Flip, captured from the side-profile, slightly lower angle, highlighting the athleticism and skill involved.
Veo Instructions: "Generate a still image (freeze frame) of a snowboarder at the peak of a Haakon Flip. Camera angle: Side-profile, slightly lower angle. Focus: Sharp focus on the snowboarder. Style: Iconic, poster-worthy. Resolution: 4K."
Keywords: freeze frame, Haakon Flip, mid-air, iconic, side-profile, worm's-eye view, snowboarder, apex, skill, athleticism, 4K, poster image
Overall Stylistic Guidance:
Realism: Strive for ultra-photorealistic visuals throughout the video. Pay meticulous attention to details such as snow texture, lighting, reflections, clothing textures, human anatomy, and facial expressions.
Color Palette: Vibrant and energetic, accurately reflecting the X Games atmosphere and Monster Energy branding (green, black, white).
Lighting: Utilize realistic winter lighting conditions. Consider the possibility of lens flares from the sun reflecting off the snow, especially during the drop-in and slow-motion segments.
Motion: Ensure that all movements are fluid, natural, and physically accurate, particularly during the complex slow-motion Haakon Flip.
"""

### 1.2 Create prompt 2: rewrite prompt (Optional)

In [None]:
def rewrite_veo_prompt(
    USER_QUERY,
    project="ace-chatbot-demo",
    location="us-central1",
    model="gemini-exp-1206",
):
    """Ask a Gemini model to polish a draft Veo prompt and return the rewritten text.

    Args:
        USER_QUERY: Draft prompt text to be polished.
        project: Google Cloud project used to create the Vertex AI client.
        location: Vertex AI region used to create the client.
        model: Gemini model ID that performs the rewrite.

    Returns:
        The ``text`` attribute of the ``generate_content`` response.
    """
    client = genai.Client(vertexai=True, project=project, location=location)
    prompt = f"""
     Please polish the following prompt: {USER_QUERY}
     Please return prompt content only
     """

    # Pass `text` by keyword: this form is accepted whether or not the installed
    # SDK version allows the argument positionally.
    contents = [
        types.Content(role="user", parts=[types.Part.from_text(text=prompt)])
    ]

    # All four harm categories are switched off for this rewrite request.
    harm_categories = [
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_HARASSMENT",
    ]
    generate_content_config = types.GenerateContentConfig(
        temperature=1,
        top_p=0.95,
        max_output_tokens=8192,
        response_modalities=["TEXT"],
        safety_settings=[
            types.SafetySetting(category=category, threshold="OFF")
            for category in harm_categories
        ],
    )

    response = client.models.generate_content(
        model=model,
        contents=contents,
        config=generate_content_config,
    )

    return response.text

In [None]:
# Send the current notebook-level `prompt` through the Gemini rewrite helper
# and pretty-print the polished text that comes back.
resp=rewrite_veo_prompt(prompt)
pprint.pprint(resp)

In [None]:
# Replace the notebook-level `prompt` with the rewritten text held in `resp`.
# NOTE(review): after this runs, re-running the rewrite cell polishes the
# already-rewritten prompt rather than the original draft.
prompt =resp

# 2. Veo 2 REST API

In [50]:
# Short text prompt for the REST examples in this section. This reassigns the
# notebook-level `prompt`, replacing whatever earlier cells stored in it.
# NOTE(review): the prompt text contains a typo ("perfoming"); it is left
# unchanged here because the string is the runtime input sent to the model.
prompt = """
create a video shows a snowboarder perfoming a perfect Haakon Flip at superpipe

Haakon Flip Execution:
-Rider ascends the halfpipe wall, gaining height.
-At the peak, initiate a backside 180-degree rotation.
-Simultaneously, the snowboard flips up and over the rider's head (invert).
-The rider grabs the board with their hand during the mid-air rotation.
-The rider rotates back, completing the flip.
-Land smoothly on the opposite wall, facing forward.
"""


### 2.1 Generate video from a text prompt

In [None]:
# Text-to-video request parameters. `prompt` is NOT set in this cell (the
# assignment below is commented out), so the value comes from an earlier cell.
#prompt = "A xgames snowboarder dress as super hero with 'G' on chest, performs 'triple cork' in half pipe"  # @param {type: 'string'}
aspect_ratio = "16:9"  # @param ["16:9", "9:16"]
#output_gcs = "gs://ace-chatbot-demo-bucket"  # @param {type: 'string'}
output_gcs = "gs://dw-veo2-testing"  # @param {type: 'string'}
negative_prompt = ""  # @param {type: 'string'}
seed = 200
sample_count = 1

# `text_to_video` is defined outside this section of the notebook; its result
# is kept in `ttv` and passed to `show_video` in the next cell.
ttv = text_to_video(prompt, seed, aspect_ratio, sample_count, output_gcs, negative_prompt)


In [None]:
# Display the result of the text-to-video request stored in `ttv`
# (`show_video` is defined outside this section of the notebook).
show_video(ttv)

In [None]:
# prompts =list(json.loads(resp).values())
# for prompt in prompts:
#     op = text_to_video(prompt, seed, aspect_ratio, sample_count, output_gcs, rewrite_prompt)
#     show_video(op)

### 2.2 Generate video from an image and text prompt

In [None]:
# Image-to-video request parameters. `prompt` is NOT set in this cell (the
# assignment below is commented out), so the value comes from an earlier cell.
#prompt = ""  # @param {type: 'string'}
image_gcs = "gs://dw-veo2-testing/img1.png"  # @param {type: 'string'}
aspect_ratio = "16:9"  # @param ["16:9", "9:16"]
output_gcs = "gs://dw-veo2-testing"  # @param {type: 'string'}
negative_prompt = ""  # @param {type: 'string'}
seed = 200
sample_count = 1

# `image_to_video` is defined outside this section of the notebook; its result
# is kept in `itv` and passed to `show_video` in the next cell.
itv = image_to_video(
    prompt, image_gcs, seed, aspect_ratio, sample_count, output_gcs, negative_prompt
)


In [62]:
# Display the result of the image-to-video request stored in `itv`
# (`show_video` is defined outside this section of the notebook).
show_video(itv)

# 3. Python SDK (Internal project WIP)

In [None]:
# Veo API reference:
# https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/veo-video-generation

# Copy the google_genai 0.5.0 wheel from Cloud Storage into the working
# directory, then swap it in for any installed `google-genai` package.
# NOTE(review): this needs read access to the bucket below — confirm it is
# reachable before running. Restarting the kernel after the install may be
# required for the new package to be picked up.
!gsutil cp gs://unified-genai-dev/alexey-veo/google_genai-0.5.0-py3-none-any.whl .
# Alternative wheel location (disabled):
#!gsutil ls gs://veo_genai_sdk/
#!gsutil cp gs://veo_genai_sdk/google_genai-0.5.0-py3-none-any.whl .
%pip uninstall -y google-genai
%pip install google_genai-0.5.0-py3-none-any.whl

# Installing the package from the package index instead (disabled):
# %pip install -q google-genai

In [2]:
import time
from google.genai import types

In [3]:
# Create the Vertex AI-backed client used by the SDK examples below.
# The project and region are hard-coded; change them to match your environment.
from google import genai
vertex_client = genai.Client(
    vertexai=True, project='veo-testing', location='us-central1'
)

In [1]:
import mediapy as media
def show_sdk_video(op):
    print(op)
    if op.generate_videos_response.videos:
        for video in op.generate_videos_response.videos:
            gcs_uri = video.uri
            file_name = gcs_uri.split("/")[-1]
            !gsutil cp {gcs_uri} {file_name}
            media.show_video(media.read_video(file_name), height=500)

In [None]:
prompt ="""
 Generate a photorealistic 8-second video of Snowboard SuperPipe competition at X Games Aspen 2025. The video should capture the high-energy atmosphere and focus on a snowboarder successfully executing a complex Haakon Flip maneuver, followed by a roaring crowd reaction.
Video Structure & Key Moments with Veo-Specific Instructions:
(0-1s) Crowd Anticipation: Establishing Shot
Visual: Close-up shot of a diverse and energetic crowd at the base of the halfpipe. Faces should display expressions of excitement, anticipation, and awe. Include authentic winter attire, visible breath in the cold air, and prominent Monster Energy branding (banners, clothing, etc.).
Camera: Start with a tight close-up on individual faces, then gradually widen to reveal the larger crowd.
Veo Instructions: "Generate a close-up shot of a diverse crowd at a winter sporting event. Faces should show excitement and anticipation. Include details like winter clothing, visible breath, and Monster Energy branding. The shot should gradually widen to reveal a larger crowd scene. Resolution: 4K. Aspect ratio: 16:9. Lighting: Bright, natural winter daylight with potential for lens flares. Style: Photorealistic. Focus: Sharp focus on individual faces in the foreground with a gradual softening of focus towards the background. Crowd density: High. Duration: 1 second."
Keywords: close-up, diverse crowd, anticipation, excitement, X Games Aspen 2025, Monster Energy, winter attire, visible breath, cold weather, detailed textures, 4K, 16:9, natural lighting, lens flare
(1-2s) Dynamic Descent: The Drop-In
Visual: A snowboarder drops into the halfpipe, gaining speed.
Camera: Initiate a rapid camera movement that starts from a high, wide-angle, top-down perspective at the top of the pipe and quickly pans downwards, following the rider's descent into the halfpipe. The camera movement should create a sense of speed and dynamism.
Veo Instructions: "Generate a high, wide-angle, top-down view of a snowboarder at the top of a halfpipe. The snowboarder drops into the halfpipe. Initiate a fast camera pan that follows the snowboarder's descent. The camera movement should be smooth and dynamic. Duration: 1 second. Speed: Fast. Camera movement: Top-down to eye-level tracking pan. Perspective: Bird's-eye view transitioning to eye-level. Lighting: Natural winter daylight. Focus: Sharp focus on the snowboarder throughout the movement."
Keywords: snowboarder, halfpipe, drop-in, fast camera pan, top-down view, wide-angle, dynamic movement, bird's-eye view, tracking shot, high speed, winter sports, X Games, 4K, 16:9, natural lighting
(2-7s) The Haakon Flip: Slow-Motion Hero Moment
Visual: This is the core of the video. Generate a highly-detailed slow-motion sequence of the snowboarder performing a perfect Haakon Flip.
Haakon Flip Execution:
Rider ascends the halfpipe wall, gaining height.
At the peak, initiate a backside 180-degree rotation.
Simultaneously, the snowboard flips up and over the rider's head (invert).
The rider grabs the board with their hand during the mid-air rotation.
The rider rotates back, completing the flip.
Land smoothly on the opposite wall, facing forward.
Camera: The camera should be positioned at a side-profile perspective, slightly below the athlete (worm's-eye view) to emphasize the height and full rotation of the trick. Maintain a tight focus on the snowboarder throughout the maneuver.
Veo Instructions: "Generate a slow-motion sequence of a snowboarder performing a Haakon Flip. Camera angle: Side-profile, slightly lower angle (worm's-eye view). Focus: Maintain tight focus on the snowboarder. Detail: High level of detail on the snowboarder's form, the board's movement, and the snow spray. The snowboarder should ascend the halfpipe wall, perform a backside 180, flip the board over their head, grab the board mid-air, complete the rotation, and land smoothly on the opposite wall. Duration: 5 seconds. Speed: Slow-motion (approximately 1/8 speed). Resolution: 4K. Style: Photorealistic. Lighting: Natural winter daylight with emphasis on the contrast between light and shadow to highlight the trick."
Keywords: Haakon Flip, slow-motion, snowboard trick, mid-air, backside 180, invert, board grab, aerial maneuver, detailed, athletic, high-resolution, 4K, side-profile, worm's-eye view, snow spray, dynamic lighting, photorealistic, X Games
(7-8s) Triumphant Landing
Visual: The snowboarder lands the Haakon Flip perfectly, continuing down the halfpipe.
Camera: Continue the side-profile, slightly lower angle, tracking the snowboarder as they land.
Veo Instructions: "Generate the landing of a snowboarder after performing a Haakon Flip. Camera angle: side-profile, slightly lower angle, tracking the rider. Focus on the smooth landing and continuation down the halfpipe. Duration: 1 second. Resolution: 4K. Lighting: Natural winter daylight."
Keywords: snowboard landing, successful trick, smooth landing, halfpipe, side-profile, tracking shot, 4K, natural lighting
(Final Moment): Freeze Frame
Visual: Conclude with a freeze frame of the athlete at the apex of the Haakon Flip, captured from the side-profile, slightly lower angle, highlighting the athleticism and skill involved.
Veo Instructions: "Generate a still image (freeze frame) of a snowboarder at the peak of a Haakon Flip. Camera angle: Side-profile, slightly lower angle. Focus: Sharp focus on the snowboarder. Style: Iconic, poster-worthy. Resolution: 4K."
Keywords: freeze frame, Haakon Flip, mid-air, iconic, side-profile, worm's-eye view, snowboarder, apex, skill, athleticism, 4K, poster image
Overall Stylistic Guidance:
Realism: Strive for ultra-photorealistic visuals throughout the video. Pay meticulous attention to details such as snow texture, lighting, reflections, clothing textures, human anatomy, and facial expressions.
Color Palette: Vibrant and energetic, accurately reflecting the X Games atmosphere and Monster Energy branding (green, black, white).
Lighting: Utilize realistic winter lighting conditions. Consider the possibility of lens flares from the sun reflecting off the snow, especially during the drop-in and slow-motion segments.
Motion: Ensure that all movements are fluid, natural, and physically accurate, particularly during the complex slow-motion Haakon Flip.
"""

### 3.1 SDK text to video

In [None]:
# Start a text-to-video generation with the SDK client, using the
# notebook-level `prompt`; generated videos are written under `output_gcs_uri`.
vertex_operation = vertex_client.models.generate_videos(
    model="veo-2.0-generate-exp",
    prompt=prompt,
    config=types.GenerateVideosConfig(
        # Optional inputs (disabled here):
        # Only works on Veo2 models at this moment
        # image=types.Image(
        #     gcs_uri="gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png",
        #     mime_type="image/png",
        # ),
        # Only works on Veo2 models at this moment
        # video=types.SourceVideo(
        #     # FPS must match. The default FPS of generated videos is 24.
        #     gcs_uri="gs://unified-genai-tests/tmp/genai/video/outputs/17473846688579026039/sample_0.mp4"
        # ),
        output_gcs_uri="gs://dw-veo2-testing/outputs",
        number_of_videos=1,
        fps=24,
        duration_seconds=8,
        seed=1,
        aspect_ratio="16:9",
        resolution="720p",
        person_generation="allow_adult",
        negative_prompt="ugly, low quality",
    ),
)
print(vertex_operation)
# Poll every 20 seconds, refreshing the operation by name, until it reports done.
# NOTE(review): the loop has no timeout and does not inspect the operation for
# errors — confirm how failures are surfaced by this SDK build.
while not vertex_operation.done:
    time.sleep(20)
    vertex_operation = vertex_client.models.get_generate_videos_operation(vertex_operation.name)
    print(vertex_operation)

# Last expression of the cell: shows the response of the finished operation.
vertex_operation.generate_videos_response

In [None]:
# Show the text-to-video operation object as this cell's output.
vertex_operation

In [12]:
# Download and display the video(s) produced by the text-to-video operation.
show_sdk_video(vertex_operation)

### 3.2 SDK image to video

In [None]:
# Image to video

In [None]:
# Preview the input image for the image-to-video example: authenticate in
# Colab, download the image from Cloud Storage, and render it inline.
from google.colab import auth
auth.authenticate_user()
!gcloud config set project veo-testing

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from io import BytesIO
from PIL import Image
from google.cloud import storage

# Initialize a GCS client
storage_client = storage.Client(project='veo-testing')

# Replace with your bucket and image file
bucket_name = "dw-veo2-testing"
image_blob_name = "img1.png"

# Get the image blob
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(image_blob_name)

# Download the image as bytes
image_bytes = blob.download_as_bytes()

# Open the image using PIL
image = Image.open(BytesIO(image_bytes))

# Display the image using matplotlib
plt.imshow(image)
plt.axis('off')  # Hide axes
plt.show()

In [None]:
# Image to video
# Image to video: animate a Cloud Storage image, guided by the notebook-level
# `prompt`; generated videos are written under `output_gcs_uri`.
import time
from google.genai import types

vertex_operation3 = vertex_client.models.generate_videos(
    model="veo-2.0-generate-exp",
    prompt=prompt,
    config=types.GenerateVideosConfig(
        image=types.Image(
            # Alternative sample image:
            # gcs_uri="gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png",
            gcs_uri="gs://dw-veo2-testing/img1.png",
            # The MIME type must describe the referenced object: this is a
            # .png file ("image/jpg" is not a registered MIME type).
            mime_type="image/png",
        ),
        # Optional source video (disabled here):
        # video=types.SourceVideo(
        #     # FPS must match. The default FPS of generated videos is 24.
        #     uri="gs://dw-veo2-testing/sample_videos/mens_Kaishu Hirano_2_trick1_24fps.mp4"
        # ),
        output_gcs_uri="gs://dw-veo2-testing/outputs",
        number_of_videos=1,
        fps=24,
        duration_seconds=8,
        seed=1,
        aspect_ratio="16:9",
        resolution="720p",
        person_generation="allow_adult",
        # pubsub_topic="projects/<my-project>/topics/video-generation-test",
        negative_prompt="ugly, low quality",
        # enable_prompt_rewriting=True
    ),
)
print(vertex_operation3)
# Poll every 20 seconds, refreshing the operation by name, until it reports done.
while not vertex_operation3.done:
    time.sleep(20)
    vertex_operation3 = vertex_client.models.get_generate_videos_operation(vertex_operation3.name)
    print(vertex_operation3)

# Last expression of the cell: shows the response of the finished operation.
vertex_operation3.generate_videos_response

In [38]:
# Download and display the video(s) produced by the image-to-video operation.
show_sdk_video(vertex_operation3)

# References
1. Internal SDK: https://colab.sandbox.google.com/drive/1QhfpazPc6v7xfTd5ev-1fX1pr3IDGB8X?resourcekey=0-v9htEwMli1Jo1Bt-no79_g
2. https://cloud.google.com/vertex-ai/generative-ai/docs/video/generate-videos
3. https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/veo-video-generation

# Appendix

# Re-create videos

In [23]:
import io
import json
import os
import subprocess
import sys
import tempfile
import time
from pathlib import Path
from typing import Any, Dict
import glob
import cv2
from PIL import Image

from matplotlib import pyplot as plt
import numpy as np
from io import BytesIO
from google.colab.patches import cv2_imshow

import numpy as np

import re

import PIL
import numpy as np

In [22]:
import os

import cv2
def get_frames_only(video_path, FRAME_DIR):
  """Decode a video and save every frame as a JPEG file in FRAME_DIR.

  Frames are written as ``<video_prefix>_<frame_number>.jpg``, where
  ``video_prefix`` is the video's file name up to its first '.', and
  ``frame_number`` starts at 0.

  Args:
    video_path: Path of the video file to read.
    FRAME_DIR: Directory that receives the frame images; created if missing.

  Raises:
    IOError: If the video cannot be opened or a frame cannot be written.
  """
  cap = cv2.VideoCapture(video_path)
  if not cap.isOpened():
    cap.release()
    raise IOError(f"Could not open video: {video_path}")

  # cv2.imwrite does not create missing directories, so make sure it exists.
  os.makedirs(FRAME_DIR, exist_ok=True)
  video_prefix = os.path.basename(video_path).split('.')[0]

  frame_number = 0
  try:
    while True:
      ret, frame = cap.read()
      if not ret:
        break
      # Join paths instead of prefixing "./" so an absolute FRAME_DIR is
      # honoured rather than being reinterpreted relative to the cwd.
      frame_path = os.path.join(FRAME_DIR, f'{video_prefix}_{frame_number}.jpg')
      if not cv2.imwrite(frame_path, frame):
        raise IOError(f"Could not write frame: {frame_path}")
      frame_number += 1
  finally:
    # Release the capture even when reading or writing fails part-way.
    cap.release()

In [27]:
def sort_by_last_number(file_list):
  """Return the filenames ordered by the final run of digits in each name.

  Args:
    file_list: Iterable of filename strings; each must contain at least one
      digit.

  Returns:
    A new list with the same names, ascending by the integer value of the
    last digit run found in each name.
  """
  def trailing_number(name):
    digit_runs = re.findall(r'\d+', name)
    return int(digit_runs[-1])

  ordered = list(file_list)
  ordered.sort(key=trailing_number)
  return ordered


def create_video_from_frames(frames_folder, output_video_path, fps=24):
  """
  Creates a video from a sequence of image frames.

  Frames are the .png/.jpg/.jpeg files in ``frames_folder``, ordered by the
  last number in each file name. The first frame's dimensions are used as the
  video size. Problems are reported with a printed message rather than raised.

  Args:
    frames_folder: Path to the folder containing the image frames.
    output_video_path: Path to the output video file (e.g., "output.mp4").
    fps: Frames per second for the output video. Default is 24.
  """

  try:
    # Get a list of image files in the folder
    image_file_list = [f for f in os.listdir(frames_folder) if f.endswith(('.png', '.jpg', '.jpeg'))]
    if not image_file_list:
      print(f"Error creating video: no image frames found in {frames_folder}")
      return
    image_files = sort_by_last_number(image_file_list)  # Make sure frames are in order

    # Read the first image to get dimensions
    first_frame = cv2.imread(os.path.join(frames_folder, image_files[0]))
    if first_frame is None:
      print(f"Error creating video: could not read frame {image_files[0]}")
      return
    height, width = first_frame.shape[:2]

    # Create a VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Codec for MP4
    video_writer = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))

    try:
      # Write each image frame to the video
      for image_file in image_files:
        frame = cv2.imread(os.path.join(frames_folder, image_file))
        if frame is None:
          # An unreadable file should not abort the whole video.
          print(f"Skipping unreadable frame: {image_file}")
          continue
        video_writer.write(frame)
    finally:
      # Release the writer even if a frame fails, so the file handle is closed.
      video_writer.release()
    print(f"Video created: {output_video_path}")

  except Exception as e:
    print(f"Error creating video: {e}")

In [24]:
def list_files(folder_path, extension=""):
    """Return the sorted paths in a folder whose names match ``*<extension>``.

    Args:
        folder_path: Directory to search with a ``*<extension>`` glob pattern.
        extension: Suffix appended to the ``*`` wildcard. The default empty
            string leaves the pattern as a bare ``*``.

    Returns:
        A list of matching paths (each prefixed with ``folder_path``), sorted
        in ascending string order.
    """
    pattern = os.path.join(folder_path, "*{}".format(extension))
    matches = glob.glob(pattern)
    matches.sort()
    return matches

In [28]:
def create_folder_if_not_exists(folder_path):
  """Make sure ``folder_path`` exists, creating it when it is absent.

  Missing parent directories are created as well. A confirmation message is
  printed only when the folder is actually created.

  Args:
    folder_path: The path to the folder to create.
  """
  if os.path.exists(folder_path):
    return
  os.makedirs(folder_path)
  print(f"Folder '{folder_path}' created.")

In [None]:
# Collect the sorted paths of everything under /content/videos and show the
# first one as the cell output.
# NOTE(review): `videos[0]` raises IndexError when the folder is empty or missing.
videos = list_files("/content/videos")
videos[0]

In [None]:
# Pick the first listed video and derive its name: the final path component
# up to the first '.'.
video = videos[0]
video_file = video.split("/")[-1]
video_name = video_file.split(".")[0]
print(video_name)

# Extracted frames for this video go under ori/<video_name>/frames.
FRAME_DIR = "ori/" + video_name + "/frames"
create_folder_if_not_exists(FRAME_DIR)


In [30]:
# Write every frame of the first listed video into FRAME_DIR.
get_frames_only(videos[0], FRAME_DIR)

In [None]:
# Rebuild a video from the frames in FRAME_DIR.
video_name = video.split("/")[-1].split(".")[0]

# Each source video gets its own sub-folder under the output root.
OUTPUT_VIDEO_PATH = '/content/output_videos'
video_output_dir = f"{OUTPUT_VIDEO_PATH}/{video_name}"
create_folder_if_not_exists(OUTPUT_VIDEO_PATH)
create_folder_if_not_exists(video_output_dir)

# 24 fps; the frame rate is also encoded in the output file name.
fps = 24
output_video_file = f"{video_output_dir}/{video_name}_{int(fps)}fps.mp4"
create_video_from_frames(FRAME_DIR, output_video_file, fps)

