In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Identification of Scene Transitions in Movies Using Gemini

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/video-analysis/scene_transition.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fuse-cases%2Fvideo-analysis%2Fscene_transition.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/use-cases/video-analysis/scene_transition.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/video-analysis/scene_transition.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/video-analysis/scene_transition.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/video-analysis/scene_transition.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/video-analysis/scene_transition.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/video-analysis/scene_transition.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/video-analysis/scene_transition.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

| Author |
| --- |
| [Vijaylaxmi Lendale](https://github.com/VJlaxmi) |

## Overview

- This notebook demonstrates how to use Gemini to automatically detect **scene transitions** in videos using both **video content** and associated **subtitle (VTT) files**.

## Get started

### Install Google Gen AI SDK and other required packages


In [None]:
%pip install --upgrade --quiet google-genai

### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
import sys

# Authenticate only when the `google.colab` module is already loaded in this
# kernel; in any other environment this cell does nothing.
if "google.colab" in sys.modules:
    # Imported inside the guard, so the import is never attempted elsewhere.
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Resolve the Google Cloud project (form field first, environment variable as
# the fallback) and create the Vertex AI client used by the rest of the notebook.
import os

from google import genai

PROJECT = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
LOCATION = "us-central1"

# Use the environment variable if the user doesn't provide Project ID.
if not PROJECT or PROJECT == "[your-project-id]":
    # Default to "" instead of wrapping the lookup in str(): str(None) is the
    # truthy string "None", which would be passed on as if it were a project ID.
    PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT", "")

# Fail here with an actionable message rather than later inside an API call.
if not PROJECT:
    raise ValueError(
        "No Google Cloud project configured: set PROJECT in this cell or "
        "define the GOOGLE_CLOUD_PROJECT environment variable."
    )

client = genai.Client(vertexai=True, project=PROJECT, location=LOCATION)

### Import libraries

In [None]:
from google.genai.types import GenerateContentConfig, Part

### Output schema and model config

In [None]:
from pydantic import BaseModel, Field


# Both timestamps must match the same two-digit HRS:MIN:SEC shape; the regex
# lives in one place so the two fields cannot drift apart.
_HMS_PATTERN = "^([0-9]{2}):([0-9]{2}):([0-9]{2})$"


class Scene(BaseModel):
    # One scene of the output: its ordinal, start/end timestamps and a short
    # summary. All four fields are required (`...` is passed as the default).
    # NOTE(review): documented with comments on purpose; a class docstring
    # would presumably surface in the generated schema, so confirm before adding one.

    scene_number: int = Field(
        ...,
        description="The scene number in sequential order, e.g., 1, 2, 3.",
    )

    # Scene boundaries, each constrained by the shared timestamp pattern.
    start_time: str = Field(
        ...,
        description="The start time of the scene in the format HRS:MIN:SEC (e.g., 01:15:30, 00:12:45).",
        pattern=_HMS_PATTERN,
    )
    end_time: str = Field(
        ...,
        description="The end time of the scene in the format HRS:MIN:SEC (e.g., 01:20:45, 00:20:18).",
        pattern=_HMS_PATTERN,
    )

    description: str = Field(
        ...,
        description="A brief description of the scene.",
    )

In [None]:
# Model identifier passed as `model=` to `client.models.generate_content` below.
MODEL_ID = "gemini-2.0-flash"

In [None]:
# Generation settings passed as `config=` to the `generate_content` call.
config = GenerateContentConfig(
    # Output format: a JSON document shaped as a list of `Scene` objects.
    response_mime_type="application/json",
    response_schema=list[Scene],
    # Sampling controls.
    temperature=0,
    top_p=1,
    # Upper bound on the length of the generated response.
    max_output_tokens=8000,
)

In [None]:
# Cloud Storage locations of the sample video and its matching subtitle file.
input_video_path = "gs://github-repo/generative-ai/gemini/use-cases/scene-transition/video3.mp4"
input_vtt_path = "gs://github-repo/generative-ai/gemini/use-cases/scene-transition/video3.vtt"

# Echo both inputs, one per line, so the run records what was analyzed.
print(
    f"Input Video Path: {input_video_path}",
    f"Input VTT Path: {input_vtt_path}",
    sep="\n",
)

### Prompt

In [None]:
# Prompt text sent as the first content part of the request below; it states
# the criteria the model should use to decide where one scene ends and the
# next begins. The string is sent verbatim, so edits here change model input.
base_instructions = """
You are a multimodal Scene Boundary Detector.
Your task is to analyze a video along with its VTT (subtitle) file to identify cohesive and meaningful scene transitions, 
ensuring accurate segmentation into self-contained scenes with distinct narrative arcs.
** Key criteria for identifying scene transitions:** 
- Narrative Changes: Transitions must reflect a significant shift in story elements such as location, time, characters, or topic of dialogue.
- Don't treat jump-cuts or insert shots as transitions unless they signify meaningful narrative shifts.
- Visual Cues: Changes in location, character appearance, or recognition of visual elements strongly indicate scene changes.
- Dialogue Topics: Continuous dialogues between a stable set of characters typically belong to a single scene; changes in dialogue themes can signal scene transitions.
- Audio Elements: Shifts in background music or sound effects often accompany scene transitions, reinforcing narrative changes.
- Cohesion: Ensure each identified scene is cohesive, with a contained beginning, middle, and end, contributing to the overarching narrative.
Ensure that the scene transition timestamps you identify strictly fall within the start and end time boundaries of the input video, and accurately reflect the exact position of the scene boundaries in the video.
"""

In [None]:
# Request parts, in the order they are sent: the instruction text, a short
# directive, then the video and its subtitle file referenced by URI.
request_contents = [
    Part.from_text(text=base_instructions),
    "Apply the scene transition instructions above.",
    Part.from_uri(file_uri=input_video_path, mime_type="video/mp4"),
    Part.from_uri(file_uri=input_vtt_path, mime_type="text/vtt"),
]

# Single model call; `config` carries the JSON schema for the output.
response = client.models.generate_content(
    model=MODEL_ID, contents=request_contents, config=config
)

In [None]:
# Collect the structured result. `model_outputs` is a list of scene lists: one
# entry per response that was successfully parsed into `Scene` objects.
model_outputs = []
try:
    parsed_scenes = response.parsed
except (AttributeError, TypeError) as e:
    print(f"Couldn't access parsed response: {e}")
else:
    if parsed_scenes is None:
        # `parsed` is None when the SDK could not build the schema objects
        # from the model output (for example, JSON cut off by the output-token
        # limit). Appending it would make the loop below raise TypeError.
        print(
            "Model response could not be parsed into scenes; "
            "inspect response.text for the raw output."
        )
    else:
        model_outputs.append(parsed_scenes)

# Flatten list of lists and print scene transitions
for scenes in model_outputs:
    for scene in scenes:
        print(scene)