## パッケージ導入

In [None]:
!pip install --upgrade google-genai japanize_matplotlib

In [None]:
# Shut the kernel down with the restart flag set so that the freshly
# installed packages are loaded by the following cells.
import IPython
app = IPython.Application.instance()
_ = app.kernel.do_shutdown(True)  # NOTE(review): True is presumably ipykernel's "restart" argument — confirm

## 事前準備

In [None]:
import vertexai
PROJECT_ID = !gcloud config get-value project
PROJECT_ID = PROJECT_ID[-1]
LOCATION = 'us-central1'

vertexai.init(project=PROJECT_ID, location='us-central1')

BUCKET = f'gs://{PROJECT_ID}-handson'

In [None]:
# Two variants of the same movie under <BUCKET>/mp4/, distinguished only by
# the "s_" / "n_" filename prefix. In this notebook target_s is used for the
# speech transcription and target_n for every other request.
_movie_name = 'Google Cloud Next Tokyo ’24 - Innovators Hive の Day 0 に潜入！ -.mp4'
target_s = '{}/mp4/s_{}'.format(BUCKET, _movie_name)
target_n = '{}/mp4/n_{}'.format(BUCKET, _movie_name)

In [None]:
import base64, copy, json, os, re, time, uuid
from io import BytesIO
import matplotlib.pyplot as plt
from PIL import Image

import vertexai
from google.cloud import storage
from google import genai
from google.genai import types
from google.genai.types import (
    HttpOptions, GenerateContentConfig, GenerateImagesConfig,
    Part, UserContent, ModelContent,
)

import os, json, datetime, pprint
import numpy as np

from IPython.display import Image, display, HTML
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import japanize_matplotlib

In [None]:
def generate_response(system_instruction, contents,
                      response_schema, model='gemini-2.0-flash-001'):
    """Send one request to Gemini on Vertex AI and return the reply as text.

    Args:
        system_instruction: System instruction for the model (the notebook
            also passes an empty string).
        contents: Request contents, e.g. a UserContent built from Parts.
        response_schema: Schema dict for structured JSON output, or None for
            a free-form text reply.
        model: ID of the model to call.

    Returns:
        The text of all non-empty parts of the first candidate, joined with
        newlines.

    Raises:
        RuntimeError: If the response has no candidate, or the first
            candidate carries no content parts (for example a blocked
            request). The message includes the finish reason and prompt
            feedback when the response provides them.
    """
    client = genai.Client(vertexai=True,
                          project=PROJECT_ID, location=LOCATION,
                          http_options=HttpOptions(api_version='v1'))

    config_kwargs = dict(
        system_instruction=system_instruction,
        temperature=0.1,
    )
    # Only force JSON output when a schema is supplied. Callers that pass
    # response_schema=None ask for a comma-separated list in their prompt,
    # which contradicts an 'application/json' response type.
    if response_schema is not None:
        config_kwargs.update(
            response_mime_type='application/json',
            response_schema=response_schema,
        )

    response = client.models.generate_content(
        model=model,
        contents=contents,
        config=GenerateContentConfig(**config_kwargs),
    )

    # Fail with a readable message instead of an IndexError/AttributeError/
    # TypeError when the model returned nothing usable.
    candidates = response.candidates or []
    first = candidates[0] if candidates else None
    content = getattr(first, 'content', None)
    parts = getattr(content, 'parts', None)
    if parts is None:
        raise RuntimeError(
            'The model returned no content '
            f'(finish_reason={getattr(first, "finish_reason", None)}, '
            f'prompt_feedback={getattr(response, "prompt_feedback", None)})'
        )
    return '\n'.join(p.text for p in parts if p.text)

In [None]:
def load_json(text):
    """Parse a model reply as JSON after removing Markdown code fences.

    Every "```json" and "```" marker is deleted and every newline is turned
    into a space before the remainder is handed to json.loads.
    """
    # Order matters: "```json" has to go before the bare "```" marker.
    substitutions = (('```json', ''), ('```', ''), ('\n', ' '))
    for old, new in substitutions:
        text = text.replace(old, new)
    return json.loads(text)

In [None]:
# Long-format listing of the objects under <BUCKET>/mp4/ whose names start with "s".
!gcloud storage ls --long {BUCKET}/mp4/s*

In [None]:
# Derive the movie's base name: the object name of target_s without the
# ".mp4" extension and without the "s_" prefix.
filename = target_s.split('/')[-1]
# str.rstrip/str.lstrip take a *set of characters*, so rstrip('.mp4') and
# lstrip('s_') can eat extra characters (e.g. a title ending in "p" or
# starting with "s"). Remove the exact suffix and prefix instead.
basename = filename
if basename.endswith('.mp4'):
    basename = basename[:-len('.mp4')]
if basename.startswith('s_'):
    basename = basename[len('s_'):]

# Location of the still images in Cloud Storage and of their local copy.
image_dir = f'{BUCKET}/image/{basename}'
# Not referenced by the cells shown here; kept in case another cell uses it.
gsutil_opt = '-o GSUtil:parallel_composite_upload_threshold=150M'
local_image_dir = basename

In [None]:
# Copy the still-image files to the local working directory.
!gcloud storage cp --recursive "{image_dir}" ./

## キャラクター抽出 (Optional)

In [None]:
# Prompt for the optional character-extraction step. Each [condition] line
# is prefixed with the letter of the task it constrains; the "more than three
# sentences" rule concerns the visual description produced by task A (it was
# labelled "B", which is the name-identification task).
system_instruction = '''\
You are a video content editor. Work on the following tasks.

[task]
A. Find characters in the movie and describe the visual appearance of each character as detailed as possible.
B. Identify the name of each character you found on task A. If you cannot identify the name, name it "Unknown".
C. Identify representative scenes for each character where the character visually appears on the screen.

[condition]
A. The description has more than three sentences.
C. The number of scenes for each character is at most three. Each scene is identified with timestamp mm:ss-mm:ss.

[format instruction]
In Japanese. Output is a JSON list of "character dict". "character dict" is a JSON dict in the following format:
{
  "Character ID": <Sequential number starting from 1>,
  "Name": "<Character name>",
  "Visual Description": "<Visual appearance>",
  "Scenes": [ "<list of timestamp of representative scenes>" ]
}
'''

# Structured-output schema mirroring the "character dict" described above.
response_schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "Character ID": {
                "type": "string",
                "description": "Sequential number starting from 1",
            },
            "Name": {
                "type": "string",
                "description": "Character name",
            },
            "Visual Description": {
                "type": "string",
                "description": "Visual appearance of the character",
            },
            "Scenes": {
                "type": "array",
                "items": {"type": "string"},
                "description": "list of timestamp of representative scenes",
            },
        },
        "required": [
            "Character ID",
            "Name",
            "Visual Description",
            "Scenes",
        ],
    },
}

In [None]:
# Character extraction: the task is given through system_instruction, the
# user content only carries a "[movie]" label and the movie itself.
contents = UserContent([
    Part.from_text(text='[movie]'),
    Part.from_uri(file_uri=target_n, mime_type='video/mp4'),
])

result = generate_response(
    system_instruction=system_instruction,
    contents=contents,
    response_schema=response_schema,
    model='gemini-2.0-flash-001',
)
print(result)

In [None]:
scenes = load_json(result)

# Highest frame number among the local still images (captureNNNN.jpg).
# A regex is used because str.lstrip/str.rstrip strip *character sets*, not
# a prefix/suffix; entries that are not frame files are simply ignored.
jpg_files = sorted(os.listdir(local_image_dir)) if os.path.isdir(local_image_dir) else []
m = 0
for fname in jpg_files:
    matched = re.fullmatch(r'capture(\d+)\.jpg', fname)
    if matched:
        m = max(m, int(matched.group(1)))

for item in scenes:
    print('=====')
    print(item['Character ID'], item['Name'], item['Visual Description'])
    for scene_ts in item['Scenes'][:3]:
        # Expected format is "mm:ss-mm:ss"; anything else is reported and skipped
        # instead of aborting the whole cell.
        try:
            ts1, ts2 = scene_ts.split('-')
            m1, s1 = ts1.split(':')
            m2, s2 = ts2.split(':')
            # Frame N is labelled as second N-1 of the movie, hence the +1.
            ss1 = int(m1) * 60 + int(s1) + 1
            ss2 = int(m2) * 60 + int(s2) + 1
        except ValueError:
            print(f'Skipping malformed timestamp: {scene_ts!r}')
            continue
        if m < 1:
            print(f'No still images found in {local_image_dir!r}')
            continue

        # Clamp both ends to the available frames so a timestamp beyond the
        # end of the movie cannot request a file that does not exist.
        first = min(max(1, ss1), m)
        last = max(first, min(ss2, m))

        fig, axs = plt.subplots(1, 5, figsize=(15, 2.5))  # Adjust figsize as needed
        fig.suptitle(str(item['Character ID']) + ': ' + item['Name'], fontsize=16)
        for ax, frame in zip(axs.ravel(), np.linspace(first, last, 5)):
            frame = int(frame)
            mm, ss = divmod(frame - 1, 60)
            img = mpimg.imread(f'{local_image_dir}/capture{frame:04d}.jpg')
            ax.imshow(img)
            ax.axis('off')
            ax.set_title(f'{mm:02d}:{ss:02d}')
        plt.show()
        # Close the figure explicitly; plt.clf() after plt.show() would leave
        # an empty figure behind on every iteration.
        plt.close(fig)

キャラクター名にはハルシネーションが含まれている可能性があります。外見の記述と参考画像をもとにネット検索で正しい名前を確認し、参照用の画像ファイルを別途用意して `{image_dir}/characters/<キャラクター名>.png` として保存してください（ファイル名がそのままキャラクター名として使われます）。

In [None]:
# List the reference PNG files stored under <image_dir>/characters/ and
# display the captured output lines.
chars = !gcloud storage ls "{image_dir}/characters/*.png"
chars

In [None]:
# Build prompt parts that pair each character name with its reference image.
prompt_reference = [
    '\n[Reference information to identify character names]'
]
image_files = {}
for item in chars:
    item = item.strip()
    # Ignore anything in the gcloud output that is not a PNG object path.
    if not item.endswith('.png'):
        continue
    # The character name is the file name without its extension. Slice the
    # exact suffix off: rstrip('.png') strips a character set and would
    # truncate names ending in "p", "n", "g" or ".".
    name = item.split('/')[-1][:-len('.png')]
    image_files[name] = item
    prompt_reference += [
        f'The name of following character is "{name}"',
        Part.from_uri(file_uri=image_files[name], mime_type='image/png')
    ]

prompt_reference

## 動画サマリー

In [None]:
# The following steps put the task description into the user content, so the
# system instruction passed through this variable is left empty.
system_instruction = ''

In [None]:
# Prompt for the title and summary of the movie.
prompt_summary = '''\
You are a video content editor. Work on the following tasks.

[task]
A. Give an one sentence summary of the movie that can be used as a title of the movie.
B. Summarize the content of the movie in five to ten sentences.

[condition]
A, B. If possible, identify the names of characters. Use the full name on every part of the output.
A, B. The output should be based on the objective information in the movie.

[format instruction]
Answer in Japanese. In the JSON dict with the following format:
{
  "Title": "<Output of Task A>",
  "Summary": "<Output of Task B>"
}
'''

# Optional: append the character reference images built in the optional
# character section.
# NOTE(review): enabling this turns prompt_summary into a list, so the cell
# that wraps it in Part.from_text(text=...) has to be adapted as well.
# prompt_summary = [prompt_summary] + prompt_reference

# The prompt asks for a single JSON dict, so the schema describes an object.
# It was declared as an array of such objects, which contradicts the format
# instruction above.
response_schema = {
    "type": "object",
    "properties": {
        "Title": {"type": "string"},
        "Summary": {"type": "string"},
    },
    "required": [
        "Title",
        "Summary",
    ],
}

In [None]:
# Summary request: the prompt text followed by the movie (target_n).
contents = UserContent([
    Part.from_text(text=prompt_summary),
    Part.from_uri(file_uri=target_n, mime_type='video/mp4'),
])

result = generate_response(
    system_instruction=system_instruction,
    contents=contents,
    response_schema=response_schema,
    model='gemini-2.0-flash-001',
)
print(result)

In [None]:
# Parse the JSON reply and keep it as pretty-printed text so it can be
# embedded in later prompts and in the metadata file.
summary = pprint.pformat(load_json(result))

## 重要シーン抽出

In [None]:
# Prompt for the important-scenes step. The pretty-printed summary from the
# previous step is appended after the trailing "[summary]" header.
prompt_important_scenes = '''\
You are a video content editor. Work on the following tasks.

[task]
A. Find around 15 important scenes in the movie with accurate timestamps.
B. Give a short summary of each scene and why that scene is important.

[condition]
A, B. If possible, identify the names of characters. Use the full name on every part of the output.
The [summary] section contains the summary of the entire movie.
You don't need to use the summary information to find scenes, but the result should be consistent with the summary.

[format instruction]
In Japanese. Show only the following information.
Output in a comma separated list with two columns: <timestamp mm:ss-mm:ss>, <output of task B>
Header is: Timestamp, Description

[summary]
''' + summary

In [None]:
# Important-scenes request: no schema is passed because the prompt asks for
# a comma-separated list rather than JSON.
contents = UserContent([
    Part.from_text(text=prompt_important_scenes),
    Part.from_uri(file_uri=target_n, mime_type='video/mp4'),
])

important_scenes = generate_response(
    system_instruction=system_instruction,
    contents=contents,
    response_schema=None,
    model='gemini-2.0-flash-001',
)
print(important_scenes)

## シーン情報（ビジュアル情報）取得

In [None]:
# Prompt for the visual-information step: split the movie into scenes of
# 1 to 15 seconds and describe what is visible in each of them.
prompt_visuals = '''\
You are a video content editor. Work on the following tasks.

[task]
A. Split the entire movie into scenes with accurate timestamps from start to the exact end of the movie file.
B. Describe what's happening in the scene as detailed as possible.
C. Enrich the output of task B by adding visual information of not only characters but also things in the background.

[condition]
A. The length of each scene is 1 to 15 seconds.
 - Good example: 00:05-00:08, 00:05-00:18. / Bad example: 00:05-01:14 as the timestamp jumps more than 15 seconds.
B, C. Avoid using audio information to describe the scene. Describle only what you see on the screen.
B. If possible, identify the names of characters. Use the full name on every part of the output.
C. The final description is very detailed, vivid and realistic to covey all the visual information of the scene, using up to three sentences.

[format instruction]
In Japanese. Show only the following information.
Output in a comma separated list with two columns: <timestamp mm:ss-mm:ss>, <output of task C>
Header is: Timestamp, Description
'''

In [None]:
# Visual-information request: no schema is passed because the prompt asks
# for a comma-separated list rather than JSON.
contents = UserContent([
    Part.from_text(text=prompt_visuals),
    Part.from_uri(file_uri=target_n, mime_type='video/mp4'),
])

visual_info = generate_response(
    system_instruction=system_instruction,
    contents=contents,
    response_schema=None,
    model='gemini-2.0-flash-001',
)
print(visual_info)

## 音声文字起こし

In [None]:
# Prompt for the speech-transcription step.
prompt_transcription = '''\
You are a video content editor. Work on the following tasks.

[task]
A. Transcript what they are speaking with accurate timestamps.

[condition]
A. Process the entire movie from start to the exact end of the movie file.
A. Identify the name of person who is speaking for each speech. Use the full name on every part of the output. If you cannot identify the name, name it "Unknown".

[format instruction]
In Japanese. Output is a JSON list of "speech dict". "speech dict" is a JSON dict in the following format:
{
  "timestamp": "<mm:ss-mm:ss>",
  "Name": "<Name of the speaker>",
  "Transcription": "<Transcription>"
}
'''

# Structured-output schema mirroring the "speech dict" described above.
_speech_properties = {
    'timestamp': {'type': 'string', 'description': '<mm:ss-mm:ss>'},
    'Name': {'type': 'string', 'description': 'Name of the speaker'},
    'Transcription': {'type': 'string'},
}
response_schema = {
    'type': 'array',
    'items': {
        'type': 'object',
        'properties': _speech_properties,
        'required': ['timestamp', 'Name', 'Transcription'],
    },
}

In [None]:
# Transcription request: uses target_s and its own system instruction.
# NOTE(review): the model ID below is a dated preview version — confirm it is still served.
contents = UserContent([
    Part.from_text(text=prompt_transcription),
    Part.from_uri(file_uri=target_s, mime_type='video/mp4'),
])
result = generate_response(
    system_instruction='Process the entire movie from start to the exact end of the movie file.',
    contents=contents,
    response_schema=response_schema,
    model='gemini-2.5-pro-preview-05-06',
)
print(result)

In [None]:
# Parse the JSON reply and keep it as pretty-printed text for the search
# prompts and the metadata file.
transcription = pprint.pformat(load_json(result))

## テキスト情報取得

In [None]:
# Prompt for extracting on-screen text.
# Fixes to the prompt text: the first two [condition] entries were fused on
# one line, and the "Text" placeholder was missing its opening "<".
# NOTE(review): [steps] starts at 2, step 3 refers to an "original item", and
# the trailing "[text layout]" section is empty — this looks like the remains
# of an earlier version that supplied layout information; confirm the intent.
prompt_texts = '''\
You are a video content editor. Work on the following tasks.

[task]
A. Extract exact text strings from each scene.

[steps]
2. Extract text strings from the scene.
3. Output the new "text dict" that have the "Text" element. "Timestamp" and "Layout" should be the same as the original item.

[condition]
A. Process the entire movie from start to the exact end of the movie file.
A. Output each character literally as on the screen. Don't modify them.
A. Use a list to store multiple lines of texts instead of using the return code \\n in the extracted text strings.
A. Make sure to use double quotes "" in the output JSON.

[format instruction]
Final output is a JSON list of "text dict". "text dict" is a JSON dict in the following format:
{
  "Timestamp": "<Timestamp mm:ss-mm:ss>",
  "Text": [<List of text strings from Task A>],
  "Layout": "<layout description>"
}

[text layout]
'''

# Structured-output schema mirroring the "text dict" described above.
response_schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "Timestamp": {
                "type": "string",
                "description": "<Timestamp mm:ss-mm:ss>",
            },
            "Text": {
                "type": "array",
                "items": {"type": "string"},
                "description": "List of text strings from Task A",
            },
            "Layout": {
                "type": "string",
                "description": "<layout description>",
            },
        },
        "required": [
            "Timestamp",
            "Text",
            "Layout",
        ],
    },
}

# Text-extraction request on target_n with its own system instruction.
# NOTE(review): the model ID below is a dated preview version — confirm it is still served.
contents = UserContent([
    Part.from_text(text=prompt_texts),
    Part.from_uri(file_uri=target_n, mime_type='video/mp4'),
])

result = generate_response(
    system_instruction='Process the entire movie from start to the exact end of the movie file.',
    contents=contents,
    response_schema=response_schema,
    model='gemini-2.5-pro-preview-05-06',
)
print(result)

In [None]:
# Parse the JSON reply and keep it as pretty-printed text for the search
# prompts and the metadata file.
texts = pprint.pformat(load_json(result))

## テキストベースの動画内検索

In [None]:
# Template for the text-based search. str.format fills the six "{}" slots in
# this order: query, summary, important scenes, visual info, transcription,
# texts. Literal braces of the JSON example are escaped as "{{" / "}}".
# The section list now names "[texts]" exactly like the section header at
# the end of the template (it said "[text]").
prompt_content_search = '''
You are a video content editor.

Given the following information of a movie:
- The [summary] section contains the summary of the movie.
- The [important scenes] section contains the important scenes of the movie with timestamps.
- The [visual info] section contains the visual information on what's happening in each scene with timestamps.
- The [transcription] section contains speech transcription with timestamps.
- The [texts] section contains text information with timestamps.

Find one to three scenes that matches the user query with timestamps.

[format instruction]
Output in Japanese. Output is a JSON list with "scene dict".
Each "scene dict" is a JSON dict with the following format:
{{
  "Timestamp": "<timestamp mm:ss-mm:ss>",
  "Description": "<Explain how this scene matches the query.>",
  "Evidence": [<List of data snippets that support your result>]
}}

[user query]
{}

[summary]
{}

[important scenes]
{}

[visual info]
{}

[transcription]
{}

[texts]
{}
'''

# Structured-output schema mirroring the "scene dict" described above.
response_schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "Timestamp": {
                "type": "string",
                "description": "<Timestamp mm:ss-mm:ss>",
            },
            "Description": {
                "type": "string",
                "description": "Explain how this scene matches the query.",
            },
            "Evidence": {
                "type": "array",
                "items": {"type": "string"},
                "description": "List of data snippets that support your result",
            },
        },
        "required": [
            "Timestamp",
            "Description",
            "Evidence",
        ],
    },
}

In [None]:
def show_thumbnails(scenes):
    """Print each scene's description and plot five still frames around it.

    Args:
        scenes: Iterable of dicts with a 'Timestamp' ("mm:ss-mm:ss") and a
            'Description' entry.

    Frames are read from '<local_image_dir>/captureNNNN.jpg'; frame N is
    labelled as second N-1 of the movie. Scenes whose timestamp cannot be
    parsed are reported and skipped. Nothing is plotted when no frame file
    is found.
    """
    # Highest available frame number. A regex is used because
    # str.lstrip/str.rstrip strip character sets, not a prefix/suffix;
    # entries that are not frame files are ignored.
    frame_names = os.listdir(local_image_dir) if os.path.isdir(local_image_dir) else []
    m = 0
    for fname in frame_names:
        matched = re.fullmatch(r'capture(\d+)\.jpg', fname)
        if matched:
            m = max(m, int(matched.group(1)))
    if m < 1:
        print(f'No still images found in {local_image_dir!r}')
        return

    for item in scenes:
        scene_ts = item['Timestamp']
        try:
            ts1, ts2 = scene_ts.split('-')
            m1, s1 = ts1.split(':')
            m2, s2 = ts2.split(':')
            # Frame N shows second N-1, hence the +1.
            ss1 = int(m1) * 60 + int(s1) + 1
            ss2 = int(m2) * 60 + int(s2) + 1
        except ValueError:
            print(f'Skipping malformed timestamp: {scene_ts!r}')
            continue

        print('=====')
        print(str(item['Description']))

        # Show five stills covering the reported range padded by 3 seconds on
        # each side. Both ends are clamped to the available frames so a
        # timestamp beyond the end of the movie cannot request a missing file.
        first = min(max(1, ss1 - 3), m)
        last = max(first, min(ss2 + 3, m))

        fig, axs = plt.subplots(1, 5, figsize=(15, 2))  # Adjust figsize as needed
        for ax, frame in zip(axs.ravel(), np.linspace(first, last, 5)):
            frame = int(frame)
            mm, ss = divmod(frame - 1, 60)
            img = mpimg.imread(f'{local_image_dir}/capture{frame:04d}.jpg')
            ax.imshow(img)
            ax.axis('off')
            ax.set_title(f'{mm:02d}:{ss:02d}')
        plt.show()
        # Close the figure explicitly; plt.clf() after plt.show() would leave
        # an empty figure behind for every scene.
        plt.close(fig)

In [None]:
%%time
# Text-based search: only the collected text information is sent, not the
# movie. Relies on prompt_content_search and its response_schema being the
# most recently defined ones.
query = '視聴者の興味を惹く面白そうなシーン'
prompt = prompt_content_search.format(
    query, summary, important_scenes, visual_info, transcription, texts)

contents = UserContent([Part.from_text(text=prompt)])
result = generate_response(
    system_instruction=system_instruction,
    contents=contents,
    response_schema=response_schema,
    model='gemini-2.0-flash-001',
)
print(result)

In [None]:
# Parse the JSON search result and plot still frames for each returned scene.
show_thumbnails(load_json(result))

In [None]:
%%time
# Text-based search with a different query; only text information is sent.
query = '複数の人物が会話しているシーン'
prompt = prompt_content_search.format(
    query, summary, important_scenes, visual_info, transcription, texts)

contents = UserContent([Part.from_text(text=prompt)])

result = generate_response(
    system_instruction=system_instruction,
    contents=contents,
    response_schema=response_schema,
    model='gemini-2.0-flash-001',
)
print(result)

In [None]:
# Parse the JSON search result and plot still frames for each returned scene.
show_thumbnails(load_json(result))

In [None]:
%%time
# Text-based search with a different query; only text information is sent.
query = '会場を歩いているシーン'
prompt = prompt_content_search.format(
    query, summary, important_scenes, visual_info, transcription, texts)

contents = UserContent([Part.from_text(text=prompt)])

result = generate_response(
    system_instruction=system_instruction,
    contents=contents,
    response_schema=response_schema,
    model='gemini-2.0-flash-001',
)
print(result)

In [None]:
# Parse the JSON search result and plot still frames for each returned scene.
show_thumbnails(load_json(result))

## 動画ベースの動画内検索

In [None]:
# Template for the movie-based search. str.format fills the single "{}" slot
# with the user query; literal braces are escaped as "{{" / "}}".
prompt_movie_content_search = '''\
You are a video content editor.
Find one to three scenes that matches the user query with timestamps.

[condition]
Try to identify names of characters.
The result should be based on the objective information in the movie itself.

[user query]
{}

[format instruction]
Output in Japanese. Output is a JSON list with "scene dict".
Each "scene dict" is a JSON dict with the following format:
{{
  "Timestamp": "<timestamp mm:ss-mm:ss>",
  "Description": "<Explain how this scene matches the query.>"
}}
'''

# Structured-output schema mirroring the "scene dict" described above.
_scene_properties = {
    'Timestamp': {
        'type': 'string',
        'description': '<Timestamp mm:ss-mm:ss>',
    },
    'Description': {
        'type': 'string',
        'description': 'Explain how this scene matches the query.',
    },
}
response_schema = {
    'type': 'array',
    'items': {
        'type': 'object',
        'properties': _scene_properties,
        'required': ['Timestamp', 'Description'],
    },
}

In [None]:
%%time
# Movie-based search: the query prompt is sent together with the movie.
# Relies on prompt_movie_content_search and its response_schema being the
# most recently defined ones.
query = '視聴者の興味を惹く面白そうなシーン'
prompt = prompt_movie_content_search.format(query)

contents = UserContent([
    Part.from_text(text=prompt),
    Part.from_uri(file_uri=target_n, mime_type='video/mp4'),
])

result = generate_response(
    system_instruction=system_instruction,
    contents=contents,
    response_schema=response_schema,
    model='gemini-2.0-flash-001',
)
print(result)

In [None]:
# Parse the JSON search result and plot still frames for each returned scene.
show_thumbnails(load_json(result))

In [None]:
%%time
# Movie-based search with a different query; the movie itself is sent.
query = '複数の人物が会話しているシーン'
prompt = prompt_movie_content_search.format(query)
contents = UserContent([
    Part.from_text(text=prompt),
    Part.from_uri(file_uri=target_n, mime_type='video/mp4'),
])

result = generate_response(
    system_instruction=system_instruction,
    contents=contents,
    response_schema=response_schema,
    model='gemini-2.0-flash-001',
)
print(result)

In [None]:
# Parse the JSON search result and plot still frames for each returned scene.
show_thumbnails(load_json(result))

In [None]:
%%time
# Movie-based search with a different query; the movie itself is sent.
query = '会場を歩いているシーン'
prompt = prompt_movie_content_search.format(query)
contents = UserContent([
    Part.from_text(text=prompt),
    Part.from_uri(file_uri=target_n, mime_type='video/mp4'),
])

result = generate_response(
    system_instruction=system_instruction,
    contents=contents,
    response_schema=response_schema,
    model='gemini-2.0-flash-001',
)
print(result)

In [None]:
# Parse the JSON search result and plot still frames for each returned scene.
show_thumbnails(load_json(result))

## 検索用メタテキストを保存

In [None]:
# Bundle all extracted information into one text document, using the same
# section headers as the text-based search prompt.
meta_text = '''\
[summary]
{}

[important scenes]
{}

[visual info]
{}

[transcription]
{}

[texts]
{}'''.format(summary, important_scenes, visual_info, transcription, texts)

metafile = f'{basename}.txt'
# The content is largely Japanese: write it as UTF-8 explicitly instead of
# relying on the platform's default encoding.
with open(metafile, 'w', encoding='utf-8') as f:
    f.write(meta_text)

In [None]:
# Upload the metadata file to <BUCKET>/metadata/.
!gcloud storage cp "{metafile}" {BUCKET}/metadata/

In [None]:
# Show the metadata text that was written to the file.
print(meta_text)