In [None]:
# @title Copyright & License (click to expand)
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Video Data Curation - Video Quality Filtering

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/multimodal-data-curation/quality-filtering.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fuse-cases%2Fmultimodal-data-curation%2Fquality-filtering.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/use-cases/multimodal-data-curation/quality-filtering.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/multimodal-data-curation/quality-filtering.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/multimodal-data-curation/quality-filtering.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/multimodal-data-curation/quality-filtering.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/multimodal-data-curation/quality-filtering.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/multimodal-data-curation/quality-filtering.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/multimodal-data-curation/quality-filtering.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>            

| | |
|-|-|
|Author(s) | John Semerdjian |

## Overview

After we have split and transcoded our initial set of videos, the next step in a video curation pipeline is to filter videos based on their quality. Our goal is to discard as many "low quality" clips as possible in order to use our limited modeling compute as efficiently as possible. "Quality" in this context refers to a collection of subjective and practical criteria based on the developer's own requirements. The practical requirements here may translate into filters that correspond to video metadata like resolution or the absence of visible text, while subjective quality filters may target the aesthetics of the video itself. Since there is not a single set of filters or thresholds to use for _all_ video data curation pipelines, we will focus on the process of extracting and inferring several fields from our video collection. The approaches below have been aggregated from multiple research papers for training text-to-video foundation models.

This notebook is organized as follows:
* Metadata Filters
* OCR & Watermark Detection
* Aesthetic Scoring
* Motion Scores 

### Install required packages

In [None]:
%pip install --upgrade --user --quiet google-genai google-cloud-storage pyav opencv-python numpy torch transformers pillow

### Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
# Authenticate the Colab user so that later cells can call Google Cloud APIs.
# NOTE(review): despite the "Restart runtime" heading above, this cell does not
# restart the kernel; it only authenticates, and only when running inside Colab.
import sys

if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

## Before you begin

### Set your project ID

**If you don't know your project ID**, try the following:
* Run `gcloud config list`.
* Run `gcloud projects list`.
* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)

In [None]:
PROJECT_ID = "[YOUR_PROJECT_ID]"  # @param {type:"string"}
# Set the project id
! gcloud config set project {PROJECT_ID}

#### Region

You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations).

In [2]:
REGION = "us-central1"  # @param {type:"string"}

#### Bucket


In [3]:
BUCKET_NAME = "[YOUR_BUCKET_NAME]"  # @param {type:"string"}
BUCKET_URI = f"gs://{BUCKET_NAME}"

### Authenticate your Google Cloud account

Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below.

**1. Vertex AI Workbench**
* Do nothing as you are already authenticated.

**2. Local JupyterLab instance, uncomment and run:**

In [4]:
# ! gcloud auth login

**3. Authenticate your notebook environment**

If you are running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
# Colab-only authentication. The guard keeps "Restart & Run All" working in
# Vertex AI Workbench or local Jupyter, where the `google.colab` package is not
# installed and an unconditional import would raise ImportError.
import sys

if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

**4. Service account or other**
* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples.

### Import libraries

In [4]:
import io
import json
import math

from PIL import Image
import av
import cv2 as cv
from google import genai
from google.cloud import storage
from google.genai import types
import numpy as np
import torch
from torch import nn
from transformers import CLIPModel, CLIPProcessor

# Shared clients used by the helper functions below. Both are bound to the
# PROJECT_ID chosen earlier; the Gemini client goes through Vertex AI in REGION.
storage_client = storage.Client(project=PROJECT_ID)
gemini_client = genai.Client(vertexai=True, project=PROJECT_ID, location=REGION)

### Load video paths from Cloud Storage

We assume you have video data already stored in a Cloud Storage bucket. We will read a few videos and demonstrate their outputs in this notebook.

In [5]:
def load_video_paths(
    bucket: str,
    num_videos: int,
) -> list[str]:
    """Return the names of the first ``num_videos`` objects listed in a bucket.

    Args:
        bucket: Name of the Cloud Storage bucket to list
        num_videos: Maximum number of object names to return

    Returns:
        list[str]: Object names in listing order. No filtering by file type is
            applied, so every listed object is treated as a video.

    """
    listing = storage_client.list_blobs(bucket)
    # zip() stops as soon as the range is exhausted, which caps the result at
    # num_videos entries (and yields an empty list for num_videos <= 0).
    return [entry.name for _, entry in zip(range(num_videos), listing)]

In [6]:
# Pick a small sample of objects and resolve each name to a Blob handle.
# `videos` holds object names (used to build gs:// URIs later); `blobs` holds
# the handles used for downloading the bytes.
videos = load_video_paths(BUCKET_NAME, 3)
bucket = storage_client.get_bucket(BUCKET_NAME)
blobs = [bucket.get_blob(blob_name=blob_name) for blob_name in videos]

## Metadata Filters

To extract video metadata we will use [PyAV](https://github.com/PyAV-Org/PyAV), a Python wrapper over the popular video processing software FFmpeg. Identifying the video metadata fields below is important for adhering to downstream modeling requirements. For example, training over very dark or very bright videos may lead to poor modeling performance, and we should therefore exclude them during training. 

There are several metadata fields to consider but we will only cover the following:
* Frames per second (FPS)
* Duration (already covered in a previous notebook but we include it here as well)
* Resolution
* Brightness
* Aspect Ratio

In [8]:
def get_last_timestamp(container: av.container.Container) -> dict:
    """Counts the decoded video frames and reports the video duration.

    The duration is read from the first video stream. Some containers do not
    store a per-stream duration, so the container-level duration is used as a
    fallback before giving up and reporting 0.0.

    Args:
        container: PyAV container

    Returns:
        dict: ``last_frame_index`` (the number of frames decoded from the
            container's current position) and ``duration`` (seconds, rounded
            to 2 decimals; 0.0 when no duration is available)

    """
    video_stream = container.streams.video[0]
    container_duration = getattr(container, "duration", None)

    if video_stream.duration and video_stream.time_base:
        # Stream duration is expressed in units of the stream's time base.
        duration = float(video_stream.duration * video_stream.time_base)
    elif container_duration:
        # Container duration is expressed in av.time_base units.
        duration = float(container_duration) / av.time_base
    else:
        duration = 0.0

    # Decoding is the only reliable way to count frames; header frame counts
    # are not consulted here.
    frame_count = sum(1 for _ in container.decode(video_stream))

    return {
        "last_frame_index": frame_count,
        "duration": np.round(duration, 2).item(),
    }


def get_average_brightness(
    container: av.container.Container,
    sample_interval: int = 10,
) -> float:
    """Calculate the average brightness of evenly spaced frames of a video.

    Brightness is the mean relative luminance of the sampled RGB frames.
    See: https://en.wikipedia.org/wiki/Relative_luminance

    Args:
        container: PyAV container
        sample_interval: Approximate number of frames to sample. The step
            between sampled frames is ``total_frames // sample_interval``
            (at least 1). A non-positive value samples every frame.

    Returns:
        float: Average brightness of the video rounded to 2 decimals, or 0.0
            when no frame could be decoded

    """
    video_stream = container.streams.video[0]

    # First pass: count the frames so the sampling step can be derived.
    total_frames = sum(1 for _ in container.decode(video_stream))

    container.seek(0)
    if sample_interval > 0:
        step = max(1, total_frames // sample_interval)
    else:
        # Avoids a ZeroDivisionError for 0 and keeps the "every frame"
        # behaviour that negative values already produced.
        step = 1

    luminance_values = []
    # Second pass: accumulate the mean luminance of every `step`-th frame.
    for frame_idx, frame in enumerate(container.decode(video_stream)):
        if frame_idx % step != 0:
            continue
        frame_array = frame.to_ndarray(format="rgb24")
        red = frame_array[..., 0]
        green = frame_array[..., 1]
        blue = frame_array[..., 2]
        luminance = 0.2126 * red + 0.7152 * green + 0.0722 * blue
        luminance_values.append(np.mean(luminance))

    return np.round(np.mean(luminance_values), 2).item() if luminance_values else 0.0


def get_video_metadata(video_file_object: io.BytesIO) -> dict:
    """Extracts video metadata from a video file object using PyAV.

    Args:
        video_file_object: A binary file-like object containing the video data

    Returns:
        dict: ``height``, ``width``, ``avg_fps`` (rounded to the nearest
            integer, 0 when unknown), ``aspect_ratio`` (reduced ``W:H``
            string), ``aspect_ratio_type`` (friendly name, or the ratio itself
            when unrecognised), ``avg_brightness``, plus the keys returned by
            ``get_last_timestamp``

    """
    # Keys must be in lowest terms because the computed ratio is reduced by
    # its gcd before lookup (e.g. 21:9 is looked up as 7:3, 16:10 as 8:5).
    aspect_ratio_map = {
        "16:9": "hdtv",
        "4:3": "standard television",
        "7:3": "ultrawide",
        "3:2": "common photography",
        "1:1": "square",
        "5:4": "large format photography",
        "8:5": "computer display",
        "239:100": "anamorphic",
        "47:20": "anamorphic",
        "12:5": "anamorphic",
        "37:20": "common widescreen theatrical",
        "14:9": "cropped standard television",
    }

    video_file_object.seek(0)
    # The context manager closes the container even if decoding raises.
    with av.open(video_file_object) as container:
        video_stream = container.streams.video[0]

        height = video_stream.height
        width = video_stream.width
        # average_rate can be missing for some streams; report 0 in that case.
        average_rate = video_stream.average_rate
        avg_fps = float(average_rate) if average_rate else 0.0

        # `or 1` guards the division when both dimensions are reported as 0.
        common_divisor = math.gcd(width, height) or 1
        aspect_ratio = f"{width // common_divisor}:{height // common_divisor}"
        aspect_ratio_str = aspect_ratio_map.get(aspect_ratio, aspect_ratio)

        duration_info = get_last_timestamp(container)

        # Rewind: the frame count above consumed the whole stream.
        container.seek(0)
        avg_brightness = get_average_brightness(container)

    return {
        "height": height,
        "width": width,
        # Round instead of truncating so 29.97 reports as 30, not 29.
        "avg_fps": round(avg_fps),
        "aspect_ratio": aspect_ratio,
        "aspect_ratio_type": aspect_ratio_str,
        "avg_brightness": avg_brightness,
        **duration_info,
    }

In [None]:
# Download each sampled video fully into memory (get_video_metadata seeks on
# the file object) and print the extracted metadata next to the object name.
for b in blobs:
    with b.open("rb") as f:
        video_file_object = io.BytesIO(f.read())
    print(f"{b.name}: {get_video_metadata(video_file_object)}")

## OCR and Watermark Detection

Videos with text may or may not be desirable in our curated dataset, depending on the capabilities of the model we want to train. On the other hand, video watermarks may be more consistently undesirable. Either way, we will walk through examples of doing both using a single call to Gemini 2.0 Flash. We will also incorporate structured outputs to separate the text from the watermark content as JSON objects.

In [10]:
def get_text_and_watermarks(
    video_uri: str,
    model: str = "gemini-2.0-flash-001",
    mime_type: str = "video/mp4",
) -> dict:
    """Extracts overlaid text, watermarks, and usage metadata from a video.

    Args:
        video_uri: URI of the video to analyse (e.g. ``gs://bucket/clip.mp4``)
        model: Gemini model used for the request
        mime_type: MIME type of the video referenced by ``video_uri``

    Returns:
        dict: ``text`` and ``watermarks`` strings (the literal ``"None"`` when
            the model reports nothing), merged with the response's usage
            metadata

    Raises:
        ValueError: If the model returns no text to parse.

    """
    system_instruction = """Your task is to carefully watch the provided video and do the following:

* Extract all visible text that appears across all frames in the video (e.g. text overlays from a weather forecast, credits at the start or end of a video). For example, if there is text on a t-shirt, traffic signs, or something that is naturally part of the video content, do NOT include it in the response.
* Identify and describe any visual watermarks in the video. These are typically logos or other images that are overlaid on the video during the production process.
* If no text or watermarks are visible, return 'None' as the response.
* Be concise, especially if there is no text or watermarks visible.
* Return your response in JSON format with the following fields and nothing else:
{
    "text": "The text found in the video (if any)",
    "watermarks": "Watermark descriptions found in the video (if any)"
}
"""

    response = gemini_client.models.generate_content(
        model=model,
        contents=[
            types.Part.from_text(text="Watch this video:"),
            types.Part.from_uri(file_uri=video_uri, mime_type=mime_type),
        ],
        config=types.GenerateContentConfig(
            system_instruction=system_instruction,
            temperature=0.0,
            response_mime_type="application/json",
            # A single object matches what the prompt asks for; an array
            # schema lets the model split its answer across several items, of
            # which only the first would be read.
            response_schema={
                "type": "OBJECT",
                "properties": {
                    "text": {"type": "STRING"},
                    "watermarks": {"type": "STRING"},
                },
                "required": ["text", "watermarks"],
            },
        ),
    )

    if not response.text:
        raise ValueError(f"Gemini returned no text for {video_uri}")

    parsed = json.loads(response.text)
    if isinstance(parsed, list):
        # Tolerate an array-shaped answer, including an empty one.
        parsed = parsed[0] if parsed else {}

    # Guarantee both keys so callers can index them unconditionally.
    result = {"text": "None", "watermarks": "None", **parsed}
    usage = response.usage_metadata.to_json_dict() if response.usage_metadata else {}
    return {**result, **usage}

In [None]:
# Run the Gemini text/watermark extraction on each sampled video. The model
# reads the video directly from Cloud Storage through its gs:// URI, so no
# bytes are downloaded here.
for v in videos:
    print(f"{v}:")
    text_and_watermarks = get_text_and_watermarks(f"{BUCKET_URI}/{v}")
    text = text_and_watermarks["text"]
    watermarks = text_and_watermarks["watermarks"]
    print(f"\ttext: {text}")
    print(f"\twatermarks: {watermarks}\n")

## Aesthetic Scoring

Aesthetic scoring involves using a model to evaluate the quality of sampled frames from our videos. Below we show how to use the [CLIP+MLP Aesthetic Score](https://github.com/christophschuhmann/improved-aesthetic-predictor) model that was trained on an annotated dataset of images and aesthetic scores, i.e. a numeric value on a scale of 1 to 10, where the higher the score the better. For more details on the underlying training data and classifier see [LAION-Aesthetics blog](https://laion.ai/blog/laion-aesthetics/). We will sample frames from each video, run inference with the trained CLIP+MLP model above, and average the scores. We recommend figuring out the optimal score threshold to filter by manually inspecting a sample of low scoring videos.

Alternatively, we can prompt a native multi-modal model like Gemini 2.x to make a similar classification, and even fine-tune Gemini 2.x using the labelled data from the LAION dataset. Either way, the threshold to use (or classification system) will involve manually inspecting a sample of low scoring predictions.

Note: Be sure to download the [`sac+logos+ava1-l14-linearMSE.pth`](https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/refs/heads/main/sac+logos+ava1-l14-linearMSE.pth) model weights before running the code below.

In [16]:
class MLP(nn.Module):
    """Stack of linear layers that maps an embedding to a single score.

    Args:
        input_size: Dimensionality of the input embedding
        xcol: Name kept on the instance as ``xcol`` (not used by ``forward``)
        ycol: Name kept on the instance as ``ycol`` (not used by ``forward``)

    """

    def __init__(self, input_size: int, xcol: str = "emb", ycol: str = "avg_rating"):
        super().__init__()
        self.input_size = input_size
        self.xcol = xcol
        self.ycol = ycol

        # The position of each module inside the Sequential determines its
        # state_dict key (layers.0, layers.2, ...), so the order is fixed.
        stack = [
            nn.Linear(self.input_size, 1024),
            nn.Dropout(0.2),
            nn.Linear(1024, 128),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.Dropout(0.1),
            nn.Linear(64, 16),
            nn.Linear(16, 1),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``x`` through the layer stack and return the result."""
        scores = self.layers(x)
        return scores


def normalized(a, axis: int = -1, order: int = 2) -> np.ndarray:
    """Scale ``a`` to unit norm along ``axis``, leaving zero vectors unchanged.

    Args:
        a: Array-like of vectors to normalise
        axis: Axis along which the norm is computed
        order: Order of the norm passed to ``np.linalg.norm``

    Returns:
        np.ndarray: ``a`` divided by its norm along ``axis``; vectors whose
            norm is 0 are divided by 1 instead

    """
    norms = np.atleast_1d(np.linalg.norm(a, order, axis))
    # Swap zero norms for 1 so all-zero vectors do not trigger a division by 0.
    safe_norms = np.where(norms == 0, 1, norms)
    return a / np.expand_dims(safe_norms, axis)


def sample_video_frames(
    video_file_object: io.BytesIO,
    num_frames: int,
) -> list[Image.Image]:
    """Sample frames from video at regular intervals.

    With ``num_frames == 1`` the middle frame is returned. Otherwise frame
    indices are spread evenly over the first 80% of the video; when the video
    has too few frames the indices repeat and fewer than ``num_frames`` images
    are returned.

    Args:
        video_file_object: A binary file-like object containing the video data
        num_frames: Number of frames to sample

    Returns:
        List of PIL Images (empty when no frame could be decoded)

    """
    video_file_object.seek(0)
    # The context manager closes the container on every exit path, including
    # exceptions raised while decoding.
    with av.open(video_file_object) as container:
        video_stream = container.streams.video[0]

        # First pass: count frames so the target indices can be computed.
        total_frames = sum(1 for _ in container.decode(video_stream))
        container.seek(0)

        if num_frames == 1:
            # Get the middle frame
            target_frame = total_frames // 2
            for frame_idx, frame in enumerate(container.decode(video_stream)):
                if frame_idx == target_frame:
                    return [Image.fromarray(frame.to_ndarray(format="rgb24"))]
            return []

        # Sample frames at regular intervals, use 80% of frames
        last_frame_index = int(total_frames * 0.8)
        frame_indices = np.linspace(0, last_frame_index, num_frames).astype(int)
        # A set gives O(1) membership checks inside the decode loop.
        wanted = set(frame_indices.tolist())

        sampled_frames = []
        for frame_idx, frame in enumerate(container.decode(video_stream)):
            if frame_idx > last_frame_index:
                # No requested index lies beyond this point; stop decoding.
                break
            if frame_idx in wanted:
                sampled_frames.append(Image.fromarray(frame.to_ndarray(format="rgb24")))
            if len(sampled_frames) >= num_frames:
                break

        return sampled_frames


def get_frames(blob: storage.blob.Blob, max_frames: int = 10) -> list[Image.Image]:
    """Extract evenly spaced frames from a video blob for aesthetic scoring.

    Args:
        blob: Cloud Storage blob object
        max_frames: Upper bound on the number of frames returned

    Returns:
        List of PIL Images, at most ``max_frames`` long

    Raises:
        ValueError: If ``max_frames`` is smaller than 1.

    """
    if max_frames < 1:
        raise ValueError("max_frames must be at least 1")

    with blob.open("rb") as f:
        video_file_object = io.BytesIO(f.read())

    video_file_object.seek(0)
    # The context manager closes the container even if decoding raises.
    with av.open(video_file_object) as container:
        video_stream = container.streams.video[0]

        # First pass: count frames so the sampling step can be derived.
        total_frames = sum(1 for _ in container.decode(video_stream))
        container.seek(0)

        # Ceiling division keeps the sample count at or below max_frames; a
        # floor step would allow almost twice as many frames for short clips.
        sample_interval = max(1, -(-total_frames // max_frames))

        sampled_frames = []
        for frame_idx, frame in enumerate(container.decode(video_stream)):
            if frame_idx % sample_interval == 0:
                sampled_frames.append(Image.fromarray(frame.to_ndarray(format="rgb24")))

    return sampled_frames


def load_model(model_path: str) -> tuple[CLIPModel, CLIPProcessor, MLP, str]:
    """Load the CLIP encoder, its processor, and the aesthetic MLP head.

    Args:
        model_path: Local path to the MLP weights (a ``state_dict`` file)

    Returns:
        tuple: ``(clip_model, processor, mlp_model, device)`` where ``device``
            is ``"cuda"`` when a GPU is available and ``"cpu"`` otherwise

    """
    mlp_model = MLP(768)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # weights_only=True restricts unpickling to tensors and plain containers,
    # so a tampered checkpoint file cannot execute arbitrary code on load.
    state_dict = torch.load(
        model_path,
        map_location=torch.device(device),
        weights_only=True,
    )
    mlp_model.load_state_dict(state_dict)
    mlp_model.to(device)
    # Inference mode: disables the Dropout layers in the MLP.
    mlp_model.eval()

    clip_model_name = "openai/clip-vit-large-patch14"
    processor = CLIPProcessor.from_pretrained(clip_model_name)
    clip_model = CLIPModel.from_pretrained(clip_model_name).to(device)
    clip_model.eval()
    return clip_model, processor, mlp_model, device


def get_aesthetic_score(
    clip_model: CLIPModel,
    processor: CLIPProcessor,
    mlp_model: MLP,
    device: str,
    pil_images: "Image.Image | list[Image.Image]",
) -> float:
    """Score one or more images and return the mean aesthetic score.

    Each image is embedded with CLIP, the embedding is L2-normalised, and the
    MLP head turns it into a score; the scores are averaged.

    Args:
        clip_model: CLIP model used to embed the images
        processor: CLIP processor that prepares the pixel values
        mlp_model: Aesthetic MLP head applied to the normalised embeddings
        device: Device on which the models live (e.g. ``"cuda"`` or ``"cpu"``)
        pil_images: A single PIL image or a list of PIL images

    Returns:
        float: Mean predicted score over all provided images

    Raises:
        ValueError: If ``pil_images`` is an empty sequence.

    """
    if isinstance(pil_images, (list, tuple)) and not pil_images:
        raise ValueError("pil_images must contain at least one image")

    inputs = processor(
        text=None,
        images=pil_images,
        return_tensors="pt",
        padding=True,
    ).to(device)

    # Keep the whole forward pass (CLIP and MLP) out of autograd so the result
    # carries no graph, then convert it to the plain float the signature
    # promises.
    with torch.no_grad():
        image_features = clip_model.get_image_features(
            pixel_values=inputs["pixel_values"],
        )
        im_emb_arr = normalized(image_features.cpu().numpy())
        input_tensor = torch.from_numpy(im_emb_arr).to(device).float()
        return mlp_model(input_tensor).mean().item()

In [None]:
# Load the models once; the MLP weights file is expected in the current
# working directory (see the download note above).
clip_model, processor, mlp_model, device = load_model(
    "./sac+logos+ava1-l14-linearMSE.pth"
)


# Score each sampled video: extract frames, then average their scores.
for b in blobs:
    pil_images = get_frames(b)
    aesthetic_score = get_aesthetic_score(
        clip_model, processor, mlp_model, device, pil_images
    )
    print(f"{b.name}: {aesthetic_score}")

## Motion Scores

In order to detect different types of motion we can measure the difference between consecutive frames in a video. Since we only care about measuring movement in the context of slow or very fast motion, we will focus on more coarse and computationally efficient approaches that fall under *Sparse Optical Flow Estimation*. These techniques look at displacement in important patches of an image, in contrast to *Dense Optical Flow Estimation*, which examines all pixel values ([RAFT (Recurrent All-pairs Field Transforms)](https://arxiv.org/pdf/2003.12039) is one example of a Dense Optical Flow model). The key method in OpenCV is `cv.calcOpticalFlowPyrLK`, which uses the Lucas-Kanade method to calculate optical flow. See the [OpenCV documentation](https://docs.opencv.org/4.x/db/d7f/tutorial_js_lucas_kanade.html) for more details.

In [83]:
def estimate_video_motion(video_file_object: io.BytesIO) -> float:
    """Estimates motion in a video with sparse Lucas-Kanade optical flow.

    Corner features are detected in the first frame and tracked from frame to
    frame. For every frame with at least one successfully tracked point, the
    mean displacement of those points is recorded; the result is the mean of
    these per-frame values.

    Args:
        video_file_object: A binary file-like object containing the video data

    Returns:
        A single float representing the estimated average motion per frame
        (0.0 when no features are found or no motion could be measured).

    Raises:
        ValueError: If the video data cannot be read or is empty.

    """
    feature_params = {
        "maxCorners": 100,
        "qualityLevel": 0.3,
        "minDistance": 7,
        "blockSize": 7,
    }
    lk_params = {
        "winSize": (15, 15),
        "maxLevel": 2,
        "criteria": (cv.TERM_CRITERIA_EPS | cv.TERM_CRITERIA_COUNT, 10, 0.03),
    }

    try:
        video_file_object.seek(0)
        container = av.open(video_file_object)
    except Exception as e:
        raise ValueError(f"Failed to read video data from memory. Error: {e}") from e

    motion_per_frame = []

    # The context manager closes the container on every exit path.
    with container:
        try:
            stream = container.streams.video[0]
            frames = container.decode(stream)
            # Get the first frame
            old_frame = next(frames)
            old_frame_bgr = old_frame.to_ndarray(format="bgr24")
        except Exception as e:
            raise ValueError(
                f"Failed to read video data from memory. Error: {e}"
            ) from e

        old_gray = cv.cvtColor(old_frame_bgr, cv.COLOR_BGR2GRAY)
        p0 = cv.goodFeaturesToTrack(old_gray, mask=None, **feature_params)

        if p0 is None:
            return 0.0

        for frame in frames:
            frame_bgr = frame.to_ndarray(format="bgr24")
            frame_gray = cv.cvtColor(frame_bgr, cv.COLOR_BGR2GRAY)

            # Only run the tracker when there is something to track; an empty
            # point set goes straight to feature re-detection below.
            p1 = st = None
            if len(p0) > 0:
                p1, st, _err = cv.calcOpticalFlowPyrLK(
                    old_gray, frame_gray, p0, None, **lk_params
                )

            if p1 is not None and st is not None:
                # st flags the points that were tracked successfully.
                good_new = p1[st == 1]
                good_old = p0[st == 1]

                if len(good_new) > 0:
                    distances = np.linalg.norm(good_new - good_old, axis=1)
                    motion_per_frame.append(np.mean(distances))

                # Carry only the surviving points into the next iteration.
                p0 = good_new.reshape(-1, 1, 2)
            else:
                # All points were lost: detect a fresh set in this frame.
                p0 = cv.goodFeaturesToTrack(frame_gray, mask=None, **feature_params)
                if p0 is None:
                    break

            old_gray = frame_gray

    return float(np.mean(motion_per_frame)) if motion_per_frame else 0.0

In [None]:
# Download each sampled video into memory and print its motion score.
for b in blobs:
    with b.open("rb") as f:
        video_file_object = io.BytesIO(f.read())
    motion_score = estimate_video_motion(video_file_object)
    print(f"{b.name}: {motion_score}")