<a href="https://colab.research.google.com/github/JayNguyen-123/AI-for-Video-Understanding-From-Content-Moderation-to-Summarization/blob/main/Video_Content_Filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### AI for Video Understanding: From Content Moderation to Summarization
- The rapid growth of video content has created a need for advanced systems to process and understand this complex data. Video understanding is a critical field in AI, where the goal is to enable machines to comprehend, analyze, and interact with video content, much like humans.

Model Pipelines:
- Video Content Moderation with CLIP + Gemini
- Video Summarization with Qwen2.5-VL

In [1]:
import torch
import gradio as gr
from PIL import Image
import cv2
import google.generativeai as genai
from transformers import CLIPProcessor, CLIPModel
import json


In [3]:
from google.colab import userdata

# Read the Gemini API key from the Colab secret named GOOGLE_API_KEY and pass
# its *value* to the SDK. (Passing the literal string "GOOGLE_API_KEY" would
# configure the client with an invalid key.)
genai.configure(api_key=userdata.get('GOOGLE_API_KEY'))


In [4]:
# Loading CLIP
# Load the pretrained CLIP weights and the matching processor from the
# "openai/clip-vit-base-patch32" checkpoint. Both objects are module-level
# globals that classify_frame_with_clip() reads.

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [5]:
# Extracting Frames
def extract_frames(video_path):
  """Decode every frame of a video into RGB PIL images.

  Args:
    video_path: Location of the video, passed straight to cv2.VideoCapture.

  Returns:
    A list of (frame_index, PIL.Image.Image) tuples in decode order, with
    frame_index starting at 0. The list is empty when no frame could be read.
  """
  cap = cv2.VideoCapture(video_path)
  frames = []
  try:
    idx = 0
    while True:
      ret, frame = cap.read()
      if not ret:
        # No more frames (or the source could not be read at all).
        break
      # OpenCV yields BGR arrays; PIL expects RGB channel order.
      rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
      frames.append((idx, Image.fromarray(rgb)))
      idx += 1
  finally:
    # Release the capture handle even if a frame conversion raises.
    cap.release()
  return frames


In [6]:
# Frame Classification with CLIP
def classify_frame_with_clip(frame: Image.Image):
  """Classify one frame against a fixed set of text prompts using CLIP.

  Uses the module-level `processor` and `model` objects.

  Args:
    frame: The image to classify.

  Returns:
    A (label, confidence) tuple: `label` is the short name paired with the
    highest-scoring prompt and `confidence` is that prompt's softmax
    probability as a Python float.
  """
  # Keep each prompt next to the label reported for it so the two cannot
  # drift out of sync when a category is added or reordered.
  prompt_label_pairs = (
      ("this image is normal", "normal"),
      ("this image contains nudity", "nudity"),
      ("this image contains enticing or sensual content", "enticing or sensual"),
      ("this image contains violence", "violent"),
  )
  text_inputs = [prompt for prompt, _ in prompt_label_pairs]

  inputs = processor(text=text_inputs, images=frame, return_tensors='pt', padding=True)
  with torch.no_grad():
    # Get the full output from CLIP model
    output = model(**inputs)

  # Access the image-to-text logits and convert them to probabilities
  # across the prompts (softmax over the last dimension).
  probs = output.logits_per_image.softmax(dim=-1)
  confidence, predicted_class = probs.max(dim=-1)
  print("Predicted Class: ", predicted_class)

  label = prompt_label_pairs[predicted_class.item()][1]
  print("Predicted label: ", label)
  return label, confidence.item()



In [7]:
# Explanations with Gemini
import io

def get_gemini_explanation(images: list, flagged_frames_: list) -> str:
  """Ask Gemini for a short explanation of why the flagged frames are NSFW.

  Args:
    images: PIL images of the flagged frames; each one is JPEG-encoded and
      sent inline with the prompt.
    flagged_frames_: JSON-serialisable list of dicts describing the flagged
      frames. The prompt documents the keys `frame_id`, `classification` and
      `confidence`, matching what moderate_video() builds.

  Returns:
    The explanation text, or a string starting with "Error: " when anything
    fails (this function never raises, so the UI always gets a message).
  """
  try:
    llm = genai.GenerativeModel("gemini-2.5-flash")

    # Convert all images to byte format
    image_parts = []
    for img in images:
      buffer = io.BytesIO()
      img.save(buffer, format="JPEG")
      img_bytes = buffer.getvalue()
      image_parts.append({
          "mime_type": "image/jpeg",
          "data": img_bytes
      })

    # Prompt specially for images and NSFW classification. The key list below
    # must describe the dictionaries actually serialised into the metadata.
    prompt = f"""
         You are given a set of image frames extracted from a video, and a list of predictions for some of those frames indicating they are NSFW (Not Safe For Work). Each dictionary in the list contains:
           - frame_id: the frame number,
           - classification: the predicted NSFW category,
           - confidence: the model's confidence score.

           Here is metadata:
           {json.dumps(flagged_frames_, indent=2)}
           **Do not include words like metadata or images in the explanation**
          Analyze the flagged images and their metadata to explain clearly in **no more than 2 lines** why the video has been classified as NSFW.
        """
    # Combine prompt and all image parts
    response = llm.generate_content(
        [prompt] + image_parts
    )

    parts = response.parts
    if not parts:
      # Give a readable message instead of "list index out of range".
      return "Error: Gemini returned no content (the response may have been blocked)."

    explanation = parts[0].text.strip()
    return explanation

  except Exception as e:
    # Deliberate catch-all boundary: report the failure to the caller as text.
    return f"Error: {str(e)}"



In [8]:
# Moderation Orchestration
def moderate_video(video_file):
  """Classify every frame of a video and explain any flagged content.

  Args:
    video_file: Path of the uploaded video, or None when nothing was uploaded.

  Returns:
    Always a 3-tuple (message, percentage, flagged_frames), one value per
    output component of the Gradio interface:
      * message: the Gemini explanation, or a status message.
      * percentage: share of frames flagged (0-100 float), or the string
        "None" when there is nothing to report.
      * flagged_frames: list of dicts with `frame_id`, `classification` and
        `confidence`, or an empty dict when nothing was flagged.
  """
  # Every early exit returns three values so it matches the three outputs
  # declared on the interface.
  if video_file is None:
    return "No video upload", "None", {}

  frames = extract_frames(video_file)
  if not frames:
    return "No frames extracted", "None", {}

  flagged_frames = []
  img_list = []
  for idx, img in frames:
    label, confidence = classify_frame_with_clip(img)

    # Keep only frames whose top prediction is a non-"normal" category with
    # more than 50% probability.
    if label != "normal" and confidence > 0.5:
      flagged_frames.append({
          "frame_id": idx,
          "classification": label,
          "confidence": confidence,
      })
      img_list.append(img)

  if not flagged_frames:
    return  "✅ All frames are appropriate.", "None", {}

  # One explanation for the whole video, built from all flagged frames.
  explanation = get_gemini_explanation(img_list, flagged_frames)

  final_perct = (len(img_list)/len(frames))*100

  return explanation, final_perct, flagged_frames


In [14]:
# Gradio Interface
# Wire moderate_video() to a simple upload UI. The three output components
# below receive, in order, the three values moderate_video() returns:
# explanation text, flagged percentage, and the list of flagged frames.
iface = gr.Interface(
    fn=moderate_video,
    inputs=gr.Video(label="Upload a video"),
    outputs=[
        gr.Textbox(label="Why is it Flagged?"),
        gr.Textbox(label="NSFW Percentage throughout the video"),
        gr.JSON(label="Flagged Frames")

    ],
    title="Video Content Moderation with CLIP and Gemini",
    description="Detects unsafe content in a video and provides detailed classification reasoning using Google Gemini."
)

# Start the app only when this code runs as the main program;
# share=True asks Gradio for a public share link.
if __name__=="__main__":
  iface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://65d49644861029e1a6.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [12]:
!pip install qwen_vl_utils




In [13]:
import argparse
import json
from pathlib import Path

import torch
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from qwen_vl_utils import process_vision_info


def summarize_video(
    video_path: str,
    max_new_tokens: int = 256,
) -> str:
    """
    Ask Qwen2.5-VL-3B-Instruct to summarize a local video.

    Args:
        video_path: Path to a local video file.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        Raw generated text from the model (prompt tokens removed). The prompt
        asks for JSON, but the text is returned unparsed.

    Raises:
        FileNotFoundError: If `video_path` does not point to an existing file.
    """
    # Validate the input before loading the model, so a bad path fails
    # immediately instead of after the weights have been loaded.
    vpath = Path(video_path).resolve()
    if not vpath.is_file():
        raise FileNotFoundError(f"Video file not found: {vpath}")

    model_id = "Qwen/Qwen2.5-VL-3B-Instruct"

    # Load model & processor (per HF card)
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_id, torch_dtype="auto", device_map="auto"
    )
    processor = AutoProcessor.from_pretrained(model_id)

    # Build messages with a local video path and a prompt.
    # The "file://" prefix is prepended to the plain absolute path (no
    # percent-encoding is applied to the path).
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": f"file://{vpath}",
                },
                {
                    "type": "text",
                    "text": (
                        "Summarize this video. Return ONLY valid JSON with two keys:\n"
                        '  "bullets": a list of 5-7 concise bullet points (chronological),\n'
                        '  "paragraph": a short 120-word paragraph summary.\n'
                    ),
                },
            ],
        }
    ]

    # Prepare inputs (per HF card pattern)
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    # Generate without tracking gradients.
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
        )

    # Trim prompt tokens: drop the first len(input) ids of each sequence so
    # only the newly generated tokens are decoded.
    trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, output_ids)]
    out_text = processor.batch_decode(
        trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    return out_text


def try_parse_json(text: str):
    """
    Parse JSON from model output, tolerating surrounding non-JSON text.

    The whole string is tried first. If that fails, each "{" in the text is
    tried in turn as the start of a JSON object, and the first object that
    decodes is returned; anything after that object (code fences, commentary,
    stray braces) is ignored.

    Args:
        text: Raw text produced by the model.

    Returns:
        The decoded JSON value, or None when no JSON could be extracted.
    """
    try:
        return json.loads(text)
    except ValueError:
        # json.JSONDecodeError is a ValueError; fall through to the scan below.
        pass

    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode stops at the end of the first complete JSON value,
            # so trailing text cannot break the parse.
            obj, _ = decoder.raw_decode(text, start)
        except ValueError:
            start = text.find("{", start + 1)
        else:
            return obj
    return None


def write_outputs(obj_or_text, out_path: Path):
    """
    Write the summary to disk as Markdown, plus JSON when it is structured.

    The Markdown file is `out_path` with its suffix replaced by ".md"; the
    JSON file sits next to it with a ".json" suffix. A dict containing both
    "bullets" and "paragraph" is rendered as headed Markdown sections and also
    dumped as JSON. Anything else is written to the Markdown file as plain
    text and no JSON file is written.

    Returns:
        (markdown_path, json_path) as strings; the JSON path is returned even
        when no JSON file was written.
    """
    md_file = out_path.with_suffix(".md")
    json_file = md_file.with_suffix(".json")
    written = (str(md_file), str(json_file))

    is_structured = (
        isinstance(obj_or_text, dict)
        and "bullets" in obj_or_text
        and "paragraph" in obj_or_text
    )
    if not is_structured:
        # Raw text fallback
        md_file.write_text(str(obj_or_text), encoding="utf-8")
        return written

    points = obj_or_text.get("bullets") or []
    summary = obj_or_text.get("paragraph") or ""

    lines = ["# Video Summary", ""]
    if points:
        lines += ["## Key Points", *(f"- {point}" for point in points), ""]
    if summary:
        lines += ["## Short Summary", summary, ""]

    md_file.write_text("\n".join(lines), encoding="utf-8")
    json_file.write_text(
        json.dumps(obj_or_text, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    return written


def main(argv=None):
    """
    Command-line entry point: summarize a video and save the result.

    Args:
        argv: Optional list of argument strings, e.g.
            ["--video", "clip.mp4"]. When None, argparse reads sys.argv.
            Passing a list lets the function be called from a notebook or
            other code, where sys.argv does not carry these options.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--video", required=True, help="Path to a local video file (e.g., .mp4)")
    ap.add_argument("--out", default="video_summary.md", help="Output Markdown path (JSON will be next to it)")
    ap.add_argument("--max_new_tokens", type=int, default=256, help="Max new tokens for generation")
    args = ap.parse_args(argv)

    raw = summarize_video(args.video, max_new_tokens=args.max_new_tokens)
    maybe_json = try_parse_json(raw)

    out_md, out_json = write_outputs(maybe_json if maybe_json is not None else raw, Path(args.out))

    # Report the JSON file only when this run produced it. Checking whether
    # the file exists would also match a stale file left by an earlier run.
    # The condition mirrors the one write_outputs uses to decide on JSON.
    wrote_json = (
        isinstance(maybe_json, dict)
        and "bullets" in maybe_json
        and "paragraph" in maybe_json
    )
    print(f"Saved:\n- {out_md}\n- {out_json if wrote_json else '(no JSON parsed)'}")


if __name__ == "__main__":
    main()

usage: colab_kernel_launcher.py [-h] --video VIDEO [--out OUT]
                                [--max_new_tokens MAX_NEW_TOKENS]
colab_kernel_launcher.py: error: the following arguments are required: --video
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/lib/python3.12/argparse.py", line 1943, in _parse_known_args2
    namespace, args = self._parse_known_args(args, namespace, intermixed)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/argparse.py", line 2230, in _parse_known_args
    raise ArgumentError(None, _('the following arguments are required: %s') %
argparse.ArgumentError: the following arguments are required: --video

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipython-input-2219191179.py", line 143, in <cell line: 0>
    main()
  File "/tmp/ipython-input-2219191179.py", line 133, in main
    args = ap.parse_args()
           ^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/argparse.py", line 1904, in pa

TypeError: object of type 'NoneType' has no len()