# 📽️ Subtitle Extractor & Summarizer Project
This notebook extracts burned-in (hard-coded) subtitles from a video using OCR and summarizes them using an LLM.

In [None]:
# Install dependencies
!pip install opencv-python pytesseract transformers openai

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [None]:
# Step 1: Extract subtitle frames using OpenCV
import cv2, os

def extract_subtitle_frames(video_path, output_dir, step=10, crop_height_ratio=0.2):
    """Save the bottom strip of every ``step``-th video frame as a PNG file.

    Frames are written to ``output_dir`` as ``frame_0.png``, ``frame_1.png``, ...
    in the order they appear in the video. The directory is created if needed.

    Args:
        video_path: Path of the video file to read.
        output_dir: Directory that receives the cropped frames.
        step: Keep one frame out of every ``step`` frames (must be >= 1).
        crop_height_ratio: Fraction of the frame height, measured from the
            bottom, that is kept (must satisfy 0 < ratio <= 1).

    Returns:
        The number of frames written.

    Raises:
        ValueError: If ``step`` or ``crop_height_ratio`` is out of range.
        IOError: If the video cannot be opened or a frame cannot be written.
    """
    # Validate before touching the filesystem or the video backend.
    if step < 1:
        raise ValueError(f"step must be >= 1, got {step!r}")
    if not 0 < crop_height_ratio <= 1:
        raise ValueError(
            f"crop_height_ratio must be in (0, 1], got {crop_height_ratio!r}"
        )

    os.makedirs(output_dir, exist_ok=True)

    cap = cv2.VideoCapture(video_path)
    saved = 0
    try:
        # Without this check a bad path just yields zero frames and the
        # pipeline later reports an empty summary with no hint of the cause.
        if not cap.isOpened():
            raise IOError(f"Could not open video: {video_path}")

        frame_id = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_id % step == 0:
                height = frame.shape[0]
                # Clamp so that a tiny ratio still keeps at least one row.
                top = min(int(height * (1 - crop_height_ratio)), height - 1)
                cropped = frame[top:, :]
                target = os.path.join(output_dir, f"frame_{saved}.png")
                if not cv2.imwrite(target, cropped):
                    raise IOError(f"Could not write frame to {target}")
                saved += 1
            frame_id += 1
    finally:
        # Release the capture even if reading or writing fails midway.
        cap.release()
    return saved

In [None]:
# Step 2: Extract text from subtitle frames using OCR
from PIL import Image
import pytesseract

def extract_text_from_images(images_folder):
    """Run OCR on every ``.png`` file in a folder and collect the non-empty text.

    Files are processed in natural (numeric-aware) order, so ``frame_2.png``
    comes before ``frame_10.png``. A plain lexicographic sort would put
    ``frame_10.png`` first and scramble the subtitle order.

    Args:
        images_folder: Directory containing the frame images.

    Returns:
        A list of stripped OCR strings, one per image that produced any text,
        in frame order.
    """
    import re  # local import keeps this cell runnable on its own

    def _natural_key(filename):
        # re.split with a capture group alternates text / digits / text ...,
        # so odd positions are always the digit runs.
        parts = re.split(r"(\d+)", filename)
        return [int(part) if idx % 2 else part.lower()
                for idx, part in enumerate(parts)]

    png_files = [name for name in os.listdir(images_folder) if name.endswith(".png")]

    texts = []
    for file in sorted(png_files, key=_natural_key):
        img_path = os.path.join(images_folder, file)
        # The context manager closes the file handle after OCR.
        with Image.open(img_path) as img:
            text = pytesseract.image_to_string(img, lang='eng').strip()
        if text:
            texts.append(text)
    return texts

In [None]:
# Step 3: Clean duplicate subtitles
def clean_subtitles(text_list):
    """Collapse runs of identical consecutive entries into a single entry.

    Each entry is compared with the one directly before it; the first entry
    is compared with the empty string, so a leading empty string is dropped.
    Entries that repeat later, after a different entry, are kept.

    Args:
        text_list: Subtitle strings in frame order.

    Returns:
        A new list without consecutive repeats.
    """
    entries = list(text_list)
    # Pair every entry with its predecessor ("" stands in for "before the first").
    predecessors = [""] + entries[:-1]
    return [
        current
        for before, current in zip(predecessors, entries)
        if current != before
    ]

In [None]:
# Step 4: Summarize subtitles using HuggingFace Transformers
from transformers import pipeline

_SUMMARIZER = None  # created on first use, then reused by later calls


def _get_summarizer():
    """Build the summarization pipeline once and return the cached instance."""
    global _SUMMARIZER
    if _SUMMARIZER is None:
        _SUMMARIZER = pipeline("summarization", model="facebook/bart-large-cnn")
    return _SUMMARIZER


def summarize_text(texts, max_tokens=512):
    """Summarize a list of subtitle strings chunk by chunk.

    The strings are joined with spaces and split into chunks on word
    boundaries; each chunk is summarized separately and the partial
    summaries are joined with spaces.

    Args:
        texts: Subtitle strings in reading order.
        max_tokens: Maximum chunk size. Despite the name this is measured in
            characters, not model tokens.

    Returns:
        The concatenated chunk summaries, or ``""`` when there is no text
        (in which case the model is not loaded at all).
    """
    import textwrap  # local import keeps this cell runnable on its own

    full_text = " ".join(texts).strip()
    if not full_text:
        return ""

    # Wrap on whitespace so no word is cut in half at a chunk boundary.
    chunks = textwrap.wrap(full_text, width=max_tokens, break_on_hyphens=False)

    summarizer = _get_summarizer()
    # truncation=True guards against a chunk longer than the model's input
    # limit when a caller passes a large max_tokens.
    summaries = [
        summarizer(chunk, truncation=True)[0]['summary_text'] for chunk in chunks
    ]
    return " ".join(summaries)

In [None]:
!pip install yt-dlp
!yt-dlp https://www.youtube.com/watch?v=YInY5GSey-Q -o sample_hardsub.mp4

Collecting yt-dlp
  Downloading yt_dlp-2025.6.25-py3-none-any.whl.metadata (174 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 174.3/174.3 kB 3.4 MB/s eta 0:00:00
Downloading yt_dlp-2025.6.25-py3-none-any.whl (3.3 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.3/3.3 MB 45.7 MB/s eta 0:00:00
Installing collected packages: yt-dlp
Successfully installed yt-dlp-2025.6.25
[youtube] Extracting URL: https://www.youtube.com/watch?v=YInY5GSey-Q
[youtube] YInY5GSey-Q: Downloading webpage
[youtube] YInY5GSey-Q: Downloading tv client config
[youtube] YInY5GSey-Q: Downloading player a10d7fcc-main
[youtube] YInY5GSey-Q: Downloading tv player API JSON
[youtube] YIn

In [None]:
# Step 5: Run all steps together (example)
# NOTE(review): the ".mp4.mkv" suffix presumably comes from the downloader
# appending the merged container's extension - confirm the actual file name.
video_path = "/content/sample_hardsub.mp4.mkv"
output_dir = "frames"

# Fail early with a clear message; otherwise a wrong path just produces an
# empty summary further down.
if not os.path.exists(video_path):
    raise FileNotFoundError(f"Video not found: {video_path}")

extract_subtitle_frames(video_path, output_dir)
raw_texts = extract_text_from_images(output_dir)
cleaned = clean_subtitles(raw_texts)

# Same guard as the Gradio handler: do not summarize when OCR found nothing.
if cleaned:
    summary = summarize_text(cleaned)
else:
    summary = "⛔ No subtitle text detected."

print("\n📄 Summary:\n", summary)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu
Your max_length is set to 142, but your input_length is only 139. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=69)



📄 Summary:


In [8]:
# Step 6: Gradio Interface to upload video and get summary
import gradio as gr

def process_video_and_summarize(video_file):
    """Gradio handler: OCR the subtitles of an uploaded video and summarize them.

    Args:
        video_file: The uploaded video. Accepts a file path (``str`` or
            ``os.PathLike``), which is what the ``gr.Video`` input passes, or
            an object exposing the path as ``.name``. ``None`` means nothing
            was uploaded.

    Returns:
        The summary text, or a short message when no video was provided or
        no subtitle text was detected.
    """
    import tempfile, os

    if video_file is None:
        return "⛔ No video provided."

    # A plain path has no `.name` attribute (and Path.name would be only the
    # base name), so paths are handled separately from file-like objects.
    if isinstance(video_file, (str, os.PathLike)):
        video_path = os.fspath(video_file)
    else:
        video_path = video_file.name

    # The frames are only needed for OCR; the context manager removes the
    # directory afterwards so repeated requests do not fill the disk.
    with tempfile.TemporaryDirectory() as temp_dir:
        extract_subtitle_frames(video_path, temp_dir)
        raw_texts = extract_text_from_images(temp_dir)

    cleaned = clean_subtitles(raw_texts)
    if not cleaned:
        return "⛔ No subtitle text detected."
    return summarize_text(cleaned)

# Build the upload-and-summarize UI and start it right away.
gr.Interface(
    process_video_and_summarize,  # fn: called with the uploaded video
    gr.Video(label="Upload video with burned-in subtitles"),  # inputs
    "text",  # outputs: plain text box
    title="🎬 Subtitle Summarizer",
    description=(
        "Upload a video with hardcoded subtitles. "
        "This app will extract subtitles using OCR and summarize them using an LLM."
    ),
).launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a76a589905cedae323.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


