In [1]:
import os
import subprocess
from pathlib import Path
from webvtt import WebVTT
from yt_dlp import YoutubeDL

In [2]:
# Scratch directory (relative to the working directory) that receives the
# downloaded subtitle files before they are converted.
OUTPUT_DIR = Path("captions")
# Markdown file that accumulates one "## <title>" section per transcript.
OUTPUT_MD = Path("../data_to_process/markdowns/yts_transcripts.md")

In [3]:
def download_captions(url):
    """Ask yt-dlp to fetch English subtitle files for ``url`` into ``OUTPUT_DIR``.

    The media itself is skipped (``skip_download``); both the regular and the
    automatic subtitle options are enabled and restricted to the ``en``
    language. Output files are named after the video title via the
    ``%(title)s.%(ext)s`` template, and yt-dlp's own console output is
    silenced.

    Args:
        url: Address handed to ``YoutubeDL.download`` (the notebook passes
            both single-video and playlist URLs).

    Returns:
        None. The function is used for its side effect of writing files.
    """
    # Built separately so the options mapping below stays easy to scan.
    output_template = str(OUTPUT_DIR / '%(title)s.%(ext)s')

    subtitle_only_options = {
        'writesubtitles': True,
        'writeautomaticsub': True,
        'subtitleslangs': ['en'],
        'skip_download': True,
        'outtmpl': output_template,
        'quiet': True,
        'no_warnings': True,
    }

    downloader = YoutubeDL(subtitle_only_options)
    with downloader:
        downloader.download([url])

In [4]:
def parse_vtt_to_text(vtt_path):
    """Convert a .vtt file to clean, single-paragraph text.

    Every cue's text is split into its individual lines, runs of whitespace
    inside a line are collapsed to one space, and empty lines are dropped, so
    the result never contains embedded newlines or doubled spaces. A line that
    is identical to the line kept immediately before it is skipped: rolling
    captions commonly repeat the previous line in the next cue (assumed to be
    the case for the auto-generated subtitles this notebook downloads), and
    without this the transcript would contain each sentence several times.

    Args:
        vtt_path: Path of the WebVTT file to read (str or ``pathlib.Path``).

    Returns:
        str: All retained lines joined with single spaces; ``""`` when the
        file has no cue text.
    """
    text_chunks = []
    for caption in WebVTT().read(vtt_path):
        for raw_line in caption.text.splitlines():
            # split()/join normalises tabs, non-breaking runs and edge spaces.
            line = " ".join(raw_line.split())
            if not line:
                continue
            # Only *consecutive* duplicates are dropped, so a phrase that is
            # genuinely repeated later in the video is still preserved.
            if text_chunks and text_chunks[-1] == line:
                continue
            text_chunks.append(line)
    return " ".join(text_chunks)


In [5]:
def process_url(url):
    """Download subtitles for ``url`` and append each transcript to ``OUTPUT_MD``.

    After ``download_captions`` returns, every ``*.en.vtt`` file found in
    ``OUTPUT_DIR`` is converted with ``parse_vtt_to_text`` and appended to
    ``OUTPUT_MD`` as a ``## <title>`` section, then deleted. A failure while
    handling one file is reported and does not stop the remaining files.

    Args:
        url: Video or playlist URL passed straight to ``download_captions``.

    Returns:
        None. Progress is printed; results are written to ``OUTPUT_MD``.

    Raises:
        Whatever ``download_captions`` raises; only per-file processing
        errors are caught here.
    """
    print(f"📥 Downloading subtitles for: {url}")
    download_captions(url)

    lang_suffix = ".en"

    # Materialise the matches before looping: files are unlinked inside the
    # loop, and sorting makes the order of appended sections deterministic.
    for vtt_file in sorted(OUTPUT_DIR.glob("*.en.vtt")):
        try:
            # Path.stem drops only ".vtt"; strip just the trailing language
            # tag so a title that itself contains ".en" is left intact.
            stem = vtt_file.stem
            if stem.endswith(lang_suffix):
                title = stem[:-len(lang_suffix)]
            else:
                title = stem
            transcript = parse_vtt_to_text(vtt_file)
            md_entry = f"\n\n## {title}\n\n{transcript}"

            # Append in place instead of re-reading and rewriting the whole
            # file; create the destination folder on first use.
            OUTPUT_MD.parent.mkdir(parents=True, exist_ok=True)
            with OUTPUT_MD.open("a", encoding="utf-8") as md_file:
                md_file.write(md_entry)

            print(f"✅ Added: {title}")
            vtt_file.unlink()  # the .vtt is only an intermediate artefact
        except Exception as e:
            # Best effort: report the problem and carry on with other files.
            print(f"❌ Failed to process {vtt_file.name}: {e}")

In [6]:
if __name__ == "__main__":
    # The scratch folder for downloaded .vtt files must exist before any run.
    OUTPUT_DIR.mkdir(exist_ok=True)

    # Playlists and individual videos to transcribe, handled in listed order.
    urls = [
        "https://www.youtube.com/playlist?list=PLOFEBzvs-VvrX2HwqjmzpVSzfJ5wjYBcq",
        "https://www.youtube.com/watch?v=lt4OsgmUTGI",
        "https://www.youtube.com/watch?v=1lTA2n142Mk",
        "https://www.youtube.com/playlist?list=PLOFEBzvs-VvqKKMXX4vbi4EB1uaErFMSO",
        "https://www.youtube.com/watch?v=kgSVkVNxXyU",
        "https://www.youtube.com/watch?v=RQWpF2Gb-gU",
        "https://www.youtube.com/watch?v=Dlsa9EBKDGI",
    ]

    # Each URL is downloaded and folded into the markdown file in turn.
    for url in urls:
        process_url(url)

    print(f"\n📄 Done! All transcripts saved to: {OUTPUT_MD.resolve()}")

📥 Downloading subtitles for: https://www.youtube.com/playlist?list=PLOFEBzvs-VvrX2HwqjmzpVSzfJ5wjYBcq
✅ Added: Can all computers take advantage of quantum computers？
✅ Added: Do you need to write your own software for quantum computers？
✅ Added: How do quantum computers break encryption？ #quantum #quantumphysics #quantumcomputing
✅ Added: How do quantum computers process information？ #quantum #quantumphysics #quantumcomputing
✅ Added: If quantum computers are real, why aren’t they changing the world yet？
✅ Added: More Qubits, More Power？ Not So Fast…
✅ Added: Quantum Computing： Advanced Insights
✅ Added: Quantum Computing： Beyond The Basics
✅ Added: Quantum Computing： The Basics
✅ Added: Tour a Quantum Lab with Dr. Olivia Lanes
✅ Added: What is a Bit-Flip？ Quantum Jargon Explained
✅ Added: What is a Hamiltonian？ Quantum Jargon Explained
✅ Added: What is a quantum gate？ #quantum #science #quantumcomputing
✅ Added: What is an Ansatz？ Quantum Jargon Explained
✅ Added: What is Quantum Adva