In [1]:
import json
import os
import re
from yt_dlp import YoutubeDL

# Utility to extract YouTube video ID from URL
def extract_video_id(url):
    """Return the YouTube video ID contained in *url*, or ``None``.

    Two URL shapes are recognised:

    * a ``v=<id>`` query parameter (``https://www.youtube.com/watch?v=<id>``)
    * an ID carried in the path: ``youtu.be/<id>`` and
      ``youtube.com/shorts|embed|live|v/<id>``

    The ID stops at the first ``&``, ``#``, ``?``, ``/`` or whitespace so that
    extra parameters, fragments and trailing slashes never leak into it.
    Non-string input yields ``None`` instead of raising.
    """
    if not isinstance(url, str):
        return None

    # Require "v" to be a whole parameter name (start of string or right after
    # "?"/"&"); otherwise parameters such as "rev=12" would be mistaken for it.
    m = re.search(r"(?:^|[?&])v=([^&#\s]+)", url)
    if m:
        return m.group(1)

    # Path-style links: short links plus the shorts/embed/live/v endpoints.
    m = re.search(
        r"(?:youtu\.be|youtube(?:-nocookie)?\.com/(?:shorts|embed|live|v))"
        r"/([^?&#/\s]+)",
        url,
    )
    if m:
        return m.group(1)
    return None

# Sanitize titles for filenames
def sanitize_filename(name):
    """Return *name* with characters that are unsafe in file names replaced.

    Every path separator (forward or backward slash), each of the characters
    ``: * ? " < > |`` and every ASCII control character (code points 0-31,
    e.g. newline or tab) is replaced by a single underscore. All other
    characters, including spaces and non-ASCII text, are kept as they are.
    """
    # Control characters are rejected by Windows just like the punctuation
    # above, and video titles can contain stray tabs or line breaks.
    return re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", name)

# Load video URLs from JSON search results
def load_video_urls(path):
    """Read the JSON file at *path* and return the non-empty ``link`` values.

    The file is decoded as UTF-8 and its top-level value is iterated; each
    entry is queried with ``.get("link")``. Entries whose link is missing or
    falsy (``None``, empty string) are left out. Order is preserved.
    """
    with open(path, "r", encoding="utf-8") as handle:
        entries = json.load(handle)

    links = []
    for entry in entries:
        link = entry.get("link")
        if link:
            links.append(link)
    return links

# Build parent-reply hierarchy in comments
def structure_comments(comments):
    """Nest replies under their parent comment and return the top-level ones.

    Each comment is a dict read through ``id`` and ``parent_id``. A comment
    whose ``parent_id`` is truthy and matches the ``id`` of another comment in
    *comments* is appended to that parent's ``replies`` list (created on first
    use). Every other comment is returned as a root, in input order.

    The comment dicts are modified in place: parents gain a ``replies`` key.
    """
    # Index comments by id; entries without a truthy id cannot be parents.
    by_id = {}
    for comment in comments:
        if comment.get('id'):
            by_id[comment['id']] = comment

    top_level = []
    for comment in comments:
        parent_key = comment.get('parent_id')
        is_reply = bool(parent_key) and parent_key in by_id
        if not is_reply:
            top_level.append(comment)
            continue
        parent = by_id[parent_key]
        parent.setdefault('replies', []).append(comment)
    return top_level


def main(input_path='videossearch-snowwhite.json', output_dir="output"):
    """Scrape metadata and comments for every video listed in *input_path*.

    For each usable URL the video is queried through yt-dlp (no media is
    downloaded), its comments are nested into a parent/reply tree, and the
    result is written to ``<output_dir>/<sanitized title>_<video id>.json``.
    A one-line summary is printed per video.

    Args:
        input_path: JSON file of search results; read with ``load_video_urls``.
        output_dir: Directory for the per-video JSON files; created if missing.

    URLs without a recognisable video ID, repeated video IDs, and videos that
    yt-dlp fails to extract are reported and skipped so that one bad entry
    does not abort the whole batch.
    """
    # Local import: the module itself only imports YoutubeDL from yt_dlp.
    from yt_dlp.utils import DownloadError

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Read URLs
    urls = load_video_urls(input_path)

    # IDs already handled; the same video may appear under several URL forms.
    seen_ids = set()

    # The options never change, so one YoutubeDL instance serves all videos.
    ydl_opts = {"getcomments": True}
    with YoutubeDL(ydl_opts) as ydl:
        for url in urls:
            vid = extract_video_id(url)
            if not vid:
                print(f"Invalid URL skipped: {url}")
                continue
            if vid in seen_ids:
                print(f"Duplicate video skipped: {url}")
                continue
            seen_ids.add(vid)
            video_url = f"https://www.youtube.com/watch?v={vid}"

            # Scrape metadata + comments
            try:
                info = ydl.extract_info(video_url, download=False)
            except DownloadError as exc:
                # Private, removed or region-locked videos end up here.
                print(f"Failed to scrape {video_url}: {exc}")
                continue
            if not info:
                print(f"No data returned for {video_url}")
                continue

            # Meta fields
            meta = {
                "Title": info.get("title"),
                "Description": info.get("description"),
                "Uploader": info.get("uploader"),
                "View Count": info.get("view_count"),
                "Likes": info.get("like_count"),
                "Dislikes": info.get("dislike_count"),
                "Thumbnail": info.get("thumbnail"),
                "Tags": info.get("tags"),
            }

            # Process comments: keep only the fields needed for the tree.
            comments = [
                {
                    "id": c.get("id"),
                    "parent_id": c.get("parent"),
                    "author": c.get("author"),
                    "text": c.get("text"),
                }
                for c in info.get("comments") or []
            ]
            structured = structure_comments(comments)
            meta["Comments"] = structured

            # Statistics
            top_comments = len(structured)
            replies = sum(len(c.get("replies", [])) for c in structured)
            unique_authors = len(
                {c.get("author") for c in comments if c.get("author")}
            )

            # Save JSON. "or vid" also covers a title that is present but
            # None/empty, which a .get() default would not catch.
            title_safe = sanitize_filename(info.get("title") or vid)
            outfile = os.path.join(output_dir, f"{title_safe}_{vid}.json")
            with open(outfile, "w", encoding="utf-8") as fout:
                json.dump(meta, fout, ensure_ascii=False, indent=2)

            print(f"{outfile}: {top_comments} comments, {replies} replies, {unique_authors} unique authors")

# Run the scraper only when executed as a script, not when imported.
if "__main__" == __name__:
    main()


[youtube] Extracting URL: https://www.youtube.com/watch?v=iV46TJKL8cU
[youtube] iV46TJKL8cU: Downloading webpage
[youtube] iV46TJKL8cU: Downloading tv client config
[youtube] iV46TJKL8cU: Downloading tv player API JSON
[youtube] iV46TJKL8cU: Downloading ios player API JSON
[youtube] iV46TJKL8cU: Downloading m3u8 information
[youtube] Downloading comment section API JSON
[youtube] Downloading ~125827 comments
[youtube] Sorting comments by newest first
[youtube] Downloading comment API JSON page 1 (0/~125827)
[youtube] Downloading comment API JSON page 2 (20/~125827)
[youtube] Downloading comment API JSON page 3 (40/~125827)
[youtube]     Downloading comment API JSON reply thread 1 (45/~125827)
[youtube]     Downloading comment API JSON reply thread 2 (47/~125827)
[youtube]     Downloading comment API JSON reply thread 3 (51/~125827)
[youtube]     Downloading comment API JSON reply thread 4 (58/~125827)
[youtube]     Downloading comment API JSON reply thread 5 (66/~125827)
[youtube] Down