In [None]:
import json
import os
import re
from yt_dlp import YoutubeDL

# To extract YouTube video ID from URL
def extract_video_id(url):
    """Return the YouTube video ID embedded in *url*, or ``None``.

    Two URL shapes are recognised: the ``v=`` query parameter used by
    watch URLs, and the first path segment of ``youtu.be`` short links.

    Args:
        url: URL string to inspect. Non-string values yield ``None``.

    Returns:
        The video ID as a string, or ``None`` when no ID can be found.
    """
    if not isinstance(url, str):
        return None
    # Anchor on the start of the string, "?" or "&" so that only a parameter
    # named exactly "v" matches (not e.g. "rv="), and stop at "#" so a URL
    # fragment is not swallowed into the ID.
    m = re.search(r"(?:^|[?&])v=([^&#]+)", url)
    if m:
        return m.group(1)
    # Short links carry the ID as a path segment; stop at "/", "?", "&", "#".
    m = re.search(r"youtu\.be/([^?&#/]+)", url)
    if m:
        return m.group(1)
    return None

# Sanitize titles for filenames
def sanitize_filename(name):
    """Return *name* with characters that are unsafe in file names replaced.

    Each of ``\\ / : * ? " < > |`` and every ASCII control character
    (code points 0-31, e.g. NUL, tab, newline) is replaced by an underscore.
    All other characters are kept as they are.

    Args:
        name: Text to turn into a file-name component.

    Returns:
        The sanitised string; it has the same length as *name*.
    """
    # Control characters are included because NUL makes open() fail and the
    # others are rejected by Windows, which the reserved set already targets.
    return re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", name)

# Load video URLs from JSON search results
def load_video_urls(path):
    """Collect the ``link`` values stored in a JSON search-results file.

    The file is read as UTF-8 JSON and iterated item by item; items whose
    ``link`` is missing or falsy are left out. Assumes the top-level JSON
    value is an array of objects.

    Args:
        path: Location of the JSON file.

    Returns:
        A list of the non-empty ``link`` values, in file order.
    """
    with open(path, "r", encoding="utf-8") as handle:
        entries = json.load(handle)

    links = []
    for entry in entries:
        link = entry.get("link")
        if link:
            links.append(link)
    return links

# Build parent-reply hierarchy in comments
def structure_comments(comments):
    """Arrange a flat list of comment dicts into a parent/reply tree.

    A comment whose ``parent_id`` matches the ``id`` of another comment in
    the list is appended to that parent's ``replies`` list (created on
    demand, so the input dicts are modified in place). Every other comment
    is treated as top-level.

    Args:
        comments: List of dicts, each optionally carrying ``id`` and
            ``parent_id`` keys.

    Returns:
        The list of top-level comment dicts, in their original order.
    """
    # Index every comment that has a usable id so parents can be looked up.
    by_id = {}
    for comment in comments:
        if comment.get('id'):
            by_id[comment['id']] = comment

    top_level = []
    for comment in comments:
        parent_key = comment.get('parent_id')
        # No parent reference, or a parent that is not in this list.
        if not parent_key or parent_key not in by_id:
            top_level.append(comment)
            continue
        by_id[parent_key].setdefault('replies', []).append(comment)
    return top_level


def main(input_path='videossearch-aliceinwonderland.json', output_dir="output"):
    """Scrape metadata and comments for every video listed in a JSON file.

    For each URL returned by ``load_video_urls`` the video ID is extracted,
    the video's info and comments are fetched with yt-dlp (nothing is
    downloaded), and a JSON file named ``<sanitized title>_<video id>.json``
    is written to *output_dir*. A one-line summary is printed per video.
    URLs without a recognisable ID and videos whose extraction fails are
    reported and skipped, so one bad entry does not abort the batch.

    Args:
        input_path: JSON search-results file to read the video links from.
        output_dir: Directory for the per-video JSON files; created when
            missing.
    """
    # Imported here so the block does not depend on a change to the
    # file-level imports; yt_dlp itself is already a dependency of the file.
    from yt_dlp.utils import DownloadError

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Read URLs
    urls = load_video_urls(input_path)

    # Scrape metadata + comments; one extractor instance serves all videos.
    ydl_opts = {"getcomments": True}
    with YoutubeDL(ydl_opts) as ydl:
        for url in urls:
            vid = extract_video_id(url)
            if not vid:
                print(f"Invalid URL skipped: {url}")
                continue
            video_url = f"https://www.youtube.com/watch?v={vid}"

            try:
                info = ydl.extract_info(video_url, download=False)
            except DownloadError as err:
                # Private, removed or otherwise unavailable video.
                print(f"Extraction failed, skipped: {video_url} ({err})")
                continue
            if not info:
                print(f"No data returned, skipped: {video_url}")
                continue

            # Meta fields
            meta = {
                "Title": info.get("title"),
                "Description": info.get("description"),
                "Uploader": info.get("uploader"),
                "View Count": info.get("view_count"),
                "Likes": info.get("like_count"),
                "Dislikes": info.get("dislike_count"),
                "Thumbnail": info.get("thumbnail"),
                "Tags": info.get("tags"),
            }

            # Process comments: keep only the fields that are exported.
            raw = info.get("comments") or []
            comments = [
                {
                    "id": c.get("id"),
                    "parent_id": c.get("parent"),
                    "author": c.get("author"),
                    "text": c.get("text"),
                    "timestamp": c.get("timestamp"),
                }
                for c in raw
            ]
            structured = structure_comments(comments)
            meta["Comments"] = structured

            # Statistics
            top_comments = len(structured)
            replies = sum(len(c.get("replies", [])) for c in structured)
            unique_authors = len({c.get("author") for c in comments if c.get("author")})

            # Save JSON. `or vid` also covers a title that is present but
            # None/empty, which a .get() default alone would not catch.
            title_safe = sanitize_filename(info.get("title") or vid)
            outfile = os.path.join(output_dir, f"{title_safe}_{vid}.json")
            with open(outfile, "w", encoding="utf-8") as fout:
                json.dump(meta, fout, ensure_ascii=False, indent=2)

            print(f"{outfile}: {top_comments} comments, {replies} replies, {unique_authors} unique authors")

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

[youtube] Extracting URL: https://www.youtube.com/watch?v=9POCgSRVvf0
[youtube] 9POCgSRVvf0: Downloading webpage
[youtube] 9POCgSRVvf0: Downloading tv client config
[youtube] 9POCgSRVvf0: Downloading tv player API JSON
[youtube] 9POCgSRVvf0: Downloading ios player API JSON
[youtube] 9POCgSRVvf0: Downloading m3u8 information
[youtube] Downloading comment section API JSON
[youtube] Downloading ~1412 comments
[youtube] Sorting comments by newest first
[youtube] Downloading comment API JSON page 1 (0/~1412)
[youtube] Downloading comment API JSON page 2 (20/~1412)
[youtube] Downloading comment API JSON page 3 (40/~1412)
[youtube] Downloading comment API JSON page 4 (60/~1412)
[youtube] Downloading comment API JSON page 5 (80/~1412)
[youtube] Downloading comment API JSON page 6 (100/~1412)
[youtube] Downloading comment API JSON page 7 (120/~1412)
[youtube] Downloading comment API JSON page 8 (140/~1412)
[youtube] Downloading comment API JSON page 9 (159/~1412)
[youtube] Downloading comment A



[youtube] Downloading comment API JSON page 39 (759/~1412)
[youtube] Downloading comment API JSON page 40 (779/~1412)
[youtube] Downloading comment API JSON page 41 (799/~1412)
[youtube] Downloading comment API JSON page 42 (819/~1412)
[youtube] Downloading comment API JSON page 43 (839/~1412)
[youtube] Downloading comment API JSON page 44 (859/~1412)
[youtube] Downloading comment API JSON page 45 (879/~1412)
[youtube] Downloading comment API JSON page 46 (899/~1412)
[youtube] Downloading comment API JSON page 47 (919/~1412)
[youtube] Downloading comment API JSON page 48 (939/~1412)
[youtube] Downloading comment API JSON page 49 (959/~1412)
[youtube] Downloading comment API JSON page 50 (979/~1412)
[youtube] Downloading comment API JSON page 51 (999/~1412)
[youtube] Downloading comment API JSON page 52 (1019/~1412)
[youtube] Downloading comment API JSON page 53 (1039/~1412)
[youtube] Downloading comment API JSON page 54 (1059/~1412)
[youtube] Downloading comment API JSON page 55 (1079/



[youtube]     Downloading comment API JSON reply thread 3 (3311/~4553)
[youtube] Downloading comment API JSON page 120 (3319/~4553)
[youtube]     Downloading comment API JSON reply thread 1 (3320/~4553)
[youtube]        Downloading comment replies API JSON page 1 (3330/~4553)
[youtube] Downloading comment API JSON page 121 (3353/~4553)
[youtube]     Downloading comment API JSON reply thread 1 (3361/~4553)
[youtube]     Downloading comment API JSON reply thread 2 (3365/~4553)
[youtube]     Downloading comment API JSON reply thread 3 (3374/~4553)
[youtube] Downloading comment API JSON page 122 (3390/~4553)
[youtube] Downloading comment API JSON page 123 (3410/~4553)
[youtube]     Downloading comment API JSON reply thread 1 (3415/~4553)
[youtube]     Downloading comment API JSON reply thread 2 (3419/~4553)
[youtube] Downloading comment API JSON page 124 (3433/~4553)
[youtube]     Downloading comment API JSON reply thread 1 (3451/~4553)
[youtube]     Downloading comment API JSON reply thre



[youtube] Downloading comment API JSON page 140 (3987/~7728)
[youtube]     Downloading comment API JSON reply thread 1 (3990/~7728)
[youtube]     Downloading comment API JSON reply thread 2 (4010/~7728)
[youtube] Downloading comment API JSON page 141 (4018/~7728)
[youtube]     Downloading comment API JSON reply thread 1 (4037/~7728)
[youtube] Downloading comment API JSON page 142 (4042/~7728)
[youtube]     Downloading comment API JSON reply thread 1 (4048/~7728)
[youtube]     Downloading comment API JSON reply thread 2 (4058/~7728)
[youtube] Downloading comment API JSON page 143 (4066/~7728)
[youtube]     Downloading comment API JSON reply thread 1 (4076/~7728)
[youtube]     Downloading comment API JSON reply thread 2 (4092/~7728)
[youtube]        Downloading comment replies API JSON page 1 (4102/~7728)
[youtube] Downloading comment API JSON page 144 (4105/~7728)
[youtube]     Downloading comment API JSON reply thread 1 (4113/~7728)
[youtube]        Downloading comment replies API JSON