In [7]:
import pandas as pd
import json
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import (
    TranscriptsDisabled, VideoUnavailable, NoTranscriptFound
)
from tqdm import tqdm
import time
import os

# Load dataset with existing transcripts if available;
# otherwise, load the base clickbait dataset.
try:
    df = pd.read_pickle("clickbait_with_transcripts.pkl")
    print("Resuming from existing dataset with transcripts.")
except FileNotFoundError:
    df = pd.read_pickle("clickbait-df")

# Ensure there's a column for transcripts
if 'transcript' not in df.columns:
    df['transcript'] = None

# File to save results incrementally (NDJSON format)
json_filename = "clickbait_with_transcripts.json"

# Function to fetch transcripts with error handling
def get_transcript(video_id):
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        return " ".join([entry['text'] for entry in transcript])
    except (TranscriptsDisabled, VideoUnavailable, NoTranscriptFound):
        # Return None for videos with no valid transcript
        return None
    except Exception:
        # Handle unexpected errors gracefully
        return None

# Identify videos that still need transcripts
unscraped_videos = df[df['transcript'].isna()]

print(f"Resuming transcript scraping for {len(unscraped_videos)} videos (all).")

# Track already saved video IDs to prevent duplicate processing
saved_video_ids = set()

# If the JSON file exists, read saved video IDs from it (NDJSON: one JSON per line)
if os.path.exists(json_filename):
    with open(json_filename, "r") as json_file:
        for line in json_file:
            try:
                entry = json.loads(line.strip())
                saved_video_ids.add(entry["video_id"])
            except json.JSONDecodeError:
                continue

# Open JSON file in append mode to add new entries immediately
with open(json_filename, "a") as json_file:
    for i, row in tqdm(unscraped_videos.iterrows(), total=len(unscraped_videos)):
        video_id = row['video_id']

        # Skip if already saved
        if video_id in saved_video_ids:
            continue

        # Fetch transcript
        transcript = get_transcript(video_id)

        # Skip saving if no transcript was found
        if transcript is None:
            continue

        # Update the DataFrame
        df.at[i, 'transcript'] = transcript

        # Convert the row to a dictionary and add transcript
        row_dict = row.to_dict()
        row_dict["transcript"] = transcript

        # Write this JSON object as a new line (NDJSON)
        json_file.write(json.dumps(row_dict) + "\n")
        json_file.flush()  # Ensure data is written immediately

        # Prevent rate limiting
        time.sleep(1)

# Save the updated DataFrame in pickle format as well
df.to_pickle("clickbait_with_transcripts.pkl")

print(f"Updated transcripts saved in '{json_filename}' and 'clickbait_with_transcripts.pkl'.")


Resuming transcript scraping for 18317 videos (all).


100%|██████████| 18317/18317 [2:29:26<00:00,  2.04it/s]   

Updated transcripts saved in 'clickbait_with_transcripts.json' and 'clickbait_with_transcripts.pkl'.



