In [1]:
import pandas as pd
import json
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import (
    TranscriptsDisabled, VideoUnavailable, NoTranscriptFound
)
from googleapiclient.discovery import build
from tqdm import tqdm
import time
import os

# Set up the YouTube API client (you need a valid API key)
API_KEY = 'AIzaSyDeNpVV0yNsa2Th1AEvm8iBFGzZgphnYfY'  # Replace with your API key
youtube = build('youtube', 'v3', developerKey=API_KEY)

# Load the CSV containing new clickbait video IDs
df = pd.read_csv("clickbait.csv")

# Standardize the column name for consistency
df.rename(columns={"ID": "video_id"}, inplace=True)

# Add a 'transcript' column if it's not already there
if 'transcript' not in df.columns:
    df['transcript'] = None

# Save results to a separate file from the original dataset
json_filename = "clickbait_extra_transcripts.json"

# Function to fetch video details using YouTube API
def get_video_details(video_id):
    try:
        # Retrieve video details using YouTube API
        request = youtube.videos().list(
            part="snippet,statistics",
            id=video_id
        )
        response = request.execute()

        # Extract relevant video details
        video_details = response['items'][0]
        snippet = video_details['snippet']
        statistics = video_details['statistics']

        channel_id = snippet['channelId']
        channel_name = snippet['channelTitle']
        channel_subscribers = get_channel_subscribers(channel_id)
        channel_videos = get_channel_videos(channel_id)
        channel_views = get_channel_views(channel_id)

        video_comments = statistics.get('commentCount', 0)
        video_dislikes = statistics.get('dislikeCount', 0)
        video_likes = statistics.get('likeCount', 0)
        video_views = statistics.get('viewCount', 0)
        video_title = snippet['title']

        return {
            'channel_id': channel_id,
            'channel_name': channel_name,
            'channel_subscribers': channel_subscribers,
            'channel_videos': channel_videos,
            'channel_views': channel_views,
            'video_comments': video_comments,
            'video_dislikes': video_dislikes,
            'video_likes': video_likes,
            'video_views': video_views,
            'video_id': video_id,
            'video_title': video_title
        }
    except Exception as e:
        print(f"Error fetching details for video {video_id}: {e}")
        return None

# Function to fetch channel details like subscribers, videos, and views
def get_channel_subscribers(channel_id):
    try:
        request = youtube.channels().list(
            part="statistics",
            id=channel_id
        )
        response = request.execute()
        return response['items'][0]['statistics'].get('subscriberCount', 0)
    except Exception as e:
        print(f"Error fetching channel subscribers for {channel_id}: {e}")
        return 0

def get_channel_videos(channel_id):
    try:
        request = youtube.channels().list(
            part="contentDetails",
            id=channel_id
        )
        response = request.execute()
        return len(response['items'][0]['contentDetails']['relatedPlaylists']['uploads'])
    except Exception as e:
        print(f"Error fetching channel videos for {channel_id}: {e}")
        return 0

def get_channel_views(channel_id):
    try:
        request = youtube.channels().list(
            part="statistics",
            id=channel_id
        )
        response = request.execute()
        return response['items'][0]['statistics'].get('viewCount', 0)
    except Exception as e:
        print(f"Error fetching channel views for {channel_id}: {e}")
        return 0

# Function to fetch transcript with error handling
def get_transcript(video_id):
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        return " ".join([entry['text'] for entry in transcript])
    except (TranscriptsDisabled, VideoUnavailable, NoTranscriptFound):
        return None
    except Exception:
        return None

# Filter for videos that haven't been processed yet
unscraped_videos = df[df['transcript'].isna()]
print(f"Transcript scraping for {len(unscraped_videos)} videos.")

# Track previously saved video IDs to avoid duplication
saved_video_ids = set()
if os.path.exists(json_filename):
    with open(json_filename, "r") as json_file:
        for line in json_file:
            try:
                entry = json.loads(line.strip())
                saved_video_ids.add(entry["video_id"])
            except json.JSONDecodeError:
                continue

# Scrape transcripts and save all information incrementally
with open(json_filename, "a") as json_file:
    for i, row in tqdm(unscraped_videos.iterrows(), total=len(unscraped_videos)):
        video_id = row['video_id']

        # Skip videos that have already been saved
        if video_id in saved_video_ids:
            continue

        # Get all video details (including transcript)
        video_details = get_video_details(video_id)
        if not video_details:
            continue

        # Get the transcript
        transcript = get_transcript(video_id)
        video_details['transcript'] = transcript

        # Save to JSON file
        row_dict = {**row.to_dict(), **video_details}
        json_file.write(json.dumps(row_dict) + "\n")
        json_file.flush()

        # Wait to avoid rate-limiting by YouTube
        time.sleep(1)

# Save final DataFrame as a backup
df.to_pickle("clickbait_extra_transcripts.pkl")

print("✅ Scraping complete. Data saved to 'clickbait_extra_transcripts.json' and 'clickbait_extra_transcripts.pkl'.")


Transcript scraping for 101 videos.


  1%|          | 1/101 [00:00<00:45,  2.18it/s]

Error fetching details for video 5WTXHdc1_zI: list index out of range


  2%|▏         | 2/101 [00:02<02:18,  1.40s/it]

Error fetching details for video aKkRwPGSWGQ: list index out of range


 23%|██▎       | 23/101 [00:41<01:33,  1.19s/it]

Error fetching details for video t889ifadAMo: list index out of range
Error fetching details for video C7QwaapFRQs: list index out of range
Error fetching details for video hfFGTVZNjis: list index out of range


 26%|██▌       | 26/101 [00:41<00:46,  1.62it/s]

Error fetching details for video DPCeF7ifQJU: list index out of range
Error fetching details for video pKg8GJ7dIug: list index out of range


 29%|██▊       | 29/101 [00:42<00:23,  3.07it/s]

Error fetching details for video ck1EQ-vmmc4: list index out of range
Error fetching details for video ck1EQ-vmmc4: list index out of range
Error fetching details for video 3-U-PUspr50: list index out of range


 32%|███▏      | 32/101 [00:42<00:13,  5.10it/s]

Error fetching details for video bas_0jmmgI8: list index out of range
Error fetching details for video spjY7VnFrnM: list index out of range
Error fetching details for video oqq5aXBg3FU: list index out of range


 34%|███▎      | 34/101 [00:42<00:10,  6.16it/s]

Error fetching details for video QwqiXWkKpbw: list index out of range
Error fetching details for video XFlBwuu_p6k: list index out of range
Error fetching details for video SJNZ1PJXxg4: list index out of range


 52%|█████▏    | 53/101 [01:20<01:40,  2.10s/it]

Error fetching details for video 4XnMaufLlEk: list index out of range


 81%|████████  | 82/101 [02:15<00:21,  1.13s/it]

Error fetching details for video ylgjWFf6ssY: list index out of range
Error fetching details for video aHPKWjTmDEU: list index out of range
Error fetching details for video yVaMtCi9Xw0: list index out of range


 83%|████████▎ | 84/101 [02:17<00:19,  1.14s/it]

Error fetching details for video yVaMtCi9Xw0: list index out of range


 89%|████████▉ | 90/101 [02:22<00:07,  1.52it/s]

Error fetching details for video prcK6Jt9STQ: list index out of range
Error fetching details for video LCaxzn0geyQ: list index out of range
Error fetching details for video 7Z0qOH9zqoU: list index out of range


100%|██████████| 101/101 [02:43<00:00,  1.62s/it]

Error fetching details for video 0uocETPj4Jx4: list index out of range
✅ Scraping complete. Data saved to 'clickbait_extra_transcripts.json' and 'clickbait_extra_transcripts.pkl'.



