In [None]:
#%pip install google-api-python-client

In [None]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import pandas as pd
import numpy as np
import json

# YouTube Data API keys used for the scrape. init_youtube() starts with
# API_KEYS[0] and switch_to_next_key() rotates through the list, so at least
# one key must be added here before running.
API_KEYS = [
]

# Input CSV read by main(). Columns named "channel name", "channel id",
# "video name" and "video id" are renamed to their underscore forms; an
# optional "Comment Count" column is used to order each channel's videos.
CSV_FILE = "1697_channels_to_scrape.csv"
# File that save_numpy() writes the collected rows to.
OUTPUT_NPY = "youtube_comments.npy"
# Upper bound on comments collected per channel, summed over its videos.
MAX_COMMENTS_PER_CHANNEL = 20000
# Page size sent as maxResults on each commentThreads.list request
# (100 is the documented maximum for that endpoint).
MAX_RESULTS_PER_REQUEST = 100

# Index into API_KEYS of the key the current client was built with.
current_key_index = 0
# Active API client; stays None until init_youtube() builds it.
youtube = None

# Accumulated output rows; main() clears this list and then appends to it.
all_rows = []   # [channel_name, channel_id, video_name, video_id, author, comment_text, likes, time, comment_id]


def build_youtube_service(api_key):
    """Create a YouTube Data API v3 client that authenticates with ``api_key``."""
    service = build("youtube", "v3", developerKey=api_key)
    return service


def print_current_key():
    """Print the 1-based position of the active API key and the total key count."""
    message = f"Using API key #{current_key_index + 1}/{len(API_KEYS)}"
    print(message)


def init_youtube():
    """Reset key rotation to the first API key and build the global client.

    Sets ``current_key_index`` to 0, rebuilds the module-level ``youtube``
    client from ``API_KEYS[0]`` and prints which key is active.

    Raises:
        RuntimeError: if ``API_KEYS`` is empty, so the misconfiguration is
            reported clearly instead of as a bare ``IndexError``.
    """
    global youtube, current_key_index
    if not API_KEYS:
        # Without this guard an empty key list fails with
        # "list index out of range", which hides the actual problem.
        raise RuntimeError(
            "API_KEYS is empty: add at least one YouTube Data API key before running."
        )
    current_key_index = 0
    youtube = build_youtube_service(API_KEYS[current_key_index])
    print_current_key()


def switch_to_next_key():
    """Advance to the next API key, wrapping to the first after the last one.

    Updates ``current_key_index``, rebuilds the module-level ``youtube``
    client with the newly selected key and prints which key is now active.
    """
    global youtube, current_key_index
    next_index = current_key_index + 1
    current_key_index = next_index % len(API_KEYS)
    active_key = API_KEYS[current_key_index]
    youtube = build_youtube_service(active_key)
    print_current_key()


def is_fatal_key_error(http_error: HttpError) -> bool:
    """Tell whether ``http_error`` means the current API key should be abandoned.

    The JSON error body is inspected first: if the first entry of
    ``error.errors`` carries one of the known reasons the answer is True.
    When the body cannot be parsed, or that entry does not match, the error's
    string form is searched for any of the same reason names.
    """
    fatal_reasons = {
        "quotaExceeded",
        "dailyLimitExceeded",
        "keyInvalid",
        "ipRefererBlocked",
        "rateLimitExceeded",
    }

    try:
        payload = json.loads(http_error.content.decode("utf-8"))
        details = payload.get("error", {}).get("errors", [])
        if details and details[0].get("reason", "") in fatal_reasons:
            return True
    except Exception:
        # Best effort only: an unreadable body falls through to the text scan.
        pass

    text = str(http_error)
    for reason in fatal_reasons:
        if reason in text:
            return True
    return False


def comment_threads_list_with_rotation(**kwargs):
    """Call ``commentThreads().list(**kwargs)``, rotating API keys on key-level errors.

    The request is sent with the current global client. An ``HttpError`` that
    ``is_fatal_key_error`` does not classify as key-related is re-raised
    unchanged. Otherwise the failure is logged, the next key is activated and
    the request is retried.

    Raises:
        RuntimeError: once as many key-level failures have occurred during
            this call as there are entries in ``API_KEYS``.
        HttpError: for any error that is not key-related.
    """
    failed_keys = 0

    while True:
        try:
            return youtube.commentThreads().list(**kwargs).execute()
        except HttpError as e:
            if not is_fatal_key_error(e):
                raise

            print(
                f"[API key #{current_key_index + 1}] "
                f"HttpError (fatal for this key): {e}"
            )
            failed_keys += 1
            if failed_keys >= len(API_KEYS):
                raise RuntimeError(
                    "T·∫•t c·∫£ API key ƒë√£ h·∫øt quota ho·∫∑c g·∫∑p l·ªói kh√¥ng th·ªÉ d√πng."
                )
            # Retry the same request with the next key.
            switch_to_next_key()


def save_numpy(rows, path=None):
    """Save ``rows`` to disk as an object-dtype NumPy array.

    Args:
        rows: Sequence of row lists, such as the module-level ``all_rows``.
        path: Destination file. Defaults to ``OUTPUT_NPY`` when omitted, so
            existing ``save_numpy(rows)`` calls behave as before. Passing a
            path allows backups without duplicating this function.
    """
    target = OUTPUT_NPY if path is None else path
    data_array = np.array(rows, dtype=object)
    np.save(target, data_array)
    print(f"Saved numpy file: {target} with shape {data_array.shape}")


def fetch_comments_for_video(video_id, max_comments_for_this_video):
    """Collect up to ``max_comments_for_this_video`` top-level comments of a video.

    Pages through ``commentThreads.list`` (plain-text format) until the limit
    is reached, a page comes back empty, or there is no further page token.

    Returns:
        A list of ``(author, text, like_count, time, comment_id)`` tuples,
        where line breaks inside ``text`` have been replaced by spaces.
    """
    collected = []
    page_token = None

    while True:
        still_needed = max_comments_for_this_video - len(collected)
        if still_needed <= 0:
            break

        page = comment_threads_list_with_rotation(
            part="snippet",
            videoId=video_id,
            # Never request more than is still missing for this video.
            maxResults=min(MAX_RESULTS_PER_REQUEST, still_needed),
            pageToken=page_token,
            textFormat="plainText",
        )

        threads = page.get("items", [])
        if not threads:
            break

        for thread in threads:
            top_level = thread.get("snippet", {}).get("topLevelComment", {})
            details = top_level.get("snippet", {})

            text = details.get("textDisplay", "") or details.get("textOriginal", "")
            # Flatten the comment to a single line; "\r\n" goes first so a
            # Windows line break becomes one space rather than two.
            for line_break in ("\r\n", "\n", "\r"):
                text = text.replace(line_break, " ")

            collected.append(
                (
                    details.get("authorDisplayName", ""),
                    text,
                    details.get("likeCount", 0),
                    details.get("publishedAt") or details.get("updatedAt", ""),
                    top_level.get("id", ""),
                )
            )

            if len(collected) >= max_comments_for_this_video:
                break

        page_token = page.get("nextPageToken")
        if not page_token:
            break

    return collected


def main():
    """Scrape comments for every channel listed in ``CSV_FILE`` and save them.

    Steps:
      1. Clear ``all_rows`` and build the API client with the first key.
      2. Read the CSV, normalise its column names and group rows by channel.
      3. For each channel, walk its videos (most-commented first when a
         "Comment Count" column exists) and fetch comments until
         ``MAX_COMMENTS_PER_CHANNEL`` is reached.
      4. Save everything with ``save_numpy`` and print the totals.

    A ``RuntimeError`` while fetching stops the run early: the rows gathered
    so far are saved, the current counts are printed and the function
    returns. Any other exception skips only the video that raised it.
    """
    global all_rows
    all_rows.clear()  # start every run from an empty accumulator

    init_youtube()

    column_aliases = {
        "channel name": "channel_name",
        "channel id": "channel_id",
        "video name": "video_name",
        "video id": "video_id",
    }
    df = pd.read_csv(CSV_FILE).rename(columns=column_aliases)

    total_comments = 0

    for (channel_id, channel_name), group in df.groupby(
        ["channel_id", "channel_name"], sort=False
    ):
        channel_comment_count = 0

        # Visit the most-commented videos first when the CSV provides counts.
        has_counts = "Comment Count" in group.columns
        ordered_videos = (
            group.sort_values("Comment Count", ascending=False) if has_counts else group
        )

        for _, row in ordered_videos.iterrows():
            remaining_for_channel = MAX_COMMENTS_PER_CHANNEL - channel_comment_count
            if remaining_for_channel <= 0:
                break

            video_name, video_id = row["video_name"], row["video_id"]

            try:
                video_comments = fetch_comments_for_video(video_id, remaining_for_channel)
            except RuntimeError as e:
                # No usable API key is left: persist what we have and stop.
                print(f"FATAL ERROR (API keys) khi x·ª≠ l√Ω video {video_id}: {e}")
                save_numpy(all_rows)
                print(f"Channel {channel_name} ({channel_id}): {channel_comment_count} comments")
                print(f"TOTAL COMMENTS: {total_comments}")
                return
            except Exception as e:
                # Any other failure only costs this one video.
                print(f"ERROR khi x·ª≠ l√Ω video {video_id}: {e}")
                continue

            all_rows.extend(
                [
                    channel_name,
                    channel_id,
                    video_name,
                    video_id,
                    author,
                    comment_text,
                    like_count,
                    time_str,
                    comment_id,
                ]
                for author, comment_text, like_count, time_str, comment_id in video_comments
            )

            fetched = len(video_comments)
            channel_comment_count += fetched
            total_comments += fetched

        print(f"Channel {channel_name} ({channel_id}): {channel_comment_count} comments")

    save_numpy(all_rows)
    print(f"TOTAL COMMENTS: {total_comments}")


# Start the scrape when this code is executed directly. In a notebook kernel
# __name__ is also "__main__", so running this cell launches the full run.
if __name__ == "__main__":
    main()


In [None]:
def save_numpy1(rows, path="youtube_comments_backup.npy"):
    """Write a backup copy of ``rows`` as an object-dtype NumPy array.

    Args:
        rows: Sequence of row lists, such as the module-level ``all_rows``.
        path: Destination file; defaults to the backup file name this cell
            has always written to.

    The confirmation message reports the file actually written. It used to
    print ``OUTPUT_NPY`` even though the data went to the backup file.
    """
    data_array = np.array(rows, dtype=object)
    np.save(path, data_array)
    print(f"Saved numpy file: {path} with shape {data_array.shape}")

# Dump whatever is currently in the global all_rows to the backup file
# (presumably used to rescue rows after an interrupted run — confirm).
save_numpy1(all_rows)

In [1]:
import numpy as np


# Reload the array written by save_numpy(). It is stored with dtype=object,
# so NumPy needs allow_pickle=True to read it.
# NOTE(review): unpickling can execute arbitrary code — only load files that
# this notebook produced itself.
data = np.load('youtube_comments.npy', allow_pickle=True)
print(data.shape)
print(data)

# Column order of each row as built by main():
# [channel_name, channel_id, video_name, video_id, author, comment_text, likes, time, comment_id]
# NOTE(review): the output recorded below has only 5 columns, so the file it
# came from appears to predate this 9-column layout — confirm before relying on it.


(13954012, 5)
[['21 Savage' 'UCOjEHmBKwdS7joWpW0VrXkg'
  '21 Savage - a lot (Official Video) ft. J. Cole' 'DmWWqogr_r8'
  'J Cole ruined this straight killed the vibe.. this would‚Äôve had billions of views']
 ['21 Savage' 'UCOjEHmBKwdS7joWpW0VrXkg'
  '21 Savage - a lot (Official Video) ft. J. Cole' 'DmWWqogr_r8'
  'Nov.14,2025??']
 ['21 Savage' 'UCOjEHmBKwdS7joWpW0VrXkg'
  '21 Savage - a lot (Official Video) ft. J. Cole' 'DmWWqogr_r8' 'Trash']
 ...
 ['Gesic' 'UCEyoqQQK4vWWZZ57banT6Kg'
  'Neha kakkar or Guru Randhava Comedy video | Talking Tom Comedy video ‡•§ ‡§¨‡§ø‡§≤‡•ç‡§≤‡•Ç ‡§ï‡•ã‡§Æ‡•á‡§°‡•Ä ‡§ï‡•â‡§≤'
  '6aKpJAC_69s' 'Nice']
 ['Gesic' 'UCEyoqQQK4vWWZZ57banT6Kg'
  'Neha kakkar or Guru Randhava Comedy video | Talking Tom Comedy video ‡•§ ‡§¨‡§ø‡§≤‡•ç‡§≤‡•Ç ‡§ï‡•ã‡§Æ‡•á‡§°‡•Ä ‡§ï‡•â‡§≤'
  '6aKpJAC_69s' 'Beautifulüíì NiceüíìüíÉüé∂üï∫üëçü§©']
 ['Gesic' 'UCEyoqQQK4vWWZZ57banT6Kg'
  'Neha kakkar or Guru Randhava Comedy video | Talking Tom Comedy video ‡•§ ‡§¨‡§ø‡§≤‡•ç‡§≤‡•Ç ‡§ï‡