In [1]:
#connect to BigQuery
from google.cloud import bigquery
from google.cloud import storage
from googleapiclient.discovery import build
import json
from datetime import datetime
import time
import io

In [2]:
import os
from dotenv import load_dotenv
from pathlib import Path

parent_env_path = Path.cwd().parent / ".env"
load_dotenv(dotenv_path=parent_env_path)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
API_KEY = os.getenv("API_KEY_1")
project_id = os.getenv("PROJECT_ID")

In [3]:
#connect to prj_id, dataset_id = 2_cleaned_data, table_id: video_basic_info
client = bigquery.Client(project=project_id)
dataset_id = "2_cleaned_data"
table_id = "video_basic_info"
table_ref = client.dataset(dataset_id).table(table_id)




In [4]:
#querry and get the channel_id
query = f"""
SELECT distinct(channelId)
FROM `{project_id}.{dataset_id}.{table_id}`
"""


In [5]:
# Thực thi truy vấn
query_job = client.query(query)
results = query_job.result()

# Lưu kết quả vào danh sách
channel_list = [row.channelId for row in results]
len(channel_list)

4245

In [6]:
channel_list

['UCJ-lUkI7HFBXUG16RYQYWxw',
 'UCyj5OG4nbnQMnueBOvryi9g',
 'UCGODnMAAEhZeO0ROmB2EFBg',
 'UCcOIZzJgLCyMPILY7-1Vsdg',
 'UCDl5YK8CD2fY267VdDHLgfg',
 'UC_RovKmk0OCbuZjA8f08opw',
 'UCYjnt7fIBOjxCKF4m_DW3Ag',
 'UCOJp1lsu9vCF-TllwMzcCLg',
 'UCOXHwUny8BePng1gXxIe-PA',
 'UCdvVtDgpEXI8e2UzBJR-NSQ',
 'UCDkRmtg6Wt2jUsER5-xLRkQ',
 'UCo0wMFwDhVqmdI2BVnE4Qeg',
 'UCFGBxznLew-4-j0LESic87A',
 'UChYs-_zjKRYhdMddjx-NPLw',
 'UC5WjFrtBdufl6CZojX3D8dQ',
 'UCsBqUzFfHvNHEmUj-NUC26w',
 'UCYb4LjLNw3-K2B90ZTw7G7w',
 'UC907TPs-L1d-hgE_pH1HVmg',
 'UCR_gwC__rtlFBdNXdYXKy0A',
 'UCENw8EpouX_IGd8a_p-Px2g',
 'UCYhkv2Xbuj_FssJS74Eyn1w',
 'UCWajepQsMCmNBUHVIPirk8g',
 'UCUetYLN8ZnErkC0NmF0wfcA',
 'UCRVtlcqayOmyuLIDrT3ng1w',
 'UCE7spuKzxP1k8A-XucxxUAQ',
 'UCLuYADJ6hESLHX87JnsGbjA',
 'UC-B3lKAktiHyLwFv7TRq2xg',
 'UC9gXEfDwAt9x8DzV8Pz2Caw',
 'UCvViwWz0_m_lcHJEUFbIOlQ',
 'UCCK6b9W7w4dJ6ZtclFZdpsA',
 'UCaC3GsUi4NuY_IGVzhMA11A',
 'UCi8C7TNs2ohrc6hnRQ5Sn2w',
 'UCLF7ITqhCLwb9ZkZpyvyoUQ',
 'UC58znoFTA2_8qnXIkOC75JQ',
 'UC0IcuSzkill

In [10]:
#crawl snippet, statistics, contentDetails, topicDetails of each channel in channel_list, lưu crawl vào ytbdata/1_crawl/channel_raw_info/channel_raw_info{date}.json
youtube = build('youtube', 'v3', developerKey=API_KEY)

bucket_name = "ytbdata"
gcs_folder = "1_crawl"
date_str = datetime.today().strftime("%Y%m%d")
filename = f"channel_raw_info_{date_str}.json"
blob_path = f"{gcs_folder}/channel_raw_info/{filename}"

In [None]:
# ─── CRAWL CHANNEL INFO ────────────────────────────────────
raw_data = []

batch_size = 50
for i in range(0, len(channel_list), batch_size):
    batch = channel_list[i:i + batch_size]
    id_string = ",".join(batch)

    try:
        response = youtube.channels().list(
            part="snippet,statistics,contentDetails,topicDetails",
            id=id_string
        ).execute()

        raw_data.extend(response["items"])

        print(f"✅ Crawled batch {i} → {i + len(batch) - 1}")

        time.sleep(0.1)  # hạn chế bị quota exceeded

    except Exception as e:
        print(f"❌ Error with batch {i}-{i + len(batch) - 1}: {e}")
        continue

✅ Crawled batch 0 → 49
✅ Crawled batch 50 → 99
✅ Crawled batch 100 → 149
✅ Crawled batch 150 → 199
✅ Crawled batch 200 → 249
✅ Crawled batch 250 → 299
✅ Crawled batch 300 → 349
✅ Crawled batch 350 → 399
✅ Crawled batch 400 → 449
✅ Crawled batch 450 → 499
✅ Crawled batch 500 → 549
✅ Crawled batch 550 → 599
✅ Crawled batch 600 → 649
✅ Crawled batch 650 → 699
✅ Crawled batch 700 → 749
✅ Crawled batch 750 → 799
✅ Crawled batch 800 → 849
✅ Crawled batch 850 → 899
✅ Crawled batch 900 → 949
✅ Crawled batch 950 → 999
✅ Crawled batch 1000 → 1049
✅ Crawled batch 1050 → 1099
✅ Crawled batch 1100 → 1149
✅ Crawled batch 1150 → 1199
✅ Crawled batch 1200 → 1249
✅ Crawled batch 1250 → 1299
✅ Crawled batch 1300 → 1349
✅ Crawled batch 1350 → 1399
✅ Crawled batch 1400 → 1449
✅ Crawled batch 1450 → 1499
✅ Crawled batch 1500 → 1549
✅ Crawled batch 1550 → 1599
✅ Crawled batch 1600 → 1649
✅ Crawled batch 1650 → 1699
✅ Crawled batch 1700 → 1749
✅ Crawled batch 1750 → 1799
✅ Crawled batch 1800 → 1849
✅ Crawled

In [11]:
json_str = json.dumps(raw_data, indent=2, ensure_ascii=False)

# Upload to GCS
storage_client = storage.Client(project=project_id)
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(blob_path)

# Upload from string as file-like object
json_bytes = json_str.encode("utf-8")
blob.upload_from_file(io.BytesIO(json_bytes), content_type='application/json')

print(f"✅ Đã lưu {len(raw_data)} kênh vào GCS: gs://{bucket_name}/{blob_path}")



✅ Đã lưu 4230 kênh vào GCS: gs://ytbdata/1_crawl/channel_raw_info/channel_raw_info_20250614.json
