# Stage 1: Tìm hiểu và crawl data các video nổi bật, tiền xử lí


In [None]:
import requests
import json
import time
from datetime import datetime

In [None]:
import os
from dotenv import load_dotenv
from pathlib import Path

# Load environment variables from the `.env` file located one directory above
# the current working directory, then read the API key from the environment.
parent_env_path = Path.cwd().parent.joinpath(".env")
load_dotenv(dotenv_path=parent_env_path)
API_KEY = os.environ.get("API_KEY_1")

In [None]:
# URL of the YouTube Data API v3 `search` endpoint that crawl_youtube_videos
# sends its GET requests to.
SEARCH_URL = "https://www.googleapis.com/youtube/v3/search"

In [None]:
def crawl_youtube_videos(api_key, keywords, max_results=300, *, timeout=10):
    """Collect video snippets from the YouTube search endpoint.

    Each keyword is searched independently. Result pages are followed until
    ``max_results`` videos have been kept for that keyword, the response has
    no ``nextPageToken``, or a request fails. A failure ends the crawl of the
    current keyword only; everything collected so far is still returned.

    Args:
        api_key: API key sent as the ``key`` query parameter.
        keywords: Iterable of search phrases.
        max_results: Maximum number of videos kept per keyword.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list of dicts with the keys ``videoId``, ``title``, ``description``,
        ``channelId``, ``channelTitle``, ``publishedAt``, ``searchKeyword``
        and ``crawlDate``. No de-duplication is performed across keywords.
    """
    results = []
    # One timestamp for the whole run so every record carries the same value.
    today = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    for keyword in keywords:
        print(f"🔍 Crawling keyword: {keyword}")
        next_page_token = None
        total_collected = 0

        while total_collected < max_results:
            params = {
                "part": "snippet",
                "q": keyword,
                "type": "video",
                "order": "relevance",
                "maxResults": 50,
                "key": api_key,
            }
            if next_page_token:
                params["pageToken"] = next_page_token

            # A timeout keeps a stalled connection from hanging the crawl, and
            # transport errors are reported the same way as HTTP errors.
            try:
                response = requests.get(
                    SEARCH_URL, params=params, timeout=timeout
                )
            except requests.RequestException as exc:
                print("❌ Error:", exc)
                break
            if response.status_code != 200:
                print("❌ Error:", response.text)
                break

            try:
                data = response.json()
            except ValueError as exc:
                # Body was not valid JSON; stop this keyword but keep results.
                print("❌ Error:", exc)
                break

            # Only take as many items as are still needed, so the per-keyword
            # total never exceeds max_results even when it is not a multiple
            # of the page size.
            remaining = max_results - total_collected
            for item in data.get("items", [])[:remaining]:
                try:
                    snippet = item["snippet"]
                    video_data = {
                        "videoId": item["id"]["videoId"],
                        "title": snippet["title"],
                        "description": snippet["description"],
                        "channelId": snippet["channelId"],
                        "channelTitle": snippet["channelTitle"],
                        "publishedAt": snippet["publishedAt"],
                        "searchKeyword": keyword,
                        "crawlDate": today,
                    }
                except (KeyError, TypeError) as exc:
                    # Skip a malformed item instead of aborting the crawl and
                    # losing everything gathered so far.
                    print("⚠️ Skipping malformed item:", repr(exc))
                    continue
                results.append(video_data)
                total_collected += 1

            next_page_token = data.get("nextPageToken")
            if not next_page_token:
                break

            time.sleep(0.1)  # avoid hitting the API rate limit

    return results


In [None]:
# First keyword batch: broad AI search phrases.
keywords = [
    "AI tool", "Artificial Intelligence", "AI agent",
    "Generative AI", "AI Automation", "AI for", "Learn AI"
]

# Create the output folder before crawling; otherwise open() below raises
# FileNotFoundError only after the whole crawl has already run.
output_path = Path("../data/raw/video_search_result/ai_videos_snippets.json")
output_path.parent.mkdir(parents=True, exist_ok=True)

results = crawl_youtube_videos(API_KEY, keywords)
# Save the results
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

In [None]:
# Second keyword batch: more specific topics and product names.
results = []  # cleared here; rebound by the crawl call below
keywords = [
    "AI Algorithms", "AI in Business",
    "Prompt Engineering", "AI for Data Science", "AI for Project Management",
    "AI in Education", "AI Career", "AI Productivity",  "xAI",
    "AI and Big Data", "AI for Developers", "ChatGPT", "Cursor AI", "Claude AI", "Google Gemini", "No/low code AI",
]

# Create the output folder before crawling; otherwise open() below raises
# FileNotFoundError only after the whole crawl has already run.
output_path = Path("../data/raw/video_search_result/ai_videos_snippets_v2.json")
output_path.parent.mkdir(parents=True, exist_ok=True)

results = crawl_youtube_videos(API_KEY, keywords, max_results=200)

# Save the results
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

In [None]:
# Third keyword batch: tutorial and trend oriented phrases.
results = []  # cleared here; rebound by the crawl call below
keywords = [
    "AI tutorial", "Machine Learning AI", "Deep Learning AI",
    "AI coding", "Chatbox AI", "How to AI", "Latest AI", "AI application", "AI robot", "AI trends"
]

# Create the output folder before crawling; otherwise open() below raises
# FileNotFoundError only after the whole crawl has already run.
output_path = Path("../data/raw/video_search_result/ai_videos_snippets_v3.json")
output_path.parent.mkdir(parents=True, exist_ok=True)

results = crawl_youtube_videos(API_KEY, keywords, max_results=200)

# Save the results
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)