In [None]:
import os  # For file and directory operations
import time  # For adding delays to avoid hitting API rate limits
import requests  # For downloading thumbnails
import pandas as pd  # For creating and saving dataframes
from googleapiclient.discovery import build  # For interacting with the YouTube Data API

# Initialize YouTube API
# The API key is a credential, so it is read from the YOUTUBE_API_KEY
# environment variable instead of being stored in this file.
# NOTE: a key that has ever been committed in source should be treated as
# leaked and rotated in the Google Cloud console.
api_key = os.environ.get("YOUTUBE_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the YOUTUBE_API_KEY environment variable to a YouTube Data API v3 key."
    )
youtube = build('youtube', 'v3', developerKey=api_key)

# 1. Fetch gaming video IDs with pagination (up to total_to_fetch of them)
video_ids = []
seen_ids = set()  # IDs already collected, so a repeated result is kept only once
next_page_token = None
total_to_fetch = 500

while len(video_ids) < total_to_fetch:
    remaining = total_to_fetch - len(video_ids)
    search_response = youtube.search().list(
        part="snippet",
        type="video",
        videoCategoryId="20",  # Gaming category ID
        maxResults=min(50, remaining),  # API max is 50 per request
        pageToken=next_page_token
    ).execute()

    page_items = search_response.get('items', [])
    if not page_items:
        # An empty page means there is nothing more to collect; stop instead
        # of spending further requests on it.
        break

    for item in page_items:
        found_id = item['id']['videoId']
        if found_id not in seen_ids:
            seen_ids.add(found_id)
            video_ids.append(found_id)

    # The while condition enforces the total; only the end of the result
    # pages needs an explicit exit here.
    next_page_token = search_response.get('nextPageToken')
    if not next_page_token:
        break

print(f"Retrieved {len(video_ids)} video IDs.")

# 2. Download thumbnails
# One JPEG per video ID is written into thumbnail_dir as "<video id>.jpg".
thumbnail_dir = r"C:\Users\ChanK\OneDrive - Tilburg University\Thesis 2024\YT Thumbnails"
os.makedirs(thumbnail_dir, exist_ok=True)

downloaded_count = 0
for vid in video_ids:
    thumbnail_url = f"https://img.youtube.com/vi/{vid}/maxresdefault.jpg"
    try:
        # The timeout keeps a single stalled connection from hanging the run.
        response = requests.get(thumbnail_url, timeout=30)
    except requests.RequestException as e:
        print(f"Error downloading thumbnail for {vid}: {str(e)}")
        continue

    if response.status_code != 200:
        # Any non-200 answer is treated as "no thumbnail at this URL".
        print(f"Thumbnail not available for video ID: {vid}")
        continue

    try:
        with open(os.path.join(thumbnail_dir, f"{vid}.jpg"), "wb") as f:
            f.write(response.content)
    except OSError as e:
        print(f"Error saving thumbnail for {vid}: {str(e)}")
        continue

    downloaded_count += 1

# Report the real outcome rather than claiming success unconditionally.
print(f"Thumbnails downloaded: {downloaded_count} of {len(video_ids)}.")

# 3. Fetch metadata for each video
# videos().list takes a comma-separated list of IDs, so the IDs are requested
# in batches of 50 instead of one request per video. This needs far fewer
# requests for the same data.
metadata_batch_size = 50
metadata_by_id = {}

for batch_start in range(0, len(video_ids), metadata_batch_size):
    id_batch = video_ids[batch_start:batch_start + metadata_batch_size]
    try:
        response = youtube.videos().list(
            part="snippet,contentDetails,statistics,status",
            id=",".join(id_batch)
        ).execute()
    except Exception as e:
        # Best effort: report the failed batch and carry on with the rest.
        print(f"Error fetching metadata for video IDs {id_batch[0]}..{id_batch[-1]}: {str(e)}")
        continue

    for item in response.get('items', []):
        try:
            metadata_by_id[item['id']] = {
                "video_id": item['id'],
                "title": item['snippet']['title'],
                "description": item['snippet']['description'],
                "published_at": item['snippet']['publishedAt'],
                "duration": item['contentDetails']['duration'],
                "definition": item['contentDetails']['definition'],
                "dimension": item['contentDetails']['dimension'],
                "caption": item['contentDetails']['caption'],
                "licensedContent": item['contentDetails']['licensedContent'],
                # Counters absent from the response are recorded as 0.
                "view_count": int(item['statistics'].get('viewCount', 0)),
                "like_count": int(item['statistics'].get('likeCount', 0)),
                "comment_count": int(item['statistics'].get('commentCount', 0)),
                "privacy_status": item['status']['privacyStatus']
            }
        except (KeyError, TypeError, ValueError) as e:
            # A malformed item only costs that one video, not the whole batch.
            print(f"Error fetching metadata for video ID {item.get('id')}: {str(e)}")

# Emit rows in the order of video_ids, skipping IDs the API returned nothing for.
metadata = [metadata_by_id[vid] for vid in video_ids if vid in metadata_by_id]

metadata_df = pd.DataFrame(metadata)
print(f"Metadata fetched for {len(metadata)} of {len(video_ids)} videos.")

# 4. Fetch comments for each video with pagination
# Every top-level comment becomes one row; the replies returned alongside it
# are stored as a list of dicts in the "replies" field of that row.
comments = []
for vid in video_ids:
    page_token = None
    has_more_pages = True

    while has_more_pages:
        try:
            page = youtube.commentThreads().list(
                part="snippet,replies",
                videoId=vid,
                maxResults=100,  # Maximum allowed per request
                pageToken=page_token,
                textFormat="plainText"
            ).execute()

            for thread in page['items']:
                top = thread['snippet']['topLevelComment']['snippet']

                # Collect the replies attached to this thread, if any
                reply_rows = []
                for reply in thread.get('replies', {}).get('comments', []):
                    reply_rows.append({
                        "text": reply['snippet']['textDisplay'],
                        "author": reply['snippet']['authorDisplayName'],
                        "timestamp": reply['snippet']['publishedAt'],
                        "likes": reply['snippet']['likeCount']
                    })

                comments.append({
                    "video_id": vid,
                    "comment": top['textDisplay'],
                    "author": top['authorDisplayName'],
                    "timestamp": top['publishedAt'],
                    "likes": top['likeCount'],
                    "replies": reply_rows
                })

            page_token = page.get('nextPageToken')
            has_more_pages = bool(page_token)
            if has_more_pages:
                time.sleep(1)  # Pause between pages to avoid hitting quota limits

        except Exception as e:
            # Any failure ends pagination for this video; rows gathered so far are kept.
            print(f"Error fetching comments for video ID {vid}: {str(e)}")
            has_more_pages = False

comments_df = pd.DataFrame(comments)
print("Comments fetched successfully.")

# Save metadata and comments to CSV files (optional)
# Each dataframe is written without its index column, metadata first.
for frame, csv_path in (
    (metadata_df, r"C:\Users\ChanK\OneDrive - Tilburg University\Thesis 2024\metadata.csv"),
    (comments_df, r"C:\Users\ChanK\OneDrive - Tilburg University\Thesis 2024\comments.csv"),
):
    frame.to_csv(csv_path, index=False)

print("Data saved to CSV files.")
