# IMT 547 Project: Keyword-Based Data Collection

Chesie Yu

2/9/2024

In [1]:
# The YouTube API key
# API_KEY = "AIzaSyBkcQW2HfLQJ63Y5imoGmvUlPyTuCNt9_M"
# API_KEY = "AIzaSyBIyx2JwjI8UWEsjONqSg7mdL3xP--rx78"
API_KEY = "AIzaSyAZoK_8LGGGeTh21WBqDxa94zUztIPGwQM"

In [2]:
# Install libraries
!pip install --upgrade google-api-python-client --quiet

In [3]:
# Import libraries
import json
import random
import time
import datetime
import pandas as pd
import googleapiclient
from googleapiclient import discovery, errors

In [4]:
# Initialize the YouTube API
youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=API_KEY)

### Keyword-Based Video Collection

In [5]:
def get_video_ids(max_videos=2000, days=None, keywords=None):
    """
    Fetch video IDs from the past X days based on specific keywords.
    """
    # Empty list to store video ids
    video_ids = []
    page_token = None
    published_after = None
    
    # Calculate the date X days ago from today
    if days:
        published_after = (datetime.datetime.now() - datetime.timedelta(days=days))\
                           .isoformat("T") + "Z"
    
    # Loop until we collect enough videos
    while len(video_ids) < max_videos:  
        # Call the API to extract videos based on keywords
        # Documentation: https://developers.google.com/youtube/v3/docs/search/list
        request = youtube.search().list(
            part="id,snippet",
            pageToken=page_token,
            maxResults=min(50, max_videos - len(video_ids)),
            type="video",
            order="relevance", 
            publishedAfter=published_after, 
            q="|".join(keywords) if keywords else None,
            videoCategoryId=20  # Gaming 
        )
        res = request.execute()
        
        # Store the video ids
        for item in res["items"]:
            video_id = item["id"]["videoId"]
            video_ids.append(video_id)
        
        # Set the token
        page_token = res.get("nextPageToken")

        # Exit the loop if no token is found
        if not page_token: 
            break

    return video_ids

### Random Selection

In [6]:
def select_random_video_ids(video_ids, seed=None, n=300):
    """
    Randomly select n videos from a list of YouTube video ids. 
    """
    random.seed(seed)
    if len(video_ids) < n:
        raise ValueError(f"Insufficient videos: Requested {n}, but only {len(video_ids)} available.")     
    return random.sample(video_ids, n)

### Video & Comment Info Collection

In [7]:
def get_video_info(video_ids):
    """
    Fetch video info from a list of YouTube video ids.
    """
    # Empty list to store video info
    video_info = []

    # For each video id
    for vid in video_ids:
        # Call the API to extract video info from ids
        # Documentation: https://developers.google.com/youtube/v3/docs/videos#resource
        request = youtube.videos().list(
            part="snippet, statistics",
            id=vid
        )
        res = request.execute()
        
        for v in res["items"]:
            # Extract relevant video info
            video_info.append({
                "channel_id": v["snippet"]["channelId"],
                "channel_name": v["snippet"]["channelTitle"],
                "video_id": v["id"],
                "video_title": v["snippet"]["title"],
                "video_creation_time": v["snippet"]["publishedAt"],
                "video_description": v["snippet"]["description"],
                "video_tags": v["snippet"].get("tags", []), 
                "video_viewcount": v["statistics"].get("viewCount", "0"),
                "video_likecount": v["statistics"].get("likeCount", "0"),
                "video_commentcount": v["statistics"].get("commentCount", "0"),
            })
    
    return video_info

In [8]:
def get_all_comments(video_ids):
    """
    Fetch all comments for a list of videos.
    """
    # Empty list to store the comments
    comment_info = []
    
    # Set to store unique video ids
    unique_video_ids = set()

    # Loop through the video ids
    for vid in video_ids:
        page_token = None 
        while True:
            try:
                # Call the API to extract comments for videos
                # Documentation: https://developers.google.com/youtube/v3/docs/commentThreads/list
                request = youtube.commentThreads().list(
                    videoId=vid,
                    part="id, snippet, replies",
                    textFormat="plainText",
                    maxResults=100,
                    pageToken=page_token
                )
                res = request.execute()

                # Extract relevant comment info
                for c in res["items"]:
                    video_id = c["snippet"]["videoId"]
                    comment_info.append({
                        "video_id": video_id,
                        "comment_id": c["snippet"]["topLevelComment"]["id"],
                        "comment_author_id": c["snippet"]["topLevelComment"]["snippet"]["authorChannelId"]["value"],
                        "comment_text": c["snippet"]["topLevelComment"]["snippet"]["textOriginal"],
                        "comment_time": c["snippet"]["topLevelComment"]["snippet"]["updatedAt"],
                        "comment_likecount": c["snippet"]["topLevelComment"]["snippet"]["likeCount"],
                        "comment_replycount": c["snippet"]["totalReplyCount"]
                    })
                    unique_video_ids.add(video_id) 

                # Set the token
                page_token = res.get("nextPageToken")

                # Exit the loop if no token is found
                if not page_token: 
                    break

            # Error handling for commentsDisabled
            except errors.HttpError as e:
                if e.resp.status == 403 and "commentsDisabled" in str(e):
                    print(f"Comments are disabled for video {vid}.")
                else:
                    print(f"An error occurred for video {vid}: {e}")
                break 

    # Print summary
    print(f"{len(comment_info)} comments extracted for {len(unique_video_ids)} videos.")
    print("====================================\n")
    
    return comment_info

### Main Function

In [9]:
# def get_youtube_data(max_videos=2000, days=None, keywords=None,
#                      seed=None, n=300):
#     """
#     Fetch videos and comments from past X days based on keywords.
#     """
#     # Start timing
#     start_time = time.time()
    
#     # Get video IDs
#     video_ids = get_video_ids(max_videos, days, keywords)
#     selected_video_ids = select_random_video_ids(video_ids, seed, n)
    
#     # Get video info
#     video_info = get_video_info(selected_video_ids)
    
#     # Get comment info
#     comment_info = get_all_comments(video_ids)

#     # Convert to DataFrames
#     video_info_df = pd.DataFrame(video_info)
#     comment_info_df = pd.DataFrame(comment_info)

#     # Merge video information with comments
#     yt_comments = pd.merge(video_info_df, comment_info_df, on="video_id", how="inner")

#     # End timing 
#     end_time = time.time()
#     print(f"Runtime: {end_time - start_time:.4f}\n")
    
#     return yt_comments

In [10]:
def get_youtube_videos(max_videos=2000, days=None, keywords=None,
                       seed=None, n=300):
    """
    Fetch N videos from past X days based on keywords.
    """
    # Start timing
    start_time = time.time()
    
    # Get video IDs
    video_ids = get_video_ids(max_videos, days, keywords)
    selected_video_ids = select_random_video_ids(video_ids, seed, n)
    
    # Get video info
    video_info = get_video_info(video_ids)
    
    # Convert to DataFrames
    yt_videos = pd.DataFrame(video_info)

    # End timing 
    print(f"Runtime: {time.time() - start_time:.4f} seconds")
    
    return yt_videos

In [11]:
def get_youtube_comments(yt_videos):
    """
    Fetch comments for videos.
    """
    # Start timing
    start_time = time.time()

    # Get comment info
    video_ids = yt_videos["video_id"].unique().tolist()
    comment_info = get_all_comments(video_ids)

    # Convert to DataFrames
    comment_info_df = pd.DataFrame(comment_info)

    # Merge video information with comments
    yt_comments = pd.merge(video_info_df, comment_info_df, on="video_id", how="inner")

    # End timing 
    print(f"Runtime: {time.time() - start_time:.4f} seconds")
    
    return yt_comments

In [12]:
# Specify the parameters
days = None
max_videos = 200
seed = 547
n = 200
action_keywords = ["call of duty montage", 
                   "gta 5 montage", 
                   "brawl stars montage", 
                   "elden ring montage"]
non_action_keywords = ["minecraft montage", 
                       "pokemon go montage", 
                       "just dance montage", 
                       "it takes two montage"]

In [13]:
# Get YouTube action videos and comments
action_videos = get_youtube_videos(max_videos=max_videos, 
                                   days=days, 
                                   keywords=action_keywords,
                                   seed=seed, 
                                   n=n)
action_videos["genre"] = "action"
action_videos.head()

Runtime: 15.6023 seconds


Unnamed: 0,channel_id,channel_name,video_id,video_title,video_creation_time,video_description,video_tags,video_viewcount,video_likecount,video_commentcount,genre
0,UCmq1iHmomVRCf2APufQZkCQ,Aerith,hWCjLG8t8FM,"""No Lie"" a CODM Montage",2022-03-11T14:00:30Z,"GFUEL Code ""AERITH""!! gfuel.ly/3xlTbwG \n\nHel...","[callofduty, callofdutymobile, cod, codm, game...",319360,11681,399,action
1,UCku-D8Wh6aaFdj4xMvjaE3w,Kaiser,KHNq10Uykv4,FaZe asked me to make this montage...,2022-03-18T17:53:18Z,#FaZe1 #FaZeKaiser #FaZe Last FaZe1 submission...,"[call of duty, call of duty montage, cod monta...",141952,8996,564,action
2,UC9YydG57epLqxA9cTzZXSeQ,Call of Duty,DOaUcJIxX9k,MWII 'Squad Up' | Call of Duty: Modern Warfare II,2022-10-09T16:00:16Z,☝️ SQUAD UP!\n💥 RUN IT BACK! \n\nGet ready to ...,"[call of duty, cod, activision, modern warfare...",7469952,65174,7127,action
3,UC_w_KsdOKQ27kWAmWGh4wkw,Shergill edit's,bxcRAaYmaYI,Beggin' 🙏🔥 ( Call of Duty Montage),2023-05-26T08:47:25Z,BEGGIN | COD Warzone Montage ;)\n\nI hope you ...,[],87,14,2,action
4,UCI72aVZJsL-zfy9Icqb_1dw,radius,ihog1Z5PJ9s,Lemonade 🍋 (Fortnite Montage),2020-10-16T11:16:00Z,Lemonade 🍋 (Fortnite Montage)\n\nuse code ''da...,"[sync, syncyfishy, syncfn, fortnite montage, l...",10839040,355770,11223,action


In [16]:
# Save to CSV
action_videos.to_csv("data/action_videos.csv", index=False)

In [15]:
# Get YouTube non-action videos and comments
non_action_videos = get_youtube_videos(max_videos=max_videos, 
                                       days=days, 
                                       keywords=non_action_keywords,
                                       seed=seed, 
                                       n=n)
non_action_videos["genre"] = "action"
non_action_videos.head()

Runtime: 15.2768 seconds


Unnamed: 0,channel_id,channel_name,video_id,video_title,video_creation_time,video_description,video_tags,video_viewcount,video_likecount,video_commentcount,genre
0,UCdqwFH32SM74H-99k7lj-TQ,Nexqn,Ozm4Zyz9aS4,Dance | Lt3 Crystal PvP Montage,2023-01-14T20:00:11Z,tysm for over 2000 views on the last montage i...,[],84212,1519,655,action
1,UCahy5poGxgeT8kt70aF-SQw,Ilegator,JvKubWOBPz4,JONNYSURVIVES MCSG MONTAGE - D.A.N.C.E. [Reupl...,2020-11-17T12:38:38Z,UPDATE: He republished the montage: https://ww...,[],910,34,23,action
2,UCO-mjzAjXpwB4vgzVxokiIg,Tayler,O70xnwL_xV4,DANCE 🕺 (Minecraft Montage),2022-03-20T10:36:43Z,Thanks for watching!\n\nJoin My Discord server...,[],212,10,12,action
3,UCl_c4MPHAjdFE_z1uL7wY0Q,Th4t,kHFHICAf250,Dance! // Hive and Zeqa Montage,2022-11-01T05:43:55Z,🔊 Dance! - A Minecraft Hive/Zeqa montage \nSub...,"[#zeqa, #minecraft, #hive]",3743,112,71,action
4,UCgkYEn8IJ5eSmotlFneNWJw,HapLegit,HqkhdwFVT8g,Way Down We Go - PvP Edit | Minecraft PvP Mont...,2023-12-10T13:17:08Z,Way Down We Go - PvP Edit | Minecraft PvP Mont...,"[minecraft pvp, pvp texture pack, crystal pvp,...",812,58,40,action


In [17]:
# Save to CSV
non_action_videos.to_csv("data/non_action_videos.csv", index=False)