This script collects YouTube comments for all videos in a YouTube playlist, using the *Google API*. The code is based on the instructions provided in the following GitHub repository:

https://github.com/rodolflying/youtube_automation/blob/master/main.py

Changes were made by Monika Barget in October 2024 to adapt the code for the Data Science Research Infrastructure (DSRI) at Maastricht University and to make the code easier to use for students without programming experience.

All that students have to do now is run the grey fields (cells) of the Jupyter Notebook one after the other, from top to bottom, after saving their playlist URL in the playlist_url.txt file. A valid API key is provided in the api_key.txt file for testing. Students can replace it with their own API key at a later stage.

The first step is to install the Google API Python Client by running the grey section below. Please wait for the "done" message before continuing!

In [3]:
# install client

!pip install google-api-python-client python-dotenv pandas

print("Done!")

Done!


If the installation above was successful, please run the section below and wait for the "done" message!

In [4]:
# start client and import other relevant packages

from googleapiclient.discovery import build
import pandas as pd
from time import sleep
import traceback
import os
import re
from dotenv import load_dotenv

print("Done!")

Done!


Now make sure that you have entered a valid YouTube playlist URL in the playlist_url.txt file. Then run the whole section below and wait for output.
If an error occurs, let your tutor know. You should see the "All comments scraped!" message when the process has been successfully completed!

In [5]:
# collect comments for videos in your playlist

def get_api_key(file_path):
    """Read the API key stored in a plain-text file.

    The file is decoded as UTF-8 with an optional byte-order mark
    ("utf-8-sig"). Some editors (e.g. Windows Notepad) prepend a BOM when
    saving; ``str.strip()`` does not remove it, so without this the key
    would start with an invisible character.

    Args:
        file_path: Path of the text file that holds the key.

    Returns:
        The file content with surrounding whitespace/newlines removed.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        return file.read().strip()  # Drop surrounding whitespace/newlines

def get_playlist_id(playlist_url):
    """Extract the playlist ID (the ``list`` query parameter) from a URL.

    The ``list=`` parameter must start the string or follow a ``?``, ``&``
    or ``;`` separator, so that other parameters whose name merely ends in
    "list" (e.g. ``playlist=``) are not mistaken for it. The ID ends at the
    next ``&``, at a ``#`` fragment, or at whitespace, so that a URL
    fragment or extra lines in the input are not swallowed into the ID.

    Args:
        playlist_url: The playlist (or watch-with-playlist) URL as text.

    Returns:
        The playlist ID as a string.

    Raises:
        ValueError: If no ``list`` parameter is found in the URL.
    """
    playlist_id_match = re.search(r"(?:^|[?&;])list=([^&#\s]+)", playlist_url)
    if playlist_id_match is None:
        raise ValueError("Invalid playlist URL")
    return playlist_id_match.group(1)

def get_video_ids_from_playlist(api_key, playlist_id):
    """Collect the video IDs of every item in a playlist.

    Pages through ``playlistItems().list`` (50 items per request) until
    ``list_next`` yields no further request. If any request or response
    lookup fails, the error and traceback are printed and the IDs gathered
    so far are returned.

    Args:
        api_key: Developer key passed to the API client.
        playlist_id: ID of the playlist to read.

    Returns:
        A list of video ID strings, in the order the API returned them.
    """
    client = build('youtube', 'v3', developerKey=api_key)
    collected = []
    page_request = client.playlistItems().list(
        part="snippet",
        playlistId=playlist_id,
        maxResults=50  # Page size per request
    )

    try:
        while page_request:
            page = page_request.execute()
            # extend() with a generator appends item by item, so IDs read
            # before a failure are kept.
            collected.extend(
                entry['snippet']['resourceId']['videoId']
                for entry in page['items']
            )
            page_request = client.playlistItems().list_next(page_request, page)
    except Exception as e:
        print(f"Error occurred while fetching video IDs: {str(e)}")
        print(traceback.format_exc())

    return collected

def get_comments(api_key, video_id):
    """Download the comment threads of one video and save them as CSV.

    Each top-level comment becomes one row with the columns ``comment``,
    ``replies`` (list of the reply texts included in the API response),
    ``date`` and ``user_name``. The CSV file
    ``YouTube_comments/<video_id>_user_comments.csv`` is rewritten after
    every page, so progress is kept if a later request fails.

    On any error the message and traceback are printed, everything
    collected so far is saved to the same CSV file, and collection for this
    video stops.

    Args:
        api_key: Developer key passed to the API client.
        video_id: ID of the video whose comments are fetched.

    Returns:
        None. The result is written to disk.
    """
    youtube = build('youtube', 'v3', developerKey=api_key)

    # Create the output folder up front so that neither the regular save
    # nor the save in the error handler fails because it is missing.
    output_dir = "YouTube_comments"
    os.makedirs(output_dir, exist_ok=True)
    output_path = f"{output_dir}/{video_id}_user_comments.csv"

    columns = ['comment', 'replies', 'date', 'user_name']
    rows = []

    request = youtube.commentThreads().list(
        part="snippet,replies",
        videoId=video_id,
        maxResults=100,  # Largest page size the API accepts; fewer requests
        textFormat="plainText"
    )

    while request:
        try:
            response = request.execute()

            for item in response['items']:
                top_level = item['snippet']['topLevelComment']['snippet']

                # The 'replies' object can be absent from a thread, so it
                # is read defensively instead of trusting totalReplyCount.
                reply_items = item.get('replies', {}).get('comments', [])

                rows.append({
                    'comment': top_level['textDisplay'],
                    'replies': [reply['snippet']['textDisplay'] for reply in reply_items],
                    'date': top_level['publishedAt'],
                    'user_name': top_level['authorDisplayName'],
                })

            # Save everything collected so far
            pd.DataFrame(rows, columns=columns).to_csv(output_path, index=False, encoding='utf-8')
            print(f"New comments saved for video ID {video_id}")

            sleep(2)  # Short pause between API requests
            request = youtube.commentThreads().list_next(request, response)
            if request is not None:
                print("Please be patient! We are now collecting the next batch of comments...")

        except Exception as e:
            print(f"Error occurred: {str(e)}")
            print(traceback.format_exc())
            print("Sleeping for 10 seconds")
            sleep(10)
            # Keep partial results in the same place as complete ones
            pd.DataFrame(rows, columns=columns).to_csv(output_path, index=False, encoding='utf-8')
            break

def main():
    """Scrape the comments of every video in the configured playlist.

    Reads the API key from ``api_key.txt`` and the playlist URL from
    ``playlist_url.txt``, resolves the playlist to its video IDs, and
    calls ``get_comments`` once per video. Prints a completion message
    when the loop has finished.
    """
    key_path = "api_key.txt"  # text file holding the API key
    url_path = "playlist_url.txt"  # text file holding the playlist URL

    ## Load the API key first, then the playlist URL
    api_key = get_api_key(key_path)
    with open(url_path, 'r') as url_file:
        raw_url = url_file.read()
    playlist_url = raw_url.strip()

    ## Resolve the playlist to its videos and scrape each one in turn
    playlist_id = get_playlist_id(playlist_url)
    for video_id in get_video_ids_from_playlist(api_key, playlist_id):
        get_comments(api_key, video_id)

    print("All comments scraped!")

# Start the scraper when this code is executed directly (not when imported).
if __name__ == "__main__":
    main()


New comments saved for video ID P8ggfb67ODs
Please be patient! We are now collecting the next batch of comments...
New comments saved for video ID P8ggfb67ODs
Please be patient! We are now collecting the next batch of comments...
New comments saved for video ID P8ggfb67ODs
Please be patient! We are now collecting the next batch of comments...
New comments saved for video ID P8ggfb67ODs
Please be patient! We are now collecting the next batch of comments...
New comments saved for video ID P8ggfb67ODs
Please be patient! We are now collecting the next batch of comments...
New comments saved for video ID P8ggfb67ODs
Please be patient! We are now collecting the next batch of comments...
New comments saved for video ID P8ggfb67ODs
Please be patient! We are now collecting the next batch of comments...
New comments saved for video ID P8ggfb67ODs
Please be patient! We are now collecting the next batch of comments...
New comments saved for video ID P8ggfb67ODs
Please be patient! We are now ollecting the 