In [9]:
from googleapiclient.discovery import build
from langdetect import detect, DetectorFactory
import json
import pandas as pd
import html
from bs4 import BeautifulSoup
import unicodedata
import re

In [2]:
def LoadKeyYotube(env_path="../.env"):
    '''
    Load the API key of youtube v3 from a JSON credentials file.

    Parameters
    ----------

    env_path: str
        Path of the JSON file that stores the credentials. Defaults to
        "../.env", the location this notebook reads from.

    Return
    ------
        The value stored under ["keys"]["key_youtube"] in the file

    Raises
    ------
    FileNotFoundError
        If env_path does not exist.
    json.JSONDecodeError
        If the file content is not valid JSON.
    KeyError
        If the "keys" or "key_youtube" entries are missing.
    '''

    # Explicit encoding so parsing does not depend on the platform default.
    with open(env_path, "r", encoding="utf-8") as iJSON:
        key = json.load(iJSON)["keys"]["key_youtube"]
    return key

def GetVideoComments(video_id:str, client=None) -> "list[dict]":

    '''
    This function retrieves the comment, date, user and likes of every
    top-level comment of a youtube video. Replies are not collected.

    Parameters
    ----------

    video_id: str
        id of youtube video

    client: optional
        YouTube Data API client exposing commentThreads().list(...).execute().
        When omitted, the module-level `youtube` client is used, so existing
        calls that pass only the video id keep working.

    Return
    ------
        list(dict): List of dictionaries containing the comment, date, user and likes of the comment

    Examples
    --------

    >>> from googleapiclient.discovery import build
    >>> my_video = "dGiQaabX3_o"
    >>> youtube = build('youtube', 'v3', developerKey=api_key) # replace by your api key
    >>> comments = GetVideoComments(my_video)

    '''

    # Fall back to the notebook-level client only when none was passed in.
    service = youtube if client is None else client

    comments = []
    next_page_token = None

    iteration = 1
    while True:

        # Lightweight progress report, refreshed in place every 10 pages.
        if (iteration % 10) == 0:
            print(f"\tPages checked: {iteration}", end="\r")

        response = service.commentThreads().list(
            part='snippet',
            videoId=video_id,
            pageToken=next_page_token,
            # 100 is the largest page size the endpoint documents.
            maxResults=100
        ).execute()

        # A page without an 'items' key is treated as an empty page.
        for item in response.get('items', []):
            snippet = item['snippet']['topLevelComment']['snippet']

            comments.append({
                'comment': snippet['textDisplay'],
                'date': snippet['publishedAt'],
                'user': snippet['authorDisplayName'],
                'likes': snippet['likeCount']
            })

        next_page_token = response.get('nextPageToken')

        # No token in the response means the last page was just read.
        if not next_page_token:
            break

        iteration += 1

    print("\nAll done")
    return comments

def GetLanguage(text:str):

    '''
    Detect the language of a text, falling back to "undefined".

    Parameters
    ----------

    text: str
        Text handed to `detect`

    Return
    ------
        Whatever `detect(text)` returns, or the string "undefined" when
        `detect` raises an exception
    '''

    try:
        return detect(text)
    # Only ordinary errors are turned into "undefined". A bare `except:`
    # would also swallow KeyboardInterrupt/SystemExit, making a long
    # detection loop impossible to stop from the keyboard.
    except Exception:
        return "undefined"

In [3]:
# Read the YouTube credentials
api_key = LoadKeyYotube()

# Client object through which the API requests are issued
youtube = build('youtube', 'v3', developerKey=api_key)

# Titles of the videos to download comments from (used in progress messages)
video_names = [
    "What Happened Before History? Human Origins",
    "The Past We Can Never Return To – The Anthropocene Reviewed",
    "Why Blue Whales Don't Get Cancer - Peto's Paradox",
    "What If We Detonated All Nuclear Bombs at Once?",
    "We WILL Fix Climate Change!",
    "Building a Marsbase is a Horrible Idea: Let's do it!",
    "What if We Nuke a City?",
]

# Video ids, paired positionally with the titles above
videos_id = [
    "dGiQaabX3_o",
    "YbgnlkJPga4",
    "1AElONvi9WQ",
    "JyECrGp-Sw8",
    "LxgMdjyw8uw",
    "uqKGREZs6-w",
    "5iPH-br_eJQ",
]

# Fetch the comments video by video, keeping one list of comments per video
full_comment_metadata_videos = []
for name, ID in zip(video_names, videos_id):
    print(f"Retrieving from video: {name}")

    video_comments = GetVideoComments(ID)
    print(f"Comments retrieved: {len(video_comments)}")

    full_comment_metadata_videos.append(video_comments)

Retrieving from video: What Happened Before History? Human Origins
	Pages checked: 210
All done
Comments retrieved: 20935
Retrieving from video: The Past We Can Never Return To – The Anthropocene Reviewed
	Pages checked: 100
All done
Comments retrieved: 10631
Retrieving from video: Why Blue Whales Don't Get Cancer - Peto's Paradox
	Pages checked: 160
All done
Comments retrieved: 15971
Retrieving from video: What If We Detonated All Nuclear Bombs at Once?
	Pages checked: 390
All done
Comments retrieved: 39682
Retrieving from video: We WILL Fix Climate Change!
	Pages checked: 250
All done
Comments retrieved: 25130
Retrieving from video: Building a Marsbase is a Horrible Idea: Let's do it!
	Pages checked: 160
All done
Comments retrieved: 16042
Retrieving from video: What if We Nuke a City?
	Pages checked: 510
All done
Comments retrieved: 51118


In [4]:
# Lookup table video id -> video title.
# The ids go under "id_video" so the column means the same thing here as in
# comments_df (the two lists were previously assigned to the opposite columns).
video_id_df = pd.DataFrame({"id_video": videos_id, "title_video": video_names})

# Adding id_video where the comments came from.
# One frame per video is collected and concatenated once; calling pd.concat
# inside the loop re-copies every previously accumulated row on each pass.
frames_by_video = []
for comments_by_video, ID in zip(full_comment_metadata_videos, videos_id):
    frames_by_video.append(pd.DataFrame(comments_by_video).assign(id_video=ID))

# ignore_index=True yields the same fresh 0..n-1 index as reset_index(drop=True).
comments_df = pd.concat(frames_by_video, ignore_index=True)
comments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179509 entries, 0 to 179508
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   comment   179509 non-null  object
 1   date      179509 non-null  object
 2   user      179509 non-null  object
 3   likes     179509 non-null  int64 
 4   id_video  179509 non-null  object
dtypes: int64(1), object(4)
memory usage: 6.8+ MB


In [5]:
# Casting correctly date time.
# The timestamps are ISO-8601 strings ending in "Z" (UTC). They are parsed as
# UTC and the timezone is then dropped, so the column keeps the naive
# datetime64[ns] dtype used downstream. Unlike a `.str.replace` on the column,
# this also works when the cell is re-run on already converted values.
date_format = pd.to_datetime(comments_df["date"], utc=True).dt.tz_localize(None)
comments_df["date"] = date_format

comments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179509 entries, 0 to 179508
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype         
---  ------    --------------   -----         
 0   comment   179509 non-null  object        
 1   date      179509 non-null  datetime64[ns]
 2   user      179509 non-null  object        
 3   likes     179509 non-null  int64         
 4   id_video  179509 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 6.8+ MB


In [10]:
%%time

# Transforming HTML "codification" to utf-8.
# User names are not HTML-parsed below, so their entities are decoded here.
# Series.map applies html.unescape to each value explicitly instead of relying
# on the element-wise fallback of DataFrame.agg.
comments_df["user"] = comments_df["user"].map(html.unescape)

# Emoji code patterns
emoji_patterns = re.compile("["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        u"\U00002500-\U00002BEF"
        u"\U00002702-\U000027B0"
        # These two code points used to be written as one range
        # (24C2-1F251), which also matched nearly every BMP character above
        # U+24C2 (CJK ideographs, kana, Hangul, ...) and deleted that text.
        u"\U000024C2"
        u"\U0001F251"
        u"\U0001f926-\U0001f937"
        # Deliberately broad: every supplementary-plane character.
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"
        u"\u3030"
        # BMP emoji that only the removed wide range used to cover.
        u"\u303d"
        u"\u3297"
        u"\u3299"
                      "]+", re.UNICODE)

# Non printable characters
non_printable_patterns = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\r]')

# Clean html tags, emojis and non printable characters
clean_text = []
for text in comments_df["comment"]:

    # More expensive than regex, but BeautifulSoup is more reliable for html tags.
    # It runs on the text as stored (entities still escaped): it strips the
    # tags and decodes the entities itself. Unescaping first would turn
    # user-typed "&lt;...&gt;" into markup that is then dropped with the tags.
    clean_html_text = BeautifulSoup(text, "lxml").text

    # Emojis are removed after decoding so entity-encoded ones are caught too.
    clean_emojis_text = emoji_patterns.sub(r"", clean_html_text)

    clean_non_print = non_printable_patterns.sub(r" ", clean_emojis_text)
    clean_text.append(clean_non_print)

comments_df["comment"] = clean_text



CPU times: user 30.1 s, sys: 55.9 ms, total: 30.2 s
Wall time: 30.3 s


In [12]:
%%time

# Detecting language

# langdetect is non-deterministic unless its seed is fixed; pinning it makes
# repeated runs assign the same language to the same comment.
DetectorFactory.seed = 0

# langdetect doesn't work very well in short sentences. Threshold > 4 words
MIN_WORDS_FOR_DETECTION = 4

# Refresh the progress line only every N comments instead of on every one.
PROGRESS_EVERY = 1000

total_comments = len(comments_df)
comment_languages = []
for i, text in enumerate(comments_df["comment"], start=1):

    if i % PROGRESS_EVERY == 0 or i == total_comments:
        print(f"Processing comment number {i}", end="\r")

    # str.split() counts actual words: unlike re.split on whitespace it does
    # not produce empty tokens for leading/trailing whitespace, which would
    # let short comments slip past the threshold.
    if len(text.split()) > MIN_WORDS_FOR_DETECTION:
        language = GetLanguage(text)
    else:
        language = "undefined"

    comment_languages.append(language)

print("")

comments_df["language"] = comment_languages

Processing comment number 179508
CPU times: user 10min 46s, sys: 5.19 s, total: 10min 51s
Wall time: 10min 51s


In [58]:
# Inspect the column names currently present in comments_df.
comments_df.columns

Index(['comment', 'date', 'user', 'likes', 'id_video', 'language'], dtype='object')

In [63]:
# Length of each distinct video id stored in comments_df.
list(map(len, comments_df["id_video"].unique()))

[11, 11, 11, 11, 11, 11, 11]