In [1]:
import getpass
import html
import json
import re
import warnings
from urllib.parse import quote

from bs4 import BeautifulSoup
from googleapiclient.discovery import build
from langdetect import detect, DetectorFactory
import pandas as pd

import sqlalchemy
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy.exc import SQLAlchemyError

In [2]:
def LoadKeyYotube():
    """Return the YouTube API key stored in ``../.env``.

    The file is parsed as JSON and the value found under
    ``["keys"]["key_youtube"]`` is returned.
    """
    with open("../.env", "r") as env_file:
        settings = json.load(env_file)

    return settings["keys"]["key_youtube"]

def GetVideoComments(video_id:str) -> "list(dict)":

    '''
    Retrieve the text, date, author and likes of every top-level comment of a
    youtube video.

    The function pages through ``commentThreads().list`` of the module-level
    ``youtube`` client until a response carries no ``nextPageToken``.

    Parameters
    ----------

    video_id: str
        id of youtube video

    Return
    ------
        list(dict): List of dictionaries, one per top-level comment, with the
        keys ``comment``, ``published_date``, ``name`` and ``likes``

    Examples
    --------

    >>> from googleapiclient.discovery import build
    >>> my_video = "dGiQaabX3_o"
    >>> youtube = build('youtube', 'v3', developerKey=api_key) # replace by your api key
    >>> comments = GetVideoComments(my_video)

    '''

    # commentThreads.list documents maxResults as 1..100, so request the
    # largest valid page instead of an out-of-range value.
    page_size = 100

    comments = []
    next_page_token = None

    iteration = 1
    while True:

        # Light progress report: one line refreshed every ten pages
        if (iteration % 10) == 0:
            print(f"\tPages checked: {iteration}", end="\r")

        response = youtube.commentThreads().list(
            part='snippet',
            videoId=video_id,
            pageToken=next_page_token,
            maxResults=page_size
        ).execute()

        # A page without an 'items' key is treated as an empty page
        for item in response.get('items', []):
            snippet = item['snippet']['topLevelComment']['snippet']

            comments.append({
                'comment': snippet['textDisplay'],
                'published_date': snippet['publishedAt'],
                'name': snippet['authorDisplayName'],
                'likes': snippet['likeCount']
            })

        next_page_token = response.get('nextPageToken')

        # No token means the last page has been consumed
        if not next_page_token:
            break

        iteration += 1

    print("\nAll done")
    return comments

def GetLanguage(text:str):
    """Return the language code detected for ``text``.

    Parameters
    ----------
    text: str
        text handed to ``detect``

    Return
    ------
        Whatever ``detect`` returns, or the string "undefined" when the
        detection raises an error.
    """

    try:
        return detect(text)
    except Exception:
        # Only ordinary errors are absorbed; KeyboardInterrupt and SystemExit
        # still propagate so a long detection loop can be interrupted.
        return "undefined"

# Defining tables found in database
    
# One declarative base shared by every model, so that the four tables are
# registered in the same metadata collection instead of four unrelated ones.
Base = declarative_base()


class Languages(Base):
    """ORM mapping of the ``languages`` table (one row per language code)."""
    __tablename__ = 'languages'

    # Here Integer represents SERIAL in id column
    id_language = Column(Integer, primary_key=True, nullable=False, unique=True)
    code = Column(String(9), nullable=False, unique=True)


class Titles(Base):
    """ORM mapping of the ``titles`` table (video id and its title)."""
    __tablename__ = 'titles'

    id_video = Column(String(12), primary_key=True, nullable=False, unique=True)
    title = Column(String, nullable=False)


class Users(Base):
    """ORM mapping of the ``users`` table (one row per stored user name)."""
    __tablename__ = 'users'

    # Here Integer represents SERIAL in id column
    id_user = Column(Integer, primary_key=True, nullable=False, unique=True)
    name = Column(String, nullable=False)


class Comments(Base):
    """ORM mapping of the ``comments`` table.

    ``id_user``, ``id_video`` and ``id_language`` hold the keys of the rows in
    ``users``, ``titles`` and ``languages`` that the comment refers to.
    """
    __tablename__ = 'comments'

    # Here Integer represents SERIAL in id column
    id_comment = Column(Integer, primary_key=True, nullable=False, unique=True)
    comment = Column(Text, nullable=True)
    published_date = Column(DateTime, nullable=True)
    likes = Column(Integer, nullable=True)
    id_user = Column(Integer, nullable=False)
    id_video = Column(String, nullable=False)
    id_language = Column(Integer, nullable=False)

class DataValidator:
    """Dry-run the rows of a DataFrame against an ORM model.

    Every row is turned into an ``orm_model`` instance and flushed to the
    database inside a transaction that is always rolled back, so nothing is
    persisted; only the rows the database rejects are reported.
    """

    # Accepted values of the ``errors`` argument of ValidateData
    _ERROR_MODES = ("stop", "ignore")

    def __init__(self, engine, orm_model):
        self.engine = engine
        self.Session = sessionmaker(bind=self.engine, autoflush=False)
        self.orm_model = orm_model

    def ValidateData(self, dataframe:pd.DataFrame, errors:str="stop"):
        """Flush every row of ``dataframe`` and report the ones that fail.

        Parameters
        ----------
        dataframe: pd.DataFrame
            rows to validate; column names must match the model attributes
        errors: str
            "stop" raises on the first rejected row, "ignore" collects the
            index of every rejected row and emits a warning for each one

        Return
        ------
            list: index labels of the rejected rows (empty when all pass)

        Raises
        ------
        ValueError
            if ``errors`` is not "stop" or "ignore"
        Exception
            if a row is rejected and ``errors`` is "stop"
        """

        # Reject an unknown mode before touching the database rather than
        # only after the first failing row.
        if errors not in self._ERROR_MODES:
            raise ValueError(f"Argument 'errors' not valid: {errors}. Expected ['stop', 'ignore']")

        session = self.Session()
        bad_rows = []

        try:
            for row in dataframe.itertuples():

                record = row._asdict()
                index = record.pop("Index")

                try:
                    # A savepoint per row: a rejected row is undone on its own
                    # and the session stays usable for the remaining rows.
                    with session.begin_nested():
                        session.add(self.orm_model(**record))
                        session.flush()

                except SQLAlchemyError as ErrorDataType:

                    if errors == "stop":
                        raise Exception(f"Data validation failed: {ErrorDataType}") from ErrorDataType

                    bad_rows.append(index)
                    warnings.warn(f"Data validation failed: {ErrorDataType}")
        finally:
            # Validation never persists anything, and the session is released
            # whichever way the loop ends.
            session.rollback()
            session.close()

        print("Done")

        return bad_rows
    
class InsertData:
    """Persist the rows of a DataFrame as instances of an ORM model."""

    def __init__(self, engine, orm_model):
        self.engine = engine
        self.Session = sessionmaker(bind=self.engine, autoflush=False)
        self.orm_model = orm_model

    def Insert(self, dataframe:pd.DataFrame):
        """Insert every row of ``dataframe`` in a single transaction.

        Parameters
        ----------
        dataframe: pd.DataFrame
            rows to insert; column names must match the model attributes

        Raises
        ------
        Any exception raised while building, adding or committing the rows.
        The transaction is rolled back before the exception propagates.
        """

        session = self.Session()

        try:
            session.add_all(
                self.orm_model(**row._asdict())
                for row in dataframe.itertuples(index=False)
            )
            session.commit()
        except Exception:
            # Leave nothing half-written behind, then let the caller see the error
            session.rollback()
            raise
        finally:
            # The session is released on success and on failure alike
            session.close()
    

In [3]:
# Read the API key from the local configuration file
api_key = LoadKeyYotube()

# Client used by GetVideoComments to query the YouTube Data API v3
youtube = build('youtube', 'v3', developerKey=api_key)

# Titles of the videos to harvest; paired by position with videos_id below
video_names = [
    "What Happened Before History? Human Origins",
    "The Past We Can Never Return To – The Anthropocene Reviewed",
    "Why Blue Whales Don't Get Cancer - Peto's Paradox",
    "What If We Detonated All Nuclear Bombs at Once?",
    "We WILL Fix Climate Change!",
    "Building a Marsbase is a Horrible Idea: Let's do it!",
    "What if We Nuke a City?",
]

videos_id = [
    "dGiQaabX3_o",
    "YbgnlkJPga4",
    "1AElONvi9WQ",
    "JyECrGp-Sw8",
    "LxgMdjyw8uw",
    "uqKGREZs6-w",
    "5iPH-br_eJQ",
]

# Download the comments video by video, keeping one list of comments per video
full_comment_metadata_videos = []
for name, ID in zip(video_names, videos_id):

    print(f"Retrieving from video: {name}")

    video_comments = GetVideoComments(ID)
    full_comment_metadata_videos.append(video_comments)

    print(f"Comments retrieved: {len(video_comments)}")

Retrieving from video: What Happened Before History? Human Origins
	Pages checked: 210
All done
Comments retrieved: 20936
Retrieving from video: The Past We Can Never Return To – The Anthropocene Reviewed
	Pages checked: 100
All done
Comments retrieved: 10631
Retrieving from video: Why Blue Whales Don't Get Cancer - Peto's Paradox
	Pages checked: 160
All done
Comments retrieved: 15974
Retrieving from video: What If We Detonated All Nuclear Bombs at Once?
	Pages checked: 390
All done
Comments retrieved: 39683
Retrieving from video: We WILL Fix Climate Change!
	Pages checked: 250
All done
Comments retrieved: 25138
Retrieving from video: Building a Marsbase is a Horrible Idea: Let's do it!
	Pages checked: 160
All done
Comments retrieved: 16043
Retrieving from video: What if We Nuke a City?
	Pages checked: 510
All done
Comments retrieved: 51121


In [4]:
# Lookup table of the harvested videos: the ids go under "id_video" and the
# titles under "title".
video_id_df = pd.DataFrame({"id_video": videos_id, "title": video_names})

# One frame per video, tagged with the id of the video the comments came from
frames_by_video = [
    pd.DataFrame(comments_by_video).assign(id_video=ID)
    for comments_by_video, ID in zip(full_comment_metadata_videos, videos_id)
]

# Concatenate once at the end: growing a frame inside the loop copies all the
# rows gathered so far on every iteration.
if frames_by_video:
    comments_df = pd.concat(frames_by_video, ignore_index=True)
else:
    comments_df = pd.DataFrame()

comments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179526 entries, 0 to 179525
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   comment         179526 non-null  object
 1   published_date  179526 non-null  object
 2   name            179526 non-null  object
 3   likes           179526 non-null  int64 
 4   id_video        179526 non-null  object
dtypes: int64(1), object(4)
memory usage: 6.8+ MB


In [5]:
# Casting correctly date time.
# The raw values are ISO-8601 strings carrying a 'T' separator and a 'Z' (UTC)
# suffix.  pandas parses them directly; the UTC marker is then dropped so the
# column is a naive datetime64 column.  Re-running the cell on the already
# converted column yields the same result.
comments_df["published_date"] = (
    pd.to_datetime(comments_df["published_date"], utc=True)
    .dt.tz_localize(None)
)

comments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179526 entries, 0 to 179525
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   comment         179526 non-null  object        
 1   published_date  179526 non-null  datetime64[ns]
 2   name            179526 non-null  object        
 3   likes           179526 non-null  int64         
 4   id_video        179526 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 6.8+ MB


In [6]:
%%time

# Transforming HTML "codification" to utf-8
map_columns_to_function = {"comment":html.unescape, "name":html.unescape}
comments_df[["comment","name"]] = comments_df[["comment","name"]].agg(map_columns_to_function)

# Emoji code patterns
emoji_patterns = re.compile("["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        u"\U00002500-\U00002BEF"
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"
        u"\u3030"
                      "]+", re.UNICODE)

# Non printable characters
non_printable_patterns = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\r]')

# Clean emojis, html tags and detect language
clean_text = []
for text in comments_df["comment"]:
    clean_emojis_text = emoji_patterns.sub(r"", text)
    
    # More expensive than ReGex but more useful using BeatifulSoup for html tags
    clean_html_text = BeautifulSoup(clean_emojis_text, "lxml").text
    clean_non_print = non_printable_patterns.sub(r" ", clean_html_text)
    clean_text.append(clean_non_print)
    
comments_df["comment"] = clean_text



CPU times: user 34.3 s, sys: 126 ms, total: 34.4 s
Wall time: 34.4 s


In [7]:
%%time

# Detecting language
comment_languages = []
for i, text in enumerate(comments_df["comment"], start=1):
    
    print(f"Processing comment number {i}", end="\r")
    
    # langdetect doesn't work very well in short sentences. Threshold > 4
    if len(re.split("\s+", text)) > 4:
        language = GetLanguage(text)
    else:
        language = "undefined"
        
    comment_languages.append(language)

print("")

comments_df["code"] = comment_languages

Processing comment number 179526
CPU times: user 11min 26s, sys: 7.29 s, total: 11min 34s
Wall time: 11min 32s


In [8]:
# Connection settings are typed in at run time so no credential is stored in
# the notebook; the password is read without being echoed.
username = input("User postgres:").strip()
host = input("Host:").strip()
database = input("Database:").strip()
password = getpass.getpass("Password:")

# Percent-encode the credentials: characters such as '@', ':' or '/' typed in
# a user name or password would otherwise be read as URL delimiters.
engine = create_engine(
    f'postgresql://{quote(username, safe="")}:{quote(password, safe="")}@{host}/{database}'
)

User postgres: postgres
Host: localhost
Database: youtube
Password: ········


In [9]:
# Distinct language codes found in the comments, in order of first appearance
df_languages = (
    comments_df["code"]
    .drop_duplicates()
    .reset_index(drop=True)
    .to_frame(name="code")
)

# Video ids paired with their titles
df_titles = pd.DataFrame(dict(id_video=videos_id, title=video_names))

# Distinct user names found in the comments, in order of first appearance
df_users = (
    comments_df["name"]
    .drop_duplicates()
    .reset_index(drop=True)
    .to_frame(name="name")
)

In [10]:
%%time

tables_isolated = [("languages", Languages, df_languages),
          ("titles", Titles, df_titles),
          ("users", Users, df_users)]

commit = True
for table_name, orm_model, df in tables_isolated:
    
    print(f"Validating registers in table: {table_name}")
    validator = DataValidator(engine, orm_model)
    bad_rows = validator.ValidateData(df, errors="ignore")
    
    if commit:
        df = df.copy().drop(bad_rows, axis=0)
        if df.shape[0] > 0:
            print(f"Commiting registers in table: {table_name}")
            insert = InsertData(engine, orm_model)
            insert.Insert(df)
            print(f"Done")
        else:
            print(f"Nothing to insert in table: {table_name}")

Validating registers in table: languages
Done
Commiting registers in table: languages
Done
Validating registers in table: titles
Done
Commiting registers in table: titles
Done
Validating registers in table: users
Done
Commiting registers in table: users
Done
CPU times: user 1min 45s, sys: 7.95 s, total: 1min 53s
Wall time: 2min 39s


In [11]:
# Read back the stored languages and users, including the ids assigned by the
# database, languages first and users second.
registers_languages, registers_users = (
    pd.read_sql(query, engine)
    for query in ("SELECT * FROM languages", "SELECT * FROM users")
)

In [12]:
# Mapping the language and the user of each comment to their respective ids.
# validate="many_to_one" makes pandas raise a MergeError when a code or a name
# appears more than once in the lookup tables, instead of silently duplicating
# the comments that match it.
comments_df_mapped = (
    comments_df
    .merge(registers_languages, how="left", on="code", validate="many_to_one")
    .drop(columns=["code"])
    .merge(registers_users, how="left", on="name", validate="many_to_one")
    .drop(columns=["name"])
)

In [13]:
# Tables whose rows carry ids of other tables: (table name, model, rows)
tables_no_isolated = [
    ("comments", Comments, comments_df_mapped),
]

# Set to False to validate only, without writing anything
commit = True
for table_name, orm_model, df in tables_no_isolated:

    print(f"Validating registers in table: {table_name}")
    bad_rows = DataValidator(engine, orm_model).ValidateData(df, errors="ignore")

    if not commit:
        continue

    # Keep only the rows that passed the validation
    df = df.drop(index=bad_rows)
    if df.shape[0] == 0:
        print(f"Nothing to insert in table: {table_name}")
        continue

    print(f"Commiting registers in table: {table_name}")
    InsertData(engine, orm_model).Insert(df)
    print("Done")

Validating registers in table: comments
Done
Commiting registers in table: comments
Done


In [14]:
# Last step of the notebook: dispose of the engine created for the inserts above
engine.dispose()