Here we'll use the YouTube API to load comments for NLP purposes

In [None]:
# Common libraries
import os
import json
import pandas as pd
from getpass import getpass
import datetime
from tqdm.notebook import tqdm

# YouTube API libraries
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow, Flow

In [None]:
# Path to the OAuth client-secrets JSON file. It is handed to
# InstalledAppFlow.from_client_secrets_file() in get_authenticated_service().
# The path is requested with getpass, so it is not echoed back when typed.
CLIENT_SECRETS_FILE = getpass(prompt='Enter the path to your secret file: ') # path to your client-secrets JSON file

In [None]:
# This OAuth 2.0 access scope allows for full read/write access to the
# authenticated user's account and requires requests to use an SSL connection.
SCOPES = ['https://www.googleapis.com/auth/youtube.force-ssl']
# Service name and version passed to googleapiclient.discovery.build()
# in get_authenticated_service().
API_SERVICE_NAME = 'youtube'
API_VERSION = 'v3'

def get_authenticated_service():
    """
    Run the interactive OAuth flow and build a YouTube API client.

    Reads the client secrets from CLIENT_SECRETS_FILE and requests the
    scopes listed in SCOPES.

    output: API client returned by googleapiclient.discovery.build()
            for API_SERVICE_NAME / API_VERSION
    """
    flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRETS_FILE, SCOPES)

    # run_console() used the out-of-band (copy/paste) flow and was removed in
    # google-auth-oauthlib 1.0. Keep using it where it still exists, otherwise
    # fall back to the local-server flow (port=0 lets the OS pick a free port).
    if hasattr(flow, 'run_console'):
        credentials = flow.run_console()
    else:
        credentials = flow.run_local_server(port=0)

    return build(API_SERVICE_NAME, API_VERSION, credentials=credentials)

# Must be set before the OAuth flow below is started.
# NOTE(review): this flag presumably relaxes oauthlib's HTTPS-only check for
# the redirect; confirm it is still required and never enable it outside
# local experiments.
os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'

# Starts the interactive authentication as soon as this cell runs; the
# resulting client is the module-level `service` used by get_comments().
service = get_authenticated_service()

In [None]:
def get_list(prompt='Enter the path to your video ids file: '):
    """
    ask for a file path and load the json stored there
    input:  prompt - text shown when asking for the path
    output: (content, path) - parsed json content of the file, or an
            empty list when nothing exists at that path, together with
            the path that was entered
    """
    json_path = getpass(prompt=prompt)

    # nothing stored yet -> start from an empty list
    if not os.path.exists(json_path):
        return [], json_path

    with open(json_path, 'r') as handle:
        raw_text = handle.read()
    return json.loads(raw_text), json_path
    
    
def save_list(video_ids, path):
    """
    write video_ids to path as json, replacing whatever the file held
    input:  video_ids - json-serializable object (a list of video ids
                        when called from get_comments)
            path - destination file
    output: None
    """
    with open(path, mode='w') as out_file:
        serialized = json.dumps(video_ids)
        out_file.write(serialized)

In [None]:
# column order of the CSV written by get_comments()
_COMMENT_COLUMNS = ['text', 'likes', 'replies', 'video_id', 'channel_name']


def _comments_to_frame(comment_threads, video_id, video_channel):
    """
    turn the commentThreads items of one video into a dataframe,
    one row per top-level comment
    """
    rows = []
    for thread in comment_threads:
        top_comment = thread['snippet']['topLevelComment']['snippet']
        rows.append({
            'text': top_comment['textOriginal'],
            'likes': top_comment['likeCount'],
            'replies': thread['snippet']['totalReplyCount'],
            'video_id': video_id,
            'channel_name': video_channel,
        })
    return pd.DataFrame(rows, columns=_COMMENT_COLUMNS)


def get_comments(n_videos=50):
    """
    allows to get comments on video ids which are not in list

    Asks for the processed-ids file and the data folder, requests the most
    popular videos for region "RU", downloads up to 100 top-level comments
    (ordered by relevance) for every video that is not yet in the processed
    list, then saves the updated id list and writes all collected comments
    to '<data folder>/<date>_<hour>.csv'.

    input:  n_videos - value sent as maxResults of the videos request
                       (the YouTube Data API documents 1-50 for videos.list)
    output: None
    """
    video_processed, path_video_processed = get_list()
    video_processed = set(video_processed)  # set -> O(1) "already processed" checks
    path_to_data = getpass(prompt='Enter the path to data folder: ')

    request = service.videos().list(  # API request
        part="snippet,id",
        chart="mostPopular",  # get most popular videos
        regionCode="RU",
        maxResults=n_videos
    )
    executed_result = request.execute()

    # one dataframe per video; concatenated once after the loop instead of
    # re-copying an ever-growing dataframe on every iteration
    frames = []

    for video_data in tqdm(executed_result['items']):
        video_id = video_data['id']

        # skip videos handled in a previous run
        if video_id in video_processed:
            continue

        video_channel = video_data['snippet']['channelTitle']
        try:
            response = service.commentThreads().list(
                part='snippet',
                videoId=video_id,
                maxResults=100,  # Only take top 100 comments (max)
                order='relevance',  # ... ranked on relevance
                textFormat='plainText',
            ).execute()
        except Exception as err:
            # Best effort: a failing video is skipped and NOT marked as
            # processed, so a later run tries it again. Unlike a bare
            # `except`, this lets KeyboardInterrupt / SystemExit through
            # and makes the failure visible.
            print(f'Skipping video {video_id}: {err}')
            continue

        df_video = _comments_to_frame(response['items'], video_id, video_channel)
        if not df_video.empty:
            frames.append(df_video)

        video_processed.add(video_id)

    if frames:
        df_all = pd.concat(frames, axis=0, ignore_index=True)
    else:
        # nothing new collected: still write a CSV with the header row
        df_all = pd.DataFrame(columns=_COMMENT_COLUMNS)

    save_list(list(video_processed), path_video_processed)

    ctime = datetime.datetime.now()  # timestamp for naming purposes
    file_name = f'{ctime.date()}_{ctime.hour}.csv'
    # os.path.join copes with a trailing separator and with an empty folder
    # input (which previously resolved to the filesystem root)
    df_all.to_csv(os.path.join(path_to_data, file_name), index=False)

In [None]:
# get_comments()