Here we'll use the YouTube API to load comments for NLP purposes

In [None]:
%autosave 60
# Common libraries
import os
import json
import pandas as pd
from getpass import getpass
import datetime
from tqdm import tqdm
# YouTube API libraries
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow, Flow

In [None]:
# secret file - file with information about your YouTube API session.
# The path is requested with getpass, so the typed value is not echoed.
# It is later passed to InstalledAppFlow.from_client_secrets_file().
CLIENT_SECRETS_FILE = getpass(prompt='Enter the path to your secret file: ') #This is the path to your JSON file

In [None]:
# This OAuth 2.0 access scope allows for full read/write access to the
# authenticated user's account and requires requests to use an SSL connection.
SCOPES = ['https://www.googleapis.com/auth/youtube.force-ssl']
# Service name and version handed to googleapiclient's build().
API_SERVICE_NAME = 'youtube'
API_VERSION = 'v3'

def get_authenticated_service():
    """
    Run the installed-app OAuth flow and build an authorized API client.

    The client configuration is read from CLIENT_SECRETS_FILE and the
    access requested is SCOPES.

    input:  nothing (uses the module-level constants)
    output: client object returned by build() for API_SERVICE_NAME / API_VERSION
    """
    flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRETS_FILE, SCOPES)
    # run_console() is missing from newer google-auth-oauthlib releases.
    # Keep using it where it exists (same behaviour as before) and fall back
    # to the local-server flow otherwise instead of failing with AttributeError.
    if hasattr(flow, 'run_console'):
        credentials = flow.run_console()
    else:
        credentials = flow.run_local_server(port=0)
    return build(API_SERVICE_NAME, API_VERSION, credentials=credentials)

# NOTE(review): presumably relaxes oauthlib's HTTPS-only check during the
# OAuth exchange -- confirm it is still required before keeping it.
os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'

# Authorized API client; get_comments() reads this module-level name.
service = get_authenticated_service()

In [None]:
def get_list(path=None):
    """
    Load a JSON document (a list of video ids in this notebook) from disk.

    input:  path + filename; when None the path is asked for via getpass
    output: (loaded data, path) - the data is an empty list when the
            file does not exist
    """
    if path is None:
        path = getpass(prompt='Enter the path to your video ids file: ')

    # Nothing stored yet: hand back an empty list together with the path
    # so the caller can create the file later.
    if not os.path.exists(path):
        return [], path

    with open(path, 'r') as source:
        return json.load(source), path
    
    
def save_list(video_ids, path):
    """
    Save a JSON-serializable object (a list of video ids in this notebook)
    to a file, replacing its previous content.

    input:  video_ids - object to serialize with json.dumps
            path      - path + filename of the target file
    output: None
    raises: TypeError when video_ids is not JSON-serializable; in that
            case the existing file is left untouched
    """
    # Serialize before opening: open(..., 'w') truncates the file at once,
    # so a failing json.dumps would otherwise wipe the stored list.
    serialized = json.dumps(video_ids)
    with open(path, 'w') as file:
        file.write(serialized)

In [None]:
def get_comments(n_videos=50, region_code='RU'):
    """
    Download top comments for popular videos that are not processed yet.

    Asks (via getpass) for the processed-ids file and for the data folder,
    requests the "mostPopular" chart through the module-level `service` and,
    for every video whose id is not in the processed list, loads up to 100
    top-level comment threads ordered by relevance. The new comments are
    written to "<data folder>/<date>_<hour>.csv" and the processed-ids file
    is then updated.

    input:  n_videos    - maxResults value of the chart request
            region_code - regionCode value of the chart request
    output: None
    """
    video_processed, path_video_processed = get_list()
    video_processed = set(video_processed)
    path_to_data = getpass(prompt='Enter the path to data folder: ')

    request = service.videos().list(  # API request
        part="snippet,id",
        chart="mostPopular",  # get most popular videos
        regionCode=region_code,
        maxResults=n_videos
    )
    executed_result = request.execute()

    # One DataFrame per newly processed video; concatenated once at the end
    # instead of re-copying a growing frame on every iteration.
    video_frames = []

    for video_data in tqdm(executed_result['items'], position=0, leave=True):
        video_id = video_data['id']

        # Skip videos handled by an earlier run.
        if video_id in video_processed:
            continue

        video_channel = video_data['snippet']['channelTitle']
        try:
            response = service.commentThreads().list(
                part='snippet',
                videoId=video_id,
                maxResults=100,  # Only take top 100 comments (max)
                order='relevance',  # ... ranked on relevance
                textFormat='plainText',
            ).execute()
        except Exception:
            # Best effort: a video whose comments cannot be loaded is skipped
            # and stays unprocessed. Exception (not a bare except) keeps
            # KeyboardInterrupt / SystemExit able to stop the run.
            continue

        rows = []
        for thread in response['items']:  # for every comment get the info
            top_comment = thread['snippet']['topLevelComment']['snippet']
            rows.append({
                'text': top_comment['textOriginal'],
                'likes': top_comment['likeCount'],
                'replies': thread['snippet']['totalReplyCount'],
            })

        # Explicit columns keep the layout stable for videos without comments.
        df_video = pd.DataFrame(rows, columns=['text', 'likes', 'replies'])
        df_video['video_id'] = video_id
        df_video['channel_name'] = video_channel

        video_frames.append(df_video)
        video_processed.add(video_id)

    if video_frames:
        ctime = datetime.datetime.now()  # for correct naming purposes
        csv_name = '{}_{}.csv'.format(ctime.date(), ctime.hour)
        df_all = pd.concat(video_frames, axis=0)
        df_all.to_csv(os.path.join(path_to_data, csv_name), index=False)
    # When nothing new was downloaded no CSV is written: a column-less file
    # would break readers that select the 'video_id' column.

    # Saved after the CSV so ids are only marked as processed once their
    # comments are safely on disk.
    save_list(list(video_processed), path_video_processed)

In [None]:
# Run one download pass with the default arguments.
get_comments()

In [None]:
# Load one saved comments CSV for a quick manual inspection.
# NOTE(review): the name looks like a get_comments() output file -- confirm.
df = pd.read_csv('D:/Storage/Данные/Комментарии YouTube/Комментарии/2020-07-22_10.csv')

In [None]:
# Size of the loaded frame, five random rows and one random comment text.
print(df.shape)
display(df.sample(5))
print(df['text'].sample(1).values)
# Number of distinct video ids in the loaded file.
print('Уникальных видео обработано: %d' % df['video_id'].nunique())

## Check how many are already downloaded

In [None]:
import os
from tqdm import tqdm

In [None]:
# Count the comments (rows) and the distinct video ids over every file
# in the comments folder.
comments_total = 0
video_total = 0
path = 'D:/Storage/Данные/Комментарии YouTube/Комментарии/'
for file in tqdm(os.listdir(path)):
    # Only the video_id column is needed for both counters.
    df_temp = pd.read_csv(path + file, usecols=['video_id'])
    # Series.nunique() returns a plain int. DataFrame.nunique() returns a
    # Series, which would turn video_total into a Series as well.
    video_total += df_temp['video_id'].nunique()
    comments_total += df_temp.shape[0]

print('Всего комментариев получено: %d' % comments_total)
print('Всего видео обработано: %d' % video_total)

# Load subtitles for already saved videos

In [None]:
import sys
import os
import time

# Raw string: the Windows path contains backslashes that would otherwise be
# read as (invalid) escape sequences. The resulting value is unchanged.
sys.path.append(r'D:\Storage\Data Science\GIT\youtube-transcript-api')
from youtube_transcript_api import YouTubeTranscriptApi

In [None]:
# Folder with the comments CSV files (listed by the subtitle download loop).
path = 'D:/Storage/Данные/Комментарии YouTube/Комментарии/'
# Folder where the downloaded subtitles are stored as JSON files.
path_subtitles = 'D:/Storage/Данные/Комментарии YouTube/Субтитры к обработанным видео/'
# File with the ids of videos whose subtitles were already requested.
path_to_loaded_videos = 'D:/Storage/Данные/Комментарии YouTube/Вспомогательное/video_processed_subs.txt'

In [None]:
# Ids whose subtitles were already requested in earlier runs.
video_processed, path_video_processed = get_list(path_to_loaded_videos)
video_processed = set(video_processed)

for file in tqdm(os.listdir(path), position=0, leave=True):

    # Base name of the CSV, reused as the name of the subtitles JSON.
    filename = file.split('.')[0]
    current_video_ids = pd.read_csv(path + file, usecols=['video_id'])['video_id'].values
    # Ids from this file that have not been requested yet.
    not_loaded_ids = set(current_video_ids) - video_processed

    # Every video of this file was handled before: nothing to download.
    if not not_loaded_ids:
        continue

    video_processed |= not_loaded_ids
    video_subtitles = YouTubeTranscriptApi.get_transcripts(
        list(not_loaded_ids),
        languages=['ru'],
        continue_after_error=True,
    )

    # One JSON file per comments CSV with everything that was returned.
    save_list(video_subtitles, path_subtitles + filename + '.json')
    time.sleep(1)  # For antibot system?

# Persist the ids requested so far so later runs skip them.
save_list(list(video_processed), path_video_processed)

## Check the subtitles of the loaded videos

In [None]:
total_video_subtitles = 0
for file in tqdm(os.listdir(path_subtitles), position=0, leave=True):
    path = 'D:/Storage/Данные/Комментарии YouTube/Субтитры к обработанным видео/' + file
    q, path = get_list(path)
    total_video_subtitles += len(q[0].keys())
    
print('Total videos with subtitles: %d' % total_video_subtitles)