Here we'll use the YouTube API to load comments for NLP purposes

In [1]:
# Common libraries
import os
import json
import pandas as pd
from getpass import getpass
import datetime
from tqdm.notebook import tqdm

# YouTube API libraries
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow, Flow

In [2]:
# Path to the client-secrets JSON file that describes your YouTube API session.
# It is read with getpass so the typed path is not echoed into the notebook output.
CLIENT_SECRETS_FILE = getpass(prompt='Enter the path to your secret file: ') # This is the path to your JSON file

Enter the path to your secret file: ········


In [3]:
# This OAuth 2.0 access scope allows for full read/write access to the
# authenticated user's account and requires requests to use an SSL connection.
SCOPES = ['https://www.googleapis.com/auth/youtube.force-ssl']
# Service name and version passed to build() in get_authenticated_service().
API_SERVICE_NAME = 'youtube'
API_VERSION = 'v3'

def get_authenticated_service():
    """
    Run the OAuth 2.0 installed-application flow and build the API client.

    Reads the module-level CLIENT_SECRETS_FILE and SCOPES, asks the user to
    authorize the application and returns the client created by
    build(API_SERVICE_NAME, API_VERSION, credentials=...).

    output: API client object returned by googleapiclient.discovery.build
    """
    flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRETS_FILE, SCOPES)

    # run_console() (the copy/paste authorization-code flow) is not available in
    # newer google-auth-oauthlib releases. Keep using it where it exists and
    # fall back to the local-server flow otherwise, so the cell does not fail
    # with AttributeError after a library upgrade.
    if hasattr(flow, 'run_console'):
        credentials = flow.run_console()
    else:
        # port=0 lets the OS pick a free port for the temporary redirect server.
        credentials = flow.run_local_server(port=0)

    return build(API_SERVICE_NAME, API_VERSION, credentials = credentials)

# NOTE(review): this presumably tells oauthlib to accept non-HTTPS URLs during
# the OAuth flow; confirm it is really required here, because it relaxes a
# transport-security check for the whole process.
os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'

# Module-level API client; get_comments() reads this global directly.
service = get_authenticated_service()

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=552091031714-nuc0n8215tr9h102b7nsd05d0bm9us8c.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fyoutube.force-ssl&state=7Ovlhw49CbnGuKTkIOzfww9SEdX4oN&prompt=consent&access_type=offline
Enter the authorization code: 4/0AFwjOWkMVaH4JvP4Bwc9FTuGNt2UOF5XlEm3CTiyC3DvcGA5LiPqq4


In [4]:
def get_list(prompt='Enter the path to your video ids file: '):
    """
    Ask for a file path and load the JSON list stored in that file.

    input:  prompt - text shown when asking for the path (path + filename)
    output: tuple (list, path); the list is empty when the file does not
            exist yet or exists but has no content
    """
    path = getpass(prompt=prompt)

    # First run: nothing has been saved yet, start from an empty list.
    if not os.path.exists(path):
        return list(), path

    with open(path, 'r', encoding='utf-8') as file:
        raw = file.read()

    # An empty (or whitespace-only) file is treated like a missing one instead
    # of failing with JSONDecodeError; malformed content still raises.
    video_ids = json.loads(raw) if raw.strip() else list()
    return video_ids, path
    
    
def save_list(video_ids, path):
    """
    Save a list to a file as JSON, replacing the previous content.

    input:  video_ids - JSON-serializable list
            path      - path + filename to write to
    raises: TypeError if video_ids cannot be encoded as JSON; the file is
            left untouched in that case
    """
    # Serialize before opening: opening with 'w' truncates the file, so an
    # encoding failure after that point would wipe the previously saved list.
    serialized = json.dumps(video_ids)
    with open(path, 'w', encoding='utf-8') as file:
        file.write(serialized)

In [5]:
def get_comments(n_videos=50):
    """
    Download comments for popular videos that have not been processed yet.

    Asks for the path to the processed-video-ids file and for the data folder,
    requests the "mostPopular" chart for region RU, and for every video whose
    id is not in the processed list fetches up to 100 top-level comments
    ordered by relevance. The comments are written to one CSV file in the data
    folder and the processed-ids file is updated afterwards.

    input:  n_videos - value passed as maxResults to the videos().list request
    output: None (results are written to disk)
    """
    video_processed, path_video_processed = get_list()
    video_processed = set(video_processed)
    path_to_data = getpass(prompt='Enter the path to data folder: ')

    request = service.videos().list( # API request
        part="snippet,id",
        chart="mostPopular", # get most popular videos
        regionCode="RU",
        maxResults=n_videos
    )
    executed_result = request.execute()

    columns = ['text', 'likes', 'replies', 'video_id', 'channel_name']
    rows = [] # one dict per comment; the DataFrame is built once after the loop

    for video_data in tqdm(executed_result['items']):
        video_id = video_data['id']

        # skip videos that were already processed in a previous run
        if video_id in video_processed:
            continue

        video_channel = video_data['snippet']['channelTitle']
        try:
            response = service.commentThreads().list(
                part = 'snippet',
                videoId = video_id,
                maxResults = 100, # Only take top 100 comments (max)
                order = 'relevance', #... ranked on relevance
                textFormat = 'plainText',
                ).execute()
        except Exception as error:
            # Best effort: a failing video is skipped and NOT marked as
            # processed, so it is retried next time. Unlike a bare except,
            # this lets KeyboardInterrupt stop the loop.
            print('Skipping video %s: %s' % (video_id, error))
            continue

        for comment in response['items']: # for every comment get the info
            top_level = comment['snippet']['topLevelComment']['snippet']
            rows.append({
                'text': top_level['textOriginal'],
                'likes': top_level['likeCount'],
                'replies': comment['snippet']['totalReplyCount'],
                'video_id': video_id,
                'channel_name': video_channel,
            })

        video_processed.add(video_id)

    # Explicit columns keep the CSV header present even when nothing new was found.
    df_all = pd.DataFrame(rows, columns=columns)

    ctime = datetime.datetime.now() # for correct naming purposes
    base_name = str(ctime.date()) + '_' + str(ctime.hour)
    csv_path = os.path.join(path_to_data, base_name + '.csv')

    # The name only has hour resolution: never overwrite the file of an earlier
    # run, because its videos are already marked as processed and would not be
    # downloaded again.
    suffix = 1
    while os.path.exists(csv_path):
        csv_path = os.path.join(path_to_data, '%s_%d.csv' % (base_name, suffix))
        suffix += 1

    # Write the data first and update the processed list only afterwards, so a
    # failed CSV write does not mark videos as done.
    df_all.to_csv(csv_path, index=False)
    save_list(list(video_processed), path_video_processed)

In [6]:
# Interactive run: prompts for the video-ids file and the data folder, then downloads.
get_comments()

Enter the path to your video ids file: ········
Enter the path to data folder: ········


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




In [7]:
# NOTE(review): absolute, machine-specific path; presumably one of the CSV files
# written by get_comments() — adjust it to your own data folder before running.
df = pd.read_csv('D:/Storage/Данные/Комментарии YouTube/Комментарии/2020-05-25_20.csv')

In [14]:
# Quick look at the loaded comments: frame size, five random rows, one random
# comment text and the number of distinct video ids. No random_state is passed
# to sample(), so the sampled rows differ between runs.
print(df.shape)
display(df.sample(5))
print(df['text'].sample(1).values)
print('Уникальных видео обработано: %d' % df['video_id'].nunique())  # "Unique videos processed"

(3790, 5)


Unnamed: 0,text,likes,replies,video_id,channel_name
347,Тебе жалко брата тим тима?\nЛайк-Да✔️\nИгнор-Н...,0,0,wQeb53fyMZc,Tим Тим
1383,"Спасибо огромное, от души, спасибо огромнейшее...",0,0,ez57UvmVd3c,Супер Сериал
91,Желаю твою семью тебя здоровья и счастья 🤲,1,0,EoL2iuefmA8,Чердак
3037,Демура Заставляет думать тех кто готов его слу...,0,0,HBNc4A-nC50,Нейромир-ТВ
3303,"Красивые😍,но длинные 😨",9,0,I_gDFjPM_gY,Татьяна Бугрий


['Саня Жирный, гнилой человек, сразу было видно']
Уникальных видео обработано: 38


In [15]:
# Load the saved list of processed video ids (li) and the path it was read from (pa).
li, pa = get_list()

Enter the path to your video ids file: ········


In [16]:
print('Видео обработано: %d' % len(li))  # "Videos processed"
# Estimate only: multiplies by 100, the maxResults cap used per video in
# get_comments(); videos with fewer comments make the real total lower.
print('Комментариев получено: %d' % (len(li) * 100))  # "Comments received"

Видео обработано: 567
Комментариев получено: 56700
