In [1]:
import json
import os
import subprocess

import googleapiclient.discovery
import pandas as pd
from isodate import parse_duration
from langdetect import detect

In [2]:
# Load the 'Clean Video Sheet' worksheet of the video-list workbook from the data/ folder.
data_excel = 'video-list-ranked new coding system May Final (Completed Set) (22_5_23).xlsx'
df = pd.read_excel(
    'data/' + data_excel,
    sheet_name='Clean Video Sheet',
)
df.head()

Unnamed: 0,id,nrComments,nrThreads,nrReplies,maxThreadLength,disabled,nrAuthors,nrAuthors3Posts,maxPostsByAuthor,url,...,popularityScore2,debateScore,debateScore2,Age,Language,Duration,Pro or agai,Type,Delivery,Other notes
0,eRLJscAlk1M,118,10,19,500,False,115,0,2,https://www.youtube.com/watch?v=eRLJscAlk1M,...,17.312566,0.607542,6.612041,7 yrs,English,6 mins,Pro,Song,Drama,Weird add at the end
1,yiw6_JakZFc,150,26,50,458,False,111,7,17,https://www.youtube.com/watch?v=yiw6_JakZFc,...,16.403931,0.652462,6.614726,1 yr,English,15 mins,Both,Ed Vid,Illustration,Kurzgesagt channel
2,9lD29jqH078,116,10,16,3,False,97,2,5,https://www.youtube.com/watch?v=9lD29jqH078,...,15.699717,0.322811,5.42495,4 yrs,English,19 mins,Both,Debate,Debate,"Joe vs Candace, celebrity figure"
3,LxgMdjyw8uw,168,29,68,382,False,107,10,21,https://www.youtube.com/watch?v=LxgMdjyw8uw,...,16.154511,0.631898,6.532334,10 months,English,16 mins,Pro,Ed Vid,Illustration,Kurzgesagt channel - We WILL FIX IT
4,y564PsKvNZs,97,1,1,1,False,86,1,3,https://www.youtube.com/watch?v=y564PsKvNZs,...,16.012567,0.258505,5.225747,2 yrs,English,7 mins,Neither,Interview,Q&A,"Peterson's comments on climate change, emphisi..."


In [3]:
yt_ids = df['id'].tolist()

# The YouTube API only allows querying 50 videos per request,
# so split the id list into consecutive chunks of at most 50 ids.
yt_batches = []
for start in range(0, len(yt_ids), 50):
    yt_batches.append(yt_ids[start:start + 50])

In [14]:
# Read the API key from a local text file; surrounding whitespace/newlines are stripped.
# The context manager closes the file handle instead of leaving it open.
with open("youtube-API-key.txt") as key_file:
    ApiKey = key_file.read().strip()

In [15]:
# Service name and version handed to the discovery client builder.
api_service_name = "youtube"
api_version = "v3"

# Client object used by every API helper in this notebook.
youtube = googleapiclient.discovery.build(
    serviceName=api_service_name,
    version=api_version,
    developerKey=ApiKey,
)

In [16]:
def getVideos(query, amount):
    """Run a YouTube search and return the raw API response.

    Args:
        query: Search terms, sent as the `q` parameter.
        amount: Sent as `maxResults`.

    Returns:
        Whatever `request.execute()` returns for the search request.

    Raises:
        googleapiclient.http.HttpError: Re-raised after printing 'error'.
    """
    search_params = {
        "part": "snippet",
        "maxResults": amount,
        "q": query,
        "relevanceLanguage": 'en',
    }
    request = youtube.search().list(**search_params)
    try:
        response = request.execute()
    except googleapiclient.http.HttpError as ee:
        print('error')
        raise ee
    else:
        return response

In [17]:
def getVideoStatistics(videoIds):
    """Fetch the `statistics` part for one or more videos.

    Args:
        videoIds: Value sent as the `id` parameter (callers in this notebook
            pass a comma-joined string of video ids).

    Returns:
        Whatever `request.execute()` returns for the videos request.

    Raises:
        googleapiclient.http.HttpError: Re-raised after printing 'error'.
    """
    request = youtube.videos().list(part='statistics', id=videoIds)
    try:
        response = request.execute()
    except googleapiclient.http.HttpError as ee:
        print('error')
        raise ee
    else:
        return response

In [24]:
# One statistics request per batch of ids; flatten every returned item into a single list.
stats = [
    item
    for batch in yt_batches
    for item in getVideoStatistics(','.join(batch))['items']
]

In [25]:
stats_df = pd.DataFrame(stats)

# Join the API results onto the video list, using the 'id' column as the key.
df = df.merge(stats_df, on='id')

# Expand the per-row 'statistics' dict into one column per counter.
stat_columns = df['statistics'].apply(pd.Series)

# NOTE(review): fillna(0) is applied to the whole frame, so missing values in
# every column (not only the counters) become 0 - confirm this is intended.
df = (
    pd.concat([df.drop(columns=['statistics']), stat_columns], axis=1)
    .fillna(0)
    .astype({
        'viewCount': int,
        'likeCount': int,
        'favoriteCount': int,
        'commentCount': int,
    })
)

df.head(1)

Unnamed: 0,id,nrComments,nrThreads,nrReplies,maxThreadLength,disabled,nrAuthors,nrAuthors3Posts,maxPostsByAuthor,url,Relevance,Content,kind,etag,viewCount,likeCount,favoriteCount,commentCount
0,N7Qot_aax9M,32,5,13,8,False,19,3,6,https://www.youtube.com/watch?v=N7Qot_aax9M,Y,Politician,youtube#video,sO8A-6UG2leEDWZF_sqfsKWRSAE,2374,35,0,82


In [144]:
# Write the current `df` to an Excel file, leaving out the DataFrame index.
df.to_excel('data/video-list-first-draft_categorized_metadata.xlsx', index=False)

In [8]:
# fetch one page of comment threads for a video
def getVideoComments(vid, nextPageToken=None, maxResults=100):
    """Request a single page of comment threads for the video `vid`.

    Args:
        vid: Video id, sent as `videoId`.
        nextPageToken: Sent as `pageToken`; None when no token is supplied.
        maxResults: Sent as `maxResults`.

    Returns:
        The response from `request.execute()`, or None when the HttpError
        payload contains the reason "commentsDisabled".

    Raises:
        googleapiclient.http.HttpError: For any other API error.
    """
    list_kwargs = {
        "part": "snippet",
        "maxResults": maxResults,
        "videoId": vid,
        "pageToken": nextPageToken,
    }
    request = youtube.commentThreads().list(**list_kwargs)
    try:
        response = request.execute()
    except googleapiclient.http.HttpError as ee:
        # The second exception argument is searched as bytes for the
        # "commentsDisabled" reason; anything else is propagated unchanged.
        if b'"reason": "commentsDisabled"' not in ee.args[1]:
            raise
        return None

    # Progress line: page info, number of items on this page, and the first
    # 20 characters of the next-page token (or a placeholder when absent).
    token_preview = response.get("nextPageToken", ">>> No nextPageToken!")[:20]
    print(response['pageInfo'], len(response['items']), token_preview)

    return response

In [5]:
# fetch one page of replies to a comment
def getCommentReplies(parentId, nextPageToken=None):
    """Request a single page (up to 100 results) of replies to `parentId`.

    Args:
        parentId: Id of the parent comment, sent as `parentId`.
        nextPageToken: Sent as `pageToken`; None when no token is supplied.

    Returns:
        The response from executing the `comments().list` request. API errors
        are not caught here.
    """
    return youtube.comments().list(
        part="snippet",
        maxResults=100,
        parentId=parentId,
        pageToken=nextPageToken,
    ).execute()

In [4]:
# using this tool: https://github.com/egbertbouman/youtube-comment-downloader
def getCommentsAndReplies(yt_id, output_dir):
    """Run the `youtube-comment-downloader` command-line tool for one video.

    The tool is asked to write its output to `{output_dir}/{yt_id}.json`.

    The command is passed as an argument list rather than an interpolated
    shell string, so spaces or shell metacharacters in `yt_id` / `output_dir`
    cannot split or alter the command.

    Args:
        yt_id: YouTube video id, passed as `--youtubeid=<yt_id>`.
        output_dir: Directory part of the `--output` path.

    Returns:
        None. The tool's exit status is not checked.
    """
    command = [
        'youtube-comment-downloader',
        f'--youtubeid={yt_id}',
        '--output',
        f'{output_dir}/{yt_id}.json',
    ]
    try:
        # check=False: a non-zero exit status does not raise.
        subprocess.run(command, check=False)
    except FileNotFoundError:
        # Without a shell, a missing executable raises instead of returning a
        # non-zero status; report it and keep going.
        print('youtube-comment-downloader was not found on PATH')

In [7]:
# Keep the 10 rows with the highest 'debateScore2' from the ranked video list.
ranked_df = (
    pd.read_excel('data/video-list-ranked.xlsx')
    .sort_values(by='debateScore2', ascending=False)
    .head(10)
)
ranked_df

Unnamed: 0,id,nrComments,nrThreads,nrReplies,maxThreadLength,disabled,nrAuthors,nrAuthors3Posts,maxPostsByAuthor,url,...,kind,etag,viewCount,likeCount,favoriteCount,commentCount,popularityScore,popularityScore2,debateScore,debateScore2
18,uqwvf6R1_QY,141,19,41,493,False,99,11,8,https://www.youtube.com/watch?v=uqwvf6R1_QY,...,youtube#video,PMpexQnV6Tz8psLc-h7OJv-A-a8,2491977,146394,0,26181,0.097843,14.795546,0.619581,6.624065
1,yiw6_JakZFc,150,26,50,458,False,111,7,17,https://www.youtube.com/watch?v=yiw6_JakZFc,...,youtube#video,4alTF5gz6Ex7xLcAb_66LSvgmJI,12544059,700793,0,63876,0.354178,16.403931,0.652462,6.614726
0,eRLJscAlk1M,118,10,19,500,False,115,0,2,https://www.youtube.com/watch?v=eRLJscAlk1M,...,youtube#video,0hEWhvgdhdyvjBBDXZB4jmwsBrI,30260556,2588214,0,169326,1.0,17.312566,0.607542,6.612041
5,EhAemz1v7dQ,165,31,65,388,False,118,9,8,https://www.youtube.com/watch?v=EhAemz1v7dQ,...,youtube#video,q7weJ53HpdehmDafKpNhu_Mrgys,7997197,311483,0,26269,0.17992,15.935968,0.658505,6.555357
6,wbR-5mHI6bo,128,17,28,445,False,103,9,3,https://www.youtube.com/watch?v=wbR-5mHI6bo,...,youtube#video,gGFOXlFfyildZSR0N256ViKcQlI,6718248,377664,0,42908,0.207111,15.781058,0.585561,6.542472
2,LxgMdjyw8uw,168,29,68,382,False,107,10,21,https://www.youtube.com/watch?v=LxgMdjyw8uw,...,youtube#video,6IJaw1CgYc68g8QeKca7L0eJdUQ,9739864,588197,0,42800,0.267297,16.154511,0.631898,6.532334
14,o-TMOeCDeus,113,3,13,408,False,97,3,6,https://www.youtube.com/watch?v=o-TMOeCDeus,...,youtube#video,rIhmUb6bT02JWDTqDiv5_jh5QGE,2766257,141064,0,6903,0.062227,14.885114,0.50347,6.43294
47,E6bVBH9y5O8,207,41,107,179,False,116,11,23,https://www.youtube.com/watch?v=E6bVBH9y5O8,...,youtube#video,6iVLD99J93FV5rOEI3b8_P73JBI,492742,22218,0,6995,0.022058,13.165337,0.616836,6.298949
119,fByLTBpWNQU,266,58,166,31,False,113,23,58,https://www.youtube.com/watch?v=fByLTBpWNQU,...,youtube#video,Q-EsCmiVTMY86DIW7pMoGkl8NjE,26407,939,0,646,0.001682,10.239674,0.638758,6.150603
38,5Gk9gIpGvSE,130,11,30,234,False,90,11,8,https://www.youtube.com/watch?v=5Gk9gIpGvSE,...,youtube#video,AQmwUtGFNLMW0KPG3SJdN5DDCoE,795742,15843,0,5629,0.021886,13.613656,0.440827,6.144186


In [8]:
# Rebind `yt_ids` to the ids in `ranked_df`; the loops below iterate over this list.
yt_ids = ranked_df['id'].tolist()

## Using the YouTube API

In [94]:
# get all top level video comments for each video
for yt_id in yt_ids:
    print(yt_id, '\n')
    response = getVideoComments(yt_id)

    # getVideoComments returns None when comments are disabled for the video;
    # there is nothing to page through or save in that case.
    if response is None:
        print('comments disabled, skipping', yt_id)
        continue

    # The first page doubles as the accumulator: later pages are appended to its 'items'.
    result = response

    # get all comments with pagination
    while 'nextPageToken' in response:
        # Page with the id we already have instead of reading it back from
        # response['items'][0], which fails on a page with no items.
        response = getVideoComments(yt_id, response['nextPageToken'])
        if response is None:
            break
        result['items'].extend(response['items'])

    # Context manager so the output file is flushed and closed before the next video.
    with open(f"data/comments-data/{yt_id}.json", "w") as out_file:
        json.dump(result, out_file)

In [10]:
# get all comment replies
for yt_id in yt_ids:
    print(yt_id, '\n')
    with open(f'data/comments-data/{yt_id}.json', 'r') as file:
        data = json.load(file)

    # Per-video folder that receives one JSON file per top-level comment.
    replies_dir = f'data/comments-data/{yt_id}'

    # loop through each top level comment in the video
    for item in data['items']:
        parentId = item['id']
        response = getCommentReplies(parentId)

        # The first page doubles as the accumulator: later pages are appended to its 'items'.
        result = response

        # get all comment replies with pagination
        while 'nextPageToken' in response:
            response = getCommentReplies(parentId, response['nextPageToken'])
            result['items'].extend(response['items'])

        # Created lazily (only once there is something to write), as before;
        # exist_ok makes the repeated call a no-op.
        os.makedirs(replies_dir, exist_ok=True)

        # Context manager so each of the many output files is closed right away.
        with open(f'{replies_dir}/{yt_id}.{parentId}.json', "w") as out_file:
            json.dump(result, out_file)

## Using the Comment Downloader Tool

In [None]:
for yt_id in yt_ids:
    print(yt_id)
    target_dir = f'data/comments-data/{yt_id}'

    # A video whose folder already exists is skipped entirely;
    # the downloader only runs for folders created here.
    if os.path.exists(target_dir):
        continue

    os.makedirs(target_dir)
    getCommentsAndReplies(yt_id, target_dir)

# Get Viable Videos
- English
- longer than 60 seconds

In [13]:
viable_videos = []
query = 'climate change'
amount = 2
videos = getVideos(query, amount)

for search_result in videos.get('items', []):
    # The search is not restricted to videos, so a hit may be a channel or
    # playlist, which carries no 'videoId' under 'id'. Skip those.
    video_id = search_result.get('id', {}).get('videoId')
    if video_id is None:
        continue

    video_response = youtube.videos().list(
        id=video_id,
        part='contentDetails,statistics'
    ).execute()

    # Guard against a lookup that returns no item for this id.
    video_items = video_response.get('items', [])
    if not video_items:
        continue
    video = video_items[0]

    # 'duration' is parsed with isodate (values look like 'PT7M7S') into seconds.
    duration = video['contentDetails']['duration']
    duration_seconds = parse_duration(duration).total_seconds()

    # Keep only videos longer than 60 seconds.
    if duration_seconds > 60:
        print(video_id, ':  ', duration_seconds)
        viable_videos.append(video)

In [7]:
# Display the video resources that passed the duration filter.
viable_videos

[{'kind': 'youtube#video',
  'etag': 'pp0QILfcxm5Muol13mIi_6-WEts',
  'id': 'dcBXmj1nMTQ',
  'contentDetails': {'duration': 'PT7M7S',
   'dimension': '2d',
   'definition': 'hd',
   'caption': 'true',
   'licensedContent': True,
   'contentRating': {},
   'projection': 'rectangular'},
  'statistics': {'viewCount': '1167502',
   'likeCount': '25793',
   'favoriteCount': '0',
   'commentCount': '2231'}},
 {'kind': 'youtube#video',
  'etag': 'a9XDPff_CF7KaW38Wd2qc8wSrJM',
  'id': 'EuwMB1Dal-4',
  'contentDetails': {'duration': 'PT6M4S',
   'dimension': '2d',
   'definition': 'hd',
   'caption': 'false',
   'licensedContent': True,
   'contentRating': {},
   'projection': 'rectangular'},
  'statistics': {'viewCount': '220172',
   'likeCount': '2827',
   'favoriteCount': '0',
   'commentCount': '119'}}]

### Test

In [35]:
# Smoke test: take the video id of the first search result and request
# one page of its comment threads (no page token is passed).
id_test = videos['items'][0]['id']['videoId']
comms = getVideoComments(id_test)

{'totalResults': 10, 'resultsPerPage': 100} 10 >>> No nextPageToken


In [39]:
# Run language detection on the display text of the second comment thread's top-level comment.
second_comment = comms['items'][1]['snippet']['topLevelComment']['snippet']
detect(second_comment['textDisplay'])

'en'