# YT Comments analysis

In [37]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from datetime import datetime, timedelta, time
import pandas as pd
import json
import re
from api import API_KEY

channel_id = "UCWeg2Pkate69NFdBeuRFTAw" #Squeezie channel
etoiles = 'UCABf02qOye7XYapcK1M45LQ'
youtube = build('youtube', 'v3', developerKey=API_KEY)
exemple_video = "qCKyRhkhqoQ"
otp_recap = 'F7A8OCdmZ90'

### Class request
Class to handle youtube request since youtube api doesn't provide a request object

In [38]:
class Request:
    """ Class Request handling youtube request as an object """
    def __init__(self, requestType,part=None, id=None, chart=None, regionCode=None, maxResults=None, pageToken=None, videoId=None):
        self.requestType = requestType
        self.part = part
        self.id = id
        self.chart = chart
        self.regionCode = regionCode
        self.maxResults = maxResults
        self.pageToken = pageToken
        self.videoId = videoId
        
    def execute(self):
        param = vars(self) # Fetch class attributes
        param = {x:y for x,y in list(param.items())[1:] if y} # Delete requestType ([1:]) and None attributes
        
        request = self.requestType.list(**param)
        return request.execute()

### Decorator
Decorator to retry when youtube request fails (mostly due to timeout erros)

In [39]:
def retry_on_exception(max_attempts=5):
    def decorator(func):
        def wrapper(*args, **kwargs):
            attempts = 0
            while attempts < max_attempts:
                try:
                    result = func(*args, **kwargs)
                except Exception as e:
                    attempts += 1
                    if attempts == max_attempts:
                        return pd.DataFrame()
                        raise  # Relancer l'exception si le nombre maximal de tentatives est atteint
                    else:
                        print(f"{attempts}: Une exception s'est produite : {e}")
                else:
                    return result  # Retourner le r√©sultat si aucune exception n'est lev√©e
                # time.sleep(0.5)
        return wrapper
    return decorator

### Datetime convertions
Functions to convert iso formated date found in youtube api responses in datetime objects.

In [40]:
def iso_toDatetime(iso_date:str):
    """Converts an ISO 8601 formatted date to a datetime object."""
    return datetime.strptime(iso_date[:-1], '%Y-%m-%dT%H:%M:%S')

def datetime_toISO(dt_obj:datetime):
    """Converts a datetime object to an ISO 8601 formatted date."""
    return dt_obj.isoformat()[:-7]  # remove microseconds

def iso_toDelta(iso_duration:str):
    """Converts an ISO 8601 formatted duration to a timedelta object."""
    match = re.match(r'PT(\d+D)*(\d+H)*(\d+M)*(\d+S)', iso_duration)
    days, hours, minutes, seconds = [int(x[:-1]) if x else 0 for x in match.groups()]
    return timedelta(days=days,hours=hours, minutes=minutes, seconds=seconds)

def delta_toISO(delta_obj:timedelta):
    """Converts a timedelta object to an ISO 8601 formatted duration."""
    hours = delta_obj.seconds // 3600
    minutes = (delta_obj.seconds % 3600) // 60
    seconds = delta_obj.seconds % 60
    
    daysStr = f"{delta_obj.days}D" if delta_obj.days != 0 else ""
    hoursStr = f"{hours}H" if hours != 0 else ""
    minutesStr = f"{minutes}M" if minutes != 0 else ""
    secondsStr = f"{seconds}S" if seconds != 0 else ""
    return f"PT{daysStr}{hoursStr}{minutesStr}{secondsStr}"

# print(iso_toDelta('PT4D3H20M9S'))
# print(delta_toISO(iso_toDelta('PT20M9S')))

### Fectching functions
Functions to fetch channels, comments ant top vids infos.

In [41]:
def format_channel_dataDF(channel_data: dict) -> pd.DataFrame:
    """ Structure raw channel data """
    data = {
        "channel_name": [channel_data.get('snippet', {}).get('title')],
        "channel_id": [channel_data.get('id')],
        "country": [channel_data.get('snippet', {}).get('country',"")],
        **{k:[int(v)] for k,v in channel_data.get('statistics', {}).items() if k != "hiddenSubscriberCount"},
        "topics": [[wikilink.split('/')[-1] for wikilink in channel_data.get('topicDetails', {}).get('topicCategories', [])]],
    }
    return pd.DataFrame.from_dict(data)

In [42]:
def get_channel_data(youtube, channel_id:str) -> dict[str|dict]:
    """ Request (by id) for most important channel stats """
    request = Request(
        requestType=youtube.channels(),
        part="snippet,contentDetails,statistics,topicDetails",
        id=channel_id
    )
    response = request.execute()
    rawData = response.get('items', [])[0]
    return format_channel_dataDF(rawData)
    

get_channel_data(youtube, etoiles)

Unnamed: 0,channel_name,channel_id,country,viewCount,subscriberCount,videoCount,topics
0,Etoiles,UCABf02qOye7XYapcK1M45LQ,FR,48615393,271000,552,"[Action_game, Role-playing_video_game, Action-..."


In [43]:
def format_video_dataDF(video_data: dict) -> pd.DataFrame:
    """ Structure raw video data """
    data = {
            "title": [video_data.get('snippet', {}).get('title')],
            "id": [video_data.get('id')],
            "publishedAt": [video_data.get('snippet', {}).get('publishedAt')],
            "duration" : [video_data.get('contentDetails').get('duration')],
            "ViewCount" : [int(video_data.get('statistics', {}).get('viewCount', 0))],
            "likeCount" : [int(video_data.get('statistics', {}).get('likeCount', 0))],
            "commentCount" : [int(video_data.get('statistics', {}).get('commentCount', 0))],  
            "tags" : [video_data.get('snippet', {}).get('tags')]
    }
    
    return pd.DataFrame.from_dict(data)

In [44]:
def get_video_data(youtube, video_Id:str) -> dict[str|dict]:
    """ Request (by id) for most important video stats """
    request = Request(
        requestType=youtube.videos(),
        part="snippet,contentDetails,statistics,topicDetails",
        id=video_Id,
    )
    response = request.execute()
    
    rawData = response.get('items', [])[0]
    return format_video_dataDF(rawData)

get_video_data(youtube, exemple_video)

Unnamed: 0,title,id,publishedAt,duration,ViewCount,likeCount,commentCount,tags
0,"LE JEU DE LA F√àVE (Avec Joyca, Zafeel et Hctuan)",qCKyRhkhqoQ,2024-02-10T12:02:00Z,PT35M30S,2709309,185090,2020,"[Mastu, Mastus, Humour, Matsu, loat, mastu loa..."


In [49]:
def get_Most_Popular_Video(youtube, region:str) -> pd.DataFrame:
    """ Request for most populars videos stats """
    request = Request(
        requestType=youtube.videos(),
        part="snippet,contentDetails,statistics,topicDetails",
        chart="mostPopular",
        regionCode=region,
        maxResults=100,
        pageToken=''
    )
    response = request.execute()
    
    pages = [response]
    while response.get('nextPageToken'):
        request.pageToken = response.get('nextPageToken')
        response = request.execute()
        pages.append(response)
    
    top_videos = pd.concat([format_video_dataDF(videos) for page in pages for videos in page.get('items')]).reset_index(drop=True)
    top_videos['topID'] = top_videos.index + 1
    top_videos['region'] = region
    return top_videos

get_Most_Popular_Video(youtube, 'FR')
# df.sort_values(by=['fetchedDate'], ascending=False, inplace=True, kind='quicksort', ignore_index=True)
# df

Unnamed: 0,title,id,publishedAt,duration,ViewCount,likeCount,commentCount,tags,topID,region
0,100 d√©fis en une Journ√©e Challenge #3 (Ft. Mic...,FAd8SD3W3zg,2024-03-02T14:15:02Z,PT1H21M32S,2747078,272487,4029,"[INOXTAG, INOX, CROUTON, inox bucketlist, 100 ...",1,FR
1,On a v√©rifi√© tous les angles morts Baki !,w16i0PsLzu0,2024-03-03T21:48:23Z,PT13M56S,543304,46410,4312,,2,FR
2,JOUEUSE DU GRENIER - Les jeux pour fille sur WII,naleZElAgW8,2024-03-03T10:50:06Z,PT25M44S,743478,103610,7290,,3,FR
3,TOP CHEF ou FLOP CHEF ? ( on tourne sur le vra...,9YeawUJIDNY,2024-03-03T16:22:13Z,PT58M4S,233890,19090,374,[natoo],4,FR
4,"LE CASSE DU SI√àCLE (ft Joyca, Seb & Sofyan)",aD_lbT6Bils,2024-03-02T09:59:01Z,PT52M54S,1191442,81749,787,"[djilsi, france, on va ou, camera cachee, cach...",5,FR
...,...,...,...,...,...,...,...,...,...,...
195,J'AI ACHET√â UN SUPERMACH√â AVEC LA YOUTUBE MONE...,h-cgWGVdHhA,2024-02-26T17:00:34Z,PT11M30S,186314,15854,440,,196,FR
196,Des tracteurs forcent un barrage de police et ...,YXeq2BtRDuw,2024-02-26T16:16:49Z,PT1M58S,241339,4075,875,"[agriculteur, agriculture, barrage, barrage de...",197,FR
197,LE VERDICT EST TOMB√â POUR HORNER !! PREVIEW #1...,6pCu6a_VJ7o,2024-02-28T16:44:17Z,PT9M42S,94832,6482,262,"[Formule 1 2022, f1 race highlight, F1 canal +...",198,FR
198,Qu'Est-il Arriv√© √† Fortnite... ?,A1F4gGUrtSQ,2024-02-26T14:49:34Z,PT15M,144344,4690,157,"[deathy, deathy fortnite, fortnite, unchained,...",199,FR


In [None]:
def format_comment_data(comment:dict) -> dict[str|dict]:
    """ Structure raw comment data """
    data = {
        "id": comment.get('id'),
        "comment": comment.get('snippet', {}).get('textOriginal'),
        # "viewerRating": comment.get('snippet', {}).get('viewerRating'),
        "likeCount": comment.get('snippet', {}).get('likeCount'),
        "publishedAt": comment.get('snippet', {}).get('publishedAt'),
        "updatedAt": comment.get('snippet', {}).get('updatedAt')
        }
    
    return data

def format_threadedComment_data(comment:dict) -> dict[str|dict]:
    """ Structure raw threaded comment data """
    data = {
        **format_comment_data(comment.get('snippet', {}).get('topLevelComment')),
        "totalReplyCount": comment.get('snippet', {}).get('totalReplyCount'),
        # "replies": [format_comment_data(com) for com in comment.get('replies', {}).get('comments', [])]
        }
    
    return data

In [None]:
def format_comment_dataDF(comment:dict) -> pd.DataFrame:
    """ Structure raw comment data """
    data = {
        "id": [comment.get('id')],
        "comment": [comment.get('snippet', {}).get('textOriginal')],
        # "viewerRating": comment.get('snippet', {}).get('viewerRating'),
        "likeCount": [int(comment.get('snippet', {}).get('likeCount'))],
        "publishedAt": [comment.get('snippet', {}).get('publishedAt')],
        "updatedAt": [comment.get('snippet', {}).get('updatedAt')]
        }
    
    return data

def format_threadedComment_dataDF(comment:dict) -> pd.DataFrame:
    """ Structure raw threaded comment data """
    data = {
        **format_comment_dataDF(comment.get('snippet', {}).get('topLevelComment')),
        "totalReplyCount": [int(comment.get('snippet', {}).get('totalReplyCount'))],
        # "replies": [format_comment_data(com) for com in comment.get('replies', {}).get('comments', [])]
        }
    
    return data

In [None]:
def format_commentPageDF(page:list[dict]) -> pd.DataFrame:
    data = {}
    for comment in page:
        topLevelComment = comment.get('snippet', {}).get('topLevelComment')
        data.setdefault("id", []).append(topLevelComment.get('id')) 
        data.setdefault("comment", []).append(topLevelComment.get('snippet', {}).get('textOriginal'))
        data.setdefault("likeCount", []).append(int(topLevelComment.get('snippet', {}).get('likeCount')))
        data.setdefault("publishedAt", []).append(topLevelComment.get('snippet', {}).get('publishedAt'))
        data.setdefault("updatedAt", []).append(topLevelComment.get('snippet', {}).get('updatedAt'))
        data.setdefault("totalReplyCount", []).append(int(comment.get('snippet', {}).get('totalReplyCount')))
    return pd.DataFrame(data)

In [None]:
def get_comment(youtube,comment_id:str) -> dict[str|dict]:
    """ Request (by id) for most important comment stats """
    request = Request(
        requestType=youtube.comments(),
        part="snippet,id",
        id=comment_id,
    )
    response = request.execute()
    rawData = response.get('items')[0]
    return pd.DataFrame(format_comment_dataDF(rawData))

get_comment(youtube, 'UgwUQR2JJFJSkihWLhx4AaABAg')

In [34]:
@retry_on_exception(max_attempts=3)
def get_video_commentThreads(youtube, videoID:str, maxComments:int) -> dict[str|list]:
    """ Request (by id) for all comments of a videos """
    request = Request(
        requestType=youtube.commentThreads(),
        part="snippet,id,replies",
        videoId=videoID,
        maxResults=100
    )
    response = request.execute()
    
    maxComments -= response.get('pageInfo', {}).get('totalResults')
    comments = format_commentPageDF(response.get('items',{}))
    while response.get('nextPageToken'):
        request.pageToken = response.get('nextPageToken')
        response = request.execute()
        comments = pd.concat([comments, format_commentPageDF(response.get('items',{}))], ignore_index=True)
        if (maxComments:= maxComments - response.get('pageInfo', {}).get('totalResults')) <= 0:
            break
        
    comments['videoID'] = videoID
    comments['fetchedDate'] = datetime.today()
    return comments

get_video_commentThreads(youtube, 'XqZsoesa55w', 1000)
# get_video_commentThreads(youtube, 'FkXhKu80CWU', 1000)

1: Une exception s'est produite : <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet%2Cid%2Creplies&maxResults=100&videoId=XqZsoesa55w&key=AIzaSyAKQSGcIdkqx8QWrdurjnVdmt7fN8vo2s8&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">
2: Une exception s'est produite : <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet%2Cid%2Creplies&maxResults=100&videoId=XqZsoesa55w&key=AIzaSyAKQSGcIdkqx8QWrdurjnVdmt7fN8vo2s8&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads

# Fetching Top Videos
The goal is to fetch the top 200 videos everyday and to get their comments a week after publishing.

In [None]:
def json_toDF(topvidsFile):
    """ transfrom topVideos.json to dataframe then to .csv"""
    with open(topvidsFile, 'r') as f:
        data = json.load(f)
    data.pop('lastUpdate')
    videosList = {}
    for region,dateEntries in data.items():
        # print(region, type(dateEntries))
        for date, videos in dateEntries.items():
            for id, video in enumerate(videos):
                videosList.setdefault('title', []).append(video.get('title'))
                videosList.setdefault('id', []).append(video.get('id'))
                videosList.setdefault('publishedAt', []).append(video.get('publishedAt'))
                videosList.setdefault('duration', []).append(video.get('duration'))
                videosList.setdefault('ViewCount', []).append(video.get('ViewCount'))
                videosList.setdefault('likeCount', []).append(video.get('likeCount'))
                videosList.setdefault('commentCount', []).append(video.get('commentCount'))
                videosList.setdefault('tags', []).append(video.get('tags'))
                videosList.setdefault('fetchedDate', []).append(date) 
                videosList.setdefault('topID', []).append(id+1)
                videosList.setdefault('region', []).append(region)
    
    df = pd.DataFrame(videosList)
    df.to_csv('db/top.csv', index=False)
    # fetchedDate topID region
# json_toDF('db/topVideos.json')

In [67]:
# com = pd.read_csv('db/comments.csv')
df = pd.read_csv('db/dailyTop200.csv')
df[df['fetchedComments'] == True].drop_duplicates(['id'])

Unnamed: 0,title,id,publishedAt,duration,ViewCount,likeCount,commentCount,tags,fetchedDate,topID,region,fetchedComments
0,Les vacances des Cro√ªtons qui d√©marrent pas co...,c1FLpmpo50c,2024-02-20T19:00:28Z,PT29M28S,1209392,122489.0,3763.0,"['Vacances des cro√ªtons', 'cro√ªton', 'team cro...",2024-02-21 18:55:50.000000,1,FR,True
1,ZEN #9 avec Hugo D√©crypte - Saison 3,ib09Xa-XCPA,2024-02-20T18:00:19Z,PT2H2M11S,275222,13560.0,419.0,"['humour', 'emission', 'lateshow', 'live', 'st...",2024-02-21 18:55:50.000000,2,FR,True
2,Elle d√©figure son amie par vengeance ?! - SIP ...,wT4OslqusSo,2024-02-19T16:02:00Z,PT1H42M34S,704778,37634.0,1022.0,"['Twitch', 'Maghla', 'Humour', 'sip and gossip...",2024-02-21 18:55:50.000000,3,FR,True
3,Il a tout sacrifi√© pour 11 Millions,rpkAVRM_XmA,2024-02-20T16:01:09Z,PT23M38S,194549,15756.0,399.0,"['toni musulin', 'casse', 'braquage', 'bandit'...",2024-02-21 18:55:50.000000,4,FR,True
4,L'histoire COMPLETE de Ninjago en 30 minutes,m15NKjkZyXQ,2024-02-20T17:09:30Z,PT28M5S,97306,6941.0,627.0,,2024-02-21 18:55:50.000000,5,FR,True
...,...,...,...,...,...,...,...,...,...,...,...,...
2845,I Caught Minecraft's Most Scary Myths,kFOIiUVf9Hs,2024-02-25T16:30:11Z,PT40M21S,607266,19602.0,4180.0,"['minecraft', ""I Caught Minecraft's Most Scary...",2024-02-26 22:28:37.000000,46,US,True
2858,New Royal Champion Equipment in Clash of Clans!,GCDbh0eTFfg,2024-02-25T13:00:01Z,PT8M12S,486394,24651.0,883.0,"['clash of clans', 'coc', 'clash of clans game...",2024-02-26 22:28:37.000000,59,US,True
2859,Liverpool DESTROY Chelsea in Carabao Cup Final!,IdJFE7_af5I,2024-02-25T18:02:24Z,PT8M18S,380275,13365.0,1851.0,,2024-02-26 22:28:37.000000,60,US,True
3035,After 12 Years Creating Fishing Videos... This...,GoAv7_KYQxQ,2024-02-26T02:40:11Z,PT28M58S,161348,8067.0,1634.0,"['stocks', 'stock market', 'investing', 'begin...",2024-02-27 23:20:00.000000,36,US,True


In [63]:
idlist = com['videoID'].drop_duplicates().values
print(len(idlist))
df.loc[df['id'].isin(idlist),'fetchedComments'] = True
df.to_csv('db/dailyTop200.csv', index=False)
# df
# df[df['topID'] == 1].sort_values(by=['ViewCount', 'likeCount'], ascending=False)
# vals = df[df['id'] == 'tWYsfOSY9vY'][['title','ViewCount', 'likeCount']].drop_duplicates().values
# for title,view,like in vals:
#     print(f'One every {round(view/like)} person droped a like on `{title}`')
# df[df['id'] == '_9u4sYHcR7A'][['topID', 'region', 'fetchedDate']]

724


In [None]:
def push_top_vids(topvidsFile:str, regions:list[str], minElapsedTime:int)-> None:
    """ Fetch top 200 vids per region (per <minElapsedTime>) and push in json <topvidsFile> """
    today = datetime.today()
    try:
        df = pd.read_csv(topvidsFile)
    except pd.errors.EmptyDataError:
        df = pd.DataFrame()
    
    if 'fetchedDate' in df:
        df['fetchedDate'] = pd.to_datetime(df['fetchedDate'], format='ISO8601')
        lastUpdate = df.sort_values(by=['fetchedDate'], ascending=False, ignore_index=True).loc[0]['fetchedDate']
        delta = today - lastUpdate
        if delta.total_seconds() // 3600 < minElapsedTime:
            raise Exception(f'The fetch request has be done too soon. Next request available in {24-(delta.total_seconds() // 3600)}h. Last update done : {lastUpdate}')

    # Fetching
    for reg in regions:
        top200 = get_Most_Popular_Video(youtube, reg)
        top200['fetchedDate'] = today
        top200['fetchedComments'] = False
        df = pd.concat([df, top200])
        
    df.to_csv('db/dailyTop200.csv', index=False) 

push_top_vids('db/dailyTop200.csv', ['FR', 'US'], 24)

In [46]:
def fetch_topVids_comments(topvidsFile:str, minElapsedCommentsTime:int, maxComments:int = 1000) -> None:
    today = datetime.today()
    try:
        df = pd.read_csv(topvidsFile)
    except pd.errors.EmptyDataError:
        df = pd.DataFrame()
    df['publishedAt'] = pd.to_datetime(df['publishedAt'], format='ISO8601')
    
    print(df.dtypes)
    print(df.shape)
    
    uniqueID = df.drop_duplicates(['id'])
    uniqueID = uniqueID[uniqueID['fetchedComments'] == False]
    id_list = list(uniqueID['id'])
    
    comments = pd.DataFrame()
    for id in id_list:
        if (pd.to_datetime(today) - uniqueID.loc[uniqueID['id'] == id, 'publishedAt'].values[0]).days >= minElapsedCommentsTime:
            print(id)
            comments = pd.concat([comments, get_video_commentThreads(youtube, id, maxComments)])
            df.loc[df['id'] == id, 'fetchedComments'] = True
    comments.to_csv('db/comments.csv')    
    return comments
        
    
fetch_topVids_comments('db/dailyTop200.csv', 7, 1000)

title                           object
id                              object
publishedAt        datetime64[ns, UTC]
duration                        object
ViewCount                        int64
likeCount                      float64
commentCount                   float64
tags                            object
fetchedDate                     object
topID                            int64
region                          object
fetchedComments                   bool
dtype: object
(4400, 12)
c1FLpmpo50c
ib09Xa-XCPA
wT4OslqusSo
rpkAVRM_XmA
m15NKjkZyXQ
3eRh1eRKq7k
4LZ4BuAkNyU
OSmhuAYECxc
gMBbzXj6-ok
IFrHZPJ7IRo
2QFCzuFFAik
Yo_qTDXVzMM
BxcOC8Vk6Hc
4PUVXUyxP7I
WoiPJVxGYCw
3fW9OP582k4
K-whyfKSnFo
i-GV_3i_3N8
dIlbshbTRlQ
Fn0ZDmalU6o
8z--jHc3EGc
Uy2B6zRMreI
bNKXxwOQYB8
mB3ECBncuVk
rvg2UiiwXlI
fDbu0D6-n2E
t6bBDAlpQZs
rdD7je57_-4
jyqI5N_Oxhk
hOmzCiV4C3o
ye3blP4MxfE
s6VDeqIhq7c
7uqZhrPdo_M
qp4NHf9fwKA
LXq9g32BM3g
sJQvepUWLa4
5tDm02caZpo
Sdbi40jURfs
zlFoee9DfSE
3V7M1q-WHSU
9y0Ui2E0ERo
m0ARyXUATnE
DnP

Unnamed: 0,id,comment,likeCount,publishedAt,updatedAt,totalReplyCount,videoID,fetchedDate
0,UgxHYBQAE58AMk_Njg14AaABAg,le grand retour tant attendu ü•π,4783,2024-02-21T11:42:48Z,2024-02-21T11:42:48Z,52,c1FLpmpo50c,2024-03-04 13:15:00.801367
1,UgyP3F15VnXEp_W84Z94AaABAg,11:51 √ßa sent un grand avenir pour la team,1,2024-03-04T12:13:02Z,2024-03-04T12:13:02Z,0,c1FLpmpo50c,2024-03-04 13:15:00.801367
2,Ugygqdx4KLS--EiHxch4AaABAg,we really see that there is a very different w...,0,2024-03-04T04:21:48Z,2024-03-04T04:21:48Z,0,c1FLpmpo50c,2024-03-04 13:15:00.801367
3,Ugy_Mi173iclktHPhyF4AaABAg,Tu a gravie l evreste si non tu pourras nous f...,0,2024-03-04T01:02:01Z,2024-03-04T01:02:01Z,0,c1FLpmpo50c,2024-03-04 13:15:00.801367
4,Ugyhr8KJHMlvqTFTwB14AaABAg,Fais des lives !üòÇ,0,2024-03-03T18:15:27Z,2024-03-03T18:15:27Z,0,c1FLpmpo50c,2024-03-04 13:15:00.801367
...,...,...,...,...,...,...,...,...
889,UgzQ9WFo3VrRm9ZMetl4AaABAg,i love hi you are the most beautyful girl in ‚ù§...,1,2024-02-16T15:42:22Z,2024-02-16T15:42:22Z,1,JHQS6B8kKMg,2024-03-04 13:45:01.471817
890,UgzDSKlHGShViXhQzHp4AaABAg,This is lovely üòóüíñüíó,7,2024-02-16T15:14:31Z,2024-02-16T15:14:31Z,1,JHQS6B8kKMg,2024-03-04 13:45:01.471817
891,UgwnhXhiWxjB4tGjIqp4AaABAg,This is why your my fav youtuberüíÖ,44,2024-02-16T15:10:45Z,2024-02-16T15:10:45Z,8,JHQS6B8kKMg,2024-03-04 13:45:01.471817
892,UgzOx-a6E5vo2JzfTeF4AaABAg,I love this video ‚ù§‚ô°Ô∏é‚ô•Ô∏é‚ô°Ô∏é‚ô•Ô∏é ‚ô°Ô∏é‚ô•Ô∏é‚ô°Ô∏é‚ô•Ô∏é ‚òÖ‚òÜ‚òÖ‚òÜ ñ§ê,5,2024-02-16T14:58:35Z,2024-02-16T14:58:35Z,1,JHQS6B8kKMg,2024-03-04 13:45:01.471817


In [None]:
def create_comment_queue(topvidsFile:str, commentsQueue:str) -> None:
    """ Create a json file <commentsQueue> that stores video IDs which comments have not been fetched """
    with open(topvidsFile, 'r') as f:
        data:dict = json.load(f)
    with open(commentsQueue, 'r') as f:
        queue:dict = json.load(f)   
        
    data.pop('lastUpdate', None)
    date_to_fetch = [dates for dates in list(list(data.values())[0].keys()) if dates not in queue.keys()]
    
    if date_to_fetch: # is empty
        for date in date_to_fetch:  
            comment_list = []
            for region in data.keys():
                # print(f"{region}: {date}. Size {len(data.get(region,{}).get(date,[]))}")
                for video in data.get(region,{}).get(date,[]):
                    comment_list += [
                        {'region': region, 
                        'dateEntry': date, 
                        'id': video.get('id'),
                        'publishedAt': video.get('publishedAt')
                        }
                    ]
            queue[date] = comment_list
        
        with open(commentsQueue, 'w') as fichier:
            json.dump(queue, fichier)    
    else:
        print("already in !")

In [None]:

def fetch_topVids_comments(commentsQueue:str, commentList:str, minElapsedCommentsTime:int) -> None:
    """ Fetches vids comments from <commentsQueue> after minElapsedComments and output them in <commentList>"""
    today = datetime.today()
    with open(commentsQueue, 'r') as f:
        queue:dict[list] = json.load(f)
    with open(commentList, 'r') as f:
        comments:dict[list] = json.load(f)
    
    data={}
    remove_indices:dict[list] = {}
    for date,vids in queue.items():
        for i,video in enumerate(vids):
            if (today - iso_toDatetime(video.get('publishedAt'))).days >= minElapsedCommentsTime: 
                start = time.time()
                data[video.get('id')] = get_video_commentThreads(youtube, video.get('id'), 1000) # crashes a lot
                print(f"{i} in {time.time()-start}s",end='\n\n')
                if date not in remove_indices.keys():
                     remove_indices[date] = []
                remove_indices[date].append(i)
            else:print(f"{i} not yet",end='\n\n')
        
    queue = {date:[video for i, video in enumerate(videos) if i not in remove_indices[date]] for date,videos in queue.items()}  
     
    with open(commentsQueue, 'w') as fichier:
            json.dump(queue, fichier)  
               
    with open(commentList, 'w') as fichier:
            json.dump({**comments, **data}, fichier) # merging data

In [None]:
REGION = ['FR', 'US']
topvids = 'db/topVideos.json'
commentsQueue="db/commentQueue.json"
commentList = "db/commentList.json"
minElapsedTime = 24 # Hours
minElapsedCommentsTime = 17 # days

push_top_vids(topvids, REGION, minElapsedTime)
    
# create_comment_queue(topvids, commentsQueue)
            
# fetch_topVids_comments(commentsQueue, commentList, minElapsedCommentsTime)    