# YT Comments analysis

In [2]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from datetime import datetime, timedelta, time
import pandas as pd
import json
import re
from api import API_KEY

# YouTube Data API v3 client, authenticated with the key imported from `api`.
youtube = build("youtube", "v3", developerKey=API_KEY)

# Channel ids used by the example cells.
channel_id = "UCWeg2Pkate69NFdBeuRFTAw"  # Squeezie channel
etoiles = "UCABf02qOye7XYapcK1M45LQ"

# Video ids used by the example cells.
exemple_video = "qCKyRhkhqoQ"
otp_recap = "F7A8OCdmZ90"

### Class request
Wrapper class that handles a YouTube request as an object, since the YouTube API doesn't provide a request object.

In [3]:
class Request:
    """Mutable wrapper around a YouTube API ``list`` call.

    The parameters are stored as plain attributes so that a caller can change
    one of them (typically ``pageToken``) between two calls to ``execute()``.

    Args:
        requestType: API resource exposing a ``list(**params)`` method whose
            result has an ``execute()`` method (e.g. ``youtube.videos()``).
        part, id, chart, regionCode, maxResults, pageToken, videoId:
            Optional parameters forwarded to ``requestType.list``.
    """

    def __init__(self, requestType, part=None, id=None, chart=None, regionCode=None, maxResults=None, pageToken=None, videoId=None):
        self.requestType = requestType
        self.part = part
        self.id = id
        self.chart = chart
        self.regionCode = regionCode
        self.maxResults = maxResults
        self.pageToken = pageToken
        self.videoId = videoId

    def execute(self):
        """Send the request and return the response of ``list(...).execute()``.

        Every attribute except ``requestType`` is sent as a keyword argument.
        Falsy values (``None``, ``''``, ``0``) are left out, which is what
        lets callers pass ``pageToken=''`` for the first page.
        """
        # Exclude `requestType` by name rather than by position, so the result
        # does not depend on the order in which attributes were assigned.
        params = {
            name: value
            for name, value in vars(self).items()
            if name != "requestType" and value
        }
        return self.requestType.list(**params).execute()

### Decorator
Decorator to retry when a YouTube request fails (mostly due to timeout errors).

In [4]:
def retry_on_exception(max_attempts=5, delay=0.5):
    """Build a decorator that retries a function when it raises.

    Args:
        max_attempts: Maximum number of calls made to the wrapped function.
            If it is lower than 1 the function is never called and the
            wrapper returns ``None``.
        delay: Number of seconds to wait between two attempts.

    Returns:
        A decorator. The decorated function returns the wrapped function's
        result on the first successful attempt, or an empty list once every
        attempt has failed (best-effort: the last exception is printed, not
        re-raised).
    """
    import functools
    # Imported locally: the module-level name `time` is `datetime.time`,
    # which has no `sleep` attribute.
    from time import sleep

    def decorator(func):
        @functools.wraps(func)  # keep the wrapped function's name and docstring
        def wrapper(*args, **kwargs):
            for attempt in range(1, max_attempts + 1):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    print(f"{attempt}: Une exception s'est produite : {e}")
                    if attempt == max_attempts:
                        # Callers store the result directly, so give them an
                        # empty list instead of propagating the error.
                        return []
                    sleep(delay)
        return wrapper
    return decorator

### Datetime conversions
Functions to convert the ISO-formatted dates and durations found in YouTube API responses to and from datetime and timedelta objects.

In [5]:
def iso_toDatetime(iso_date:str) -> datetime:
    """Convert an ISO 8601 timestamp to a naive datetime object.

    Accepts ``YYYY-MM-DDTHH:MM:SS`` with an optional fractional part and an
    optional trailing ``Z``, so both API timestamps (``...Z``) and the strings
    produced by ``datetime_toISO`` (no suffix) are parsed correctly.

    Raises:
        ValueError: If the string does not match the expected format.
    """
    # Only strip the UTC designator when it is really there; dropping the last
    # character unconditionally would eat a digit of the seconds.
    stamp = iso_date[:-1] if iso_date.endswith('Z') else iso_date
    fmt = '%Y-%m-%dT%H:%M:%S.%f' if '.' in stamp else '%Y-%m-%dT%H:%M:%S'
    return datetime.strptime(stamp, fmt)

def datetime_toISO(dt_obj:datetime) -> str:
    """Convert a datetime object to an ISO 8601 string with second precision.

    Microseconds are dropped. For a naive datetime the result looks like
    ``2024-02-10T12:02:00``; an aware datetime keeps its UTC offset.
    """
    # timespec='seconds' drops microseconds whether or not they are zero
    # (slicing the string breaks when isoformat() omits them).
    return dt_obj.isoformat(timespec='seconds')

def iso_toDelta(iso_duration:str) -> timedelta:
    """Convert an ISO 8601 duration to a timedelta object.

    Supports weeks, days, hours, minutes and seconds, each of them optional
    (``PT35M``, ``PT1H2S``, ``P1DT2H``, ``P0D``...). The non-standard
    ``PT4D3H20M9S`` form, with the days after the ``T``, is accepted too.

    Raises:
        ValueError: If the string is not a supported duration.
    """
    match = re.fullmatch(
        r'P(?:(\d+)W)?(?:(\d+)D)?(?:T(?:(\d+)D)?(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?',
        iso_duration,
    )
    if match is None:
        raise ValueError(f"Invalid ISO 8601 duration: {iso_duration!r}")

    # Missing components are captured as None and count as zero.
    weeks, days, legacy_days, hours, minutes, seconds = (
        int(group) if group else 0 for group in match.groups()
    )
    return timedelta(weeks=weeks, days=days + legacy_days, hours=hours, minutes=minutes, seconds=seconds)

def delta_toISO(delta_obj:timedelta) -> str:
    """Convert a timedelta object to a duration string such as ``PT35M30S``.

    Components equal to zero are omitted; a zero duration is written
    ``PT0S`` so the result is never the bare (invalid) prefix ``PT``.
    Sub-second precision is dropped.

    NOTE: days are written after the ``T`` (``PT4D3H``), which is the layout
    ``iso_toDelta`` parses; strict ISO 8601 would be ``P4DT3H``.
    """
    hours, remainder = divmod(delta_obj.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)

    components = (
        (delta_obj.days, "D"),
        (hours, "H"),
        (minutes, "M"),
        (seconds, "S"),
    )
    body = "".join(f"{value}{unit}" for value, unit in components if value != 0)
    return f"PT{body or '0S'}"

# print(iso_toDelta('PT4D3H20M9S'))
# print(delta_toISO(iso_toDelta('PT20M9S')))

### Fetching functions
Functions to fetch channel, comment and top-video info.

In [6]:
def format_channel_data(channel_data: dict) -> dict:
    """Structure a raw channel resource into a flat summary dict.

    Args:
        channel_data: One channel item, as a dict with optional ``id``,
            ``snippet``, ``statistics`` and ``topicDetails`` entries.

    Returns:
        A dict with the keys ``channel_name``, ``channel_id``, ``country``,
        ``stats`` (a copy of the statistics without ``hiddenSubscriberCount``)
        and ``topics`` (last path segment of each topic category link).
    """
    snippet = channel_data.get('snippet', {})
    topic_links = channel_data.get('topicDetails', {}).get('topicCategories', [])

    # Work on a copy so the caller's dict is not modified, and tolerate
    # missing statistics or a missing `hiddenSubscriberCount` key.
    stats = dict(channel_data.get('statistics') or {})
    stats.pop('hiddenSubscriberCount', None)

    return {
        "channel_name": snippet.get('title'),
        "channel_id": channel_data.get('id'),
        "country": snippet.get('country', ""),
        "stats": stats,
        "topics": [wikilink.split('/')[-1] for wikilink in topic_links],
    }

In [7]:
def format_channel_dataDF(channel_data: dict) -> pd.DataFrame:
    """Build a one-row DataFrame from a raw channel resource.

    Columns are, in order: ``channel_name``, ``channel_id``, ``country``,
    one column per entry of ``statistics``, then ``topics`` (a list holding
    the last path segment of each topic category link).
    """
    snippet = channel_data.get('snippet', {})
    topic_links = channel_data.get('topicDetails', {}).get('topicCategories', [])

    row = {
        "channel_name": snippet.get('title'),
        "channel_id": channel_data.get('id'),
        "country": snippet.get('country', ""),
    }
    row.update(channel_data.get('statistics', {}))
    row["topics"] = [link.split('/')[-1] for link in topic_links]

    # Each value is wrapped in a single-element list: one value per column.
    return pd.DataFrame.from_dict({column: [value] for column, value in row.items()})

In [8]:
def get_channel_data(youtube, channel_id:str) -> pd.DataFrame:
    """Request (by id) the most important stats of a channel.

    Args:
        youtube: API client exposing ``channels()``.
        channel_id: Id of the channel to look up.

    Returns:
        The one-row DataFrame built by ``format_channel_dataDF``.

    Raises:
        IndexError: If the response contains no item for this id.
    """
    request = Request(
        requestType=youtube.channels(),
        part="snippet,contentDetails,statistics,topicDetails",
        id=channel_id
    )
    response = request.execute()

    items = response.get('items') or []
    if not items:
        # Same exception type as a bare `[0]`, with an actionable message.
        raise IndexError(f"No channel found for id {channel_id!r}")
    return format_channel_dataDF(items[0])


get_channel_data(youtube, etoiles)

Unnamed: 0,channel_name,channel_id,country,viewCount,subscriberCount,hiddenSubscriberCount,videoCount,topics
0,Etoiles,UCABf02qOye7XYapcK1M45LQ,FR,48279691,271000,False,552,"[Action_game, Role-playing_video_game, Action-..."


In [9]:
def format_video_data(video_data: dict) -> dict:
    """Structure a raw video resource into a flat summary dict.

    Args:
        video_data: One video item, as a dict with optional ``id``,
            ``snippet``, ``contentDetails`` and ``statistics`` entries.

    Returns:
        A dict with the keys ``title``, ``id``, ``publishedAt``, ``duration``,
        ``ViewCount``, ``likeCount``, ``commentCount`` and ``tags``. Values
        are copied as found in the resource; missing ones are ``None``.
    """
    snippet = video_data.get('snippet', {})
    statistics = video_data.get('statistics', {})
    # Default to {} like the other sections, so a resource without
    # `contentDetails` gives duration=None instead of an AttributeError.
    content_details = video_data.get('contentDetails', {})

    return {
        "title": snippet.get('title'),
        "id": video_data.get('id'),
        "publishedAt": snippet.get('publishedAt'),
        "duration": content_details.get('duration'),
        "ViewCount": statistics.get('viewCount'),
        "likeCount": statistics.get('likeCount'),
        "commentCount": statistics.get('commentCount'),
        "tags": snippet.get('tags'),
    }

# time = get_video_info(youtube, 'JRBGBjaR9Wg').get("publishedAt")
# print(datetime.strptime(time, '%Y-%m-%dT%H:%M:%SZ'))

In [36]:
def format_video_dataDF(video_data: dict) -> pd.DataFrame:
    """Build a one-row DataFrame from a raw video resource.

    Columns: ``title``, ``id``, ``publishedAt``, ``duration``, ``ViewCount``,
    ``likeCount``, ``commentCount`` and ``tags``. The three counters are
    converted to ``int`` (0 when absent) so that they sort numerically.
    """
    snippet = video_data.get('snippet', {})
    statistics = video_data.get('statistics', {})
    # Default to {} like the other sections, so a resource without
    # `contentDetails` gives an empty duration instead of an AttributeError.
    content_details = video_data.get('contentDetails', {})

    data = {
        "title": [snippet.get('title')],
        "id": [video_data.get('id')],
        "publishedAt": [snippet.get('publishedAt')],
        "duration": [content_details.get('duration')],
        "ViewCount": [int(statistics.get('viewCount', 0))],
        "likeCount": [int(statistics.get('likeCount', 0))],
        "commentCount": [int(statistics.get('commentCount', 0))],
        "tags": [snippet.get('tags')],
    }

    return pd.DataFrame.from_dict(data)

In [11]:
def get_video_data(youtube, video_Id:str) -> pd.DataFrame:
    """Request (by id) the most important stats of a video.

    Args:
        youtube: API client exposing ``videos()``.
        video_Id: Id of the video to look up.

    Returns:
        The one-row DataFrame built by ``format_video_dataDF``.

    Raises:
        IndexError: If the response contains no item for this id.
    """
    request = Request(
        requestType=youtube.videos(),
        part="snippet,contentDetails,statistics,topicDetails",
        id=video_Id,
    )
    response = request.execute()

    items = response.get('items') or []
    if not items:
        # Same exception type as a bare `[0]`, with an actionable message.
        raise IndexError(f"No video found for id {video_Id!r}")
    return format_video_dataDF(items[0])

get_video_data(youtube, exemple_video)

Unnamed: 0,title,id,publishedAt,duration,ViewCount,likeCount,commentCount,tags
0,"LE JEU DE LA FÈVE (Avec Joyca, Zafeel et Hctuan)",qCKyRhkhqoQ,2024-02-10T12:02:00Z,PT35M30S,2666290,183338,1998,"[Mastu, Mastus, Humour, Matsu, loat, mastu loa..."


In [39]:
def get_Most_Popular_Video(youtube, region:str) -> pd.DataFrame:
    """Request the stats of the most popular videos of a region.

    Follows ``nextPageToken`` until the chart is exhausted.

    Args:
        youtube: API client exposing ``videos()``.
        region: Region code passed as ``regionCode`` (e.g. ``'FR'``).

    Returns:
        A DataFrame with one row per video (see ``format_video_dataDF`` for
        the columns), indexed from 0 in the order returned by the API. It is
        empty, with the same columns, when the API returns no video.
    """
    request = Request(
        requestType=youtube.videos(),
        part="snippet,contentDetails,statistics,topicDetails",
        chart="mostPopular",
        regionCode=region,
        maxResults=50,  # documented range for videos.list is 1-50
    )

    frames = []
    while True:
        response = request.execute()
        frames.extend(format_video_dataDF(video) for video in response.get('items') or [])
        next_token = response.get('nextPageToken')
        if not next_token:
            break
        request.pageToken = next_token

    if not frames:
        # pd.concat([]) raises; return an empty frame callers can still sort.
        return pd.DataFrame(columns=[
            "title", "id", "publishedAt", "duration",
            "ViewCount", "likeCount", "commentCount", "tags",
        ])
    return pd.concat(frames).reset_index(drop=True)

df:pd.DataFrame = get_Most_Popular_Video(youtube, 'FR')
# The counters are ints, so this sorts numerically (most viewed first).
df.sort_values(by=['ViewCount', 'likeCount'], ascending=False, inplace=True, kind='quicksort', ignore_index=True)
df

Unnamed: 0,title,id,publishedAt,duration,ViewCount,likeCount,commentCount,tags
0,POMNI WAKE UP TIME TO GO ON AN ADVENTURE,FkXhKu80CWU,2024-02-23T20:00:12Z,PT3M8S,26013141,1893482,99443,"[meta runner, glitch productions, funny, video..."
1,Brawl Talk! The NEW Ranked Mode!,u0febvWcye8,2024-02-24T16:00:10Z,PT11M43S,16969944,720818,67045,"[brawl stars, mobile game, mobile strategy gam..."
2,Selena Gomez - Love On (Official Music Video),mNHNktxbjdk,2024-02-22T23:00:12Z,PT3M11S,7548610,411447,26197,"[Selena, Gomez, Love, Interscope, Records, Pop..."
3,World's Fastest Camera Drone Vs F1 Car (ft. Ma...,9pEqyr_uT-k,2024-02-27T14:00:07Z,PT12M5S,4902653,177881,9212,"[red bull, redbull, action sports, extreme spo..."
4,"Qui perdra son Job !? (Ft. Mastu, Byilhan)",FLw4vYrKO4M,2024-02-24T13:30:12Z,PT55M41S,4552292,364805,6656,"[INOXTAG, INOX, CROUTON, mastu, byilhan, qui p..."
...,...,...,...,...,...,...,...,...
195,Pourquoi j'ai battu le jeu Zelda MAUDIT (trois...,854UzvoE6pU,2024-02-27T14:00:05Z,PT14M21S,37402,3765,228,"[Ascuns, Video, Game, Gaming, Test, Review, Pa..."
196,DÉCOUVRIR WARHAMMER 40000 : Mes conseils pour ...,y5_7T37bPcU,2024-02-27T18:33:13Z,PT18M31S,37248,3929,210,"[FR, AlphaCast, Gameplay, Warhammer 40000, 40k..."
197,ON S'OCCUPE DES VACHES DE LA FERME !,Dt2tJiS9oBM,2024-02-27T16:00:18Z,PT13M41S,35067,3804,85,"[agricole, vlog, Agriculture, farming, vache, ..."
198,DUNE Partie 2 - CRITIQUE (sans spoilers),Bjr4F9BEI3U,2024-02-27T16:30:03Z,PT10M4S,32767,1574,137,"[dune, dune denis villeneuve, critique dune de..."


In [10]:
def format_comment_data(comment:dict) -> dict[str|dict]:
    """ Structure raw comment data """
    data = {
        "id": comment.get('id'),
        "comment": comment.get('snippet', {}).get('textOriginal'),
        # "viewerRating": comment.get('snippet', {}).get('viewerRating'),
        "likeCount": comment.get('snippet', {}).get('likeCount'),
        "publishedAt": comment.get('snippet', {}).get('publishedAt'),
        "updatedAt": comment.get('snippet', {}).get('updatedAt')
        }
    
    return data

def format_threadedComment_data(comment:dict) -> dict:
    """Structure a raw comment thread resource.

    Args:
        comment: One comment thread item, as a dict with optional ``id`` and
            ``snippet`` entries.

    Returns:
        A dict with the thread ``id``, its ``topLevelComment`` (formatted by
        ``format_comment_data``) and ``totalReplyCount``. The replies
        themselves are not extracted.
    """
    snippet = comment.get('snippet', {})
    return {
        "id": comment.get('id'),
        # Fall back to {} so a thread without a top-level comment yields
        # empty fields instead of crashing on None.
        "topLevelComment": format_comment_data(snippet.get('topLevelComment') or {}),
        "totalReplyCount": snippet.get('totalReplyCount'),
    }

In [11]:
def get_comment(youtube,comment_id:str) -> dict:
    """Request (by id) the most important stats of a comment.

    Args:
        youtube: API client exposing ``comments()``.
        comment_id: Id of the comment to look up.

    Returns:
        The dict built by ``format_comment_data``.

    Raises:
        IndexError: If the response contains no item for this id.
    """
    request = Request(
        requestType=youtube.comments(),
        part="snippet,id",
        id=comment_id,
    )
    response = request.execute()

    # `items` may be absent or empty; fail with one explicit error either way.
    items = response.get('items') or []
    if not items:
        raise IndexError(f"No comment found for id {comment_id!r}")
    return format_comment_data(items[0])

get_comment(youtube, 'UgwUQR2JJFJSkihWLhx4AaABAg')

{'id': 'UgwUQR2JJFJSkihWLhx4AaABAg',
 'comment': 'Avez vous déjà rêvé de la vie de pirate dans votre enfance ? Dans ce cas Skull & Bones pourrait vous intéresser ! Plus d\'infos ici :  https://ubi.li/1aYqD\n\nSinon, quelques précisions : \n- L\'image utilisée à 7:53 ne représente en réalité pas l\'île de Sainte-Marie proche de Madagascar, mais Sainte- Marie en Martinique ;\n- À 8:33, on dit que la vanille fait l\'objet de contrebande. Cependant, elle n\'était cultivée qu\'en Amérique avant le milieu du 19e siècle, où sa culture est arrivée sur l\'île Bourbon. Il est donc peu probable qu\'il y ait eu un trafic de vanille dans l\'océan Indien au 18e siècle ;\n- Vers 12:50, on évoque la composition des équipages, en mentionnant notamment les "indigènes des Mascareignes" : il est toutefois important de préciser que ces îles n\'étaient pas peuplées avant l\'arrivée des colons européens à la fin du 16e et au début du 17e siècle ;\n- Vers 15:36, on parle d\'ouragans pour désigner les tempêtes

In [12]:
@retry_on_exception(max_attempts=3)
def get_video_commentThreads(youtube, video_Id:str, maxComments:int) -> list[dict]:
    """Request the comment threads of a video, up to a maximum count.

    Pages of up to 100 threads are fetched until ``maxComments`` threads have
    been collected or no page is left.

    Args:
        youtube: API client exposing ``commentThreads()``.
        video_Id: Id of the video whose comments are fetched.
        maxComments: Maximum number of threads to return.

    Returns:
        A list of at most ``maxComments`` dicts built by
        ``format_threadedComment_data``.
    """
    if maxComments <= 0:
        return []

    request = Request(
        requestType=youtube.commentThreads(),
        part="snippet,id,replies",
        videoId=video_Id,
        maxResults=100
    )

    comments = []
    while True:
        response = request.execute()
        comments.extend(format_threadedComment_data(thread) for thread in response.get('items') or [])
        next_token = response.get('nextPageToken')
        # Check the cap before asking for another page, and count what was
        # actually received rather than trusting `pageInfo.totalResults`.
        if len(comments) >= maxComments or not next_token:
            break
        request.pageToken = next_token

    comments = comments[:maxComments]
    print(f"Fetched {len(comments)} comments !")
    return comments

get_video_commentThreads(youtube, exemple_video, 1000)

Fetched 1000 comments !


[{'id': 'UgxCPLRy0iH5mcVyQpB4AaABAg',
  'topLevelComment': {'id': 'UgxCPLRy0iH5mcVyQpB4AaABAg',
   'comment': 'Je suis en fou rire sur le calendrier',
   'likeCount': 7330,
   'publishedAt': '2024-02-10T12:10:07Z',
   'updatedAt': '2024-02-10T12:10:07Z'},
  'totalReplyCount': 131},
 {'id': 'UgwM95yxTARph_1ckn14AaABAg',
  'topLevelComment': {'id': 'UgwM95yxTARph_1ckn14AaABAg',
   'comment': 'Salut mastu , pourquoi tu ne fais plus d’ouverture de colis parce que je suis fan de conpcept voilà j’espère que tu vas le répondre',
   'likeCount': 0,
   'publishedAt': '2024-02-29T12:52:22Z',
   'updatedAt': '2024-02-29T12:52:22Z'},
  'totalReplyCount': 0},
 {'id': 'UgxFPHsTRdpP5SGqn4t4AaABAg',
  'topLevelComment': {'id': 'UgxFPHsTRdpP5SGqn4t4AaABAg',
   'comment': 'Salut mastu ce que je vais te dire c’est pas du tout par rapport à la vidéo c’était juste pour te dire que je viens de réécouter ta music déprime et que j’ai remarqué que t’avait dit << 3 ans dans le milieu et je finie en thérapie >> 

# Fetching Top Videos
The goal is to fetch the top 200 videos every day and to get their comments a week after publishing.

In [13]:
def push_top_vids(topvidsFile:str, regions:list[str], minElapsedTime:int)-> None:
    """Fetch the most popular videos per region and store them in a JSON file.

    The file maps each region to ``{fetch timestamp: [video records]}`` and
    keeps the timestamp of the last run under ``lastUpdate``.

    Args:
        topvidsFile: Path of the JSON file. It is created if it does not exist.
        regions: Region codes to fetch.
        minElapsedTime: Minimum number of hours between two fetches.

    Raises:
        RuntimeError: If the previous fetch is more recent than
            ``minElapsedTime`` hours.
    """
    today = datetime.today()
    try:
        with open(topvidsFile, 'r') as f:
            data = json.load(f)
    except FileNotFoundError:
        data = {}  # first run: start from an empty store

    if lastUpdate := data.get('lastUpdate'):
        elapsed_hours = (today - iso_toDatetime(lastUpdate)).total_seconds() // 3600
        if elapsed_hours < minElapsedTime:
            # The remaining time is computed from the configured delay.
            raise RuntimeError(f'The fetch request has be done too soon. Next request available in {minElapsedTime-elapsed_hours}h ')

    timestamp = datetime_toISO(today)
    data['lastUpdate'] = timestamp
    # Fetching
    for reg in regions:
        videos = get_Most_Popular_Video(youtube, reg)
        if isinstance(videos, pd.DataFrame):
            # json.dump cannot serialise a DataFrame: turn it into a list of
            # JSON-safe records (one dict per video) first.
            videos = json.loads(videos.to_json(orient='records'))
        data.setdefault(reg, {})[timestamp] = videos

    with open(topvidsFile, 'w') as fichier:
        json.dump(data, fichier)

In [14]:
def create_comment_queue(topvidsFile:str, commentsQueue:str) -> None:
    """Queue the videos whose comments have not been fetched yet.

    Reads the top-videos store and, for every fetch date not already present
    in the queue, adds an entry ``date -> [{region, dateEntry, id,
    publishedAt}, ...]`` covering all regions.

    Args:
        topvidsFile: Path of the JSON file mapping each region to
            ``{date: [videos]}`` (plus an optional ``lastUpdate`` key).
        commentsQueue: Path of the JSON queue file. It is created if it does
            not exist and rewritten only when new dates were added.
    """
    with open(topvidsFile, 'r') as f:
        data:dict = json.load(f)
    try:
        with open(commentsQueue, 'r') as f:
            queue:dict = json.load(f)
    except FileNotFoundError:
        queue = {}  # nothing queued yet

    data.pop('lastUpdate', None)  # not a region

    # Collect the dates of every region (not only the first one);
    # dict.fromkeys removes duplicates while keeping first-seen order.
    dates_to_fetch = list(dict.fromkeys(
        date
        for entries in data.values()
        for date in entries
        if date not in queue
    ))

    if not dates_to_fetch:
        print("already in !")
        return

    for date in dates_to_fetch:
        queue[date] = [
            {
                'region': region,
                'dateEntry': date,
                'id': video.get('id'),
                'publishedAt': video.get('publishedAt'),
            }
            for region, entries in data.items()
            for video in entries.get(date, [])
        ]

    with open(commentsQueue, 'w') as fichier:
        json.dump(queue, fichier)

In [15]:

def fetch_topVids_comments(commentsQueue:str, commentList:str, minElapsedCommentsTime:int, maxComments:int=1000) -> None:
    """Fetch the comments of the queued videos that are old enough.

    Every queued video published at least ``minElapsedCommentsTime`` days ago
    has its comment threads fetched, stored in ``commentList`` under the
    video id, and is removed from the queue. Younger videos stay queued.

    Args:
        commentsQueue: Path of the JSON queue file (``date -> [videos]``).
        commentList: Path of the JSON file mapping video ids to comments. It
            is created if it does not exist.
        minElapsedCommentsTime: Minimum age of a video, in days.
        maxComments: Maximum number of threads requested per video.
    """
    # Imported locally under another name: the module-level `time` is
    # `datetime.time`, which has no `time()` function.
    import time as time_module

    today = datetime.today()
    with open(commentsQueue, 'r') as f:
        queue:dict = json.load(f)
    try:
        with open(commentList, 'r') as f:
            comments:dict = json.load(f)
    except FileNotFoundError:
        comments = {}

    fetched = {}
    remaining = {}
    for date, vids in queue.items():
        # Every date keeps an entry, even once it is empty.
        remaining[date] = []
        for i, video in enumerate(vids):
            age_days = (today - iso_toDatetime(video.get('publishedAt'))).days
            if age_days < minElapsedCommentsTime:
                print(f"{i} not yet", end='\n\n')
                remaining[date].append(video)
                continue

            start = time_module.time()
            # NOTE(review): the video leaves the queue whatever the result of
            # this call, including an empty one — confirm this is intended.
            fetched[video.get('id')] = get_video_commentThreads(youtube, video.get('id'), maxComments)
            print(f"{i} in {time_module.time()-start}s", end='\n\n')

    # Persist the comments first: if this write fails, the queue on disk
    # still lists the videos and they can be fetched again.
    with open(commentList, 'w') as fichier:
        json.dump({**comments, **fetched}, fichier)  # merging data

    with open(commentsQueue, 'w') as fichier:
        json.dump(remaining, fichier)

In [16]:
# JSON files used as storage.
topvids = "db/topVideos.json"
commentsQueue = "db/commentQueue.json"
commentList = "db/commentList.json"

# Regions whose most popular videos are fetched.
REGION = ["FR", "US"]

# Minimum delays used by the fetching functions.
minElapsedTime = 24  # Hours
minElapsedCommentsTime = 17  # days

push_top_vids(topvidsFile=topvids, regions=REGION, minElapsedTime=minElapsedTime)

# create_comment_queue(topvids, commentsQueue)

# fetch_topVids_comments(commentsQueue, commentList, minElapsedCommentsTime)