# YT Comments analysis

In [10]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from api import API_KEY

# Identifiers and API client used by the example calls in this notebook.
channel_id = "UCWeg2Pkate69NFdBeuRFTAw" # Squeezie channel
etoiles = 'UCABf02qOye7XYapcK1M45LQ'  # NOTE(review): presumably another channel id ("UC" prefix) - confirm
# Client object handed to every request helper; the key comes from the local `api` module.
youtube = build('youtube', 'v3', developerKey=API_KEY)
exemple_video = "qCKyRhkhqoQ"  # presumably a sample video id - confirm
otp_recap = 'F7A8OCdmZ90'  # video id used by the get_video_data / get_video_commentThreads example calls

In [11]:
class Request:
    """Thin wrapper around the ``list`` call of a YouTube API resource.

    The resource object (e.g. ``youtube.videos()``) and the query parameters
    are stored as plain attributes, so a parameter such as ``pageToken`` can
    be reassigned between two calls to :meth:`execute`.
    """

    def __init__(self, requestType,part=None, id=None, chart=None, regionCode=None, maxResults=None, pageToken=None, videoId=None):
        """Store the resource and its query parameters.

        Args:
            requestType: Object exposing a ``list(**params)`` method whose
                result exposes ``execute()``.
            part, id, chart, regionCode, maxResults, pageToken, videoId:
                Keyword arguments forwarded to ``list``; falsy values are
                left out of the call.
        """
        self.requestType = requestType
        self.part = part
        self.id = id
        self.chart = chart
        self.regionCode = regionCode
        self.maxResults = maxResults
        self.pageToken = pageToken
        self.videoId = videoId

    def execute(self):
        """Run the request and return whatever ``list(...).execute()`` returns.

        Every instance attribute except ``requestType`` is forwarded as a
        keyword argument, provided its value is truthy (so ``None`` and ``''``
        are dropped).
        """
        # Exclude the resource by name rather than by position so the result
        # does not depend on the order in which attributes were assigned.
        params = {
            name: value
            for name, value in vars(self).items()
            if name != 'requestType' and value
        }
        return self.requestType.list(**params).execute()

In [12]:
import time
def retry_on_exception(max_attempts=5):
    """Build a decorator that retries the wrapped callable when it raises.

    The wrapped callable is invoked up to ``max_attempts`` times. After each
    failed attempt except the last, the error is printed and the wrapper
    sleeps 0.5 s before trying again.

    Args:
        max_attempts: Maximum number of calls. With a value <= 0 the callable
            is never invoked and the wrapper returns ``None``.

    Returns:
        A decorator. The decorated function returns the callable's result on
        the first successful attempt, or ``[]`` once every attempt has failed
        (best-effort: the last exception is swallowed, not re-raised).
    """
    import functools  # local import so the block does not rely on a new file-level import

    def decorator(func):
        @functools.wraps(func)  # keep the wrapped function's name and docstring
        def wrapper(*args, **kwargs):
            attempts = 0
            while attempts < max_attempts:
                try:
                    result = func(*args, **kwargs)
                except Exception as e:
                    attempts += 1
                    if attempts == max_attempts:
                        # Attempts exhausted: give up and return an empty result.
                        return []
                    print(f"{attempts}: Une exception s'est produite : {e}")
                else:
                    # No exception raised: hand the result back.
                    return result
                # Short pause before the next attempt.
                time.sleep(0.5)
        return wrapper
    return decorator


In [13]:
from datetime import datetime, timedelta
import re      
        
def iso_toDatetime(iso_date:str):
    """Parse a ``YYYY-MM-DDTHH:MM:SS`` timestamp into a naive datetime.

    Args:
        iso_date: Timestamp with or without a trailing ``Z``.

    Returns:
        The corresponding ``datetime`` (no timezone attached).

    Raises:
        ValueError: If the string does not match the expected format.
    """
    # Only strip the last character when it really is the "Z" suffix; blindly
    # dropping it would eat a digit of the seconds on strings without "Z".
    if iso_date.endswith('Z'):
        iso_date = iso_date[:-1]
    return datetime.strptime(iso_date, '%Y-%m-%dT%H:%M:%S')

def datetime_toISO(dt_obj:datetime):
    """Format a datetime as an ISO 8601 string truncated to whole seconds.

    Args:
        dt_obj: The datetime to format.

    Returns:
        ``dt_obj.isoformat()`` without the microsecond part, e.g.
        ``'2024-02-25T13:00:47'`` for a naive datetime.
    """
    # timespec='seconds' drops microseconds whether or not they are present,
    # unlike slicing a fixed number of characters off the end.
    return dt_obj.isoformat(timespec='seconds')

def iso_toDelta(iso_duration:str):
    """Convert an ISO 8601 style duration string to a timedelta.

    Accepts the standard layout with days before the ``T`` (``P1DT2H3M4S``)
    as well as the layout with days after ``PT`` (``PT4D3H20M9S``). Every
    component is optional, so ``PT1H`` and ``PT15M`` are valid.

    Args:
        iso_duration: Duration made of integer day/hour/minute/second parts.

    Returns:
        The equivalent ``timedelta``.

    Raises:
        ValueError: If the string is not a supported duration.
    """
    match = re.fullmatch(
        r'P(?:(\d+)D)?(?:T(?:(\d+)D)?(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?',
        iso_duration,
    )
    if match is None:
        raise ValueError(f"Unsupported ISO 8601 duration: {iso_duration!r}")
    # Missing components come back as None and count as zero.
    date_days, time_days, hours, minutes, seconds = (
        int(group) if group else 0 for group in match.groups()
    )
    return timedelta(days=date_days + time_days, hours=hours, minutes=minutes, seconds=seconds)

def delta_toISO(delta_obj:timedelta):
    """Convert a timedelta to a ``PT…`` duration string.

    Components equal to zero are omitted. Days are written after ``PT``
    (``PT4D3H20M9S``), which is the layout this function has always produced.
    A zero duration is rendered as ``PT0S`` instead of a bare ``PT``.

    Args:
        delta_obj: The duration to format; microseconds are ignored.

    Returns:
        The formatted duration string.
    """
    hours, remainder = divmod(delta_obj.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)

    components = ((delta_obj.days, 'D'), (hours, 'H'), (minutes, 'M'), (seconds, 'S'))
    body = "".join(f"{value}{unit}" for value, unit in components if value != 0)
    # A designator with no component is not a valid duration, so spell zero out.
    return f"PT{body}" if body else "PT0S"

# print(iso_toDelta('PT4D3H20M9S'))
# print(delta_toISO(iso_toDelta('PT20M9S')))

In [14]:
def format_channel_data(channel_data):
    """Structure raw channel data.

    Args:
        channel_data: One channel item as a dict; the ``snippet``,
            ``statistics`` and ``topicDetails`` sections are all optional.

    Returns:
        A dict with keys ``channel_name``, ``channel_id``, ``country``
        (``""`` when absent), ``stats`` (the statistics without
        ``hiddenSubscriberCount``; ``{}`` when absent) and ``topics`` (the
        last path segment of each topic category link).
    """
    snippet = channel_data.get('snippet', {})
    # Work on a copy so the caller's dict is left untouched, and tolerate a
    # missing statistics section or a missing hiddenSubscriberCount key.
    stats = dict(channel_data.get('statistics') or {})
    stats.pop('hiddenSubscriberCount', None)
    topic_links = channel_data.get('topicDetails', {}).get('topicCategories', [])

    return {
        "channel_name": snippet.get('title'),
        "channel_id": channel_data.get('id'),
        "country": snippet.get('country', ""),
        "stats": stats,
        "topics": [wikilink.split('/')[-1] for wikilink in topic_links],
    }

In [15]:
def get_channel_data(youtube, channel_id):
    """Look a channel up by id and return its formatted summary.

    Queries ``youtube.channels()`` for the snippet, contentDetails,
    statistics and topicDetails parts, then formats the first returned item
    with ``format_channel_data``. Raises ``IndexError`` when the response
    holds no item.
    """
    channel_query = Request(
        requestType=youtube.channels(),
        part="snippet,contentDetails,statistics,topicDetails",
        id=channel_id,
    )
    items = channel_query.execute().get('items', [])
    first_channel = items[0]
    return format_channel_data(first_channel)
    

# Example call for the channel id stored in channel_id; the cell output follows.
get_channel_data(youtube, channel_id)

{'channel_name': 'SQUEEZIE',
 'channel_id': 'UCWeg2Pkate69NFdBeuRFTAw',
 'country': 'FR',
 'stats': {'viewCount': '10448305177',
  'subscriberCount': '18700000',
  'videoCount': '1624'},
 'topics': ['Entertainment', 'Television_program']}

In [16]:
from datetime import datetime
def format_video_data(video_data):
    """Structure raw video data.

    Args:
        video_data: One video item as a dict; the ``snippet``,
            ``contentDetails`` and ``statistics`` sections are all optional.

    Returns:
        A dict with keys ``title``, ``id``, ``publishedAt``, ``duration``,
        ``ViewCount``, ``likeCount``, ``commentCount`` and ``tags``. A field
        missing from the input is returned as ``None``.
    """
    snippet = video_data.get('snippet', {})
    # Same {} fallback as the other sections, so a missing contentDetails
    # yields duration=None instead of an AttributeError.
    content_details = video_data.get('contentDetails', {})
    statistics = video_data.get('statistics', {})

    return {
        "title": snippet.get('title'),
        "id": video_data.get('id'),
        "publishedAt": snippet.get('publishedAt'),
        "duration": content_details.get('duration'),
        "ViewCount": statistics.get('viewCount'),
        "likeCount": statistics.get('likeCount'),
        "commentCount": statistics.get('commentCount'),
        "tags": snippet.get('tags'),
    }

# time = get_video_info(youtube, 'JRBGBjaR9Wg').get("publishedAt")
# print(datetime.strptime(time, '%Y-%m-%dT%H:%M:%SZ'))

In [17]:
def get_video_data(youtube, video_Id):
    """Look a video up by id and return its formatted summary.

    Queries ``youtube.videos()`` for the snippet, contentDetails, statistics
    and topicDetails parts, then formats the first returned item with
    ``format_video_data``. Raises ``IndexError`` when the response holds no
    item.
    """
    video_query = Request(
        requestType=youtube.videos(),
        part="snippet,contentDetails,statistics,topicDetails",
        id=video_Id,
    )
    items = video_query.execute().get('items', [])
    first_video = items[0]
    return format_video_data(first_video)

# Example call for the video id stored in otp_recap; the cell output follows.
get_video_data(youtube, otp_recap)

{'title': "G2 trop forts pour l'Europe ? (OTP AfterLEC)",
 'id': 'F7A8OCdmZ90',
 'publishedAt': '2024-02-20T16:00:09Z',
 'duration': 'PT1H6M3S',
 'ViewCount': '20015',
 'likeCount': '390',
 'commentCount': '47',
 'tags': ['otp',
  'afterlec',
  'aftershow',
  'podcast',
  'league of legends',
  'lec',
  'g2',
  'winter']}

In [18]:
def get_Most_Popular_Video(youtube, region:str):
    """Return the formatted videos of the ``mostPopular`` chart of a region.

    Pages are requested one after another for as long as a response carries a
    ``nextPageToken``; once all pages are collected, every item is passed
    through ``format_video_data``.
    """
    chart_query = Request(
        requestType=youtube.videos(),
        part="snippet,contentDetails,statistics,topicDetails",
        chart="mostPopular",
        regionCode=region,
        maxResults=100,
        pageToken='',
    )

    # Collect every page first, then format.
    collected_pages = []
    while True:
        page = chart_query.execute()
        collected_pages.append(page)
        next_token = page.get('nextPageToken')
        if not next_token:
            break
        chart_query.pageToken = next_token

    formatted = []
    for page in collected_pages:
        for item in page.get('items'):
            formatted.append(format_video_data(item))
    return formatted

# Example call for region code 'FR'; the (truncated) cell output follows.
get_Most_Popular_Video(youtube, 'FR')

[{'title': "TIK TOK M'A FAIT ACHETER ÇA ! #6 (Jamais vu ça avant)",
  'id': '4j92PHFnp3g',
  'publishedAt': '2024-02-25T11:56:47Z',
  'duration': 'PT23M46S',
  'ViewCount': '2227168',
  'likeCount': '161964',
  'commentCount': '2236',
  'tags': ['Joyca',
   'video',
   'reaction',
   'Français',
   'tik tok',
   'objets tik tok']},
 {'title': 'CACHE-CACHE Dans Une ÉNORME VILLA En BELGIQUE ! (Feat. Les Croûtons)',
  'id': 'tRAOVffz9LA',
  'publishedAt': '2024-02-25T13:00:47Z',
  'duration': 'PT34M27S',
  'ViewCount': '2006134',
  'likeCount': '155688',
  'commentCount': '2972',
  'tags': ['Michou',
   'Inox',
   'Inoxtag',
   'Lebouseuh',
   'Valouzz',
   'Pidi',
   'Croûton',
   'Croutons',
   'Vacances',
   'Dobby',
   'Cache',
   'Cache-cache',
   'Vidéo']},
 {'title': 'Qui perdra son Job !?  (Ft. Mastu, Byilhan)',
  'id': 'FLw4vYrKO4M',
  'publishedAt': '2024-02-24T13:30:12Z',
  'duration': 'PT55M41S',
  'ViewCount': '3510844',
  'likeCount': '313267',
  'commentCount': '5753',
  't

In [19]:
def format_comment_data(comment):
    """Reduce a raw comment dict to its id plus a few snippet fields.

    Returns a dict with keys ``id``, ``comment`` (the snippet's
    ``textOriginal``), ``likeCount``, ``publishedAt`` and ``updatedAt``;
    absent fields are ``None``.
    """
    snippet = comment.get('snippet', {})
    # (output key, snippet key) pairs, in output order.
    # The snippet's 'viewerRating' field is intentionally not exported.
    snippet_fields = (
        ("comment", 'textOriginal'),
        ("likeCount", 'likeCount'),
        ("publishedAt", 'publishedAt'),
        ("updatedAt", 'updatedAt'),
    )

    formatted = {"id": comment.get('id')}
    for output_key, snippet_key in snippet_fields:
        formatted[output_key] = snippet.get(snippet_key)
    return formatted

def format_threadedComment_data(comment):
    """Reduce a raw comment thread dict to its id, top-level comment and reply count.

    The thread's ``topLevelComment`` is formatted with
    ``format_comment_data``; the replies themselves are not exported.
    """
    thread_snippet = comment.get('snippet', {})
    top_level = format_comment_data(thread_snippet.get('topLevelComment'))
    reply_count = thread_snippet.get('totalReplyCount')

    return {
        "id": comment.get('id'),
        "topLevelComment": top_level,
        "totalReplyCount": reply_count,
    }

In [20]:
def get_comment(youtube,comment_id):
    """Look a comment up by id and return its formatted summary.

    Queries ``youtube.comments()`` for the snippet and id parts, then formats
    the first returned item with ``format_comment_data``.
    """
    comment_query = Request(
        requestType=youtube.comments(),
        part="snippet,id",
        id=comment_id,
    )
    items = comment_query.execute().get('items')
    first_comment = items[0]
    return format_comment_data(first_comment)

# Example call for a single comment id; the cell output follows.
get_comment(youtube, 'UgwUQR2JJFJSkihWLhx4AaABAg')

{'id': 'UgwUQR2JJFJSkihWLhx4AaABAg',
 'comment': "Avez vous déjà rêvé de la vie de pirate dans votre enfance ? Dans ce cas Skull & Bones pourrait vous intéresser ! Plus d'infos ici :  https://ubi.li/1aYqD",
 'likeCount': 25,
 'publishedAt': '2024-02-08T21:54:35Z',
 'updatedAt': '2024-02-08T21:54:35Z'}

In [21]:
import time
@retry_on_exception(max_attempts=3)
def get_video_commentThreads(youtube,video_Id,maxComments):
    """Fetch up to ``maxComments`` formatted comment threads of a video.

    Pages of up to 100 threads are requested until either the response has no
    ``nextPageToken`` or enough threads have been collected.

    Args:
        youtube: API client exposing ``commentThreads()``.
        video_Id: Id of the video whose comment threads are requested.
        maxComments: Upper bound on the number of threads returned.

    Returns:
        A list of dicts built by ``format_threadedComment_data``, at most
        ``maxComments`` long. Because of the ``retry_on_exception``
        decorator, the whole fetch is retried when a call raises, and ``[]``
        is returned once the attempts are exhausted.
    """
    request = Request(
        requestType=youtube.commentThreads(),
        part="snippet,id,replies",
        videoId=video_Id,
        maxResults=100,
    )

    comments = []
    while True:
        response = request.execute()
        comments += [format_threadedComment_data(thread) for thread in response.get('items', [])]
        next_token = response.get('nextPageToken')
        # Count what was actually received and stop *before* requesting a page
        # that is not needed.
        if not next_token or len(comments) >= maxComments:
            break
        request.pageToken = next_token

    # The last page may overshoot the budget; honour the cap.
    comments = comments[:max(maxComments, 0)]
    print(f"Fetched {len(comments)} comments !")
    return comments

# Example call: comment threads of the otp_recap video, with a budget of 1000.
get_video_commentThreads(youtube, otp_recap, 1000)

Fetched 23 comments !


[{'id': 'Ugw6FJfQqKe0KrTgQF54AaABAg',
  'topLevelComment': {'id': 'Ugw6FJfQqKe0KrTgQF54AaABAg',
   'comment': "Tous ces anglissimes (sans parler des mots compliqués à traduire afin d'être compris) me donnent envie de m'arracher les oreilles, faites un effort sérieusement.",
   'likeCount': 0,
   'publishedAt': '2024-02-21T12:59:33Z',
   'updatedAt': '2024-02-21T12:59:33Z'},
  'totalReplyCount': 0},
 {'id': 'Ugyfodm1-4R2yZV-jkB4AaABAg',
  'topLevelComment': {'id': 'Ugyfodm1-4R2yZV-jkB4AaABAg',
   'comment': "De toutes façons ce format à l'américaine puant avec des playoffs là faut arrêter.\nLe meilleur format qu'il y a eu en europe c'était le championnat aller retour avec 2 matchs à faire (possibilité de draw du coup) et virer ces playoffs qui sont à la fois incompréhensibles et chiants.",
   'likeCount': 0,
   'publishedAt': '2024-02-21T12:10:02Z',
   'updatedAt': '2024-02-21T12:10:02Z'},
  'totalReplyCount': 0},
 {'id': 'Ugy4SWsZlESMcZYFoUR4AaABAg',
  'topLevelComment': {'id': 'Ugy4SW

# Fetching Top Videos
The goal is to fetch the top 200 videos every day and to collect their comments a week after they are published.

In [None]:
from datetime import datetime
import json
# Region codes whose most-popular chart is stored by push_top_vids.
REGION = ['FR', 'US']
# JSON file read and rewritten by push_top_vids.
topvids = 'db/topVideos.json'
# Minimum delay that push_top_vids enforces between two fetches.
minElapsedTime = 24 # Hours

def push_top_vids(filepath):
    """Append today's most-popular videos of every region to a JSON file.

    The file holds a ``lastUpdate`` timestamp plus, per region code, a dict
    mapping a snapshot timestamp to the list returned by
    ``get_Most_Popular_Video``.

    Args:
        filepath: Path of the JSON file to read and rewrite.

    Raises:
        Exception: If fewer than ``minElapsedTime`` hours have passed since
            the stored ``lastUpdate``.
    """
    today = datetime.today()
    with open(filepath, 'r') as f:
        data = json.load(f)

    if lastUpdate := data.get('lastUpdate'):
        elapsed_hours = (today - iso_toDatetime(lastUpdate)).total_seconds() // 3600
        if elapsed_hours < minElapsedTime:
            # Report the remaining wait from the configured threshold.
            raise Exception(f'The fetch request has been done too soon. Next request available in {minElapsedTime - elapsed_hours}h ')

    # Format the timestamp once so lastUpdate and the snapshot keys match.
    timestamp = datetime_toISO(today)
    data['lastUpdate'] = timestamp
    for reg in REGION:
        data.setdefault(reg, {})[timestamp] = get_Most_Popular_Video(youtube, reg)

    with open(filepath, 'w') as fichier:
        json.dump(data, fichier)

# Refresh the snapshot file (raises if the previous run is too recent).
push_top_vids(topvids)

In [None]:
import json
# JSON file passed to create_comment_queue as its input.
topvids = 'db/topVideos.json'
# JSON queue file that create_comment_queue reads and rewrites.
commentsQueue="db/commentQueue.json"

def create_comment_queue(filepath:str) -> None:
    """Queue the videos of every snapshot date not yet present in the queue file.

    Reads the snapshot file at ``filepath`` and the queue file at
    ``commentsQueue``. For each snapshot date missing from the queue, stores
    under that date one entry per video (``region``, ``dateEntry``, ``id``,
    ``publishedAt``) across all regions, then rewrites the queue file.
    Prints ``"already in !"`` and leaves the file untouched when there is
    nothing to add.

    Args:
        filepath: Path of the JSON snapshot file.
    """
    with open(filepath, 'r') as f:
        data:dict = json.load(f)
    with open(commentsQueue, 'r') as f:
        queue:dict = json.load(f)
    data.pop('lastUpdate', None)  # metadata entry, not a region

    # Gather the dates of every region (not only the first), in first-seen
    # order; an empty snapshot file simply yields no date.
    date_to_fetch = []
    for snapshots in data.values():
        for date in snapshots:
            if date not in queue and date not in date_to_fetch:
                date_to_fetch.append(date)

    if not date_to_fetch:
        print("already in !")
        return

    for date in date_to_fetch:
        queue[date] = [
            {
                'region': region,
                'dateEntry': date,
                'id': video.get('id'),
                'publishedAt': video.get('publishedAt'),
            }
            for region, snapshots in data.items()
            for video in snapshots.get(date, [])
        ]

    with open(commentsQueue, 'w') as fichier:
        json.dump(queue, fichier)
    
# Update the comment queue from the snapshot file.
create_comment_queue(topvids)

In [28]:

import time
import json
# JSON queue file passed to fetch_topVids_comments.
commentsQueue="db/commentQueue.json"
# JSON file in which fetch_topVids_comments merges the fetched comments.
commentList = "db/commentList.json"
# Minimum age of a video (since publishedAt) before its comments are fetched.
minElapsedComments = 17 # days

def fetch_topVids_comments(filepath):
    """Fetch the comments of queued videos that are old enough, then update both files.

    A queued video is processed once at least ``minElapsedComments`` days
    have passed since its ``publishedAt``. Its comment threads are stored
    under the video id in the ``commentList`` file (merged with the existing
    content) and the video is removed from the queue. Younger videos stay
    queued. Dates are kept in the queue file even when their list becomes
    empty.

    Args:
        filepath: Path of the JSON queue file to read and rewrite.
    """
    today = datetime.today()
    with open(filepath, 'r') as f:
        queue:dict = json.load(f)
    with open(commentList, 'r') as f:
        comments:dict = json.load(f)

    data = {}
    remaining = {}
    for date, vids in queue.items():
        # Build the list of videos to keep directly; a date from which
        # nothing was fetched therefore keeps all of its videos.
        still_queued = []
        for i, video in enumerate(vids):
            if (today - iso_toDatetime(video.get('publishedAt'))).days >= minElapsedComments:
                start = time.time()
                # NOTE(review): get_video_commentThreads returns [] after
                # repeated failures, so a failed fetch is stored as "no
                # comments" and the video still leaves the queue.
                data[video.get('id')] = get_video_commentThreads(youtube, video.get('id'), 1000)
                print(f"{i} in {time.time()-start}s",end='\n\n')
            else:
                print(f"{i} not yet",end='\n\n')
                still_queued.append(video)
        remaining[date] = still_queued

    with open(filepath, 'w') as fichier:
        json.dump(remaining, fichier)

    with open(commentList, 'w') as fichier:
        json.dump({**comments, **data}, fichier)  # new results override older entries
            
# Process the comment queue; one progress line is printed per queued video.
fetch_topVids_comments(commentsQueue)    

0 not yet

1 not yet

2 not yet

3 not yet

4 not yet

5 not yet

6 not yet

7 not yet

8 not yet

9 not yet

10 not yet

11 not yet

12 not yet

13 not yet

14 not yet

15 not yet

16 not yet

17 not yet

18 not yet

19 not yet

20 not yet

21 not yet

22 not yet

23 not yet

24 not yet

25 not yet

26 not yet

27 not yet

28 not yet

29 not yet

30 not yet

31 not yet

32 not yet

33 not yet

34 not yet

35 not yet

36 not yet

37 not yet

38 not yet

39 not yet

40 not yet

41 not yet

42 not yet

43 not yet

44 not yet

45 not yet

46 not yet

47 not yet

48 not yet

49 not yet

50 not yet

51 not yet

52 not yet

53 not yet

54 not yet

55 not yet

56 not yet

57 not yet

58 not yet

59 not yet

60 not yet

61 not yet

62 not yet

63 not yet

64 not yet

65 not yet

66 not yet

67 not yet

68 not yet

69 not yet

70 not yet

71 not yet

72 not yet

73 not yet

74 not yet

75 not yet

76 not yet

77 not yet

78 not yet

79 not yet

80 not yet

81 not yet

82 not yet

83 not yet

84