# YT Comments analysis

In [None]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from datetime import datetime, timedelta, time
import pandas as pd
import json
import re
import matplotlib.pyplot as plt
import seaborn as sns
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError
from api import API_KEY


# Identifiers and API client shared by the example calls in this notebook.
channel_id = "UCWeg2Pkate69NFdBeuRFTAw" #Squeezie channel
etoiles = 'UCABf02qOye7XYapcK1M45LQ'  # channel id passed to get_channel_data in the example call
# YouTube Data API v3 client built with the key imported from the local `api` module.
youtube = build('youtube', 'v3', developerKey=API_KEY)
exemple_video = "qCKyRhkhqoQ"  # video id passed to get_video_data in the example call
otp_recap = 'F7A8OCdmZ90'  # presumably another video id; not referenced by the visible code

### Class request
Class to handle YouTube requests, since the YouTube API doesn't provide a request object.

In [None]:
class Request:
    """Hold the parameters of a YouTube API ``list`` call as a mutable object.

    The attributes mirror the keyword arguments accepted by the ``list``
    method of the resource stored in ``requestType``. They can be changed
    between calls (for instance ``pageToken`` while paginating) and the
    request re-executed.
    """

    def __init__(self, requestType, part=None, id=None, chart=None, regionCode=None, maxResults=None, pageToken=None, videoId=None):
        self.requestType = requestType
        self.part = part
        self.id = id
        self.chart = chart
        self.regionCode = regionCode
        self.maxResults = maxResults
        self.pageToken = pageToken
        self.videoId = videoId

    def execute(self):
        """Send the request and return the response of ``list(...).execute()``.

        Every attribute except ``requestType`` is forwarded as a keyword
        argument. Falsy values (``None``, ``''``, ``0``) are left out, so an
        empty ``pageToken`` is simply not sent.
        """
        # Exclude the resource by name rather than by position so the filter
        # keeps working if attributes are ever added or reordered.
        params = {
            name: value
            for name, value in vars(self).items()
            if name != "requestType" and value
        }
        return self.requestType.list(**params).execute()

### Decorator
Decorator to retry when a YouTube request fails (mostly due to timeout errors).

In [None]:
def retry_on_exception(max_attempts=5):
    """Build a decorator that retries a function when it raises.

    The wrapped function is called up to ``max_attempts`` times, with no
    delay between attempts. Each failed attempt except the last prints the
    attempt number and the exception. When the last attempt also fails, an
    empty ``pandas.DataFrame`` is returned instead of raising.

    Args:
        max_attempts: Maximum number of calls. With a value <= 0 the wrapped
            function is never called and the wrapper returns ``None``.

    Returns:
        A decorator preserving the wrapped function's name and docstring.
    """
    from functools import wraps  # local import keeps this cell self-contained

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            attempts = 0
            while attempts < max_attempts:
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    attempts += 1
                    if attempts == max_attempts:
                        # Out of attempts: hand back an empty frame rather
                        # than propagating the error.
                        return pd.DataFrame()
                    print(f"{attempts}: Une exception s'est produite : {e}")
        return wrapper
    return decorator

### Datetime conversions
Functions to convert ISO 8601-formatted dates and durations found in YouTube API responses to and from `datetime` and `timedelta` objects.

In [None]:
def iso_toDatetime(iso_date:str):
    """Convert an ISO 8601 date-time string to a naive ``datetime``.

    Accepts ``YYYY-MM-DDTHH:MM:SS`` with optional fractional seconds and an
    optional trailing ``Z``. The ``Z`` is dropped, not converted: the result
    carries no timezone information.

    Raises:
        ValueError: If the string does not match the expected layout.
    """
    # Strip the UTC designator only when it is actually there, instead of
    # blindly removing the last character.
    text = iso_date[:-1] if iso_date.endswith('Z') else iso_date
    layout = '%Y-%m-%dT%H:%M:%S.%f' if '.' in text else '%Y-%m-%dT%H:%M:%S'
    return datetime.strptime(text, layout)

def datetime_toISO(dt_obj:datetime):
    """Convert a ``datetime`` to an ISO 8601 string with whole-second precision.

    Microseconds are discarded. A timezone-aware input keeps its UTC offset
    in the output.
    """
    # Zero the microseconds before formatting. Slicing a fixed number of
    # characters off isoformat() is wrong when there are no microseconds to
    # remove or when an offset follows them.
    return dt_obj.replace(microsecond=0).isoformat()

def iso_toDelta(iso_duration:str):
    """Convert an ISO 8601 duration string to a ``timedelta``.

    Supports day, hour, minute and second components, each optional
    (``PT20M9S``, ``PT1H``, ``P1DT2H3M4S``). The non-standard layout with
    days after the ``T`` (``PT4D3H20M9S``) is accepted too, because that is
    what ``delta_toISO`` in this file emits.

    Raises:
        ValueError: If the string is not a supported duration or has no
            component at all (``P`` / ``PT``).
    """
    match = re.fullmatch(
        r'P(?:(\d+)D)?(?:T(?:(\d+)D)?(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?',
        iso_duration,
    )
    if match is None or all(group is None for group in match.groups()):
        raise ValueError(f"Unsupported ISO 8601 duration: {iso_duration!r}")

    # Two day slots: the standard one before 'T' and the legacy one after it.
    date_days, time_days, hours, minutes, seconds = (
        int(group) if group else 0 for group in match.groups()
    )
    return timedelta(days=date_days + time_days, hours=hours, minutes=minutes, seconds=seconds)

def delta_toISO(delta_obj:timedelta):
    """Convert a ``timedelta`` to a duration string such as ``PT3H20M9S``.

    Components equal to zero are omitted; a zero duration is rendered as
    ``PT0S``. Microseconds are ignored.

    NOTE: days are written after the ``T`` (``PT4D3H``), which is not the
    standard ISO 8601 position (``P4DT3H``). The layout is kept unchanged so
    that strings already produced by this function stay comparable.
    """
    hours, remainder = divmod(delta_obj.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)

    components = ((delta_obj.days, "D"), (hours, "H"), (minutes, "M"), (seconds, "S"))
    body = "".join(f"{value}{unit}" for value, unit in components if value != 0)
    # A bare "PT" is not a valid duration, so spell out zero explicitly.
    return f"PT{body or '0S'}"

# print(iso_toDelta('PT4D3H20M9S'))
# print(delta_toISO(iso_toDelta('PT20M9S')))

### Fetching functions
Functions to fetch channel, comment and top-video information.

In [None]:
def format_channel_data(channel_data: dict) -> pd.DataFrame:
    """Flatten one raw channel resource into a single-row DataFrame.

    Columns, in order: ``channel_name``, ``channel_id``, ``country``, one
    integer column per entry of ``statistics`` (``hiddenSubscriberCount`` is
    skipped), then ``topics`` holding the last path segment of every topic
    category URL.
    """
    snippet = channel_data.get('snippet', {})
    statistics = channel_data.get('statistics', {})
    category_links = channel_data.get('topicDetails', {}).get('topicCategories', [])

    row = {
        "channel_name": [snippet.get('title')],
        "channel_id": [channel_data.get('id')],
        "country": [snippet.get('country', "")],
    }
    for stat_name, stat_value in statistics.items():
        if stat_name == "hiddenSubscriberCount":
            continue  # not a counter, cannot go through int()
        row[stat_name] = [int(stat_value)]
    row["topics"] = [[link.split('/')[-1] for link in category_links]]

    return pd.DataFrame.from_dict(row)

In [None]:
def get_channel_data(youtube, channel_id:str) -> pd.DataFrame:
    """Request a channel by id and return its main stats as a one-row DataFrame.

    Args:
        youtube: API client exposing ``channels()``.
        channel_id: Id of the channel to look up.

    Returns:
        The first returned channel, formatted by ``format_channel_data``.

    Raises:
        IndexError: If the response contains no channel.
    """
    request = Request(
        requestType=youtube.channels(),
        part="snippet,contentDetails,statistics,topicDetails",
        id=channel_id
    )
    response = request.execute()
    items = response.get('items') or []
    if not items:
        # Same exception type as indexing an empty list, with a usable message.
        raise IndexError(f"No channel returned for id {channel_id!r}")
    return format_channel_data(items[0])
    

# Example call: fetch the formatted stats of the channel whose id is stored in `etoiles`.
get_channel_data(youtube, etoiles)

In [None]:
def format_video_data(video_data: dict) -> pd.DataFrame:
    """Flatten one raw video resource into a single-row DataFrame.

    Missing parts (``snippet``, ``contentDetails``, ``statistics``) are
    tolerated: text fields become ``None`` and counters default to 0.

    Returns:
        One row with columns ``title``, ``id``, ``publishedAt``,
        ``duration``, ``ViewCount``, ``likeCount``, ``commentCount`` and
        ``tags``.
    """
    snippet = video_data.get('snippet', {})
    statistics = video_data.get('statistics', {})
    # Default to {} like the other parts so an absent contentDetails does not
    # raise AttributeError on None.
    content_details = video_data.get('contentDetails', {})

    data = {
        "title": [snippet.get('title')],
        "id": [video_data.get('id')],
        "publishedAt": [snippet.get('publishedAt')],
        "duration": [content_details.get('duration')],
        "ViewCount": [int(statistics.get('viewCount', 0))],
        "likeCount": [int(statistics.get('likeCount', 0))],
        "commentCount": [int(statistics.get('commentCount', 0))],
        "tags": [snippet.get('tags')],
    }

    return pd.DataFrame.from_dict(data)

In [None]:
def get_video_data(youtube, video_Id:str) -> pd.DataFrame:
    """Request a video by id and return its main stats as a one-row DataFrame.

    Args:
        youtube: API client exposing ``videos()``.
        video_Id: Id of the video to look up.

    Returns:
        The first returned video, formatted by ``format_video_data``.

    Raises:
        IndexError: If the response contains no video.
    """
    request = Request(
        requestType=youtube.videos(),
        part="snippet,contentDetails,statistics,topicDetails",
        id=video_Id,
    )
    response = request.execute()

    items = response.get('items') or []
    if not items:
        # Same exception type as indexing an empty list, with a usable message.
        raise IndexError(f"No video returned for id {video_Id!r}")
    return format_video_data(items[0])

# Example call: fetch the formatted stats of the video whose id is stored in `exemple_video`.
get_video_data(youtube, exemple_video)

In [None]:
def get_Most_Popular_Video(youtube, region:str) -> pd.DataFrame:
    """Request the ``mostPopular`` chart of a region, following every page.

    Args:
        youtube: API client exposing ``videos()``.
        region: Region code sent as ``regionCode``.

    Returns:
        One row per video as produced by ``format_video_data``, plus
        ``topID`` (1-based position in the order received) and ``region``.
        When the chart is empty the frame has no rows.
    """
    # NOTE(review): the API reference lists 1-50 as the accepted range for
    # maxResults on videos.list - confirm that 100 is clamped, not rejected.
    request = Request(
        requestType=youtube.videos(),
        part="snippet,contentDetails,statistics,topicDetails",
        chart="mostPopular",
        regionCode=region,
        maxResults=100,
        pageToken=''
    )
    response = request.execute()
    frames = [format_video_data(video) for video in response.get('items') or []]

    while next_token := response.get('nextPageToken'):
        request.pageToken = next_token
        response = request.execute()
        frames.extend(format_video_data(video) for video in response.get('items') or [])

    # pd.concat refuses an empty list, so an empty chart needs its own path.
    top_videos = pd.concat(frames).reset_index(drop=True) if frames else pd.DataFrame()
    top_videos['topID'] = top_videos.index + 1
    top_videos['region'] = region
    return top_videos

# Example call: most popular videos for the 'FR' region.
get_Most_Popular_Video(youtube, 'FR')
# df.sort_values(by=['fetchedDate'], ascending=False, inplace=True, kind='quicksort', ignore_index=True)
# df

In [None]:
def format_comment_data(comment:dict) -> dict[str, list]:
    """Structure one raw comment resource as a column-oriented dict.

    Every value is a one-element list, so the result can be passed straight
    to ``pd.DataFrame``. A missing ``likeCount`` counts as 0.

    Returns:
        Dict with keys ``id``, ``comment``, ``likeCount``, ``publishedAt``
        and ``updatedAt``.
    """
    snippet = comment.get('snippet', {})
    return {
        "id": [comment.get('id')],
        "comment": [snippet.get('textOriginal')],
        # Default to 0 so an absent counter does not end up in int(None).
        "likeCount": [int(snippet.get('likeCount', 0))],
        "publishedAt": [snippet.get('publishedAt')],
        "updatedAt": [snippet.get('updatedAt')],
    }

def format_threadedComment_data(comment:dict) -> dict[str, list]:
    """Structure one raw comment-thread resource as a column-oriented dict.

    The thread's ``topLevelComment`` is formatted by ``format_comment_data``
    and a ``totalReplyCount`` column is added. A missing ``totalReplyCount``
    counts as 0. Replies are not included.
    """
    thread_snippet = comment.get('snippet', {})
    return {
        **format_comment_data(thread_snippet.get('topLevelComment')),
        # Default to 0 so an absent counter does not end up in int(None).
        "totalReplyCount": [int(thread_snippet.get('totalReplyCount', 0))],
    }

In [None]:
def format_commentPage(page:list[dict]) -> pd.DataFrame:
    """Turn one page of comment threads into a DataFrame, one row per thread.

    Columns: ``id``, ``comment``, ``likeCount``, ``publishedAt`` and
    ``updatedAt`` (from the top-level comment) and ``totalReplyCount`` (from
    the thread). An empty page gives an empty DataFrame without columns.
    """
    column_names = ("id", "comment", "likeCount", "publishedAt", "updatedAt", "totalReplyCount")

    rows = []
    for thread in page:
        top_comment = thread.get('snippet', {}).get('topLevelComment')
        rows.append((
            top_comment.get('id'),
            top_comment.get('snippet', {}).get('textOriginal'),
            int(top_comment.get('snippet', {}).get('likeCount')),
            top_comment.get('snippet', {}).get('publishedAt'),
            top_comment.get('snippet', {}).get('updatedAt'),
            int(thread.get('snippet', {}).get('totalReplyCount')),
        ))

    if not rows:
        return pd.DataFrame({})

    # Transpose the row tuples back into one list per column.
    columns = {name: list(values) for name, values in zip(column_names, zip(*rows))}
    return pd.DataFrame(columns)

In [None]:
def get_comment(youtube,comment_id:str) -> pd.DataFrame:
    """Request a comment by id and return its main stats as a one-row DataFrame.

    Args:
        youtube: API client exposing ``comments()``.
        comment_id: Id of the comment to look up.

    Returns:
        The first returned comment, formatted by ``format_comment_data``.

    Raises:
        IndexError: If the response contains no comment.
    """
    request = Request(
        requestType=youtube.comments(),
        part="snippet,id",
        id=comment_id,
    )
    response = request.execute()
    items = response.get('items') or []
    if not items:
        raise IndexError(f"No comment returned for id {comment_id!r}")
    return pd.DataFrame(format_comment_data(items[0]))

# Example call: fetch one comment by its id.
get_comment(youtube, 'UgwUQR2JJFJSkihWLhx4AaABAg')

In [None]:
@retry_on_exception(max_attempts=3)
def get_video_commentThreads(youtube, videoID:str, maxComments:int) -> pd.DataFrame:
    """Fetch the comment threads of a video, page by page.

    Pages of up to 100 threads are requested until there is no next page or
    at least ``maxComments`` threads have been collected. The budget is
    checked per page, so the result can exceed ``maxComments`` by up to one
    page minus one thread; the first page is always requested.

    Args:
        youtube: API client exposing ``commentThreads()``.
        videoID: Id of the video whose threads are fetched.
        maxComments: Number of threads after which paging stops.

    Returns:
        One row per thread as produced by ``format_commentPage``, plus
        ``videoID`` and ``fetchedDate`` (local time of the fetch). Because of
        the retry decorator, an empty DataFrame is returned when every
        attempt raises.
    """
    request = Request(
        requestType=youtube.commentThreads(),
        part="snippet,id,replies",
        videoId=videoID,
        maxResults=100
    )

    pages = []
    remaining = maxComments
    while True:
        response = request.execute()
        items = response.get('items') or []
        if items:
            pages.append(format_commentPage(items))
        # Count what was actually received rather than trusting pageInfo.
        remaining -= len(items)

        next_token = response.get('nextPageToken')
        # Test the budget BEFORE asking for another page, so a request for
        # 100 comments does not pull a second page.
        if remaining <= 0 or not next_token:
            break
        request.pageToken = next_token

    # Concatenate once at the end instead of re-copying the frame per page.
    comments = pd.concat(pages, ignore_index=True) if pages else pd.DataFrame()
    comments['videoID'] = videoID
    comments['fetchedDate'] = datetime.today()
    return comments

# Example call: fetch comment threads for one video with a budget of 1000.
get_video_commentThreads(youtube, 'FkXhKu80CWU', 1000)
# get_video_commentThreads(youtube, 'FkXhKu80CWU', 1000)

# Fetching Top Videos
The goal is to fetch the top 200 videos every day and to get their comments a week after publishing.

In [None]:
import pymongo
import pandas as pd
import json
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError
def push_top_vids(topvidsFile:str, regions:list[str], minElapsedTime:int)-> None:
    """Fetch the most popular videos per region and append them to a CSV file.

    Args:
        topvidsFile: CSV file holding the history; read first, then rewritten
            with the new rows appended.
        regions: Region codes to fetch.
        minElapsedTime: Minimum number of hours between two fetches.

    Raises:
        Exception: If the most recent ``fetchedDate`` in the file is less
            than ``minElapsedTime`` hours old.
    """
    today = datetime.today()
    try:
        df = pd.read_csv(topvidsFile)
    except pd.errors.EmptyDataError:
        df = pd.DataFrame()

    if 'fetchedDate' in df:
        df['fetchedDate'] = pd.to_datetime(df['fetchedDate'], format='ISO8601')
        lastUpdate = df['fetchedDate'].max()
        elapsed_hours = (today - lastUpdate).total_seconds() // 3600
        if elapsed_hours < minElapsedTime:
            # Remaining wait is relative to the configured interval, not to a fixed 24 h.
            raise Exception(f'The fetch request has be done too soon. Next request available in {minElapsedTime-elapsed_hours}h. Last update done : {lastUpdate}')

    # Fetching
    frames = [df]
    for reg in regions:
        top200 = get_Most_Popular_Video(youtube, reg)
        top200['fetchedDate'] = today
        top200['fetchedComments'] = False
        frames.append(top200)

    # Write back to the file that was read instead of a hard-coded path.
    pd.concat(frames).to_csv(topvidsFile, index=False)
    
def push_top_vids_with_db(regions: list[str], minElapsedTime: int) -> None:
    """Fetch the most popular videos per region and insert them into MongoDB.

    Rows go to the ``topVideo`` collection of the ``ProjectBigData`` database
    on ``mongodb://localhost:27017/``.

    Args:
        regions: Region codes to fetch.
        minElapsedTime: Minimum number of hours between two fetches.

    Raises:
        Exception: If the most recent ``fetchedDate`` in the collection is
            less than ``minElapsedTime`` hours old.
    """
    today = datetime.today()

    # The context manager closes the client when the block exits.
    with pymongo.MongoClient("mongodb://localhost:27017/") as cluster:
        collection = cluster["ProjectBigData"]['topVideo']

        # Date of the most recent fetch recorded in the collection.
        last_record = collection.find_one(sort=[('fetchedDate', pymongo.DESCENDING)])
        if last_record:
            lastUpdate = last_record['fetchedDate']
            # Rows inserted below store `today`, a datetime, so the value read
            # back is not necessarily a string; only parse when it is one.
            if isinstance(lastUpdate, str):
                lastUpdate = datetime.strptime(lastUpdate, '%Y-%m-%d %H:%M:%S.%f')
            print(lastUpdate)
            elapsed_hours = (today - lastUpdate).total_seconds() // 3600
            if elapsed_hours < minElapsedTime:
                # Remaining wait is relative to the configured interval, not to a fixed 24 h.
                raise Exception(f'The fetch request has been done too soon. Next request available in {minElapsedTime - elapsed_hours}h. Last update done: {lastUpdate}')

        # Fetching: collect every region first, then insert in one call.
        new_data = []
        for reg in regions:
            top200 = get_Most_Popular_Video(youtube, reg)
            top200['fetchedDate'] = today
            top200['fetchedComments'] = False
            new_data.extend(top200.to_dict(orient='records'))

        if new_data:
            collection.insert_many(new_data)

In [None]:
# Settings for the daily fetch, followed by the fetch itself.
REGION = ['FR', 'US']  # region codes passed to the top-videos fetch
topvids = 'db/dailyTop200.csv'  # CSV path of the top-videos history
commentsQueue="db/comments.csv"  # CSV path of the fetched comments
minElapsedTime = 24 # Hours
minElapsedCommentsTime = 17 # days
# NOTE(review): the MongoDB comment fetch later in this notebook is called with 7 days,
# not with minElapsedCommentsTime - confirm which value is intended.

push_top_vids_with_db( REGION, minElapsedTime)

In [None]:
from datetime import timezone


def fetch_topVids_comments(topvidsFile:str, commentsFile:str,minElapsedCommentsTime:int, maxComments:int = 1000) -> None:
    """Fetch comments for recorded top videos that are old enough (CSV storage).

    Every distinct video id in ``topvidsFile`` whose ``fetchedComments`` flag
    is False and whose ``publishedAt`` is at least ``minElapsedCommentsTime``
    days old gets its comment threads fetched and appended to
    ``commentsFile``. All rows of that video are then flagged as fetched.
    Both files are rewritten at the end.

    Args:
        topvidsFile: CSV file of recorded top videos.
        commentsFile: CSV file the comments are appended to.
        minElapsedCommentsTime: Minimum age of a video, in days.
        maxComments: Comment budget per video.
    """
    today = datetime.today()
    try:
        df = pd.read_csv(topvidsFile)
    except pd.errors.EmptyDataError:
        # No video recorded yet: nothing to fetch, and no columns to work on.
        return
    # Read separately so an empty comments file does not throw away the
    # videos that were loaded successfully.
    try:
        comments = pd.read_csv(commentsFile)
    except pd.errors.EmptyDataError:
        comments = pd.DataFrame()
    df['publishedAt'] = pd.to_datetime(df['publishedAt'], format='ISO8601')

    print(df.dtypes)
    print(df.shape)

    pending = df.drop_duplicates(['id'])
    pending = pending[pending['fetchedComments'] == False]  # element-wise comparison on a column

    now = pd.to_datetime(today)
    collected = [comments]
    for video_id, published in zip(pending['id'], pending['publishedAt'].values):
        if (now - published).days >= minElapsedCommentsTime:
            print(video_id)
            collected.append(get_video_commentThreads(youtube, video_id, maxComments))
            df.loc[df['id'] == video_id, 'fetchedComments'] = True

    df.to_csv(topvidsFile, index=False)
    pd.concat(collected).to_csv(commentsFile, index=False)
    


In [None]:

import pytz


def fetch_topVids_comments_with_bd(minElapsedCommentsTime:int, maxComments:int = 1000) -> None:
    """Fetch comments for recorded top videos that are old enough (MongoDB storage).

    Reads the ``topVideo`` collection of the ``ProjectBigData`` database on
    ``mongodb://localhost:27017/``. For every video id still flagged
    ``fetchedComments: False`` and published at least
    ``minElapsedCommentsTime`` days ago, the comment threads are fetched and
    inserted into the ``comments`` collection, skipping documents rejected by
    the unique index. The video is flagged as fetched only afterwards.

    Args:
        minElapsedCommentsTime: Minimum age of a video, in days.
        maxComments: Comment budget per video.
    """
    pipeline = [
        {"$match": {"fetchedComments": False}},  # only videos not fetched yet
        {"$group": {"_id": "$id", "date": {"$first": "$publishedAt"}}},  # one entry per video id, with its first date
        {"$project": {"_id": 0, "id": "$_id", "date": 1}},  # keep only the id and the date
    ]
    utc_timezone = pytz.utc

    # The context manager closes the client when the block exits.
    with pymongo.MongoClient("mongodb://localhost:27017/") as cluster:
        db = cluster["ProjectBigData"]
        collection = db['topVideo']
        collection2 = db['comments']
        # Comment documents carry the field 'videoID' (capital ID), so the
        # unique index has to name that field.
        collection2.create_index([("id", pymongo.ASCENDING), ("videoID", pymongo.ASCENDING)], unique=True)

        results = collection.aggregate(pipeline)
        print(results)
        for video in results:
            # publishedAt is stored as a 'Z'-suffixed string; make it UTC-aware
            # so it can be subtracted from an aware "now".
            video_date = pd.to_datetime(video['date'], format='%Y-%m-%dT%H:%M:%SZ')
            video_date = video_date.tz_localize(utc_timezone)
            elapsed_days = (datetime.now(utc_timezone) - video_date).days
            if elapsed_days < minElapsedCommentsTime:
                continue

            print(video['id'])
            comments = get_video_commentThreads(youtube, video['id'], maxComments)
            # Store the comments before flagging the video, so a crash in
            # between cannot mark a video as done while its comments are lost.
            for document in comments.to_dict(orient='records'):
                try:
                    collection2.insert_one(document)
                except pymongo.errors.DuplicateKeyError:
                    continue  # already stored by an earlier run
            collection.update_many(
                {"id": video['id']},
                {"$set": {"fetchedComments": True}}
            )


# Run the MongoDB comment fetch for videos published at least 7 days ago.
fetch_topVids_comments_with_bd(minElapsedCommentsTime=7)

## Statistics
Some stats about the comments and top vids.

In [None]:
# Load the CSV stores used by the statistics cells below.
com = pd.read_csv('db/comments.csv')  # fetched comments
df = pd.read_csv('db/dailyTop200.csv')  # top-videos history

In [None]:
# df[df['topID'] == 1].sort_values(by=['ViewCount', 'likeCount'], ascending=False)

# Views-per-like ratio for one hard-coded video id, one line per distinct
# (title, ViewCount, likeCount) combination found in df.
vals = df[df['id'] == 'tWYsfOSY9vY'][['title','ViewCount', 'likeCount']].drop_duplicates().values
for title,view,like in vals:
    # NOTE(review): divides by likeCount, so a row with 0 likes raises ZeroDivisionError.
    print(f'One every {round(view/like)} person droped a like on `{title}`')
# Bare expression: displays the full frame as the cell result.
df
# df[df['id'] == '_9u4sYHcR7A'][['topID', 'region', 'fetchedDate']]

In [None]:
def top_evolution_plot(dt, id):
    """Plot the rank (``topID``) of one video over time, one line per region.

    Args:
        dt: Top-videos frame with at least the columns ``id``, ``title``,
            ``fetchedDate``, ``topID`` and ``region``.
        id: Video id to plot.

    Raises:
        IndexError: If ``dt`` has no row for ``id``.
    """
    # Work on a copy: the columns assigned below must not target a slice of
    # the caller's frame.
    data = dt[dt['id'] == id].copy()
    if data.empty:
        raise IndexError(f"No rows found for video id {id!r}")
    # NOTE(review): the x axis holds 'dd/mm' strings, so points are ordered as
    # text, not chronologically - confirm this is fine across month changes.
    data['fetchedDate'] = pd.to_datetime(data['fetchedDate']).dt.strftime('%d/%m')

    sns.lineplot(data=data[['fetchedDate', 'topID', 'region']], x='fetchedDate', y='topID', hue='region')
    plt.xlabel('Date')
    plt.ylabel('Top ID')
    plt.title(f"Top ID `{data['title'].values[0]}` par région au fil du temps")
    plt.gca().invert_yaxis()  # rank 1 at the top
    plt.show()

# Example call: plot the rank history of one video id.
top_evolution_plot(df, 'dIlbshbTRlQ')

In [None]:
# Rank history of every video that reached topID 1 in the FR region.
fr_rows = df[df['region'] == 'FR']
top1ID = fr_rows[fr_rows['topID'] == 1]['id'].values
# print(top1ID)
# Copy before modifying: the assignments and the in-place sort below must not
# operate on a slice of df.
dt = df[df['id'].isin(top1ID)].copy()
dt['fetchedDate'] = pd.to_datetime(dt['fetchedDate'])
dt['fetchedDate'] = dt['fetchedDate'].dt.strftime('%d/%m')
# NOTE(review): this sorts 'dd/mm' strings, i.e. as text rather than
# chronologically - confirm this is fine across month changes.
dt.sort_values(by=['fetchedDate'], ascending=False, inplace=True)

sns.lineplot(data=dt[['fetchedDate', 'topID', 'title']], x='fetchedDate', y='topID', hue='title', legend=False)
plt.xlabel('Date')
plt.ylabel('Top ID')
plt.title("Top ID 1 en France au fil du temps")
plt.gca().invert_yaxis()  # rank 1 at the top
plt.show()
# dt

In [None]:
# Distinct fetchedDate values of df, sorted in descending order.
df['fetchedDate'].sort_values(ascending=False).unique()