In [1]:
from dotenv import load_dotenv
load_dotenv()

import os
key = os.getenv('GOOGLE_API_KEY')

In [2]:
from dataclasses import dataclass
from datetime import datetime
from typing import Optional

@dataclass
class Comment:
    comment_id: str
    author: str
    text: str
    like_count: int
    published_at: datetime
    parent_id: Optional[str]

In [3]:
import requests
from typing import List

def get_top_level_comments(video_id) -> List[Comment]:
    url = 'https://www.googleapis.com/youtube/v3/commentThreads'
    order = 'relevance'
    params = {
        'key': key,
        'part': 'snippet',
        'videoId': video_id,
        'maxResults': 100,
        'order': order,
    }
    
    comments = []
    response = requests.get(url, params=params)
    data = response.json()
    for item in data['items']:
        top_comment = item['snippet']['topLevelComment']['snippet']
        snippet_id = item['id']
        author = top_comment['authorDisplayName']
        text = top_comment['textDisplay']
        like_count = top_comment['likeCount']
        published_at = top_comment['publishedAt']

        comments.append(Comment(snippet_id, author, text, like_count, published_at, None))
    return comments

In [4]:
# video_id = 'NDsO1LT_0lw' # mr beast
video_id = 'YbJOTdZBX1g' # yt rewind 2018

top_level_comments = get_top_level_comments(video_id)
top_level_comments[:5]

[Comment(comment_id='UgyBjLLdTju8FbrdW314AaABAg', author='@Rxaches4lunch', text='2018: we tried but failed horribly<br>2019: we didn’t even try<br>2020: we didn’t even make one', like_count=129639, published_at='2020-11-12T22:37:59Z', parent_id=None),
 Comment(comment_id='Ugyl9eWxxbM04WBrUop4AaABAg', author='@2075anant', text='The irony is that YouTube created the most disliked video on YouTube. Can&#39;t get enough of that.', like_count=54986, published_at='2021-06-03T09:58:21Z', parent_id=None),
 Comment(comment_id='UgzPTxvCQortnOaf1wd4AaABAg', author='@kud4262', text='Every year youtube just keeps getting disconnected from its content creators.', like_count=210609, published_at='2018-12-07T22:16:16Z', parent_id=None),
 Comment(comment_id='UgwfAGAyka6E9AAYfQN4AaABAg', author='@NormalChannel95', text='I miss when this was our biggest worry.', like_count=1212, published_at='2025-03-02T18:55:07Z', parent_id=None),
 Comment(comment_id='UgzuRQReymfJpHjItjt4AaABAg', author='@pendragonandje

In [5]:
def get_replies(comment_id: str) -> List[Comment]:
    url = 'https://www.googleapis.com/youtube/v3/comments'
    params = {
        'key': key,
        'part': 'snippet',
        'parentId': comment_id,
        'maxResults': 100
    }

    replies: List[Comment] = []

    while 1:
        response = requests.get(url, params=params)
        data = response.json()

        for item in data.get('items', []):
            snippet = item['snippet']
            replies.append(Comment(
                comment_id=item['id'],
                author=snippet.get('authorDisplayName', ''),
                text=snippet.get('textDisplay', ''),
                like_count=snippet.get('likeCount', 0),
                published_at=datetime.strptime(snippet['publishedAt'], "%Y-%m-%dT%H:%M:%SZ"),
                parent_id=snippet.get('parentId')
            ))

        if 'nextPageToken' in data:
            params['pageToken'] = data['nextPageToken']
        else:
            break

    replies.sort(key=lambda c: c.like_count, reverse=True)
    return replies

In [6]:
replies = get_replies('UgzgrqctaRkXF0Ioydx4AaABAg')
replies[:5]

[Comment(comment_id='UgzgrqctaRkXF0Ioydx4AaABAg.AEHxAWu37_NAEHxHWb93UL', author='@SirBhogerPlus', text='Already plan to!', like_count=606, published_at=datetime.datetime(2025, 2, 8, 17, 0, 28), parent_id='UgzgrqctaRkXF0Ioydx4AaABAg'),
 Comment(comment_id='UgzgrqctaRkXF0Ioydx4AaABAg.AEHxAWu37_NAEHxU9Ka3uH', author='@ayhamyt6143', text='LOL', like_count=352, published_at=datetime.datetime(2025, 2, 8, 17, 2, 12), parent_id='UgzgrqctaRkXF0Ioydx4AaABAg'),
 Comment(comment_id='UgzgrqctaRkXF0Ioydx4AaABAg.AEHxAWu37_NAEHxUIndd21', author='@Son_Harold', text='Yeah same<br>I love beast games ❤', like_count=276, published_at=datetime.datetime(2025, 2, 8, 17, 2, 13), parent_id='UgzgrqctaRkXF0Ioydx4AaABAg'),
 Comment(comment_id='UgzgrqctaRkXF0Ioydx4AaABAg.AEHxAWu37_NAEHxUi64FkV', author='@Revanftbl', text='Ok', like_count=143, published_at=datetime.datetime(2025, 2, 8, 17, 2, 16), parent_id='UgzgrqctaRkXF0Ioydx4AaABAg'),
 Comment(comment_id='UgzgrqctaRkXF0Ioydx4AaABAg.AEHxAWu37_NAEHxVEY6krb', author

In [7]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()
text = "test"
score = sia.polarity_scores(text)

print(score)

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ltera/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [8]:
import html
import re

def text_cleanup(text):
    text = html.unescape(text) # remove html tags
    text = re.sub(r'@\w+', '', text) # remove @mentions
    text = re.sub(r'[^\x00-\x7F]+', '', text) # remove emojis
    return text.strip()
    

def analyze_comment_sentiment(comment):
    preprocessed_text = text_cleanup(comment.text)
    # print(f'BEFORE:{comment.text} AFTER:{preprocessed_text}')
    return sia.polarity_scores(preprocessed_text)

print(top_level_comments[2])
analyze_comment_sentiment(top_level_comments[2])

Comment(comment_id='UgzPTxvCQortnOaf1wd4AaABAg', author='@kud4262', text='Every year youtube just keeps getting disconnected from its content creators.', like_count=210609, published_at='2018-12-07T22:16:16Z', parent_id=None)


{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [9]:
for comment in top_level_comments:
    score = analyze_comment_sentiment(comment)
    if score['pos'] > score['neg']:
        print(f'POSITIVE: {comment}\n')
    elif score['neg'] > score['pos']:
        print(f'NEGATIVE: {comment}\n')
    else:
        print(f'NEUTRAL: {comment}\n')


NEGATIVE: Comment(comment_id='UgyBjLLdTju8FbrdW314AaABAg', author='@Rxaches4lunch', text='2018: we tried but failed horribly<br>2019: we didn’t even try<br>2020: we didn’t even make one', like_count=129639, published_at='2020-11-12T22:37:59Z', parent_id=None)

NEGATIVE: Comment(comment_id='Ugyl9eWxxbM04WBrUop4AaABAg', author='@2075anant', text='The irony is that YouTube created the most disliked video on YouTube. Can&#39;t get enough of that.', like_count=54986, published_at='2021-06-03T09:58:21Z', parent_id=None)

NEUTRAL: Comment(comment_id='UgzPTxvCQortnOaf1wd4AaABAg', author='@kud4262', text='Every year youtube just keeps getting disconnected from its content creators.', like_count=210609, published_at='2018-12-07T22:16:16Z', parent_id=None)

NEGATIVE: Comment(comment_id='UgwfAGAyka6E9AAYfQN4AaABAg', author='@NormalChannel95', text='I miss when this was our biggest worry.', like_count=1212, published_at='2025-03-02T18:55:07Z', parent_id=None)

NEUTRAL: Comment(comment_id='UgzuRQRey