In [1]:
# Install required packages
# !pip install python-dotenv
# !pip install numpy
# !pip install pandas
# !pip install matplotlib
# !pip install google-api-python-client
# !pip install mysql-connector-python
# !pip install sqlalchemy
# !pip install wordcloud
# !pip install vaderSentiment

In [33]:
# Import required packages
from dotenv import load_dotenv
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from googleapiclient.discovery import build
import mysql.connector
from sqlalchemy import create_engine
from datetime import datetime
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud

In [35]:
# Load environment variables from .env file
load_dotenv()

# Get YouTube API key from .env 
youtube_api_key = os.getenv("youtube_api_key")

# Get local MySQL server username from .env
mysql_user = os.getenv("mysql_user")
# Get local MySQL server password from .env
mysql_password = os.getenv("mysql_password")

# Get AWS MySQL server username from .env
aws_mysql_user = os.getenv("aws_mysql_user")
# Get AWS MySQL server password from .env
aws_mysql_password = os.getenv("aws_mysql_password")

# Extract

In [4]:
# Build the YouTube service object
youtube = build("youtube", "v3", developerKey=youtube_api_key)

## Channel data

In [8]:
# Select channels
channel_names = ["AlexTheAnalyst", "LukeBarousse", "Thuvu5"]

# Initialize an empty list to store dictionaries for each channel
channels_ls = []

# Initialize an empty list to store uploads playlist IDs of all channels
uploads_playlist_ids = []

# Loop through each channel
for channel_name in channel_names:
    # Get channel data using the YouTube Channels API
    # Note: Uses 1 out of 10.000 units from the daily usage limit 
    channel_data = youtube.channels().list(part="statistics,snippet,contentDetails", forHandle=channel_name).execute()  

    # Extract channel data in dictionary format
    channel_dict = {
        "channel_id": channel_data["items"][0]["id"],
        "channel_name": channel_data["items"][0]["snippet"]["title"],
        "views": int(channel_data["items"][0]["statistics"]["viewCount"]),
        "videos": int(channel_data["items"][0]["statistics"]["videoCount"]),
        "subscribers": int(channel_data["items"][0]["statistics"]["subscriberCount"])
    }
    
    try:
        # Try to get channel thumbnail in maximum resolution
        channel_dict["thumbnail_url"] = channel_data["items"][0]["snippet"]["thumbnails"]["maxres"]["url"]
    except KeyError:
        try:
            # If maxres is not available, get high resolution
            channel_dict["thumbnail_url"] = channel_data["items"][0]["snippet"]["thumbnails"]["high"]["url"]
        except KeyError:
            # If high resolution is not available, get default resolution
            channel_dict["thumbnail_url"] = channel_data["items"][0]["snippet"]["thumbnails"]["default"]["url"]
    
    # Append channel data in dictionary format to the list
    channels_ls.append(channel_dict)
    
    # Append uploads playlist ID to the list 
    uploads_playlist_ids.append(channel_data["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"])

# Convert list of dictionaries to pandas DataFrame
channel_df = pd.DataFrame(channels_ls) 
channel_df

Unnamed: 0,channel_id,channel_name,views,videos,subscribers,thumbnail_url
0,UC7cs8q-gJRlGwj4A8OmCmXg,Alex The Analyst,36999411,312,822000,https://yt3.ggpht.com/ytc/AIdro_l9wLnClpLKJeVm...
1,UCLLw7jmFsvfIVaUFsLs8mlQ,Luke Barousse,22581819,158,454000,https://yt3.ggpht.com/ytc/AIdro_my6YXWfudW8qM_...
2,UCJQJAI7IjbLcpsjWdSzYz0Q,Thu Vu data analytics,8276248,88,244000,https://yt3.ggpht.com/s3HLl-uzqEaqww2tkWKgjLFf...


## Video data

In [9]:
# Initialize an empty list to store dictionaries for each video
videos_ls = []

# Loop through each channel's uploads playlist
for uploads_playlist_id in uploads_playlist_ids:
    # Initialize next_page_token to None
    next_page_token = None

    # Loop through each video in the playlist
    while True:
        # Get playlist data using the YouTube PlaylistItems API 
        # Note: Each loop uses 1 out of 10.000 units from the daily usage limit (1 unit for 50 videos)
        playlist_data = youtube.playlistItems().list(
            part="snippet", 
            playlistId=uploads_playlist_id, 
            maxResults=50,
            pageToken=next_page_token
        ).execute()

        # Initialize an empty list to store video IDs
        video_ids = []

        # Extract video IDs from the playlist data
        video_ids += [video_data["snippet"]["resourceId"]["videoId"] for video_data in playlist_data["items"]]

        # Get video data using the YouTube Videos API 
        # Note: Uses 1 out of 10.000 units from the daily usage limit (1 unit for 50 videos)
        video_data = youtube.videos().list(part="statistics,snippet,contentDetails", id=video_ids).execute()    

        # Loop through each video 
        for video in video_data["items"]:
            # Extract video data in dictionary format
            video_dict = {
                "video_id": video["id"],
                "channel_id": video["snippet"]["channelId"],
                "video_title": video["snippet"]["title"],
                "video_description": video["snippet"]["description"],
                "published_at": datetime.strptime(video["snippet"]["publishedAt"], "%Y-%m-%dT%H:%M:%SZ"),
                "video_duration": video["contentDetails"]["duration"],
                "views": int(video["statistics"]["viewCount"]),
                "likes": int(video["statistics"]["likeCount"]),
                "comments": int(video["statistics"]["commentCount"])
            }

            try:
                # Try to get thumbnail in maximum resolution
                video_dict["thumbnail_url"] = video["snippet"]["thumbnails"]["maxres"]["url"]
            except KeyError:
                try:
                    # If maxres is not available, get high resolution
                    video_dict["thumbnail_url"] = video["snippet"]["thumbnails"]["high"]["url"]
                except KeyError:
                    # If high resolution is not available, get default resolution
                    video_dict["thumbnail_url"] = video["snippet"]["thumbnails"]["default"]["url"]

            # Append video data in dictionary format to the list
            videos_ls.append(video_dict)

        # Get the next page token
        next_page_token = playlist_data.get("nextPageToken")

        # Exit the loop if there are no more pages
        if next_page_token is None:
            break
        
# Convert list of dictionaries to pandas DataFrame
videos_df = pd.DataFrame(videos_ls)    
videos_df

Unnamed: 0,video_id,channel_id,video_title,video_description,published_at,video_duration,views,likes,comments,thumbnail_url
0,ZYps6TmBkWk,UC7cs8q-gJRlGwj4A8OmCmXg,Azure Account Setup + $200 Free Credits | Azur...,In this lesson we will be setting up and walki...,2024-07-16 12:00:38,PT6M37S,4056,201,25,https://i.ytimg.com/vi/ZYps6TmBkWk/maxresdefau...
1,zv1nfZTYpio,UC7cs8q-gJRlGwj4A8OmCmXg,Building a Fully Interactive Web App using Shi...,In this video we are building a Full Shiny App...,2024-07-09 12:01:07,PT59M52S,6106,235,20,https://i.ytimg.com/vi/zv1nfZTYpio/maxresdefau...
2,rn0dSsYXhIE,UC7cs8q-gJRlGwj4A8OmCmXg,Shiny for Python Components and Layouts | Cust...,In this video we look at how we can customize ...,2024-07-02 12:01:00,PT24M8S,3508,114,7,https://i.ytimg.com/vi/rn0dSsYXhIE/maxresdefau...
3,siHou7lObbo,UC7cs8q-gJRlGwj4A8OmCmXg,Shiny for Python Setup and Install | Deploying...,In this video we are going to install Shiny an...,2024-06-25 12:00:53,PT14M24S,6138,222,31,https://i.ytimg.com/vi/siHou7lObbo/maxresdefau...
4,wJiJXD7MVoY,UC7cs8q-gJRlGwj4A8OmCmXg,Is the Economy on the Brink of a Recession? | ...,The Economy isn't looking good and there's a l...,2024-06-18 12:00:06,PT21M19S,11367,693,124,https://i.ytimg.com/vi/wJiJXD7MVoY/maxresdefau...
...,...,...,...,...,...,...,...,...,...,...
554,5LWoJAh-kww,UCJQJAI7IjbLcpsjWdSzYz0Q,Data Analyst Skill Stack // How I Became A Dat...,👩🏻‍💻 My laptop and iPad for doing DS/ study 👉 ...,2021-06-16 10:16:47,PT10M35S,33230,1793,70,https://i.ytimg.com/vi/5LWoJAh-kww/maxresdefau...
555,_RzoHVWKwq4,UCJQJAI7IjbLcpsjWdSzYz0Q,Effective visual note-taking on iPad + Downloa...,Hi there! In this video I’m showing you how to...,2021-06-09 06:58:21,PT8M19S,12599,420,15,https://i.ytimg.com/vi/_RzoHVWKwq4/maxresdefau...
556,hWKLO7GtpiU,UCJQJAI7IjbLcpsjWdSzYz0Q,Data scientist is NOT the only SEXY job // All...,Hi there! In this video I explain different da...,2021-06-01 22:51:42,PT14M4S,9041,295,20,https://i.ytimg.com/vi/hWKLO7GtpiU/maxresdefau...
557,dBZqggW22rs,UCJQJAI7IjbLcpsjWdSzYz0Q,"How I take notes on iPad Pro | Notion, Notes, ...",My note-taking system tour in Notion and tips ...,2021-05-16 22:47:12,PT9M7S,12001,281,21,https://i.ytimg.com/vi/dBZqggW22rs/maxresdefau...


## Comments data

In [10]:
# Initialize an empty list to store comments
comments_ls = []

# Loop through each video
for video_id in videos_df["video_id"].values:
    # Initialize next_page_token to None
    next_page_token = None

    # Loop through data batches of 100 comments 
    while True:
        try:
            # Get data from 100 comments using the YouTube CommentThreads API 
            # Note: Each loop uses 1 out of 10.000 units from the daily usage limit (1 unit for 100 comments)
            comments_data = youtube.commentThreads().list(
                part="snippet", 
                videoId=video_id, 
                maxResults=100,
                pageToken=next_page_token
            ).execute()
        # Handle error if e.g. video comments are disabled
        except Exception as e:
            print(f"Failed to get comments for video {video_id}.")

        # Loop through each comment
        for comment in comments_data["items"]:
            # Extract comment data in dictionary format
            comment_dict = {
                "comment_id": comment["snippet"]["topLevelComment"]["id"],
                "video_id": comment["snippet"]["topLevelComment"]["snippet"]["videoId"],
                "channel_id": comment["snippet"]["topLevelComment"]["snippet"]["channelId"],
                "comment_text": comment["snippet"]["topLevelComment"]["snippet"]["textOriginal"],
                "published_at": datetime.strptime(comment["snippet"]["topLevelComment"]["snippet"]["publishedAt"], "%Y-%m-%dT%H:%M:%SZ")
            }
            # Append comment data dictionary to the list
            comments_ls.append(comment_dict)

        # Get the next page token
        next_page_token = comments_data.get("nextPageToken")

        # Exit the loop if there are no more pages
        if next_page_token is None: 
            break
        
# Convert list of dictionaries to pandas DataFrame
comments_df = pd.DataFrame(comments_ls)    
comments_df 

Failed to get comments for video W_s4c1vLGXM.
Failed to get comments for video qfyynHBFOsM.
Failed to get comments for video G4syHs3M82E.
Failed to get comments for video LJtFgnHGAos.
Failed to get comments for video r9imv1z82jQ.


Unnamed: 0,comment_id,video_id,channel_id,comment_text,published_at
0,UgzSfHTFO8ZWcg_maUF4AaABAg,ZYps6TmBkWk,UC7cs8q-gJRlGwj4A8OmCmXg,i'm ready,2024-07-22 23:04:43
1,Ugym_vQM6LOs4x1mV5N4AaABAg,ZYps6TmBkWk,UC7cs8q-gJRlGwj4A8OmCmXg,Please help me,2024-07-21 16:04:03
2,UgwuWMq45Mw5DmWuCkJ4AaABAg,ZYps6TmBkWk,UC7cs8q-gJRlGwj4A8OmCmXg,Hey Alex the social links in your channel des...,2024-07-19 23:44:53
3,Ugz9XMUa83zfOqZQ17h4AaABAg,ZYps6TmBkWk,UC7cs8q-gJRlGwj4A8OmCmXg,The future of Binance: an exclusive interview ...,2024-07-19 06:50:19
4,Ugx5zQmHZ_vs6PPI7Ht4AaABAg,ZYps6TmBkWk,UC7cs8q-gJRlGwj4A8OmCmXg,"Thanks Alex! Great series, much needed! Could ...",2024-07-18 23:40:08
...,...,...,...,...,...
58222,UgykL4IfQ7CLbONTW5t4AaABAg,QDdqsFCIxIk,UCJQJAI7IjbLcpsjWdSzYz0Q,"thank you, please i need you help!!",2022-03-07 11:10:44
58223,UgzyWHk3Kx6N3sbElMJ4AaABAg,QDdqsFCIxIk,UCJQJAI7IjbLcpsjWdSzYz0Q,Very diff but great inputs from similar topic ...,2022-02-20 00:42:23
58224,Ugw1X4GuVl6jdAB8sFJ4AaABAg,QDdqsFCIxIk,UCJQJAI7IjbLcpsjWdSzYz0Q,Replying email and control backspace was new f...,2021-08-02 15:15:34
58225,UgzDqSm_tOqxIbgclvF4AaABAg,QDdqsFCIxIk,UCJQJAI7IjbLcpsjWdSzYz0Q,Valuable insight😇,2021-07-14 10:17:53


# Transform

## Convert video duration

In [11]:
# Function to convert the YouTube video duration from ISO 8601 format (str) to seconds (int)
def convert_iso8601_duration(duration):
    # Regular expression to match hours, minutes, and seconds
    time_extractor = re.compile(r'PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?')
    # Extract hours, minutes, and seconds
    extracted = time_extractor.match(duration)
    if extracted:
        hours = int(extracted.group(1)) if extracted.group(1) else 0
        minutes = int(extracted.group(2)) if extracted.group(2) else 0
        seconds = int(extracted.group(3)) if extracted.group(3) else 0
        # Return total seconds
        total_seconds = hours * 3600 + minutes * 60 + seconds
        return total_seconds
    else:
        return 0

In [12]:
# Convert video duration in pandas DataFrame
videos_df["video_duration"] = videos_df["video_duration"].apply(convert_iso8601_duration)

# Load

## Into local MySQL database

In [None]:
# Connect to local MySQL database
connection = mysql.connector.connect(
    host = "localhost",
    port = 3306,
    user = mysql_user,
    password = mysql_password,
    database = "youtube_analytics"
)

# Create a cursor object to execute SQL queries
cursor = connection.cursor()

# Drop existing MySQL tables 
tables_to_drop = ["comments", "videos", "channels"]
for table in tables_to_drop:
    cursor.execute(f"DROP TABLE IF EXISTS {table};")
        
try:
    # Create an SQLAlchemy engine for interacting with the MySQL database
    engine = create_engine(f"mysql+mysqlconnector://{mysql_user}:{mysql_password}@localhost:3306/youtube_analytics") 
    
    # Load the YouTube channels DataFrame into the MySQL channels table
    try:
        channel_df.to_sql("channels", con=engine, if_exists="replace", index=False)
        print("Channels data successfully loaded into local MySQL database.")
    except Exception as e:
        print("Error loading channels data:", e)
    
    # Load the YouTube videos DataFrame into the MySQL videos table
    try:
        videos_df.to_sql("videos", con=engine, if_exists="replace", index=False)
        print("Videos data successfully loaded into local MySQL database.")
    except Exception as e:
        print("Error loading videos data:", e)
    
    # Load the YouTube comments DataFrame into the MySQL comments table
    try:
        comments_df.to_sql("comments", con=engine, if_exists="replace", index=False)
        print("Comments data successfully loaded into local MySQL database.")
    except Exception as e:
        print("Error loading comments data:", e)
    
except Exception as e:
    # Print error if exception occurs when connecting to the database 
    print("Error connecting to local MySQL database:", e)

finally:
    # Close the cursor and connection to free up resources
    cursor.close()
    connection.close()

## Into AWS MySQL database
Note: Make sure to establish an SSH tunnel via PuTTY to connect to the AWS RDS MySQL server instance through the EC2 instance.

In [None]:
# Connect to AWS MySQL database
connection = mysql.connector.connect(
    host = "localhost",
    port = 3308,
    user = aws_mysql_user,
    password = aws_mysql_password,
    database = "youtube_analytics"
)

# Create a cursor object to execute SQL queries
cursor = connection.cursor()

# Drop existing MySQL tables 
tables_to_drop = ["comments", "videos", "channels"]
for table in tables_to_drop:
    cursor.execute(f"DROP TABLE IF EXISTS {table};")
        
try:
    # Create an SQLAlchemy engine for interacting with the MySQL database
    engine = create_engine(f"mysql+mysqlconnector://{aws_mysql_user}:{aws_mysql_password}@localhost:3308/youtube_analytics") 
    
    # Load the YouTube channels DataFrame into the MySQL channels table
    try:
        channel_df.to_sql("channels", con=engine, if_exists="replace", index=False)
        print("Channels data successfully loaded into AWS MySQL database.")
    except Exception as e:
        print("Error loading channels data:", e)
    
    # Load the YouTube videos DataFrame into the MySQL videos table
    try:
        videos_df.to_sql("videos", con=engine, if_exists="replace", index=False)
        print("Videos data successfully loaded into AWS MySQL database.")
    except Exception as e:
        print("Error loading videos data:", e)
    
    # Load the YouTube comments DataFrame into the MySQL comments table
    try:
        comments_df.to_sql("comments", con=engine, if_exists="replace", index=False)
        print("Comments data successfully loaded into AWS MySQL database.")
    except Exception as e:
        print("Error loading comments data:", e)
    
except Exception as e:
    # Print error if exception occurs when connecting to the database 
    print("Error connecting to AWS MySQL database:", e)

finally:
    # Close the cursor and connection to free up resources
    cursor.close()
    connection.close()

# Sentiment analysis

To identify the most effective sentiment analysis method for YouTube comments, we will compare three models: VADER, DistilBERT, and RoBERTa. Each will be applied to a dataset of 50 randomly selected comments. VADER employs a rule-based approach, while DistilBERT and RoBERTa are machine learning-based approaches.

## Fetch example comments
Fetch 50 random comments from the AWS RDS MySQL comments table.

Note: Make sure to establish an SSH tunnel via PuTTY to connect to the AWS RDS MySQL server instance through the EC2 instance.

In [37]:
try:
    # Create an SQLAlchemy engine for interacting with the MySQL database
    engine = create_engine(f"mysql+mysqlconnector://{aws_mysql_user}:{aws_mysql_password}@localhost:3308/youtube_analytics") 
    
    # MySQL query to fetch 50 random comments
    query = "SELECT * FROM comments ORDER BY RAND() LIMIT 50"
    
    # Execute query to load the comments from the MySQL comments table into a pandas DataFrame
    try:
        random_comments_df = pd.read_sql(query, engine)
        print("50 random comments successfully fetched from the AWS MySQL database.")
    except Exception as e:
        print("Error fetching random comments from the AWS MySQL database:", e)
    
except Exception as e:
    # Print error if exception occurs when connecting to the database 
    print("Error connecting to AWS MySQL database:", e)

finally:
    # Close the database connection
    engine.dispose()

50 random comments successfully fetched from the AWS MySQL database.


In [39]:
random_comments_df

Unnamed: 0,comment_id,video_id,channel_id,comment_text,published_at
0,UgxT6HsOQ2C0pifnQcx4AaABAg,M2ySRYpo9S0,UC7cs8q-gJRlGwj4A8OmCmXg,One nice thing is that I read you can potentia...,2021-04-06 22:54:08
1,Ugx_C5utab-eTH5c4Lh4AaABAg,wgRwITQHszU,UC7cs8q-gJRlGwj4A8OmCmXg,i want to say thank you sir! Alex The Analyst\...,2024-06-27 06:03:18
2,Ugxg_fCY_Y0Ah-_JcZh4AaABAg,KRXSJb9ql1Y,UC7cs8q-gJRlGwj4A8OmCmXg,Hmmm... Wonder how I managed to get my commen...,2024-05-14 02:08:59
3,UgyToBpiOh4x-u-nLbx4AaABAg,NQSe-SuykJU,UCLLw7jmFsvfIVaUFsLs8mlQ,"dude I don't know if you know this, but RP cam...",2023-02-02 00:15:12
4,Ugy7az9R_be4_xLWeZF4AaABAg,9RRQtNnq3s0,UC7cs8q-gJRlGwj4A8OmCmXg,Hi Alex. Thanks a lot for this video! One qu...,2021-04-13 11:27:40
5,UgwskNedGDZS5UWiR5R4AaABAg,tpGawyNMRLM,UCJQJAI7IjbLcpsjWdSzYz0Q,I work at fedex and i just saw your play butto...,2023-01-30 05:59:36
6,Ugz1VaDlGpYn7PdsNcp4AaABAg,wnapnTAMj68,UC7cs8q-gJRlGwj4A8OmCmXg,It’s not worth it. But this is a google platfo...,2022-03-17 19:53:56
7,UgyWgwipQBo9nsZ4yTp4AaABAg,dMHWOhgzUhU,UC7cs8q-gJRlGwj4A8OmCmXg,Thanks for your detailed tips 👍,2021-06-10 13:52:29
8,Ugz4R5zvS3NFzOLLANF4AaABAg,aJ9Q10v8Nrc,UC7cs8q-gJRlGwj4A8OmCmXg,This was great!! Has anyone signed up for the ...,2022-05-11 21:39:51
9,Ugx-QEAKvdva7l_HGzh4AaABAg,fUpChfNN5Uo,UC7cs8q-gJRlGwj4A8OmCmXg,Additionally you need to have excellent presen...,2021-10-11 06:39:55


## VADER
VADER (Valence Aware Dictionary for sEntiment Reasoning) is a sentiment analysis tool designed specifically for social media texts. It employs a rule-based approach, leveraging a pre-defined lexicon of words and phrases along with grammatical and syntactical rules to determine the sentiment of a given text. The tool is particularly effective in handling the informal and colloquial language often found in social media posts, including the use of emoticons, acronyms, and slang.

In [21]:
# Initialize the VADER sentiment intensity analyzer
vader_sia = SentimentIntensityAnalyzer()

# Define a function to get sentiment scores
def get_sentiment(text):
    return vader_sia.polarity_scores(text)["compound"]

# Apply sentiment analysis to each comment and store sentiment scores in new column 
comments_df["sentiment_score"] = comments_df["comment_text"].apply(get_sentiment)

# Define a function to categorize the sentiment scores into positve, negative, or neutral
def categorize_sentiment(score):
    if score > 0.05:
        return "Positive"
    elif score < -0.05:
        return "Negative"
    else:
        return "Neutral"

# Apply sentiment categorization to each comment and store sentiment category in new column 
comments_df["sentiment"] = comments_df["sentiment_score"].apply(categorize_sentiment)

In [22]:
# Display the first few rows to verify the results
pd.set_option('display.max_colwidth', None)
comments_df[["comment_text", "sentiment_score", "sentiment"]].head(20)

Unnamed: 0,comment_text,sentiment_score,sentiment
0,i'm ready,0.3612,Positive
1,Please help me,0.6124,Positive
2,Hey Alex the social links in your channel description can't be accessible you need to fix that and you Gmail isn't there too.,0.0,Neutral
3,The future of Binance: an exclusive interview with the CEO,0.128,Positive
4,"Thanks Alex! Great series, much needed! Could you also do an end- to-end project with different data services like Synapse, ADF, Databricks and Power BI?",0.8774,Positive
5,An insider's perspective: exclusive interview with Binance's CEO on future developments,0.128,Positive
6,I was just thinking of where to learn about azure and here we go .one more bootcamp from the best da teacher.in whole youtube right now🎉,0.7845,Positive
7,You're the best!!!! Mr Freberg❤❤,0.941,Positive
8,"Hello @Alex_The_Analyst and everyone,\nPlease i really need your help to sign up on Microsoft Azure, I reside in Nigeria and my debit card is not accepted. Here is the error message ""We’re unable to validate the credit card information you provided"" Please help.🙏\n\nThank you.",0.9072,Positive
9,Where do you begin to start configuring Azure?,0.0,Neutral


## DistilBERT
DistilBERT (Distilled BERT) is a machine learning-based model that is a smaller, faster, and more efficient version of BERT (Bidirectional Encoder Representations from Transformers). With 66 million parameters, the [distilbert-base-uncased-finetuned-sst-2-english](https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english) model by Hugging Face maintains around 97% of BERT's performance while being significantly faster and lighter. Fine-tuned on the Stanford Sentiment Treebank (SST-2) dataset, it achieves 91.3% accuracy in sentiment classification, close to BERT's 92.7%. The model performs binary sentiment classification, identifying text as either positive or negative. This model is ideal for sentiment analysis tasks, such as social media monitoring, customer feedback analysis, and brand sentiment tracking, especially on devices with limited computational resources.

## RoBERTa 
RoBERTa (Robustly Optimized BERT Pretraining Approach) is an enhanced version of BERT, featuring 125 million parameters and improved performance due to larger datasets and optimized hyperparameters. The [twitter-roberta-base-sentiment-latest](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest) model by [cardiffnlp](https://huggingface.co/cardiffnlp) is trained on approximately 124 million tweets and fine-tuned using the TweetEval benchmark. Unlike DistilBERT, RoBERTa performs ternary classification, categorizing text as positive, negative, or neutral. Its ability to understand context and detect subtle sentiment cues makes it ideal for analyzing diverse and context-rich social media comments.

# Word clouds

Word clouds to visualize the main topics of each YouTube channel.

In [None]:
# Loop through each channel
for channel_id in channel_df["channel_id"].values:
    # Print channel name
    print(channel_df[channel_df["channel_id"]==channel_id]["channel_name"].values[0])

    # Combine all video titles into a single string
    text = " ".join(videos_df[videos_df["channel_id"]==channel_id]["video_title"])
    
    # Create a WordCloud object
    wordcloud = WordCloud(width=800, height=400, background_color="white", random_state=7)

    # Create a word cloud of the video titles
    wordcloud.generate(text)

    # Display the word cloud 
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")  # Turn off the axis
    plt.show()