In [None]:
# Install required packages
# !pip install python-dotenv
# !pip install numpy
# !pip install pandas
# !pip install google-api-python-client
# !pip install mysql-connector-python
# !pip install sqlalchemy

In [3]:
# Import required packages
from dotenv import load_dotenv
import os
import numpy as np
import pandas as pd
from googleapiclient.discovery import build
import mysql.connector
from sqlalchemy import create_engine

In [4]:
# Load environment variables from .env file
load_dotenv()

# Get YouTube API key from .env 
youtube_api_key = os.getenv("youtube_api_key")
# Get MySQL username from .env
mysql_user = os.getenv("mysql_user")
# Get MySQL password from .env
mysql_password = os.getenv("mysql_password")

In [5]:
# Build the YouTube service object
youtube = build("youtube", "v3", developerKey=youtube_api_key)

In [6]:
# Extract data from a single YouTube channel

# Define channel name
channel_name = "AlexTheAnalyst"

# Get channel data using the YouTube Channels API
# Note: Uses 1 out of 10.000 units from the daily usage limit 
channel_data = youtube.channels().list(part="statistics,snippet,contentDetails", forHandle=channel_name).execute()  

# Extract channel data and store as pandas DataFrame
channel_df = pd.DataFrame([{
    "channel_id": channel_data["items"][0]["id"],
    "channel_name": channel_data["items"][0]["snippet"]["title"],
    "views": int(channel_data["items"][0]["statistics"]["viewCount"]),
    "videos": int(channel_data["items"][0]["statistics"]["videoCount"]),
    "subscribers": int(channel_data["items"][0]["statistics"]["subscriberCount"])
}])
channel_df

Unnamed: 0,channel_id,channel_name,views,videos,subscribers
0,UC7cs8q-gJRlGwj4A8OmCmXg,Alex The Analyst,31963894,294,735000


In [7]:
# Extract video data

# Initialize an empty list to store dictionaries for each video
videos_ls = []

# Extract uploads playlist ID containing all videos of the channel 
uploads_playlist_id = channel_data["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]

# Initialize next_page_token to None
next_page_token = None

# Loop through each video in the uploads playlist
while True:
    # Get playlist data using the YouTube PlaylistItems API 
    # Note: Each loop uses 1 out of 10.000 units from the daily usage limit (1 unit for 50 videos)
    playlist_data = youtube.playlistItems().list(
        part="snippet", 
        playlistId=uploads_playlist_id, 
        maxResults=50,
        pageToken=next_page_token
    ).execute()
    
    # Initialize an empty list to store video IDs
    video_ids = []

    # Extract video IDs from the playlist data
    video_ids += [video_data["snippet"]["resourceId"]["videoId"] for video_data in playlist_data["items"]]
    
    # Get video data using the YouTube Videos API 
    # Note: Uses 1 out of 10.000 units from the daily usage limit (1 unit for 50 videos)
    video_data = youtube.videos().list(part="statistics,snippet,contentDetails", id=video_ids).execute()    

    # Loop through each video 
    for video in video_data["items"]:
        # Extract video data in dictionary format
        video_dict = {
            "video_id": video["id"],
            "channel_id": video["snippet"]["channelId"],
            "video_title": video["snippet"]["title"],
            "video_description": video["snippet"]["description"],
            "published_at": video["snippet"]["publishedAt"],
            "video_duration": video["contentDetails"]["duration"],
            "views": video["statistics"]["viewCount"],
            "likes": video["statistics"]["likeCount"],
            "comments": video["statistics"]["commentCount"],
        }
        
        try:
            # Try to get thumbnail in maximum resolution
            video_dict["thumbnail_url"] = video["snippet"]["thumbnails"]["maxres"]["url"]
        except KeyError:
            # If maxres is not available, get default resolution
            video_dict["thumbnail_url"] = video["snippet"]["thumbnails"]["default"]["url"]
        
        # Append video data in dictionary format to the list
        videos_ls.append(video_dict)
 
    # Get the next page token
    next_page_token = playlist_data.get("nextPageToken")
    
    # Exit the loop if there are no more pages
    if next_page_token is None:
        break
        
# Convert list of dictionaries to pandas DataFrame
videos_df = pd.DataFrame(videos_ls)    
videos_df

Unnamed: 0,video_id,channel_id,video_title,video_description,published_at,video_duration,views,likes,comments,thumbnail_url
0,7b8ViCqD9JM,UC7cs8q-gJRlGwj4A8OmCmXg,How to give up on the job search fast,💻Analyst Builder - https://www.analystbuilder....,2024-03-27T12:19:35Z,PT52S,2717,182,7,https://i.ytimg.com/vi/7b8ViCqD9JM/maxresdefau...
1,7vnxpcqmqNQ,UC7cs8q-gJRlGwj4A8OmCmXg,Stored Procedures in MySQL | Advanced MySQL Se...,Full MySQL Course: https://www.analystbuilder....,2024-03-26T12:00:12Z,PT12M37S,2414,100,10,https://i.ytimg.com/vi/7vnxpcqmqNQ/maxresdefau...
2,uEk07jXdKOo,UC7cs8q-gJRlGwj4A8OmCmXg,Temp Tables in MySQL | Advanced MySQL Series,Full MySQL Course: https://www.analystbuilder....,2024-03-19T12:00:56Z,PT7M46S,4580,158,8,https://i.ytimg.com/vi/uEk07jXdKOo/maxresdefau...
3,UC7uvOqcUTs,UC7cs8q-gJRlGwj4A8OmCmXg,CTEs in MySQL | Advanced MySQL Series,Full MySQL Course: https://www.analystbuilder....,2024-03-12T12:00:23Z,PT10M31S,7429,228,15,https://i.ytimg.com/vi/UC7uvOqcUTs/maxresdefau...
4,1KEbiqRWOkA,UC7cs8q-gJRlGwj4A8OmCmXg,7 Mistakes to Avoid During Your Data Analyst J...,When I was a Hiring Managers I saw a lot of pe...,2024-03-05T13:00:01Z,PT11M54S,15324,573,49,https://i.ytimg.com/vi/1KEbiqRWOkA/maxresdefau...
...,...,...,...,...,...,...,...,...,...,...
289,4rfr6A3lO-Y,UC7cs8q-gJRlGwj4A8OmCmXg,Data Analyst Resume | Reviewing My Resume! | F...,Data Analyst Resume | Reviewing My Resume! | F...,2020-01-30T14:07:55Z,PT7M33S,69879,1641,64,https://i.ytimg.com/vi/4rfr6A3lO-Y/default.jpg
290,OTq2NRy_AGs,UC7cs8q-gJRlGwj4A8OmCmXg,Working at a Big Company Vs Small Company | To...,Working at a Big Company Vs Small Company | To...,2020-01-25T16:38:39Z,PT5M50S,14941,404,22,https://i.ytimg.com/vi/OTq2NRy_AGs/default.jpg
291,ya28cb3zFGE,UC7cs8q-gJRlGwj4A8OmCmXg,Data Analyst Salary | 100k with No Experience,Data Analyst Salary | 100k with No Experience ...,2020-01-23T03:16:09Z,PT5M3S,63565,2172,227,https://i.ytimg.com/vi/ya28cb3zFGE/default.jpg
292,Hsi2BG0SOiQ,UC7cs8q-gJRlGwj4A8OmCmXg,Truth About Big Companies | Told by a Fortune ...,Truth About Big Companies // There are a ton o...,2020-01-21T03:52:15Z,PT5M45S,8643,316,18,https://i.ytimg.com/vi/Hsi2BG0SOiQ/default.jpg


In [23]:
# Extract comments data

# Initialize an empty list to store comments
comments_ls = []

# Loop through each video
for video_id in videos_df["video_id"].values:
    # Initialize next_page_token to None
    next_page_token = None

    # Loop through data batches of 100 comments 
    while True:
        # Get data from 100 comments using the YouTube CommentThreads API 
        # Note: Each loop uses 1 out of 10.000 units from the daily usage limit (1 unit for 100 comments)
        comments_data = youtube.commentThreads().list(
            part="snippet", 
            videoId=video_id, 
            maxResults=100,
            pageToken=next_page_token
        ).execute()

        # Loop through each comment
        for comment in comments_data["items"]:
            # Extract comment data in dictionary format
            comment_dict = {
                "comment_id": comment["snippet"]["topLevelComment"]["id"],
                "video_id": comment["snippet"]["topLevelComment"]["snippet"]["videoId"],
                "channel_id": comment["snippet"]["topLevelComment"]["snippet"]["channelId"],
                "comment_text": comment["snippet"]["topLevelComment"]["snippet"]["textOriginal"]
            }
            # Append comment data dictionary to the list
            comments_ls.append(comment_dict)

        # Get the next page token
        next_page_token = playlist_data.get("nextPageToken")

        # Exit the loop if there are no more pages
        if next_page_token is None: 
            break
        
# Convert list of dictionaries to pandas DataFrame
comments_df = pd.DataFrame(comments_ls)    
comments_df 

Unnamed: 0,comment_id,video_id,channel_id,comment_text
0,Ugxzv5XgGPn-kzjBT9h4AaABAg,7b8ViCqD9JM,UC7cs8q-gJRlGwj4A8OmCmXg,Recruiters are great - but they really don't a...
1,UgzgH61hm2kJ6LJUaDd4AaABAg,7b8ViCqD9JM,UC7cs8q-gJRlGwj4A8OmCmXg,What’s a recruiter
2,Ugw2t6Ako-Qo5q7Ti0R4AaABAg,7b8ViCqD9JM,UC7cs8q-gJRlGwj4A8OmCmXg,Thanks for this information. I am about starti...
3,Ugy1U2pk7U6go4UjIpV4AaABAg,7b8ViCqD9JM,UC7cs8q-gJRlGwj4A8OmCmXg,Exactly what I do
4,Ugx3e3rDnw4cRF1M5dp4AaABAg,7b8ViCqD9JM,UC7cs8q-gJRlGwj4A8OmCmXg,"Heyy Alex, please do video on how to get freel..."
...,...,...,...,...
16033,UgztX5Zp0jjsOBtRqdp4AaABAg,6lQzbk6_OTw,UC7cs8q-gJRlGwj4A8OmCmXg,"Hey Alex, what do you think about COGNOS?"
16034,Ugx5i3bb5-5V8zNge_x4AaABAg,6lQzbk6_OTw,UC7cs8q-gJRlGwj4A8OmCmXg,"Hi Alex,\nfound your channel on Reddit and am ..."
16035,Ugz9os89TlxWUGtj0zR4AaABAg,6lQzbk6_OTw,UC7cs8q-gJRlGwj4A8OmCmXg,"Great video, Alex! I definitely agree that Exc..."
16036,UgxgUXmGkengAMwgAGt4AaABAg,6lQzbk6_OTw,UC7cs8q-gJRlGwj4A8OmCmXg,"Hey Alex, great video, just went through all o..."


In [24]:
# Load data from Pandas DataFrames into MySQL tables

# Connect to MySQL database
connection = mysql.connector.connect(
    host = "localhost",
    user = mysql_user,
    password = mysql_password,
    database = "youtube_analytics"
)

# Create a cursor object for interacting with the database
cursor = connection.cursor()

try:
    # Create a SQLAlchemy engine for interacting with the MySQL database
    engine = create_engine(f"mysql+mysqlconnector://{mysql_user}:{mysql_password}@localhost/youtube_analytics")
    
    # Load the YouTube channels DataFrame into the MySQL channels table
    try:
        channel_df.to_sql("channels", con=engine, if_exists="replace", index=False)
        print("Channels data successfully loaded into MySQL database.")
    except Exception as e:
        print("Error loading channels data:", e)
    
    # Load the YouTube videos DataFrame into the MySQL videos table
    try:
        videos_df.to_sql("videos", con=engine, if_exists="replace", index=False)
        print("Videos data successfully loaded into MySQL database.")
    except Exception as e:
        print("Error loading videos data:", e)
    
    # Load the YouTube comments DataFrame into the MySQL comments table
    try:
        comments_df.to_sql("comments", con=engine, if_exists="replace", index=False)
        print("Comments data successfully loaded into MySQL database.")
    except Exception as e:
        print("Error loading comments data:", e)
    
except Exception as e:
    # Print error if exception occurs when connecting to the database 
    print("Error connecting to MySQL database:", e)

finally:
    # Close the cursor and connection to free up resources
    cursor.close()
    connection.close()

Channels data successfully loaded into MySQL database.
Videos data successfully loaded into MySQL database.
Comments data successfully loaded into MySQL database.
