# Exploring the YouTube Dominance of Leading Music Artists: An EDA Projects using YouTube API in Python

## DATA GENERATION

 *This notebook contains data generation part , further analysis is performed in a separate notebook ("Youtube_Data_Analysis.ipynb")*

## Importing Libraries

In [None]:
pip install --upgrade google-api-python-client

In [None]:
pip install --upgrade google-auth-oauthlib google-auth-httplib2

In [None]:
pip install -U rdflib

In [4]:
#importing libraries
import pandas as pd
import numpy as np
from dateutil import parser
import isodate

# Data visualization libraries
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
sns.set(style="darkgrid", color_codes=True)

# Google API
from googleapiclient.discovery import build


# NLP libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from wordcloud import WordCloud

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
pd.set_option('display.max_columns', 500)

## Data Generation with youtube API

In [6]:
api_key = "AIzaSyBrRMW-jdjMjsH2sgSDWHRnrrA33wTzdyc"


channel_ids = ['UCqECaJ8Gagnn7YCbPEzWH6g', #TaylorSwift
               'UCIwFjwMjI0y7PDBVEO9-bkQ', #justinbieber
               'UCPNxhDvTcytIdvwXWAm43cA', #selenagomez
               'UC0C-w0YjGpqDXGB8IHb662A', #EdSheeran
               'UCZFWPqqPkFlNwIxcpsLOwew', #HarryStyles
               'UCiGm_E4ZwYSHV3bcW1pnSeQ', #BillieEilish
               'UC9CoOnJkIBMdeijd9qYoT_g', #ArianaGrande
               'UCYvmuw-JtVrTZQ-7Y4kd63Q', #KatyPerry
               'UCoUM-UJ7rirJYP8CQ0EIaHA', #brunomars
               'UC-J-KZfRV8c13fOCkhXdLiQ', #dualipa
               ]


youtube = build('youtube', 'v3', developerKey=api_key)

In [7]:
def get_channel_stats(youtube, channel_ids):
    """
    Get channel statistics: title, subscriber count, view count, video count, upload playlist
    Params:

    youtube: the build object from googleapiclient.discovery
    channels_ids: list of channel IDs

    Returns:
    Dataframe containing the channel statistics for all channels in the provided list: title, subscriber count, view count, video count, upload playlist

    """
    all_data = []
    request = youtube.channels().list(
                part='snippet,contentDetails,statistics',
                id=','.join(channel_ids))
    response = request.execute()

    for i in range(len(response['items'])):
        data = dict(channelName = response['items'][i]['snippet']['title'],
                    subscribers = response['items'][i]['statistics']['subscriberCount'],
                    views = response['items'][i]['statistics']['viewCount'],
                    totalVideos = response['items'][i]['statistics']['videoCount'],
                    playlistId = response['items'][i]['contentDetails']['relatedPlaylists']['uploads'])
        all_data.append(data)

    return pd.DataFrame(all_data)

def get_video_ids(youtube, playlist_id):
    """
    Get list of video IDs of all videos in the given playlist
    Params:

    youtube: the build object from googleapiclient.discovery
    playlist_id: playlist ID of the channel

    Returns:
    List of video IDs of all videos in the playlist

    """

    request = youtube.playlistItems().list(
                part='contentDetails',
                playlistId = playlist_id,
                maxResults = 50)
    response = request.execute()

    video_ids = []

    for i in range(len(response['items'])):
        video_ids.append(response['items'][i]['contentDetails']['videoId'])

    next_page_token = response.get('nextPageToken')
    more_pages = True

    while more_pages:
        if next_page_token is None:
            more_pages = False
        else:
            request = youtube.playlistItems().list(
                        part='contentDetails',
                        playlistId = playlist_id,
                        maxResults = 50,
                        pageToken = next_page_token)
            response = request.execute()

            for i in range(len(response['items'])):
                video_ids.append(response['items'][i]['contentDetails']['videoId'])

            next_page_token = response.get('nextPageToken')

    return video_ids

def get_video_details(youtube, video_ids):
    """
    Get video statistics of all videos with given IDs
    Params:

    youtube: the build object from googleapiclient.discovery
    video_ids: list of video IDs

    Returns:
    Dataframe with statistics of videos, i.e.:
        'channelTitle', 'title', 'description', 'tags', 'publishedAt'
        'viewCount', 'likeCount', 'favoriteCount', 'commentCount'
        'duration', 'definition', 'caption'
    """

    all_video_info = []

    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=','.join(video_ids[i:i+50])
        )
        response = request.execute()

        for video in response['items']:
            stats_to_keep = {'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
                             'statistics': ['viewCount', 'likeCount', 'favouriteCount', 'commentCount'],
                             'contentDetails': ['duration', 'definition', 'caption']
                            }
            video_info = {}
            video_info['video_id'] = video['id']

            for k in stats_to_keep.keys():
                for v in stats_to_keep[k]:
                    try:
                        video_info[v] = video[k][v]
                    except:
                        video_info[v] = None

            all_video_info.append(video_info)

    return pd.DataFrame(all_video_info)

def get_comments_in_videos(youtube, video_ids):
    """
    Get top level comments as text from all videos with given IDs (only the first 10 comments due to quote limit of Youtube API)
    Params:

    youtube: the build object from googleapiclient.discovery
    video_ids: list of video IDs

    Returns:
    Dataframe with video IDs and associated top level comment in text.

    """
    all_comments = []

    for video_id in video_ids:
        try:
            request = youtube.commentThreads().list(
                part="snippet,replies",
                videoId=video_id
            )
            response = request.execute()

            comments_in_video = [comment['snippet']['topLevelComment']['snippet']['textOriginal'] for comment in response['items'][0:10]]
            comments_in_video_info = {'video_id': video_id, 'comments': comments_in_video}

            all_comments.append(comments_in_video_info)

        except:
            # When error occurs - most likely because comments are disabled on a video
            print('Could not get comments for video ' + video_id)

    return pd.DataFrame(all_comments)

## Pulling the CHANNEL statistics

In [8]:
channel_data = get_channel_stats(youtube, channel_ids)

In [9]:
channel_data

Unnamed: 0,channelName,subscribers,views,totalVideos,playlistId
0,Justin Bieber,72300000,31349772873,249,UUIwFjwMjI0y7PDBVEO9-bkQ
1,Harry Styles,14700000,6090073794,17,UUZFWPqqPkFlNwIxcpsLOwew
2,Billie Eilish,48900000,14330078464,79,UUiGm_E4ZwYSHV3bcW1pnSeQ
3,Dua Lipa,22700000,12017489077,187,UU-J-KZfRV8c13fOCkhXdLiQ
4,Taylor Swift,55600000,32355797042,225,UUqECaJ8Gagnn7YCbPEzWH6g
5,Katy Perry,44600000,26004363145,135,UUYvmuw-JtVrTZQ-7Y4kd63Q
6,Bruno Mars,37400000,20199607485,99,UUoUM-UJ7rirJYP8CQ0EIaHA
7,Ed Sheeran,53900000,31159950083,472,UU0C-w0YjGpqDXGB8IHb662A
8,Selena Gomez,34300000,13579860782,192,UUPNxhDvTcytIdvwXWAm43cA
9,Ariana Grande,53100000,24673381554,154,UU9CoOnJkIBMdeijd9qYoT_g


In [10]:
channel_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   channelName  10 non-null     object
 1   subscribers  10 non-null     object
 2   views        10 non-null     object
 3   totalVideos  10 non-null     object
 4   playlistId   10 non-null     object
dtypes: object(5)
memory usage: 528.0+ bytes


In [11]:
# Convert count columns to numeric columns
numeric_cols = ['subscribers', 'views', 'totalVideos']
channel_data[numeric_cols] = channel_data[numeric_cols].apply(pd.to_numeric, errors='coerce')

In [12]:
channel_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   channelName  10 non-null     object
 1   subscribers  10 non-null     int64 
 2   views        10 non-null     int64 
 3   totalVideos  10 non-null     int64 
 4   playlistId   10 non-null     object
dtypes: int64(3), object(2)
memory usage: 528.0+ bytes


In [13]:
import plotly.express as px

# Assuming you have the channel_data DataFrame
fig = px.bar(channel_data, x='channelName', y='subscribers', color='channelName',
             labels={'channelName': 'Channel Name', 'subscribers': 'Subscribers'},
             title='Subscribers by Channel',
             height=500, width=700, color_discrete_sequence=px.colors.qualitative.Set3)

# Format y-axis labels to display in thousands
fig.update_layout(yaxis=dict(tickformat=',.0f', title='Subscribers (in thousands)'))

# Rotate x-axis labels for better readability
fig.update_xaxes(tickangle=45, tickmode='array')

# Show the plot
fig.show()


In [14]:
import plotly.express as px

# Assuming you have the channel_data DataFrame
fig = px.bar(channel_data.sort_values('views', ascending=False),
             x='channelName', y='views',
             labels={'channelName': 'Channel Name', 'views': 'Views'},
             title='Views by Channel',
             height=500, width=700,
             color='channelName', color_discrete_sequence=px.colors.qualitative.Bold)

# Format y-axis labels to display in thousands
fig.update_layout(yaxis=dict(tickformat=',.0f', title='Views (in thousands)'))

# Rotate x-axis labels for better readability
fig.update_xaxes(tickangle=45, tickmode='array')

# Show the plot
fig.show()


## Pulling VIDEO statistics for all channels

In [15]:
# Create a dataframe with video statistics and comments from all channels

video_df = pd.DataFrame()
comments_df = pd.DataFrame()

for c in channel_data['channelName'].unique():
    print("Getting video information from channel: " + c)
    playlist_id = channel_data.loc[channel_data['channelName']== c, 'playlistId'].iloc[0]
    video_ids = get_video_ids(youtube, playlist_id)

    # get video data
    video_data = get_video_details(youtube, video_ids)
    # get comment data
    comments_data = get_comments_in_videos(youtube, video_ids)

    # append video data together and comment data toghether
    video_df = video_df.append(video_data, ignore_index=True)
    comments_df = comments_df.append(comments_data, ignore_index=True)

Getting video information from channel: Justin Bieber




Could not get comments for video gOr6L844U9I




Could not get comments for video Yio7S8uMNeE




Could not get comments for video y1iJV2snVoA




Could not get comments for video jON5yjNDM_U




Could not get comments for video 7U9CKtcIwwk




Could not get comments for video BgcF5Pu8lRg




Could not get comments for video trG0aD4Buv4
Could not get comments for video ko8gepM8MBU




Could not get comments for video KJWZSEkCrAM




Could not get comments for video Zv4ftT043S0




Could not get comments for video 5yIGhsydtUQ



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Getting video information from channel: Harry Styles




Could not get comments for video QFyK9yYX24E




Could not get comments for video gEgAW-M-qOk



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Getting video information from channel: Billie Eilish




Could not get comments for video 0wpBUcYzFG0




Could not get comments for video 3YolugGV-Og



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Getting video information from channel: Dua Lipa




Could not get comments for video kpAzEXMqP9c




Could not get comments for video V9ovFDzDHTI



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Getting video information from channel: Taylor Swift




Could not get comments for video oWVYzCPs3nE




Could not get comments for video e7HLU5-0VU4



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Getting video information from channel: Katy Perry




Could not get comments for video al72T0ju-e4




Could not get comments for video LvY3C-pGq4o




Could not get comments for video GDQOCchuGOk




Could not get comments for video knhzSEv5Xpk




Could not get comments for video PVJ37H3IEds
Could not get comments for video Swjh99uKfSo




Could not get comments for video dc7D0Ay78OE




Could not get comments for video RzJsVEnf6lU




Could not get comments for video YHROHJlU_Ng



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Getting video information from channel: Bruno Mars




Could not get comments for video 29oLcLcl5R4



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Getting video information from channel: Ed Sheeran




Could not get comments for video Soy4RpKlcM8




Could not get comments for video A8cXaCtUrT8




Could not get comments for video Xv4oBVBHoQA



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Getting video information from channel: Selena Gomez




Could not get comments for video _UNVnF8FqcI




Could not get comments for video 8sG9ieizdEQ




Could not get comments for video NvlE1NyGyEs




Could not get comments for video BWJPQp8pOF0




Could not get comments for video OZdZLeFjvXY
Could not get comments for video aRqdRaZdN6o




Could not get comments for video lmWO7hlyvns




Could not get comments for video 9CuE9XZJRX8



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Getting video information from channel: Ariana Grande




Could not get comments for video Wxjo9OJskYM
Could not get comments for video StW5s8UA2YU



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [16]:
video_df

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption
0,z5AIMF1BjOE,Justin Bieber,Drew House TOKYO #shorts,Follow Justin: \nhttp://facebook.com/justinbie...,"[justin, bieber, justinbieber, purpose, myworl...",2022-11-21T16:29:07Z,1487703,37450,,3248,PT42S,hd,false
1,_lpSCOZ1PCo,Justin Bieber,Justin Bieber X Free Fire - Beautiful Love (Fr...,The official “Beautiful Love (Free Fire)” musi...,"[justin, bieber, justinbieber, purpose, myworl...",2022-09-18T13:00:18Z,7360042,276241,,11196,PT3M33S,hd,false
2,ZMH9AMwAk34,JustinBieberVEVO,"Justin Bieber, Don Toliver - The Making of 'Ho...",Justin Bieber ft. Don Toliver - The Making of...,"[Justin, Bieber, Don, Toliver, Honest, (The, M...",2022-06-15T16:00:17Z,4242600,101582,,4940,PT3M32S,hd,true
3,-78y2WW4YDA,Justin Bieber,#shorts,Follow Justin: \nhttp://facebook.com/justinbie...,"[justin, bieber, justinbieber, purpose, myworl...",2022-05-31T17:54:49Z,1397112,82416,,1706,PT11S,hd,false
4,UWrBS-4pSfM,Justin Bieber,"May 13, 2022",,,2022-05-13T16:28:59Z,1317965,53664,,902,PT4S,hd,false
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3065,SaF-hAQA99Y,Ariana Grande,wannabe,i love you frany hah!,"[wannabe, spice, girls, osnapitzari, LOL]",2007-03-03T22:32:48Z,19349430,2267338,,123239,PT1M17S,sd,false
3066,LBKoSOPUMFE,Ariana Grande,its not unusual,this is me and my bff dancing lolll,"[its, not, unusual, tom, jones, lol, osnapitza...",2007-02-03T02:48:54Z,1179785,83256,,5579,PT2M5S,sd,false
3067,qXj9Lw7Geg8,Ariana Grande,dancing grandparents,these are my grandparents... =],"[dancing, grandparents, osnapitzari]",2007-01-23T03:09:43Z,606277,21293,,1713,PT2M48S,sd,false
3068,toIjc7ywxc8,Ariana Grande,Shoes,This is a shoes music video that i made with m...,"[shoes, ariana, osnapitzari]",2007-01-23T02:48:19Z,2424212,141131,,12344,PT2M50S,sd,false


In [17]:
comments_df

Unnamed: 0,video_id,comments
0,z5AIMF1BjOE,[I don't see your new videos for long time\nWh...
1,_lpSCOZ1PCo,"[Who is for army 😂😂😂 ?, Its been a year alread..."
2,ZMH9AMwAk34,"[💪💪💪, ✌Peace, lol, I spilled my peppermint tea..."
3,-78y2WW4YDA,"[Justin, lol, 저스틴비버 에게(저스틴비버야 이글이해석이어려우면 여기글캡쳐..."
4,UWrBS-4pSfM,"[jb alive?, I love JUSTIN BIVER😘😘😘😘😘😰😰😰😰😰😰😰😰😰😍..."
...,...,...
3023,SaF-hAQA99Y,[Why did youtube recommend me this after 16 ye...
3024,LBKoSOPUMFE,[knowing that someone is so famise is actuly a...
3025,qXj9Lw7Geg8,"[Ya casi 17 años..., Haha ❤❤❤, RIP grandpa gra..."
3026,toIjc7ywxc8,"[Dec. 2023 gang\n👇, SHOES!, 16 years agooo, He..."


In [18]:
# Write video data to CSV file for future references
video_df.to_csv('video_data_top10_channels_singers.csv')
comments_df.to_csv('comments_data_top10_channels_singers.csv')