In [1]:
from googleapiclient.discovery import build
import pandas as pd
from IPython.display import JSON

In [2]:
api_key = 'AIzaSyBF_nxPBBYcNZk14oCb5fYypr9AC9LdBzw'

In [3]:
# Channel ids of the youtube channels we're interested in

channel_ids = ['UCJQJAI7IjbLcpsjWdSzYz0Q', # Thu Vu data analytics
               'UC2UXDak6o7rBm23k3Vv5dww', # Tina Huang
               'UC7cs8q-gJRlGwj4A8OmCmXg', # Alex the Analyst
               'UC6DnLOwz8R0iZPmkZ3vMM1g', # datdata
               'UC3Bl85nLYtHHxrE1Uzkd5oA', # Grow Up
               'UCwXJEwaFT5i3MKKMGdneYUA', # The Career Force
               'UCFp1vaKzpfvoGai0vE5VJ0w', # Guy in a Cube
               'UCW8Ews7tdKKkBT6GdtQaXvQ'] # StrataScratch

In [4]:
api_service_name = "youtube"
api_version = "v3"

# Get credentials and create an API client
youtube = build(
    api_service_name, api_version, developerKey = api_key)

In [5]:
# Function to get info from channels

def get_channel_stats(youtube, channel_ids):

    all_data = []
    
    request = youtube.channels().list(
        part="snippet,contentDetails,statistics",
        id=','.join(channel_ids)
    )
    response = request.execute()

    # loop through items
    for item in response['items']:
        data = {'ChannelName':item['snippet']['title'],
                'subscribers': item['statistics']['subscriberCount'],
                'views':item['statistics']['viewCount'],
                'totalVideos': item['statistics']['videoCount'],
                'playlistId':item['contentDetails']['relatedPlaylists']['uploads']

        }

        all_data.append(data)
    return(pd.DataFrame(all_data)) 

In [6]:
# Function to get video ids

def get_video_ids(youtube, playlist_id):

# Get list of video IDs of all videos in the given playlist
# Params:
# youtube: the build object from googleapiclient.discovery
# playlist_id: playlist ID of the channel
# Returns:
# List of video IDs of all videos in the playlist
    
    request = youtube.playlistItems().list(
                part='contentDetails',
                playlistId = playlist_id,
                maxResults = 50)
    response = request.execute()
    
    video_ids = []
    
    for i in range(len(response['items'])):
        video_ids.append(response['items'][i]['contentDetails']['videoId'])
        
    next_page_token = response.get('nextPageToken')
    more_pages = True
    
    while more_pages:
        if next_page_token is None:
            more_pages = False
        else:
            request = youtube.playlistItems().list(
                        part='contentDetails',
                        playlistId = playlist_id,
                        maxResults = 50,
                        pageToken = next_page_token)
            response = request.execute()
    
            for i in range(len(response['items'])):
                video_ids.append(response['items'][i]['contentDetails']['videoId'])
            
            next_page_token = response.get('nextPageToken')
    
    return video_ids

In [7]:
# Function to get all video details

def get_video_details(youtube, video_ids):

    all_video_info = []

    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
        part="snippet,contentDetails,statistics",
        id=','.join(video_ids[i:i+50])
        )
        response = request.execute()
    
        for video in response['items']:
            stats_to_keep = {'snippet': ['channelTitle','title','description','tags','publishedAt'],
                            'statistics': ['viewCount','likeCount','favouriteCount','commentCount'],
                            'contentDetails': ['duration','definition','caption']
                            }
            video_info = {}
            video_info['video_id'] = video['id']

            for k in stats_to_keep.keys():
                for v in stats_to_keep[k]:
                    try:
                        video_info[v] = video[k][v]
                    except:
                        video_info[v] = None
        
            all_video_info.append(video_info)

    return pd.DataFrame(all_video_info)    

In [8]:
# Get top level comments as text from all videos with given IDs 
#(only the first 10 comments due to quote limit of Youtube API)

def get_comments_videos(youtube, video_ids):

    all_comments = []

    for video_id in video_ids:
        try:
            request = youtube.commentThreads().list(
            part="snippet, replies",
            videoId = video_id
            )
            response = request.execute()

            comments_in_video = [comment['snippet']['topLevelComment']['snippet']['textOriginal'] for comment in response['items'][0:10]]
            comments_in_video_info = {'video_id': video_id, 'comments':comments_in_video}

            all_comments.append(comments_in_video_info)
            
        except:
            # When error occurs - most likely because comments are disabled on a video
            print('Could not get comments for video ' + video_id)
            
    return pd.DataFrame(all_comments)

In [9]:
channel_data = get_channel_stats(youtube, channel_ids)

In [10]:
channel_data.dtypes

ChannelName    object
subscribers    object
views          object
totalVideos    object
playlistId     object
dtype: object

In [11]:
numeric_cols = ['subscribers', 'views', 'totalVideos']
channel_data[numeric_cols] = channel_data[numeric_cols].apply(pd.to_numeric,errors='coerce')

In [None]:
# create data frames with video stats and comments for all channles

video_df = pd.DataFrame()
comments_df = pd.DataFrame()

for c in channel_data['ChannelName'].unique():
    print('Getting video info from channel: ' + c)
    playlist_id = channel_data.loc[channel_data['ChannelName']== c, 'playlistId'].iloc[0]
    video_ids = get_video_ids(youtube, playlist_id)

    # get video data
    video_data = get_video_details(youtube, video_ids)

    # get comment data
    comments_data = get_comments_videos(youtube, video_ids)

    # append video data together and comment data together
    video_df = video_df.append(video_data, ignore_index=True)
    comments_df = comments_df.append(comments_data, ignore_index=True)

Getting video info from channel: Thu Vu data analytics


In [None]:
video_df

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption
0,JDOAcKTy9Mk,StrataScratch,Top 5 Data Science Project Ideas in 2022 to St...,"As a data scientist, you need to practice to k...","[data science project ideas 2022, data science...",2022-08-18T15:00:10Z,1167,72,,6,PT4M41S,hd,false
1,o8cFJmI50Dw,StrataScratch,STRING_AGG Function to Solve Data Science Inte...,This video shows you how to use the STRING_AGG...,"[string_agg function in sql, string_agg, sql i...",2022-08-11T15:00:25Z,1181,45,,2,PT22M19S,hd,false
2,ggZzHnzYH1s,StrataScratch,How To Use Python Window Functions | Solving A...,"In this video, we'll take a close look at pyth...","[python window functions, window functions in ...",2022-08-03T03:51:18Z,2345,103,,7,PT16M24S,hd,false
3,8zeLdtkY2CQ,StrataScratch,Data Science Mock Interview | Salesforce SQL C...,"In this video, we'll take a close look at a da...","[find the retention rates, data science mock i...",2022-06-17T09:49:01Z,5502,159,,22,PT19M44S,hd,false
4,tnoOz6MzTPg,StrataScratch,Python cumsum() | Solving Python Optimization ...,"In this video, we'll take a close look at one ...","[python optimization questions, python optimiz...",2022-05-11T15:01:39Z,2385,78,,10,PT19M13S,hd,false
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2005,4rfr6A3lO-Y,Alex The Analyst,Data Analyst Resume | Reviewing My Resume! | F...,Data Analyst Resume | Reviewing My Resume! | F...,"[Data Analyst, How to become a data analyst, D...",2020-01-30T14:07:55Z,51103,1313,,63,PT7M33S,hd,false
2006,OTq2NRy_AGs,Alex The Analyst,Working at a Big Company Vs Small Company | To...,Working at a Big Company Vs Small Company | To...,"[Data Analyst, How to become a Data Analyst, B...",2020-01-25T16:38:39Z,10487,308,,18,PT5M50S,hd,false
2007,ya28cb3zFGE,Alex The Analyst,Data Analyst Salary | 100k with No Experience,Data Analyst Salary | 100k with No Experience ...,"[Data Analyst Salary, Data analyst with no exp...",2020-01-23T03:16:09Z,49519,1786,,205,PT5M3S,hd,false
2008,Hsi2BG0SOiQ,Alex The Analyst,Truth About Big Companies | Told by a Fortune ...,Truth About Big Companies // There are a ton o...,"[Working at a big company, Big company data an...",2020-01-21T03:52:15Z,6109,236,,17,PT5M45S,hd,false


In [None]:
comments_df

Unnamed: 0,video_id,comments
0,JDOAcKTy9Mk,[Could you recommend which forum would be good...
1,o8cFJmI50Dw,[with\r\nno_series as (select generate_series ...
2,ggZzHnzYH1s,[Very sorry that the code is small. We will fi...
3,8zeLdtkY2CQ,"[Hi, thank you for amazing video. During the i..."
4,tnoOz6MzTPg,"[Very elegant, thank you much!, Very clever. N..."
...,...,...
2001,4rfr6A3lO-Y,[Thanks for watching! What is the weirdest thi...
2002,OTq2NRy_AGs,[Would you rather work at a BIG Company or a S...
2003,ya28cb3zFGE,[3:23\n76k hourly? I'm in!\nBut nice vid keep ...
2004,Hsi2BG0SOiQ,"[Bro, you are spitting straight facts in this ..."


In [None]:
channel_data

Unnamed: 0,ChannelName,subscribers,views,totalVideos,playlistId
0,StrataScratch,34800,1074656,91,UUW8Ews7tdKKkBT6GdtQaXvQ
1,The Career Force,51200,3426279,134,UUwXJEwaFT5i3MKKMGdneYUA
2,datdata,105000,4257947,293,UU6DnLOwz8R0iZPmkZ3vMM1g
3,Guy in a Cube,282000,21836476,804,UUFp1vaKzpfvoGai0vE5VJ0w
4,Grow Up,13200,1860076,374,UU3Bl85nLYtHHxrE1Uzkd5oA
5,Thu Vu data analytics,48700,1169922,43,UUJQJAI7IjbLcpsjWdSzYz0Q
6,Tina Huang,386000,15138127,106,UU2UXDak6o7rBm23k3Vv5dww
7,Alex The Analyst,267000,10646121,159,UU7cs8q-gJRlGwj4A8OmCmXg


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=128ee979-16de-4ca3-802b-b10a97ff6128' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>