In [1]:
# Show the path of the Python interpreter this kernel is running on
# (handy for confirming which environment the notebook uses).
import sys
sys.executable

'/Users/nimo/opt/anaconda3/bin/python'

In [2]:
import json
import os

import requests
from tqdm import tqdm

class YTstats:
    """Collect channel statistics and per-video data from the YouTube Data API v3.

    Typical use: construct with an API key and a channel id, call
    ``get_channel_statistics()`` and ``get_channel_video_data()`` (or
    ``extract_all()`` for both), then ``dump()`` to write everything to a
    JSON file.

    Attributes:
        api_key: API key appended to every request URL.
        channel_id: Id of the channel being inspected.
        channel_statistics: ``None`` until ``get_channel_statistics()`` ran,
            afterwards the channel's ``statistics`` dict (``{}`` on failure).
        video_data: ``None`` until ``get_channel_video_data()`` ran,
            afterwards a dict mapping video id to the collected video fields.
    """

    # Seconds to wait on a single HTTP request; without a timeout a stalled
    # connection would block the caller indefinitely.
    REQUEST_TIMEOUT = 30

    # Maximum number of follow-up search pages fetched after the first page.
    MAX_EXTRA_PAGES = 10

    def __init__(self, api_key, channel_id):
        self.api_key = api_key
        self.channel_id = channel_id
        self.channel_statistics = None
        self.video_data = None

    def _fetch_json(self, url):
        """Send a GET request to ``url`` and return the decoded JSON body."""
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        return json.loads(response.text)

    def extract_all(self):
        """Fetch both the channel statistics and the per-video data."""
        self.get_channel_statistics()
        self.get_channel_video_data()

    def get_channel_statistics(self):
        """Fetch the channel's ``statistics`` part and store it on the instance.

        Returns:
            The statistics dict, or ``{}`` if the response does not contain
            one (a message is printed in that case).
        """
        print('get channel statistics...')
        url = f'https://www.googleapis.com/youtube/v3/channels?part=statistics&id={self.channel_id}&key={self.api_key}'
        # The context manager closes the progress bar even if the request raises.
        with tqdm(total=1) as pbar:
            data = self._fetch_json(url)
            try:
                data = data['items'][0]['statistics']
            except (KeyError, IndexError):
                # KeyError: 'items'/'statistics' missing; IndexError: 'items' is empty.
                print('Could not get channel statistics')
                data = {}

            self.channel_statistics = data
            pbar.update()
        return data

    def get_channel_video_data(self):
        """Fetch every video found for the channel and enrich each with details.

        Each video entry starts with ``publishedAt`` and ``title`` from the
        search results and is then updated with the ``snippet``,
        ``statistics``, ``contentDetails`` and ``topicDetails`` parts.

        Returns:
            Dict mapping video id to the merged video fields; the same dict
            is stored in ``self.video_data``.
        """
        print('get video data...')
        # Playlists are returned by the search as well but are not used here.
        channel_videos, _ = self._get_channel_content(limit=50)

        parts = ["snippet", "statistics", "contentDetails", "topicDetails"]
        for video_id in tqdm(channel_videos):
            for part in parts:
                data = self._get_single_video_data(video_id, part)
                channel_videos[video_id].update(data)

        self.video_data = channel_videos
        return channel_videos

    def _get_single_video_data(self, video_id, part):
        """Fetch one ``part`` of a single video.

        Args:
            video_id: Id of the video to query.
            part: One of 'snippet', 'statistics', 'contentDetails',
                'topicDetails'.

        Returns:
            The dict stored under ``part`` in the first returned item, or an
            empty dict if the response has no such entry (a message is
            printed in that case).
        """
        url = f"https://www.googleapis.com/youtube/v3/videos?part={part}&id={video_id}&key={self.api_key}"
        data = self._fetch_json(url)
        try:
            data = data['items'][0][part]
        except (KeyError, IndexError):
            # KeyError: 'items' or the part is missing; IndexError: 'items' is empty.
            print(f'Error! Could not get {part} part of data: \n{data}')
            data = dict()
        return data

    def _get_channel_content(self, limit=None, check_all_pages=True):
        """Search the channel for videos and playlists, following result pages.

        Args:
            limit: If an int, sent as ``maxResults`` (results per page).
            check_all_pages: If true, keep following ``nextPageToken`` for up
                to ``MAX_EXTRA_PAGES`` additional pages.

        Returns:
            ``(channel_videos, channel_playlists)`` where
            channel_videos maps videoId -> {'publishedAt', 'title'} and
            channel_playlists maps playlistId -> {'publishedAt', 'title'}.
        """
        url = f"https://www.googleapis.com/youtube/v3/search?key={self.api_key}&channelId={self.channel_id}&part=snippet,id&order=date"
        if isinstance(limit, int):
            url += "&maxResults=" + str(limit)

        vid, pl, npt = self._get_channel_content_per_page(url)
        idx = 0
        while check_all_pages and npt is not None and idx < self.MAX_EXTRA_PAGES:
            nexturl = url + "&pageToken=" + npt
            next_vid, next_pl, npt = self._get_channel_content_per_page(nexturl)
            vid.update(next_vid)
            pl.update(next_pl)
            idx += 1

        return vid, pl

    def _get_channel_content_per_page(self, url):
        """Fetch one search result page and sort its items into videos and playlists.

        Returns:
            ``(channel_videos, channel_playlists, nextPageToken)``. The token
            is ``None`` when the response has none or when the response has
            no 'items' key (both dicts are empty in the latter case).
        """
        data = self._fetch_json(url)
        channel_videos = dict()
        channel_playlists = dict()
        if 'items' not in data:
            print('Error! Could not get correct channel data!\n', data)
            # Return two distinct dicts so callers can update them independently.
            return channel_videos, channel_playlists, None

        nextPageToken = data.get("nextPageToken", None)

        for item in data['items']:
            try:
                kind = item['id']['kind']
                published_at = item['snippet']['publishedAt']
                title = item['snippet']['title']
                if kind == 'youtube#video':
                    video_id = item['id']['videoId']
                    channel_videos[video_id] = {'publishedAt': published_at, 'title': title}
                elif kind == 'youtube#playlist':
                    playlist_id = item['id']['playlistId']
                    channel_playlists[playlist_id] = {'publishedAt': published_at, 'title': title}
            except KeyError:
                # Skip items that lack one of the expected fields.
                print('Error! Could not extract data from item:\n', item)

        return channel_videos, channel_playlists, nextPageToken

    def dump(self):
        """Write channel statistics and video data to a single JSON file.

        The file is named after the first ``channelTitle`` found in the video
        data (spaces replaced by underscores, lower-cased), falling back to
        the channel id. Nothing is written if either dataset has not been
        fetched yet.
        """
        if self.channel_statistics is None or self.video_data is None:
            print('data is missing!\nCall get_channel_statistics() and get_channel_video_data() first!')
            return

        fused_data = {self.channel_id: {"channel_statistics": self.channel_statistics,
                                        "video_data": self.video_data}}

        # Read the title without popping: video_data is the very dict being
        # serialized, so removing an entry would drop a video from the dump.
        channel_title = next(
            (video['channelTitle'] for video in self.video_data.values() if 'channelTitle' in video),
            self.channel_id,
        )
        channel_title = channel_title.replace(" ", "_").lower()
        filename = channel_title + '.json'
        with open(filename, 'w') as f:
            json.dump(fused_data, f, indent=4)

        print('file dumped to', filename)

In [3]:
# from youtube_statistics import YTstats  # only needed if YTstats lives in its own module

# The API key is a credential and must not be stored in the notebook.
# Export YOUTUBE_API_KEY in the environment before starting the kernel.
# NOTE(review): the key that used to be hardcoded here has been exposed and
# should be revoked in the Google Cloud console.
API_KEY = os.environ.get("YOUTUBE_API_KEY")
if not API_KEY:
    raise RuntimeError("Set the YOUTUBE_API_KEY environment variable to your YouTube Data API key")

channel_id = "UCY3TJECrA90t9YTrxhdjcVw"

# Fetch channel-level statistics, then per-video data, then write both to a JSON file.
yt = YTstats(API_KEY, channel_id)
yt.get_channel_statistics()
yt.get_channel_video_data()
yt.dump()

  0%|          | 0/1 [00:00<?, ?it/s]

get channel statistics...


100%|██████████| 1/1 [00:00<00:00,  3.15it/s]


get video data...


100%|██████████| 58/58 [00:49<00:00,  1.18it/s]

file dumped to meet_arnold.json





In [3]:
# Read the JSON dump back from disk so the analysis below works from the file.
file = "meet_arnold.json"
data = None
with open(file) as f:
    data = json.load(f)

# popitem() yields one (key, value) pair of the dump: the channel id and its payload.
channel_id, stats = data.popitem()
print(channel_id)

# Split the payload into its channel-level and per-video sections.
channel_stats, video_stats = stats["channel_statistics"], stats["video_data"]

UCY3TJECrA90t9YTrxhdjcVw


In [4]:
# channel statistics: print each headline figure next to its label
for label, key in (
    ('views', "viewCount"),
    ('subscriber', "subscriberCount"),
    ('videos', "videoCount"),
):
    print(label, channel_stats[key])

views 489353228
subscriber 3070000
videos 58


In [5]:
# video statistics
# Rank the videos by view count, most viewed first. A video without a
# viewCount entry is treated as having 0 views so it sorts to the end.
sorted_vids = sorted(
    video_stats.items(),
    key=lambda item: int(item[1].get("viewCount", 0)),
    reverse=True,
)

# One row per video, in ranked order. Fields may be absent from a video's
# data (e.g. hidden or unavailable counts); use None for those instead of
# aborting the whole cell with a KeyError.
stat_keys = ["title", "viewCount", "likeCount", "dislikeCount", "commentCount"]
stats = [[video.get(key) for key in stat_keys] for _video_id, video in sorted_vids]

In [7]:
import pandas as pd

# Turn the per-video rows into a table and keep only the first 51 rows.
df = pd.DataFrame(
    stats,
    columns=["title", "viewCount", "likeCount", "dislikeCount", "commentCount"],
).head(51)
df

Unnamed: 0,title,viewCount,likeCount,dislikeCount,commentCount
0,What if You Are Buried Alive?,23967421,318904,23352,9627
1,What Happens If You Drink Coke Instead of Water?,19296266,214329,10692,13760
2,What If the Whale Swallowed You Alive?,17503071,204842,14238,11811
3,What If You Spend 1 Nanosecond In The Volcano?,17125945,194349,16267,13919
4,What Would Happen If Humans Disappeared???,16287845,217874,18623,20402
5,What if You Swallowed the Most Venomous Snake ...,15041354,153734,11791,3863
6,What If You Run At The Speed of Light ?,13755925,213877,11206,10394
7,What if You Eat 100 000 Calories?,13646615,147457,13813,12738
8,What Happens To You In The Bottom Of Mariana T...,13634796,168219,8679,9062
9,Arnold Set off to the Center of the Earth,13145412,171772,7103,10631
