## YouTube Statistics

$by: Jeremiah\space Chinyelugo$

##### Importing the libraries

In [2]:
import requests
import pandas as pd
import numpy as np
import json
from bs4 import BeautifulSoup
from tqdm import tqdm
import warnings 
warnings.filterwarnings('ignore')
import re

##### Defining the api key and channel id

In [2]:
# api_key = 'INPUT YOUR API KEY HERE'
# channel_id = 'INPUT THE CHANNEL ID'

# # find the channel id using https://commentpicker.com/youtube-channel-id.php

##### Class for getting channel stats

In [3]:
class YouTube_Stats:
    """Collect channel-level and per-video statistics from the YouTube Data API v3.

    Typical flow: call ``get_channel_statistics()`` and
    ``get_channel_video_data()``, then ``dump()`` to write both results to a
    JSON file in the current working directory.
    """

    def __init__(self, api_key, channel_id):
        """Store the API key and the id of the channel we want to work with."""
        self.api_key = api_key
        self.channel_id = channel_id
        self.video_id = None
        self.channel_statistics = None
        # Initialised here so dump() can test it even if the videos were never fetched.
        self.video_data = None

    def get_channel_statistics(self):
        """Fetch the channel's ``statistics`` part.

        Returns the statistics dict, or ``None`` when the response does not
        contain it. The result is also stored on ``self.channel_statistics``.
        """
        url = f"https://www.googleapis.com/youtube/v3/channels?part=statistics&id={self.channel_id}&key={self.api_key}"
        r = requests.get(url)
        data = r.json()
        try:
            data = data['items'][0]['statistics']
        except (KeyError, IndexError, TypeError):
            # No 'items', an empty 'items' list, or a non-dict payload.
            data = None

        self.channel_statistics = data
        return data

    def get_channel_video_data(self):
        """Retrieve snippet, statistics and contentDetails for every channel video.

        Returns a dict keyed by video id whose values merge the three parts.
        The result is also stored on ``self.video_data``.
        """
        channel_videos = self._get_channel_videos(limit=50)

        parts = ['snippet', 'statistics', 'contentDetails']
        for video_id in tqdm(channel_videos):
            for part in parts:
                data = self._get_single_video_data(video_id, part)
                channel_videos[video_id].update(data)

        self.video_data = channel_videos
        return channel_videos

    def _get_single_video_data(self, video_id, part):
        """Retrieve one ``part`` (e.g. ``"statistics"``) of a single video.

        Returns the part's dict, or an empty dict (after printing ``error``)
        when the response does not contain it.
        """
        url = f"https://www.googleapis.com/youtube/v3/videos?part={part}&id={video_id}&key={self.api_key}"
        r = requests.get(url)
        data = r.json()
        try:
            data = data['items'][0][part]
        except (KeyError, IndexError, TypeError):
            print('error')
            data = dict()
        return data

    def _get_channel_videos(self, limit=None):
        """Collect the channel's video ids by walking the search result pages.

        ``limit``, when it is an int, is sent as ``maxResults`` (page size).
        After the first page at most 10 further pages are followed.
        Returns a dict mapping each video id to an empty dict.
        """
        url = f"https://www.googleapis.com/youtube/v3/search?key={self.api_key}&channelId={self.channel_id}&part=snippet,id&order=date"
        if limit is not None and isinstance(limit, int):
            url = url + "&maxResults=" + str(limit)

        vid, npt = self._get_channel_videos_per_page(url)
        idx = 0
        # idx caps the number of follow-up requests so a long channel cannot loop unbounded.
        while (npt is not None and idx < 10):
            nexturl = url + '&pageToken=' + npt
            next_vid, npt = self._get_channel_videos_per_page(nexturl)
            vid.update(next_vid)
            idx = idx + 1

        return vid

    def _get_channel_videos_per_page(self, url):
        """Fetch one search page.

        Returns ``(videos, next_page_token)`` where ``videos`` maps each video
        id on the page to an empty dict and ``next_page_token`` is ``None``
        when the response has no ``nextPageToken`` (or no ``items`` at all).
        """
        r = requests.get(url)
        data = r.json()
        channel_videos = dict()
        if 'items' not in data:
            return channel_videos, None

        item_data = data['items']
        nextPageToken = data.get('nextPageToken', None)
        for item in item_data:
            try:
                kind = item['id']['kind']
                # Only entries of kind 'youtube#video' are kept; other kinds are skipped.
                if kind == 'youtube#video':
                    video_id = item['id']['videoId']
                    channel_videos[video_id] = dict()
            except KeyError:
                print('Error')

        return channel_videos, nextPageToken

    def dump(self):
        """Write channel statistics and video data to ``<channel title>.json``.

        The file name is the ``channelTitle`` of the last stored video,
        lower-cased with spaces replaced by underscores; the channel id is used
        when no title is available. Prints ``No Data Available`` and returns
        without writing when either dataset has not been fetched.
        """
        if self.channel_statistics is None or self.video_data is None:
            print('No Data Available')
            return

        fused_data = {self.channel_id: {'channel_statistics': self.channel_statistics, 'video_data': self.video_data}}

        # Peek at the last video rather than popitem(): popping removed that
        # video from the very dict being serialised, so it was missing from the file.
        videos = list(self.video_data.values())
        last_video = videos[-1] if videos else {}
        channel_title = last_video.get('channelTitle', self.channel_id)
        channel_title = channel_title.replace(' ', '_').lower()
        filename = channel_title + '.json'

        with open(filename, 'w') as f:
            json.dump(fused_data, f, indent=4)

        print('File dumped')

In [4]:
# Requires `api_key` and `channel_id` to be defined first (their assignments in the
# configuration cell are commented out). Each get_* call issues HTTP requests;
# dump() then writes the combined result to a JSON file in the working directory.
yt = YouTube_Stats(api_key, channel_id)
yt.get_channel_statistics()
yt.get_channel_video_data()
yt.dump()

100%|████████████████████████████████████████████████████████████████████████████████| 172/172 [01:30<00:00,  1.91it/s]

File dumped





### Retrieving the json file
The JSON file has already been saved in the current directory; replace the file name below with the name of your JSON file (without the `.json` extension).

In [3]:
# Base name of the dump, without the extension
filename = 'kurzgesagt_–_in_a_nutshell'

with open(f'{filename}.json', 'r') as f:
    data = json.load(f)

# The dump has a single top-level key: the channel id
channel_id = list(data.keys())[0]

#### Channel_stats

In [4]:
# Channel-level statistics stored under the channel id in the loaded JSON
channel_stats = data[channel_id]['channel_statistics']

In [5]:
# One-row frame: statistic names become the column labels, their values the single row;
# the file's base name is attached as the channel name.
df_channel_stats = pd.DataFrame(
    [list(channel_stats.values())],
    columns=list(channel_stats.keys()),
).assign(channelName=filename)
df_channel_stats

Unnamed: 0,viewCount,subscriberCount,hiddenSubscriberCount,videoCount,channelName
0,2225065056,20100000,False,172,kurzgesagt_–_in_a_nutshell


#### Video Data

In [6]:
# Per-video records (keyed by video id) stored under the channel id in the loaded JSON
video_data = data[channel_id]['video_data']

In [7]:
video_ids = list(video_data)
properties = ['publishedAt', 'title', 'description', 'viewCount', 'likeCount', 'commentCount', 'duration']

# Build all columns in one pass; a property missing from a video falls back to 0.
# Comprehension variables stay local, so the global `data` keeps holding the loaded
# JSON and the cells that read from it can be re-run safely.
df_video_data = pd.DataFrame(
    {
        prop: [video_data[video_id].get(prop, 0) for video_id in video_ids]
        for prop in properties
    }
)
df_video_data['channelName'] = filename

#### Cleaning the data

In [8]:
# Parse publishedAt with pandas and keep only the date part
df_video_data['publishedAt'] = pd.to_datetime(df_video_data['publishedAt']).dt.date

In [23]:
# Stripping PT away to make manipulation easier
df_video_data['duration'] = df_video_data['duration'].str.strip('PT')

# Pull every unit out with one anchored pattern instead of splitting on 'M':
# splitting failed for durations without a minutes part (e.g. '45S') and for
# durations that carry hours or days (e.g. '1H2M3S'). Units that are absent,
# and values that do not match at all, count as 0.
duration_parts = (
    df_video_data['duration']
    .str.extract(r'^(?:(?P<days>\d+)DT?)?(?:(?P<hours>\d+)H)?(?:(?P<minutes>\d+)M)?(?:(?P<seconds>\d+)S)?$')
    .fillna('0')
    .astype(int)
)

# extracting minutes and seconds (days and hours are folded into the minutes total)
minutes = duration_parts['days'] * 24 * 60 + duration_parts['hours'] * 60 + duration_parts['minutes']
seconds = duration_parts['seconds']

df_video_data['seconds'] = seconds
df_video_data['minutes'] = minutes

# adding converted minutes to seconds
# (plain int arithmetic: the np.int alias used previously was removed in NumPy 1.24)
df_video_data['duration_seconds'] = df_video_data['seconds'] + (df_video_data['minutes'] * 60)

# cleaning video description
def clean_description(x):
    """Return the longest paragraph-like run of text in a video description.

    A candidate is a stretch of the allowed characters enclosed by two
    whitespace characters on each side; the longest candidate wins (the first
    one on ties) and surrounding newlines are stripped.

    Falls back to the description itself (newline-stripped) when nothing
    matches, and to an empty string when ``x`` is not a string, e.g. the 0
    used as a placeholder for a missing description.
    """
    if not isinstance(x, str):
        return ''
    matches = re.findall(r"\s\s[A-Za-z0-9.&'‘’“”,\s\?\–\-:…%!()]+\s\s", x)
    if not matches:
        return x.strip('\n')
    # max() returns the first of equally long matches, like a stable descending sort.
    return max(matches, key=len).strip('\n')

# Run clean_description on every raw description and store the result in a new column
df_video_data['cleaned_description'] = df_video_data['description'].apply(clean_description)

# extra cleaning

def extra_clean(x):
    """Return the first run of allowed characters that ends in two whitespace characters.

    Unlike ``clean_description`` the run does not need leading whitespace, so
    it can start at the very beginning of the description. Surrounding
    newlines are stripped.

    Falls back to the description itself (newline-stripped) when nothing
    matches, and to an empty string when ``x`` is not a string.
    """
    if not isinstance(x, str):
        return ''
    first = re.search(r"[A-Za-z0-9.&'‘’“”,\s\?\–\-:…%!()]+\s\s", x)
    if first is None:
        return x.strip('\n')
    return first.group(0).strip('\n')

# noticed some instances that needed further cleaning
# NOTE(review): these are hard-coded row labels, presumably picked by inspecting this
# particular dataset — they are unlikely to line up with another channel or a refreshed
# download, and .loc raises KeyError for labels that are missing; confirm before reuse.
lists = [102,154,103,124,127,141,143,133,140,150,126,128,136,142,139,131,151,93,138,134,148,152,147,122,137,110,86,87,89,90,91,96,101,135,106,111,117,125,121,118,120,116,112,113,114,144,153,78,79,82,83]

# Recompute cleaned_description for just those rows from the raw description
df_video_data.loc[lists, 'cleaned_description'] = df_video_data.loc[lists,'description'].apply(extra_clean)

In [31]:
# Drop the intermediate helper columns. errors='ignore' keeps the cell re-runnable:
# the in-place drop raised KeyError on a second run because the columns were already gone.
df_video_data = df_video_data.drop(columns=['seconds', 'minutes'], errors='ignore')

In [32]:
# Show the cleaned frame (the rendered output is truncated for long frames)
df_video_data

Unnamed: 0,publishedAt,title,description,viewCount,likeCount,commentCount,duration,channelName,duration_seconds,cleaned_description
0,2023-02-14,The Most Complex Language in the World,Go ‘beyond the nutshell’ at https://brilliant....,2872959,174991,9396,11M55S,kurzgesagt_–_in_a_nutshell,715,"You are cells. Your muscles, organs, skin and ..."
1,2022-12-15,Black Hole Star – The Star That Shouldn't Exist,Try Rocket Money for free & unlock more featur...,6392760,311920,14719,12M23S,kurzgesagt_–_in_a_nutshell,743,Black hole stars may have been the largest sta...
2,2022-12-11,How To Terraform Mars - WITH LASERS,Go ‘beyond the nutshell’ at https://brilliant....,4825687,224507,13410,11M28S,kurzgesagt_–_in_a_nutshell,688,Mars is a disappointing hellhole lacking pract...
3,2022-12-06,The Horror of the Slaver Ant,Offset your carbon footprint on Wren: ​https:/...,5406221,263591,9677,10M44S,kurzgesagt_–_in_a_nutshell,644,Everything changed when the slaver nation atta...
4,2022-11-23,The Most Extreme Explosion in the Universe,Go ‘beyond the nutshell’ at https://brilliant....,5396261,222701,7804,11M19S,kurzgesagt_–_in_a_nutshell,679,Supernovae are the most powerful explosions in...
...,...,...,...,...,...,...,...,...,...,...
166,2013-12-19,The History and Future of Everything -- Time,How much time do you have left?\n\nTime makes ...,7396276,185074,13704,7M10S,kurzgesagt_–_in_a_nutshell,430,"Short videos, explaining things. For example E..."
167,2013-11-28,How The Stock Exchange Works (For Dummies),Why are there stocks at all?\n\nEveryday in th...,8190633,129034,8621,3M34S,kurzgesagt_–_in_a_nutshell,214,Everyday in the news we hear about the stock e...
168,2013-10-11,The Gulf Stream Explained,Learn about the role of the sea in global warm...,5811034,63711,1968,5M4S,kurzgesagt_–_in_a_nutshell,304,The global conveyer belt is part of the large-...
169,2013-09-03,Fracking explained: opportunity or danger,Fracking explained in five minutes.\n\nFrackin...,7158396,100165,8086,5M3S,kurzgesagt_–_in_a_nutshell,303,Fracking is a controversial topic. On the one ...


#### Saving dataframe in csv format

In [33]:
# Writes <filename>.csv to the working directory; the row index is included as the first, unnamed column
df_video_data.to_csv(filename+'.csv')

## ~ The End ~