- hide the API key


# Loading the libraries

In [1]:
from googleapiclient.discovery import build
import pandas as pd
import numpy as np
from datetime import date, timedelta
import re
import os

In [4]:
# Only check that the key is configured: displaying its value would store the
# secret in the notebook's saved output.
"youtube_api_key" in os.environ

# Getting the data from the API

## Using our API key

In [None]:
# Read the key from the environment instead of hard-coding it, so the secret is
# never saved in the notebook. os.environ[...] raises KeyError when the
# "youtube_api_key" variable is not set, which fails early and visibly.
api_key = os.environ["youtube_api_key"]

# Client object used by all the wrapper functions below
youtube = build('youtube', 'v3', developerKey = api_key)

## Building the wrapper functions

### Getting the ID of the playlist containing all videos a channel has uploaded

In [None]:
def get_uploads_playlist_id(username):
    """Return the ID of the playlist containing all videos a channel has uploaded.

    Parameters
    ----------
    username
        Value passed to the API's ``forUsername`` filter.

    Returns
    -------
    The ``contentDetails.relatedPlaylists.uploads`` value of the first channel
    in the API response.

    Raises
    ------
    LookupError
        If the response contains no channel for ``username``.
    """
    request = youtube.channels().list(
        part = "contentDetails",
        forUsername = username)

    response = request.execute()

    # A username that matches nothing yields a response with a missing or empty
    # "items" list; fail with a readable message instead of an opaque
    # KeyError / IndexError (LookupError is the common base class of both).
    channels = response.get("items") or []
    if not channels:
        raise LookupError("No YouTube channel found for username {!r}".format(username))

    return channels[0]["contentDetails"]["relatedPlaylists"]["uploads"]
    

### Getting the IDs of the uploaded videos

YouTube won't let users retrieve data for more than 50 videos within a single API call. Luckily, though, the response from the API provides a 'nextPageToken' key, which allows us to request the page after the one we just got.

In [None]:
def get_all_video_ids(uploaded_videos_playlist_id):
    """Collect the video ID of every item in a playlist, following pagination.

    Each API call asks for up to 50 items. As long as a response carries a
    'nextPageToken' key, that token is used to request the following page.

    Parameters
    ----------
    uploaded_videos_playlist_id
        Playlist ID, e.g. the one returned by get_uploads_playlist_id.

    Returns
    -------
    list
        The video IDs, in the order the API returned them.
    """
    collected_ids = []
    next_token = None  # No token on the first request: start from the first page
    more_pages = True

    while more_pages:
        page = youtube.playlistItems().list(
            part = "id, contentDetails",
            playlistId = uploaded_videos_playlist_id,
            maxResults = 50,
            pageToken = next_token).execute()

        collected_ids.extend(entry["contentDetails"]["videoId"] for entry in page["items"])

        # Keep going only while the API advertises a further page
        more_pages = 'nextPageToken' in page
        if more_pages:
            next_token = page['nextPageToken']

    return collected_ids
    

### Getting the data for each uploaded video

In [None]:
def _video_to_record(video, retrieved_on):
    """Flatten one item of a videos().list response into a dict (one dataframe row)."""
    snippet = video["snippet"]
    statistics = video["statistics"]

    return {
        "id": video["id"],
        "title": snippet["title"],
        "published_at": snippet["publishedAt"],
        # Any of the counters can be absent from the response (dislikeCount, for
        # instance, is only visible to the video's owner), so fall back to NaN.
        "view_count": statistics.get("viewCount", np.nan),
        "like_count": statistics.get("likeCount", np.nan),
        "dislike_count": statistics.get("dislikeCount", np.nan),
        "comment_count": statistics.get("commentCount", np.nan),
        "duration": video["contentDetails"]["duration"],
        "tags": ";".join(snippet["tags"]) if "tags" in snippet else np.nan,
        "last_updated": retrieved_on,
    }


def store_video_data_in_df(username):
    """Download the data of every video uploaded by a channel into a dataframe.

    Parameters
    ----------
    username
        Channel username, forwarded to get_uploads_playlist_id.

    Returns
    -------
    pandas.DataFrame
        One row per video with the columns "id", "title", "published_at",
        "view_count", "like_count", "dislike_count", "comment_count",
        "duration", "tags" and "last_updated". Every column has the object
        dtype (values are kept exactly as returned by the API); fields missing
        from the response are NaN. The frame is empty if there are no videos.
    """
    columns = ["id", "title", "published_at", "view_count", "like_count", "dislike_count",
               "comment_count", "duration", "tags", "last_updated"]
    max_ids_per_call = 50  # The max amount of videos retrievable in one call

    vid_ids = get_all_video_ids(get_uploads_playlist_id(username))

    retrieved_on = date.today()
    records = []

    # Walk through the IDs in slices of at most 50. Plain slicing also copes with
    # an empty list of IDs (no API call is made at all).
    for start in range(0, len(vid_ids), max_ids_per_call):
        chunk = vid_ids[start:start + max_ids_per_call]

        request = youtube.videos().list(
            part = "snippet, statistics, contentDetails",
            id = ','.join(chunk)) # The video IDs need to be collapsed into a single comma-separated string

        response = request.execute()

        records.extend(_video_to_record(video, retrieved_on)
                       for video in response.get("items", []))

    # Build the dataframe once from all the rows: DataFrame.append no longer exists
    # in pandas 2.x and was quadratic when called row by row.
    return pd.DataFrame(records, columns = columns, dtype = object)


## Downloading the data

In [None]:
# Download the data of every video uploaded by the "joshstarmer" channel
df = store_video_data_in_df("joshstarmer")

No duplicated ids: great!

In [None]:
# Number of rows whose video id already appeared in an earlier row
df.duplicated(subset = "id").sum()

In [None]:
# Preview the first rows of the downloaded data
df.head()

# Data cleaning

## Null values

In [None]:
# Count the missing values in each column
df.isna().sum()

## Casting variables

All our variables are of the object type. We need to cast the non-string ones to the correct datatype.

In [None]:
# Summary of the frame: columns, non-null counts and dtypes
df.info()

All the columns with the word "count" in their name need to be cast as numeric

In [None]:
# Names of the columns containing the word "count"
numeric_cols = df.filter(like = "count").columns.tolist()

# Convert them to numbers, using the smallest integer dtype that fits where possible
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, downcast = "integer")

Let's split the "published_at" column on the letter "T" and keep the first resulting column (the date)

In [None]:
# Keep only the text before the "T" (the date). Taking element 0 of each split
# list, rather than expanding into columns and picking the first one, also works
# when the frame has no rows (the expanded frame would then have no columns).
df["published_at"] = df["published_at"].str.split("T").str[0]

Let's cast these two variables as datetime

In [None]:
# The two date-like columns are converted in a single pass
datetime_cols = ["published_at", "last_updated"]

df[datetime_cols] = df[datetime_cols].apply(lambda column: pd.to_datetime(column))

Let's fix the "duration" variable

In [None]:
# The API reports durations in ISO 8601 form: "PT#H#M#S", or "P#DT#H#M#S" once a
# video is a day or longer. Strip the "P" and "T" markers individually and spell
# out every unit (days included) so that pd.to_timedelta can parse the result.
# regex = False: every pattern is literal text.
df["duration"] = (df["duration"]
                  .str.replace("P", "", regex = False)
                  .str.replace("T", "", regex = False)
                  .str.replace("D", " days ", regex = False)
                  .str.replace("H", " hours ", regex = False)
                  .str.replace("M", " minutes ", regex = False)
                  .str.replace("S", " seconds", regex = False))

In [None]:
# Parse the textual durations into timedelta values
df["duration"] = pd.to_timedelta(df["duration"])

In [None]:
# Preview the frame after the cleaning steps
df.head()

# Test with another YouTuber

# store_video_data_in_df returns a new dataframe (it does not fill an existing
# one), so its result has to be assigned; no empty frame needs to be prepared.
df = store_video_data_in_df("RedLetterMedia")

df.head()