In [46]:
import pandas as pd
from googleapiclient.discovery import build
from IPython.display import JSON
import json
import regex as re

### Grab API Key

In [1]:
from api import api_key

AIzaSyDSYrK4YHtPt6EegrZLcGWvnelniaTfpPU


In [12]:
channel_ids = ["UCX6b17PVsYBQ0ip5gyeme-Q"]

In [25]:
def get_channel_stats(channel_data):
    channels = []
    for item in channel_data["items"]:
        dict = {}
        dict["channelName"] = item["snippet"]["title"]
        dict["publishedDate"] = item["snippet"]["publishedAt"]
        dict["subscribers"] = item["statistics"]["subscriberCount"]
        dict["views"] = item["statistics"]["viewCount"]
        dict["totalVideos"] = item["statistics"]["videoCount"]
        dict["playlistId"] = item["contentDetails"]["relatedPlaylists"]['uploads']

        channels.append(dict)
    return channels


In [23]:
api_service_name = "youtube"
api_version = "v3"

# Get credentials and create an API client
youtube = build(
    api_service_name, api_version, developerKey=api_key)

request = youtube.channels().list(
    part="snippet,contentDetails,statistics",
    id = ",".join(channel_ids)
)
response = request.execute()


In [None]:

print(json.dumps(response, indent=2))

In [26]:
channel_stats = get_channel_stats(response)

In [29]:
channel_stats[0]

{'channelName': 'CrashCourse',
 'publishedDate': '2006-05-20T02:43:42Z',
 'subscribers': '14800000',
 'views': '1828706086',
 'totalVideos': '1474',
 'playlistId': 'UUX6b17PVsYBQ0ip5gyeme-Q'}

In [36]:
def get_video_ids(youtube, playlist_id):
    video_ids=[]

    request = youtube.playlistItems().list(
            part="snippet,contentDetails",
            playlistId=playlist_id,
            maxResults = 50)

    response = request.execute()

    for item in response["items"]:
            video_ids.append(item["contentDetails"]["videoId"])

    next_page_token = response.get('nextPageToken')
    while next_page_token is not None:
        request = youtube.playlistItems().list(
            part="snippet,contentDetails",
            playlistId=playlist_id,
            maxResults = 50,
            pageToken = next_page_token
        )

        response = request.execute()

        for item in response["items"]:
            video_ids.append(item["contentDetails"]["videoId"])

        next_page_token = response.get("nextPageToken")

    return video_ids

In [37]:
video_ids = get_video_ids(youtube,channel_stats[0]["playlistId"])
len(video_ids)

1474

### Get video stats

In [44]:
def get_video_stats(youtube, video_ids):

    request = youtube.videos().list(
        part="snippet,contentDetails,statistics",
        id=video_ids[0:5]
    )
    response = request.execute()

    all_video_info = []

    for video in response["items"]:
        stats_to_keep = {"snippet": ["channelTitle", "title", "description", "tags", "publishedAt"],
                        "statistics": ["viewCount", "likeCount", "favouriteCount", "commentCount"],
                        "contentDetails": ["duration", "definition", "caption"]}
        video_info = {}
        video_info["video_id"] = video["id"]

        for k in stats_to_keep.keys():
            for v in stats_to_keep[k]:
                try:
                    video_info[v] = video[k][v]
                except:
                    video_info[v] = None
        
        all_video_info.append(video_info)

    return pd.DataFrame(all_video_info)

In [45]:
cc_df = get_video_stats(youtube, video_ids)
cc_df

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption
0,pqosCR6J2qo,CrashCourse,What Do These Creepy Plant Mouths Do? (Plant T...,"Plants—they’re just like us! Well, not exactly...","[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-15T16:00:38Z,31135,1502,,52,PT12M44S,hd,True
1,xOLcZMw0hd4,CrashCourse,The Scientific Method: Crash Course Biology #2,Science offers a way of discovering and unders...,"[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-13T16:00:44Z,29683,1490,,87,PT15M9S,hd,True
2,tZE_fQFK8EY,CrashCourse,Introduction to Biology: Crash Course Biology #1,Biology is the study of life—a four-letter wor...,"[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-06T16:00:39Z,69811,3753,,266,PT13M27S,hd,True
3,y9BLCfcUcFg,CrashCourse,Plant Cells & Hormones: Crash Course Botany #3,"At first glance, plant and animal cells have a...","[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-01T16:03:13Z,55940,3433,,135,PT12M58S,hd,True
4,9AEzixu_xZk,CrashCourse,What Are Plants Made Of? Crash Course Botany #2,"When you eat a salad for lunch, you’re digging...","[vlogbrothers, Crash Course, crashcourse, educ...",2023-05-25T16:00:10Z,77280,5493,,247,PT16M30S,hd,True


## Data Pre-processing 

In [48]:
cc_df_copy = cc_df.copy()

### Get course information

In [62]:
def grab_course(video_id, title):
    title_info = []

    # Loop through the titles and make a dicitonary of each one
    for index, x in enumerate(title):
        dict = {}

        # Split the title by the words "Crash Course"
        title_parts = x.split("Crash Course")

        # Grab the episode title and get rid of the ":"
        episode = title_parts[0].replace(":","")
        dict["episode"] = episode

        # Split the course and the episode number along the "#"
        course_info = title_parts[1].split("#")
        dict["course"] = course_info[0]
        dict["episode_num"] = course_info[1]

        # Grab the length of the raw title of the episode 
        dict["episode_title_len"] = len(episode)

        # Include video_id
        dict["video_id"] = video_id[index]

        # Add the dictionary to the list
        title_info.append(dict)

    # Return information as a dataframe
    return pd.DataFrame(title_info)

In [63]:
# Apply grab_course_info function on the title column of the dataframe
title_df = grab_course(cc_df_copy["video_id"], cc_df_copy["title"])
title_df

Unnamed: 0,episode,course,episode_num,episode_title_len,video_id
0,What Do These Creepy Plant Mouths Do? (Plant T...,Botany,4,54,pqosCR6J2qo
1,The Scientific Method,Biology,2,22,xOLcZMw0hd4
2,Introduction to Biology,Biology,1,24,tZE_fQFK8EY
3,Plant Cells & Hormones,Botany,3,23,y9BLCfcUcFg
4,What Are Plants Made Of?,Botany,2,25,9AEzixu_xZk


In [64]:
# Concatenate the course information (title_df) with the cc_df_copy
cc_bycourse_df = pd.merge(cc_df_copy, title_df, on="video_id")

# Drop the title column
cc_bycourse_df.drop(columns=["title"], inplace=True)
cc_bycourse_df

Unnamed: 0,video_id,channelTitle,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption,episode,course,episode_num,episode_title_len
0,pqosCR6J2qo,CrashCourse,"Plants—they’re just like us! Well, not exactly...","[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-15T16:00:38Z,31135,1502,,52,PT12M44S,hd,True,What Do These Creepy Plant Mouths Do? (Plant T...,Botany,4,54
1,xOLcZMw0hd4,CrashCourse,Science offers a way of discovering and unders...,"[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-13T16:00:44Z,29683,1490,,87,PT15M9S,hd,True,The Scientific Method,Biology,2,22
2,tZE_fQFK8EY,CrashCourse,Biology is the study of life—a four-letter wor...,"[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-06T16:00:39Z,69811,3753,,266,PT13M27S,hd,True,Introduction to Biology,Biology,1,24
3,y9BLCfcUcFg,CrashCourse,"At first glance, plant and animal cells have a...","[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-01T16:03:13Z,55940,3433,,135,PT12M58S,hd,True,Plant Cells & Hormones,Botany,3,23
4,9AEzixu_xZk,CrashCourse,"When you eat a salad for lunch, you’re digging...","[vlogbrothers, Crash Course, crashcourse, educ...",2023-05-25T16:00:10Z,77280,5493,,247,PT16M30S,hd,True,What Are Plants Made Of?,Botany,2,25


### Check if amy columns have null values

In [65]:
cc_bycourse_df.isnull().any()

video_id             False
channelTitle         False
description          False
tags                 False
publishedAt          False
viewCount            False
likeCount            False
favouriteCount        True
commentCount         False
duration             False
definition           False
caption              False
episode              False
course               False
episode_num          False
episode_title_len    False
dtype: bool

### Check Column Types

In [67]:
cc_bycourse_df.dtypes

video_id             object
channelTitle         object
description          object
tags                 object
publishedAt          object
viewCount            object
likeCount            object
favouriteCount       object
commentCount         object
duration             object
definition           object
caption              object
episode              object
course               object
episode_num          object
episode_title_len     int64
dtype: object

In [68]:
# Convert "viewCount", "likeCount", "favouriteCount", "commentCount" to numeric
numeric_cols = ["viewCount", "likeCount", "favouriteCount", "commentCount"]
cc_bycourse_df[numeric_cols] = cc_bycourse_df[numeric_cols].apply(pd.to_numeric, errors="coerce", axis=1)

In [69]:
cc_bycourse_df.dtypes

video_id              object
channelTitle          object
description           object
tags                  object
publishedAt           object
viewCount            float64
likeCount            float64
favouriteCount       float64
commentCount         float64
duration              object
definition            object
caption               object
episode               object
course                object
episode_num           object
episode_title_len      int64
dtype: object

### Convert duration column to seconds

In [70]:
import isodate


ModuleNotFoundError: No module named 'isodate'