In [100]:
import pandas as pd
from googleapiclient.discovery import build
from IPython.display import JSON
import json
import regex as re

### Grab API Key

In [101]:
from api import api_key

In [102]:
channel_ids = ["UCX6b17PVsYBQ0ip5gyeme-Q"]

In [103]:
def get_channel_stats(channel_data):
    channels = []
    for item in channel_data["items"]:
        dict = {}
        dict["channelName"] = item["snippet"]["title"]
        dict["publishedDate"] = item["snippet"]["publishedAt"]
        dict["subscribers"] = item["statistics"]["subscriberCount"]
        dict["views"] = item["statistics"]["viewCount"]
        dict["totalVideos"] = item["statistics"]["videoCount"]
        dict["playlistId"] = item["contentDetails"]["relatedPlaylists"]['uploads']

        channels.append(dict)
    return channels


In [104]:
api_service_name = "youtube"
api_version = "v3"

# Get credentials and create an API client
youtube = build(
    api_service_name, api_version, developerKey=api_key)

request = youtube.channels().list(
    part="snippet,contentDetails,statistics",
    id = ",".join(channel_ids)
)
response = request.execute()


In [105]:

print(json.dumps(response, indent=2))

{
  "kind": "youtube#channelListResponse",
  "etag": "fURXDl1TrJAoOOwjh-QZK_UU3E0",
  "pageInfo": {
    "totalResults": 1,
    "resultsPerPage": 5
  },
  "items": [
    {
      "kind": "youtube#channel",
      "etag": "G3ClDofRBMvC_L-vtKK6ENqAJfw",
      "id": "UCX6b17PVsYBQ0ip5gyeme-Q",
      "snippet": {
        "title": "CrashCourse",
        "description": "At Crash Course, we believe that high-quality educational videos should be available to everyone for free! \n\nSubscribe for weekly videos from our current courses! Right now, we're producing Climate & Energy. The Crash Course team has produced more than 45 courses on a wide variety of subjects, including organic chemistry, literature, world history, biology, philosophy, theater, ecology, and many more!  We also recently teamed up with Arizona State University to bring you more courses on the Study Hall channel.\n\nHelp support Crash Course at Patreon.com/CrashCourse.",
        "customUrl": "@crashcourse",
        "publishedAt":

In [106]:
channel_stats = get_channel_stats(response)

In [107]:
channel_stats[0]

{'channelName': 'CrashCourse',
 'publishedDate': '2006-05-20T02:43:42Z',
 'subscribers': '14800000',
 'views': '1828706086',
 'totalVideos': '1475',
 'playlistId': 'UUX6b17PVsYBQ0ip5gyeme-Q'}

In [108]:
def get_video_ids(youtube, playlist_id):
    video_ids=[]

    request = youtube.playlistItems().list(
            part="snippet,contentDetails",
            playlistId=playlist_id,
            maxResults = 50)

    response = request.execute()

    for item in response["items"]:
            video_ids.append(item["contentDetails"]["videoId"])

    next_page_token = response.get('nextPageToken')
    while next_page_token is not None:
        request = youtube.playlistItems().list(
            part="snippet,contentDetails",
            playlistId=playlist_id,
            maxResults = 50,
            pageToken = next_page_token
        )

        response = request.execute()

        for item in response["items"]:
            video_ids.append(item["contentDetails"]["videoId"])

        next_page_token = response.get("nextPageToken")

    return video_ids

In [109]:
video_ids = get_video_ids(youtube,channel_stats[0]["playlistId"])
len(video_ids)

1475

### Get video stats

In [124]:
def get_video_stats(youtube, video_ids):

    all_video_info = []

    for i in range(0, len(video_ids), 50):

        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=",".join(video_ids[i:i+50])
        )
        response = request.execute()

        

        for video in response["items"]:
            stats_to_keep = {"snippet": ["channelTitle", "title", "description", "tags", "publishedAt"],
                            "statistics": ["viewCount", "likeCount", "favouriteCount", "commentCount"],
                            "contentDetails": ["duration", "definition", "caption"]}
            video_info = {}
            video_info["video_id"] = video["id"]

            for k in stats_to_keep.keys():
                for v in stats_to_keep[k]:
                    try:
                        video_info[v] = video[k][v]
                    except:
                        video_info[v] = None
            
            all_video_info.append(video_info)

    return pd.DataFrame(all_video_info)

In [125]:
cc_df = get_video_stats(youtube, video_ids)
cc_df

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption
0,WzOrF5W4l3Q,CrashCourse,Photosynthesis and Cellular Respiration: Crash...,"Plants and trees may seem pretty passive, but ...","[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-22T16:00:21Z,5525,467,,23,PT13M,hd,true
1,pqosCR6J2qo,CrashCourse,What Do These Creepy Plant Mouths Do? (Plant T...,"Plants—they’re just like us! Well, not exactly...","[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-15T16:00:38Z,31715,1527,,52,PT12M44S,hd,true
2,xOLcZMw0hd4,CrashCourse,The Scientific Method: Crash Course Biology #2,Science offers a way of discovering and unders...,"[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-13T16:00:44Z,29941,1502,,87,PT15M9S,hd,true
3,tZE_fQFK8EY,CrashCourse,Introduction to Biology: Crash Course Biology #1,Biology is the study of life—a four-letter wor...,"[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-06T16:00:39Z,70140,3761,,266,PT13M27S,hd,true
4,y9BLCfcUcFg,CrashCourse,Plant Cells & Hormones: Crash Course Botany #3,"At first glance, plant and animal cells have a...","[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-01T16:03:13Z,56117,3443,,135,PT12M58S,hd,true
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1470,HVT3Y3_gHGg,CrashCourse,Water - Liquid Awesome: Crash Course Biology #2,Hank teaches us why water is one of the most f...,"[water, hydrogen, oxygen, molecule, covalent b...",2012-02-06T22:10:10Z,4839919,51023,,3825,PT11M17S,hd,true
1471,n7ndRwqJYDM,CrashCourse,Indus Valley Civilization: Crash Course World ...,In which John Green teaches you about the Indu...,"[John Green, Crashcourse, Documentary, Valley,...",2012-02-02T20:18:11Z,7916781,85497,,8047,PT9M35S,hd,true
1472,QnQe0xW_JY4,CrashCourse,Carbon... SO SIMPLE: Crash Course Biology #1,Check out our new-and-improved Crash Course Bi...,"[biology, crashcourse, gilbert lewis, carbon, ...",2012-01-30T18:53:06Z,7826885,78202,,7809,PT11M57S,hd,true
1473,Yocja_N5s1I,CrashCourse,The Agricultural Revolution: Crash Course Worl...,In which John Green investigates the dawn of h...,"[John Green, history, agriculture, ancient, pr...",2012-01-26T20:12:01Z,15377422,152918,,12247,PT11M11S,hd,true


## Data Pre-processing 

In [126]:
cc_df_copy = cc_df.copy()

### Get course information

In [134]:
# Function to identify the type of video it is
# We have video types: "preview", contains "Crash Course" and "#", contains: "How to College" and "Crash Course"
def grab_video_type(title):
    if re.search("Preview"):
        return "preview"
    elif re.search("How to College"):
        return "college"
    elif re.search("College Foundations"):
        return "collegeFoundations"
    elif re.search("Study Hall"):
        return "studyHall"
    elif re.search("#"):
        return "regular"
    else:
        return "none"

In [135]:
# Function to parse out information if it is a regular crashcourse video (og) 
def regular_title_info(id, x):
    # initialize dictionary
    dict={}

    # Split the title by the words "Crash Course"
    title_parts = x.split("Crash Course")

    # Grab the episode title and get rid of the ":"
    episode = title_parts[0].replace(":","")
    dict["episode_title"] = episode

    # Split the course and the episode number along the "#"
    try:
        course_info = title_parts[1].split("#")
        dict["course"] = course_info[0]    
        dict["episode_num"] = course_info[1]
    except:
        dict["course"] = title_parts
        dict["episode_num"] = None

    # Grab the length of the raw title of the episode 
    dict["episode_title_len"] = len(episode)

    # Include video_id
    dict["video_id"] = id

    # include course type
    dict["video_type"] = "regular course"

    return dict

In [None]:
# Function to grab video information if type is preview
def preview_title_info(id, x):
    # initialize dictionary
    dict = {}

    dict["episode_title"] = x
    dict["course"] = x.replace("Preview", "").replace(":","")
    dict["episode_num"] = None
    dict["episode_title_len"] = len(x.replace("Preview", "").replace(":",""))
    dict["video_id"] = id
    dict["video_type"] = "preview"

    return dict

In [131]:
def grab_course(video_id, title):
    title_info = []

    # Loop through the titles and make a dicitonary of each one
    for index, x in enumerate(title):

        video_type = grab_video_type(x)

        if video_type == "regular":
            dict = regular_title_info(video_id[index],title)
        elif video_type == "preview":
            dict = 
            

        # Add the dictionary to the list
        title_info.append(dict)

    # Return information as a dataframe
    return pd.DataFrame(title_info)

In [132]:
# Apply grab_course_info function on the title column of the dataframe
title_df = grab_course(cc_df_copy["video_id"], cc_df_copy["title"])
title_df

IndexError: list index out of range

In [79]:
# Concatenate the course information (title_df) with the cc_df_copy
cc_bycourse_df = pd.merge(cc_df_copy, title_df, on="video_id")

# Drop the title column
cc_bycourse_df.drop(columns=["title"], inplace=True)
cc_bycourse_df

Unnamed: 0,video_id,channelTitle,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption,episode,course,episode_num,episode_title_len
0,pqosCR6J2qo,CrashCourse,"Plants—they’re just like us! Well, not exactly...","[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-15T16:00:38Z,31135,1502,,52,PT12M44S,hd,True,What Do These Creepy Plant Mouths Do? (Plant T...,Botany,4,54
1,xOLcZMw0hd4,CrashCourse,Science offers a way of discovering and unders...,"[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-13T16:00:44Z,29683,1490,,87,PT15M9S,hd,True,The Scientific Method,Biology,2,22
2,tZE_fQFK8EY,CrashCourse,Biology is the study of life—a four-letter wor...,"[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-06T16:00:39Z,69811,3753,,266,PT13M27S,hd,True,Introduction to Biology,Biology,1,24
3,y9BLCfcUcFg,CrashCourse,"At first glance, plant and animal cells have a...","[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-01T16:03:13Z,55940,3433,,135,PT12M58S,hd,True,Plant Cells & Hormones,Botany,3,23
4,9AEzixu_xZk,CrashCourse,"When you eat a salad for lunch, you’re digging...","[vlogbrothers, Crash Course, crashcourse, educ...",2023-05-25T16:00:10Z,77280,5493,,247,PT16M30S,hd,True,What Are Plants Made Of?,Botany,2,25


### Check if amy columns have null values

In [65]:
cc_bycourse_df.isnull().any()

video_id             False
channelTitle         False
description          False
tags                 False
publishedAt          False
viewCount            False
likeCount            False
favouriteCount        True
commentCount         False
duration             False
definition           False
caption              False
episode              False
course               False
episode_num          False
episode_title_len    False
dtype: bool

### Check Column Types

In [67]:
cc_bycourse_df.dtypes

video_id             object
channelTitle         object
description          object
tags                 object
publishedAt          object
viewCount            object
likeCount            object
favouriteCount       object
commentCount         object
duration             object
definition           object
caption              object
episode              object
course               object
episode_num          object
episode_title_len     int64
dtype: object

In [80]:
# Convert "viewCount", "likeCount", "favouriteCount", "commentCount" to numeric
numeric_cols = ["viewCount", "likeCount", "favouriteCount", "commentCount"]
cc_bycourse_df[numeric_cols] = cc_bycourse_df[numeric_cols].apply(pd.to_numeric, errors="coerce", axis=1)

In [69]:
cc_bycourse_df.dtypes

video_id              object
channelTitle          object
description           object
tags                  object
publishedAt           object
viewCount            float64
likeCount            float64
favouriteCount       float64
commentCount         float64
duration              object
definition            object
caption               object
episode               object
course                object
episode_num           object
episode_title_len      int64
dtype: object

### Convert duration column to seconds

In [71]:
import isodate

In [81]:
cc_bycourse_df["duration"] = cc_bycourse_df["duration"].apply(lambda x: isodate.parse_duration(x))
cc_bycourse_df["duration"] = cc_bycourse_df["duration"].astype("timedelta64[s]")
cc_bycourse_df

Unnamed: 0,video_id,channelTitle,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption,episode,course,episode_num,episode_title_len
0,pqosCR6J2qo,CrashCourse,"Plants—they’re just like us! Well, not exactly...","[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-15T16:00:38Z,31135.0,1502.0,,52.0,764.0,hd,True,What Do These Creepy Plant Mouths Do? (Plant T...,Botany,4,54
1,xOLcZMw0hd4,CrashCourse,Science offers a way of discovering and unders...,"[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-13T16:00:44Z,29683.0,1490.0,,87.0,909.0,hd,True,The Scientific Method,Biology,2,22
2,tZE_fQFK8EY,CrashCourse,Biology is the study of life—a four-letter wor...,"[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-06T16:00:39Z,69811.0,3753.0,,266.0,807.0,hd,True,Introduction to Biology,Biology,1,24
3,y9BLCfcUcFg,CrashCourse,"At first glance, plant and animal cells have a...","[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-01T16:03:13Z,55940.0,3433.0,,135.0,778.0,hd,True,Plant Cells & Hormones,Botany,3,23
4,9AEzixu_xZk,CrashCourse,"When you eat a salad for lunch, you’re digging...","[vlogbrothers, Crash Course, crashcourse, educ...",2023-05-25T16:00:10Z,77280.0,5493.0,,247.0,990.0,hd,True,What Are Plants Made Of?,Botany,2,25


### Grab the number of tags for each video

In [82]:
# If there are no tags, then put None
cc_bycourse_df["tag_count"] = cc_bycourse_df["tags"].apply(lambda x: 0 if x is None else len(x))
cc_bycourse_df

Unnamed: 0,video_id,channelTitle,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption,episode,course,episode_num,episode_title_len,number_of_tags
0,pqosCR6J2qo,CrashCourse,"Plants—they’re just like us! Well, not exactly...","[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-15T16:00:38Z,31135.0,1502.0,,52.0,764.0,hd,True,What Do These Creepy Plant Mouths Do? (Plant T...,Botany,4,54,7
1,xOLcZMw0hd4,CrashCourse,Science offers a way of discovering and unders...,"[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-13T16:00:44Z,29683.0,1490.0,,87.0,909.0,hd,True,The Scientific Method,Biology,2,22,11
2,tZE_fQFK8EY,CrashCourse,Biology is the study of life—a four-letter wor...,"[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-06T16:00:39Z,69811.0,3753.0,,266.0,807.0,hd,True,Introduction to Biology,Biology,1,24,9
3,y9BLCfcUcFg,CrashCourse,"At first glance, plant and animal cells have a...","[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-01T16:03:13Z,55940.0,3433.0,,135.0,778.0,hd,True,Plant Cells & Hormones,Botany,3,23,22
4,9AEzixu_xZk,CrashCourse,"When you eat a salad for lunch, you’re digging...","[vlogbrothers, Crash Course, crashcourse, educ...",2023-05-25T16:00:10Z,77280.0,5493.0,,247.0,990.0,hd,True,What Are Plants Made Of?,Botany,2,25,21


### Add day of the week published 

In [91]:
from datetime import datetime

In [94]:
cc_bycourse_df["publishedAt"] = cc_bycourse_df["publishedAt"].apply(lambda x: datetime.fromisoformat(x[:-1]))


In [95]:
cc_bycourse_df["publishedDate"] = cc_bycourse_df["publishedAt"].apply(lambda x: x.date())
cc_bycourse_df["publishedTime"] = cc_bycourse_df["publishedAt"].apply(lambda x: x.time())
cc_bycourse_df.drop(columns=["publishedAt"], inplace=True)
cc_bycourse_df

Unnamed: 0,video_id,channelTitle,description,tags,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption,episode,course,episode_num,episode_title_len,number_of_tags,publishedDate,publishedTime
0,pqosCR6J2qo,CrashCourse,"Plants—they’re just like us! Well, not exactly...","[vlogbrothers, Crash Course, crashcourse, educ...",31135.0,1502.0,,52.0,764.0,hd,True,What Do These Creepy Plant Mouths Do? (Plant T...,Botany,4,54,7,2023-06-15,16:00:38
1,xOLcZMw0hd4,CrashCourse,Science offers a way of discovering and unders...,"[vlogbrothers, Crash Course, crashcourse, educ...",29683.0,1490.0,,87.0,909.0,hd,True,The Scientific Method,Biology,2,22,11,2023-06-13,16:00:44
2,tZE_fQFK8EY,CrashCourse,Biology is the study of life—a four-letter wor...,"[vlogbrothers, Crash Course, crashcourse, educ...",69811.0,3753.0,,266.0,807.0,hd,True,Introduction to Biology,Biology,1,24,9,2023-06-06,16:00:39
3,y9BLCfcUcFg,CrashCourse,"At first glance, plant and animal cells have a...","[vlogbrothers, Crash Course, crashcourse, educ...",55940.0,3433.0,,135.0,778.0,hd,True,Plant Cells & Hormones,Botany,3,23,22,2023-06-01,16:03:13
4,9AEzixu_xZk,CrashCourse,"When you eat a salad for lunch, you’re digging...","[vlogbrothers, Crash Course, crashcourse, educ...",77280.0,5493.0,,247.0,990.0,hd,True,What Are Plants Made Of?,Botany,2,25,21,2023-05-25,16:00:10


### reorder columns

In [97]:
cc_bycourse_df.columns

Index(['video_id', 'channelTitle', 'description', 'tags', 'viewCount',
       'likeCount', 'favouriteCount', 'commentCount', 'duration', 'definition',
       'caption', 'episode', 'course', 'episode_num', 'episode_title_len',
       'number_of_tags', 'publishedDate', 'publishedTime'],
      dtype='object')

In [None]:
df_final = cc_bycourse_df[['video_id', 'channelTitle', 'course', 'episode_title','episode_num', 'episode_title_len','description', 'tags', 'number_of_tags', 'publishedDate', 'publishedTime', 'viewCount',
       'likeCount', 'favouriteCount', 'commentCount', 'duration', 'definition',
       'caption']]