In [12]:
import pandas as pd
from googleapiclient.discovery import build
from IPython.display import JSON
import json
import regex as re

### Grab API Key

In [13]:
from api import api_key

In [14]:
channel_ids = ["UCX6b17PVsYBQ0ip5gyeme-Q"]

In [15]:
def get_channel_stats(channel_data):
    channels = []
    for item in channel_data["items"]:
        dict = {}
        dict["channelName"] = item["snippet"]["title"]
        dict["publishedDate"] = item["snippet"]["publishedAt"]
        dict["subscribers"] = item["statistics"]["subscriberCount"]
        dict["views"] = item["statistics"]["viewCount"]
        dict["totalVideos"] = item["statistics"]["videoCount"]
        dict["playlistId"] = item["contentDetails"]["relatedPlaylists"]['uploads']

        channels.append(dict)
    return channels


In [16]:
api_service_name = "youtube"
api_version = "v3"

# Get credentials and create an API client
youtube = build(
    api_service_name, api_version, developerKey=api_key)

request = youtube.channels().list(
    part="snippet,contentDetails,statistics",
    id = ",".join(channel_ids)
)
response = request.execute()


In [17]:

print(json.dumps(response, indent=2))

{
  "kind": "youtube#channelListResponse",
  "etag": "ehByGjtoIy0PktRTfldf1V6Y6Ls",
  "pageInfo": {
    "totalResults": 1,
    "resultsPerPage": 5
  },
  "items": [
    {
      "kind": "youtube#channel",
      "etag": "fBpaDocjOdf1gaDwns4Qu-jysuU",
      "id": "UCX6b17PVsYBQ0ip5gyeme-Q",
      "snippet": {
        "title": "CrashCourse",
        "description": "At Crash Course, we believe that high-quality educational videos should be available to everyone for free! \n\nSubscribe for weekly videos from our current courses! Right now, we're producing Climate & Energy. The Crash Course team has produced more than 45 courses on a wide variety of subjects, including organic chemistry, literature, world history, biology, philosophy, theater, ecology, and many more!  We also recently teamed up with Arizona State University to bring you more courses on the Study Hall channel.\n\nHelp support Crash Course at Patreon.com/CrashCourse.",
        "customUrl": "@crashcourse",
        "publishedAt":

In [18]:
channel_stats = get_channel_stats(response)

In [19]:
channel_stats[0]

{'channelName': 'CrashCourse',
 'publishedDate': '2006-05-20T02:43:42Z',
 'subscribers': '14800000',
 'views': '1828997009',
 'totalVideos': '1475',
 'playlistId': 'UUX6b17PVsYBQ0ip5gyeme-Q'}

In [20]:
def get_video_ids(youtube, playlist_id):
    video_ids=[]

    request = youtube.playlistItems().list(
            part="snippet,contentDetails",
            playlistId=playlist_id,
            maxResults = 50)

    response = request.execute()

    for item in response["items"]:
            video_ids.append(item["contentDetails"]["videoId"])

    next_page_token = response.get('nextPageToken')
    while next_page_token is not None:
        request = youtube.playlistItems().list(
            part="snippet,contentDetails",
            playlistId=playlist_id,
            maxResults = 50,
            pageToken = next_page_token
        )

        response = request.execute()

        for item in response["items"]:
            video_ids.append(item["contentDetails"]["videoId"])

        next_page_token = response.get("nextPageToken")

    return video_ids

In [21]:
video_ids = get_video_ids(youtube,channel_stats[0]["playlistId"])
len(video_ids)

1475

### Get video stats

In [22]:
def get_video_stats(youtube, video_ids):

    all_video_info = []

    for i in range(0, len(video_ids), 50):

        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=",".join(video_ids[i:i+50])
        )
        response = request.execute()

        

        for video in response["items"]:
            stats_to_keep = {"snippet": ["channelTitle", "title", "description", "tags", "publishedAt"],
                            "statistics": ["viewCount", "likeCount", "favouriteCount", "commentCount"],
                            "contentDetails": ["duration", "definition", "caption"]}
            video_info = {}
            video_info["video_id"] = video["id"]

            for k in stats_to_keep.keys():
                for v in stats_to_keep[k]:
                    try:
                        video_info[v] = video[k][v]
                    except:
                        video_info[v] = None
            
            all_video_info.append(video_info)

    return pd.DataFrame(all_video_info)

In [23]:
cc_df = get_video_stats(youtube, video_ids)
cc_df

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption
0,WzOrF5W4l3Q,CrashCourse,Photosynthesis and Cellular Respiration: Crash...,"Plants and trees may seem pretty passive, but ...","[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-22T16:00:21Z,13760,835,,51,PT13M,hd,true
1,pqosCR6J2qo,CrashCourse,What Do These Creepy Plant Mouths Do? (Plant T...,"Plants—they’re just like us! Well, not exactly...","[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-15T16:00:38Z,32496,1570,,52,PT12M44S,hd,true
2,xOLcZMw0hd4,CrashCourse,The Scientific Method: Crash Course Biology #2,Science offers a way of discovering and unders...,"[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-13T16:00:44Z,30368,1519,,87,PT15M9S,hd,true
3,tZE_fQFK8EY,CrashCourse,Introduction to Biology: Crash Course Biology #1,Biology is the study of life—a four-letter wor...,"[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-06T16:00:39Z,70777,3775,,266,PT13M27S,hd,true
4,y9BLCfcUcFg,CrashCourse,Plant Cells & Hormones: Crash Course Botany #3,"At first glance, plant and animal cells have a...","[vlogbrothers, Crash Course, crashcourse, educ...",2023-06-01T16:03:13Z,56483,3462,,135,PT12M58S,hd,true
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1470,HVT3Y3_gHGg,CrashCourse,Water - Liquid Awesome: Crash Course Biology #2,Hank teaches us why water is one of the most f...,"[water, hydrogen, oxygen, molecule, covalent b...",2012-02-06T22:10:10Z,4840139,51024,,3825,PT11M17S,hd,true
1471,n7ndRwqJYDM,CrashCourse,Indus Valley Civilization: Crash Course World ...,In which John Green teaches you about the Indu...,"[John Green, Crashcourse, Documentary, Valley,...",2012-02-02T20:18:11Z,7917297,85507,,8047,PT9M35S,hd,true
1472,QnQe0xW_JY4,CrashCourse,Carbon... SO SIMPLE: Crash Course Biology #1,Check out our new-and-improved Crash Course Bi...,"[biology, crashcourse, gilbert lewis, carbon, ...",2012-01-30T18:53:06Z,7827237,78209,,7809,PT11M57S,hd,true
1473,Yocja_N5s1I,CrashCourse,The Agricultural Revolution: Crash Course Worl...,In which John Green investigates the dawn of h...,"[John Green, history, agriculture, ancient, pr...",2012-01-26T20:12:01Z,15378590,152936,,12247,PT11M11S,hd,true


## Load raw data to CSV 

In [37]:
cc_df.to_csv("data/crashcourse_raw.csv")

## Data Pre-processing 

In [204]:
cc_df_copy = pd.read_csv("data/crashcourse_raw.csv")

### Clean course information

In [205]:
# Fix CrashCourse
def fix_cc(title):
    if re.search(r"CrashCourse",title):
        return title.replace("CrashCourse","Crash Course")
    return title

In [206]:
cc_df_copy["title"] = cc_df_copy["title"].apply(lambda x: fix_cc(x))

In [207]:
# Fix "Ai" versus "AI"
def fix_AI(title):
    if re.search(r"Ai",title):
        return title.replace("Ai","AI")
    return title

In [208]:
cc_df_copy["title"] = cc_df_copy["title"].apply(lambda x: fix_AI(x))

### Get course information

In [220]:
# Function to parse out information if it is a regular crashcourse video (og) 
def regular_title_info(id, x):
    # initialize dictionary
    dict={}

    # Split the title by the words "Crash Course"
    if "Crash Course" in x:
        title_parts = x.split("Crash Course") 
    else:
        title_parts = x.split(":")

    # Grab the episode title and get rid of the ":"
    episode = title_parts[0].replace(":","")
    dict["episode_title"] = episode.strip()

    # Split the course and the episode number along the "#"
    try:
        course_info = (title_parts[1].replace("CC","")).split("#")
        course = course_info[0].strip()

        # Deal with Lily extra space:
        if re.search("Lily Gladstone", x):
            course = "Film Production with Lily Gladstone"

        # Deal with Business-Entrepreneurship inconsistencies 
        if course == "Business - Entrepreneurship" or course == "Entrepreneurship":
            course = "Business Entrepreneurship"

        if course == "Business Soft Skills" or course == "Business: Soft Skills":
            course = "Business - Soft Skills"

        
        dict["course"] = course   
        dict["episode_num"] = course_info[1].strip()
        
    except:
        dict["course"] = title_parts
        dict["episode_num"] = None

    # Grab the length of the raw title of the episode 
    dict["episode_title_len"] = len(episode)

    # Include video_id
    dict["video_id"] = id

    # Include Arizona teamup
    dict["with_Arizona"] = False

    # include course type
    dict["video_type"] = "regular course"

    return dict

In [210]:
# Function to grab video information if type is preview
def preview_title_info(id, x):
    # initialize dictionary
    dict = {}

    episode = x.replace("Preview", "").replace(":","").strip()

    dict["episode_title"] = episode
    dict["course"] = "Preview"
    dict["episode_num"] = None
    dict["episode_title_len"] = len(episode)
    dict["video_id"] = id
    dict["with_Arizona"] = False
    dict["video_type"] = "preview"

    return dict

In [211]:
# Function to grab video information if type "How to College":
def college_title_info(id,x):
    # initialize dictionary
    dict = {}
    episode_title =  x.replace("How to College", "").replace("Crash Course","").strip()

    dict["episode_title"] = episode_title
    dict["course"] = "How to College"
    dict["episode_num"] = None
    dict["episode_title_len"] = len(episode_title)
    dict["video_id"] = id
    dict["with_Arizona"] = False
    dict["video_type"] = "How to College"

    return dict

In [212]:
# Function to grab video information if type is outtakes
def outtakes_title_info(id, x):
    # initialize dictionary
    dict = {}

    x = x.replace("Out-Takes","Outtakes")
    episode = x.replace("Crash Course", "").replace(":","").strip()
    
    dict["episode_title"] = episode
    dict["course"] = "outtakes"
    dict["episode_num"] = None
    dict["episode_title_len"] = len(episode)
    dict["video_id"] = id
    dict["with_Arizona"] = False
    dict["video_type"] = "outtakes"

    return dict

In [213]:
# Function to grab any exceptions to the titles
def other_title_info(id, x):
    dict = {}

    dict["episode_title"] = x
    dict["course"] = "unknown"
    dict["episode_num"] = None
    dict["episode_title_len"] = len(x)
    dict["video_id"] = id
    dict["with_Arizona"] = False
    dict["video_type"] = "unknown"

    return dict

In [214]:
# Function to grab any exceptions to the titles
def office_title_info(id, x):
    dict = {}

    episode =  x.replace("Crash Course","").replace("Office Hourse","").replace(":","").strip()
    dict["episode_title"] = episode
    dict["course"] = "Office Hours"
    dict["episode_num"] = None
    dict["episode_title_len"] = len(episode)
    dict["video_id"] = id
    dict["with_Arizona"] = False
    dict["video_type"] = "office hours"

    return dict

In [215]:
# Function to grab the Crash Course Literature
def lit_title_info(id, x):
    dict = {}

    # Inconsistent Naming even within the same season (some contain ":" and other's contains "-")
    if re.search(":",x):
        episode = x.split(":")
    elif re.search("-",x):
        episode = x.split("-")
    else: 
        episode = x.split("!")
        episode[0] = "Disease!"
     
    episode_num = episode[1].replace("Crash Course Literature","").replace("Crash Course World History","").replace("Crash Course Big History","")
    dict["episode_title"] = episode[0]
    
    # Figure out which course it is a subsequent season of
    if re.search(r"Crash Course Literarture",x):
        dict["course"] = "Literature"
    elif re.search(r"Crash Course World History",x):
        dict["course"] = "World History"
    else:
        dict["course"] = "Big History"

    dict["episode_num"] = episode_num
    dict["episode_title_len"] = len(episode[0])
    dict["video_id"] = id
    dict["with_Arizona"] = False
    dict["video_type"] = "regular course"

    return dict

In [216]:
def grab_course(video_id, title):
    title_info = []

    # Loop through the titles and make a dicitonary of each one
    for index, x in enumerate(title):

        if "Government & Politics" in x:
            x=x.replace("&","and")

        if re.search("Preview",x): #FINISHED
            dict = preview_title_info(video_id[index], x)

        elif re.search("Outtakes",x) or re.search("Out-Takes",x):
            dict = outtakes_title_info(video_id[index],x)
            
        elif re.search("How to College",x):
            dict = college_title_info(video_id[index], x)

        elif re.search("Office Hours",x):
            dict = office_title_info(video_id[index],x)

        elif re.search("#",x): #FINISHED
            dict = regular_title_info(video_id[index], x)

        elif re.search("Crash Course Literature",x) or re.search("Crash Course World History",x) or re.search("Crash Course Big History",x):
            dict = lit_title_info(video_id[index],x)
            
        else:
            dict = other_title_info(video_id[index], x)
            

        # Add the dictionary to the list
        title_info.append(dict)

    # Return information as a dataframe
    return pd.DataFrame(title_info)

In [221]:
# Apply grab_course_info function on the title column of the dataframe
title_df = grab_course(cc_df_copy["video_id"], cc_df_copy["title"])
title_df

Unnamed: 0,episode_title,course,episode_num,episode_title_len,video_id,with_Arizona,video_type
0,Photosynthesis and Cellular Respiration,Botany,5,40,WzOrF5W4l3Q,False,regular course
1,What Do These Creepy Plant Mouths Do? (Plant Tissues),Botany,4,54,pqosCR6J2qo,False,regular course
2,The Scientific Method,Biology,2,22,xOLcZMw0hd4,False,regular course
3,Introduction to Biology,Biology,1,24,tZE_fQFK8EY,False,regular course
4,Plant Cells & Hormones,Botany,3,23,y9BLCfcUcFg,False,regular course
...,...,...,...,...,...,...,...
1470,Water - Liquid Awesome,Biology,2,23,HVT3Y3_gHGg,False,regular course
1471,Indus Valley Civilization,World History,2,26,n7ndRwqJYDM,False,regular course
1472,Carbon... SO SIMPLE,Biology,1,20,QnQe0xW_JY4,False,regular course
1473,The Agricultural Revolution,World History,1,28,Yocja_N5s1I,False,regular course


### Check for what is in the unknowns

In [222]:
pd.set_option('display.max_colwidth', 1000)

In [223]:
title_df[title_df["course"]=="unknown"]

Unnamed: 0,episode_title,course,episode_num,episode_title_len,video_id,with_Arizona,video_type
6,NEW Crash Course Biology Series!,unknown,,32,PWGBqskV1UQ,False,unknown
232,Covid-19 and Public Health: A Message from Crash Course,unknown,,55,G4rcv3p7AYg,False,unknown
262,Announcing the Crash Course App!,unknown,,32,JfMmzx67Krw,False,unknown
295,What's new with Crash Course,unknown,,28,seLtYzXJ_YE,False,unknown
428,A History of Crash Course,unknown,,25,1Q5cPfbmSD8,False,unknown
654,Changes to our Patreon,unknown,,22,JeOimlgs5tQ,False,unknown
837,A Note on CC Human Geography,unknown,,28,yvFStAP7Uko,False,unknown
960,Explore The Solar System: 360 Degree Interactive Tour!,unknown,,54,0ytyMKa8aps,False,unknown
1068,Crash Course Economics Intro!,unknown,,29,9I_-ADGrKQo,False,unknown
1126,Become a Crash Course Patron!,unknown,,29,VO_1VbQADW8,False,unknown


### Concatenate the course information df with the rest of cc_df_copy

In [224]:
pd.set_option('display.max_colwidth', 100)

In [225]:
# Concatenate the course information (title_df) with the cc_df_copy
cc_bycourse_df = pd.merge(cc_df_copy, title_df, on="video_id")

# Drop the title column
cc_bycourse_df.drop(columns=["title"], inplace=True)
cc_bycourse_df.head(5)

Unnamed: 0.1,Unnamed: 0,video_id,channelTitle,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption,episode_title,course,episode_num,episode_title_len,with_Arizona,video_type
0,0,WzOrF5W4l3Q,CrashCourse,"Plants and trees may seem pretty passive, but behind the scenes, their cells are working hard to...","['vlogbrothers', 'Crash Course', 'crashcourse', 'education', 'botany', 'photosynthesis', 'mitoch...",2023-06-22T16:00:21Z,13760.0,835.0,,51.0,PT13M,hd,True,Photosynthesis and Cellular Respiration,Botany,5,40,False,regular course
1,1,pqosCR6J2qo,CrashCourse,"Plants—they’re just like us! Well, not exactly, but they do have skin and hair like us…even if t...","['vlogbrothers', 'Crash Course', 'crashcourse', 'education', 'botany', 'photosynthesis', 'plant ...",2023-06-15T16:00:38Z,32496.0,1570.0,,52.0,PT12M44S,hd,True,What Do These Creepy Plant Mouths Do? (Plant Tissues),Botany,4,54,False,regular course
2,2,xOLcZMw0hd4,CrashCourse,"Science offers a way of discovering and understanding the world around us, driven by questions a...","['vlogbrothers', 'Crash Course', 'crashcourse', 'education', 'Sammy Ramsey', 'Dr. Sammy', 'Dr. B...",2023-06-13T16:00:44Z,30368.0,1519.0,,87.0,PT15M9S,hd,True,The Scientific Method,Biology,2,22,False,regular course
3,3,tZE_fQFK8EY,CrashCourse,Biology is the study of life—a four-letter word that connects you to 4 billion years worth of fa...,"['vlogbrothers', 'Crash Course', 'crashcourse', 'education', 'Sammy Ramsey', 'Dr. Sammy', 'Dr. B...",2023-06-06T16:00:39Z,70777.0,3775.0,,266.0,PT13M27S,hd,True,Introduction to Biology,Biology,1,24,False,regular course
4,4,y9BLCfcUcFg,CrashCourse,"At first glance, plant and animal cells have a lot in common: they’re both highly organized, kee...","['vlogbrothers', 'Crash Course', 'crashcourse', 'education', 'crash course', 'botany', 'plants',...",2023-06-01T16:03:13Z,56483.0,3462.0,,135.0,PT12M58S,hd,True,Plant Cells & Hormones,Botany,3,23,False,regular course


### Check if amy columns have null values

In [226]:
cc_bycourse_df.isnull().any()

Unnamed: 0           False
video_id             False
channelTitle         False
description          False
tags                  True
publishedAt          False
viewCount             True
likeCount             True
favouriteCount        True
commentCount          True
duration             False
definition           False
caption              False
episode_title        False
course               False
episode_num           True
episode_title_len    False
with_Arizona         False
video_type           False
dtype: bool

### Check Column Types

In [227]:
cc_bycourse_df.dtypes

Unnamed: 0             int64
video_id              object
channelTitle          object
description           object
tags                  object
publishedAt           object
viewCount            float64
likeCount            float64
favouriteCount       float64
commentCount         float64
duration              object
definition            object
caption                 bool
episode_title         object
course                object
episode_num           object
episode_title_len      int64
with_Arizona            bool
video_type            object
dtype: object

In [228]:
# Convert "viewCount", "likeCount", "favouriteCount", "commentCount" to numeric
numeric_cols = ["viewCount", "likeCount", "favouriteCount", "commentCount", "episode_num"]
cc_bycourse_df[numeric_cols] = cc_bycourse_df[numeric_cols].apply(pd.to_numeric, errors="coerce", axis=1)

In [229]:
cc_bycourse_df.dtypes

Unnamed: 0             int64
video_id              object
channelTitle          object
description           object
tags                  object
publishedAt           object
viewCount            float64
likeCount            float64
favouriteCount       float64
commentCount         float64
duration              object
definition            object
caption                 bool
episode_title         object
course                object
episode_num          float64
episode_title_len      int64
with_Arizona            bool
video_type            object
dtype: object

### Convert duration column to seconds

In [230]:
import isodate

In [231]:
cc_bycourse_df["duration"] = cc_bycourse_df["duration"].apply(lambda x: isodate.parse_duration(x))
cc_bycourse_df["duration"] = cc_bycourse_df["duration"].astype("timedelta64[s]")
cc_bycourse_df.head(5)

Unnamed: 0.1,Unnamed: 0,video_id,channelTitle,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption,episode_title,course,episode_num,episode_title_len,with_Arizona,video_type
0,0,WzOrF5W4l3Q,CrashCourse,"Plants and trees may seem pretty passive, but behind the scenes, their cells are working hard to...","['vlogbrothers', 'Crash Course', 'crashcourse', 'education', 'botany', 'photosynthesis', 'mitoch...",2023-06-22T16:00:21Z,13760.0,835.0,,51.0,780.0,hd,True,Photosynthesis and Cellular Respiration,Botany,5.0,40,False,regular course
1,1,pqosCR6J2qo,CrashCourse,"Plants—they’re just like us! Well, not exactly, but they do have skin and hair like us…even if t...","['vlogbrothers', 'Crash Course', 'crashcourse', 'education', 'botany', 'photosynthesis', 'plant ...",2023-06-15T16:00:38Z,32496.0,1570.0,,52.0,764.0,hd,True,What Do These Creepy Plant Mouths Do? (Plant Tissues),Botany,4.0,54,False,regular course
2,2,xOLcZMw0hd4,CrashCourse,"Science offers a way of discovering and understanding the world around us, driven by questions a...","['vlogbrothers', 'Crash Course', 'crashcourse', 'education', 'Sammy Ramsey', 'Dr. Sammy', 'Dr. B...",2023-06-13T16:00:44Z,30368.0,1519.0,,87.0,909.0,hd,True,The Scientific Method,Biology,2.0,22,False,regular course
3,3,tZE_fQFK8EY,CrashCourse,Biology is the study of life—a four-letter word that connects you to 4 billion years worth of fa...,"['vlogbrothers', 'Crash Course', 'crashcourse', 'education', 'Sammy Ramsey', 'Dr. Sammy', 'Dr. B...",2023-06-06T16:00:39Z,70777.0,3775.0,,266.0,807.0,hd,True,Introduction to Biology,Biology,1.0,24,False,regular course
4,4,y9BLCfcUcFg,CrashCourse,"At first glance, plant and animal cells have a lot in common: they’re both highly organized, kee...","['vlogbrothers', 'Crash Course', 'crashcourse', 'education', 'crash course', 'botany', 'plants',...",2023-06-01T16:03:13Z,56483.0,3462.0,,135.0,778.0,hd,True,Plant Cells & Hormones,Botany,3.0,23,False,regular course


### Grab the number of tags for each video

In [234]:
# If there are no tags, then put None
# Convert the 'tags' column to a list of strings
def grab_tagCount(tags):
    try:
        t = tags.replace("[","").replace("]","")
        l = t.split(",")
        return len(l)
    except:
        return None

cc_bycourse_df["tag_Count"] = cc_bycourse_df["tags"].apply(lambda x: grab_tagCount(x))
cc_bycourse_df.head(2)

Unnamed: 0.1,Unnamed: 0,video_id,channelTitle,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption,episode_title,course,episode_num,episode_title_len,with_Arizona,video_type,tag_Count
0,0,WzOrF5W4l3Q,CrashCourse,"Plants and trees may seem pretty passive, but behind the scenes, their cells are working hard to...","['vlogbrothers', 'Crash Course', 'crashcourse', 'education', 'botany', 'photosynthesis', 'mitoch...",2023-06-22T16:00:21Z,13760.0,835.0,,51.0,780.0,hd,True,Photosynthesis and Cellular Respiration,Botany,5.0,40,False,regular course,15.0
1,1,pqosCR6J2qo,CrashCourse,"Plants—they’re just like us! Well, not exactly, but they do have skin and hair like us…even if t...","['vlogbrothers', 'Crash Course', 'crashcourse', 'education', 'botany', 'photosynthesis', 'plant ...",2023-06-15T16:00:38Z,32496.0,1570.0,,52.0,764.0,hd,True,What Do These Creepy Plant Mouths Do? (Plant Tissues),Botany,4.0,54,False,regular course,7.0


### Add day of the week published 

In [235]:
from datetime import datetime

In [236]:
cc_bycourse_df["publishedAt"] = cc_bycourse_df["publishedAt"].apply(lambda x: datetime.fromisoformat(x[:-1]))


In [237]:
cc_bycourse_df["publishedDate"] = cc_bycourse_df["publishedAt"].apply(lambda x: x.date())
cc_bycourse_df["publishedTime"] = cc_bycourse_df["publishedAt"].apply(lambda x: x.time())
cc_bycourse_df.drop(columns=["publishedAt"], inplace=True)
cc_bycourse_df.head(2)

Unnamed: 0.1,Unnamed: 0,video_id,channelTitle,description,tags,viewCount,likeCount,favouriteCount,commentCount,duration,...,caption,episode_title,course,episode_num,episode_title_len,with_Arizona,video_type,tag_Count,publishedDate,publishedTime
0,0,WzOrF5W4l3Q,CrashCourse,"Plants and trees may seem pretty passive, but behind the scenes, their cells are working hard to...","['vlogbrothers', 'Crash Course', 'crashcourse', 'education', 'botany', 'photosynthesis', 'mitoch...",13760.0,835.0,,51.0,780.0,...,True,Photosynthesis and Cellular Respiration,Botany,5.0,40,False,regular course,15.0,2023-06-22,16:00:21
1,1,pqosCR6J2qo,CrashCourse,"Plants—they’re just like us! Well, not exactly, but they do have skin and hair like us…even if t...","['vlogbrothers', 'Crash Course', 'crashcourse', 'education', 'botany', 'photosynthesis', 'plant ...",32496.0,1570.0,,52.0,764.0,...,True,What Do These Creepy Plant Mouths Do? (Plant Tissues),Botany,4.0,54,False,regular course,7.0,2023-06-15,16:00:38


### Include time since published 

In [238]:
cc_bycourse_df["days_published_for"] = cc_bycourse_df["publishedDate"].apply(lambda x: (datetime.today().date() - x).days)
cc_bycourse_df["days_published_for"]

0          3
1         10
2         12
3         19
4         24
        ... 
1470    4157
1471    4161
1472    4164
1473    4168
1474    4223
Name: days_published_for, Length: 1475, dtype: int64

### reorder columns

In [239]:
cc_bycourse_df.columns

Index(['Unnamed: 0', 'video_id', 'channelTitle', 'description', 'tags',
       'viewCount', 'likeCount', 'favouriteCount', 'commentCount', 'duration',
       'definition', 'caption', 'episode_title', 'course', 'episode_num',
       'episode_title_len', 'with_Arizona', 'video_type', 'tag_Count',
       'publishedDate', 'publishedTime', 'days_published_for'],
      dtype='object')

In [241]:
df_final = cc_bycourse_df[['video_id', 'channelTitle', 'course', "video_type", 'episode_title','episode_num', 'episode_title_len', 'tag_Count', 'publishedDate', 'publishedTime', 'days_published_for', 'viewCount',
       'likeCount', 'favouriteCount', 'commentCount', 'duration', 'definition',
       'caption', "with_Arizona"]]
df_final

Unnamed: 0,video_id,channelTitle,course,video_type,episode_title,episode_num,episode_title_len,tag_Count,publishedDate,publishedTime,days_published_for,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption,with_Arizona
0,WzOrF5W4l3Q,CrashCourse,Botany,regular course,Photosynthesis and Cellular Respiration,5.0,40,15.0,2023-06-22,16:00:21,3,13760.0,835.0,,51.0,780.0,hd,True,False
1,pqosCR6J2qo,CrashCourse,Botany,regular course,What Do These Creepy Plant Mouths Do? (Plant Tissues),4.0,54,7.0,2023-06-15,16:00:38,10,32496.0,1570.0,,52.0,764.0,hd,True,False
2,xOLcZMw0hd4,CrashCourse,Biology,regular course,The Scientific Method,2.0,22,11.0,2023-06-13,16:00:44,12,30368.0,1519.0,,87.0,909.0,hd,True,False
3,tZE_fQFK8EY,CrashCourse,Biology,regular course,Introduction to Biology,1.0,24,9.0,2023-06-06,16:00:39,19,70777.0,3775.0,,266.0,807.0,hd,True,False
4,y9BLCfcUcFg,CrashCourse,Botany,regular course,Plant Cells & Hormones,3.0,23,22.0,2023-06-01,16:03:13,24,56483.0,3462.0,,135.0,778.0,hd,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1470,HVT3Y3_gHGg,CrashCourse,Biology,regular course,Water - Liquid Awesome,2.0,23,26.0,2012-02-06,22:10:10,4157,4840139.0,51024.0,,3825.0,677.0,hd,True,False
1471,n7ndRwqJYDM,CrashCourse,World History,regular course,Indus Valley Civilization,2.0,26,18.0,2012-02-02,20:18:11,4161,7917297.0,85507.0,,8047.0,575.0,hd,True,False
1472,QnQe0xW_JY4,CrashCourse,Biology,regular course,Carbon... SO SIMPLE,1.0,20,18.0,2012-01-30,18:53:06,4164,7827237.0,78209.0,,7809.0,717.0,hd,True,False
1473,Yocja_N5s1I,CrashCourse,World History,regular course,The Agricultural Revolution,1.0,28,27.0,2012-01-26,20:12:01,4168,15378590.0,152936.0,,12247.0,671.0,hd,True,False


In [244]:
df_final[df_final["course"]=="Business - Soft Skills"]

Unnamed: 0,video_id,channelTitle,course,video_type,episode_title,episode_num,episode_title_len,tag_Count,publishedDate,publishedTime,days_published_for,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption,with_Arizona
335,DJq9zVLu5_Q,CrashCourse,Business - Soft Skills,regular course,How to Avoid Burnout,17.0,21,12.0,2019-07-03,21:13:34,1453,165333.0,5324.0,,186.0,640.0,hd,True,False
337,NL4_VoX7DuQ,CrashCourse,Business - Soft Skills,regular course,The Many Forms of Power,16.0,24,13.0,2019-06-26,21:39:05,1460,92058.0,2653.0,,114.0,694.0,hd,True,False
339,59sCMUnvNdA,CrashCourse,Business - Soft Skills,regular course,How to Create a Fair Workplace,15.0,31,6.0,2019-06-19,21:17:39,1467,65476.0,1688.0,,105.0,642.0,hd,True,False
340,bSRvDfD451I,CrashCourse,Business - Soft Skills,regular course,How to Find Your Leadership Style,14.0,34,12.0,2019-06-13,20:30:00,1473,136103.0,3482.0,,104.0,611.0,hd,True,False
342,gOHoSuDEO4M,CrashCourse,Business - Soft Skills,regular course,How to Handle Conflict,13.0,23,15.0,2019-06-05,20:30:00,1481,111186.0,2951.0,,92.0,696.0,hd,True,False
344,fRj7Am63wVc,CrashCourse,Business - Soft Skills,regular course,How to Avoid Teamwork Disasters,12.0,32,13.0,2019-05-29,20:30:01,1488,93074.0,2432.0,,92.0,636.0,hd,True,False
346,vLxjUNINfnY,CrashCourse,Business - Soft Skills,regular course,How to Make Tough Decisions,11.0,28,13.0,2019-05-22,21:14:31,1495,114115.0,3267.0,,122.0,688.0,hd,True,False
348,2Si7ah_h32s,CrashCourse,Business - Soft Skills,regular course,Making Time Management Work for You,10.0,36,15.0,2019-05-15,21:00:01,1502,245625.0,6916.0,,153.0,660.0,hd,True,False
350,LQ5Uj1nryBc,CrashCourse,Business - Soft Skills,regular course,How to Set and Achieve SMART Goals,9.0,35,12.0,2019-05-08,20:49:06,1509,165330.0,3801.0,,91.0,653.0,hd,True,False
353,FTrxX0b4I4o,CrashCourse,Business - Soft Skills,regular course,How to Become a Better Negotiator,8.0,34,13.0,2019-05-01,23:06:57,1516,112084.0,3268.0,,126.0,693.0,hd,True,False


# Load as CSV

In [196]:
df_final.to_csv("data/crashcourse_bycourse_cleaned.csv")