In [1]:
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd

## Khan Academy

In [21]:
# Base of the Khan Academy v1 topic endpoint; a topic slug is appended to it.
base_url = "https://khanacademy.org/api/v1/topic/"
# Slug of the top-level topic that is requested first.
slug = "root"

In [22]:
# Fetch the document for the topic named by `slug`.
r = requests.get(base_url + slug)
# Stop on an HTTP error here rather than failing later with an opaque parse error.
r.raise_for_status()
# The HTML parser makes the response body reachable as paragraph text (soup.p.text).
soup = BeautifulSoup(r.text,'lxml')

In [24]:
# Decode the JSON payload straight from the response: routing it through an
# HTML parser can alter any text that looks like markup or entities.
children = r.json()['children']
# Slug of every child topic listed in the payload.
topics = [child['node_slug'] for child in children]

In [25]:
# Display the collected topic slugs.
topics

['math',
 'science',
 'economics-finance-domain',
 'humanities',
 'computing',
 'test-prep',
 'educator-test',
 'partner-content',
 'talks-and-interviews',
 'college-careers-more',
 'talent-search',
 'resources',
 'mappers']

In [26]:
# Keep only the leading root topics (math through computing in the recorded output).
MAIN_TOPIC_COUNT = 5
topics = topics[:MAIN_TOPIC_COUNT]

In [45]:
# Map each selected topic slug to the slugs of its child topics.
subtopics = {}
for topic_slug in topics:  # own name, so the module-level `slug` is not rebound
    r = requests.get(base_url + topic_slug)
    r.raise_for_status()  # surface HTTP failures instead of a confusing parse error
    # Read the JSON body directly rather than through an HTML parser, which
    # can alter text that looks like markup.
    info = r.json()['children']
    subtopics[topic_slug] = [sub_topic['node_slug'] for sub_topic in info]

In [46]:
# Display the topic -> child-slug mapping.
subtopics

{'computing': ['computer-programming', 'computer-science', 'hour-of-code'],
 'economics-finance-domain': ['ap-macroeconomics',
  'macroeconomics',
  'ap-microeconomics',
  'microeconomics',
  'core-finance'],
 'humanities': ['us-history',
  'world-history',
  'ap-world-history',
  'ap-us-government-and-politics',
  'art-history',
  'grammar',
  'music',
  'special-topics-art-history',
  'arts-humanities-partners'],
 'math': ['early-math',
  'cc-kindergarten-math',
  'cc-1st-grade-math',
  'cc-2nd-grade-math',
  'cc-third-grade-math',
  'cc-fourth-grade-math',
  'cc-fifth-grade-math',
  'cc-sixth-grade-math',
  'cc-seventh-grade-math',
  'cc-eighth-grade-math',
  'arithmetic',
  'basic-geo',
  'pre-algebra',
  'algebra-basics',
  'algebra',
  'geometry',
  'algebra2',
  'trigonometry',
  'statistics-probability',
  'probability',
  'ap-statistics',
  'precalculus',
  'differential-calculus',
  'integral-calculus',
  'ap-calculus-ab',
  'ap-calculus-bc',
  'multivariable-calculus',
  'di

#### That's a lot of extras, so let's cut them down - Khan Academy is aimed more at people still in primary school

In [67]:
# Drop the 'hour-of-code' entry by name. Slicing off the last element instead
# would remove one more subtopic every time this cell is re-run.
subtopics['computing'] = [a for a in subtopics['computing'] if a != 'hour-of-code']

In [69]:
# Keep only the economics/finance subtopics whose slug does not contain 'ap-'.
subtopics['economics-finance-domain'] = list(
    filter(lambda name: 'ap-' not in name, subtopics['economics-finance-domain'])
)

In [70]:
# Discard humanities subtopics whose slug contains any of these markers.
excluded_markers = ('ap-', 'partners', 'special')
subtopics['humanities'] = [
    name for name in subtopics['humanities']
    if not any(marker in name for marker in excluded_markers)
]

In [71]:
# Replace the fetched math subtopics with a hand-picked list (no filtering of the fetched values).
subtopics['math'] = ['statistics-probability', 'probability', 'multivariable-calculus','differential-equations','linear-algebra']

In [72]:
# Drop the final science subtopic.
# NOTE: the cut is positional, so each re-run of this cell removes one more entry.
subtopics['science'] = list(subtopics['science'][:-1])

In [73]:
# Display the mapping after the manual trimming.
subtopics

{'computing': ['computer-programming', 'computer-science'],
 'economics-finance-domain': ['macroeconomics',
  'microeconomics',
  'core-finance'],
 'humanities': ['us-history',
  'world-history',
  'art-history',
  'grammar',
  'music'],
 'math': ['statistics-probability',
  'probability',
  'multivariable-calculus',
  'differential-equations',
  'linear-algebra'],
 'science': ['physics',
  'chemistry',
  'organic-chemistry',
  'biology',
  'high-school-biology',
  'cosmology-and-astronomy',
  'electrical-engineering']}

#### Now let's go through all their possible classes

In [74]:
# One empty list per output column; rows are added by appending to every list in step.
ka_columns = (
    "Course",
    "Description",
    "Image",
    "Instructor",
    "Link",
    "Price",
    "Subject",
    "Provider",
)
ka_dict = {column: [] for column in ka_columns}

In [75]:
# Exploratory one-off request for a single subject; the scraping loop that
# follows issues its own requests and does not use the values bound here.
course_url = "".join([base_url, subtopics['computing'][1]])
r = requests.get(url=course_url)
soup = BeautifulSoup(r.text, features='lxml')

In [76]:
# Walk every kept subject and record one row per child entry it lists.
for topic in subtopics:
    for subject in subtopics[topic]:
        course_url = base_url + subject
        r = requests.get(course_url)
        r.raise_for_status()  # fail on the HTTP error itself, not on a later parse error
        # Decode the JSON body directly; passing it through an HTML parser can
        # alter titles and descriptions that contain markup-like text.
        children = r.json()['children']
        for course in children:
            ka_dict["Course"].append(course['title'])
            ka_dict["Description"].append(course['description'])
            ka_dict["Image"].append(course['icon'])
            # Instructor, price and provider are constants, not read from the response.
            ka_dict["Instructor"].append("None")
            ka_dict["Link"].append(course['url'])
            ka_dict["Price"].append("Free")
            # Human-readable subject label: the slug with hyphens turned into spaces.
            ka_dict["Subject"].append(subject.replace("-", " "))
            ka_dict["Provider"].append("Khan Academy")

In [77]:
# Turn the column lists into a table, one column per dict key.
ka_df = pd.DataFrame(ka_dict)

In [90]:
# Keep the first row for each course title; later rows with the same title
# are discarded, whichever subject they belong to.
ka_df = ka_df.drop_duplicates(subset=['Course'], keep='first')

In [91]:
# Write the course table to disk; the row index goes out as the first column.
ka_df.to_csv("khan_courses.csv", index=True)

In [92]:
# Number of distinct course titles (a missing title would count as one value).
ka_df["Course"].nunique(dropna=False)

215

#### Let's extract the subject information now

In [112]:
# Distinct subject labels present in ka_df, in order of first appearance.
real_subjects = ka_df["Subject"].unique()

In [114]:
# One empty list per column of the per-subject table.
subject_info = {field: [] for field in ("Subject", "Provider", "Image", "Courses")}

In [115]:
# Build one summary row per subject.
for subject in real_subjects:
    subject_rows = ka_df[ka_df.Subject == subject]  # filter once, reuse below
    subject_info["Subject"].append(subject)
    subject_info["Provider"].append("Khan Academy")
    # All course titles of the subject, joined with ';'.
    subject_info["Courses"].append(';'.join(subject_rows.Course))
    # Most frequently occurring image among the subject's rows.
    subject_info["Image"].append(subject_rows.mode().Image[0])

In [117]:
# Turn the per-subject column lists into a table.
ka_subject_df = pd.DataFrame(subject_info)

In [119]:
# Fix the column order of the per-subject table.
ka_subject_columns = ["Subject", "Provider", "Image", "Courses"]
ka_subject_df = ka_subject_df.loc[:, ka_subject_columns]

In [120]:
# Display the per-subject table.
ka_subject_df

Unnamed: 0,Subject,Provider,Image,Courses
0,statistics probability,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,Analyzing categorical data;Displaying and comp...
1,probability,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,Scatterplots;Data distributions;Two-way tables...
2,multivariable calculus,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,Thinking about multivariable functions;Derivat...
3,differential equations,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,First order differential equations;Second orde...
4,linear algebra,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,Vectors and spaces;Matrix transformations;Alte...
5,physics,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,One-dimensional motion;Two-dimensional motion;...
6,chemistry,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,"Atoms, compounds, and ions;Chemical reactions ..."
7,organic chemistry,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,Structure and bonding;Resonance and acid-base ...
8,biology,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,"Intro to biology;Chemistry of life;Water, acid..."
9,high school biology,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,Biology foundations;Cells;Cellular transport;E...


In [185]:
# Write the per-subject table to disk; the row index goes out as the first column.
ka_subject_df.to_csv("ka_subjects.csv", index=True)

### Wait, let's double-check the Udemy data

In [123]:
# Load the previously saved Udemy course table; the file is read as ISO-8859-1.
df_udemy = pd.read_csv(filepath_or_buffer="udemy_courses.csv", encoding="ISO-8859-1")

In [125]:
# Number of rows in the Udemy table.
df_udemy.shape[0]

2557

In [126]:
# Row count after dropping repeated course titles; equal to the full row
# count when no title occurs twice.
df_udemy.drop_duplicates(subset=["Course"]).shape[0]

2557

#### Ok good, let's extract its subject data as well

In [177]:
# Lookup table, empty for now.
u_subjects = dict()
# One empty list per column of the Udemy per-subject table.
udemy_subjects = {field: [] for field in ("Subject", "Provider", "Image", "Courses")}

In [178]:
# Display the lookup table.
u_subjects

{}

In [179]:
# Record, for each subcategory, the category of the first row it appears in.
for index, row in df_udemy.iterrows():
    u_subjects.setdefault(row["PrimarySubcategory"], row["PrimaryCategory"])

In [180]:
# Build the per-subject rows. The "Image" list is not touched by this cell.
for subject in u_subjects:
    udemy_subjects["Subject"].append(subject)
    udemy_subjects["Provider"].append("Udemy")
    # Join the titles from the Course column. The subcategory column was being
    # joined here before, which only repeated the subject name once per course.
    in_subject = df_udemy.PrimarySubcategory == subject
    # Missing titles are skipped so that join never receives a non-string NaN.
    udemy_subjects["Courses"].append(';'.join(df_udemy.loc[in_subject, "Course"].dropna()))

In [181]:
# Sanity check: print how many entries each column list holds.
for key, values in udemy_subjects.items():
    print(len(values))

118
118
0
118


In [182]:
# just need to extract images from website
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

for subject in udemy_subjects["Subject"]:
    primary = u_subjects[subject].replace("&","and").replace(" ","-")
    subject = subject.replace("&","and").replace(" ","-")
    url = "https://www.udemy.com/courses/" + primary + "/" + subject + "/"
    # TEMPORARY
    udemy_subjects["Image"].append(url)
    

In [183]:
# Sanity check: every column list should now hold the same number of entries.
for key, values in udemy_subjects.items():
    print(len(values))

118
118
118
118


In [186]:
# Turn the Udemy per-subject column lists into a table.
df_udemy_subjects = pd.DataFrame(udemy_subjects)

In [188]:
# Fix the column order of the Udemy per-subject table.
udemy_subject_columns = ["Subject", "Provider", "Image", "Courses"]
df_udemy_subjects = df_udemy_subjects.loc[:, udemy_subject_columns]

In [None]:
df_udemy_subjects = df_udemy_subjects.to