In [1]:
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd

## KhanAcademy

In [21]:
base_url = "https://khanacademy.org/api/v1/topic/"
slug = "root"

In [22]:
r = requests.get(base_url + slug)
soup = BeautifulSoup(r.text,'lxml')

In [24]:
children = json.loads(soup.p.text)['children']
topics = []
for child in children:
     topics.append(child['node_slug'])

In [25]:
topics

['math',
 'science',
 'economics-finance-domain',
 'humanities',
 'computing',
 'test-prep',
 'educator-test',
 'partner-content',
 'talks-and-interviews',
 'college-careers-more',
 'talent-search',
 'resources',
 'mappers']

In [26]:
topics = topics[:5]

In [45]:
subtopics = {slug: [] for slug in topics}
for slug in topics:
    r = requests.get(base_url + slug)
    soup = BeautifulSoup(r.text,'lxml')
    info = json.loads(soup.p.text)['children']
    for sub_slug in info:
        subtopics[slug].append(sub_slug['node_slug'])

In [46]:
subtopics

{'computing': ['computer-programming', 'computer-science', 'hour-of-code'],
 'economics-finance-domain': ['ap-macroeconomics',
  'macroeconomics',
  'ap-microeconomics',
  'microeconomics',
  'core-finance'],
 'humanities': ['us-history',
  'world-history',
  'ap-world-history',
  'ap-us-government-and-politics',
  'art-history',
  'grammar',
  'music',
  'special-topics-art-history',
  'arts-humanities-partners'],
 'math': ['early-math',
  'cc-kindergarten-math',
  'cc-1st-grade-math',
  'cc-2nd-grade-math',
  'cc-third-grade-math',
  'cc-fourth-grade-math',
  'cc-fifth-grade-math',
  'cc-sixth-grade-math',
  'cc-seventh-grade-math',
  'cc-eighth-grade-math',
  'arithmetic',
  'basic-geo',
  'pre-algebra',
  'algebra-basics',
  'algebra',
  'geometry',
  'algebra2',
  'trigonometry',
  'statistics-probability',
  'probability',
  'ap-statistics',
  'precalculus',
  'differential-calculus',
  'integral-calculus',
  'ap-calculus-ab',
  'ap-calculus-bc',
  'multivariable-calculus',
  'di

#### A lot of extras, so let's cut down - KhanAcademy is more for people still in primary school

In [67]:
subtopics['computing'] = [a for a in subtopics['computing'][:-1]]

In [69]:
subtopics['economics-finance-domain'] = [a for a in subtopics['economics-finance-domain'] if 'ap-' not in a]

In [70]:
subtopics['humanities'] = [a for a in subtopics['humanities'] if 'ap-' not in a and 'partners' not in a and 'special' not in a]

In [71]:
subtopics['math'] = ['statistics-probability', 'probability', 'multivariable-calculus','differential-equations','linear-algebra']

In [72]:
subtopics['science'] = [a for a in subtopics['science'][:-1]]

In [73]:
subtopics

{'computing': ['computer-programming', 'computer-science'],
 'economics-finance-domain': ['macroeconomics',
  'microeconomics',
  'core-finance'],
 'humanities': ['us-history',
  'world-history',
  'art-history',
  'grammar',
  'music'],
 'math': ['statistics-probability',
  'probability',
  'multivariable-calculus',
  'differential-equations',
  'linear-algebra'],
 'science': ['physics',
  'chemistry',
  'organic-chemistry',
  'biology',
  'high-school-biology',
  'cosmology-and-astronomy',
  'electrical-engineering']}

#### Now let's go through all their possible classes

In [74]:
ka_dict = {
    "Course": [],
    "Description": [],
    "Image": [],
    "Instructor": [],
    "Link": [],
    "Price": [],
    "Subject": [],
    "Provider": []
}

In [75]:
course_url = base_url + subtopics['computing'][1]
r = requests.get(course_url)
soup = BeautifulSoup(r.text,'lxml')

In [76]:
for topic in subtopics:
    for subject in subtopics[topic]:
        course_url = base_url + subject
        r = requests.get(course_url)
        soup = BeautifulSoup(r.text,'lxml')
        children = json.loads(soup.p.text)['children']
        for course in children:
            ka_dict["Course"].append(course['title'])
            ka_dict["Description"].append(course['description'])
            ka_dict["Image"].append(course['icon'])
            ka_dict["Instructor"].append("None")
            ka_dict["Link"].append(course['url'])
            ka_dict["Price"].append("Free")
            ka_dict["Subject"].append(subject.replace("-", " "))
            ka_dict["Provider"].append("Khan Academy")

In [77]:
ka_df = pd.DataFrame.from_dict(ka_dict)

In [90]:
ka_df = ka_df.drop_duplicates(subset = ['Course'])

In [91]:
ka_df.to_csv("khan_courses.csv")

In [92]:
len(ka_df.Course.unique())

215

#### Let's extract the subject information now

In [329]:
real_subjects = ka_df.Subject.unique()

In [330]:
subject_info = {
    "Subject": [],
    "Provider": [],
    "Image": [],
    "Courses": []
}

In [331]:
for subject in real_subjects:
    subject_info["Subject"].append(subject)
    subject_info["Provider"].append("Khan Academy")
    subject_info["Courses"].append(','.join([str(course) for course in ka_df[ka_df.Subject == subject].Course.index])) # collect all courses
    subject_info["Image"].append(ka_df[ka_df.Subject == subject].mode().Image[0]) # get most occuring

In [332]:
ka_subject_df = pd.DataFrame.from_dict(subject_info)

In [333]:
ka_subject_df = ka_subject_df[["Subject","Provider","Image","Courses"]]

In [334]:
ka_subject_df

Unnamed: 0,Subject,Provider,Image,Courses
0,Statistics Probability,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,0123456789101112131415
1,Probability,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,1617182122
2,Multivariable Calculus,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,2324252627
3,Differential Equations,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,282930
4,Linear Algebra,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,313233
5,Physics,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,"34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,4..."
6,Chemistry,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,"54,55,56,57,58,59,60,61,62,63,65,66,67,68,69,7..."
7,Organic Chemistry,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,72737576777879808182838485
8,Biology,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,"86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,..."
9,High School Biology,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,116117118119120121


In [335]:
ka_subject_df.to_csv("ka_subjects.csv")

### Wait, let's double check Udemy Data

In [275]:
df_udemy = pd.read_csv("udemy_courses.csv",encoding = "ISO-8859-1")

In [125]:
len(df_udemy)

2557

In [126]:
len(df_udemy.drop_duplicates(subset = ["Course"]))

2557

#### Ok good, let's extract its subject data as well

In [316]:
u_subjects = {}
udemy_subjects = {
    "Subject": [],
    "Provider": [],
    "Image": [],
    "Courses": []
}

In [317]:
u_subjects

{}

In [318]:
for index, row in df_udemy.iterrows():
    if row["PrimarySubcategory"] not in u_subjects:
        u_subjects[row["PrimarySubcategory"]] = row["PrimaryCategory"]

In [319]:
for subject in u_subjects:
    udemy_subjects["Subject"].append(subject)
    udemy_subjects["Provider"].append("Udemy")
    udemy_subjects["Courses"].append(','.join([str(course) for course in df_udemy[df_udemy.PrimarySubcategory == subject].Course.index])) # collect all courses

In [320]:
for key in udemy_subjects:
    print(len(udemy_subjects[key]))

118
118
0
118


In [321]:
# just need to extract images from website
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

for subject in udemy_subjects["Subject"]:
    primary = u_subjects[subject].replace("&","and").replace(" ","-")
    subject = subject.replace("&","and").replace(" ","-")
    url = "https://www.udemy.com/courses/" + primary + "/" + subject + "/"
    # TEMPORARY
    udemy_subjects["Image"].append(url)
    

In [322]:
for key in udemy_subjects:
    print(len(udemy_subjects[key]))

118
118
118
118


In [323]:
df_udemy_subjects = pd.DataFrame.from_dict(udemy_subjects)

In [324]:
df_udemy_subjects = df_udemy_subjects[["Subject","Provider","Image","Courses"]]

In [336]:
df_udemy_subjects.to_csv("udemy_subjects_INCOMPLETE.csv")

## TIME TO MERGE

In [337]:
df_udemy_subjects.head()

Unnamed: 0,Subject,Provider,Image,Courses
0,Web Development,Udemy,https://www.udemy.com/courses/Development/Web-...,"0,1,2,4,9,14,15,18,22,25,26,44,48,50,54,57,58,..."
1,Other,Udemy,https://www.udemy.com/courses/IT-and-Software/...,"3,102,151,186,241,273,310,334,381,383,446,464,..."
2,Programming Languages,Udemy,https://www.udemy.com/courses/Development/Prog...,"5,6,16,19,45,79,94,127,157,172,180,182,194,196..."
3,Personal Transformation,Udemy,https://www.udemy.com/courses/Personal-Develop...,"7,10,73,106,152,187,289,292,295,326,357,360,39..."
4,Personal Finance,Udemy,https://www.udemy.com/courses/Personal-Develop...,"8,148,269,564,586,611,721,874,1232,1277,1285,1..."


In [338]:
ka_subject_df['Subject'] = ka_subject_df['Subject'].apply(lambda s: " ".join(w.capitalize() for w in s.split()))

In [339]:
ka_subject_df.head()

Unnamed: 0,Subject,Provider,Image,Courses
0,Statistics Probability,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,0123456789101112131415
1,Probability,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,1617182122
2,Multivariable Calculus,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,2324252627
3,Differential Equations,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,282930
4,Linear Algebra,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,313233


In [340]:
df_subjects = pd.concat([ka_subject_df,df_udemy_subjects])

In [341]:
df_subjects

Unnamed: 0,Subject,Provider,Image,Courses
0,Statistics Probability,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,0123456789101112131415
1,Probability,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,1617182122
2,Multivariable Calculus,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,2324252627
3,Differential Equations,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,282930
4,Linear Algebra,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,313233
5,Physics,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,"34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,4..."
6,Chemistry,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,"54,55,56,57,58,59,60,61,62,63,65,66,67,68,69,7..."
7,Organic Chemistry,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,72737576777879808182838485
8,Biology,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,"86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,..."
9,High School Biology,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,116117118119120121


In [342]:
df_subjects.to_csv("subjects.csv")

## MORE MERGING

In [243]:
df_udemy["Subject"] = df_udemy["PrimarySubcategory"]

In [250]:
good = df_udemy.columns[1:]
good2 = [df_udemy.columns[-1]]

In [251]:
good2

['Subject']

In [252]:
good = list(good[:6]) + good2

In [254]:
df_udemy = df_udemy[good]

In [255]:
df_udemy.head()

Unnamed: 0,Course,Description,Image,Instructor,Link,Price,Subject
0,JavaScript: Understanding the Weird Parts,An advanced JavaScript course for everyone! Sc...,https://udemy-images.udemy.com/course/125_H/36...,Anthony Alicea,/understand-javascript/,$174.99,Web Development
1,Learn and Understand AngularJS,Master AngularJS and the Javascript concepts b...,https://udemy-images.udemy.com/course/125_H/28...,Anthony Alicea,/learn-angularjs/,$174.99,Web Development
2,Webpack 2: The Complete Developer's Guide,Master Webpack 2 as you deploy web apps suppor...,https://udemy-images.udemy.com/course/125_H/10...,Stephen Grider,/webpack-2-the-complete-developers-guide/,$74.99,Web Development
3,Blockchain and Bitcoin Fundamentals,Learn the key elements of blockchain and Bitco...,https://udemy-images.udemy.com/course/125_H/11...,George Levy,/blockchain-and-bitcoin-fundamentals/,$94.99,Other
4,Build Responsive Real World Websites with HTML...,"The easiest way to learn modern web design, HT...",https://udemy-images.udemy.com/course/125_H/43...,Jonas Schmedtmann,/design-and-develop-a-killer-website-with-html...,$199.99,Web Development


In [256]:
df_udemy["Provider"] = df_udemy["Course"].apply(lambda x: "Udemy") # lazy way to insert column

In [257]:
df_udemy.head()

Unnamed: 0,Course,Description,Image,Instructor,Link,Price,Subject,Provider
0,JavaScript: Understanding the Weird Parts,An advanced JavaScript course for everyone! Sc...,https://udemy-images.udemy.com/course/125_H/36...,Anthony Alicea,/understand-javascript/,$174.99,Web Development,Udemy
1,Learn and Understand AngularJS,Master AngularJS and the Javascript concepts b...,https://udemy-images.udemy.com/course/125_H/28...,Anthony Alicea,/learn-angularjs/,$174.99,Web Development,Udemy
2,Webpack 2: The Complete Developer's Guide,Master Webpack 2 as you deploy web apps suppor...,https://udemy-images.udemy.com/course/125_H/10...,Stephen Grider,/webpack-2-the-complete-developers-guide/,$74.99,Web Development,Udemy
3,Blockchain and Bitcoin Fundamentals,Learn the key elements of blockchain and Bitco...,https://udemy-images.udemy.com/course/125_H/11...,George Levy,/blockchain-and-bitcoin-fundamentals/,$94.99,Other,Udemy
4,Build Responsive Real World Websites with HTML...,"The easiest way to learn modern web design, HT...",https://udemy-images.udemy.com/course/125_H/43...,Jonas Schmedtmann,/design-and-develop-a-killer-website-with-html...,$199.99,Web Development,Udemy


In [258]:
ka_df = ka_df[good]

In [267]:
ka_df['Subject'] = ka_df['Subject'].apply(lambda s: " ".join(w.capitalize() for w in s.split()))

In [261]:
ka_df["Provider"] = ka_df["Course"].apply(lambda x: "Khan Academy")

In [268]:
ka_df.head()

Unnamed: 0,Course,Description,Image,Instructor,Link,Price,Subject,Provider
0,Analyzing categorical data,This unit covers methods for dealing with data...,https://cdn.kastatic.org/genfiles/topic-icons/...,,https://www.khanacademy.org/math/statistics-pr...,Free,Statistics Probability,Khan Academy
1,Displaying and comparing quantitative data,This unit covers some basic methods for graphi...,https://cdn.kastatic.org/genfiles/topic-icons/...,,https://www.khanacademy.org/math/statistics-pr...,Free,Statistics Probability,Khan Academy
2,Summarizing quantitative data,This unit covers common measures of center lik...,https://cdn.kastatic.org/genfiles/topic-icons/...,,https://www.khanacademy.org/math/statistics-pr...,Free,Statistics Probability,Khan Academy
3,Modeling data distributions,This unit takes our understanding of distribut...,https://cdn.kastatic.org/genfiles/topic-icons/...,,https://www.khanacademy.org/math/statistics-pr...,Free,Statistics Probability,Khan Academy
4,Exploring bivariate numerical data,We use scatter plots to explore the relationsh...,https://cdn.kastatic.org/genfiles/topic-icons/...,,https://www.khanacademy.org/math/statistics-pr...,Free,Statistics Probability,Khan Academy


In [269]:
df_courses = pd.concat([ka_df,df_udemy])

In [270]:
df_courses.to_csv("courses.csv")

In [314]:
df_courses[df_courses.Course.str.contains("golang")].Course[275].encode("utf8").decode("utf8").encode("ISO-8859-1").decode("ISO-8859-1")

'Web Development w/ Google\x92s Go (golang) Programming Language'