In [2]:
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd

## KhanAcademy

In [21]:
base_url = "https://khanacademy.org/api/v1/topic/"
slug = "root"

In [22]:
r = requests.get(base_url + slug)
soup = BeautifulSoup(r.text,'lxml')

In [24]:
children = json.loads(soup.p.text)['children']
topics = []
for child in children:
     topics.append(child['node_slug'])

In [25]:
topics

['math',
 'science',
 'economics-finance-domain',
 'humanities',
 'computing',
 'test-prep',
 'educator-test',
 'partner-content',
 'talks-and-interviews',
 'college-careers-more',
 'talent-search',
 'resources',
 'mappers']

In [26]:
topics = topics[:5]

In [45]:
subtopics = {slug: [] for slug in topics}
for slug in topics:
    r = requests.get(base_url + slug)
    soup = BeautifulSoup(r.text,'lxml')
    info = json.loads(soup.p.text)['children']
    for sub_slug in info:
        subtopics[slug].append(sub_slug['node_slug'])

In [46]:
subtopics

{'computing': ['computer-programming', 'computer-science', 'hour-of-code'],
 'economics-finance-domain': ['ap-macroeconomics',
  'macroeconomics',
  'ap-microeconomics',
  'microeconomics',
  'core-finance'],
 'humanities': ['us-history',
  'world-history',
  'ap-world-history',
  'ap-us-government-and-politics',
  'art-history',
  'grammar',
  'music',
  'special-topics-art-history',
  'arts-humanities-partners'],
 'math': ['early-math',
  'cc-kindergarten-math',
  'cc-1st-grade-math',
  'cc-2nd-grade-math',
  'cc-third-grade-math',
  'cc-fourth-grade-math',
  'cc-fifth-grade-math',
  'cc-sixth-grade-math',
  'cc-seventh-grade-math',
  'cc-eighth-grade-math',
  'arithmetic',
  'basic-geo',
  'pre-algebra',
  'algebra-basics',
  'algebra',
  'geometry',
  'algebra2',
  'trigonometry',
  'statistics-probability',
  'probability',
  'ap-statistics',
  'precalculus',
  'differential-calculus',
  'integral-calculus',
  'ap-calculus-ab',
  'ap-calculus-bc',
  'multivariable-calculus',
  'di

#### A lot of extras, so let's cut down - KhanAcademy is more for people still in primary school

In [67]:
subtopics['computing'] = [a for a in subtopics['computing'][:-1]]

In [69]:
subtopics['economics-finance-domain'] = [a for a in subtopics['economics-finance-domain'] if 'ap-' not in a]

In [70]:
subtopics['humanities'] = [a for a in subtopics['humanities'] if 'ap-' not in a and 'partners' not in a and 'special' not in a]

In [71]:
subtopics['math'] = ['statistics-probability', 'probability', 'multivariable-calculus','differential-equations','linear-algebra']

In [72]:
subtopics['science'] = [a for a in subtopics['science'][:-1]]

In [73]:
subtopics

{'computing': ['computer-programming', 'computer-science'],
 'economics-finance-domain': ['macroeconomics',
  'microeconomics',
  'core-finance'],
 'humanities': ['us-history',
  'world-history',
  'art-history',
  'grammar',
  'music'],
 'math': ['statistics-probability',
  'probability',
  'multivariable-calculus',
  'differential-equations',
  'linear-algebra'],
 'science': ['physics',
  'chemistry',
  'organic-chemistry',
  'biology',
  'high-school-biology',
  'cosmology-and-astronomy',
  'electrical-engineering']}

#### Now let's go through all their possible classes

In [74]:
ka_dict = {
    "Course": [],
    "Description": [],
    "Image": [],
    "Instructor": [],
    "Link": [],
    "Price": [],
    "Subject": [],
    "Provider": []
}

In [75]:
course_url = base_url + subtopics['computing'][1]
r = requests.get(course_url)
soup = BeautifulSoup(r.text,'lxml')

In [76]:
for topic in subtopics:
    for subject in subtopics[topic]:
        course_url = base_url + subject
        r = requests.get(course_url)
        soup = BeautifulSoup(r.text,'lxml')
        children = json.loads(soup.p.text)['children']
        for course in children:
            ka_dict["Course"].append(course['title'])
            ka_dict["Description"].append(course['description'])
            ka_dict["Image"].append(course['icon'])
            ka_dict["Instructor"].append("None")
            ka_dict["Link"].append(course['url'])
            ka_dict["Price"].append("Free")
            ka_dict["Subject"].append(subject.replace("-", " "))
            ka_dict["Provider"].append("Khan Academy")

In [77]:
ka_df = pd.DataFrame.from_dict(ka_dict)

In [90]:
ka_df = ka_df.drop_duplicates(subset = ['Course'])

In [91]:
ka_df.to_csv("khan_courses.csv")

In [17]:
ka_df = pd.read_csv("khan_courses.csv",encoding = "ISO-8859-1")

In [19]:
ka_df = ka_df[ka_df.columns[1:]]

In [92]:
len(ka_df.Course.unique())

215

#### Let's extract the subject information now

In [72]:
real_subjects = ka_df.Subject.unique()

In [73]:
subject_info = {
    "Subject": [],
    "Provider": [],
    "Image": [],
    "Courses": []
}

In [79]:
df_courses[df_courses.Subject == "Art History"].Course.index

Int64Index([172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184,
            185, 186, 187],
           dtype='int64')

In [80]:
for subject in real_subjects:
    subject_info["Subject"].append(subject)
    subject_info["Provider"].append("Khan Academy")
    subject_info["Courses"].append(','.join([str(course) for course in df_courses[df_courses.Subject == subject].Course.index])) # collect all courses
    subject_info["Image"].append(ka_df[ka_df.Subject == subject].mode().Image[0]) # get most occuring

In [81]:
ka_subject_df = pd.DataFrame.from_dict(subject_info)

In [83]:
ka_subject_df = ka_subject_df[["Subject","Provider","Image","Courses"]]

In [84]:
ka_subject_df.head()

Unnamed: 0,Subject,Provider,Image,Courses
0,Statistics Probability,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,0123456789101112131415
1,Probability,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,1617181920
2,Multivariable Calculus,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,2122232425
3,Differential Equations,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,262728
4,Linear Algebra,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,293031


In [85]:
ka_subject_df.to_csv("ka_subjects.csv")

### Wait, let's double check Udemy Data

In [56]:
df_udemy_r = pd.read_csv("udemy_courses.csv",encoding = "ISO-8859-1")

In [59]:
df_udemy_r[df_udemy_r.columns[1:]]

Unnamed: 0,Course,Description,Image,Instructor,Link,Price,PrimaryCategory,PrimaryCategoryID,PrimarySubcategory,PrimarySubcategoryID,Rating
0,JavaScript: Understanding the Weird Parts,An advanced JavaScript course for everyone! Sc...,https://udemy-images.udemy.com/course/125_H/36...,Anthony Alicea,/understand-javascript/,$174.99,Development,288,Web Development,8,4.77
1,Learn and Understand AngularJS,Master AngularJS and the Javascript concepts b...,https://udemy-images.udemy.com/course/125_H/28...,Anthony Alicea,/learn-angularjs/,$174.99,Development,288,Web Development,8,4.68
2,Webpack 2: The Complete Developer's Guide,Master Webpack 2 as you deploy web apps suppor...,https://udemy-images.udemy.com/course/125_H/10...,Stephen Grider,/webpack-2-the-complete-developers-guide/,$74.99,Development,288,Web Development,8,4.73
3,Blockchain and Bitcoin Fundamentals,Learn the key elements of blockchain and Bitco...,https://udemy-images.udemy.com/course/125_H/11...,George Levy,/blockchain-and-bitcoin-fundamentals/,$94.99,IT & Software,294,Other,140,4.71
4,Build Responsive Real World Websites with HTML...,"The easiest way to learn modern web design, HT...",https://udemy-images.udemy.com/course/125_H/43...,Jonas Schmedtmann,/design-and-develop-a-killer-website-with-html...,$199.99,Development,288,Web Development,8,4.69
5,C# Advanced Topics: Take Your C# Skills to the...,The advanced C# constructs that every C# coder...,https://udemy-images.udemy.com/course/125_H/35...,Mosh Hamedani,/csharp-advanced/,$149.99,Development,288,Programming Languages,12,4.61
6,"C# Intermediate: Classes, Interfaces and OOP","An in-depth, step-by-step guide to classes, in...",https://udemy-images.udemy.com/course/125_H/38...,Mosh Hamedani,/csharp-intermediate-classes-interfaces-and-oop/,$149.99,Development,288,Programming Languages,12,4.64
7,"Reiki Level I, II and Master/Teacher Program","Learn Reiki Levels 1, 2 and Master Level to be...",https://udemy-images.udemy.com/course/125_H/59...,Lisa Powers,/reikicourse/,$194.99,Personal Development,296,Personal Transformation,142,4.73
8,Cryptocurrency ICO Investing Course 2018: Iden...,Learn how to pick the most high-quality and ga...,https://udemy-images.udemy.com/course/125_H/14...,Suppoman ,/cryptocurrency-ico/,$199.99,Personal Development,296,Personal Finance,148,4.52
9,ES6 Javascript: The Complete Developer's Guide,ES6 Javascript Development from scratch. Get ...,https://udemy-images.udemy.com/course/125_H/86...,Stephen Grider,/javascript-es6-tutorial/,$79.99,Development,288,Web Development,8,4.69


In [126]:
len(df_udemy.drop_duplicates(subset = ["Course"]))

2557

#### Ok good, let's extract its subject data as well

In [60]:
u_subjects = {}
udemy_subjects = {
    "Subject": [],
    "Provider": [],
    "Image": [],
    "Courses": []
}

In [317]:
u_subjects

{}

In [13]:
df_subjects = pd.read_csv("subjects.csv",encoding = "ISO-8859-1")

In [14]:
df_subjects.head()

Unnamed: 0.1,Unnamed: 0,subject,provider,image,courses,jobs
0,0,Statistics Probability,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,0123456789101112131415,173.0
1,1,Probability,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,1617182122,101173247.0
2,2,Multivariable Calculus,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,2324252627,
3,3,Differential Equations,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,282930,
4,4,Linear Algebra,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,313233,


In [61]:
for index, row in df_udemy.iterrows():
    if row["Subject"] not in u_subjects:
        u_subjects[row["Subject"]] = row["Subject"]

In [55]:
df_udemy.head()

Unnamed: 0,Course,Description,Image,Instructor,Link,Price,Subject,Provider
0,JavaScript: Understanding the Weird Parts,An advanced JavaScript course for everyone! Sc...,https://udemy-images.udemy.com/course/125_H/36...,Anthony Alicea,/understand-javascript/,$174.99,Web Development,Udemy
1,Learn and Understand AngularJS,Master AngularJS and the Javascript concepts b...,https://udemy-images.udemy.com/course/125_H/28...,Anthony Alicea,/learn-angularjs/,$174.99,Web Development,Udemy
2,Webpack 2: The Complete Developer's Guide,Master Webpack 2 as you deploy web apps suppor...,https://udemy-images.udemy.com/course/125_H/10...,Stephen Grider,/webpack-2-the-complete-developers-guide/,$74.99,Web Development,Udemy
3,Blockchain and Bitcoin Fundamentals,Learn the key elements of blockchain and Bitco...,https://udemy-images.udemy.com/course/125_H/11...,George Levy,/blockchain-and-bitcoin-fundamentals/,$94.99,Other,Udemy
4,Build Responsive Real World Websites with HTML...,"The easiest way to learn modern web design, HT...",https://udemy-images.udemy.com/course/125_H/43...,Jonas Schmedtmann,/design-and-develop-a-killer-website-with-html...,$199.99,Web Development,Udemy


In [62]:
for subject in u_subjects:
    udemy_subjects["Subject"].append(subject)
    udemy_subjects["Provider"].append("Udemy")
    udemy_subjects["Image"].append(df_udemy_r.loc[df_udemy_r[df_udemy_r.PrimarySubcategory == subject].Rating.idxmax(),"Image"])
    udemy_subjects["Courses"].append(','.join([str(course) for course in df_courses[(df_courses.Subject == subject) & (df_courses.Provider=="Udemy")].Course.index])) # collect all courses

In [63]:
for key in udemy_subjects:
    print(len(udemy_subjects[key]))

118
118
118
118


In [321]:
# just need to extract images from website
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

for subject in udemy_subjects["Subject"]:
    primary = u_subjects[subject].replace("&","and").replace(" ","-")
    subject = subject.replace("&","and").replace(" ","-")
    url = "https://www.udemy.com/courses/" + primary + "/" + subject + "/"
    # TEMPORARY
    udemy_subjects["Image"].append(url)
    

In [322]:
for key in udemy_subjects:
    print(len(udemy_subjects[key]))

118
118
118
118


In [65]:
df_udemy_subjects = pd.DataFrame.from_dict(udemy_subjects)

In [67]:
df_udemy_subjects = df_udemy_subjects[["Subject","Provider","Image","Courses"]]

In [68]:
df_udemy_subjects.head()

Unnamed: 0,Subject,Provider,Image,Courses
0,Web Development,Udemy,https://udemy-images.udemy.com/course/125_H/15...,"215,216,217,219,224,229,230,233,237,240,241,25..."
1,Other,Udemy,https://udemy-images.udemy.com/course/125_H/14...,"218,317,366,401,456,488,525,549,596,598,661,67..."
2,Programming Languages,Udemy,https://udemy-images.udemy.com/course/125_H/32...,"220,221,231,234,260,294,309,342,372,387,395,39..."
3,Personal Transformation,Udemy,https://udemy-images.udemy.com/course/125_H/14...,"222,225,288,321,367,402,504,507,510,541,572,57..."
4,Personal Finance,Udemy,https://udemy-images.udemy.com/course/125_H/13...,"223,363,484,779,801,826,936,1089,1447,1492,150..."


In [69]:
df_udemy_subjects.to_csv("udemy_subjects.csv")

## TIME TO MERGE

In [86]:
df_udemy_subjects.head()

Unnamed: 0,Subject,Provider,Image,Courses
0,Web Development,Udemy,https://udemy-images.udemy.com/course/125_H/15...,"215,216,217,219,224,229,230,233,237,240,241,25..."
1,Other,Udemy,https://udemy-images.udemy.com/course/125_H/14...,"218,317,366,401,456,488,525,549,596,598,661,67..."
2,Programming Languages,Udemy,https://udemy-images.udemy.com/course/125_H/32...,"220,221,231,234,260,294,309,342,372,387,395,39..."
3,Personal Transformation,Udemy,https://udemy-images.udemy.com/course/125_H/14...,"222,225,288,321,367,402,504,507,510,541,572,57..."
4,Personal Finance,Udemy,https://udemy-images.udemy.com/course/125_H/13...,"223,363,484,779,801,826,936,1089,1447,1492,150..."


In [338]:
ka_subject_df['Subject'] = ka_subject_df['Subject'].apply(lambda s: " ".join(w.capitalize() for w in s.split()))

In [87]:
ka_subject_df.head()

Unnamed: 0,Subject,Provider,Image,Courses
0,Statistics Probability,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,0123456789101112131415
1,Probability,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,1617181920
2,Multivariable Calculus,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,2122232425
3,Differential Equations,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,262728
4,Linear Algebra,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,293031


In [88]:
df_subjects = pd.concat([ka_subject_df,df_udemy_subjects])

In [90]:
df_subjects = df_subjects.reset_index()
df_subjects.head()

Unnamed: 0,index,Subject,Provider,Image,Courses
0,0,Statistics Probability,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,0123456789101112131415
1,1,Probability,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,1617181920
2,2,Multivariable Calculus,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,2122232425
3,3,Differential Equations,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,262728
4,4,Linear Algebra,Khan Academy,https://cdn.kastatic.org/genfiles/topic-icons/...,293031


In [91]:
df_subjects.to_csv("subjects.csv")

## MORE MERGING

In [22]:
df_udemy["Subject"] = df_udemy["PrimarySubcategory"]

In [23]:
good = df_udemy.columns[1:]
good2 = [df_udemy.columns[-1]]

In [251]:
good2

['Subject']

In [24]:
good = list(good[:6]) + good2

In [25]:
df_udemy = df_udemy[good]

In [26]:
df_udemy.head()

Unnamed: 0,Course,Description,Image,Instructor,Link,Price,Subject
0,JavaScript: Understanding the Weird Parts,An advanced JavaScript course for everyone! Sc...,https://udemy-images.udemy.com/course/125_H/36...,Anthony Alicea,/understand-javascript/,$174.99,Web Development
1,Learn and Understand AngularJS,Master AngularJS and the Javascript concepts b...,https://udemy-images.udemy.com/course/125_H/28...,Anthony Alicea,/learn-angularjs/,$174.99,Web Development
2,Webpack 2: The Complete Developer's Guide,Master Webpack 2 as you deploy web apps suppor...,https://udemy-images.udemy.com/course/125_H/10...,Stephen Grider,/webpack-2-the-complete-developers-guide/,$74.99,Web Development
3,Blockchain and Bitcoin Fundamentals,Learn the key elements of blockchain and Bitco...,https://udemy-images.udemy.com/course/125_H/11...,George Levy,/blockchain-and-bitcoin-fundamentals/,$94.99,Other
4,Build Responsive Real World Websites with HTML...,"The easiest way to learn modern web design, HT...",https://udemy-images.udemy.com/course/125_H/43...,Jonas Schmedtmann,/design-and-develop-a-killer-website-with-html...,$199.99,Web Development


In [29]:
df_udemy.loc[:,"Provider"] = df_udemy["Course"].apply(lambda x: "Udemy") # lazy way to insert column

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [30]:
df_udemy.head()

Unnamed: 0,Course,Description,Image,Instructor,Link,Price,Subject,Provider
0,JavaScript: Understanding the Weird Parts,An advanced JavaScript course for everyone! Sc...,https://udemy-images.udemy.com/course/125_H/36...,Anthony Alicea,/understand-javascript/,$174.99,Web Development,Udemy
1,Learn and Understand AngularJS,Master AngularJS and the Javascript concepts b...,https://udemy-images.udemy.com/course/125_H/28...,Anthony Alicea,/learn-angularjs/,$174.99,Web Development,Udemy
2,Webpack 2: The Complete Developer's Guide,Master Webpack 2 as you deploy web apps suppor...,https://udemy-images.udemy.com/course/125_H/10...,Stephen Grider,/webpack-2-the-complete-developers-guide/,$74.99,Web Development,Udemy
3,Blockchain and Bitcoin Fundamentals,Learn the key elements of blockchain and Bitco...,https://udemy-images.udemy.com/course/125_H/11...,George Levy,/blockchain-and-bitcoin-fundamentals/,$94.99,Other,Udemy
4,Build Responsive Real World Websites with HTML...,"The easiest way to learn modern web design, HT...",https://udemy-images.udemy.com/course/125_H/43...,Jonas Schmedtmann,/design-and-develop-a-killer-website-with-html...,$199.99,Web Development,Udemy


In [31]:
ka_df = ka_df[good]

In [35]:
ka_df['Subject'] = ka_df['Subject'].apply(lambda s: " ".join(w.capitalize() for w in s.split()))

In [33]:
ka_df["Provider"] = ka_df["Course"].apply(lambda x: "Khan Academy")

In [36]:
ka_df.head()

Unnamed: 0,Course,Description,Image,Instructor,Link,Price,Subject,Provider
0,Analyzing categorical data,This unit covers methods for dealing with data...,https://cdn.kastatic.org/genfiles/topic-icons/...,,https://www.khanacademy.org/math/statistics-pr...,Free,Statistics Probability,Khan Academy
1,Displaying and comparing quantitative data,This unit covers some basic methods for graphi...,https://cdn.kastatic.org/genfiles/topic-icons/...,,https://www.khanacademy.org/math/statistics-pr...,Free,Statistics Probability,Khan Academy
2,Summarizing quantitative data,This unit covers common measures of center lik...,https://cdn.kastatic.org/genfiles/topic-icons/...,,https://www.khanacademy.org/math/statistics-pr...,Free,Statistics Probability,Khan Academy
3,Modeling data distributions,This unit takes our understanding of distribut...,https://cdn.kastatic.org/genfiles/topic-icons/...,,https://www.khanacademy.org/math/statistics-pr...,Free,Statistics Probability,Khan Academy
4,Exploring bivariate numerical data,We use scatter plots to explore the relationsh...,https://cdn.kastatic.org/genfiles/topic-icons/...,,https://www.khanacademy.org/math/statistics-pr...,Free,Statistics Probability,Khan Academy


In [37]:
df_courses = pd.concat([ka_df,df_udemy])

In [41]:
df_courses = df_courses.reset_index()
df_courses = df_courses[df_courses.columns[1:]]

In [93]:
df_courses

Unnamed: 0,Course,Description,Image,Instructor,Link,Price,Subject,Provider
0,Analyzing categorical data,This unit covers methods for dealing with data...,https://cdn.kastatic.org/genfiles/topic-icons/...,,https://www.khanacademy.org/math/statistics-pr...,Free,Statistics Probability,Khan Academy
1,Displaying and comparing quantitative data,This unit covers some basic methods for graphi...,https://cdn.kastatic.org/genfiles/topic-icons/...,,https://www.khanacademy.org/math/statistics-pr...,Free,Statistics Probability,Khan Academy
2,Summarizing quantitative data,This unit covers common measures of center lik...,https://cdn.kastatic.org/genfiles/topic-icons/...,,https://www.khanacademy.org/math/statistics-pr...,Free,Statistics Probability,Khan Academy
3,Modeling data distributions,This unit takes our understanding of distribut...,https://cdn.kastatic.org/genfiles/topic-icons/...,,https://www.khanacademy.org/math/statistics-pr...,Free,Statistics Probability,Khan Academy
4,Exploring bivariate numerical data,We use scatter plots to explore the relationsh...,https://cdn.kastatic.org/genfiles/topic-icons/...,,https://www.khanacademy.org/math/statistics-pr...,Free,Statistics Probability,Khan Academy
5,Study design,Statistics is all about forming questions and ...,https://cdn.kastatic.org/genfiles/topic-icons/...,,https://www.khanacademy.org/math/statistics-pr...,Free,Statistics Probability,Khan Academy
6,Probability,Probability tells us how often some event will...,https://cdn.kastatic.org/genfiles/topic-icons/...,,https://www.khanacademy.org/math/statistics-pr...,Free,Statistics Probability,Khan Academy
7,"Counting, permutations, and combinations",This unit covers methods for counting how many...,https://cdn.kastatic.org/genfiles/topic-icons/...,,https://www.khanacademy.org/math/statistics-pr...,Free,Statistics Probability,Khan Academy
8,Random variables,Random variables can be any outcomes from some...,https://cdn.kastatic.org/genfiles/topic-icons/...,,https://www.khanacademy.org/math/statistics-pr...,Free,Statistics Probability,Khan Academy
9,Sampling distributions,A sampling distribution shows every possible r...,https://cdn.kastatic.org/genfiles/topic-icons/...,,https://www.khanacademy.org/math/statistics-pr...,Free,Statistics Probability,Khan Academy


In [94]:
df_courses.to_csv("courses.csv")

In [314]:
df_courses[df_courses.Course.str.contains("golang")].Course[275].encode("utf8").decode("utf8").encode("ISO-8859-1").decode("ISO-8859-1")

'Web Development w/ Google\x92s Go (golang) Programming Language'

In [3]:
df_subjects = pd.read_csv("subjects.csv")

In [6]:
df_subjects = df_subjects[df_subjects.columns[1:]]

In [12]:
df_subjects[df_subjects.provider == "Udemy"].subject

22              Web Development
23                        Other
24        Programming Languages
25      Personal Transformation
26             Personal Finance
27             Data & Analytics
28        Memory & Study Skills
29                Home Business
30            Operating Systems
31                  Mobile Apps
32                      Finance
33           Network & Security
34             IT Certification
35               Math & Science
36                    Microsoft
37           Project Management
38             Entrepreneurship
39                    Databases
40             Software Testing
41            Development Tools
42                   Meditation
43               Communications
44                   Web Design
45         Software Engineering
46                       Sports
47       Social Media Marketing
48            Digital Marketing
49             Game Development
50              User Experience
51                   Creativity
                 ...           
110     

In [20]:
df_udemy = pd.read_csv("udemy_courses.csv",encoding = "ISO-8859-1")

In [34]:
def extract_image(subject):
    return df_udemy.Image[df_udemy[df_udemy.PrimarySubcategory == subject].Rating.idxmax()]

In [46]:
df_subjects.loc[df_subjects.provider == "Udemy","image"] = df_subjects[df_subjects.provider == "Udemy"].subject.apply(lambda x: extract_image(x))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [53]:
df_subjects.to_csv("subjects.csv")