In [5]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

In [6]:
# 1. K-MOOC
# Walk the course-search JSON API page by page, scrape each course's detail
# page, and write the collected rows to a numbered CSV file every
# `page_cnt_per_file` pages (kmooc_lecture_1.csv, kmooc_lecture_2.csv, ...).
home_base_url = "https://www.kmooc.kr/"
search_url = "https://www.kmooc.kr/json/course/category/search"
detail_base_url = "https://www.kmooc.kr/view/course/detail/"
total_page = 5
subj_info_list = []
curr_pg_no = 0
page_cnt_per_file = 5
request_timeout = 30  # seconds; without a timeout a stalled connection blocks the cell forever

# Detail-page row labels that map one-to-one onto an output column.
label_to_column = {
    "분야": "field",
    "강좌만족도": "satisfaction",
    "강좌언어": "language",
}

for p in range(1, total_page+1):
    params = {
        "page": f"{p}",
        "sby": "id",
        "sorder": "desc",
        "open_status": "-1",
        "main_categoryid": "14",
        "sub_categoryid": "22",
        "interest_categoryid": "-1",
        "period_categoryid": "-1",
        "institutionid": "-1",
        "mobile": "1"
    }
    search_resp = requests.post(search_url, params=params, timeout=request_timeout)
    # Fail loudly on an HTTP error instead of with a confusing JSON decode error.
    search_resp.raise_for_status()
    contents = search_resp.json()
    subject_list = contents["list"]

    for course in subject_list:
        subj_info_dict = {}

        detail_url = detail_base_url + str(course["id"])
        subj_info_dict["detail_link"] = detail_url
        detail_resp = requests.get(detail_url, timeout=request_timeout)
        # Name the parser explicitly so results do not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(detail_resp.content, "html.parser")

        # NOTE(review): "catagory" is kept exactly as written; presumably it
        # matches the class name used by the site's markup — confirm.
        cat_tags = soup.select(".list .catagory")
        cont_tags = soup.select(".list .content")
        title_tags = soup.select(".list .title")
        img_tags = soup.select(".card_img")

        # The thumbnail URL is the text between the first pair of single
        # quotes in the element's inline style. Missing element/attribute
        # yields None instead of aborting the whole crawl.
        thumbnail = None
        if img_tags:
            style_parts = img_tags[0].get("style", "").split("'")
            if len(style_parts) > 1:
                thumbnail = style_parts[1]
        subj_info_dict["thumbnail"] = thumbnail

        # The last matching title tag wins. When its text spans several lines
        # the second line is used as the title, otherwise the only line.
        course_title = None
        if title_tags:
            title_lines = title_tags[-1].text.strip().split("\n")
            course_title = title_lines[1] if len(title_lines) > 1 else title_lines[0]
        subj_info_dict["title"] = course_title

        # Labels and values are paired by position; zip() stops at the shorter
        # list so an unpaired label cannot raise IndexError.
        for cat_tag, cont_tag in zip(cat_tags, cont_tags):
            category = cat_tag.text.strip()
            content = cont_tag.text.strip()

            if category in label_to_column:
                subj_info_dict[label_to_column[category]] = content
            elif category.startswith("운영기관"):
                # Only the first line of the value is kept.
                subj_info_dict["institution"] = content.split("\n")[0]

        subj_info_list.append(subj_info_dict)

    if p % page_cnt_per_file == 0:
        curr_pg_no = p // page_cnt_per_file
        df = pd.DataFrame(subj_info_list)
        df.to_csv(f"kmooc_lecture_{curr_pg_no}.csv", index=False, encoding='UTF8')
        subj_info_list.clear()

# Flush the rows of a trailing partial batch. Skipped when nothing is pending,
# so a page count that is an exact multiple of `page_cnt_per_file` no longer
# produces an extra, empty CSV file.
if subj_info_list:
    curr_pg_no += 1
    df = pd.DataFrame(subj_info_list)
    df.to_csv(f"kmooc_lecture_{curr_pg_no}.csv", index=False, encoding='UTF8')

In [65]:
#2. Udemy
# Read the category links from the Udemy home page, then page through the
# course-listing API for each category (at most `page_no_limit` pages each)
# and write every course found to udemy_lecture.csv.
page_no_limit = 10
subj_info_list = []
request_timeout = 30  # seconds; without a timeout a stalled connection blocks the cell forever

home_base_url = "https://www.udemy.com"
course_api_url = home_base_url + "/api-2.0/discovery-units/all_courses/"
home_resp = requests.get(home_base_url, timeout=request_timeout)
# Name the parser explicitly so results do not depend on which optional
# parser libraries happen to be installed.
home_soup = BeautifulSoup(home_resp.content, "html.parser")
cat_tags = home_soup.select(".js-side-nav-cat")

for cat_tag in cat_tags:

    # Skip links whose href has more than four "/"-separated parts —
    # presumably subcategory entries (TODO confirm against the live markup).
    if len(cat_tag.get("href", "").split("/")) > 4:
        continue

    category_id = cat_tag.get("data-id")
    if category_id is None:
        # Without an id there is nothing to query for this link.
        continue

    for page_no in range(1, page_no_limit + 1):
        # Same keys, values and order as the query string this replaces;
        # empty strings are still sent as "key=".
        cat_params = {
            "p": page_no,
            "page_size": 16,
            "subcategory": "",
            "instructional_level": "",
            "lang": "",
            "price": "",
            "duration": "",
            "closed_captions": "",
            "subs_filter_type": "",
            "category_id": category_id,
            "source_page": "category_page",
            "locale": "en_US",
            "currency": "krw",
            "navigation_locale": "en_US",
            "skip_price": "true",
            "sos": "pc",
            "fl": "cat",
        }
        cat_resp = requests.get(course_api_url, params=cat_params, timeout=request_timeout)
        if not cat_resp.ok:
            # An error status (e.g. a page past the end of the category) ends
            # this category only; rows collected so far are kept.
            break
        cat_json_cont = cat_resp.json()

        lecture_list = cat_json_cont["unit"]["items"]
        if not lecture_list:
            # An empty page means the category has been exhausted.
            break

        for lec in lecture_list:
            subj_info_dict = {}

            detail_url = home_base_url + lec["url"]
            subj_info_dict["detail_link"] = detail_url
            subj_info_dict["title"] = lec["title"]
            subj_info_dict["satisfaction"] = round(lec["avg_rating"], 1)

            # The first listed instructor is used; None when the list is empty.
            instructors = lec["visible_instructors"]
            subj_info_dict["institution"] = instructors[0]["display_name"] if instructors else None
            subj_info_dict["thumbnail"] = lec["image_240x135"]

            # field = "<category title>" plus "/<label title>" when a label exists.
            context_info = lec["context_info"]
            field = context_info["category"]["title"]
            label = context_info["label"]
            if label is not None and "title" in label:
                field += "/" + label["title"]

            subj_info_dict["field"] = field

            subj_info_list.append(subj_info_dict)

df = pd.DataFrame(subj_info_list)
df.to_csv("udemy_lecture.csv", index=False, encoding='UTF8')

In [67]:
#3. Inflearn
# Page through the Inflearn course-search API (pages 1..page_no_limit, 60
# courses per page) and write every course found to inflearn_lecture.csv.
page_no_limit = 10
subj_info_list = []
request_timeout = 30  # seconds; without a timeout a stalled connection blocks the cell forever

home_base_url = "https://www.inflearn.com/courses"
search_api_url = home_base_url + "/client/api/v1/course/search"

# The page counter is owned by this loop. The previous version tested a
# `page_no` variable that this cell never assigned, so the result depended on
# whatever an earlier cell had left in the kernel (or raised NameError on a
# fresh one), and its range covered one page more than `page_no_limit`.
for page_no in range(1, page_no_limit + 1):
    search_params = {
        "isDiscounted": "false",
        "isNew": "false",
        "pageNumber": page_no,
        "pageSize": 60,
        "types": "ONLINE",
    }
    search_resp = requests.get(search_api_url, params=search_params, timeout=request_timeout)
    # Fail loudly on an HTTP error instead of with a confusing JSON decode error.
    search_resp.raise_for_status()
    search_json = search_resp.json()

    attrib_token = search_json["data"]["attributionToken"]
    lec_list = search_json["data"]["items"]
    if not lec_list:
        # An empty page means there are no more courses to fetch.
        break

    for lec in lec_list:
        subj_info_dict = {}
        course = lec["course"]

        # The token returned with the search page is appended to every detail link.
        detail_url = f"{home_base_url}/{course['slug']}?attributionToken={attrib_token}"
        subj_info_dict["detail_link"] = detail_url
        subj_info_dict["title"] = course["title"]
        subj_info_dict["satisfaction"] = course["star"]
        subj_info_dict["institution"] = lec["instructor"]["name"]
        subj_info_dict["thumbnail"] = course["thumbnailUrl"]

        cat_parent = course["metadata"]["parentCategories"]
        cat_child = course["metadata"]["childCategories"]

        # Output format is unchanged: every category name is followed by "/",
        # with parents and children separated by " > ",
        # e.g. "Parent/ > ChildA/ChildB/".
        parent_part = "".join(f"{name}/" for name in cat_parent)
        child_part = "".join(f"{name}/" for name in cat_child)
        subj_info_dict["field"] = parent_part + " > " + child_part

        subj_info_list.append(subj_info_dict)

df = pd.DataFrame(subj_info_list)
df.to_csv("inflearn_lecture.csv", index=False, encoding='UTF8')