In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time
import chromedriver_autoinstaller
from urllib.parse import unquote_plus

# Automatically download and install the matching ChromeDriver
chromedriver_autoinstaller.install()

'/home/work/.local/lib/python3.10/site-packages/chromedriver_autoinstaller/126/chromedriver'

In [2]:
#pip install selenium chromedriver_autoinstaller markdownify

In [3]:
# List of subjects
subjects = [
    "Architecture", "Art+%26+Culture", "Biology+%26+Life+Sciences", "Business+%26+Management",
    "Chemistry", "Communication", "Computer+Science", "Data+Analysis+%26+Statistics", "Design",
    "Economics+%26+Finance", "Education+%26+Teacher+Training", "Electronics", "Energy+%26+Earth+Sciences",
    "Engineering", "Environmental+Studies", "Ethics", "Food+%26+Nutrition", "Health+%26+Safety", "History",
    "Humanities", "Language", "Law", "Literature", "Math", "Medicine", "Music", "Philanthropy",
    "Philosophy+%26+Ethics", "Physics", "Science", "Social+Sciences"
]

In [4]:
# Configure Chrome options
chrome_options = Options()
chrome_options.add_argument('--headless')  # Run headless Chrome if needed
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--remote-debugging-port=9222')

# Initialize WebDriver
driver = webdriver.Chrome(service=Service('/home/work/.local/lib/python3.10/site-packages/chromedriver_autoinstaller/126/chromedriver'), options=chrome_options)

wait = WebDriverWait(driver, 10)

In [5]:
def get_last_page_number(driver):
    time.sleep(2)  # Ensure the page has fully loaded
    try:
        # Find the <ul> element with class "pagination"
        pagination_element = driver.find_element(By.CSS_SELECTOR, "ul.pagination")
        pagination_html = pagination_element.get_attribute('outerHTML')
        soup = BeautifulSoup(pagination_html, 'html.parser')
        
        page_items = soup.find_all('li', class_='page-item')
        if len(page_items) > 1:
            last_page_number = int(page_items[-2].text.strip())
        else:
            last_page_number = 1
    except Exception as e:
        print(f"Error extracting pagination: {e}")
        last_page_number = 1
    return last_page_number

In [6]:
def get_course_data(driver, subject, page):
    course_data = []
    url = f"https://www.edx.org/search?tab=course&page={page}&subject={subject}"
    driver.get(url)
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'search-results-container')))
    time.sleep(1.5)  # Ensure the page is fully loaded
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Find all course cards
    course_cards = soup.find_all('a', class_='base-card-link')
    if not course_cards:
        return []

    # Loop through each course link
    for card in course_cards:
        href = card['href']
        course_url = "https://www.edx.org" + href
        driver.get(course_url)
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'course-header'))) 
        course_soup = BeautifulSoup(driver.page_source, 'html.parser')


        # Extract course details
        title = course_soup.find('div', class_='course-header').find('h1').text.strip()
        rating_element = course_soup.find('div', class_='course-header').find('div', class_='h5 ml-1 mr-3 mb-0')
        rating = rating_element.text.strip() if rating_element else None
        sub_info_element = course_soup.find('div', class_='course-header').find('div', class_='p').find('p')
        sub_info = sub_info_element.text.strip() if sub_info_element else None
        image_url_element = course_soup.find('div', class_='course-header').find('img', class_='CloudflareImage')
        image_url = image_url_element['src'] if image_url_element else None

        preview_expand_components = course_soup.find('div', class_='course-main container-mw-sm container-fluid').find_all('div', class_='preview-expand-component')
        about = ' '.join([p.text.strip() for p in preview_expand_components[0].find_all('p')]) if len(preview_expand_components) > 0 else None
        what_you_will_learn = '. '.join([li.text.strip() for li in preview_expand_components[1].find_all('li')]) if len(preview_expand_components) > 1 else None
        syllabus = ' '.join([p.text.strip() for p in preview_expand_components[2].find_all('p')]) if len(preview_expand_components) > 2 else None

       # Extract institution, subject, level, and language information
        info_elements = course_soup.find('div', class_='course-main container-mw-sm container-fluid').find('ul', class_='mb-0 pl-3 ml-1').find_all('li')

        #Sometime a some time p 
        institution_elem = info_elements[0].find('a') or info_elements[0].find('p')

        institution = institution_elem.text.strip() if institution_elem else None
        subject = info_elements[1].find('a').text.strip() if len(info_elements) > 1 else None
        level = info_elements[2].text.strip().replace('Level: ', '') if len(info_elements) > 2 else None
        
        language_element = course_soup.find('div', class_='course-main container-mw-sm container-fluid').find_all('div', class_='col-12 col-md-6')[1].find('ul', class_='pl-3 ml-1 mb-0').find_all('li')[0]
        language = language_element.text.strip().replace('Language: ', '') if language_element else None

        course_data.append({
            'title': title,
            'sub_info': sub_info,
            'rating': rating,
            'subject': subject,
            'level': level,
            'language': language,
            'institution': institution,
            'about': about,
            'what_you_will_learn': what_you_will_learn,
            'syllabus': syllabus,
            'image_url': image_url,
            'course_url': course_url
        })
    return course_data

In [7]:
all_course_data = []

# Loop through each subject
for subject in subjects:
    subject_course = []
    
    print(f"Fetching {subject}")

    url = f"https://www.edx.org/search?tab=course&page=1&subject={subject}"
    driver.get(url)
    
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'search-results-container')))
    last_page_number = get_last_page_number(driver)

    print(f"Last page number for subject {subject}: {last_page_number}")

    # Loop through each page
    for page in range(1, last_page_number + 1):
        course_data = get_course_data(driver, subject, page)
        print(f"Fetching page {page} for subject {subject}, count: {len(course_data)}")
        subject_course.extend(course_data)
        all_course_data.extend(course_data)
    
    print(f'===> Total courses fetched for {subject}: {len(subject_course)}')
    
# Close the browser
driver.quit()

Fetching Architecture
Last page number for subject Architecture: 4
Fetching page 1 for subject Architecture, count: 24
Fetching page 2 for subject Architecture, count: 24
Fetching page 3 for subject Architecture, count: 24
Fetching page 4 for subject Architecture, count: 1
===> Total courses fetched for Architecture: 73
Fetching Art+%26+Culture
Last page number for subject Art+%26+Culture: 11
Fetching page 1 for subject Art+%26+Culture, count: 24
Fetching page 2 for subject Art+%26+Culture, count: 24
Fetching page 3 for subject Art+%26+Culture, count: 24
Fetching page 4 for subject Art+%26+Culture, count: 24
Fetching page 5 for subject Art+%26+Culture, count: 24
Fetching page 6 for subject Art+%26+Culture, count: 24
Fetching page 7 for subject Art+%26+Culture, count: 24
Fetching page 8 for subject Art+%26+Culture, count: 24
Fetching page 9 for subject Art+%26+Culture, count: 24
Fetching page 10 for subject Art+%26+Culture, count: 24
Fetching page 11 for subject Art+%26+Culture, count: 

TimeoutException: Message: 
Stacktrace:
#0 0x5601fc9866aa <unknown>
#1 0x5601fc6690dc <unknown>
#2 0x5601fc6b5931 <unknown>
#3 0x5601fc6b5a21 <unknown>
#4 0x5601fc6fa234 <unknown>
#5 0x5601fc6d889d <unknown>
#6 0x5601fc6f75c3 <unknown>
#7 0x5601fc6d8613 <unknown>
#8 0x5601fc6a84f7 <unknown>
#9 0x5601fc6a8e4e <unknown>
#10 0x5601fc94c87b <unknown>
#11 0x5601fc950921 <unknown>
#12 0x5601fc93836e <unknown>
#13 0x5601fc951482 <unknown>
#14 0x5601fc91cccf <unknown>
#15 0x5601fc9760a8 <unknown>
#16 0x5601fc976280 <unknown>
#17 0x5601fc9857dc <unknown>
#18 0x7fe5ce253ac3 <unknown>


In [8]:
len(all_course_data)

5254

In [None]:
# ## Debug page 7 art and culture
# url = "https://www.edx.org/learn/architecture/harvard-university-the-architectural-imagination?index=product&queryID=04a3bf35c76e5138d31121e13db98a32&position=1&results_level=second-level-results&term=&objectID=course-8c411679-4105-4de9-8a4c-9c5b3f4a33a6&campaign=The+Architectural+Imagination&source=edX&product_category=course&placement_url=https%3A%2F%2Fwww.edx.org%2Fsearch"

# driver.get(url)
# wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'course-header'))) 
# time.sleep(1.2) 
# course_soup = BeautifulSoup(driver.page_source, 'html.parser')

# # Extract course details
# title = course_soup.find('div', class_='course-header').find('h1').text.strip()
# print(title)


# # Extract institution, subject, level, and language information
# info_elements = course_soup.find('div', class_='course-main container-mw-sm container-fluid').find('ul', class_='mb-0 pl-3 ml-1').find_all('li')

# #Sometime a some time p 
# institution_elem = info_elements[0].find('a') or info_elements[0].find('p')

# institution = institution_elem.text.strip() if institution_elem else None
# subject = info_elements[1].find('a').text.strip() if len(info_elements) > 1 else None
# level = info_elements[2].text.strip().replace('Level: ', '') if len(info_elements) > 2 else None

# language_element = course_soup.find('div', class_='course-main container-mw-sm container-fluid').find_all('div', class_='col-12 col-md-6')[1].find('ul', class_='pl-3 ml-1 mb-0').find_all('li')[0]
# language = language_element.text.strip().replace('Language: ', '') if language_element else None


# print(institution)
# print(subject)
# print(level)
# print(language)

In [9]:
# Save data to a DataFrame and CSV
df = pd.DataFrame(all_course_data)
df.to_csv('edx_courses.csv', index=False)

print("Scraping complete. Data saved to edx_courses.csv")

Scraping complete. Data saved to edx_courses.csv


In [10]:
print(len(df))
df.head()

5254


Unnamed: 0,title,sub_info,rating,subject,level,language,institution,about,what_you_will_learn,syllabus,image_url,course_url
0,HarvardX: The Architectural Imagination,Learn fundamental principles of architecture —...,4.7 stars,Architecture,Introductory,English,HarvardX,Architecture engages a culture’s deepest socia...,"How to read, analyze, and understand different...",Part I: Form and History Part II: The Technolo...,https://prod-discovery.edx-cdn.org/cdn-cgi/ima...,https://www.edx.org/learn/architecture/harvard...
1,MITx: Sustainable Building Design,"Learn and explore key scientific principles, t...",,Architecture,Intermediate,English,MITx,"Meeting growing global energy demand, while mi...",Understand the scientific principles underlyin...,Week 1 - Energy Use in Buildings Week 2 - Unde...,https://prod-discovery.edx-cdn.org/cdn-cgi/ima...,https://www.edx.org/learn/sustainable-developm...
2,TokyoTechX: Japanese Architecture and Structur...,"In this revised course, fundamental and modern...",4.2 stars,Architecture,Intermediate,English,TokyoTechX,"In this revised course, in depth video lecture...",Evolution of seismic design concepts in Japan....,WEEK 1 : History of Japanese Structural Design...,https://prod-discovery.edx-cdn.org/cdn-cgi/ima...,https://www.edx.org/learn/architecture/tokyo-i...
3,"NUS: Data Science for Construction, Architectu...",This course introduces data science skills tar...,4.6 stars,Data Analysis & Statistics,Introductory,English,NUS,The building industry is exploding with data s...,Why data science is important for the built en...,Section 1: Introduction to Course and Python F...,https://prod-discovery.edx-cdn.org/cdn-cgi/ima...,https://www.edx.org/learn/data-science/the-nat...
4,SDGAcademyX: Sustainable Cities,"Learn how government, the private sector, and ...",4.7 stars,Environmental Studies,Intermediate,English,SDGAcademyX,Did you know that experts estimate an addition...,"An overview of governance, land management, ut...",Module 1: The urban opportunity Module 2: What...,https://prod-discovery.edx-cdn.org/cdn-cgi/ima...,https://www.edx.org/learn/sustainability/sdg-a...


In [11]:
df['subject'].unique()

array(['Architecture', 'Data Analysis & Statistics',
       'Environmental Studies', 'Computer Science', 'Engineering',
       'Humanities', 'Social Sciences', 'Business & Management',
       'Energy & Earth Sciences', 'Art & Culture', 'Design', 'History',
       'Language', 'Literature', 'Communication',
       'Education & Teacher Training', 'Music', 'Food & Nutrition',
       'Health & Safety', 'Medicine', 'Biology & Life Sciences', 'Math',
       'Physics', 'Science', 'Chemistry', 'Philosophy & Ethics',
       'Economics & Finance', 'Law', 'Ethics', 'Electronics'],
      dtype=object)

In [13]:
df.iloc[3]['what_you_will_learn']

'Why data science is important for the built environment. Why building industry professionals should learn how to code. A jump start in the Python Programming Language. Overview of the Pandas data analysis library. Guidance in the loading, processing, and merging of data. Visualization of data from buildings. Basic machine learning concepts applied to building data. Examples of parametric analysis for the integrated design process. Examples of how to process time-series data from IoT sensors. Examples of analysis of thermal comfort data from occupants. Numerous starting points for using data science in other building-related tasks'