In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import csv

In [2]:
# List of topics to scrape
#topics = ["Business", "Computer Science", "Information Technology", "Data Science", "Health", 
        #  "Physical Science and Engineering", "Social Sciences", "Arts and Humanities", 
        #  "Personal Development", "Language Learning", "Math and Logic"]

# List of topics to scrape
# topics = ["Data Science", "Health", 
#           "Physical Science and Engineering", "Social Sciences", "Arts and Humanities", 
#           "Personal Development", "Language Learning", "Math and Logic"]

topics = ["Language Learning", "Math and Logic"]

In [3]:
def setup_driver():
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    return driver

In [4]:
def get_total_pages(driver):
    try:
        pagination = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "ul.cds-pagination-nav"))
        )
        last_page_button = pagination.find_elements(By.CSS_SELECTOR, "button.cds-paginationItem-default")[-1]
        return int(last_page_button.text)
    except NoSuchElementException:
        print("Pagination not found. Assuming single page.")
        return 1

In [5]:
def get_course_urls(driver, page_url):
    driver.get(page_url)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "li.cds-9"))
    )
    
    # Scroll to load all courses
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    course_elements = soup.find_all('li', class_='cds-9')
    
    course_urls = []
    for course_element in course_elements:
        url = course_element.find('a', class_='cds-119')['href']
        course_urls.append('https://www.coursera.org' + url)
    
    return course_urls

In [6]:
def get_course_info(driver, url):
    driver.get(url)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "h1[data-e2e='hero-title']"))
    )
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    title = soup.select_one("h1[data-e2e='hero-title']").text.strip()
    
    try:
        description = soup.select_one("div.css-12wvpfc p").text.strip()
    except AttributeError:
        description = "Description not found"
    
    try:
        rating = soup.select_one("div.css-139h6xi div").text.strip()
    except AttributeError:
        rating = "N/A"
    
    try:
        duration = soup.select_one("div.css-fw9ih3 div").text.strip()
    except AttributeError:
        duration = "Duration not found"
    
    what_you_will_learn = [item.text.strip() for item in soup.select("div[data-track-component='what_you_will_learn_section'] li")]
    
    skills_you_will_gain = [skill.text.strip() for skill in soup.select("div.css-1m3kxpf ul.css-yk0mzy li span")]
    
    return {
        'title': title,
        'description': description,
        'rating': rating,
        'duration': duration,
        'what_you_will_learn': what_you_will_learn,
        'skills_you_will_gain': skills_you_will_gain,
        'url': url
    }

In [7]:
driver = setup_driver()
all_courses = []

In [8]:
for topic in topics:
    
    print(f"Scraping courses for topic: {topic}")
    base_url = f"https://www.coursera.org/courses?topic={topic}&sortBy=BEST_MATCH"
    driver.get(base_url)
    total_pages = get_total_pages(driver)
    #total_pages = 1
    print(f"Total pages for {topic}: {total_pages}")
        
    for page in range(1, total_pages):
        page_url = f"{base_url}&page={page}"
        print(f"Scraping page {page} of {total_pages} for {topic}")
        course_urls = get_course_urls(driver, page_url)
            
        for url in course_urls:
            #print(f"Scraping course: {url}")
            course_info = get_course_info(driver, url)
            course_info['topic'] = topic
            all_courses.append(course_info)
            time.sleep(1)  # Be respectful with request frequency
            
        time.sleep(2)  # Additional delay between pages
        
    time.sleep(3)  # Additional delay between topics
    
driver.quit()

Scraping courses for topic: Language Learning
Total pages for Language Learning: 19
Scraping page 1 of 19 for Language Learning
Scraping page 2 of 19 for Language Learning
Scraping page 3 of 19 for Language Learning
Scraping page 4 of 19 for Language Learning
Scraping page 5 of 19 for Language Learning
Scraping page 6 of 19 for Language Learning
Scraping page 7 of 19 for Language Learning
Scraping page 8 of 19 for Language Learning
Scraping page 9 of 19 for Language Learning
Scraping page 10 of 19 for Language Learning
Scraping page 11 of 19 for Language Learning
Scraping page 12 of 19 for Language Learning
Scraping page 13 of 19 for Language Learning
Scraping page 14 of 19 for Language Learning
Scraping page 15 of 19 for Language Learning
Scraping page 16 of 19 for Language Learning
Scraping page 17 of 19 for Language Learning
Scraping page 18 of 19 for Language Learning
Scraping courses for topic: Math and Logic
Total pages for Math and Logic: 9
Scraping page 1 of 9 for Math and Logi

TimeoutException: Message: 
Stacktrace:
0   chromedriver                        0x0000000106946d18 chromedriver + 4996376
1   chromedriver                        0x000000010693e5da chromedriver + 4961754
2   chromedriver                        0x00000001064e1d10 chromedriver + 388368
3   chromedriver                        0x000000010652e30f chromedriver + 701199
4   chromedriver                        0x000000010652e3f1 chromedriver + 701425
5   chromedriver                        0x0000000106573464 chromedriver + 984164
6   chromedriver                        0x00000001065529dd chromedriver + 850397
7   chromedriver                        0x0000000106570a00 chromedriver + 973312
8   chromedriver                        0x0000000106552753 chromedriver + 849747
9   chromedriver                        0x0000000106521635 chromedriver + 648757
10  chromedriver                        0x0000000106521e5e chromedriver + 650846
11  chromedriver                        0x000000010690d000 chromedriver + 4759552
12  chromedriver                        0x0000000106911f18 chromedriver + 4779800
13  chromedriver                        0x00000001069125e5 chromedriver + 4781541
14  chromedriver                        0x00000001068efaa9 chromedriver + 4639401
15  chromedriver                        0x00000001069128d9 chromedriver + 4782297
16  chromedriver                        0x00000001068e1044 chromedriver + 4579396
17  chromedriver                        0x000000010692ea08 chromedriver + 4897288
18  chromedriver                        0x000000010692ec03 chromedriver + 4897795
19  chromedriver                        0x000000010693e1de chromedriver + 4960734
20  libsystem_pthread.dylib             0x00007ff80da4c1d3 _pthread_start + 125
21  libsystem_pthread.dylib             0x00007ff80da47bd3 thread_start + 15


In [9]:
len(all_courses)

311

In [10]:
# Save data to CSV
with open('language_learning&Math_coursera.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=['topic', 'title', 'description', 'rating', 'reviews', 'duration', 'what_you_will_learn', 'skills_you_will_gain', 'url'])
    writer.writeheader()
    for course in all_courses:
        course['what_you_will_learn'] = ' '.join(course['what_you_will_learn'])
        course['skills_you_will_gain'] = ' '.join(course['skills_you_will_gain'])
        writer.writerow(course)
    
print(f"Scraped {len(all_courses)} courses")

Scraped 311 courses


In [11]:
import pandas as pd

df = pd.read_csv('language_learning&Math_coursera.csv')
df

Unnamed: 0,topic,title,description,rating,reviews,duration,what_you_will_learn,skills_you_will_gain,url
0,Language Learning,Arizona State University TESOL Professional Ce...,Launch Your Career as an English Teacher. Mast...,4.9,,Duration not found,Learners will master modern communicative tec...,English Language English language teaching EFL...,https://www.coursera.org/professional-certific...
1,Language Learning,"TESOL Certificate, Part 1: Teach English Now! ...","TESOL Certificate, Part 1: Teach English Now!....",4.9,,Duration not found,Understand basic modes of language learning De...,Education Online Learning,https://www.coursera.org/specializations/tesol
2,Language Learning,Improve Your English Communication Skills Spec...,Improve Your English Communication Skills. Wri...,4.8,,Duration not found,,Professional Email Writing Writing Presentation,https://www.coursera.org/specializations/impro...
3,Language Learning,"TESOL Certificate, Part 2: Teach English Now! ...",TESOL Certificate Part II: Teach English Now!....,4.9,,Duration not found,,Teaching English Language Lesson Plan Speech,https://www.coursera.org/specializations/tesol...
4,Language Learning,English for Career Development,Description not found,4.8,,Duration not found,,Communication English Language Career Developm...,https://www.coursera.org/learn/careerdevelopment
...,...,...,...,...,...,...,...,...,...
306,Math and Logic,Traitement d'images : analyse fréquentielle et...,Description not found,,,Recommended experienceCloseRecommended experie...,Transformée de Fourier en 2D Principes de la d...,Image Processing compression jpeg transformée ...,https://www.coursera.org/learn/moocimage-fouri...
307,Math and Logic,Differential Equations Part III Systems of Equ...,Description not found,,,Recommended experienceCloseRecommended experie...,,,https://www.coursera.org/learn/differential-eq...
308,Math and Logic,"Doğrusal Cebir II: Kare Matrisler, Hesaplama Y...",Description not found,4.5,,Duration not found,,,https://www.coursera.org/learn/linearalgebra2
309,Math and Logic,Çok değişkenli Fonksiyon II: Uygulamalar / Mul...,Description not found,4.4,,Duration not found,,,https://www.coursera.org/learn/calculus-difera...
