# Testing

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse

url = "https://www.open.edu/openlearn/science-maths-technology"
headers = {"User-Agent": "Mozilla/5.0"}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

course_links = set()

for a in soup.find_all("a", href=True):
    href = a["href"]

    # Normalize full URL
    if href.startswith("/openlearn/"):
        full_url = "https://www.open.edu" + href
    elif href.startswith("https://www.open.edu/openlearn/"):
        full_url = href
    else:
        continue

    # Filter out non-course links
    if any(x in full_url for x in ["about", "search", "help", "contact", "tag"]):
        continue

    # Must contain 'content-section'
    if "content-section" in full_url:
        course_links.add(full_url.split("?")[0])  # remove query params

print("Courses found:", len(course_links))
list(course_links)[:5]

Courses found: 4


['https://www.open.edu/openlearn/science-maths-technology/mathematics-statistics/numbers-units-and-arithmetic/content-section-0',
 'https://www.open.edu/openlearn/science-maths-technology/practising-systems-thinking-practice-stip/content-section-overview',
 'https://www.open.edu/openlearn/science-maths-technology/information-security/content-section-0',
 'https://www.open.edu/openlearn/science-maths-technology/engineering-technology/groups-and-teamwork/content-section-0']

In [None]:
def scrape_openlearn_course(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")

    data = {
        "course_url": url,
        "platform": "OpenLearn",
        "language": "English",
        "course_title": None,
        "duration": None,
        "difficulty_level": None,
        "rating_out_of_5": None
    }

    # Course title
    title = soup.find("h1")
    data["course_title"] = title.text.strip() if title else None

    # Duration & Level
    info_blocks = soup.find_all("div", class_="course-info")

    for block in info_blocks:
        icon = block.find("span", class_="icon")
        value = block.find("h4")

        if not icon or not value:
            continue

        icon_classes = " ".join(icon.get("class", [])).lower()
        text = value.text.strip()

        if "icon-clock" in icon_classes:
            data["duration"] = text

        elif "icon-course_icon_level" in icon_classes:
            data["difficulty_level"] = text

    # ⭐ Rating
    rating_block = soup.find("span", class_="average-value")
    if rating_block:
        rating_value = rating_block.find("strong")
        if rating_value:
            data["rating_out_of_5"] = float(rating_value.text.strip())

    return data

In [None]:
sample = scrape_openlearn_course(list(course_links)[0])
sample

{'course_url': 'https://www.open.edu/openlearn/science-maths-technology/mathematics-statistics/numbers-units-and-arithmetic/content-section-0',
 'platform': 'OpenLearn',
 'language': 'English',
 'course_title': 'Numbers, units and arithmetic',
 'duration': '5 hours study',
 'difficulty_level': 'Level 1: Introductory',
 'rating_out_of_5': 4.5}

# 1.OpenLearn Scraping

In [None]:
!pip install requests beautifulsoup4 pandas tqdm



In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import time

In [None]:
# Define headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}

In [None]:
# Collect course-links

BASE_URL = "https://www.open.edu"
CATEGORY_URL = "https://www.open.edu/openlearn/science-maths-technology"

def get_course_links(category_url):
    r = requests.get(category_url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")

    links = set()

    for a in soup.find_all("a", href=True):
        href = a["href"]

        # OpenLearn course pattern
        if "/openlearn/" in href and "content-section" in href:
            if href.startswith("/"):
                href = BASE_URL + href
            links.add(href.split("?")[0])  # remove tracking params

    return list(links)

In [None]:
course_links = get_course_links(CATEGORY_URL)
len(course_links), course_links[:5]

(4,
 ['https://www.open.edu/openlearn/science-maths-technology/understanding-autism/content-section-overview',
  'https://www.open.edu/openlearn/science-maths-technology/information-security/content-section-0',
  'https://www.open.edu/openlearn/science-maths-technology/practising-systems-thinking-practice-stip/content-section-overview',
  'https://www.open.edu/openlearn/science-maths-technology/moons-our-solar-system/content-section-overview'])

In [None]:
# Course Detail Scraper

def scrape_openlearn_course(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")

    data = {
        "course_title": None,
        "course_url": url,
        "platform": "OpenLearn",
        "language": "English",
        "duration": None,
        "difficulty_level": None,
        "rating_out_of_5": None
    }

    # Title
    title = soup.find("h1")
    if title:
        data["course_title"] = title.text.strip()

    # Duration & Level
    info_blocks = soup.find_all("div", class_="course-info")

    for block in info_blocks:
        icon = block.find("span", class_="icon")
        value = block.find("h4")

        if not icon or not value:
            continue

        icon_classes = " ".join(icon.get("class", [])).lower()
        text = value.text.strip()

        if "icon-clock" in icon_classes:
            data["duration"] = text

        elif "icon-course_icon_level" in icon_classes:
            data["difficulty_level"] = text

    # Rating
    rating_block = soup.find("span", class_="average-value")
    if rating_block:
        strong = rating_block.find("strong")
        if strong:
            data["rating_out_of_5"] = float(strong.text.strip())

    return data


In [None]:
# Scale Scraping
courses_data = []

for url in tqdm(course_links):
    try:
        course = scrape_openlearn_course(url)
        courses_data.append(course)
        time.sleep(1)  # polite scraping
    except Exception as e:
        print("Error:", url, e)


100%|██████████| 4/4 [00:10<00:00,  2.63s/it]


In [None]:
# Create Dataframe
df= pd.DataFrame(courses_data)
df.head()

Unnamed: 0,course_title,course_url,platform,language,duration,difficulty_level,rating_out_of_5
0,Understanding autism,https://www.open.edu/openlearn/science-maths-t...,OpenLearn,English,24 hours study,Level 1: Introductory,4.4
1,Information security,https://www.open.edu/openlearn/science-maths-t...,OpenLearn,English,10 hours study,Level 3: Advanced,4.7
2,Practising systems thinking in practice (STiP),https://www.open.edu/openlearn/science-maths-t...,OpenLearn,English,10 hours study,Level 3: Advanced,0.0
3,Moons of our Solar System,https://www.open.edu/openlearn/science-maths-t...,OpenLearn,English,24 hours study,Level 1: Introductory,4.7


In [None]:
# Category scraper

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}

BASE_URL = "https://www.open.edu"
HOME_URL = "https://www.open.edu/openlearn/"

def get_openlearn_categories():
    r = requests.get(HOME_URL, headers=HEADERS)
    soup = BeautifulSoup(r.text, "html.parser")

    categories = {}

    container = soup.find("div", class_="subjects-bottom")
    links = container.find_all("a", class_="subject-item-ext")

    for link in links:
        name = link.text.strip()
        url = BASE_URL + link["href"]
        categories[name] = url

    return categories

In [None]:
categories = get_openlearn_categories()
categories

{'Money & Business': 'https://www.open.edu/openlearn/money-management',
 'Education & Development': 'https://www.open.edu/openlearn/education',
 'Health, Sports & Psychology': 'https://www.open.edu/openlearn/body-mind',
 'History & The Arts': 'https://www.open.edu/openlearn/history-the-arts',
 'Languages': 'https://www.open.edu/openlearn/languages',
 'Nature & Environment': 'https://www.open.edu/openlearn/nature-environment',
 'Science, Maths & Technology': 'https://www.open.edu/openlearn/science-maths-technology',
 'Society, Politics & Law': 'https://www.open.edu/openlearn/society',
 'Digital & Computing': 'https://www.open.edu/openlearn/digital'}

In [None]:
# Extract Course URLs from each category

def get_course_links(category_url):
    r = requests.get(category_url, headers=HEADERS)
    soup = BeautifulSoup(r.text, "html.parser")

    course_links = set()

    for a in soup.find_all("a", href=True):
        href = a["href"]

        if "/openlearn/" in href and "content-section" in href:
            if href.startswith("/"):
                href = BASE_URL + href

            course_links.add(href.split("?")[0])

    return list(course_links)

In [None]:
# Collect all Course URLs
all_course_links = set()

for name, url in categories.items():
    links = get_course_links(url)
    print(f"{name}: {len(links)} courses")
    all_course_links.update(links)

len(all_course_links)

Money & Business: 4 courses
Education & Development: 4 courses
Health, Sports & Psychology: 4 courses
History & The Arts: 4 courses
Languages: 4 courses
Nature & Environment: 3 courses
Science, Maths & Technology: 4 courses
Society, Politics & Law: 4 courses
Digital & Computing: 4 courses


35

In [None]:
# Course Detail Scraper

def scrape_openlearn_course(url):
    r = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(r.text, "html.parser")

    data = {
        "course_title": None,
        "course_url": url,
        "platform": "OpenLearn",
        "language": "English",
        "duration": None,
        "difficulty_level": None,
        "rating_out_of_5": None
    }

    # Title
    h1 = soup.find("h1")
    if h1:
        data["course_title"] = h1.text.strip()

    # Duration & Level
    for info in soup.find_all("div", class_="course-info"):
        icon = info.find("span", class_="icon")
        value = info.find("h4")

        if not icon or not value:
            continue

        classes = " ".join(icon.get("class", [])).lower()
        text = value.text.strip()

        if "icon-clock" in classes:
            data["duration"] = text
        elif "icon-course_icon_level" in classes:
            data["difficulty_level"] = text

    # Rating
    rating = soup.find("span", class_="average-value")
    if rating:
        strong = rating.find("strong")
        if strong:
            data["rating_out_of_5"] = float(strong.text.strip())

    return data


In [None]:
# Scale Scraping

courses_data = []

for url in tqdm(all_course_links):
    try:
        courses_data.append(scrape_openlearn_course(url))
        time.sleep(1)
    except Exception as e:
        print("Failed:", url)


100%|██████████| 35/35 [01:27<00:00,  2.51s/it]


In [None]:
# Build dataset
df = pd.DataFrame(courses_data)
df.drop_duplicates(subset="course_url", inplace=True)
df.reset_index(drop=True, inplace=True)

df.shape

(35, 7)

In [None]:
df.head()

Unnamed: 0,course_title,course_url,platform,language,duration,difficulty_level,rating_out_of_5
0,Everyday English 1,https://www.open.edu/openlearn/languages/every...,OpenLearn,English,48 hours study,Level 1: Introductory,4.4
1,Understanding autism,https://www.open.edu/openlearn/science-maths-t...,OpenLearn,English,24 hours study,Level 1: Introductory,4.4
2,Getting started on ancient Greek,https://www.open.edu/openlearn/history-the-art...,OpenLearn,English,16 hours study,Level 1: Introductory,4.6
3,Diagramming for development 2: exploring inter...,https://www.open.edu/openlearn/digital-computi...,OpenLearn,English,4 hours study,Level 3: Advanced,4.8
4,Working in teams,https://www.open.edu/openlearn/money-business/...,OpenLearn,English,24 hours study,Level 1: Introductory,4.8


In [None]:
import pandas as pd
df.to_csv('openlearn_data.csv')

# 2.Saylor Scraping

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import time

In [None]:
import requests
from bs4 import BeautifulSoup

BASE_URL = "https://learn.saylor.org"

def get_categories():
    url = f"{BASE_URL}/course/index.php"
    response = requests.get(url, timeout=20)
    soup = BeautifulSoup(response.text, "html.parser")

    categories = []

    for a in soup.find_all("a", class_="category-link"):
        name = a.get_text(strip=True)
        link = a.get("href")

        if link and "categoryid=" in link:
            categories.append({
                "category_name": name,
                "category_url": link
            })

    return categories

In [None]:
categories = get_categories()
len(categories), categories[:5]

(81,
 [{'category_name': 'All categories',
   'category_url': 'https://learn.saylor.org/course/index.php?categoryid=all'},
  {'category_name': 'Arts and Humanities',
   'category_url': 'https://learn.saylor.org/course/index.php?categoryid=82'},
  {'category_name': 'Art History',
   'category_url': 'https://learn.saylor.org/course/index.php?categoryid=21'},
  {'category_name': 'Communication',
   'category_url': 'https://learn.saylor.org/course/index.php?categoryid=8'},
  {'category_name': 'English',
   'category_url': 'https://learn.saylor.org/course/index.php?categoryid=11'}])

In [None]:
# Get Courses from Category
from urllib.parse import urljoin

def get_courses_from_category(category_url):
    response = requests.get(category_url, timeout=20)
    soup = BeautifulSoup(response.text, "html.parser")

    courses = []

    for a in soup.find_all("a", href=True):
        href = a["href"]

        if "course/view.php?id=" in href:
            full_url = urljoin("https://learn.saylor.org", href)
            title = a.get_text(strip=True)

            courses.append({
                "course_title": title,
                "course_url": full_url
            })

    # remove duplicates
    courses = list({c["course_url"]: c for c in courses}.values())
    return courses


In [None]:
test_url = "https://learn.saylor.org/course/index.php?categoryid=82"
courses = get_courses_from_category(test_url)

print("Courses found:", len(courses))
courses[:5]

Courses found: 0


[]

In [None]:
import requests

url = "https://learn.saylor.org/course/index.php?categoryid=82"

headers = {
    "User-Agent": "Mozilla/5.0"
}

response = requests.get(url, headers=headers, timeout=20)

print("Status Code:", response.status_code)
print("Content length:", len(response.text))
print(response.text[:3000])

Status Code: 200
Content length: 144632
<!DOCTYPE html>

<html  dir="ltr" lang="en" xml:lang="en">
<head>

    <title>All courses | Saylor Academy</title>
    <meta name="description" content="">

    <link rel="shortcut icon" href="//learn.saylor.org/pluginfile.php/1/theme_remui/faviconurl/1765242345/favicon-32x32.png" />
    <meta name="apple-itunes-app" content="app-id=1481507148, app-argument=https://learn.saylor.org/course/index.php?categoryid=82"/><link rel="manifest" href="https://learn.saylor.org/admin/tool/mobile/mobile.webmanifest.php" /><!-- Start Matomo Code -->
    <noscript>
        <p>
            <img src="//catalyst-analytics.ca/piwik.php?idsite=41" style="border:0;" alt="" />
        </p>
    </noscript>

<script>
    var _paq = _paq || [];
    _paq.push(['setDocumentTitle', '']);

    
    _paq.push(['trackPageView']);
    _paq.push(['enableLinkTracking']);
    _paq.push(['enableHeartBeatTimer', 30]);

    function embedTrackingCode() {
        var u='//catalyst-anal

In [None]:
print(response.text)

<!DOCTYPE html>

<html  dir="ltr" lang="en" xml:lang="en">
<head>

    <title>All courses | Saylor Academy</title>
    <meta name="description" content="">

    <link rel="shortcut icon" href="//learn.saylor.org/pluginfile.php/1/theme_remui/faviconurl/1765242345/favicon-32x32.png" />
    <meta name="apple-itunes-app" content="app-id=1481507148, app-argument=https://learn.saylor.org/course/index.php?categoryid=82"/><link rel="manifest" href="https://learn.saylor.org/admin/tool/mobile/mobile.webmanifest.php" /><!-- Start Matomo Code -->
    <noscript>
        <p>
            <img src="//catalyst-analytics.ca/piwik.php?idsite=41" style="border:0;" alt="" />
        </p>
    </noscript>

<script>
    var _paq = _paq || [];
    _paq.push(['setDocumentTitle', '']);

    
    _paq.push(['trackPageView']);
    _paq.push(['enableLinkTracking']);
    _paq.push(['enableHeartBeatTimer', 30]);

    function embedTrackingCode() {
        var u='//catalyst-analytics.ca/';
        var p='//catalyst-an

In [None]:
import requests
from bs4 import BeautifulSoup

BASE = "https://learn.saylor.org"

headers = {"User-Agent": "Mozilla/5.0"}

def get_all_category_links():
    url = f"{BASE}/course/index.php"
    r = requests.get(url, headers=headers, timeout=20)
    soup = BeautifulSoup(r.text, "html.parser")

    categories = []

    for a in soup.find_all("a", class_="category-link"):
        name = a.get_text(strip=True)
        link = a.get("href")

        if link and "categoryid=" in link:
            categories.append({
                "category_name": name,
                "category_url": link
            })

    # remove duplicates
    categories = list({c["category_url"]: c for c in categories}.values())
    return categories

In [None]:
categories = get_all_category_links()
len(categories), categories[:5]

(27,
 [{'category_name': 'All categories',
   'category_url': 'https://learn.saylor.org/course/index.php?categoryid=all'},
  {'category_name': 'Arts and Humanities',
   'category_url': 'https://learn.saylor.org/course/index.php?categoryid=82'},
  {'category_name': 'Art History',
   'category_url': 'https://learn.saylor.org/course/index.php?categoryid=21'},
  {'category_name': 'Communication',
   'category_url': 'https://learn.saylor.org/course/index.php?categoryid=8'},
  {'category_name': 'English',
   'category_url': 'https://learn.saylor.org/course/index.php?categoryid=11'}])

In [None]:
from urllib.parse import urljoin

def get_courses_from_category(category_url):
    r = requests.get(category_url, headers=headers, timeout=20)
    soup = BeautifulSoup(r.text, "html.parser")

    courses = []

    for a in soup.find_all("a", href=True):
        href = a["href"]

        if "course/view.php?id=" in href:
            courses.append({
                "course_title": a.get_text(strip=True),
                "course_url": urljoin(BASE, href)
            })

    # remove duplicates
    courses = list({c["course_url"]: c for c in courses}.values())
    return courses

In [None]:
test_subcat = "https://learn.saylor.org/course/index.php?categoryid=21"

courses = get_courses_from_category(test_subcat)
print("Courses found:", len(courses))
courses[:3]

Courses found: 1


[{'course_title': 'ARTH101: Art Appreciation',
  'course_url': 'https://learn.saylor.org/course/view.php?id=1248'}]

In [None]:
def scrape_course(course_url, category):
    r = requests.get(course_url, headers=headers, timeout=20)
    soup = BeautifulSoup(r.text, "html.parser")

    title = soup.find("h1").get_text(strip=True)

    duration = None
    level = None

    for li in soup.find_all("li"):
        txt = li.get_text(" ", strip=True)
        if "Time:" in txt:
            duration = txt.replace("Time:", "").strip()
        if "Level:" in txt:
            level = txt.replace("Level:", "").strip()

    return {
        "platform": "Saylor Academy",
        "category": category,
        "course_title": title,
        "course_url": course_url,
        "duration": duration,
        "difficulty_level": level,
        "language": "English"
    }

In [None]:
import time
import pandas as pd

records = []

categories = get_all_category_links()

for cat in categories:
    courses = get_courses_from_category(cat["category_url"])

    if len(courses) == 0:
        continue   # skip non-leaf categories

    for c in courses:
        try:
            data = scrape_course(c["course_url"], cat["category_name"])
            records.append(data)
            time.sleep(1)
        except Exception as e:
            print("Error:", c["course_url"])

In [None]:
df = pd.DataFrame(records)
df.shape

(161, 7)

In [None]:
df.head()

Unnamed: 0,platform,category,course_title,course_url,duration,difficulty_level,language
0,Saylor Academy,Art History,ARTH101: Art Appreciation,https://learn.saylor.org/course/view.php?id=1248,31 hours,,English
1,Saylor Academy,Communication,COMM001: Principles of Human Communication,https://learn.saylor.org/course/view.php?id=5,56 hours,,English
2,Saylor Academy,Communication,COMM411: Public Relations,https://learn.saylor.org/course/view.php?id=793,15 hours,,English
3,Saylor Academy,English,ENGL000: Pre-College English,https://learn.saylor.org/course/view.php?id=762,29 hours,,English
4,Saylor Academy,English,ENGL001: English Composition I,https://learn.saylor.org/course/view.php?id=43,32 hours,,English


In [None]:
df.to_csv('saylor_academy_data.csv')

# 3.OpenLearn Scraping - more categories

In [None]:
OPENLEARN_CATEGORIES = {
    "Money & Business": "https://www.open.edu/openlearn/money-management",
    "Education & Development": "https://www.open.edu/openlearn/education",
    "Health, Sports & Psychology": "https://www.open.edu/openlearn/body-mind",
    "History & The Arts": "https://www.open.edu/openlearn/history-the-arts",
    "Languages": "https://www.open.edu/openlearn/languages",
    "Nature & Environment": "https://www.open.edu/openlearn/nature-environment",
    # "Science, Maths & Technology": "https://www.open.edu/openlearn/science-maths-technology",
    "Society, Politics & Law": "https://www.open.edu/openlearn/society",
    "Digital & Computing": "https://www.open.edu/openlearn/digital"
}


In [None]:
# imports and headers
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

headers = {
    "User-Agent": "Mozilla/5.0"
}

In [None]:
# Extract courses from one category
def scrape_openlearn_category(category_name, category_url):
    print(f"Scraping: {category_name}")

    response = requests.get(category_url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")

    courses = []

    course_links = soup.find_all("a", class_="dotdot not-bibblio-content")

    print("Courses found:", len(course_links))

    for link in course_links:
        title = link.get_text(strip=True)
        url = link.get("href")

        if not url.startswith("http"):
            url = "https://www.open.edu" + url

        courses.append({
            "course_title": title,
            "course_url": url,
            "category": category_name,
            "platform": "OpenLearn",
            "language": "English"
        })

    return courses

In [None]:
# Loop through all categories

all_courses = []

for cat, url in OPENLEARN_CATEGORIES.items():
    data = scrape_openlearn_category(cat, url)
    all_courses.extend(data)
    time.sleep(2)   # polite delay

Scraping: Money & Business
Courses found: 9
Scraping: Education & Development
Courses found: 9
Scraping: Health, Sports & Psychology
Courses found: 9
Scraping: History & The Arts
Courses found: 8
Scraping: Languages
Courses found: 7
Scraping: Nature & Environment
Courses found: 8
Scraping: Society, Politics & Law
Courses found: 9
Scraping: Digital & Computing
Courses found: 8


In [None]:
# Convert to Dataframe
df_openlearn = pd.DataFrame(all_courses)

print(df_openlearn.shape)
df_openlearn.head()

(67, 5)


Unnamed: 0,course_title,course_url,category,platform,language
0,MSE’s Academy of Money,https://www.open.edu/openlearn/money-business/...,Money & Business,OpenLearn,English
1,Introduction to bookkeeping and accounting,https://www.open.edu/openlearn/money-business/...,Money & Business,OpenLearn,English
2,"Midlife MOT: wealth, work and wellbeing",https://www.open.edu/openlearn/midlife-mot-wea...,Money & Business,OpenLearn,English
3,Working in teams,https://www.open.edu/openlearn/money-business/...,Money & Business,OpenLearn,English
4,Managing my money for young adults: in partner...,https://www.open.edu/openlearn/money-business/...,Money & Business,OpenLearn,English


In [None]:
def scrape_openlearn_course_details(course_url):
    try:
        response = requests.get(course_url, headers=headers, timeout=15)
        soup = BeautifulSoup(response.text, "html.parser")

        # ---- Duration ----
        duration = None
        clock_icon = soup.find("span", class_="icon-clock")
        if clock_icon:
            h4 = clock_icon.find_next("h4")
            if h4:
                duration = h4.get_text(strip=True)

        # ---- Difficulty Level ----
        difficulty_level = None
        level_div = soup.find("div", class_="course-info")
        if level_div:
            h4 = level_div.find("h4")
            if h4 and "Level" in h4.text:
                difficulty_level = h4.get_text(strip=True)

        # ---- Rating ----
        rating = None
        rating_span = soup.find("span", class_="average-value")
        if rating_span:
            strong = rating_span.find("strong")
            if strong:
                rating = strong.get_text(strip=True)

        return duration, difficulty_level, rating

    except Exception as e:
        return None, None, None

In [None]:
enriched_rows = []

for idx, row in df_openlearn.iterrows():
    print(f"Enriching {idx+1}/{len(df_openlearn)}")

    duration, level, rating = scrape_openlearn_course_details(row["course_url"])

    enriched_rows.append({
        **row,
        "duration": duration,
        "difficulty_level": level,
        "rating": rating
    })

    time.sleep(2)   # VERY IMPORTANT

Enriching 1/67
Enriching 2/67
Enriching 3/67
Enriching 4/67
Enriching 5/67
Enriching 6/67
Enriching 7/67
Enriching 8/67
Enriching 9/67
Enriching 10/67
Enriching 11/67
Enriching 12/67
Enriching 13/67
Enriching 14/67
Enriching 15/67
Enriching 16/67
Enriching 17/67
Enriching 18/67
Enriching 19/67
Enriching 20/67
Enriching 21/67
Enriching 22/67
Enriching 23/67
Enriching 24/67
Enriching 25/67
Enriching 26/67
Enriching 27/67
Enriching 28/67
Enriching 29/67
Enriching 30/67
Enriching 31/67
Enriching 32/67
Enriching 33/67
Enriching 34/67
Enriching 35/67
Enriching 36/67
Enriching 37/67
Enriching 38/67
Enriching 39/67
Enriching 40/67
Enriching 41/67
Enriching 42/67
Enriching 43/67
Enriching 44/67
Enriching 45/67
Enriching 46/67
Enriching 47/67
Enriching 48/67
Enriching 49/67
Enriching 50/67
Enriching 51/67
Enriching 52/67
Enriching 53/67
Enriching 54/67
Enriching 55/67
Enriching 56/67
Enriching 57/67
Enriching 58/67
Enriching 59/67
Enriching 60/67
Enriching 61/67
Enriching 62/67
Enriching 63/67
E

In [None]:
df_openlearn_enriched = pd.DataFrame(enriched_rows)

df_openlearn_enriched.head()

Unnamed: 0,course_title,course_url,category,platform,language,duration,difficulty_level,rating
0,MSE’s Academy of Money,https://www.open.edu/openlearn/money-business/...,Money & Business,OpenLearn,English,12 hours study,,4.2
1,Introduction to bookkeeping and accounting,https://www.open.edu/openlearn/money-business/...,Money & Business,OpenLearn,English,8 hours study,,4.2
2,"Midlife MOT: wealth, work and wellbeing",https://www.open.edu/openlearn/midlife-mot-wea...,Money & Business,OpenLearn,English,4 hours study,,4.4
3,Working in teams,https://www.open.edu/openlearn/money-business/...,Money & Business,OpenLearn,English,24 hours study,,4.8
4,Managing my money for young adults: in partner...,https://www.open.edu/openlearn/money-business/...,Money & Business,OpenLearn,English,,,


In [None]:
df_openlearn_enriched.to_csv('openlearn_more_cate_data.csv')

In [None]:
df_openlearn_enriched.isna().mean() * 100

Unnamed: 0,0
course_title,0.0
course_url,0.0
category,0.0
platform,0.0
language,0.0
duration,53.731343
difficulty_level,100.0
rating,53.731343


# 4.FutureLearn Scraping

In [None]:
# Imports and Headers
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import time

HEADERS = {
    "User-Agent": "Mozilla/5.0"
}

BASE_URL = "https://www.futurelearn.com"
CATALOG_URL = "https://www.futurelearn.com/courses"

In [None]:
# Fetch course cards
def scrape_futurelearn_listing(url):
    response = requests.get(url, headers=HEADERS, timeout=20)
    soup = BeautifulSoup(response.text, "html.parser")

    courses = []

    for a in soup.find_all("a", href=True):
        href = a["href"]

        if href.startswith("/courses/") and a.find("h3"):
            title = a.find("h3").get_text(strip=True)
            full_url = urljoin(BASE_URL, href)

            courses.append({
                "course_title": title,
                "course_url": full_url,
                "platform": "FutureLearn",
                "language": "English"
            })

    return courses


In [None]:
def scrape_futurelearn_details(course_url):
    try:
        r = requests.get(course_url, headers=HEADERS, timeout=20)
        s = BeautifulSoup(r.text, "html.parser")

        # ---- Rating ----
        rating = None
        for div in s.find_all("div"):
            text = div.get_text(strip=True)
            if "reviews" in text.lower() and any(ch.isdigit() for ch in text):
                rating = text.split()[0]
                break

        # ---- Duration ----
        duration = None
        for p in s.find_all("p"):
            if "week" in p.text.lower():
                duration = p.get_text(strip=True)
                break

        return rating, duration

    except:
        return None, None

In [None]:
all_courses = scrape_futurelearn_listing(CATALOG_URL)

print("Courses found:", len(all_courses))

Courses found: 0


In [None]:
import requests

url = "https://www.futurelearn.com/courses/perioperative-medicine"

headers = {
    "User-Agent": "Mozilla/5.0"
}

response = requests.get(url, headers=headers)

print("Status:", response.status_code)
print("Length:", len(response.text))

Status: 403
Length: 7077


# 5.OpenUniversity Australia Scraping

In [None]:
import requests
from bs4 import BeautifulSoup

url = "https://www.open.edu.au/courses"
headers = {"User-Agent": "Mozilla/5.0"}

response = requests.get(url, headers=headers)
print("Status:", response.status_code)
print("Length:", len(response.text))

Status: 200
Length: 406573


In [None]:
soup = BeautifulSoup(response.text, "html.parser")

course_links = [
    "https://www.open.edu.au" + a["href"]
    for a in soup.find_all("a", href=True)
    if a["href"].startswith("/courses/")
]

course_links = list(set(course_links))
print("Courses found:", len(course_links))
course_links[:5]

Courses found: 18


['https://www.open.edu.au/courses/degrees?page=5',
 'https://www.open.edu.au/courses/degrees/law-justice',
 'https://www.open.edu.au/courses/degrees/media-communications',
 'https://www.open.edu.au/courses/degrees?page=4',
 'https://www.open.edu.au/courses/degrees/psychology-mental-health']

In [None]:
test_url = course_links[0]
r = requests.get(test_url, headers=headers)
soup = BeautifulSoup(r.text, "html.parser")

print("Status:", r.status_code)
print("Title:", soup.find("h1").text.strip())

Status: 200
Title: Search thousands of courses and degrees by leading Australian universities


In [None]:
def scrape_course(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")

    data = {
        "course_url": url,
        "platform": "Open Universities Australia",
        "language": "English"
    }

    # Title
    title = soup.find("h1")
    data["course_title"] = title.text.strip() if title else None

    # Category
    category = soup.find("a", {"data-testid": "breadcrumb-link"})
    data["category"] = category.text.strip() if category else None

    # Duration
    duration = soup.find("span", string=lambda x: x and "year" in x.lower())
    data["duration"] = duration.text.strip() if duration else None

    # Difficulty (derived)
    level = soup.find(string=lambda x: x and "undergraduate" in x.lower())
    if level:
        data["difficulty_level"] = "Beginner"
    else:
        data["difficulty_level"] = "Intermediate"

    # Rating (often absent)
    rating = soup.find("span", {"data-testid": "rating-value"})
    data["rating"] = rating.text.strip() if rating else None

    # Price
    price = soup.find(string=lambda x: x and "$" in x)
    data["price"] = price.strip() if price else "Paid"

    return data

In [None]:
import time
import pandas as pd

records = []

for i, link in enumerate(course_links[:100]):  # start with 100
    try:
        records.append(scrape_course(link))
        time.sleep(1.2)  # respectful delay
    except Exception as e:
        print("Failed:", link, e)

df_oua = pd.DataFrame(records)
df_oua.head()

Unnamed: 0,course_url,platform,language,course_title,category,duration,difficulty_level,rating,price
0,https://www.open.edu.au/courses/degrees?page=5,Open Universities Australia,English,Search thousands of courses and degrees by lea...,,,Beginner,,"{""props"":{""pageProps"":{""canonicalUrl"":""https:/..."
1,https://www.open.edu.au/courses/degrees/law-ju...,Open Universities Australia,English,Search thousands of courses and degrees by lea...,,"Add Bachelor of Laws (4 Years), University of ...",Beginner,,"{""props"":{""pageProps"":{""canonicalUrl"":""https:/..."
2,https://www.open.edu.au/courses/degrees/media-...,Open Universities Australia,English,Search thousands of courses and degrees by lea...,,,Beginner,,"{""props"":{""pageProps"":{""canonicalUrl"":""https:/..."
3,https://www.open.edu.au/courses/degrees?page=4,Open Universities Australia,English,Search thousands of courses and degrees by lea...,,,Beginner,,"{""props"":{""pageProps"":{""canonicalUrl"":""https:/..."
4,https://www.open.edu.au/courses/degrees/psycho...,Open Universities Australia,English,Search thousands of courses and degrees by lea...,,,Beginner,,"{""props"":{""pageProps"":{""canonicalUrl"":""https:/..."


In [None]:
df_oua.drop_duplicates(subset="course_url", inplace=True)
df_oua.isnull().sum()

Unnamed: 0,0
course_url,0
platform,0
language,0
course_title,0
category,18
duration,16
difficulty_level,0
rating,18
price,0


In [None]:
df_oua.sample(2)

Unnamed: 0,course_url,platform,language,course_title,category,duration,difficulty_level,rating,price
10,https://www.open.edu.au/courses/degrees?page=3,Open Universities Australia,English,Search thousands of courses and degrees by lea...,,,Beginner,,"{""props"":{""pageProps"":{""canonicalUrl"":""https:/..."
14,https://www.open.edu.au/courses/degrees?page=2,Open Universities Australia,English,Search thousands of courses and degrees by lea...,,,Beginner,,"{""props"":{""pageProps"":{""canonicalUrl"":""https:/..."


In [None]:
df_oua.to_csv('openuniv_aus_data.csv')

In [None]:
df_oua.shape

(18, 9)

In [None]:
disciplines = [
    "business",
    "health",
    "education",
    "it",
    "engineering",
    "law",
    "arts",
    "science",
    "psychology",
    "design",
    "communication"
]

In [None]:
import requests
from bs4 import BeautifulSoup

headers = {"User-Agent": "Mozilla/5.0"}

def get_links_from_discipline(discipline):
    url = f"https://www.open.edu.au/courses/{discipline}"
    r = requests.get(url, headers=headers)

    if r.status_code != 200:
        return []

    soup = BeautifulSoup(r.text, "html.parser")

    links = [
        "https://www.open.edu.au" + a["href"]
        for a in soup.find_all("a", href=True)
        if a["href"].startswith("/courses/")
    ]

    return list(set(links))

In [None]:
all_links = set()

for d in disciplines:
    links = get_links_from_discipline(d)
    print(f"{d}: {len(links)} courses")
    all_links.update(links)

print("Total unique course links:", len(all_links))

business: 28 courses
health: 18 courses
education: 12 courses
it: 14 courses
engineering: 5 courses
law: 16 courses
arts: 18 courses
science: 12 courses
psychology: 7 courses
design: 18 courses
communication: 18 courses
Total unique course links: 127


In [None]:
import time
import pandas as pd

records = []

for link in all_links:
    try:
        records.append(scrape_course(link))
        time.sleep(1.2)
    except Exception:
        pass

df_oua = pd.DataFrame(records)
df_oua.drop_duplicates(subset="course_url", inplace=True)

df_oua.shape

(127, 9)

In [None]:
df_oua.isnull().sum()

Unnamed: 0,0
course_url,0
platform,0
language,0
course_title,0
category,127
duration,120
difficulty_level,0
rating,127
price,0


In [None]:
df_oua.sample(3)

Unnamed: 0,course_url,platform,language,course_title,category,duration,difficulty_level,rating,price
91,https://www.open.edu.au/courses/degrees/data-s...,Open Universities Australia,English,Search thousands of courses and degrees by lea...,,,Beginner,,"{""props"":{""pageProps"":{""canonicalUrl"":""https:/..."
94,https://www.open.edu.au/courses/health?page=4,Open Universities Australia,English,Search thousands of courses and degrees by lea...,,,Beginner,,"{""props"":{""pageProps"":{""canonicalUrl"":""https:/..."
90,https://www.open.edu.au/courses/degrees/law-ju...,Open Universities Australia,English,Search thousands of courses and degrees by lea...,,,Beginner,,"{""props"":{""pageProps"":{""canonicalUrl"":""https:/..."


In [None]:
df_oua= df_oua[['course_url', 'platform', 'language', 'course_title','category', 'duration','difficulty_level','rating']]
df_oua.columns

Index(['course_url', 'platform', 'language', 'course_title', 'category',
       'duration', 'difficulty_level', 'rating'],
      dtype='object')

In [None]:
df_oua.to_csv('openuniv_aus_data_new.csv')

# 6.EduOpen India

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [None]:
base_url = "https://learn.eduopen.org/eduopenv2/catalog.php?lang=en"
headers = {"User-Agent": "Mozilla/5.0"}

In [None]:
response = requests.get(base_url, headers=headers)
print("Status Code:", response.status_code)
print("Content length:", len(response.text))
soup = BeautifulSoup(response.text, "html.parser")

Status Code: 200
Content length: 53547


In [None]:
category_links = []
for a in soup.find_all("a", href=True):
    if "catalog.php?catid=" in a["href"]:  # or "course_details" for courses directly
        category_links.append("https://learn.eduopen.org/eduopenv2/" + a["href"].split("/")[-1])
category_links = list(set(category_links))
print("Total categories found:", len(category_links))

Total categories found: 0


#  7.ClassCentral

In [None]:
import requests

url = "https://www.udemy.com/api-2.0/courses/"
params = {
    "page": 1,
    "page_size": 20,
    "language": "en",
    "ordering": "highest-rated"
}

headers = {
    "User-Agent": "Mozilla/5.0",
    "Accept": "application/json",
    "Referer": "https://www.udemy.com/"
}

response = requests.get(url, params=params, headers=headers)

print("Status:", response.status_code)
print("Length:", len(response.text))
# print(response.json().keys())

Status: 403
Length: 7282


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.geeksforgeeks.org/courses/"
headers = {"User-Agent": "Mozilla/5.0"}

response = requests.get(url, headers=headers)
print("Status:", response.status_code)
print("HTML length:", len(response.text))

Status: 200
HTML length: 226991


In [None]:
soup = BeautifulSoup(response.text, "html.parser")

In [None]:
titles = soup.find_all("h4", class_="course_heading")
print("Titles found:", len(titles))
print(titles[:3])

Titles found: 20
[<h4 class="ui left aligned header courseListingPage_myAuto__i6GdI sofia-pro course_heading">System Design: Low-Level to High-Level - Self Paced</h4>, <h4 class="ui left aligned header courseListingPage_myAuto__i6GdI sofia-pro course_heading">Tech Interview 101 - DSA and System Design</h4>, <h4 class="ui left aligned header courseListingPage_myAuto__i6GdI sofia-pro course_heading">Java Backend Development with AI - Live</h4>]


In [None]:
courses = []

for h4 in titles:
    title = h4.get_text(strip=True)

    parent_link = h4.find_parent("a")
    course_url = parent_link["href"] if parent_link else None

    if course_url and course_url.startswith("/"):
        course_url = "https://www.geeksforgeeks.org" + course_url

    courses.append({
        "platform": "GeeksforGeeks",
        "course_title": title,
        "course_url": course_url,
        "rating_out_of_5": None,
        "duration": None,
        "difficulty_level": None,
        "language": "English"
    })

In [None]:
len(courses)

20

In [None]:
import requests
from bs4 import BeautifulSoup
import json
import re

url = "https://www.geeksforgeeks.org/courses/"
headers = {
    "User-Agent": "Mozilla/5.0"
}

response = requests.get(url, headers=headers)
print("Status:", response.status_code)
print("HTML length:", len(response.text))

Status: 200
HTML length: 226991


In [None]:
soup = BeautifulSoup(response.text, "html.parser")

script_tag = soup.find("script", id="__NEXT_DATA__")
print("Script found:", script_tag is not None)


Script found: True


In [None]:
data = json.loads(script_tag.string)

In [None]:
initial_state = data["props"]["pageProps"]["initialState"]
print(initial_state.keys())

dict_keys(['user', 'myCoursesApi', 'usersApi', 'jobsApi', 'eventsApi', 'listingPageApi', 'explorePageApi', 'coursesDashboardAPI', 'contestApi', 'registrationCntApi', 'recurringEventApi', 'problemApi', 'batchApi', 'trackApi', 'ideApi', 'generalApi', 'allFeedbacksApi', 'dashboardBaseApi', 'addEditCourseFeaturesApi', 'filtersApi', 'paymentTransactionHistoryApi', 'classAllApi', 'sidebarApi', 'addEditJobsApi', 'addEditProblemApi', 'hackathonApi', 'hackathonNewApi', 'problemData', 'allUserSprint', 'generalData', 'batchTrackData', 'rewardAPI', 'contestData', 'couponData', 'problemOfTheDayAPI', 'addEditCourseData', 'jobPortalData', 'jobPortalApi', 'collegeAdminData', 'geekOlympics', 'summerTrainingEvent', 'dsaToDevelopmentApi', 'dsaToDevelopmentData', 'organizationDashboardApi', 'organizationDashboardData', 'userInfo', 'myCoursesData', 'aggregatedReportData', 'leaderboardApi', 'premiumApi', 'termsOfServiceApi'])


In [None]:
def find_keys(obj, keyword, found=set()):
    if isinstance(obj, dict):
        for k, v in obj.items():
            if keyword.lower() in k.lower():
                found.add(k)
            find_keys(v, keyword, found)
    elif isinstance(obj, list):
        for i in obj:
            find_keys(i, keyword, found)
    return found

print(find_keys(initial_state, "course"))

{'course_list', 'course_highlights', 'course_for_kids', 'addEditCourseFeaturesApi', 'course_fee_type', 'addEditCourseData', 'course_short_name', 'electiveCourses', 'getPromotionalEventsApi({"cdnCountryCode":"US","pageSource":"all_courses"})', 'course_email_content', 'course_tier', 'myCoursesData', 'courseIntro', 'courseContent', 'my-courses', 'courseDefaultLang', 'course_type', 'course_default_lang', 'courseBanners', 'courseSlug', 'course_redirection_slug', 'courseFaqs', 'view_all_course_page', 'courseBenefits', 'courseFeeType', 'course_name', 'est_course_duration_hrs', 'course_app_coupon', 'course-management', 'linked_offline_course', 'course_duration', 'view_all_course_api', 'course_slug', 'course_image_text', 'course_publish_date', 'course_id', 'coursesDashboardAPI', 'courseJourneyPhases', 'upgradableCourses', 'myCoursesApi', 'course_expiry_days', 'courses'}


In [None]:
initial_state["course"]["courses"]

KeyError: 'course'

In [None]:
initial_state

{'user': {'userVal': '',
  'userError': False,
  'responsemsg': '',
  'responsetype': 'error',
  'recaptchaError': False,
  'recaptchaValue': '',
  'loading': False,
  'recpatchSiteKey': '6LexF0sUAAAAADiQjz9BMiSrqplrItl-tWYDSfWa',
  'userData': None,
  'rememberMe': True,
  'theme': 'light',
  'gfgLogo': 'https://media.geeksforgeeks.org/gfg-gg-logo.svg'},
 'myCoursesApi': {'queries': {},
  'mutations': {},
  'provided': {},
  'subscriptions': {},
  'config': {'online': True,
   'focused': True,
   'middlewareRegistered': True,
   'refetchOnFocus': False,
   'refetchOnReconnect': False,
   'refetchOnMountOrArgChange': False,
   'keepUnusedDataFor': 60,
   'reducerPath': 'myCoursesApi'}},
 'usersApi': {'queries': {},
  'mutations': {},
  'provided': {},
  'subscriptions': {},
  'config': {'online': True,
   'focused': True,
   'middlewareRegistered': True,
   'refetchOnFocus': False,
   'refetchOnReconnect': False,
   'refetchOnMountOrArgChange': False,
   'keepUnusedDataFor': 60,
   're

In [None]:
categories = [
    {
        "category": "DSA / Placements",
        "url": "https://www.geeksforgeeks.org/courses/category/dsa-placements"
    },
    {
        "category": "Data Science / ML",
        "url": "https://www.geeksforgeeks.org/courses/category/data-science-machine-learning"
    },
    {
        "category": "Web Development",
        "url": "https://www.geeksforgeeks.org/courses/category/web-development"
    },
    {
        "category": "Programming Languages",
        "url": "https://www.geeksforgeeks.org/courses/category/programming-languages"
    },
    {
        "category": "Computer Science Core",
        "url": "https://www.geeksforgeeks.org/courses/category/computer-science"
    }
]

print("Total categories:", len(categories))

Total categories: 5


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {
    "User-Agent": "Mozilla/5.0"
}

records = []

for cat in categories:
    response = requests.get(cat["url"], headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    course_titles = soup.find_all("h4", class_="course_heading")

    print(f"{cat['category']} → {len(course_titles)} courses")

    for c in course_titles:
        records.append({
            "course_title": c.get_text(strip=True),
            "category": cat["category"],
            "platform": "GeeksforGeeks",
            "language": "English",
            "course_url": cat["url"],   # category-level URL
            "duration": None,
            "difficulty_level": None,
            "rating_out_of_5": None,
            "data_source": "HTML scraping"
        })

df_gfg = pd.DataFrame(records)
print(df_gfg.head())
print("Total records:", len(df_gfg))

DSA / Placements → 0 courses
Data Science / ML → 0 courses
Web Development → 0 courses
Programming Languages → 0 courses
Computer Science Core → 0 courses
Empty DataFrame
Columns: []
Index: []
Total records: 0


# 8.Coursera

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {
    "User-Agent": "Mozilla/5.0"
}

sitemap_url = "https://www.coursera.org/sitemap~www~courses.xml"

response = requests.get(sitemap_url, headers=headers, timeout=30)

print("Status:", response.status_code)
print("Content length:", len(response.content))

Status: 200
Content length: 1549154


In [None]:
soup = BeautifulSoup(response.content, "xml")

course_urls = [loc.text for loc in soup.find_all("loc")]

print("Total URLs found:", len(course_urls))
course_urls[:5]

Total URLs found: 15483


['https://www.coursera.org/learn/-bigquery-zhtw',
 'https://www.coursera.org/learn/-financial-accounting-101',
 'https://www.coursera.org/learn/-financial-accounting-102',
 'https://www.coursera.org/learn/-financial-accounting-103',
 'https://www.coursera.org/learn/-gemini-code-assist-zhtw']

In [None]:
def is_english_course(url):
    non_english_tokens = [
        "-fr", "-es", "-pt", "-ru", "-zh", "-de", "-it", "-ko", "-ja"
    ]
    return not any(token in url for token in non_english_tokens)

english_urls = [u for u in course_urls if is_english_course(u)]

print("English-like courses:", len(english_urls))
english_urls[:5]

English-like courses: 11892


['https://www.coursera.org/learn/-financial-accounting-101',
 'https://www.coursera.org/learn/-financial-accounting-102',
 'https://www.coursera.org/learn/-financial-accounting-103',
 'https://www.coursera.org/learn/-network-security',
 'https://www.coursera.org/learn/-security-principles']

In [None]:
def extract_title_from_url(url):
    slug = url.rstrip("/").split("/")[-1]
    title = slug.replace("-", " ").title()
    return title

data = []

for url in english_urls[:1000]:  # limit for your dataset
    data.append({
        "course_url": url,
        "course_title": extract_title_from_url(url),
        "platform": "Coursera",
        "language": "English",
        "duration": None,
        "difficulty_level": None,
        "rating_out_of_5": None,
        "data_completion_method": "Sitemap + URL heuristic"
    })

df_coursera = pd.DataFrame(data)
df_coursera.head()

Unnamed: 0,course_url,course_title,platform,language,duration,difficulty_level,rating_out_of_5,data_completion_method
0,https://www.coursera.org/learn/-financial-acco...,Financial Accounting 101,Coursera,English,,,,Sitemap + URL heuristic
1,https://www.coursera.org/learn/-financial-acco...,Financial Accounting 102,Coursera,English,,,,Sitemap + URL heuristic
2,https://www.coursera.org/learn/-financial-acco...,Financial Accounting 103,Coursera,English,,,,Sitemap + URL heuristic
3,https://www.coursera.org/learn/-network-security,Network Security,Coursera,English,,,,Sitemap + URL heuristic
4,https://www.coursera.org/learn/-security-princ...,Security Principles,Coursera,English,,,,Sitemap + URL heuristic


In [None]:
df_coursera.shape

(1000, 8)

In [None]:
df_coursera.to_csv('coursera_data.csv')