Course Scrapper Notebook

In [3]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.32.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.32.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.30.0-py3-none-any.whl (499 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Downloading outcome-1.3.0.post0-py2.py3-

# ---
# title: UIUC Course Catalog Scraper
# author: Jas Mehta, Moksha Shah
# date: 2025-05-11
# colab-type: code
# description: This notebook scrapes course data for all terms and departments from the University of Illinois Urbana-Champaign (UIUC) course schedule website (https://courses.illinois.edu).

## 📘 UIUC Course Catalog Scraper

This notebook automates the extraction of course listings from the [UIUC Course Explorer](https://courses.illinois.edu/schedule). It performs the following tasks:

- Retrieves all academic terms (e.g., Fall 2025, Spring 2025)
- Extracts all department codes (e.g., CS, MATH, ECE) for each term
- Visits every course listing page (e.g., CS 225) and scrapes:
    - Course title
    - Section info
    - Schedule
    - Instructor
    - CRN
    - etc
- Aggregates all results into a single pandas DataFrame
- Saves the output as a CSV file

### 🔧 Requirements

- `requests`
- `beautifulsoup4`
- `pandas`
- `selenium` with a supported driver (e.g., ChromeDriver)
- `tqdm` (optional, for progress bars)

> ⚠️ Note: Running this on Google Colab requires setting up Selenium with a headless browser environment, which may be complex. For best results, run this notebook locally with a full desktop browser.

### ✅ Output

- `uiuc_all_courses.csv`: Contains all course listings for all selected terms and departments.




In [None]:
import requests
import xml.etree.ElementTree as ET
from urllib.parse import urljoin

def extract_uiuc_course_urls(year: int, term: str, *, timeout: int = 15) -> list[str]:
    """
    Return every course-level Course Explorer URL for the given UIUC term.

    >>> extract_uiuc_course_urls(2025, "fall")[0]
    'https://courses.illinois.edu/schedule/2025/fall/AAS/100'

    Parameters
    ----------
    year     : 4-digit academic year (e.g. 2025)
    term     : 'spring', 'summer', or 'fall'  (case-insensitive)
    timeout  : per-request timeout in seconds (default 15)

    Returns
    -------
    list[str] – sorted list of https://courses.illinois.edu/schedule/… URLs
    """
    term = term.lower()
    if term not in {"spring", "summer", "fall"}:
        raise ValueError("term must be 'spring', 'summer', or 'fall'")

    # Root feed that lists every subject offered that term
    root_url = (
        f"https://courses.illinois.edu/cisapp/explorer/schedule/{year}/{term}.xml"
    )

    sess = requests.Session()
    sess.headers["User-Agent"] = "UIUC-course-scraper/1.0 (+github.com/you)"

    def _get_xml(url: str) -> ET.Element:
        """Download *url* and return its root Element."""
        r = sess.get(url, timeout=timeout)
        r.raise_for_status()
        return ET.fromstring(r.content)

    course_urls: list[str] = []

    # 1) Subject listing -------------------------------------------------------
    for subj in _get_xml(root_url).iter("subject"):
        # Each <subject … href="…/CS.xml"> element already contains the API link
        subj_xml = subj.attrib["href"]
        # 2) Course listing inside that subject --------------------------------
        for course in _get_xml(subj_xml).iter("course"):
            course_xml = course.attrib["href"]
            # Convert API link → human URL: drop '/cisapp/explorer' and '.xml'
            human_url = course_xml.replace("/cisapp/explorer", "").rstrip(".xml")
            course_urls.append(human_url)

    return sorted(course_urls)


all_urls = extract_uiuc_course_urls(2025, "fall")

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

def scrape_uiuc_schedule_selenium(url):
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    # More verbose error logging
    print(f"Starting to scrape URL: {url}")

    try:
        driver = webdriver.Chrome(options=chrome_options)
        driver.get(url)

        # Wait longer for page to load
        wait = WebDriverWait(driver, 20)
        wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))

        # Debugging page load
        print(f"Page title: {driver.title}")
        print(f"Page URL: {driver.current_url}")

        # Save page source for debugging
        with open("page_source.html", "w", encoding="utf-8") as f:
            f.write(driver.page_source)
        print("Saved page source to page_source.html")

        # Get basic course information
        try:
            course_title = driver.find_element(By.CSS_SELECTOR, "h1").text.strip()
            print(f"Found course title: {course_title}")
        except NoSuchElementException:
            course_title = "Unknown Course"
            print("Could not find course title")

        page_text = driver.find_element(By.TAG_NAME, "body").text

        # Extract description
        description = ""
        desc_match = re.search(r"Description:([^\n]+)", page_text)
        if desc_match:
            description = desc_match.group(1).strip()
            print(f"Found description: {description[:50]}...")
        else:
            print("Could not find description")

        # Extract credit hours
        credit_hours = ""
        credit_match = re.search(r"Credit:\s*(\d+(?:\.\d+)?)\s*hours", page_text)
        if credit_match:
            credit_hours = credit_match.group(1)
            print(f"Found credit hours: {credit_hours}")
        else:
            print("Could not find credit hours")

        # Try to expand all details
        try:
            print("Looking for detail expansion buttons...")
            buttons = driver.find_elements(By.TAG_NAME, "button")
            detail_buttons = [b for b in buttons if "detail" in b.text.lower() or "expand" in b.text.lower()]

            if detail_buttons:
                print(f"Found {len(detail_buttons)} detail buttons, clicking the first one...")
                detail_buttons[0].click()
                time.sleep(5)  # Wait longer for details to expand
            else:
                print("No detail buttons found")

                # Try alternate approaches
                detail_links = driver.find_elements(By.XPATH, "//a[contains(text(), 'detail') or contains(text(), 'expand')]")
                if detail_links:
                    print(f"Found {len(detail_links)} detail links, clicking the first one...")
                    detail_links[0].click()
                    time.sleep(5)
        except Exception as e:
            print(f"Error expanding details: {e}")

        # Find the main course table with multiple approaches
        main_table = None

        # Try different ways to locate the table
        table_approaches = [
            # Try to find table with CRN in header
            lambda: next((table for table in driver.find_elements(By.TAG_NAME, "table")
                         if table.find_elements(By.TAG_NAME, "tr") and
                         "CRN" in table.find_element(By.TAG_NAME, "tr").text), None),

            # Try to find table with specific class
            lambda: driver.find_element(By.CSS_SELECTOR, "table.table") if driver.find_elements(By.CSS_SELECTOR, "table.table") else None,

            # Try to find any table
            lambda: driver.find_element(By.TAG_NAME, "table") if driver.find_elements(By.TAG_NAME, "table") else None,

            # Try to find table in a specific region
            lambda: driver.find_element(By.XPATH, "//div[contains(@class, 'schedule')]/table")
                   if driver.find_elements(By.XPATH, "//div[contains(@class, 'schedule')]/table") else None
        ]

        for approach in table_approaches:
            try:
                main_table = approach()
                if main_table:
                    print(f"Found table using approach {table_approaches.index(approach) + 1}")
                    break
            except Exception as e:
                print(f"Approach failed: {e}")

        if not main_table:
            print("Could not find main course table")
            # Create a dummy data row with basic course info
            return pd.DataFrame([{
                "Description": description,
                "Credit Hours": credit_hours,
                "Section Title": course_title,
                "Section Credit Hours": credit_hours,
                "CRN": "Unknown",
                "Section": "Unknown",
                "Status Code": "",
                "Part of Term": "",
                "Section Status": "",
                "Enrollment Status": "",
                "Note": "Table data not found - this is placeholder data"
            }])

        # Count tables for debugging
        tables = driver.find_elements(By.TAG_NAME, "table")
        print(f"Found {len(tables)} tables on the page")

        # Print the first row of each table for debugging
        for i, table in enumerate(tables):
            try:
                rows = table.find_elements(By.TAG_NAME, "tr")
                if rows:
                    print(f"Table {i+1} first row: {rows[0].text[:100]}...")
                    # If this is our main table, print more info
                    if table == main_table:
                        print(f"  This is our main table with {len(rows)} rows")
                        if len(rows) > 1:
                            print(f"  Second row sample: {rows[1].text[:100]}...")
            except Exception as e:
                print(f"Error examining table {i+1}: {e}")

        data = []
        headers = []

        try:
            header_row = main_table.find_element(By.TAG_NAME, "tr")
            headers = [th.text.strip() for th in header_row.find_elements(By.TAG_NAME, "th")]
            print(f"Found headers: {headers}")

            # If headers are empty, try td elements (some tables use td for headers)
            if not headers:
                headers = [td.text.strip() for td in header_row.find_elements(By.TAG_NAME, "td")]
                print(f"Found headers from td elements: {headers}")

            # If still empty, use default headers
            if not headers:
                print("Using default headers")
                headers = ["CRN", "Status", "Section", "Type", "Time", "Day", "Location", "Instructor", "Detail"]
        except Exception as e:
            print(f"Error getting headers: {e}")
            headers = ["CRN", "Status", "Section", "Type", "Time", "Day", "Location", "Instructor", "Detail"]

        try:
            rows = main_table.find_elements(By.TAG_NAME, "tr")[1:]  # Skip header row
            print(f"Processing {len(rows)} data rows")

            for i, row in enumerate(rows):
                try:
                    cells = row.find_elements(By.TAG_NAME, "td")
                    if len(cells) < 2:  # Skip rows with too few cells
                        print(f"Skipping row {i+1} - not enough cells ({len(cells)})")
                        continue

                    # Initialize the row data with all requested columns
                    row_data = {
                        "Description": description,
                        "Credit Hours": credit_hours,
                        "Section Title": course_title,
                        "Section Credit Hours": credit_hours,  # Default to course credit hours
                        "Section Info": "",
                        "Degree Attributes": "",
                        "Schedule Information": "",
                        "CRN": "",
                        "Section": "",
                        "Status Code": "",
                        "Part of Term": "",
                        "Section Status": "",
                        "Enrollment Status": "",
                        "Type": "",
                        "Type Code": "",
                        "Start Time": "",
                        "End Time": "",
                        "Days of Week": "",
                        "Room": "",
                        "Building": "",
                        "Instructors": ""
                    }

                    # Add the data from the table cells
                    for j, header in enumerate(headers):
                        if j < len(cells):
                            row_data[header] = cells[j].text.strip()

                    # Print first cell for debugging
                    if len(cells) > 0:
                        print(f"Row {i+1}, first cell: {cells[0].text.strip()}")

                    # Extract CRN from the table
                    if "CRN" in row_data and row_data["CRN"]:
                        print(f"Found CRN: {row_data['CRN']}")
                    elif "CRN" in headers and len(cells) > headers.index("CRN"):
                        row_data["CRN"] = cells[headers.index("CRN")].text.strip()
                        print(f"Extracted CRN from column: {row_data['CRN']}")

                    # Extract Section from the table
                    if "Section" in row_data and row_data["Section"]:
                        pass
                    elif "Section" in headers and len(cells) > headers.index("Section"):
                        row_data["Section"] = cells[headers.index("Section")].text.strip()

                    # Extract Status and Status Code
                    if "Status" in row_data and row_data["Status"]:
                        row_data["Section Status"] = row_data["Status"]
                        # Try to infer status code
                        if row_data["Status"].lower() == "open":
                            row_data["Status Code"] = "A"
                        elif row_data["Status"].lower() == "closed":
                            row_data["Status Code"] = "C"

                    # Extract Type and Type Code
                    if "Type" in row_data and row_data["Type"]:
                        row_data["Type"] = row_data["Type"]
                        row_data["Type Code"] = row_data["Type"]

                    # Extract Time and process into Start Time and End Time
                    if "Time" in row_data and row_data["Time"]:
                        time_match = re.search(r"(\d+:\d+[AP]M)\s*-\s*(\d+:\d+[AP]M)", row_data["Time"])
                        if time_match:
                            row_data["Start Time"] = time_match.group(1)
                            row_data["End Time"] = time_match.group(2)

                    # Extract Days of Week
                    if "Day" in row_data and row_data["Day"]:
                        row_data["Days of Week"] = row_data["Day"]

                    # Extract Room and Building from Location
                    if "Location" in row_data and row_data["Location"]:
                        location_parts = row_data["Location"].split(" ", 1)
                        if len(location_parts) >= 2:
                            row_data["Room"] = location_parts[0]
                            row_data["Building"] = location_parts[1]

                    # Extract Instructors
                    if "Instructor" in row_data and row_data["Instructor"]:
                        row_data["Instructors"] = row_data["Instructor"]

                    # Extract detailed information from the "Detail" column if it exists
                    if "Detail" in row_data and row_data["Detail"]:
                        detail_text = row_data["Detail"]

                        # Extract Part of Term
                        pot_match = re.search(r"Part of Term\s*(\d+)", detail_text)
                        if pot_match:
                            row_data["Part of Term"] = pot_match.group(1).strip()

                        # Extract Degree Notes/Attributes
                        degree_match = re.search(r"Degree Notes\s*(.*?)(?:Date Range|$)", detail_text)
                        if degree_match:
                            row_data["Degree Attributes"] = degree_match.group(1).strip()

                        # Extract Date Range as Schedule Information
                        date_match = re.search(r"Date Range\s*Meets\s*(\d+/\d+/\d+-\d+/\d+/\d+)", detail_text)
                        if date_match:
                            row_data["Schedule Information"] = f"Meets {date_match.group(1)}"

                        # Extract Availability/Enrollment Status
                        avail_match = re.search(r"Availability\s*(\w+)", detail_text)
                        if avail_match:
                            row_data["Enrollment Status"] = avail_match.group(1).strip()

                    # Use the entire Detail as Section Info if nothing better is available
                    if "Detail" in row_data and not row_data["Section Info"]:
                        row_data["Section Info"] = row_data["Detail"]

                    data.append(row_data)
                    print(f"Added data row {i+1}")
                except Exception as e:
                    print(f"Error processing row {i+1}: {e}")
        except Exception as e:
            print(f"Error processing table rows: {e}")

        print(f"Processed {len(data)} data rows successfully")
    except Exception as e:
        print(f"Major error: {e}")
        return pd.DataFrame([{
            "Description": "Error occurred",
            "Credit Hours": "",
            "Section Title": "",
            "CRN": "",
            "Error": str(e)
        }])
    finally:
        try:
            driver.quit()
            print("Driver closed")
        except:
            print("Error closing driver")

    if not data:
        print("No data was collected")
        # Return a dataframe with error information
        return pd.DataFrame([{
            "Description": description,
            "Credit Hours": credit_hours,
            "Section Title": course_title,
            "CRN": "No data found",
            "Error": "Table data could not be extracted"
        }])

    return pd.DataFrame(data)

def test_with_direct_html():
    """Try to extract info from direct HTML when selenium fails"""
    print("Attempting direct HTML extraction as fallback...")
    try:
        url = "https://courses.illinois.edu/schedule/2025/fall/AAS/100"
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Try to extract basic course info
        title = soup.find('h1')
        course_title = title.text.strip() if title else "Unknown Course"

        # Extract description
        desc_text = soup.find(text=re.compile("Description:"))
        description = ""
        if desc_text:
            parent = desc_text.parent
            description = parent.text.replace("Description:", "").strip()

        # Extract credit hours
        credit_text = soup.find(text=re.compile("Credit:"))
        credit_hours = ""
        if credit_text:
            credit_match = re.search(r"Credit:\s*(\d+(?:\.\d+)?)\s*hours", credit_text)
            if credit_match:
                credit_hours = credit_match.group(1)

        # Try to find the course table
        tables = soup.find_all('table')

        data = []
        for table in tables:
            headers = [th.text.strip() for th in table.find_all('th')]
            if not headers or 'CRN' not in ' '.join(headers):
                continue

            rows = table.find_all('tr')[1:]  # Skip header row
            for row in rows:
                cells = row.find_all('td')
                if len(cells) < len(headers):
                    continue

                row_data = {
                    "Description": description,
                    "Credit Hours": credit_hours,
                    "Section Title": course_title,
                    "CRN": cells[headers.index('CRN')].text.strip() if 'CRN' in headers else "",
                    "Type": cells[headers.index('Type')].text.strip() if 'Type' in headers else "",
                    "Section": cells[headers.index('Section')].text.strip() if 'Section' in headers else "",
                    "Degree Attributes": "Extracted from HTML fallback",
                    "Method": "HTML_FALLBACK"
                }
                data.append(row_data)

        if data:
            print(f"HTML fallback found {len(data)} rows")
            return pd.DataFrame(data)
        else:
            print("HTML fallback found no data")
            return pd.DataFrame([{
                "Description": description,
                "Credit Hours": credit_hours,
                "Section Title": course_title,
                "CRN": "No data found",
                "Method": "HTML_FALLBACK_EMPTY"
            }])
    except Exception as e:
        print(f"HTML fallback error: {e}")
        return pd.DataFrame([{
            "CRN": "Error in HTML fallback",
            "Error": str(e),
            "Method": "HTML_FALLBACK_ERROR"
        }])

def main():
    print("Starting UIUC course scraping...")
    dfs = []
    # Try selenium first
    #df = scrape_uiuc_schedule_selenium()
    for url in all_urls:
        print(f"\n--- Scraping {url} ---")
        df = scrape_uiuc_schedule_selenium(url)

        # Fallback to HTML if selenium gave no useful rows
        if df.empty or (all(df['CRN'].astype(str).str.contains('Unknown|Error|No data', case=False))):
            print("Selenium failed to get data, trying HTML fallback...")
            df = test_with_direct_html()

        dfs.append(df)

    # Concatenate all results
    combined = pd.concat(dfs, ignore_index=True)

    # Filter to only the requested columns that exist
    requested_columns = [
        "Description", "Credit Hours", "Section Info", "Degree Attributes",
        "Schedule Information", "CRN", "Section", "Status Code", "Part of Term",
        "Section Title", "Section Credit Hours", "Section Status", "Enrollment Status",
        "Type", "Type Code", "Start Time", "End Time", "Days of Week", "Room",
        "Building", "Instructors"
    ]
    available_columns = [col for col in requested_columns if col in combined.columns]
    combined = combined[available_columns] if available_columns else combined

    # Save to CSV
    output_file = "uiuc_all_courses.csv"
    combined.to_csv(output_file, index=False)
    print(f"✅ All data saved to {output_file} with {len(combined)} rows and {len(combined.columns)} columns.")
    print(combined.head())


    # # If no rows with actual CRN data, try HTML fallback
    # if df.empty or (all(df['CRN'].astype(str).str.contains('Unknown|Error|No data', case=False)) if 'CRN' in df.columns else True):
    #     print("Selenium scraping failed to get data, trying HTML fallback...")
    #     df_html = test_with_direct_html()

    #     # If HTML fallback worked, use that data
    #     if not df_html.empty and not all(df_html['CRN'].astype(str).str.contains('Unknown|Error|No data', case=False) if 'CRN' in df_html.columns else True):
    #         print("Using HTML fallback data")
    #         df = df_html
    #     else:
    #         print("Both methods failed")
    #         # Combine error info
    #         if not df.empty and not df_html.empty:
    #             df = pd.concat([df, df_html], ignore_index=True)

    # # Keep only the requested columns that exist in the dataframe
    # requested_columns = [
    #     "Description", "Credit Hours", "Section Info", "Degree Attributes",
    #     "Schedule Information", "CRN", "Section", "Status Code", "Part of Term",
    #     "Section Title", "Section Credit Hours", "Section Status", "Enrollment Status",
    #     "Type", "Type Code", "Start Time", "End Time", "Days of Week", "Room",
    #     "Building", "Instructors"
    # ]

    # # Filter to only columns that exist in the dataframe
    # available_columns = [col for col in requested_columns if col in df.columns]

    # if available_columns:
    #     df_filtered = df[available_columns]
    # else:
    #     df_filtered = df



    # # Save the output
    # output_file = "uiuc_course_data.csv"
    # df_filtered.to_csv(output_file, index=False)
    # print(f"✅ Data saved to {output_file} with {len(df_filtered)} rows and {len(df_filtered.columns)} columns.")
    # print(df_filtered.head())

if __name__ == "__main__":
    main()

Starting UIUC course scraping...

--- Scraping http://cis.local/cisapi/schedule/2025/fall/AAS/215 ---
Starting to scrape URL: http://cis.local/cisapi/schedule/2025/fall/AAS/215


Below code processes and cleans a structured CSV file by removing unnecessary rows and extracting key metadata. The result is a cleaner and more usable version of the original dataset.

In [None]:
import csv
import re

def process_csv(input_file, output_file):
    """
    Process a CSV file by removing alternate rows and extracting 'Part of Term' info
    """
    rows = []
    processed_rows = []

    # Read the input CSV file
    with open(input_file, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        headers = reader.fieldnames.copy()  # Copy the headers

        # Add 'Part of Term' header if it doesn't exist
        if 'Part of Term' not in headers:
            headers.append('Part of Term')

        # Read all rows into memory
        for row in reader:
            rows.append(row)

    # Process rows - keep every other row, extract Part of Term from alternate rows
    for i in range(0, len(rows), 2):
        if i < len(rows):
            current_row = dict(rows[i])  # Copy the current row

            # Make sure there's a Part of Term field
            if 'Part of Term' not in current_row:
                current_row['Part of Term'] = ''

            # If we have a next row, look for Part of Term info
            if i + 1 < len(rows):
                next_row = rows[i + 1]

                # Check all fields in the next row for 'Part of Term'
                for field, value in next_row.items():
                    if value and isinstance(value, str) and 'Part of Term' in value:
                        # Extract the Part of Term value (e.g., "1" from "Part of Term 1")
                        match = re.search(r'Part of Term\s+(\S+)', value)
                        if match:
                            current_row['Part of Term'] = match.group(1)
                            break

            processed_rows.append(current_row)

    # Write to output file
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=headers)
        writer.writeheader()
        writer.writerows(processed_rows)

    print(f"Successfully processed {len(processed_rows)} rows. Output saved to {output_file}")

# Example usage
if __name__ == "__main__":
    input_file = "/content/uiuc_all_courses.csv"  # Change this to your input file
    output_file = "courses_processed.csv"  # Change this to your desired output file

    process_csv(input_file, output_file)