In [None]:
import requests
from bs4 import BeautifulSoup
import json
import re
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Program page that the cells below fetch and parse.
TEST_URL = "https://catalog.coastal.edu/preview_program.php?catoid=33&poid=5939"
# Root of the catalog site (prefix of TEST_URL); not referenced by the cells visible here.
BASE_URL = "https://catalog.coastal.edu/"


In [20]:
def get_expanded_page_source(url):
    """
    Render ``url`` in headless Chrome, open every collapsed link, and
    return the resulting page HTML.

    The page is considered loaded once an element with class
    ``custom_leftpad_20`` is present (waits up to 10 seconds). Every anchor
    matching ``a[aria-expanded='false']`` is then clicked through JavaScript.
    A failing click is reported and skipped. The browser is shut down on
    every exit path.
    """
    print(f"Fetching and expanding page with Selenium: {url}")

    opts = Options()
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        opts.add_argument(flag)

    browser = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=opts,
    )

    try:
        browser.get(url)

        # Block until the requirements container exists in the DOM.
        content_present = EC.presence_of_element_located(
            (By.CLASS_NAME, "custom_leftpad_20")
        )
        WebDriverWait(browser, 10).until(content_present)

        # Snapshot of every link that is still collapsed.
        collapsed_anchors = browser.find_elements(
            By.CSS_SELECTOR, "a[aria-expanded='false']"
        )
        print(f"Found {len(collapsed_anchors)} expandable course links to click.")

        for anchor in collapsed_anchors:
            try:
                # Click via JavaScript rather than a native WebDriver click.
                browser.execute_script("arguments[0].click();", anchor)
                # Short pause between clicks so the expansion can start.
                time.sleep(0.2)
            except Exception as err:
                print(f"  - Could not click a link: {err}")

        print("Finished clicking. Waiting for final content to load...")
        # Fixed grace period before the DOM is read back.
        time.sleep(2)

        expanded_html = browser.page_source
        return expanded_html
    finally:
        browser.quit()

# Run the function and store the fully expanded HTML
# (this starts a headless Chrome session through Selenium).
html_content = get_expanded_page_source(TEST_URL)

# Sanity check: show only the first 1000 characters of the page source.
print("\nSuccessfully fetched and expanded HTML. Preview:")
print(html_content[:1000])

Fetching and expanding page with Selenium: https://catalog.coastal.edu/preview_program.php?catoid=33&poid=5939
Found 69 expandable course links to click.
Finished clicking. Waiting for final content to load...

Successfully fetched and expanded HTML. Preview:
<html lang="en" class="linux chrome chrome139 webkit webkit537-36 js flexbox canvas canvastext webgl no-touch geolocation postmessage no-websqldatabase indexeddb hashchange history draganddrop websockets rgba hsla multiplebgs backgroundsize borderimage borderradius boxshadow textshadow opacity cssanimations csscolumns cssgradients cssreflections csstransforms no-csstransforms3d csstransitions fontface no-generatedcontent video audio localstorage sessionstorage webworkers no-applicationcache svg inlinesvg smil svgclippaths" id="gateway-page"><head>
		<title>Program: Computer Science, B.S. - Coastal Carolina University - Modern Campus Catalog™</title>
				<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
							<li

In [21]:
# Parse the expanded page source with the "html.parser" backend.
soup = BeautifulSoup(html_content, "html.parser")

# Find the main content area holding the requirements.
# `find` returns the first matching div in document order; the page nests
# further divs with the same class inside it.
main_content = soup.find('div', class_='custom_leftpad_20')

if main_content:
    print("Successfully found the main content div:")
    # Preview only: the full prettified section is very long, so show the
    # first 1000 characters (same preview size as the raw HTML preview).
    print(main_content.prettify()[:1000])
else:
    print("Could not find the main content div with class 'custom_leftpad_20'.")


Successfully found the main content div:
<div class="custom_leftpad_20">
 <div class="acalog-core">
  <h2>
   <a name="DegreeRequirements120136Credits">
   </a>
   <a id="core_58441" name="degreerequirements120136credits">
   </a>
   Degree Requirements (120-136 Credits)
  </h2>
  <hr/>
 </div>
 <div class="custom_leftpad_20">
  <div class="acalog-core">
   <h3>
    <a name="CoreCurriculumRequirements">
    </a>
    <a id="core_58442" name="corecurriculumrequirements">
    </a>
    Core Curriculum Requirements
   </h3>
   <hr/>
   <div style="display: inline">
    <a href="preview_program.php?catoid=33&amp;poid=5839" onclick="showCatalogData('33', '1', '5839', 'Core%20Curriculum%20%2836-40%20Total%20Credit%20Hours%29', this, ''); return false;" target="_blank">
     Core Curriculum (36-40 Total Credit Hours)
    </a>
   </div>
   <span style="display: none !important">
   </span>
  </div>
  <div class="acalog-core">
   <h3>
    <a name="GraduationRequirements">
    </a>
    <a id="core

In [41]:
def parse_course_text(text):
    """
    Split a catalog course line into course code, title and credit count.

    The expected shape is ``"<CODE> <Title> - <N> credit(s)"``, for example
    ``"MATH 160A Calculus I A  -  2  credits"``. The code is four uppercase
    letters, one whitespace character, three digits and an optional
    uppercase suffix letter. Matching is anchored at the start of ``text``
    (leading whitespace is tolerated); anything after the credit count is
    ignored.

    Args:
        text: The visible text of a course link or list item. ``None`` and
            the empty string are accepted and treated as "no match".

    Returns:
        A ``(code, title, credits)`` tuple with ``credits`` as an ``int``,
        or ``(None, None, None)`` when the text does not look like a course.
    """
    if not text:
        return None, None, None

    # "credits?" accepts the singular form as well, so a line such as
    # "... - 1 credit" is parsed instead of being dropped.
    # NOTE(review): the catalog's exact wording for one-credit courses was
    # not visible here -- confirm against a lab course entry.
    match = re.match(
        r'\s*([A-Z]{4}\s\d{3}[A-Z]?)\s+(.*?)\s+-\s+(\d+)\s+credits?',
        text,
    )
    if match:
        code, title, credits = match.groups()
        return code.strip(), title.strip(), int(credits)
    return None, None, None

# --- Test the function ---
# Hand-written sample with doubled spaces around the dash, exercising the
# \s+ parts of the pattern and the optional suffix letter in the code.
sample_text = "MATH 160A Calculus I A  -  2  credits"
# NOTE: this unpacking binds the notebook-level names `code`, `title` and `credits`.
code, title, credits = parse_course_text(sample_text)

print(f"Testing regex on: '{sample_text}'")
print(f"  -> Code: {code}")
print(f"  -> Title: {title}")
print(f"  -> Credits: {credits}")

Testing regex on: 'MATH 160A Calculus I A  -  2  credits'
  -> Code: MATH 160A
  -> Title: Calculus I A
  -> Credits: 2


In [53]:
# Find all headers (h3, h4) which introduce a list of courses
headers = main_content.find_all(['h3', 'h4'])

# Position (0-based) of the header whose course list is parsed in this cell.
TEST_HEADER_INDEX = 3

# "OR" / "AND" must be matched as whole words: the list-item text also
# contains course titles and expanded descriptions, where a plain substring
# test would fire on words such as "CORE" or "ANDROID".
or_pattern = re.compile(r"\bOR\b")
and_pattern = re.compile(r"\bAND\b")

# --- Process a specific header for testing (index set by TEST_HEADER_INDEX) ---
if not headers:
    print("No h3 or h4 headers found in the main content.")
elif TEST_HEADER_INDEX >= len(headers):
    # Guard against an IndexError when the page has fewer headers than expected.
    print(f"Only {len(headers)} headers found; cannot test header index {TEST_HEADER_INDEX}.")
else:
    test_header = headers[TEST_HEADER_INDEX]
    title = test_header.text.strip()
    # The course list is expected to be the next <ul> sibling of the header.
    ul_element = test_header.find_next_sibling('ul')

    print(f"--- Testing section: '{title}' ---")

    if ul_element:
        # Step 1: Group list items into logical blocks.
        # An empty <li class="acalog-adhoc"> acts as a separator between blocks.
        blocks = []
        current_block = []
        for li in ul_element.find_all('li', recursive=False):
            if li.has_attr('class') and 'acalog-adhoc' in li['class'] and not li.text.strip():
                if current_block:
                    blocks.append(current_block)
                current_block = []
            else:
                current_block.append(li)
        if current_block:
            blocks.append(current_block)

        print(f"Found {len(blocks)} logical rule blocks in this section.")

        # Step 2: Process each block to determine its rule
        final_rules = []
        for block in blocks:
            block_text = " ".join([li.get_text(" ", strip=True) for li in block])

            # Course codes of every item in the block that carries a link.
            courses_in_block = []
            for item in block:
                a_tag = item.find('a')
                if a_tag:
                    code, _, _ = parse_course_text(a_tag.get_text(" ", strip=True))
                    if code:
                        courses_in_block.append(code)

            if or_pattern.search(block_text):
                # Alternatives are separated by "OR" ad-hoc items; courses
                # between two separators must be taken together and are
                # joined with "+".
                options = []
                current_and_group = []
                for item in block:
                    item_text = item.get_text(" ", strip=True)
                    if or_pattern.search(item_text) and 'acalog-adhoc' in item.get('class', []):
                        if current_and_group:
                            options.append("+".join(current_and_group))
                        current_and_group = []
                        continue

                    code, _, _ = parse_course_text(item_text)
                    if code:
                        current_and_group.append(code)

                if current_and_group:
                    options.append("+".join(current_and_group))

                final_rules.append({"type": "choose_one_of", "courses": options})

            elif and_pattern.search(block_text):
                final_rules.append({"type": "take_all_of", "courses": courses_in_block})

            elif len(courses_in_block) == 1:
                final_rules.append({"type": "single_course", "course_id": courses_in_block[0]})

            elif len(courses_in_block) > 1:
                final_rules.append({"type": "take_all_of", "courses": courses_in_block})

        print("\n--- Final Parsed Rules for this Section ---")
        print(json.dumps(final_rules, indent=2))

--- Testing section: 'Complete the following courses:' ---
Found 3 logical rule blocks in this section.

--- Final Parsed Rules for this Section ---
[
  {
    "type": "choose_one_of",
    "courses": [
      "MATH 160",
      "MATH 160A+MATH 160B"
    ]
  },
  {
    "type": "single_course",
    "course_id": "MATH 174"
  },
  {
    "type": "take_all_of",
    "courses": [
      "STAT 201"
    ]
  }
]


In [62]:
def scrape_expanded_course_details(course_li_element):
    """
    Extract prerequisites, description and semesters offered from one
    already-expanded course ``<li>`` element.

    Args:
        course_li_element: Element exposing ``find``; its detail table is
            looked up as ``<table class="td_dark">``. ``None`` is accepted.

    Returns:
        A dict with three keys:

        * ``"prerequisites"`` -- course codes found between
          ``"Prerequisite(s):"`` and ``"Semester(s) Offered:"`` (or the end
          of the text), without duplicates, in order of first appearance.
        * ``"description"`` -- the text node directly after the first
          ``<br>`` in the table, stripped; ``""`` if there is none.
        * ``"semesters_offered"`` -- the comma-separated values after
          ``"Semester(s) Offered:"`` with ``"Close"`` removed and empty
          entries dropped.

        The defaults (empty list / empty string) are returned when the
        element or its detail table is missing.
    """
    details = {
        "prerequisites": [],
        "description": "",
        "semesters_offered": []
    }

    if course_li_element is None:
        return details

    details_table = course_li_element.find('table', class_='td_dark')
    if not details_table:
        return details

    full_text = details_table.get_text(" ", strip=True)

    # Extract Prerequisites
    prereq_match = re.search(r"Prerequisite\(s\):\s*(.*?)(?=\s*Semester\(s\) Offered:|$)", full_text)
    if prereq_match:
        found_codes = re.findall(r'[A-Z]{4}\s\d{3}[A-Z]?', prereq_match.group(1))
        # dict.fromkeys removes repeated codes while keeping first-seen order.
        details["prerequisites"] = list(dict.fromkeys(found_codes))

    # Extract Description
    br_tag = details_table.find('br')
    if br_tag:
        desc_node = br_tag.next_sibling
        # Only a plain text node counts; a following tag is ignored.
        if desc_node and isinstance(desc_node, str):
            details["description"] = desc_node.strip()

    # Extract Semesters Offered
    semester_match = re.search(r"Semester\(s\) Offered:\s*(.+)", full_text)
    if semester_match:
        # "Close" is stripped out of the captured text; presumably it is the
        # label of the collapse link -- TODO confirm against the page.
        semester_str = semester_match.group(1).replace("Close", "").strip()
        # Skip empty pieces so that an empty remainder or a trailing comma
        # does not produce "" entries.
        details["semesters_offered"] = [
            part.strip() for part in semester_str.split(',') if part.strip()
        ]

    return details

# --- Test the function on the first course ---
# `find` yields the first <li class="acalog-course"> under main_content,
# or None when the section contains no such item.
first_course_li = main_content.find('li', class_='acalog-course')
if first_course_li:
    print("--- Testing detail scraper on first expanded course ---")
    scraped_details = scrape_expanded_course_details(first_course_li)
    # Pretty-print the returned dict for visual inspection.
    print(json.dumps(scraped_details, indent=2))


--- Testing detail scraper on first expanded course ---
{
  "prerequisites": [
    "MATH 131",
    "MATH 135"
  ],
  "description": "Limits, continuity, differentiation and integration of algebraic and transcendental functions, applications of the derivative to curve sketching, optimization and related rates.",
  "semesters_offered": [
    "Fall",
    "Spring",
    "Summer"
  ]
}


In [64]:
def run_full_scraper(url, output_file):
    """
    Scrape one program page and write its requirements to a JSON file.

    The page is fetched with get_expanded_page_source and parsed with
    BeautifulSoup. The written document has four keys: "program_name",
    "url", "requirement_sections" (one entry per h3/h4 header, each with a
    "title" and a list of "rules") and "all_courses" (details per course
    code, filled the first time a code is seen).

    Args:
        url: Address of the program page to scrape.
        output_file: Path the JSON document is written to (opened with
            mode 'w', so an existing file is overwritten).

    Returns:
        None. If the page title (h1#acalog-page-title) or the main content
        div (class custom_leftpad_20) is missing, a message is printed and
        nothing is written.
    """
    html_content = get_expanded_page_source(url)
    soup = BeautifulSoup(html_content, "html.parser")

    program_name_tag = soup.find("h1", id="acalog-page-title")
    main_content = soup.find('div', class_='custom_leftpad_20')

    if not program_name_tag or not main_content:
        print("Could not find essential page elements. Aborting.")
        return

    program_data = {
        "program_name": program_name_tag.text.strip(),
        "url": url,
        "requirement_sections": [],
        "all_courses": {}
    }

    # Find all h3/h4 tags that are direct children of an acalog-core div
    # This is a more stable way to find the main section headers
    headers = main_content.select('div.acalog-core > h3, div.acalog-core > h4')

    for header in headers:
        title = header.text.strip()
        # The container for the rules is the div that contains the header
        container = header.parent
        
        print(f"Processing section: {title}")
        
        section_data = {
            "title": title,
            "rules": []
        }

        # Find all sub-headers (like "Choose one course...") within this container
        # NOTE: find_all searches every descendant of the container, so an
        # h4 header is itself part of this result.
        sub_headers = container.find_all(['h4', 'h5'])
        if not sub_headers: # If no sub-headers, the main header is the rule
            sub_headers = [header]

        for sub_header in sub_headers:
            sub_title = sub_header.text.strip()
            # Sub-headers without a following <ul> sibling carry no course list.
            ul_element = sub_header.find_next_sibling('ul')
            if not ul_element: continue

            rule = {
                "type": "take",
                "amount": 1, # Default amount
                "courses": []
            }

            # Only "one", "two" and "three" are recognised (case-sensitive);
            # any other wording keeps the default amount of 1.
            if "Choose one" in sub_title:
                rule["amount"] = 1
            elif "Choose two" in sub_title:
                rule["amount"] = 2
            elif "Choose three" in sub_title:
                rule["amount"] = 3
            
            # Logic to parse courses within this specific rule
            # current_and_group collects codes that belong together; a
            # finished group is stored as a single "A+B" string.
            current_and_group = []
            for li in ul_element.find_all('li', recursive=False):
                li_text = li.get_text(" ", strip=True)
                a_tag = li.find('a')

                # An ad-hoc "OR" item closes the group collected so far.
                # NOTE(review): "OR" (and "AND" below) are plain substring
                # tests on the whole item text, so they also match inside
                # longer uppercase words -- confirm this is acceptable.
                if "OR" in li_text and 'acalog-adhoc' in li.get('class', []):
                    if current_and_group:
                        rule["courses"].append("+".join(current_and_group))
                    current_and_group = []
                    continue

                if a_tag:
                    code, course_title, credits = parse_course_text(a_tag.get_text(" ", strip=True))
                    if code:
                        # Scrape details and add to master list
                        if code not in program_data["all_courses"]:
                            details = scrape_expanded_course_details(li)
                            program_data["all_courses"][code] = {
                                "title": course_title,
                                "credits": credits,
                                **details
                            }
                        
                        # Handle AND groups
                        # An item whose text contains "AND" stays open for
                        # the next course; an item without it either closes
                        # the open group or stands alone.
                        if "AND" in li_text:
                            current_and_group.append(code)
                        else:
                            if current_and_group:
                                current_and_group.append(code)
                                rule["courses"].append("+".join(current_and_group))
                                current_and_group = []
                            else:
                                rule["courses"].append(code)
            
            # Flush a group that was still open when the list ended.
            if current_and_group:
                rule["courses"].append("+".join(current_and_group))
            
            # If the rule is to take all courses, set amount accordingly
            if rule["amount"] == 1 and len(rule["courses"]) > 1 and "Choose" not in sub_title:
                 rule["amount"] = len(rule["courses"])

            section_data["rules"].append(rule)
        
        program_data["requirement_sections"].append(section_data)
    
    with open(output_file, 'w') as f:
        json.dump(program_data, f, indent=2)
    print(f"\nScraping complete! Data saved to '{os.path.abspath(output_file)}'")

# --- Run the full process ---
# Fetches TEST_URL again in a new browser session and writes the JSON to a
# path relative to the current working directory.
run_full_scraper(TEST_URL, "computer_science_major.json")

Fetching and expanding page with Selenium: https://catalog.coastal.edu/preview_program.php?catoid=33&poid=5939
Found 69 expandable course links to click.
Finished clicking. Waiting for final content to load...
Processing section: Core Curriculum Requirements
Processing section: Graduation Requirements
Processing section: Foundation Requirements (19-30 Credits) *
Processing section: Complete the following courses:
Processing section: Choose two courses from the following:
Processing section: Choose two courses from the following:
Processing section: Choose one course from the following:
Processing section: Major Requirements (57-60 Credits)
Processing section: Complete the following courses:
Processing section: Choose one course from the following:
Processing section: Choose three courses from the following:
Processing section: Note:

Scraping complete! Data saved to '/home/mikeszklarz/dev/courseflow/scraper/computer_science_major.json'
