In [2]:
import requests
from bs4 import BeautifulSoup
import re

def extract_prerequisites(text):
    """Pull a compact prerequisite expression out of a course description.

    The text following a ``Prerequisite:`` / ``Prerequisites:`` label is
    captured up to ``Can be taken Concurrently:``, ``Attribute`` or the end
    of the text, whichever comes first. Every whitespace character is then
    removed and adjacent course codes are given explicit connectors.

    NOTE: the final substitution rewrites any ``<code>and<code>and`` run as
    ``<code>or<code>or``, whether the ``and`` was inserted here or was
    already present in the description.

    Returns:
        The compacted string, or the sentinel ``"Noprerequisites"`` when no
        prerequisite label is found.
    """
    found = re.search(
        r'Prerequisites?:(.*?)(?=Can be taken Concurrently:|Attribute|$)',
        text,
        re.DOTALL,
    )
    if not found:
        return "Noprerequisites"

    # Squeeze out all whitespace so course codes read like "CSE017"
    compact = re.sub(r'\s+', '', found.group(1).strip())
    # Two course codes that touch each other get an 'and' between them
    compact = re.sub(r'([A-Z]{2,4}\d{3})(?=[A-Z]{2,4}\d{3})', r'\1and', compact)
    # A code pair followed by a further 'and' is turned into 'or' alternatives
    return re.sub(r'([A-Z]{2,4}\d{3})and([A-Z]{2,4}\d{3})and', r'\1or\2or', compact)

def read_subjects(filename):
    """Read lower-cased subject codes from a text file.

    Every line containing the substring ``code`` is split on single quotes
    and the piece at index 3 (the text inside the second quoted string on
    the line) is taken as the subject code. Presumably the lines look like
    ``{'code': 'ACCT', ...}`` -- verify against the actual file.

    Lines that mention ``code`` but have no piece at index 3 are skipped
    rather than aborting the whole read with an ``IndexError``.

    Args:
        filename: Path of the subject listing to read.

    Returns:
        List of lower-cased subject codes, in file order.
    """
    subjects = []
    with open(filename, 'r') as file:
        for line in file:
            if 'code' not in line:
                continue
            pieces = line.split("'")
            if len(pieces) <= 3:
                # No quoted value in the expected position on this line
                continue
            subjects.append(pieces[3].lower())
    return subjects

def get_all_course_prereqs():
    """Scrape prerequisite strings for every subject listed in ``subject.txt``.

    For each subject code returned by ``read_subjects`` the matching
    ``catalog.lehigh.edu/courselisting/<subject>/`` page is fetched, and every
    ``div.courseblock`` on it is reduced to a course ID plus the result of
    ``extract_prerequisites`` on the block's text.

    Blocks without a ``courseblocktitle`` paragraph, or whose title does not
    contain ``<SUBJECT><digits>``, are skipped.

    Returns:
        dict mapping a whitespace-free course ID (e.g. ``"ACCT152"``) to its
        compact prerequisite string, or to ``None`` when
        ``extract_prerequisites`` reported ``"Noprerequisites"``.
    """
    subjects = read_subjects('subject.txt')
    all_course_prereqs = {}

    for subject in subjects:
        url = f"https://catalog.lehigh.edu/courselisting/{subject}/"
        # A timeout keeps one unresponsive page from hanging the whole scrape
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')

        # re.escape keeps any regex metacharacter in a subject code literal
        id_pattern = re.compile(rf'({re.escape(subject.upper())}\s*\d+)')

        for block in soup.find_all('div', class_='courseblock'):
            # Extract course ID
            title = block.find('p', class_='courseblocktitle')
            if title is None:
                # Without a title there is no ID for this block; falling
                # through would reuse the previous block's ID (or hit an
                # unbound name on the very first block)
                continue
            id_match = id_pattern.search(title.text)
            if id_match is None:
                continue
            course_id = re.sub(r'\s+', '', id_match.group(1))

            # Extract prerequisites
            prereq_text = extract_prerequisites(block.get_text(strip=True))
            if prereq_text == "Noprerequisites":
                prereq_text = None
            all_course_prereqs[course_id] = prereq_text

    return all_course_prereqs

# Run the scraper once when the cell executes and keep the resulting
# {course ID: prerequisite string or None} mapping for the cells below
course_prereqs_dict = get_all_course_prereqs()

# Optional: check the results by printing the whole mapping
print(course_prereqs_dict)


{'ACCT001': None, 'ACCT108': None, 'ACCT151': None, 'ACCT152': 'ACCT151', 'ACCT307': 'ACCT151', 'ACCT309': 'ACCT307', 'ACCT310': 'ACCT151andFIN125', 'ACCT311': 'ACCT152andBIS111', 'ACCT315': 'ACCT152', 'ACCT316': 'ACCT315', 'ACCT317': 'ACCT316', 'ACCT318': 'ACCT316', 'ACCT320': '(ACCT311orCSB311)and(ACCT315)', 'ACCT324': 'ACCT152', 'ACCT330': 'ECO045', 'ACCT340': 'ACCT151andFIN125', 'ACCT371': None, 'ACCT372': None, 'ACCT410': 'ACCT151orGBUS401orMBA451', 'ACCT432': 'GBUS401andMGT431', 'ACCT442': None, 'ACCT444': 'ACCT442', 'AAS003': None, 'AAS005': None, 'AAS025': None, 'AAS038': None, 'AAS066': None, 'AAS076': 'THTR066orAAS066', 'AAS091': None, 'AAS095': None, 'AAS102': None, 'AAS103': None, 'AAS106': None, 'AAS117': None, 'AAS121': None, 'AAS124': None, 'AAS125': None, 'AAS126': None, 'AAS128': None, 'AAS129': None, 'AAS130': None, 'AAS131': None, 'AAS132': None, 'AAS134': None, 'AAS138': None, 'AAS140': None, 'AAS144': None, 'AAS147': None, 'AAS155': None, 'AAS163': None, 'AAS164': 

In [3]:
from bs4 import BeautifulSoup
import requests

def extract_course_prereqs(course_block):
    """Extract the course code and prerequisite text from one course block.

    Args:
        course_block: Parsed BeautifulSoup element for a single course. It
            must contain a ``<p class="courseblocktitle">`` and may contain a
            ``<b>Prerequisites:</b>`` label.

    Returns:
        Tuple ``(title, prerequisites)``. ``title`` is the first two
        whitespace-separated tokens of the title joined together (e.g.
        ``"CSE"`` + ``"325"`` -> ``"CSE325"``). ``prerequisites`` is the text
        that follows the label up to the next ``<br>``, with runs of
        whitespace collapsed to single spaces, or ``None`` when there is no
        label or nothing follows it.

    Raises:
        IndexError: If the title holds fewer than two tokens.
    """
    # split() without a separator splits on any whitespace, so regular and
    # non-breaking spaces in the title are handled alike
    title_tokens = course_block.find("p", class_="courseblocktitle").text.split()
    title = title_tokens[0] + title_tokens[1]

    # `string=` is the current spelling of the deprecated `text=` filter
    prereq_label = course_block.find("b", string="Prerequisites:")
    parts = []
    if prereq_label is not None:
        # Walk the siblings that follow the "Prerequisites:" label
        node = prereq_label.next_sibling
        while node:
            if isinstance(node, str):
                # Bare text between tags, e.g. "(", " or ", ")"
                parts.append(node.strip())
            elif node.name == "br":
                break  # stop before Attributes/Distribution section
            else:
                # Linked courses and any other inline tag contribute their text
                parts.append(node.get_text().strip())
            node = node.next_sibling

    # Clean unnecessary spaces and special characters
    prerequisites = "".join(parts).replace('\xa0', ' ')
    prerequisites = ' '.join(prerequisites.split())  # collapse whitespace runs

    return title, prerequisites or None

def get_all_course_prereqs():
    """Scrape prerequisite strings for every subject listed in ``subject.txt``.

    For each subject code returned by ``read_subjects`` the matching
    ``catalog.lehigh.edu/courselisting/<subject>/`` page is fetched and every
    ``div.courseblock`` is passed to ``extract_course_prereqs``. Each course
    is printed as it is processed.

    Returns:
        dict mapping a whitespace-free course code to its prerequisite string
        with whitespace and commas removed, or to ``None`` when the course
        has no prerequisite text.
    """
    subjects = read_subjects('subject.txt')
    all_course_prereqs = {}

    for subject in subjects:
        url = f"https://catalog.lehigh.edu/courselisting/{subject}/"
        # A timeout keeps one unresponsive page from hanging the whole scrape
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all course blocks
        for block in soup.find_all("div", class_="courseblock"):
            title, prerequisites = extract_course_prereqs(block)
            # Drop every whitespace character, non-breaking spaces included
            title = ''.join(title.split())
            if prerequisites is not None:
                # Clean whitespace and commas from prerequisites
                prerequisites = ''.join(prerequisites.split()).replace(',', '')
            print(title, prerequisites)
            all_course_prereqs[title] = prerequisites

    return all_course_prereqs

# Rebuild course_prereqs_dict with the scraper defined in this cell; the call
# issues one HTTP request per subject and prints each course as it is parsed
course_prereqs_dict = get_all_course_prereqs()


  prereq_section = course_block.find("b", text="Prerequisites:")


ACCT001 None
ACCT108 None
ACCT151 None
ACCT152 ACCT151
ACCT307 ACCT151
ACCT309 ACCT307
ACCT310 ACCT151andFIN125
ACCT311 ACCT152andBIS111
ACCT315 ACCT152
ACCT316 ACCT315
ACCT317 ACCT316
ACCT318 ACCT316
ACCT320 (ACCT311orCSB311)and(ACCT315)
ACCT324 ACCT152
ACCT330 ECO045
ACCT340 ACCT151andFIN125
ACCT371 None
ACCT372 None
ACCT410 ACCT151orGBUS401orMBA451
ACCT432 GBUS401andMGT431
ACCT442 None
ACCT444 ACCT442
AAS003 None
AAS005 None
AAS025 None
AAS038 None
AAS066 None
AAS076 THTR066orAAS066
AAS091 None
AAS095 None
AAS102 None
AAS103 None
AAS106 None
AAS117 None
AAS121 None
AAS124 None
AAS125 None
AAS126 None
AAS128 None
AAS129 None
AAS130 None
AAS131 None
AAS132 None
AAS134 None
AAS138 None
AAS140 None
AAS144 None
AAS147 None
AAS155 None
AAS163 None
AAS164 None
AAS166 None
AAS176 None
AAS178 None
AAS179 None
AAS191 None
AAS205 None
AAS210 None
AAS230 None
AAS278 None
AAS291 None
AAS299 None
AAS305 None
AAS311 None
AAS312 None
AAS313 None
AAS314 None
AAS317 None
AAS318 None
AAS320 None
AAS32

In [6]:
# Hand-written prerequisite expressions that replace (or add) the entries
# for these specific courses
course_prereqs_dict.update({
    'ISE372': "(ISE220orIE220orISE230orIE230)and(ISE220orIE220orISE240orIE240)and(ISE275orIE275)",
    'CSE323': "(MATH205orMATH242orDSCI311)and(MATH205orMATH242orDSCI321)and(MATH231orECO045orDSCI311)and(MATH231orECO045orDSCI321)and(CSE017orDSCI311)and(CSE017orDSCI321)",
    'CHM372': "(BIOS473orBIOS371orCHM371)and(BIOS473orBIOS041)",
    'BIOS372': "(BIOS473orBIOS371orCHM371)and(BIOS473orBIOS041)",
    'ENGL011': None,
    'CSE371': "(CSE109)and(CSE202orECE201)",
    'EES318': "(EES080orES319)and(EES115orEES152orES319)",
    'EES325': "(EES080orES319)and(EES115orEES152orEES131orES319)",
    'ECO146': "(ECO001)and(MATH021orMATH031orMATH051orMATH081orMATH075)and(MATH021orMATH031orMATH051orMATH081orMATH076)",
    'ECO333': "(ECO146)and(ECO045orMATH012orMATH231orISE111)and(MATH021orMATH031orMATH051orMATH081orMATH075)and(MATH021orMATH031orMATH051orMATH081orMATH076)",
    'ECO358': "(ECO146)and(MATH021orMATH031orMATH051orMATH081orMATH075)and(MATH021orMATH031orMATH051orMATH081orMATH076)",
    'EPI305': "(EPI104)and(BSTA133orBSTA103)and(BSTA133orBSTA104)",
    'MGT350': "(MGT321)and(MGT333orSCM328orSCM340orSCM342orSCM309orSCM354orMGT311orBIS311)and(MGT333orSCM328orSCM340orSCM342orSCM309orSCM354orMGT311orBIS324)and(MGT333orSCM328orSCM340orSCM342orSCM309orSCM354orMGT306orBIS311)and(MGT333orSCM328orSCM340orSCM342orSCM309orSCM354orMGT306orBIS324)",
    'MATH310': "(MATH263orMATH309orMATH231)and(MATH263orMATH309orMATH205orMATH241)",
})

In [7]:
# Print every course code with its raw prerequisite string, one per line,
# to review the mapping after the manual overrides
for key, value in course_prereqs_dict.items():
    print(key, value)

ACCT001 None
ACCT108 None
ACCT151 None
ACCT152 ACCT151
ACCT307 ACCT151
ACCT309 ACCT307
ACCT310 ACCT151andFIN125
ACCT311 ACCT152andBIS111
ACCT315 ACCT152
ACCT316 ACCT315
ACCT317 ACCT316
ACCT318 ACCT316
ACCT320 (ACCT311orCSB311)and(ACCT315)
ACCT324 ACCT152
ACCT330 ECO045
ACCT340 ACCT151andFIN125
ACCT371 None
ACCT372 None
ACCT410 ACCT151orGBUS401orMBA451
ACCT432 GBUS401andMGT431
ACCT442 None
ACCT444 ACCT442
AAS003 None
AAS005 None
AAS025 None
AAS038 None
AAS066 None
AAS076 THTR066orAAS066
AAS091 None
AAS095 None
AAS102 None
AAS103 None
AAS106 None
AAS117 None
AAS121 None
AAS124 None
AAS125 None
AAS126 None
AAS128 None
AAS129 None
AAS130 None
AAS131 None
AAS132 None
AAS134 None
AAS138 None
AAS140 None
AAS144 None
AAS147 None
AAS155 None
AAS163 None
AAS164 None
AAS166 None
AAS176 None
AAS178 None
AAS179 None
AAS191 None
AAS205 None
AAS210 None
AAS230 None
AAS278 None
AAS291 None
AAS299 None
AAS305 None
AAS311 None
AAS312 None
AAS313 None
AAS314 None
AAS317 None
AAS318 None
AAS320 None
AAS32

In [70]:
# Spot-check the prerequisite expression stored for a single course
print(course_prereqs_dict["CSE325"])

(MATH231orECO045orISE121)andCSE017and(MATH205orMATH241orMATH242)and(CSE160orCSE326orCSE327orMATH365orECE414orISE364orISE365orISE367)


In [13]:
import re

def parse_prerequisites(prereq_string):
    """Turn a compact prerequisite expression into nested lists.

    Parentheses and spaces are discarded, the remainder is split on the
    literal ``and``, and each resulting piece is split on the literal ``or``.

    Args:
        prereq_string: Expression such as ``"(A101orB102)and(C103)"``, or
            ``None``.

    Returns:
        ``None`` when the input is ``None``; otherwise a list with one inner
        list per ``and``-separated piece, each holding that piece's non-empty
        ``or``-separated entries.
    """
    if prereq_string is None:
        return None

    # Grouping is ignored: parentheses and spaces are simply dropped
    flattened = re.sub(r'[() ]', '', prereq_string)

    return [
        [option.strip() for option in re.split(r'or', group) if option]
        for group in re.split(r'and', flattened)
    ]

# Process each course: convert the raw prerequisite string into nested lists
# (outer list from splitting on 'and', inner lists from splitting on 'or');
# courses whose prerequisite value is None keep None
parsed_data = {}
for course, prereqs in course_prereqs_dict.items():
    if prereqs is None:
        parsed_data[course] = None
        continue
    parsed_prereqs = parse_prerequisites(prereqs)
    # NOTE: a falsy parse result leaves the course out of parsed_data entirely
    if parsed_prereqs:
        parsed_data[course] = parsed_prereqs

# Print the parsed structure for a visual check
for key, value in parsed_data.items():
    print(key, value)

import json
# Save the data as a JSON file in the working directory
# (None values are serialized as JSON null)
with open('course_prereqs.json', 'w') as file:
    json.dump(parsed_data, file, indent=4)

ACCT001 None
ACCT108 None
ACCT151 None
ACCT152 [['ACCT151']]
ACCT307 [['ACCT151']]
ACCT309 [['ACCT307']]
ACCT310 [['ACCT151'], ['FIN125']]
ACCT311 [['ACCT152'], ['BIS111']]
ACCT315 [['ACCT152']]
ACCT316 [['ACCT315']]
ACCT317 [['ACCT316']]
ACCT318 [['ACCT316']]
ACCT320 [['ACCT311', 'CSB311'], ['ACCT315']]
ACCT324 [['ACCT152']]
ACCT330 [['ECO045']]
ACCT340 [['ACCT151'], ['FIN125']]
ACCT371 None
ACCT372 None
ACCT410 [['ACCT151', 'GBUS401', 'MBA451']]
ACCT432 [['GBUS401'], ['MGT431']]
ACCT442 None
ACCT444 [['ACCT442']]
AAS003 None
AAS005 None
AAS025 None
AAS038 None
AAS066 None
AAS076 [['THTR066', 'AAS066']]
AAS091 None
AAS095 None
AAS102 None
AAS103 None
AAS106 None
AAS117 None
AAS121 None
AAS124 None
AAS125 None
AAS126 None
AAS128 None
AAS129 None
AAS130 None
AAS131 None
AAS132 None
AAS134 None
AAS138 None
AAS140 None
AAS144 None
AAS147 None
AAS155 None
AAS163 None
AAS164 None
AAS166 None
AAS176 None
AAS178 None
AAS179 None
AAS191 None
AAS205 None
AAS210 None
AAS230 None
AAS278 None
AAS2