<a href="https://colab.research.google.com/github/Kiron-Ang/DSC/blob/main/web_scraping_schedule_of_classes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import the standard-library helpers for CSV output and URL percent-encoding
import csv
from urllib.parse import quote

# Import the requests library to send HTTP requests and handle responses
import requests

# Name of the CSV file where the schedule data will be stored
OUTPUT_PATH = "baylor_scheduleofclasses_spring_2025.csv"

# Column names written as the first row of the CSV file
CSV_HEADER = ["prefix", "course", "section", "max_enroll", "seats_avail", "waitlist"]

# URL of the Baylor schedule of classes landing page (holds the prefix dropdown)
INDEX_URL = "https://www1.baylor.edu/scheduleofclasses"

# URL of the results page that lists the sections for one prefix
RESULTS_URL = "https://www1.baylor.edu/scheduleofclasses/Results.aspx"

# Term code sent in the "Term" query parameter of every results request
TERM_CODE = "202510"

# Seconds to wait for the server before giving up on a request, so a stalled
# connection cannot hang the scrape forever
REQUEST_TIMEOUT_SECONDS = 30


def text_between(line, opening, closing):
    """Return the text between ``opening`` and the next ``closing`` in ``line``.

    Args:
        line: The line of HTML to search.
        opening: Marker that precedes the wanted text.
        closing: Marker that follows the wanted text.

    Returns:
        The enclosed text, or None when either marker is missing. Returning
        None (rather than slicing with the -1 that ``str.find`` reports)
        keeps callers from storing a garbage slice of the line.
    """
    start_index = line.find(opening)
    if start_index == -1:
        return None
    start_index += len(opening)
    end_index = line.find(closing, start_index)
    if end_index == -1:
        return None
    return line[start_index:end_index]


def digits_in(line):
    """Return every digit character of ``line`` joined into one string."""
    return "".join(char for char in line if char.isdigit())


def extract_prefixes(lines):
    """Collect the course prefixes listed in the prefix dropdown.

    Args:
        lines: The lines of the landing page HTML.

    Returns:
        A list with the ``value`` attribute of every line found after the
        'Any Prefix' option and before the line containing "/select".
        Lines in that range without a ``value="..."`` attribute are skipped.
    """
    prefixes = []
    in_prefix_list = False

    for line in lines:
        # A line containing "/select" marks the end of the prefix list
        if "/select" in line:
            in_prefix_list = False

        # Inside the list, take the option's value attribute if it has one
        if in_prefix_list:
            value = text_between(line, 'value="', '"')
            if value is not None:
                prefixes.append(value)

        # The 'Any Prefix' option is the line just before the real prefixes
        if '<option value="Z">Any Prefix</option>' in line:
            in_prefix_list = True

    return prefixes


def build_results_url(prefix, term=TERM_CODE):
    """Build the results page URL for one course prefix.

    Args:
        prefix: Course prefix to query (e.g., 'ACC').
        term: Term code for the "Term" parameter; defaults to TERM_CODE.

    Returns:
        The full URL. The prefix and term are percent-encoded so characters
        such as '&' or spaces cannot break the query string.
    """
    encoded_term = quote(str(term), safe="")
    encoded_prefix = quote(prefix, safe="")
    url = f"{RESULTS_URL}?"
    url += f"Term={encoded_term}&College=Z&Prefix={encoded_prefix}&StartCN=Z&EndCN=5000&"
    url += "Status=Z&Days=Z&Instructor=&IsMini=false&OnlineOnly=0&"
    url += "POTerm=Z&CourseAttr=Z&Sort=SN"
    return url


def parse_sections(lines):
    """Extract one row per section from the lines of a results page.

    A row is produced each time a line containing the waitlist cell class
    ("col-sm-1 column-lg hidden-xs") is seen, using the most recent course,
    section, max enrollment and seats available values found on this page.
    Fields not seen yet on the page are empty strings.

    Args:
        lines: The lines of the results page HTML.

    Returns:
        A list of (prefix, course, section, max_enroll, seats_avail,
        waitlist) tuples of strings.
    """
    rows = []

    # All state is local to this page so nothing leaks from one prefix's
    # page into the next
    prefix = course = section = max_enroll = seats_avail = ""
    next_is_section = False
    next_is_max_enroll = False
    next_is_seats_avail = False

    for line in lines:
        # Course lines carry "PREFIX NUMBER" inside <strong> tags
        if "col-md-2" in line:
            heading = text_between(line, "<strong>", "</strong>")
            parts = heading.split() if heading is not None else []
            # Only accept headings that really hold both pieces
            if len(parts) >= 2:
                prefix, course = parts[0], parts[1]

        # The line after "Section:" holds the section inside <strong> tags
        if next_is_section:
            next_is_section = False
            section = text_between(line, "<strong>", "</strong>") or ""

        if "Section:" in line:
            next_is_section = True

        # The line after "Max Enroll:" holds the maximum enrollment number
        if next_is_max_enroll:
            next_is_max_enroll = False
            max_enroll = digits_in(line)

        if "Max Enroll:" in line:
            next_is_max_enroll = True

        # The line after "Seats Avail" holds the available seats number
        if next_is_seats_avail:
            next_is_seats_avail = False
            seats_avail = digits_in(line)

        if "Seats Avail" in line:
            next_is_seats_avail = True

        # The waitlist cell is the last field of a section, so emit the row
        if "col-sm-1 column-lg hidden-xs" in line:
            start_index = line.find(">") + 1  # Start extracting after the '>'
            end_index = line.find("<", start_index)  # Stop before the next '<'
            # With no closing '<', keep the rest of the line instead of
            # letting the -1 slice silently drop its last character
            if end_index == -1:
                waitlist = line[start_index:]
            else:
                waitlist = line[start_index:end_index]
            rows.append((prefix, course, section, max_enroll, seats_avail, waitlist))

    return rows


def fetch_lines(url, timeout=REQUEST_TIMEOUT_SECONDS):
    """Download ``url`` and return the response body split into lines.

    Args:
        url: The address to fetch with a GET request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text as a list of lines.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status. Failing loudly avoids
            silently parsing an error page as "no classes".
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text.splitlines()


def scrape_schedule(output_path=OUTPUT_PATH, term=TERM_CODE):
    """Scrape every prefix's results page and write the rows to a CSV file.

    Args:
        output_path: Path of the CSV file to create or overwrite.
        term: Term code to request; defaults to TERM_CODE.

    Raises:
        requests.RequestException: If any page cannot be downloaded.
    """
    # Fetch the landing page and read the course prefixes from its dropdown
    prefix_list = extract_prefixes(fetch_lines(INDEX_URL))

    # The with-block closes the file even if a later request or parse fails.
    # newline="" plus lineterminator="\n" keeps plain "\n" row endings.
    with open(output_path, "w", newline="", encoding="utf-8") as data:
        # csv.writer quotes any field containing a comma or quote character
        writer = csv.writer(data, lineterminator="\n")
        writer.writerow(CSV_HEADER)

        # For each prefix, fetch its results page and write its sections
        for prefix in prefix_list:
            page_lines = fetch_lines(build_results_url(prefix, term))
            writer.writerows(parse_sections(page_lines))


# Run the scrape when executed as a script or as a notebook cell (where
# __name__ is "__main__"), but not when these helpers are imported elsewhere
if __name__ == "__main__":
    scrape_schedule()

In [2]:
# Import the pandas library, which provides data manipulation and analysis tools
import pandas

# Load the scraped schedule from the CSV file into a DataFrame
data = pandas.read_csv(
    filepath_or_buffer="baylor_scheduleofclasses_spring_2025.csv",
)

# Leave the DataFrame as the final expression so the loaded data is displayed
data

Unnamed: 0,prefix,course,section,max_enroll,seats_avail,waitlist
0,ACC,2301,01,50,0,6.0
1,ACC,2301,02,50,0,2.0
2,ACC,2301,03,50,0,1.0
3,ACC,2301,W1,40,1,5.0
4,ACC,2303,01,200,46,0.0
...,...,...,...,...,...,...
4609,THEA,4V9R,01,19,15,0.0
4610,THEA,4VC5,01,15,14,0.0
4611,THEA,4VC5,02,15,14,0.0
4612,UNSC,3001,U1,65,61,0.0
