<a href="https://colab.research.google.com/github/Kiron-Ang/DSC/blob/main/web_scraping_schedule_of_classes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# Check the Python version using the command line
!python -V

# Importing the 'csv' library to write correctly quoted CSV rows
import csv

# Importing the 'requests' library to send HTTP requests
import requests

# List of course prefixes to be iterated over (e.g., "BINF", "BME")
prefix_list = ["BINF", "BME", "CSI", "CSF", "DSC", "ELC", "EGR", "ME"]

# Column headers written as the first row of 'data.csv'
CSV_HEADER = [
    "prefix",
    "course",
    "section",
    "max_enroll",
    "seats_avail",
    "waitlist",
]

# Seconds to wait for the server before giving up on a request; without a
# timeout 'requests' may block forever on an unresponsive host
REQUEST_TIMEOUT = 30


def build_url(prefix, term="202510"):
    """Return the schedule-of-classes results URL for one course prefix.

    Args:
        prefix: Course prefix inserted into the ``Prefix`` query parameter
            (e.g. "DSC").
        term: Value inserted into the ``Term`` query parameter.

    Returns:
        The full request URL as a string.
    """
    return (
        "https://www1.baylor.edu/scheduleofclasses/Results.aspx?"
        f"Term={term}&College=Z&Prefix={prefix}&StartCN=Z&EndCN=5000&"
        "Status=Z&Days=Z&Instructor=&IsMini=false&OnlineOnly=0&"
        "POTerm=Z&CourseAttr=Z&Sort=SN"
    )


def _strong_text(line):
    """Return the text between the first <strong> and </strong> in a line."""
    start_index = line.find("<strong>") + len("<strong>")
    end_index = line.find("</strong>")
    return line[start_index:end_index]


def _digits(line):
    """Return only the digit characters of a line, in order."""
    return "".join(char for char in line if char.isdigit())


def parse_schedule_rows(lines, default_prefix=""):
    """Yield one row per waitlist line found in the HTML lines of a page.

    The page is scanned line by line. Label lines ("Section:",
    "Max Enroll:", "Seats Avail") arm a flag so that the value is read from
    the line that follows the label. A row is emitted each time a line with
    the class "col-sm-1 column-lg hidden-xs" is seen, using the most recently
    parsed values; values are intentionally carried over from row to row.
    NOTE(review): the exact page layout is not visible here; the markers are
    the ones the original script relied on.

    Args:
        lines: Iterable of HTML source lines for a single results page.
        default_prefix: Prefix reported until a course line supplies one.

    Yields:
        Tuples ``(prefix, course, section, max_enroll, seats_avail,
        waitlist)`` of strings.
    """
    # All state is local to this call, so nothing leaks from one page into
    # the next, and fields start as "" instead of being undefined.
    prefix = default_prefix
    course = section = max_enroll = seats_avail = ""
    next_is_section = False
    next_is_max_enroll = False
    next_is_seats_avail = False

    for line in lines:
        # Course line (identified by the class 'col-md-2'): "<PREFIX> <NUMBER>"
        if "col-md-2" in line:
            prefix_course = _strong_text(line).split(" ")
            # Skip lines that do not hold both parts rather than crashing
            if len(prefix_course) >= 2:
                prefix, course = prefix_course[0], prefix_course[1]

        # Each flag is consumed BEFORE it can be re-armed, so a label always
        # refers to the line after it, never to the label line itself.
        if next_is_section:
            next_is_section = False
            section = _strong_text(line)
        if "Section:" in line:
            next_is_section = True

        if next_is_max_enroll:
            next_is_max_enroll = False
            max_enroll = _digits(line)
        if "Max Enroll:" in line:
            next_is_max_enroll = True

        if next_is_seats_avail:
            next_is_seats_avail = False
            seats_avail = _digits(line)
        if "Seats Avail" in line:
            next_is_seats_avail = True

        # Waitlist line: its text sits between the first '>' and the next '<'
        if "col-sm-1 column-lg hidden-xs" in line:
            start_index = line.find(">") + 1
            end_index = line.find("<", start_index)
            waitlist = line[start_index:end_index]
            yield (prefix, course, section, max_enroll, seats_avail, waitlist)


def main():
    """Fetch the schedule page for every prefix and write 'data.csv'.

    Raises:
        requests.RequestException: If a request times out, fails, or returns
            an HTTP error status. The file is still closed in that case.
    """
    # The context manager closes the file even if a request or parse fails
    with open("data.csv", "w", newline="") as data:
        # '\n' keeps the line endings of the original output; csv.writer
        # quotes any value that contains a comma or a quote character
        writer = csv.writer(data, lineterminator="\n")
        writer.writerow(CSV_HEADER)

        for prefix in prefix_list:
            response = requests.get(build_url(prefix), timeout=REQUEST_TIMEOUT)
            # Fail loudly instead of silently parsing an error page
            response.raise_for_status()
            html_lines = response.text.splitlines()
            writer.writerows(parse_schedule_rows(html_lines, prefix))


# Still runs when the notebook cell is executed (there __name__ is
# "__main__"), but lets the helpers be imported without touching the network
if __name__ == "__main__":
    main()

Python 3.10.12
