<a href="https://colab.research.google.com/github/Kiron-Ang/DSC/blob/main/web_scraping_schedule_of_classes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Check the Python version using the command line
!python -V

# Importing the 'requests' library to send HTTP requests
import requests

# The csv module takes care of quoting and row termination for the output file
import csv

# Seconds to wait on the server before giving up, so a stalled connection
# cannot hang the notebook indefinitely
REQUEST_TIMEOUT = 30

# Column headers for the CSV file (same columns as the original header line)
CSV_HEADER = ["course", "section", "max_enroll", "seats_avail", "waitlist",
              "days", "time"]

# List of course prefixes to be iterated over (e.g., "BINF", "BME")
prefix_list = ["BINF", "BME", "CSI", "CSF", "DSC", "ELC", "EGR", "ME"]


def text_in_strong(line):
  """Return the text between <strong> and </strong> in line.

  Returns an empty string when the line does not contain a complete
  <strong>...</strong> pair, instead of slicing at a bogus offset.
  """
  _, opening, rest = line.partition("<strong>")
  inner, closing, _ = rest.partition("</strong>")
  return inner if opening and closing else ""


def digits_in(line):
  """Return only the digit characters of line, joined in order."""
  return "".join(char for char in line if char.isdigit())


# Open 'data.csv' in a context manager so it is closed even if a request or
# the parsing raises part-way through
with open("data.csv", "w", newline="", encoding="utf-8") as data:
  writer = csv.writer(data, lineterminator="\n")

  # Write the column headers, terminated by a newline so rows start cleanly
  writer.writerow(CSV_HEADER)

  # Loop through each course prefix (e.g., BINF, BME)
  for prefix in prefix_list:
    # Construct the URL to request schedule data from Baylor University's website
    url = (
        "https://www1.baylor.edu/scheduleofclasses/Results.aspx?"
        f"Term=202510&College=Z&Prefix={prefix}&StartCN=Z&EndCN=5000&"
        "Status=Z&Days=Z&Instructor=&IsMini=false&OnlineOnly=0&"
        "POTerm=Z&CourseAttr=Z&Sort=SN"
    )

    # Send a GET request to fetch the HTML content of the page; fail loudly
    # on an HTTP error instead of silently parsing an error page
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()

    # Reset the per-page state so nothing carries over from the previous prefix
    course = section = max_enroll = seats_avail = ""
    next_is_section = False
    next_is_max_enroll = False
    next_is_seats_avail = False

    # Loop through each line in the HTML content
    for line in response.text.splitlines():

      # If the line contains course data (identified by the class 'col-md-2')
      if "col-md-2" in line:
        course = text_in_strong(line)
        print(course)  # Output the course name

      # The line after a "Section:" label holds the section number.
      # This check must stay before the label test below, otherwise the
      # label line itself would be consumed as the value.
      if next_is_section:
        next_is_section = False
        section = text_in_strong(line)
        print(section)  # Output the section number

      if "Section:" in line:
        next_is_section = True

      # The line after a "Max Enroll:" label holds the max enrollment
      if next_is_max_enroll:
        next_is_max_enroll = False
        max_enroll = digits_in(line)
        print(max_enroll)  # Output the max enrollment

      if "Max Enroll:" in line:
        next_is_max_enroll = True

      # The line after a "Seats Avail" label holds the available seats
      if next_is_seats_avail:
        next_is_seats_avail = False
        seats_avail = digits_in(line)
        print(seats_avail)  # Output the available seats

      if "Seats Avail" in line:
        next_is_seats_avail = True

      # If the line contains waitlist information (identified by a specific class)
      if "col-sm-1 column-lg hidden-xs" in line:
        # Extract the waitlist data by finding the text between '>' and '<'
        start_index = line.find('>') + 1
        end_index = line.find('<', start_index)
        waitlist = line[start_index:end_index]
        print(waitlist)  # Output the waitlist count

        # The waitlist is the last value printed for a section, so the row
        # is complete here. 'days' and 'time' are not extracted by this
        # script yet and are left blank to keep the columns aligned.
        writer.writerow(
            [course, section, max_enroll, seats_avail, waitlist, "", ""])

        # Clear the per-section values so a section with a missing field
        # does not inherit the previous section's numbers
        section = max_enroll = seats_avail = ""

Python 3.10.12
BINF 3360
01
18
12
0
BINF 43C9
01
6
5
0
BME 4360
01
19
12
0
BME 4372
01
12
1
0
BME 4374
01
24
4
0
CSI 1401
01
15
0
1
02
15
9
0
03
10
9
0
CSI 1402
01
19
6
0
02
19
2
0
03
15
15
0
CSI 1430
01
19
9
0
02
0
0
0
03
19
8
0
CSI 1440
01
25
0
3
02
25
0
9
CSI 2300
01
19
4
0
02
19
8
0
03
1
0
0
CSI 2334
01
19
9
0
CSI 2350
01
25
1
2
02
25
9
0
03
0
0
1
CSI 3303
01
38
16
0
CSI 3334
01
29
10
0
CSI 3335
01
22
6
0
CSI 3336
01
19
4
0
CSI 3344
01
19
3
0
02
19
1
0
CSI 3372
01
19
4
0
CSI 3374
01
58
1
0
CSI 3471
03
19
0
0
04
19
6
0
CSI 4010
02
19
19
0
CSI 4111
01
24
10
0
CSI 4321
01
30
0
0
CSI 4323
01
39
16
0
CSI 4325
01
8
2
0
CSI 4328
01
18
18
0
CSI 4330
01
39
0
0
CSI 4337
01
19
1
0
CSI 43C9
01
32
1
0
CSF 3001
01
1
1
0
CSF 3101
01
0
0
0
CSF 3102
01
1
1
0
CSF 4001
01
0
0
0
CSF 4302
01
0
0
0
CSF 4V01
01
0
0
0
DSC 2350
01
40
21
0
DSC 3310
01
15
5
0
DSC 3334
02
35
1
1
DSC 3335
01
19
5
0
02
19
0
0
DSC 3344
01
22
14
0
DSC 4310
01
36
1
0
DSC 4320
01
35
4
2
DSC 43C9
01
30
2
0
ELC 2130
01
15
0
0
03
15
0