In [9]:
import httpx

url = "https://bulletin.engin.umich.edu/courses/eecs/"
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
}
response = httpx.get(url,headers=headers)
# print(response.text) 
# This line is commented out to avoid excessive output for the notebook cell. 
# Uncomment this line during production stage to look through the html structure. 
# Or even better, use developer tools of web browser to inspect webpages directly.


First take a look of selecting all the p tags under "\<div class="entry-content">" and printing their cotent out in full. 

In [10]:
from bs4 import BeautifulSoup

parser = BeautifulSoup(response, 'html.parser')

div = parser.find('div', class_= "entry-content")

p_tags = div.find_all('p')[1:] # through inspection the first tag is irrelevant.

print(p_tags[0])
#for p in p_tags:
    #print(p)

<p><strong>EECS 110. Discover Computer Science </strong><br/><em>Enforced Prerequisite: None. (2 credits).</em><br/>Introduction to basic CS concepts (variables, conditionals, loops, functions) using an introductory programming language, such as Python. Students interact with researchers and computing professionals to learn about real-world, interdisciplinary applications of CS. Intended for students without prior programming experience to (optionally) take prior to EECS 183 or ENGR 101. <a href="https://atlas.ai.umich.edu/course/EECS%20110/">CourseProfile (ATLAS)</a> </p>


Print a clean version

In [11]:
print(p_tags[0].getText())
#for p in p_tags:
    #print(p.getText())

EECS 110. Discover Computer Science Enforced Prerequisite: None. (2 credits).Introduction to basic CS concepts (variables, conditionals, loops, functions) using an introductory programming language, such as Python. Students interact with researchers and computing professionals to learn about real-world, interdisciplinary applications of CS. Intended for students without prior programming experience to (optionally) take prior to EECS 183 or ENGR 101. CourseProfile (ATLAS) 


As described in our guide, we only need the informations under the "strong", "em", and "a" tag. For "a" tag, we need its "href" attribute for a web url (instead of the literal words "CourseProfile (ATLAS)" )

In [12]:
course = []
prereq = []
link = []

for p in p_tags:
    strong, em, a = p.find('strong'), p.find('em'),  p.find('a')  
    if strong is not None: course.append(strong.getText())
    if em is not None: prereq.append(em.getText())
    if a is not None: link.append(a['href']) # Get the href attribute, not literal text

print(course[0])
print(prereq[0])
print(link[0])

EECS 110. Discover Computer Science 
Enforced Prerequisite: None. (2 credits).
https://atlas.ai.umich.edu/course/EECS%20110/


Some of the courses have the character \xa0, a variant of a standard space chraracter that prevents an automatic link break at its position. We need to strip it. 

In [13]:
course = [c .replace('\xa0', '') for c in course]
print(course)

['EECS 110. Discover Computer Science', 'EECS 180. Exam/Transfer Introductory Computer Programming Credit', 'EECS 183. Elementary Programming Concepts', 'EECS 198. Special Topics', 'EECS 200. Electrical Engineering Systems Design I', 'EECS 201. Computer Science Pragmatics', 'EECS 203. Discrete Mathematics', 'EECS 215. Introduction to Electronic Circuits', 'EECS 216. Introduction to Signals and Systems', 'EECS 230. Electromagnetics I', 'EECS 250 (NAVSCI 202). Electronic Sensing Systems', 'EECS 270. Introduction to Logic Design', 'EECS 280. Programming and Introductory Data Structures', 'EECS 281. Data Structures and Algorithms', 'EECS 285. Practical Programming in Java', 'EECS 298. Special Topics', 'EECS 300. Electrical Engineering Systems Design II', 'EECS 301. Probabilistic Methods in Engineering', 'EECS 311. Analog Circuits', 'EECS 312. Digital Integrated Circuits', 'EECS 314. Electrical Circuits, Systems, and Applications', 'EECS 320. Introduction to Semiconductor Devices', 'EECS 33

Use regex to process the data.
1. Seperate course code and course name.
2. Extract credit information
3. Put course code, course name, credit information, link, and prereq to seperate attributes

| CourseCode    | CourseName |    Credit       | Link  | Note     |
|---------|-----|---------------|-------------|-------------|

In [14]:
import re
data = []
for c, p, l in zip(course, prereq, link):
    data_entry = {}
    course_split = c.split(".", 1) # Split at first occurance of a period
    if len(course_split) != 2: continue # Skip instances where there is bad course format. 
    data_entry["CourseCode"] = course_split[0]
    data_entry["CourseName"]= course_split[1].lstrip(" ") # Remove leading space.

    credit = re.findall(r'\((\d)+\s+credits\)', p)
    """
    the regular expression \((\d+)\s+credits\) matches any sequence of digits (\d+)
    that are directly after a parenthesis and followed by one or more spaces (\s+)
    and the word "credits". The parentheses around \d+ create a group that 
    findall() returns as a list.
    """
    data_entry["Credit"] = " ".join(credit)
    data_entry["Link"] = l
    data_entry["Note"] = p
    data.append(data_entry)

for i in range(5):
    print(data[i])

{'CourseCode': 'EECS 110', 'CourseName': 'Discover Computer Science', 'Credit': '2', 'Link': 'https://atlas.ai.umich.edu/course/EECS%20110/', 'Note': 'Enforced Prerequisite: None. (2 credits).'}
{'CourseCode': 'EECS 180', 'CourseName': 'Exam/Transfer Introductory Computer Programming Credit', 'Credit': '', 'Link': 'https://atlas.ai.umich.edu/course/EECS%20180/', 'Note': 'Cannot receive credit if student has credit for EECS 183 or ENGR 101 or ENGR 151 (3-4 credits).'}
{'CourseCode': 'EECS 183', 'CourseName': 'Elementary Programming Concepts', 'Credit': '4', 'Link': 'https://atlas.ai.umich.edu/course/EECS%20183/', 'Note': 'Prerequisite: None. (Credit for only one: EECS 180, EECS 183, ENGR 101 or ENGR 151) (4 credits)'}
{'CourseCode': 'EECS 198', 'CourseName': 'Special Topics', 'Credit': '', 'Link': 'https://atlas.ai.umich.edu/course/EECS%20198/', 'Note': 'Advisory Prerequisite: Permission of instructor. (1-4 credits)'}
{'CourseCode': 'EECS 200', 'CourseName': 'Electrical Engineering Syst

Now, lets put our data into a CSV file. 

In [15]:
import pandas as pd
df = pd.DataFrame(data)
df.to_csv('eecs_course.csv', index=False)