In [2]:
import os
import re
import time

import pandas
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.common.by import By

# 1. Obtain list of scraped core and elective courses. The aim here is to scrape any courses in the "Elective Streams" that are not included in the already scraped courses

In [3]:
# Read the CSV produced by the earlier core/standard-elective scrape and keep
# its "Course Number" column as a plain list of course codes.
scraped_courses_df = pandas.read_csv(
    "UAlberta_MechEng_Core_and_standard_Electives_Courses.csv"
)
scraped_courses = scraped_courses_df["Course Number"].tolist()
len(scraped_courses)

103

In [4]:
# Show the already-scraped course codes for a visual check.
scraped_courses

['CIV E 270',
 'ENGG 299',
 'MATH 209',
 'MEC E 230',
 'MEC E 260',
 'MEC E 265',
 'STAT 235',
 'CH E 243',
 'ECE 209',
 'MAT E 202',
 'MATH 201',
 'MEC E 200',
 'MEC E 250',
 'WKEXP 901',
 'WKEXP 902',
 'MATH 300',
 'MEC E 300',
 'MEC E 301',
 'MEC E 331',
 'MEC E 371',
 'MEC E 380',
 'ENG M 310',
 'ENG M 401',
 'MEC E 340',
 'MEC E 360',
 'MEC E 362',
 'MEC E 390',
 'WKEXP 903',
 'WKEXP 904',
 'ENGG 404',
 'MEC E 430',
 'MEC E 480',
 'MEC E 463',
 'WKEXP 905',
 'CH E 448',
 'MEC E 420',
 'ENGG 400',
 'MEC E 403',
 'MEC E 451',
 'MEC E 460',
 'MEC E 467',
 'MEC E 468',
 'MEC E 539',
 'MEC E 563',
 'ACCTG 300',
 'ACCTG 311',
 'B LAW 301',
 'B LAW 422',
 'B LAW 444',
 'BIOCH 200',
 'BIOL 107',
 'BIOL 108',
 'BIOL 207',
 'BME 320',
 'BME 321',
 'BME 513',
 'BME 553',
 'BME 564',
 'CH E 582',
 'CHEM 261',
 'CHEM 263',
 'ECE 405',
 'ECE 440',
 'ECE 449',
 'ENGG 406',
 'ENGG 420',
 'ENG M 402',
 'ENG M 408',
 'ENG M 501',
 'ENG M 508',
 'ENG M 514',
 'ENG M 516',
 'ENG M 530',
 'ENG M 540',

# 2. Collect course links of the elective stream courses

In [5]:
# UAlberta calendar program page that the course links are collected from.
url = "https://calendar.ualberta.ca/preview_program.php?catoid=34&poid=38711"

# Run Chrome without a window, in incognito mode, ignoring certificate errors.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument('--incognito')
chrome_options.add_argument('--headless')

# The chromedriver location is machine-specific: let the CHROMEDRIVER_PATH
# environment variable override it, falling back to the original local path
# so existing runs behave exactly as before.
chromedriver_path = os.environ.get(
    "CHROMEDRIVER_PATH", "C:\\Users\\jerry\\Downloads\\chromedriver"
)
driver = webdriver.Chrome(executable_path=chromedriver_path, options=chrome_options)

In [6]:
# Load the page at `url` in the Selenium-driven browser.
driver.get(url)

In [7]:
# Parse the HTML currently rendered by the browser with the lxml parser.
rendered_html = driver.page_source
page_soup = soup(rendered_html, features='lxml')

In [8]:
# The div with class "custom_leftpad_20" contains all courses in the page.
all_container = page_soup.find("div", class_="custom_leftpad_20")

In [9]:
# One <li class="acalog-course"> element per course entry inside the wrapper.
containers = all_container.find_all("li", class_="acalog-course")

In [12]:
# Collect the visible text of the first <a> in each course entry
# (of the form "<course code> - <course name>"), trimmed of whitespace.
link_texts = []
for course_item in containers:
    anchor = course_item.find("a")
    link_texts.append(anchor.text.strip())
link_texts

['CIV E 270 - Mechanics of Deformable Bodies I',
 'ENGG 299 - Orientation to Cooperative Education',
 'MATH 209 - Calculus III',
 'MEC E 230 - Introduction to Thermo-Fluid Sciences',
 'MEC E 260 - Mechanical Design I',
 'MEC E 265 - Engineering Graphics and CAD',
 'STAT 235 - Introductory Statistics for Engineering',
 'CH E 243 - Engineering Thermodynamics',
 'ECE 209 - Fundamentals of Electrical Engineering',
 'MAT E 202 - Materials Science II',
 'MATH 201 - Differential Equations',
 'MEC E 200 - Introduction to Mechanical Engineering',
 'MEC E 250 - Engineering Mechanics II',
 'WKEXP 901 - Engineering Work Experience I',
 'WKEXP 902 - Engineering Work Experience II',
 'MATH 300 - Advanced Boundary Value Problems I',
 'MEC E 300 - Mechanical Measurements',
 'MEC E 301 - Mechanical Engineering Laboratory I',
 'MEC E 331 - Fluid Mechanics I',
 'MEC E 371 - Heat Transfer',
 'MEC E 380 - Advanced Strength of Materials I',
 'ENG M 310 - Engineering Economy',
 'ENG M 401 - Financial Managem

# 3. Open any courses that haven't been scraped

In [None]:
# Click the link of every listed course whose code is not in `scraped_courses`,
# so its details get expanded on the page for scraping.
already_scraped = set(scraped_courses)  # set gives constant-time membership tests

# `link_texts` contains repeated entries, so visit each distinct link text only
# once (first-seen order kept). A second click on the same link would presumably
# toggle the entry closed again -- TODO confirm against the page behaviour.
for link_text in dict.fromkeys(link_texts):
    course_code = link_text.split(" - ")[0]
    if course_code in already_scraped:
        continue
    # find_element(By.LINK_TEXT, ...) replaces find_element_by_link_text, which
    # newer Selenium releases no longer provide; it works on older ones too.
    link = driver.find_element(By.LINK_TEXT, link_text)
    link.click()
    time.sleep(3)  # fixed pause after the click before moving on to the next link
    print("opened ", course_code)

In [14]:
# Number of course links collected from the page (repeats included).
len(link_texts)

152

In [15]:
# Number of course codes already scraped, for comparison with the link count.
len(scraped_courses)

103

In [16]:
# Count the distinct link texts: dict keys are unique, so repeats collapse.
distinct_link_texts = dict.fromkeys(link_texts)
len(distinct_link_texts)

104

# 4. There's only one course that hasn't been scraped. Obtain the updated page HTML and scrape it

In [21]:
# Re-parse the browser's current HTML so it reflects the entries opened above.
page_soup = soup(driver.page_source, features='lxml')
# Contains all open courses in the page.
containers = page_soup.find_all("li", class_="acalog-course acalog-course-open")
print(len(containers))

1


In [22]:
# For each open course entry, take the text of its first <div> that has no
# class attribute and trim surrounding whitespace.
course_descs = []
for open_course in containers:
    detail_div = open_course.find("div", {"class": None})
    course_descs.append(detail_div.text.strip())
course_descs

['MEC E 569 - Mechanics and Design of Composite Materials  ★ 3 (fi 6) (either term or Spring/Summer, 3-0-0) Introduction to composite materials. Mechanical characterization and strength theories of a lamina. Micro-mechanical analysis of a lamina. Macro-mechanical analysis of laminates. Failure analysis and design of laminates. Prerequisite: MEC E 380.']

In [24]:
# Matches the "(fi 6) (either term or Spring/Summer, 3-0-0)" part that sits
# between the course title and the description text.
# - Raw string: "\(" in a normal string literal is an invalid escape sequence.
# - The hyphen is escaped so the class holds a literal "-" instead of the
#   accidental range from " " to "/", which also admitted "(" and ")".
_COURSE_HEADER_RE = re.compile(r"\([A-Za-z0-9 ]{4}\) \([A-Za-z0-9, \-/.]{10,45}\)")


def _description_body(desc):
    """Return the text that follows the course header in ``desc``.

    Only the first header match is used as the split point, so everything
    after it is kept. If ``desc`` contains no header (for example because this
    cell has already been run once), ``desc`` is returned unchanged instead of
    raising ``IndexError``.
    """
    parts = _COURSE_HEADER_RE.split(desc, maxsplit=1)
    return parts[1] if len(parts) > 1 else desc


course_descs = [_description_body(desc) for desc in course_descs]
course_descs

[' Introduction to composite materials. Mechanical characterization and strength theories of a lamina. Micro-mechanical analysis of a lamina. Macro-mechanical analysis of laminates. Failure analysis and design of laminates. Prerequisite: MEC E 380.']

In [25]:
# Trim the leading/trailing whitespace left over from the header split.
course_descs = list(map(str.strip, course_descs))
course_descs

['Introduction to composite materials. Mechanical characterization and strength theories of a lamina. Micro-mechanical analysis of a lamina. Macro-mechanical analysis of laminates. Failure analysis and design of laminates. Prerequisite: MEC E 380.']

In [26]:
# Link texts ("<code> - <name>") of the courses that were not scraped before.
# `link_texts` contains repeated entries, so go through the distinct texts
# (first-seen order) to get one title per course; a repeated unscraped course
# would otherwise produce more titles than scraped descriptions.
scraped_codes = set(scraped_courses)
course_titles = [
    link_text
    for link_text in dict.fromkeys(link_texts)
    if link_text.split(" - ")[0] not in scraped_codes
]

In [27]:
# Show the titles of the newly scraped courses.
course_titles

['MEC E 569 - Mechanics and Design of Composite Materials']

In [28]:
# Course name = everything after the first " - " in the title. Splitting only
# once keeps names that themselves contain " - " intact instead of cutting
# them at the second separator.
course_names = [title.split(" - ", 1)[1] for title in course_titles]
course_names

['Mechanics and Design of Composite Materials']

In [29]:
# Course code = the part of the title before the first " - " separator.
course_codes = [title.partition(" - ")[0] for title in course_titles]
course_codes

['MEC E 569']

# 5. Write to CSV

In [30]:
import pandas as pd

# One row per newly scraped course; the three lists supply the columns in
# this order: code, name, description.
stream_elective_columns = {
    "Course Number": course_codes,
    "Course Name": course_names,
    "Course Description": course_descs,
}
df = pd.DataFrame(stream_elective_columns)

df

Unnamed: 0,Course Number,Course Name,Course Description
0,MEC E 569,Mechanics and Design of Composite Materials,Introduction to composite materials. Mechanica...


In [31]:
# Save the newly scraped courses to CSV, leaving out the DataFrame index.
output_csv = 'UAlberta_MechEng_stream_specific_electives_Courses.csv'
df.to_csv(output_csv, index=False)

In [32]:
# Shut down the browser session now that scraping is finished.
driver.quit()