Following https://realpython.com/beautiful-soup-web-scraper-python/
and https://www.scrapingbee.com/blog/selenium-python/


install webdriver for Chrome from https://chromedriver.chromium.org/downloads

in your python environment install selenium https://selenium-python.readthedocs.io/installation.html

to identify xpath location of relevant content can use https://selectorshub.com/selectorshub/

In [1]:
import json
import pandas as pd

import os

from selenium import webdriver
from selenium.webdriver.common.by import By

import pprint

In [118]:
config_json = "scraping_config.json"
institution_json = "institution.json"

with open(config_json) as config_file:
    config = json.load(config_file)

institutions = config['institutions']
driver_path = config['driver_path']

print(institutions)

pp = pprint.PrettyPrinter(indent=4)


['Durham_England', 'Glasgow_Scotland', 'Kennesaw_USA', 'UOC_Spain', 'Uppsala_Sweden', 'VirginiaTech_USA']


In [105]:
def heading(field):
    return "<h2>" + field + "</h2>"

In [119]:
driver = webdriver.Chrome(executable_path=driver_path)
key_fields = ['institution', 'elective', 'overview']
overview_fields = ['title', 'summary', 'content', 'ilo']
all_fields = key_fields + overview_fields
all_electives_dfs =[]


try:
    for institution_name in institutions:
        path = institution_name

        with open(os.path.join(path, institution_json)) as institution_file:
            institution_config = json.load(institution_file)
            pp.pprint(institution_config)
        electives_df = pd.DataFrame(columns=all_fields, dtype="string")

        if pre_scraped := institution_config.get('pre_scraped_file',""):
            if fields := institution_config.get('fields', ""):
                pre_scraped_df = pd.read_csv(os.path.join(path, pre_scraped), dtype='str')
                pre_scraped_df['institution'] = institution_name
                pre_scraped_df['overview'] = ""
                for field in [i for i in all_fields if i not in ['institution','overview']]:
                    lookup = fields[field]
                    if not lookup:
                        pre_scraped_df[field] = ""
                        continue
                    pre_scraped_df[field] = pre_scraped_df[lookup]

                    if field in overview_fields:
                        pre_scraped_df['overview'] =  pre_scraped_df['overview'] + heading(field) + pre_scraped_df[field]

                electives_df = pre_scraped_df[all_fields]
                electives_df = electives_df[electives_df['elective'].str.len() >0]
                all_electives_dfs.append(electives_df)
                electives_df.to_csv(os.path.join(path,'electives.csv'), index= False)
                continue

        url = institution_config['scrapeURL']        
        xpaths = institution_config['XPath']
        electives = institution_config['electives']

        for elective in electives:
            if isinstance(electives, dict):
                elective_url = url.replace("%ELECTIVE%", electives[elective])
            else:
                elective_url = url.replace("%ELECTIVE%", elective)
            driver.get(elective_url)
#            full_page=driver.find_elements(By.XPATH, '//').get_attribute('innerHTML')
#            full_file = open("Page" + elective, "w")
#            full_file.write(full_page)
#            full_file.close()
            overview = ""
            overview_dictionary = {}
            for overview_field in overview_fields:
                overview_dictionary[overview_field] = ""
                try:
                    overview_elts = driver.find_elements(By.XPATH, xpaths[overview_field])
                except Exception:
#                    print("Could not find field " + overview_field)
                    continue
                overview += heading(overview_field)
                for elt in overview_elts:
#                    print ("found elt for " + overview_field)
                    innerHTML = elt.get_attribute('innerHTML')
                    overview += innerHTML
                    overview_dictionary[overview_field] += innerHTML
            new_row = {"institution": institution_config['institution'],
                       "elective": elective,
                       "overview": overview} | overview_dictionary
            electives_df = electives_df.append(new_row, ignore_index=True)
        electives_df = electives_df[electives_df['elective'].str.len() >0]
        all_electives_dfs.append(electives_df)
        electives_df.to_csv(os.path.join(path,'electives.csv'), index=False)
    driver.quit()
except OSError as err:
    print("OS error:", err)

pd.concat(all_electives_dfs).to_csv('all_electives.csv', index=False)
# except Exception as err:
#     print(f"Unexpected {err=}, {type(err)=}")
#     driver.quit()
 


  driver = webdriver.Chrome(executable_path=driver_path)


{   'XPath': {   'content': "//*[@id='fhcontent']/following-sibling::ul[1]",
                 'ilo': "//*[@id='fhlearn']/following-sibling::ul[1]",
                 'summary': "//*[@id='fhaims']/following-sibling::ul[1]",
                 'title': "//*[@id='content']/div/h3"},
    'electives': [   'COMP3421',
                     'COMP3477',
                     'COMP3487',
                     'COMP3491',
                     'COMP3507',
                     'COMP3517',
                     'COMP3527',
                     'COMP3547',
                     'COMP3557',
                     'COMP3567',
                     'COMP3577',
                     'COMP3587',
                     'COMP3607',
                     'COMP3617',
                     'COMP3621',
                     'COMP3637',
                     'COMP3647',
                     'COMP3657',
                     'COMP3667',
                     'COMP3677',
                     'COMP4087',
                     'COMP409

  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = elective

  electives_df = electives_df.append(new_row, ignore_index=True)


{   'XPath': {   'content': "//h3[normalize-space()='Course "
                            "Aims']/following-sibling::div[1]",
                 'ilo': "//h3[normalize-space()='Intended Learning Outcomes of "
                        "Course']/following-sibling::div[1]",
                 'summary': "//h3[normalize-space()='Short "
                            "Description']/following-sibling::div[1]",
                 'title': '/html[1]/body[1]/div[4]/div[2]/div[1]/div[1]/div[1]/main[1]/div[1]/h2[1]'},
    'electives': [   'COMPSCI4009',
                     'COMPSCI4062',
                     'COMPSCI4073',
                     'COMPSCI4024P',
                     'COMPSCI4014',
                     'COMPSCI4014',
                     'COMPSCI4014',
                     'COMPSCI4014',
                     'COMPSCI4038',
                     'COMPSCI4038',
                     'COMPSCI4038',
                     'COMPSCI4038',
                     'COMPSCI4015',
                     'COMPS

  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = elective

{   'fields': {   'content': '',
                  'elective': 'MCode',
                  'ilo': 'MOutcomes',
                  'summary': 'MDescription',
                  'title': 'MTitle'},
    'institution': 'Kennesaw_USA',
    'pre_scraped_file': 'courses.csv'}
{   'fields': {   'content': '',
                  'elective': 'Course',
                  'ilo': '',
                  'summary': 'Description',
                  'title': 'Title'},
    'institution': 'UOC_Spain',
    'pre_scraped_file': 'courses.csv'}
{   'XPath': {   'content': "//*[@id='syllabusContainer']/div[1]/ul[2]",
                 'ilo': "//*[@id='syllabusContainer']/div[1]/ul[1]",
                 'summary': '',
                 'title': "//*[@id='readspeaker-content']/h1"},
    'electives': [   '1DL301',
                     '1DL311',
                     '1DT052',
                     '1TD403',
                     '1DL231',
                     '1DL321',
                     '1DT072',
                     '1D

  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = elective

{   'XPath': {   'content': '',
                 'ilo': '',
                 'summary': "//*[@id='vt_with_rb']/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/p[1]",
                 'title': "//*[@id='vt_main']/div/div[1]/h1"},
    'electives': [   'CS3314',
                     'CS3414',
                     'CS3654',
                     'CS3704',
                     'CS3714',
                     'CS3724',
                     'CS3744',
                     'CS3754',
                     'CS3824',
                     'CS4104',
                     'CS4114',
                     'CS4124',
                     'CS4134',
                     'CS4144',
                     'CS4234',
                     'CS4254',
                     'CS4264',
                     'CS4274',
                     'CS4284',
                     'CS4504',
                     'CS4604',
                     'CS4624',
                     'CS4644',
                     'CS4654',
                     'CS4664',
  

  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = electives_df.append(new_row, ignore_index=True)
  electives_df = elective

In [112]:
driver.quit()

Unnamed: 0,MCode,Cohort,AcademicYearStart,Women3,Men3,Cap
0,IT4733,1,2020,3,0,
1,IT4793,1,2021,3,0,
2,IT4733,1,2022,10,5,
3,IT4773,1,2022,6,3,
4,IT4533,1,2020,7,4,
...,...,...,...,...,...,...
159,CYBR4883,1,2020,0,13,
160,CYBR4883,1,2021,0,14,
161,IT4403,1,2020,0,11,
162,SWE3683,1,2020,0,10,
