In [1]:
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup
import requests

from datetime import datetime
from typing import Tuple
import re
from random import random
from io import StringIO

from tqdm.notebook import tqdm

## Get profs

In [None]:
page = requests.get("https://stat.wisc.edu/people-main-faculty/")
soup = BeautifulSoup(page.content, "html.parser")
faculty = soup.find_all("a", {"class": "faculty-name"})
[
    f"{name[1]} {name[0]}"
    for name in [
        prof.text.replace("\n", "").strip().split(", ")
        for prof in faculty
    ]
]

## Get salary data

In [None]:
def scrape_my_professor(prof_name: Tuple[str, str]):
    """
    prof_name should be in Last, First order.
    """
    with webdriver.Firefox() as driver:
        if prof_name[1].startswith("Ric"):
            first_name = "Richard"
        else:
            first_name = prof_name[1]
        driver.get(f'https://madison.com/uw-salary-database/html_e37ff025-9a87-5a31-91ea-b2eb50aba4cb.html#2018-uw-salaries/?view_20_page=1&view_20_filters=%5B%7B%22value%22:%22{first_name}%20{prof_name[0]}%22,%22field%22:%22field_104%22,%22operator%22:%22contains%22%7D,%7B%22field%22:%22field_106%22,%22operator%22:%22is%22,%22value%22:%22UW%20Madison%22%7D%5D&view_20_sort=field_107%7Cdesc')
        table = driver.find_element(By.XPATH, '//*[@id="view_20"]/div[2]/table')
        table_text = table.text.replace("Fiscal year\nName\nTitle\nDept Description\nCampus\nStart Date\nTotal Pay\nDetails\n", "")
    if table_text == "No Data":
        return []
    else:
        return [
            string_parser(row, prof_name)
            for row in
            table_text.split("\n")
        ]

def string_parser(salary_string: str, prof_name: Tuple[str, str]) -> dict:
    name_match = re.search(f"{prof_name[1][0]}.*{prof_name[0]}", salary_string)
    dept_match = re.search("Professor.+/.+UW", salary_string)
    date_match = re.search("\d{2}/\d{2}/\d{4}", salary_string)
    return {
        "Fiscal year": int(salary_string[:name_match.span()[0]].strip()),
        "Name": name_match.group(0),
        "Title": salary_string[name_match.span()[1]:dept_match.span()[0] + 9].strip(),
        "Dept Description": dept_match.group(0).replace("Professor ", "").replace(" UW", ""),
        "Campus": salary_string[dept_match.span()[1] - 2:date_match.span()[0]].strip(),
        "Start Date": datetime.strptime(date_match.group(0), "%m/%d/%Y"),
        "Total Pay": float(salary_string[date_match.span()[1]:].strip().replace("$", "").replace(",", "").replace(" view", ""))
    }

In [None]:
stats_dept = []
cache = faculty
while len(cache) > 0:
    for prof in tqdm(cache):
        try:
            stats_dept += scrape_my_professor(prof)
            cache.remove(prof)
        except AttributeError as ex:
            print(ex)
            cache.remove(prof)
            continue
        except Exception as ex:
            print(ex)
            continue
    print(cache)

In [None]:
pd.DataFrame(stats_dept)

## Get all salaries

In [2]:
cache = [
    i + 1
    for i in range(114)
]

In [3]:
while len(cache) > 0:
    for page in tqdm(cache):
        try:
            with webdriver.Firefox() as driver:
                driver.get(f"https://madison.com/uw-salary-database/html_e37ff025-9a87-5a31-91ea-b2eb50aba4cb.html#2018-uw-salaries/?view_20_page={page}&view_20_filters=%5B%7B%22field%22:%22field_106%22,%22operator%22:%22is%22,%22value%22:%22UW%20Madison%22%7D%5D&view_20_sort=field_107%7Cdesc&view_20_per_page=1000")
                table = driver.find_element(By.XPATH, '//*[@id="view_20"]/div[2]/table')
                table_html = table.get_attribute("outerHTML")
                df = pd.read_html(StringIO(table_html))[0]
                df.to_csv(f"../../data/cache/forbidden{page}.csv", index = False)
                cache.remove(page)
        except Exception as ex:
            print(ex)
            continue
    print(cache)

  0%|          | 0/114 [00:00<?, ?it/s]

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:404:5
element.find/</<@chrome://remote/content/marionette/element.js:291:16

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:404:5
element.find/</<@chrome://remote/content/marionette/element.js:291:16

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdri

  0%|          | 0/67 [00:00<?, ?it/s]

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:404:5
element.find/</<@chrome://remote/content/marionette/element.js:291:16

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:404:5
element.find/</<@chrome://remote/content/marionette/element.js:291:16

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdri

  0%|          | 0/40 [00:00<?, ?it/s]

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:404:5
element.find/</<@chrome://remote/content/marionette/element.js:291:16

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:404:5
element.find/</<@chrome://remote/content/marionette/element.js:291:16

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdri

  0%|          | 0/26 [00:00<?, ?it/s]

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:404:5
element.find/</<@chrome://remote/content/marionette/element.js:291:16

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:404:5
element.find/</<@chrome://remote/content/marionette/element.js:291:16

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdri

  0%|          | 0/16 [00:00<?, ?it/s]

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:404:5
element.find/</<@chrome://remote/content/marionette/element.js:291:16

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:404:5
element.find/</<@chrome://remote/content/marionette/element.js:291:16

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdri

  0%|          | 0/10 [00:00<?, ?it/s]

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:404:5
element.find/</<@chrome://remote/content/marionette/element.js:291:16

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:404:5
element.find/</<@chrome://remote/content/marionette/element.js:291:16

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdri

  0%|          | 0/8 [00:00<?, ?it/s]

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:404:5
element.find/</<@chrome://remote/content/marionette/element.js:291:16

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:404:5
element.find/</<@chrome://remote/content/marionette/element.js:291:16

[5, 33, 43, 57, 108]


  0%|          | 0/5 [00:00<?, ?it/s]

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:404:5
element.find/</<@chrome://remote/content/marionette/element.js:291:16

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:404:5
element.find/</<@chrome://remote/content/marionette/element.js:291:16

[33, 43, 57]


  0%|          | 0/3 [00:00<?, ?it/s]

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:404:5
element.find/</<@chrome://remote/content/marionette/element.js:291:16

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:404:5
element.find/</<@chrome://remote/content/marionette/element.js:291:16

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdri

  0%|          | 0/3 [00:00<?, ?it/s]

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:404:5
element.find/</<@chrome://remote/content/marionette/element.js:291:16

[43, 57]


  0%|          | 0/2 [00:00<?, ?it/s]

[57]


  0%|          | 0/1 [00:00<?, ?it/s]

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:404:5
element.find/</<@chrome://remote/content/marionette/element.js:291:16

[57]


  0%|          | 0/1 [00:00<?, ?it/s]

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:404:5
element.find/</<@chrome://remote/content/marionette/element.js:291:16

[57]


  0%|          | 0/1 [00:00<?, ?it/s]

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:404:5
element.find/</<@chrome://remote/content/marionette/element.js:291:16

[57]


  0%|          | 0/1 [00:00<?, ?it/s]

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:404:5
element.find/</<@chrome://remote/content/marionette/element.js:291:16

[57]


  0%|          | 0/1 [00:00<?, ?it/s]

Message: Unable to locate element: //*[@id="view_20"]/div[2]/table
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.jsm:12:1
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:192:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:404:5
element.find/</<@chrome://remote/content/marionette/element.js:291:16

[57]


  0%|          | 0/1 [00:00<?, ?it/s]

[]


In [4]:
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

In [5]:
forbidden = dd.read_csv("../../data/cache/forbidden*.csv")

In [6]:
with ProgressBar():
    all_forbidden = forbidden.compute()

[########################################] | 100% Completed | 734.02 ms


In [7]:
all_forbidden.to_csv(f"../../data/forbidden.csv", index = False)

## All campuses

In [9]:
with webdriver.Firefox() as driver:
    driver.get(f"https://madison.com/uw-salary-database/html_e37ff025-9a87-5a31-91ea-b2eb50aba4cb.html#2018-uw-salaries/?view_20_page=1&view_20_sort=field_107%7Cdesc&view_20_per_page=1000&view_20_filters=[%7B%22operator%22:%22is%20not%20blank%22,%22field%22:%22field_115%22%7D]")
    table = driver.find_element(By.XPATH, '//*[@id="view_20"]/div[2]/table')
    table_html = table.get_attribute("outerHTML")

In [4]:
with webdriver.Firefox() as driver:
    driver.get(f"https://madison.com/uw-salary-database/html_e37ff025-9a87-5a31-91ea-b2eb50aba4cb.html#2018-uw-salaries/?view_20_page=51&view_20_sort=field_107%7Cdesc&view_20_per_page=1000&view_20_filters=[%7B%22operator%22:%22is%20not%20blank%22,%22field%22:%22field_115%22%7D]")
    try:
        table = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="view_20"]/div[2]/table'))
        )
        table_html = table.get_attribute("outerHTML")
    finally:
        driver.quit()

In [5]:
table_html

'<table class="kn-responsive kn-table-table is-bordered is-striped">\n  <thead>\n\n    <tr><th class="field_169">\n      <span class="table-fixed-label">\n\n      \n        <a href="#field_169|asc" address="true" class="kn-sort">\n      \n          <span class="kn-sort-icon"></span>Fiscal year\n      \n        </a>\n      \n      </span>\n    </th>\n\n\n    <th class="field_104">\n      <span class="table-fixed-label">\n\n      \n        <a href="#field_104|asc" address="true" class="kn-sort">\n      \n          <span class="kn-sort-icon"></span>Name\n      \n        </a>\n      \n      </span>\n    </th>\n\n\n    <th class="field_105">\n      <span class="table-fixed-label">\n\n      \n        <a href="#field_105|asc" address="true" class="kn-sort">\n      \n          <span class="kn-sort-icon"></span>Title\n      \n        </a>\n      \n      </span>\n    </th>\n\n\n    <th class="field_107">\n      <span class="table-fixed-label">\n\n      \n        <a href="#field_107|asc" address=

In [10]:
pd.read_html(table_html)

'<table class="kn-responsive kn-table-table is-bordered is-striped">\n  <thead>\n\n    <tr><th class="field_169">\n      <span class="table-fixed-label">\n\n      \n        <a href="#field_169|asc" address="true" class="kn-sort">\n      \n          <span class="kn-sort-icon"></span>Fiscal year\n      \n        </a>\n      \n      </span>\n    </th>\n\n\n    <th class="field_104">\n      <span class="table-fixed-label">\n\n      \n        <a href="#field_104|asc" address="true" class="kn-sort">\n      \n          <span class="kn-sort-icon"></span>Name\n      \n        </a>\n      \n      </span>\n    </th>\n\n\n    <th class="field_105">\n      <span class="table-fixed-label">\n\n      \n        <a href="#field_105|asc" address="true" class="kn-sort">\n      \n          <span class="kn-sort-icon"></span>Title\n      \n        </a>\n      \n      </span>\n    </th>\n\n\n    <th class="field_107">\n      <span class="table-fixed-label">\n\n      \n        <a href="#field_107|asc" address=