In [1]:
%load_ext lab_black

In [21]:
import os
import glob
import pathlib

In [None]:
import pandas as pd

In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC

In [23]:
# `data_dir` is derived from `this_dir`, whose own cell sits further down the
# notebook. Resolve the working directory here as well so a top-to-bottom
# "Restart & Run All" does not fail with a NameError on this line.
this_dir = pathlib.Path(os.path.abspath(""))
data_dir = this_dir / "data"

In [22]:
# Absolute path of the directory the notebook is running from.
this_dir = pathlib.Path(os.path.abspath(os.curdir))

Configure the browser

In [5]:
# Command-line switches handed to Chrome, added in the same order as before.
options = Options()
for chrome_flag in (
    "--headless",
    "--no-sandbox",
    "--disable-gpu",
    "--disable-dev-shm-usage",
    "--disable-extensions",
):
    options.add_argument(chrome_flag)

In [6]:
# Start the Chrome session used by every scraping helper below; it is shut
# down again by the `driver.quit()` cell once the values are collected.
driver = webdriver.Chrome(options=options)

Get the page

In [7]:
# Address of the embedded dashboard view that the rest of the notebook scrapes.
dashboard_url = "https://datavisualization.cdph.ca.gov/t/LNC/views/COVIDSNFDASHV3/COVIDSNFDASH?:embed=y&:showVizHome=no&:host_url=https%3A%2F%2Fdatavisualization.cdph.ca.gov%2F&:embed_code_version=3&:tabs=yes&:toolbar=yes&:showAppBanner=false&:display_spinner=no&:loadOrderID=0"
driver.get(dashboard_url)

Pull the date

In [8]:
def get_date():
    """Read the date shown in the dashboard's quick-filter panel.

    Uses the notebook-level ``driver``, which must already have the dashboard
    loaded, and waits up to 300 seconds for the filter panel to become visible.

    Returns:
        datetime.date: the text of the last ``tabComboBoxName`` element found,
        parsed with ``pd.to_datetime``.

    Raises:
        IndexError: if no ``tabComboBoxName`` element is found.
        Whatever ``WebDriverWait.until`` raises when the panel never becomes
        visible within the timeout.
    """
    wait = WebDriverWait(driver, 300)
    # `until` returns the element once the condition holds, so the panel does
    # not need to be looked up a second time.
    date_filter = wait.until(
        EC.visibility_of_element_located(
            (By.ID, "tableau_base_widget_QuickFilterPanel_0")
        )
    )
    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which was
    # removed in Selenium 4.3.
    # NOTE(review): the XPath starts with "//", so it matches across the whole
    # document rather than only inside the filter panel. Kept unchanged so the
    # same element is picked as before — confirm whether ".//" was intended.
    combo_names = date_filter.find_elements(
        By.XPATH, "//div[contains(@class, 'tabComboBoxName')]"
    )
    date_string = combo_names[-1].text
    return pd.to_datetime(date_string).date()

In [9]:
# Date currently shown by the dashboard; used later in the exported row and
# as the per-day CSV file name.
date = get_date()

Routine to pull the download button

In [10]:
def get_download_button():
    """Return the toolbar's download button once it is visible.

    Uses the notebook-level ``driver`` and waits up to 300 seconds for the
    element with id ``download-ToolbarButton``.

    Returns:
        The located element.

    Raises:
        AssertionError: if the element's text is not exactly "Download".
        Whatever ``WebDriverWait.until`` raises when the button never becomes
        visible within the timeout.
    """
    wait = WebDriverWait(driver, 300)
    # `until` hands back the element itself, which avoids the
    # find_element_by_id helper removed in Selenium 4.3.
    btn = wait.until(
        EC.visibility_of_element_located((By.ID, "download-ToolbarButton"))
    )
    # Raised explicitly instead of using `assert`, so the check still runs
    # when Python is started with -O.
    if btn.text != "Download":
        raise AssertionError(
            f"expected the download button to read 'Download', got {btn.text!r}"
        )
    return btn

Routine to pull the integer value behind a dashboard element

In [11]:
def get_data(element_id):
    """Scrape the integer shown in the "Data" view of one dashboard element.

    Using the notebook-level ``driver``: clicks the element twice, opens the
    download dialog, clicks its "Data" button, switches to the window that
    opens, reads the first ``<td>`` as an integer, then closes that window and
    switches back to the window that was active before.

    Args:
        element_id: DOM id of the dashboard element to click.

    Returns:
        int: the text of the first ``<td>`` in the opened window.

    Raises:
        LookupError: if no button labelled "Data" is found.
        ValueError: if the cell text is not an integer.
        Whatever ``WebDriverWait.until`` raises when a wait exceeds 300 seconds.
    """
    wait = WebDriverWait(driver, 300)

    # Wait for the box, then click it twice.
    # NOTE(review): the reason for the second click is not visible here —
    # presumably it is needed to select the element; confirm before changing.
    box = wait.until(EC.visibility_of_element_located((By.ID, element_id)))
    box.click()
    box.click()

    # Click the download button
    get_download_button().click()

    # Wait for the popup
    fieldset = wait.until(EC.visibility_of_element_located((By.XPATH, "//fieldset")))

    # Get the Data button.
    # NOTE(review): "//button" matches buttons anywhere in the document, not
    # only inside the fieldset. Kept unchanged to preserve which button is
    # found — confirm whether ".//button" was intended.
    buttons = fieldset.find_elements(By.XPATH, "//button")
    data_button = next((b for b in buttons if b.text == "Data"), None)
    if data_button is None:
        raise LookupError('no "Data" button found in the download dialog')

    # Remember where we are so the new window can be identified by its handle
    # instead of by its position in `window_handles`.
    main_handle = driver.current_window_handle
    known_handles = set(driver.window_handles)
    data_button.click()

    # The new window may not exist the instant the click returns, so wait
    # until a handle we have not seen before shows up.
    new_handles = wait.until(
        lambda d: [h for h in d.window_handles if h not in known_handles]
    )
    driver.switch_to.window(new_handles[0])

    try:
        # Get the data
        cell = wait.until(EC.visibility_of_element_located((By.XPATH, "//td")))
        return int(cell.text)
    finally:
        # Always close the data window and return to the dashboard, even when
        # reading or parsing fails, so later calls start from the right window.
        driver.close()
        driver.switch_to.window(main_handle)

Pull the four figures we're after

In [12]:
# Value behind dashboard element "tabZoneId78".
# NOTE(review): assumed to be the resident-deaths tile — confirm the id still
# maps to it whenever the dashboard layout changes.
resident_deaths = get_data("tabZoneId78")

In [13]:
# Value behind dashboard element "tabZoneId86".
# NOTE(review): assumed to be the worker-deaths tile — confirm the id still
# maps to it whenever the dashboard layout changes.
worker_deaths = get_data("tabZoneId86")

In [14]:
# Value behind dashboard element "tabZoneId76".
# NOTE(review): assumed to be the resident-cases tile — confirm the id still
# maps to it whenever the dashboard layout changes.
resident_cases = get_data("tabZoneId76")

In [15]:
# Value behind dashboard element "tabZoneId84".
# NOTE(review): assumed to be the worker-cases tile — confirm the id still
# maps to it whenever the dashboard layout changes.
worker_cases = get_data("tabZoneId84")

In [16]:
# Gather the dashboard date and the four scraped counts into a single record.
data = {
    "date": date,
    "resident_cases": resident_cases,
    "worker_cases": worker_cases,
    "resident_deaths": resident_deaths,
    "worker_deaths": worker_deaths,
}

In [17]:
# Show the collected record for a quick visual check.
data

{'date': datetime.date(2021, 9, 15),
 'resident_cases': 63455,
 'worker_cases': 53889,
 'resident_deaths': 9200,
 'worker_deaths': 249}

Kill the browser

In [18]:
# Shut down the browser session; nothing below uses `driver`.
driver.quit()

Export

In [19]:
# One-row frame holding the record scraped in this run.
df = pd.DataFrame([data])

In [24]:
# Create the output folder if it is missing: to_csv does not create parent
# directories, so a fresh checkout without data/ would fail here.
data_dir.mkdir(parents=True, exist_ok=True)
# Per-day snapshot, named after the dashboard date.
df.to_csv(data_dir / f"{date}.csv", index=False)

In [25]:
# Paths of every CSV in the data folder, leaving out anything whose name ends
# in "timeseries.csv" (the combined file written at the end of the notebook).
csv_list = list(
    filter(
        lambda csv_path: not str(csv_path).endswith("timeseries.csv"),
        glob.glob(str(data_dir / "*.csv")),
    )
)

In [26]:
# Load every per-day CSV, parsing the "date" column as datetimes.
# The earlier loop also built a `file_date` string by splitting the path on
# "/", but nothing in this notebook read it, so it is not computed here.
df_list = [pd.read_csv(csv_path, parse_dates=["date"]) for csv_path in csv_list]

In [27]:
# Stack the per-day frames into one and order the rows by date.
df = pd.concat(df_list).sort_values(["date"])

In [28]:
# Write the combined frame alongside the per-day files; the csv_list filter
# above excludes this file by name, so it is not read back in on later runs.
df.to_csv(data_dir / "timeseries.csv", index=False)