Code formatting with [black](https://pypi.org/project/nb-black/).

In [260]:
%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


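Import the standard-library modules used below. Note: the original intent here was to add our `utils` directory to Python's import path (`sys.path`, not the shell's `$PATH`) so we can import Python files from sibling directories, but no cell in this notebook actually modifies the path.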

In [261]:
import os
import sys
import glob

In [262]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import regex as re

In [263]:
# Absolute path of the `data` folder, resolved against the current working
# directory (an empty path passed to `abspath` resolves to the cwd).
working_dir = os.path.abspath("")
data_dir = os.path.join(working_dir, "data")

Retrieve the page

In [264]:
# Address of the page that the following cells download and parse.
url = "https://www.cdph.ca.gov/Programs/CID/DCDC/Pages/COVID-19/Regional-ICU-Capacity.aspx"

In [265]:
# Download the page. `requests` waits indefinitely unless a timeout is given,
# so bound the request; then fail loudly on an HTTP error status instead of
# handing an error page to the parser further down.
page = requests.get(url, timeout=30)
page.raise_for_status()

Parse it

In [266]:
# Build the document tree from the raw response bytes with Python's built-in
# HTML parser.
soup = BeautifulSoup(page.content, features="html.parser")

Focus in on the content well.

In [267]:
# Narrow the search to the <div> whose id is "s4-bodyContainer".
content = soup.find("div", id="s4-bodyContainer")

Get the timestamp

In [268]:
# First <span> carrying the "article-date-title" CSS class; its text is parsed
# as the page's date in the next cell.
date_container = content.find("span", class_="article-date-title")

In [269]:
# Parse the span's text into a plain `datetime.date` (time-of-day dropped).
date_text = date_container.text.strip()
latest_date = pd.to_datetime(date_text).date()

In [270]:
# Display the parsed date as a quick sanity check.
latest_date

datetime.date(2021, 9, 6)

Get table

In [271]:
# First <table> carrying the "ms-rteTable-4" CSS class inside the content well.
table = content.find("table", class_="ms-rteTable-4")

Verify the table is there

In [272]:
# `find` returns a single Tag, or None when nothing matches, so check for
# presence directly. `len(tag)` counts the tag's children rather than the
# number of matches, and raises TypeError when the result is None.
assert table is not None, "ICU capacity table (class 'ms-rteTable-4') not found"

Read it in

In [273]:
# Output column names for the scraped table, in order. The trailing comment on
# each entry gives the human-readable heading it stands for (presumably the
# heading shown on the source page — TODO confirm).
# NOTE(review): no visible cell references this list; presumably `parse_table`
# consumes it — confirm.
table_headers = [
    "region",
    "pct_icu_beds_available",  # % of Staffed Adult ICU Beds Available
    "pct_covid_positive_patients",  # % COVID+ in Adult ICU Beds
    "consecutive_days_under_10_pct",  # Number of  Consecutive Days Under 10 %
    "health_order_effective_date",  # Date Health Order Effective
    "health_order_expiration_date",  # Date Health Order Set to Expire
]

In [274]:
def safetxt(element):
    """Return an element's text without zero-width spaces or outer whitespace.

    Args:
        element: Any object exposing a string ``text`` attribute.

    Returns:
        str: ``element.text`` with every U+200B (zero-width space) removed and
        leading/trailing whitespace stripped.
    """
    # Remove zero-width spaces *before* stripping: `str.strip()` does not treat
    # U+200B as whitespace, so a zero-width space at either end would otherwise
    # shield the whitespace next to it from being stripped.
    return element.text.replace("\u200b", "").strip()

In [275]:
def safenumber(element):
    """Return the element's cleaned text with commas and spaces removed.

    The text is first cleaned by ``safetxt``; the result is still a string,
    not a number.
    """
    cleaned = safetxt(element)
    return cleaned.replace(",", "").replace(" ", "")

In [276]:
# NOTE(review): `parse_table` is not defined or imported in any cell visible in
# this notebook, so unless it comes from outside the visible cells this line
# raises NameError on a fresh kernel (Restart & Run All). Restore its
# definition or import above this cell.
# Presumably it turns the <table> into a DataFrame with the `table_headers`
# columns plus a `date` column — TODO confirm against the original helper.
df = parse_table(table)

Clean up missing values

In [277]:
# Convert the effective-date column to datetimes; values that cannot be parsed
# are coerced to NaT instead of raising.
effective_col = "health_order_effective_date"
df[effective_col] = pd.to_datetime(df[effective_col], errors="coerce")

In [278]:
# Convert the expiration-date column to datetimes; values that cannot be parsed
# are coerced to NaT instead of raising.
expiration_col = "health_order_expiration_date"
df[expiration_col] = pd.to_datetime(df[expiration_col], errors="coerce")

In [279]:
# Swap the "--" placeholder for the string "0" in the consecutive-days column.
days_col = "consecutive_days_under_10_pct"
df[days_col] = df[days_col].replace({"--": "0"})

In [280]:
# Display the cleaned frame for a visual check.
df

Unnamed: 0,region,pct_icu_beds_available,pct_covid_positive_patients,consecutive_days_under_10_pct,health_order_effective_date,health_order_expiration_date,date
0,California Statewide,20.3,36.7,0,NaT,NaT,2021-09-06
1,Bay Area,24.6,31.4,0,NaT,NaT,2021-09-06
2,Greater Sacramento,14.8,43.2,0,NaT,NaT,2021-09-06
3,Northern California,19.7,54.7,0,NaT,NaT,2021-09-06
4,San Joaquin Valley,8.4,52.7,7,2021-09-03,2021-09-09,2021-09-06
5,Southern California,21.8,33.4,0,NaT,NaT,2021-09-06


Verify that all the values can be converted to floats

In [281]:
# Move the date and label columns into the index so that only the remaining
# columns are cast to float. The converted frame is discarded: this cell is a
# check only, and any conversion failure is re-raised as an AssertionError.
non_numeric_columns = [
    "date",
    "health_order_effective_date",
    "health_order_expiration_date",
    "region",
]
try:
    df.set_index(non_numeric_columns).astype(float)
except Exception as conversion_error:
    raise AssertionError(conversion_error)

Write it out

In [282]:
# Save today's snapshot under the data directory, named after the page's date,
# without writing the DataFrame index.
daily_csv_path = os.path.join(data_dir, f"daily/icu-capacity-{latest_date}.csv")
df.to_csv(daily_csv_path, index=False)

### Concatenate

Combine all scraped tables into one timeseries

In [283]:
# Collect every daily snapshot. The search pattern is built from `data_dir`,
# the same base the daily export writes to, rather than a hard-coded relative
# path. `glob` returns matches in arbitrary order, so sort them to make the
# read order deterministic.
path = data_dir
files = sorted(glob.glob(os.path.join(path, "daily", "*.csv")))

In [284]:
# Read every daily CSV, parsing the three date columns. A list is built rather
# than a generator: a generator is exhausted after the first `pd.concat`, so
# re-running the concat cell on its own would fail with nothing left to
# concatenate.
date_columns = [
    "date",
    "health_order_effective_date",
    "health_order_expiration_date",
]
file_df = [
    pd.read_csv(f, low_memory=False, parse_dates=date_columns) for f in files
]

In [285]:
# Stack the per-day frames into one frame with a fresh 0..n-1 index.
concat_df = pd.concat(objs=file_df, ignore_index=True)

In [286]:
# Order the timeseries by date, then by region within each date.
sort_df = concat_df.sort_values(by=["date", "region"])

In [287]:
# Write the combined timeseries. The path is built from `data_dir`, as the
# daily export does, instead of a hard-coded relative path, so both outputs
# resolve against the same base directory.
sort_df.to_csv(os.path.join(data_dir, "all", "all.csv"), index=False)