# Age timeseries processor

Scrape the California Department of Public Health's COVID-19 cases-by-age-group table, save the day's snapshot as a CSV, and rebuild the combined timeseries file from all saved snapshots.

In [1]:
%load_ext lab_black

In [2]:
import os
import glob
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [3]:
# Daily snapshots and the combined timeseries live in ./data under the working directory.
data_dir = os.path.join(os.getcwd(), "data")

Retrieve the page.

In [4]:
# Page that the age-group table is scraped from.
url = "https://www.cdph.ca.gov/Programs/CID/DCDC/Pages/COVID-19/COVID-19-Cases-by-Age-Group.aspx"

In [5]:
# A timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() stops the run on an HTTP error instead of letting an
# error page be parsed by the cells below.
page = requests.get(url, timeout=30)
page.raise_for_status()

Parse it.

In [6]:
# Build the parse tree from the raw response bytes using the "html.parser" backend.
soup = BeautifulSoup(markup=page.content, features="html.parser")

Focus in on the content well.

In [7]:
# find() returns None when no element matches, so fail here with a clear
# message rather than later with an AttributeError on None.
content = soup.find("div", {"id": "MSOZoneCell_WebPartWPQ4"})
assert content is not None, "Could not find div#MSOZoneCell_WebPartWPQ4 on the page"

Get the timestamp.

In [8]:
# The first <h3> in the content well is parsed as the timestamp; keep only its date part.
timestamp_heading = content.find("h3")
latest_date = pd.to_datetime(timestamp_heading.text).date()

Get tables.

In [9]:
# Collect every table in the content well that carries the "ms-rteTable-4" CSS class.
table_list = content.find_all("table", class_="ms-rteTable-4")

Verify there is only one.

In [10]:
# Only the first table is parsed further down, so stop if the count is not exactly one.
assert len(table_list) == 1, f"Expected exactly 1 table, found {len(table_list)}"

Define the cleaning helpers and parse the table into a DataFrame.

In [11]:
def safetxt(element):
    """
    Return an element's text with zero-width spaces and surrounding whitespace removed.

    Args:
        element: any object exposing a ``.text`` string (e.g. a parsed HTML tag).

    Returns:
        The cleaned string.
    """
    # Remove zero-width spaces first: str.strip() does not treat U+200B as
    # whitespace, so stripping first would leave any padding that sits next
    # to a zero-width space in the result.
    return element.text.replace("\u200b", "").strip()

In [12]:
def safenumber(element):
    """
    Return an element's text as a bare numeric string.

    The text is first cleaned by ``safetxt``; commas and spaces are then
    removed. The result is still a string: no numeric conversion happens here.
    """
    cleaned = safetxt(element)
    for unwanted in (",", " "):
        cleaned = cleaned.replace(unwanted, "")
    return cleaned

In [13]:
def parse_table(soup, date=None):
    """
    Convert the age-group HTML table into a DataFrame of cleaned strings.

    Args:
        soup: the parsed ``<table>`` element.
        date: value stored in the ``date`` column of every row. When omitted,
            the notebook-level ``latest_date`` is used, as before.

    Returns:
        A DataFrame with the columns ``age``, ``confirmed_cases``,
        ``cases_pct``, ``deaths``, ``deaths_pct``, ``population_pct`` and
        ``date``. The numeric columns hold cleaned strings, not numbers.

    Raises:
        IndexError: if a data row has fewer than six ``<td>`` cells.
    """
    if date is None:
        date = latest_date

    # Some tables have no explicit <tbody>; fall back to the table itself so
    # the rows can still be found instead of failing on None.
    body = soup.tbody if soup.tbody is not None else soup

    # Columns that follow the age label, in the order the cells appear.
    numeric_columns = (
        "confirmed_cases",
        "cases_pct",
        "deaths",
        "deaths_pct",
        "population_pct",
    )

    records = []
    # The first row is skipped (presumably the header row).
    for row in body.find_all("tr")[1:]:
        cells = row.find_all("td")
        record = {"age": safetxt(cells[0])}
        for position, column in enumerate(numeric_columns, start=1):
            record[column] = safenumber(cells[position])
        records.append(record)

    df = pd.DataFrame(records)
    df["date"] = date
    return df

In [14]:
# Parse the first (and, per the check above, only) table.
age_table = table_list[0]
df = parse_table(age_table)

In [15]:
# The notebook expects exactly 12 parsed rows; a different count means the
# table on the page no longer has the expected layout.
assert len(df) == 12, f"Expected 12 rows in the age table, found {len(df)}"

Verify that all the values can be converted to floats

In [16]:
# Dry-run conversion: the converted frame is discarded, only a failure matters.
# Every column other than date and age must be parseable as a float.
try:
    df.set_index(["date", "age"]).astype(float)
except Exception as e:
    # Chain explicitly so the original conversion error stays in the traceback.
    raise AssertionError(f"Non-numeric value in parsed table: {e}") from e

Write out the daily snapshot, then rebuild the combined timeseries from all saved snapshots.

In [17]:
# Save today's snapshot as ages-YYYYMMDD.csv inside the data directory.
snapshot_name = f"ages-{latest_date.strftime('%Y%m%d')}.csv"
snapshot_path = os.path.join(data_dir, snapshot_name)
df.to_csv(snapshot_path, index=False)

In [18]:
# Gather every CSV in the data directory, leaving out the combined timeseries file.
all_csv_paths = glob.glob(data_dir + "/*.csv")
csv_list = [
    path for path in all_csv_paths if not str(path).endswith("timeseries.csv")
]

In [19]:
# Load each snapshot, parsing its "date" column as datetimes.
df_list = []
for snapshot_file in csv_list:
    df_list.append(pd.read_csv(snapshot_file, parse_dates=["date"]))

In [20]:
# Stack all snapshots and order the rows by date, then by age label.
combined = pd.concat(df_list)
df = combined.sort_values(by=["date", "age"])

In [21]:
# Overwrite the combined timeseries file with the freshly rebuilt frame.
timeseries_path = os.path.join(data_dir, "timeseries.csv")
df.to_csv(timeseries_path, index=False)