## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

In [1]:
%load_ext lab_black

In [14]:
import os
import glob
import time
import requests
import pandas as pd
from datetime import datetime
from dateutil import tz
from bs4 import BeautifulSoup

In [15]:
# Output folder: a "data" directory under the absolute form of the current
# working directory (abspath("") resolves the empty path against the cwd).
working_dir = os.path.abspath("")
data_dir = os.path.join(working_dir, "data")

## County Names

In [4]:
# Region slugs (lowercase, no spaces). Each one is substituted into the
# `cumulative-{slug}.json` file name when the download loop calls getData().
# NOTE(review): "california" is presumably the statewide aggregate rather than
# a county — confirm against the data source.
countyNames = [
    "california",
    "losangeles",
    "orange",
    "ventura",
    "sanbernardino",
    "riverside",
    "sandiego",
    "imperial",
    "inyo",
    "santabarbara",
    "tulare",
    "kings",
    "kern",
    "fresno",
    "sanluisobispo",
    "monterey",
    "mono",
    "madera",
    "merced",
    "mariposa",
    "sanmateo",
    "santaclara",
    "sanfrancisco",
    "sacramento",
    "alameda",
    "napa",
    "contracosta",
    "solano",
    "marin",
    "sonoma",
    "santacruz",
    "sanbenito",
    "sanjoaquin",
    "calaveras",
    "tuolumne",
    "stanislaus",
    "mendocino",
    "lake",
    "humboldt",
    "trinity",
    "delnorte",
    "siskiyou",
    "amador",
    "placer",
    "yolo",
    "eldorado",
    "alpine",
    "sutter",
    "yuba",
    "nevada",
    "sierra",
    "colusa",
    "glenn",
    "butte",
    "plumas",
    "shasta",
    "modoc",
    "lassen",
    "tehama",
]

## Function to get the data for one county (called in a loop for each county below)

In [5]:
# Accumulator that getData() appends to; re-running this cell resets it.
data = []
# Kept only so the module-level name still exists; getData() no longer uses it.
tempData = {}


def getData(i, verbose=False, timeout=30):
    """Download one region's cumulative equity JSON and append its items to `data`.

    Parameters
    ----------
    i : str
        Region slug substituted into the file name, e.g. "losangeles".
    verbose : bool, optional
        When True, print the URL before requesting it.
    timeout : float, optional
        Seconds to wait on the server before `requests` gives up, so a stalled
        connection cannot hang the notebook indefinitely.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status.
    """
    url = f"https://files.covid19.ca.gov/data/reviewed/equitydash/cumulative-{i}.json"
    if verbose:
        print(url)
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error response instead of trying to parse an error page.
    response.raise_for_status()
    data.extend(response.json())

In [6]:
# Start from an empty list so that re-running this cell does not append a
# second copy of every record to `data`.
data.clear()
for name in countyNames:
    getData(name)
    time.sleep(0.3)  # pause 0.3 s between consecutive requests

## Get into DataFrame

In [7]:
# Build one table from every downloaded record, then replace missing values with 0.
df = pd.DataFrame.from_dict(data)
df = df.fillna(0)

In [8]:
# Sanity check on the download: warn (without stopping the notebook) when the
# row count differs from the expected total.
# NOTE(review): 1593 is presumably 59 regions x 27 rows each — confirm.
expected_rows = 1593
if df.shape[0] != expected_rows:
    print(f"Irregular number of rows: expected {expected_rows}, got {df.shape[0]}")

## Get Last Modified Date

In [9]:
# Read the statewide file's Last-Modified response header to date this snapshot.
url = (
    "https://files.covid19.ca.gov/data/reviewed/equitydash/cumulative-california.json"
)
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
}
# The timeout keeps a stalled connection from hanging the notebook.
request = requests.get(url, headers=headers, timeout=30)
request.raise_for_status()

date_raw = request.headers.get("Last-Modified")
if date_raw is None:
    # Without this check, strptime() would fail with an opaque TypeError.
    raise ValueError(f"No Last-Modified header returned by {url}")
# Produces a naive datetime; the literal "GMT" suffix is matched, not interpreted.
objDate = datetime.strptime(date_raw, "%a, %d %b %Y %H:%M:%S GMT")

## Convert from UTC to Pacific Time

In [10]:
# Source and target zones for the conversion.
from_zone, to_zone = tz.gettz("GMT"), tz.gettz("US/Pacific")
# replace() only attaches the zone to the naive timestamp (no clock shift);
# astimezone() then performs the actual conversion.
date_raw = objDate.replace(tzinfo=from_zone)
date = date_raw.astimezone(to_zone)

## Format Date

In [11]:
# Derive the string from objDate rather than from `date` itself, so this cell
# can be re-run: the previous `date = date.strftime(...)` failed on a second
# run because `date` had already become a str.
date = objDate.replace(tzinfo=from_zone).astimezone(to_zone).strftime("%Y-%m-%d")

## Write to CSV

In [12]:
# Create the output folder if it is missing; to_csv() does not create directories.
os.makedirs(data_dir, exist_ok=True)
# One dated snapshot file per run, named cdph-equity-YYYY-MM-DD.csv.
df.to_csv(os.path.join(data_dir, f"cdph-equity-{date}.csv"), index=False)

In [38]:
# Load every dated snapshot CSV in data_dir (everything except the combined
# timeseries file) and tag each one's rows with the date from its file name.
df_list = []
# glob returns paths in arbitrary order; sorting makes the combined output
# reproducible from run to run.
snapshot_paths = sorted(
    path
    for path in glob.glob(os.path.join(data_dir, "*.csv"))
    if not path.endswith("timeseries.csv")
)
for path in snapshot_paths:
    # File names end in YYYY-MM-DD.csv: the 10 characters before the extension
    # are the snapshot date.
    snapshot_date = pd.to_datetime(path[-14:-4])
    # Use a dedicated name so the freshly downloaded `df` is not overwritten.
    snapshot_df = pd.read_csv(path)
    snapshot_df["date"] = snapshot_date
    df_list.append(snapshot_df)

In [39]:
# Stack all snapshot frames row-wise into a single table.
big_df = pd.concat(df_list, axis=0)

In [41]:
# Write the combined table next to the snapshots, without the index column.
timeseries_path = os.path.join(data_dir, "timeseries.csv")
big_df.to_csv(timeseries_path, index=False)