## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

In [1]:
%load_ext lab_black

In [2]:
import os
import pathlib

In [3]:
# Absolute path of the directory the notebook is executed from.
this_dir = pathlib.Path.cwd()

In [4]:
# Folder, next to the notebook, where the CSV outputs are written.
data_dir = this_dir.joinpath("data")

In [5]:
import re
import pytz
import glob
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, date, timedelta
from tableauscraper import TableauScraper as TS

## Download

In [6]:
# Tableau Public embed that holds the case counts by region.
# The literal is split one query parameter per line; the value is unchanged.
url = (
    "https://public.tableau.com/views/case10/rev-ZIPRATE_1"
    "?%3Adisplay_static_image=y"
    "&%3AbootstrapWhenNotified=true"
    "&%3Aembed=true"
    "&%3Alanguage=en-US"
    "&:embed=y"
    "&:showVizHome=n"
    "&:apiID=host0"
    "#navType=0&navSrc=Parse"
)

## Parse

In [7]:
# Load the Tableau view at `url` into a scraper session and fetch its workbook.
# `ts` is reused by a later cell to pull the "ZIPCODE" worksheet.
ts = TS()
ts.loads(url)
workbook = ts.getWorkbook()

# Debugging aid (disabled): uncomment to print every worksheet name and its data.
# for t in workbook.worksheets:
#     print(f"worksheet name : {t.name}")  # show worksheet name
#     print(t.data)  # show dataframe for this worksheet

worksheet name : ZIPCODE
  HUMBOLDT COUNTY REGIONS-alias CNT(humlist_update.csv)-alias  \
0                         SOUTH                           742   
1                         NORTH                          2896   
2          GREATER HUMBOLDT BAY                          7632   
3          GREATER FORTUNA AREA                          2420   

  Latitude (generated)-value Latitude (generated)-alias  \
0                    40.2127                    40.2127   
1                  41.128306                    41.1283   
2                  40.644001                    40.6440   
3                  40.504102                    40.5041   

  Longitude (generated)-value Longitude (generated)-alias  \
0                 -123.842598                   -123.8426   
1                 -123.782724                   -123.7827   
2                 -123.887703                   -123.8877   
3                 -124.009066                   -124.0091   

  AGG(Case rate per 100,000 by Hum. Co. Region)

In [8]:
# Pull the "ZIPCODE" worksheet from the loaded view.
# Its `.data` is used as a pandas DataFrame by the cells that follow.
ws = ts.getWorksheet("ZIPCODE")
df = ws.data

In [9]:
rename_cols = {
    "HUMBOLDT COUNTY REGIONS-alias": "area",
    "CNT(humlist_update.csv)-alias": "confirmed_cases",
    "Latitude (generated)-value": "lat",
    "Latitude (generated)-alias": "lat_alias",
    "Longitude (generated)-value": "lon",
    "Longitude (generated)-alias": "lat_alias",
    "AGG(Case rate per 100,000 by Hum. Co. Region)-alias": "case_rate",
}

In [10]:
df = df.rename(columns=rename_cols)

In [11]:
df = df.drop(["lat", "lat_alias", "lon", "lat_alias", "case_rate"], axis=1)

In [12]:
# Region names arrive in upper case; convert them to title case.
df = df.assign(area=df["area"].str.title())

In [13]:
# Add a constant "county" column as the first column.
df.insert(loc=0, column="county", value="Humboldt")

Scrape a second Tableau embed to get the date the county last updated its data.

In [14]:
# Tableau Public embed that exposes the dashboard's "todaydate" worksheet.
# The literal is split one query parameter per line; the value is unchanged.
date_url = (
    "https://public.tableau.com/views/case1_16476245794140/caseplustest22"
    "?:language=en-US"
    "&:embed=y"
    "&:embed_code_version=3"
    "&:loadOrderID=0"
    "&:display_count=y"
    "&:origin=viz_share_link"
)

In [15]:
# Separate scraper session for the view at `date_url`, kept apart from `ts`
# so the case-count session is left untouched.
ts_date = TS()
ts_date.loads(date_url)

In [16]:
# Pull the "todaydate" worksheet; its data holds the "DAY(Todaysdate)-value"
# column read in the next cell.
ws_date = ts_date.getWorksheet("todaydate")
df_date = ws_date.data

In [17]:
# First value of the worksheet's date column: the county's reported update date.
county_date = df_date["DAY(Todaysdate)-value"].iat[0]

In [18]:
# Display the raw scraped value as a visual check before it is parsed.
county_date

'2022-05-03 00:00:00'

In [19]:
# Stamp every row with the county's update date, parsed to a datetime.
df = df.assign(county_date=pd.to_datetime(county_date))

In [20]:
# Display the cleaned frame; the Vet section below expects exactly four rows.
df

Unnamed: 0,county,area,confirmed_cases,county_date
0,Humboldt,South,742,2022-05-03
1,Humboldt,North,2896,2022-05-03
2,Humboldt,Greater Humboldt Bay,7632,2022-05-03
3,Humboldt,Greater Fortuna Area,2420,2022-05-03


## Vet

In [21]:
# The dashboard reports four regions; more rows means its layout changed.
assert len(df) <= 4, "Humboldt County's scraper has extra rows"

In [22]:
# Fewer than four rows means at least one region failed to scrape.
assert len(df) >= 4, "Humboldt County's scraper is missing rows"

## Export

Set the date

In [23]:
# Timezone used to decide what "today" is when naming the output file.
tz = pytz.timezone("America/Los_Angeles")

In [24]:
# Current calendar date in the `tz` timezone; used as the output file name.
today = datetime.now(tz=tz).date()

In [25]:
# Name of this scraper's subfolder inside `data_dir`.
slug = "humboldt"

In [26]:
# Write today's snapshot to <data_dir>/<slug>/<today>.csv without the index.
df.to_csv(data_dir.joinpath(slug, f"{today}.csv"), index=False)

## Combine

In [27]:
# Every CSV in this scraper's folder except the combined timeseries itself.
csv_list = [
    path
    for path in glob.glob(str(data_dir.joinpath(slug, "*.csv")))
    if not path.endswith("timeseries.csv")
]

In [28]:
# Read every snapshot into its own frame and collect them for concatenation.
# Loop-local names (`csv_path`, `frame`) are used so the scraped `df` is not
# overwritten and the stdlib module name `csv` is not shadowed.
df_list = []
for csv_path in csv_list:
    csv_file = pathlib.Path(csv_path)
    # Classify by file name only, so a parent directory that happens to
    # contain "manual" cannot misroute every file.
    if "manual" in csv_file.name:
        # Manually compiled files already carry their own "date" column.
        frame = pd.read_csv(csv_file, parse_dates=["date"])
    else:
        frame = pd.read_csv(csv_file, parse_dates=["county_date"])
        # Daily files are named <date>.csv. `.stem` strips the folder and
        # extension on any OS, unlike splitting on "/". Parsing to a datetime
        # keeps the "date" column the same type as in the manual files, so the
        # combined frame can be sorted.
        frame["date"] = pd.to_datetime(csv_file.stem)
    df_list.append(frame)

In [29]:
# Stack all snapshots and order them by scrape date, then by region.
df = pd.concat(df_list).sort_values(by=["date", "area"])

In [30]:
# Write the combined history to <data_dir>/<slug>/timeseries.csv without the index.
df.to_csv(data_dir.joinpath(slug, "timeseries.csv"), index=False)