## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

In [1]:
%load_ext lab_black

In [2]:
import os
import pathlib
from bs4 import BeautifulSoup

In [3]:
# Absolute path of the directory the notebook is executed from.
this_dir = pathlib.Path.cwd()

In [4]:
# Scraped output lives in a "data" folder next to the notebook.
data_dir = this_dir.joinpath("data")

In [5]:
import pytz
import glob
import requests
import json
import pandas as pd
from slugify import slugify
from datetime import datetime

## Download

Retrieve the page

In [6]:
# url = "https://services.arcgis.com/UHg8l1wC48WQyDSO/arcgis/rest/services/reportareadb_view/FeatureServer/0//query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&returnGeodetic=false&outFields=Report_Are%2C+Case_Count%2Csurvey_date&returnGeometry=true&returnCentroid=false&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pjson&token="

In [7]:
# County COVID-19 dashboard page; the case figures are scraped out of its HTML below.
url = "https://www.edcgov.us/Government/hhsa/covid19/Documents/dashboard.html#cases-casos"

In [8]:
# Fetch the dashboard HTML. The timeout keeps the run from hanging indefinitely
# on an unresponsive server, and raise_for_status() stops the notebook on an
# HTTP error instead of letting an error page flow into the parser.
r = requests.get(url, timeout=30)
r.raise_for_status()

In [9]:
# data = r.json()

In [10]:
# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(markup=r.content, features="html.parser")

## Parse

This text parse is brittle, since it depends on the page's paragraph order and wording, but it should do for now.

In [11]:
# Collect every <p> element; the report date is read from one of them below.
# NOTE(review): findAll is the legacy spelling of find_all — consider switching.
p = soup.findAll("p")

In [12]:
# The second paragraph carries the report date after the word "for "; keep that
# tail and drop the ": " separator so only the date text remains.
date_paragraph = p[1].text
dateline = date_paragraph.split("for ")[1].replace(": ", "")

In [13]:
# Parse a dateline such as "January 4, 2022" into a naive datetime at midnight.
timestamp = datetime.strptime(dateline, "%B %d, %Y")

In [14]:
timestamp

datetime.datetime(2022, 1, 4, 0, 0)

Now parse the case map.

In [15]:
# Locate the container of the cases-by-region map through its bilingual element id.
div = soup.find(
    "div", id="total-number-of-cases-by-region-número-total-de-casos-por-región"
)

In [16]:
# The map's data ships as a <script> element inside that div.
# NOTE(review): raises AttributeError if the div lookup above returned None
# (e.g. the page layout or the element id changed).
script = div.find("script")

In [17]:
# The script's first child is the JSON payload; decode it into nested dicts/lists.
data = json.loads(script.contents[0])

In [18]:
# Take the last entry of the payload's "calls" list — presumably the map legend;
# confirm against the page if the dashboard is rebuilt.
legend = data["x"]["calls"][-1]

In [19]:
# Pull the label strings: one HTML snippet per region, holding its name and case total.
labels = legend["args"][0]["labels"]

In [20]:
labels

['<strong>Camino, Pollock Pines, Kyburz</strong><br/>Total Cases: 1,033',
 '<strong>El Dorado Hills</strong><br/>Total Cases: 4,021',
 '<strong>El Dorado, Diamond Springs</strong><br/>Total Cases: 906',
 '<strong>North County</strong><br/>Total Cases: 1,034',
 '<strong>Greater Placerville Area</strong><br/>Total Cases: 3,375',
 '<strong>Shingle Springs, Cameron Park, Rescue</strong><br/>Total Cases: 3,323',
 '<strong>South County</strong><br/>Total Cases: 281',
 '<strong>Tahoe Region</strong><br/>Total Cases: 4,251']

In [21]:
# Spot-check: the area name is the text between the first ">" and the next "<".
labels[0].split(">")[1].split("<")[0]

'Camino, Pollock Pines, Kyburz'

In [22]:
# Spot-check: the case count follows ": "; strip its thousands separators.
labels[0].split(": ")[1].replace(",", "")

'1033'

In [23]:
# Accumulator for one record (dict) per region.
dict_list = []

In [24]:
# One record per legend label: the area name is the text inside <strong>…</strong>,
# and the case count follows ": " with its thousands separators removed.
dict_list.extend(
    dict(
        county="El Dorado",
        area=label.split(">")[1].split("<")[0],
        confirmed_cases=int(label.split(": ")[1].replace(",", "")),
        county_date=timestamp,
    )
    for label in labels
)

In [25]:
# Build a DataFrame from the parsed records for inspection.
df = pd.DataFrame(dict_list)

In [26]:
df

Unnamed: 0,county,area,confirmed_cases,county_date
0,El Dorado,"Camino, Pollock Pines, Kyburz",1033,2022-01-04
1,El Dorado,El Dorado Hills,4021,2022-01-04
2,El Dorado,"El Dorado, Diamond Springs",906,2022-01-04
3,El Dorado,North County,1034,2022-01-04
4,El Dorado,Greater Placerville Area,3375,2022-01-04
5,El Dorado,"Shingle Springs, Cameron Park, Rescue",3323,2022-01-04
6,El Dorado,South County,281,2022-01-04
7,El Dorado,Tahoe Region,4251,2022-01-04


We can slot this back in when the ArcGIS service is back up.

In [27]:
# for item in data["features"]:
#     timestamp = item["attributes"]["survey_date"]
#     timestamp = datetime.fromtimestamp((timestamp / 1000))
#     d = dict(
#         county="El Dorado",
#         area=item["attributes"]["Report_Are"],
#         confirmed_cases=item["attributes"]["Case_Count"],
#         county_date=timestamp,
#     )
#     dict_list.append(d)

Convert to dataframe

In [28]:
# Build the DataFrame that is vetted and exported below.
df = pd.DataFrame(dict_list)

In [29]:
# Normalise county_date to ISO "YYYY-MM-DD" strings. Going through
# pd.to_datetime first makes the cell safe to re-run: once the column holds
# strings, a bare .dt accessor would raise AttributeError.
df["county_date"] = pd.to_datetime(df["county_date"]).dt.strftime("%Y-%m-%d")

In [30]:
df

Unnamed: 0,county,area,confirmed_cases,county_date
0,El Dorado,"Camino, Pollock Pines, Kyburz",1033,2022-01-04
1,El Dorado,El Dorado Hills,4021,2022-01-04
2,El Dorado,"El Dorado, Diamond Springs",906,2022-01-04
3,El Dorado,North County,1034,2022-01-04
4,El Dorado,Greater Placerville Area,3375,2022-01-04
5,El Dorado,"Shingle Springs, Cameron Park, Rescue",3323,2022-01-04
6,El Dorado,South County,281,2022-01-04
7,El Dorado,Tahoe Region,4251,2022-01-04


Set the date

In [31]:
# Pacific time zone, used to date-stamp this run.
tz = pytz.timezone("America/Los_Angeles")

In [32]:
# Calendar date of this run in that time zone; it names the daily export file.
today = datetime.now(tz).date()

In [33]:
# Folder name for this county under the data directory.
slug = "el-dorado"

## Vet

In [34]:
# The legend lists eight regions; more rows than that means the parse picked up
# something unexpected. An explicit raise (rather than `assert`) keeps the check
# active under `python -O` and avoids a chained traceback.
if len(df) > 8:
    raise AssertionError("El Dorado's area scraper has extra rows")

In [35]:
# The legend lists eight regions; fewer rows than that means the parse lost some.
# An explicit raise (rather than `assert`) keeps the check active under
# `python -O` and avoids a chained traceback.
if len(df) < 8:
    raise AssertionError("El Dorado's area scraper is missing rows")

## Export

In [36]:
# Write today's snapshot to <data_dir>/<slug>/<today>.csv, without the index column.
daily_csv_path = data_dir / slug / f"{today}.csv"
df.to_csv(daily_csv_path, index=False)

## Combine

In [37]:
# Collect every CSV in this scraper's data folder except the combined
# timeseries file, which this section writes rather than reads.
csv_pattern = str(data_dir / slug / "*.csv")
csv_list = [
    path for path in glob.glob(csv_pattern) if not path.endswith("timeseries.csv")
]

In [38]:
# Load every snapshot into one list of frames. Files with "manual" in their name
# are read with an existing `date` column; all other snapshots take their `date`
# from the file name.
df_list = []
for csv_path in csv_list:
    csv_file = pathlib.Path(csv_path)
    # Test only the file name: a "manual" somewhere in the parent directories
    # must not send every snapshot down the manual branch.
    if "manual" in csv_file.name:
        frame = pd.read_csv(csv_file, parse_dates=["date"])
    else:
        frame = pd.read_csv(csv_file, parse_dates=["county_date"])
        # .stem strips the directory and the extension regardless of the
        # operating system's path separator.
        frame["date"] = csv_file.stem
    df_list.append(frame)

In [39]:
# Stack all snapshots, then order the rows by date and, within a date, by area.
combined = pd.concat(df_list)
df = combined.sort_values(by=["date", "area"])

In [40]:
# Write the combined history next to the daily snapshots, without the index column.
timeseries_path = data_dir / slug / "timeseries.csv"
df.to_csv(timeseries_path, index=False)