## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

In [1]:
%load_ext lab_black

In [2]:
import os
import pytz
import glob
import pathlib

# abspath("") resolves to the current working directory.
this_dir = pathlib.Path(os.path.abspath(""))
# Output CSVs are written beneath ./data relative to that directory.
data_dir = this_dir / "data"

In [3]:
import requests
import pandas as pd
import json
from datetime import datetime
from slugify import slugify



## Download

Retrieve the data from the county's ArcGIS feature service

In [4]:
# ArcGIS FeatureServer query against layer 16: where=1=1 selects every row,
# outFields=* requests all fields, and f=pjson sets the response format.
url = "https://services7.arcgis.com/zaLZMEOGUnUT78nG/ArcGIS/rest/services/COVID-19%20Public%20Dashboard%20Data%20V3/FeatureServer/16/query?where=1%3D1&objectIds=&time=&resultType=none&outFields=*&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnDistinctValues=false&cacheHint=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&sqlFormat=none&f=pjson&token="

In [5]:
# Fail fast on a hung connection or an HTTP error status instead of handing
# an error page to the JSON parser in the next cell.
r = requests.get(url, timeout=60)
r.raise_for_status()

In [6]:
# Decode the response body as JSON.
data = r.json()

## Parse

Get latest from timeseries

In [7]:
# "features" holds the records; each one is read below via its "attributes".
latest = data["features"]

In [8]:
# One record per feature, keeping only the PLACE and CUMULTOTALCASES attributes.
dict_list = [
    {
        "area": feature["attributes"]["PLACE"],
        "confirmed_cases": feature["attributes"]["CUMULTOTALCASES"],
    }
    for feature in latest
]

In [9]:
# Build the frame from the list of per-area records.
df = pd.DataFrame(dict_list)

Trim down the dataframe based on the list (currently disabled — the full dataframe is copied as-is)

In [10]:
# def prep_df(df, list):
#     df = df[df["area"].isin(list)]
#     df["area"] = df["area"].astype(str)
#     df["area"] = df["area"].str[2:]
#     return df

In [11]:
# The prep_df filter is disabled, so every area from the feed is kept.
# trim_df = prep_df(df, cities_list)
trim_df = df.copy()  # work on a copy so `df` itself stays untouched

Convert area names to title case (the camel-case converter below is commented out)

In [12]:
# def change_case(str):
#     res = [str[0]]
#     for c in str[1:]:
#         if c in ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
#             res.append(" ")
#             res.append(c)
#         else:
#             res.append(c)

#     return "".join(res)

In [13]:
# Title-case every area name with the vectorized string accessor; unlike a
# per-row lambda it leaves missing values as NaN instead of raising.
trim_df["area"] = trim_df["area"].str.title()

Fix truncated names (the call that applies this is currently commented out)

In [14]:
def clean_city_names(s):
    """Expand the truncated place names handled by this notebook.

    Each known abbreviation is replaced only where it appears as whole words,
    so a name that is already complete (e.g. "San Bernardino") is returned
    unchanged instead of being expanded a second time.

    Args:
        s: Place name as a string.

    Returns:
        The name with known truncations expanded and surrounding whitespace
        stripped.
    """
    import re

    truncated_names = {
        "Green V Lake": "Green Valley Lake",
        "Lucerne V": "Lucerne Valley",
        "Newberry S": "Newberry Springs",
        "Pinon Hills": "Piñon Hills",
        "R C": "Rancho Cucamonga",
        "R Springs": "Running Springs",
        "San B": "San Bernardino",
        "Twentynine P": "Twentynine Palms",
    }
    for short, full in truncated_names.items():
        # \b on both sides keeps "San B" from matching inside "San Bernardino".
        s = re.sub(rf"\b{re.escape(short)}\b", full, s)
    return s.strip()

In [15]:
# trim_df["area"] = trim_df["area"].apply(clean_city_names)

Get timestamp

In [16]:
# Read RECENTDAY from the first record only; it is applied to every row below.
# NOTE(review): this raises IndexError if `latest` is empty.
timestamp = latest[0]["attributes"]["RECENTDAY"]

In [17]:
# timestamp = datetime.fromtimestamp((timestamp / 1000))

In [18]:
# A numeric RECENTDAY is treated as epoch milliseconds (the ArcGIS date
# encoding); pandas would otherwise read it as nanoseconds and land in 1970.
# String values are parsed exactly as before.
if isinstance(timestamp, (int, float)):
    latest_date = pd.to_datetime(timestamp, unit="ms").date()
else:
    latest_date = pd.to_datetime(timestamp).date()

In [19]:
# Stamp every row with the date derived from RECENTDAY.
trim_df["county_date"] = latest_date

In [20]:
# Guarded so re-running the cell does not fail because the column already
# exists; the first run still inserts it as the leading column.
if "county" not in trim_df.columns:
    trim_df.insert(0, "county", "San Bernardino")

In [38]:
# Remove the rows whose area is "Undetermined", modifying trim_df in place.
undetermined_rows = trim_df.index[trim_df["area"] == "Undetermined"]
trim_df.drop(index=undetermined_rows, inplace=True)

## Vet

In [39]:
# Explicit check instead of `assert`, so the guard still runs under `python -O`
# and the traceback is not chained to a message-less AssertionError.
if len(trim_df) < 64:
    raise AssertionError("San Bernardino County's scraper is missing rows")

In [40]:
# Explicit check instead of `assert`, so the guard still runs under `python -O`
# and the traceback is not chained to a message-less AssertionError.
if len(trim_df) > 64:
    raise AssertionError("San Bernardino County's scraper has more rows than before")

## Export

Set date

In [41]:
# Pacific time zone, used below to date the output file.
tz = pytz.timezone("America/Los_Angeles")

In [42]:
# Current calendar date in the Pacific time zone.
today = datetime.now(tz).date()

In [43]:
# Name of the subdirectory under data_dir that holds this county's files.
slug = "san-bernardino"

In [44]:
# Save today's snapshot as <data_dir>/<slug>/<YYYY-MM-DD>.csv without the index.
trim_df.to_csv(data_dir / slug / f"{today}.csv", index=False)

## Combine

In [45]:
# Collect every CSV in this county's folder except the combined timeseries file.
csv_list = []
for candidate in glob.glob(str(data_dir / slug / "*.csv")):
    if str(candidate).endswith("timeseries.csv"):
        continue
    csv_list.append(candidate)

In [46]:
df_list = []
for csv_path in csv_list:
    # Judge by file name only: splitting on "/" breaks on Windows paths, and a
    # parent folder containing "manual" must not reclassify every file.
    file_name = pathlib.Path(csv_path).name
    if "manual" in file_name:
        # Files with "manual" in the name are read with their own "date" column.
        frame = pd.read_csv(csv_path, parse_dates=["date"])
    else:
        # Other files take their "date" from the file name (<date>.csv).
        frame = pd.read_csv(csv_path, parse_dates=["county_date"])
        frame["date"] = file_name.replace(".csv", "")
    # NOTE(review): "manual" rows get parsed datetimes while the other rows get
    # plain strings in "date"; confirm the later sort handles that mix.
    df_list.append(frame)

In [47]:
# Stack every per-file frame, then order the rows by date and area.
df = pd.concat(df_list)
df = df.sort_values(by=["date", "area"])

In [48]:
# Write the combined frame to timeseries.csv in the same folder, without the index.
df.to_csv(data_dir / slug / "timeseries.csv", index=False)