## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

In [2]:
import os
import pytz
import glob
import pathlib

this_dir = pathlib.Path(os.path.abspath(""))
data_dir = this_dir / "data"

In [3]:
import requests
import pandas as pd
import json
from datetime import datetime
from slugify import slugify



## Download

Retrieve the page

In [4]:
url = "https://services.arcgis.com/yq3FgOI44hYHAFVZ/arcgis/rest/services/covid19_city_all/FeatureServer/5/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&relationParam=&returnGeodetic=false&outFields=*&returnGeometry=true&returnCentroid=false&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&defaultSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pjson&token="

In [5]:
r = requests.get(url)

In [6]:
data = r.json()

## Parse

In [7]:
dict_list = []

In [8]:
for item in data["features"]:
    d = dict(
        county="San Mateo",
        area=item["attributes"]["place"],
        confirmed_cases=item["attributes"]["n"],
    )
    dict_list.append(d)

In [9]:
df = pd.DataFrame(dict_list)

In [10]:
df["confirmed_cases"] = df.confirmed_cases.fillna("<10")

Get timestamp

In [11]:
date_url = "https://services.arcgis.com/yq3FgOI44hYHAFVZ/arcgis/rest/services/covid19_city_all/FeatureServer/5?f=json"
date_r = requests.get(date_url)
date_data = date_r.json()

In [12]:
timestamp = date_data["editingInfo"]["lastEditDate"]

In [13]:
timestamp = datetime.fromtimestamp((timestamp / 1000))

In [14]:
latest_date = pd.to_datetime(timestamp).date()

In [15]:
df["county_date"] = latest_date

In [16]:
len(df)

30

## Vet

In [17]:
try:
    assert not len(df) > 30
except AssertionError:
    raise AssertionError("San Mateo County's scraper has more rows than before")

In [18]:
try:
    assert not len(df) < 30
except AssertionError:
    raise AssertionError("San Mateo County's scraper is missing rows")

## Export

Set date

In [19]:
tz = pytz.timezone("America/Los_Angeles")

In [20]:
today = datetime.now(tz).date()

In [21]:
slug = "san-mateo"

In [22]:
df.to_csv(data_dir / slug / f"{today}.csv", index=False)

## Combine

In [23]:
csv_list = [
    i
    for i in glob.glob(str(data_dir / slug / "*.csv"))
    if not str(i).endswith("timeseries.csv")
]

In [24]:
df_list = []
for csv in csv_list:
    if "manual" in csv:
        df = pd.read_csv(csv, parse_dates=["date"])
    else:
        file_date = csv.split("/")[-1].replace(".csv", "")
        df = pd.read_csv(csv, parse_dates=["county_date"])
        df["date"] = file_date
    df_list.append(df)

In [25]:
df = pd.concat(df_list).sort_values(["date", "area"])

## Combine Montara, Moss beach, El Granada
Montara and Moss beach now considered "Midcoast Region" (Feb 2022)

In [26]:
midcoast = df[df.area.isin(["Montara", "Moss Beach", "El Granada"])]

In [27]:
midcoast["confirmed_cases"] = midcoast["confirmed_cases"].apply(
    lambda x: 0 if x == "<10" else x.split(".")[0]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  midcoast["confirmed_cases"] = midcoast["confirmed_cases"].apply(


In [28]:
midcoast_agg = (
    midcoast.groupby(["county_date", "date"])["confirmed_cases"]
    .apply(lambda x: x.astype(int).sum())
    .reset_index()
)

In [29]:
midcoast_agg["county"] = "San Mateo"

In [30]:
midcoast_agg["area"] = "Midcoast Region"

In [31]:
final = pd.concat(
    [df[~df.area.isin(["Montara", "Moss Beach", "El Granada"])], midcoast_agg]
).sort_values(["date", "area"])

  final = pd.concat(


In [32]:
final

Unnamed: 0,county,area,confirmed_cases,county_date,date
1152,San Mateo,94002: Belmont,22,NaT,2020-04-30 00:00:00
1153,San Mateo,94005: Brisbane,<10,NaT,2020-04-30 00:00:00
1154,San Mateo,94010: Burlingame/Hillsborough,51,NaT,2020-04-30 00:00:00
1155,San Mateo,94014: Daly City/Colma,93,NaT,2020-04-30 00:00:00
1156,San Mateo,94015: Daly City,159,NaT,2020-04-30 00:00:00
...,...,...,...,...,...
4,San Mateo,San Carlos,5833,2023-02-03,2023-02-06
11,San Mateo,San Mateo,23347,2023-02-03,2023-02-06
17,San Mateo,South San Francisco,18593,2023-02-03,2023-02-06
25,San Mateo,West Menlo Park,665,2023-02-03,2023-02-06


In [33]:
final.to_csv(data_dir / slug / "timeseries.csv", index=False)