In [11]:
import requests
from pyquery import PyQuery as pq
import simplejson as json
from tqdm import tqdm
# import pandas as pd     # so so so so so slow
from openpyxl import load_workbook

import os
from glob import glob
import csv

In [12]:
# Output locations. Summaries go under the web root when that directory is
# present on this machine; otherwise they land next to the notebook.
reportdir = "/var/www/html/misc/20200417-covid-county-analysis/"   # Blank or ending in a slash
reportdir = reportdir if os.path.exists(reportdir) else ""

xlsxdir, csvdir = "xlsx/", "csv/"
summarydir = f"{reportdir}white-house-reports/"

# Make sure every working directory is there before anything is written.
for targetdir in (xlsxdir, csvdir, summarydir):
    os.makedirs(targetdir, exist_ok=True)

In [13]:
# Fetch the dataset landing page; later cells pull the attachment list out of
# a JSON object embedded in this HTML.
baseurl = "https://healthdata.gov"
starturl = "https://healthdata.gov/Health/COVID-19-Community-Profile-Report/gqxm-d9w9"
r = requests.get(starturl, timeout=60)   # requests waits forever by default; bound it so a stalled server can't hang the run
r.raise_for_status()   # stop here on an HTTP error rather than trying to parse an error page
html = r.text

In [14]:
# Find the right JSON object: the first line of the page containing the "view" payload.
for row in html.splitlines():
    if '{"view":' in row:
        break
else:
    # Without this branch `row` would silently end up as the last line of the
    # page (or be undefined for an empty page) and fail later with an
    # unrelated-looking JSON parsing error.
    raise ValueError('Could not find a line containing {"view": in the downloaded page')

In [15]:
# Parse the matched line as JSON; this requires the whole line to be one JSON document.
rawdata = json.loads(row)

In [16]:
# Grab just what we want: the attachment list under the top-level "view" object.
# Each entry is read below for its 'name' and 'href' keys.
entries = rawdata['view']['attachments']

In [17]:
# Download every attached Excel report we do not already hold, either as the
# original workbook in xlsxdir or as its converted CSV in csvdir.
# `newfiles` counts successful downloads and gates the reparse cells below.
downloadtimeout = 120   # seconds; requests would otherwise wait indefinitely
newfiles = 0
for entry in tqdm(entries):
    # The name comes from the remote page: keep only its final path component
    # so it cannot point outside xlsxdir.
    basefilename = os.path.basename(entry['name'])
    if not basefilename.endswith(".xlsx"):   # Get just the Excel files (matches the later "*.xlsx" glob)
        continue
    targetfilename = xlsxdir + basefilename
    if os.path.exists(targetfilename):    # we have already downloaded this
        continue
    nameparts = basefilename.split("_")
    if len(nameparts) < 2:
        # The date is read as the second-to-last "_"-separated piece of the name.
        print(f"Skipping {basefilename}: cannot find a date in the file name")
        continue
    filedate = nameparts[-2]
    if os.path.exists(csvdir + filedate + ".csv"):    # we already have the CSV
        continue
    targeturl = baseurl + entry['href']
    try:
        r = requests.get(targeturl, timeout=downloadtimeout)
    except requests.RequestException as err:
        # One failed transfer should not abort the remaining downloads.
        print(f"Error downloading {basefilename} from {targeturl}: {err}")
        continue
    if r.status_code != 200:
        print(f"Error downloading {basefilename} from {targeturl}")
        continue
    # Write under a temporary name and rename at the end, so an interrupted
    # run never leaves a truncated workbook that looks like a finished download.
    partfilename = targetfilename + ".part"
    with open(partfilename, "wb") as outfile:
        outfile.write(r.content)
    os.replace(partfilename, targetfilename)
    newfiles += 1
print(f"{newfiles} new file(s) found")

100%|███████████████████████████████████████████████████████████████████████████████| 525/525 [00:00<00:00, 834.63it/s]

1 new file(s) found





In [18]:
# Convert the "Counties" sheet of each downloaded workbook into csvdir/<date>.csv,
# skipping workbooks whose CSV already exists.
if newfiles > 0:     # If we need to reparse everything:
    excelfiles = glob(xlsxdir + "*.xlsx")
    for excelfilename in tqdm(excelfiles):
        # The date is the second-to-last "_"-separated piece of the file name.
        filedate = excelfilename.split("_")[-2]
        csvfilename = csvdir + filedate + ".csv"
        if os.path.exists(csvfilename):
            continue
        # NOTE(review): load_workbook(..., read_only=True) would likely be much
        # faster here, but read-only mode relies on the dimensions stored in
        # the file -- confirm the output is unchanged before switching.
        workbook = load_workbook(filename=excelfilename)
        countytab = workbook["Counties"]
        sheetrows = countytab.iter_rows(values_only=True)
        next(sheetrows, None)   # Skip first row
        # Write to a temporary name and rename when complete: a conversion that
        # dies halfway must not leave a partial CSV that the existence check
        # above would accept as finished on every later run.
        partfilename = csvfilename + ".part"
        with open(partfilename, "w", newline="", encoding="utf-8") as outfile:
            writer = csv.writer(outfile)
            writer.writerows(sheetrows)
        os.replace(partfilename, csvfilename)


100%|████████████████████████████████████████████████████████████████████████████████| 263/263 [35:53<00:00,  8.19s/it]


In [19]:
# Build masterdict[state][fips][filedate] -> one record per county per report,
# reading the per-date CSVs in sorted file-name order.
if newfiles > 0:     # If we need to reparse everything:
    masterdict = {}
    csvfiles = sorted(glob(csvdir + "*.csv"))
    for csvfilename in tqdm(csvfiles):
        # The report date is the file name without directory or extension.
        basefilename = os.path.basename(csvfilename)
        filedate = os.path.splitext(basefilename)[0]
        with open(csvfilename, "r", encoding="utf-8") as infile:
            reader = csv.DictReader(infile)
            for row in reader:
                fips = row['FIPS code'].zfill(5)   # left-pad with zeros to five characters
                state = row['State Abbreviation']
                county = row['County']
                # Blank (or missing) counts are treated as zero.
                cases = int(row['Cumulative cases'] or 0)
                deaths = int(row['Cumulative deaths'] or 0)
                line = {
                    "fips": fips,
                    "state": state,
                    "county": county,
                    "filedate": filedate,
                    "cases": cases,
                    "deaths": deaths
                }
                masterdict.setdefault(state, {}).setdefault(fips, {})[filedate] = line

100%|████████████████████████████████████████████████████████████████████████████████| 263/263 [00:24<00:00, 10.80it/s]


In [20]:
# Write one summary CSV per state: a header row, then every stored record,
# ordered by FIPS code and, within a county, in the order the dates were stored.
if newfiles > 0:     # If we need to reparse everything:
    # Take the column names from the first stored record instead of relying on
    # loop variables left behind by an earlier cell (which do not exist at all
    # when no rows were read).
    headers = next(
        (
            list(line.keys())
            for counties in masterdict.values()
            for dates in counties.values()
            for line in dates.values()
        ),
        [],
    )
    for state in tqdm(masterdict):
        with open(summarydir + state + ".csv", "w", newline="", encoding="utf-8") as outfile:
            writer = csv.writer(outfile)
            writer.writerow(headers)
            for fips in sorted(masterdict[state]):
                for filedate in masterdict[state][fips]:
                    writer.writerow(list(masterdict[state][fips][filedate].values()))

100%|██████████████████████████████████████████████████████████████████████████████████| 52/52 [00:01<00:00, 27.96it/s]
