This repository has been archived by the owner on Apr 18, 2021. It is now read-only.
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Stichting NICE downloader and parser (#55)
- Loading branch information
Showing
10 changed files
with
283 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
date,newIntake,intakeCount,intakeCumulative,icCount,diedCumulative,survivedCumulative | ||
2020-02-12,0,1,0,1,0,0 | ||
2020-02-21,1,2,1,2,0,0 | ||
2020-02-25,1,3,2,3,0,0 | ||
2020-02-28,0,3,2,3,0,0 | ||
2020-02-29,0,4,2,3,0,0 | ||
2020-03-01,1,5,3,4,0,0 | ||
2020-03-03,1,6,4,5,0,0 | ||
2020-03-04,3,9,7,7,0,1 | ||
2020-03-05,1,9,8,7,0,1 | ||
2020-03-06,3,12,11,10,0,1 | ||
2020-03-07,3,14,14,11,0,2 | ||
2020-03-08,5,18,19,15,2,2 | ||
2020-03-09,5,22,24,17,2,2 | ||
2020-03-10,8,30,32,17,2,2 | ||
2020-03-11,6,39,38,20,3,2 | ||
2020-03-12,22,60,60,26,4,2 | ||
2020-03-13,14,70,74,29,4,2 | ||
2020-03-14,17,87,91,37,4,4 | ||
2020-03-15,27,112,118,46,6,4 | ||
2020-03-16,32,140,150,53,10,6 | ||
2020-03-17,35,169,185,55,12,6 | ||
2020-03-18,51,220,236,59,15,8 | ||
2020-03-19,59,272,295,65,21,8 | ||
2020-03-20,86,352,381,66,25,9 | ||
2020-03-21,71,419,452,67,35,10 | ||
2020-03-22,71,466,523,67,41,12 | ||
2020-03-23,102,559,625,68,54,12 | ||
2020-03-24,108,638,733,70,61,16 | ||
2020-03-25,116,733,849,70,71,20 | ||
2020-03-26,100,816,949,70,87,27 | ||
2020-03-27,115,903,1064,71,100,32 | ||
2020-03-28,103,970,1167,70,118,33 | ||
2020-03-29,95,1022,1262,71,141,41 | ||
2020-03-30,131,1105,1393,70,160,49 | ||
2020-03-31,105,1165,1498,71,191,57 | ||
2020-04-01,76,1173,1574,72,209,67 | ||
2020-04-02,97,1220,1671,72,229,75 | ||
2020-04-03,82,1253,1753,71,258,81 | ||
2020-04-04,54,1248,1807,71,274,86 | ||
2020-04-05,64,1252,1871,71,292,90 | ||
2020-04-06,18,1216,1889,71,295,95 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
python download_rivm_map_data.py | ||
python merge_data.py | ||
python render_datasets.py | ||
python press_releases.py | ||
python press_releases.py | ||
python download_parse_nice.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,229 @@ | ||
"""Download and parse NICE data to CSV""" | ||
|
||
import csv | ||
import json | ||
import os | ||
import sys | ||
import shutil | ||
from datetime import datetime | ||
from pathlib import Path | ||
import numpy as np | ||
import pandas as pd | ||
import requests | ||
|
||
# JSONs to download. | ||
# Files with single object (dict) and files with single array (list(dict)) are supported out of the box | ||
NICE_URLS = ['https://www.stichting-nice.nl/covid-19/public/new-intake', # new IC patients with proven COVID-19, per day | ||
'https://www.stichting-nice.nl/covid-19/public/intake-count', # current total IC patients with proven COVID-19, per day | ||
'https://www.stichting-nice.nl/covid-19/public/intake-cumulative', # cumulative IC patients with proven COVID-19, per day | ||
'https://www.stichting-nice.nl/covid-19/public/ic-count', # current total of ICUs with at least one proven COVID-19 patient, per day | ||
'https://www.stichting-nice.nl/covid-19/public/died-and-survivors-cumulative'] # cumulative IC patients with proven COVID-19 that died and survived, per day | ||
|
||
# https://www.stichting-nice.nl/covid-19/public/global is also available, but doesn't have a date or other lastupdate indicator | ||
|
||
|
||
def download_json(urls):
    """Download a list of JSON endpoints, return dictionary with filename => data.

    The filename key is the last path segment of the URL. The special
    'died-and-survivors-cumulative' endpoint returns a pair of series
    (died first, survived second); it is split into two entries,
    'died-cumulative' and 'survived-cumulative', with each point's
    'value' key renamed accordingly.

    Exits the process on any download or parse failure.
    """
    data = {}
    for url in urls:
        name = url.rsplit('/', 1)[-1]
        print('Downloading {}'.format(url))
        try:
            # timeout so a stalled server cannot hang the pipeline forever;
            # raise_for_status so an HTTP error page is not fed to .json()
            resp = requests.get(url, timeout=60)
            resp.raise_for_status()
            payload = resp.json()
        except Exception as e:
            sys.exit('ERROR: could not download {} ({})'.format(url, e))

        if name == 'died-and-survivors-cumulative':
            # parse died and survivors to their own files
            died = []
            survived = []
            for i, series in enumerate(payload):
                for point in series:
                    if i == 0:
                        # renamed here so the merged CSV can later distinguish
                        # it from the (always empty) diedCumulative column
                        point['diedCumulativeNew'] = point.pop('value')
                        died.append(point)
                    else:
                        point['survivedCumulative'] = point.pop('value')
                        survived.append(point)

            data['died-cumulative'] = died
            data['survived-cumulative'] = survived
        else:
            data[name] = payload

    return data
|
||
# use this if you want to store the raw JSONs
def dump_json(data, dir):
    """Dump a dictionary of name => parsed JSON to timestamped files in dir.

    Each entry is written to '<dir>/<name>-<YYYYmmddHHMM>.json'; dir is
    created first if needed. Exits the process on any I/O failure.

    Returns True on success.
    """
    try:
        Path(dir).mkdir(parents=True, exist_ok=True)
    except Exception as e:
        sys.exit('ERROR: could not create dir {} ({})'.format(dir, e))

    # add current datetime to filename so repeated runs archive instead of overwrite
    dt = datetime.now().strftime('%Y%m%d%H%M')

    for name, content in data.items():
        # os.path.join instead of string concatenation: correct whether or
        # not dir carries a trailing separator
        file = os.path.join(dir, '{}-{}.json'.format(name, dt))
        print('Dumping raw to {}'.format(file))

        try:
            with open(file, 'w') as out:
                json.dump(content, out)
        except Exception as e:
            sys.exit('ERROR: could not dump json to {} ({})'.format(file, e))

    return True
|
||
def write_json_to_csv(data, dir):
    """Write a dictionary of name => parsed JSON to '<dir>/<name>.csv'.

    Two payload shapes are supported: a single object (dict -> header plus
    one data row) and an array of objects (list of dicts -> one row each).
    Column order follows the (first) object's key order. Exits the process
    on any I/O failure.
    """
    for name, payload in data.items():
        # os.path.join instead of string concatenation: correct whether or
        # not dir carries a trailing separator
        file = os.path.join(dir, name + '.csv')

        print('Writing json to {}'.format(file))

        try:
            with open(file, 'w', newline='') as out_f:
                if isinstance(payload, dict):
                    # single object: wrap in a list so DictWriter can take it
                    header = list(payload.keys())
                    lines = [payload]
                else:
                    # array of objects: take the header from the first one
                    header = list(payload[0].keys())
                    lines = payload

                writer = csv.DictWriter(out_f, fieldnames=header)
                writer.writeheader()
                writer.writerows(lines)
        except Exception as e:
            sys.exit('ERROR: could not write json to csv {} ({})'.format(file, e))
|
||
|
||
def merge_csvs(merge_list, dir, on, choose, dtype=int):
    """Merge CSV files from list in dir, join on 'on', select values
    with numpy func 'choose', convert values to dtype (default int).

    The first file is the left side; every following file is left-joined
    onto the accumulated result. When a column exists on both sides,
    pandas suffixes it '_x'/'_y'; 'choose' picks one value per row and the
    suffixed columns are dropped at the end. NaNs from the left join are
    forward-filled, with the first row defaulting to 0.

    Returns pandas DataFrame. Exits the process if any file cannot be read.
    """
    # first file is the base frame
    base_file = merge_list[0]
    try:
        a = pd.read_csv(os.path.join(dir, base_file))
    except Exception as e:
        sys.exit('ERROR: could not read {} ({})'.format(os.path.join(dir, base_file), e))

    # '_x'/'_y' columns collected across all merges, dropped once at the end
    labels = []

    # loop through the rest as fileb
    for fileb in merge_list[1:]:
        try:
            b = pd.read_csv(os.path.join(dir, fileb))
        except Exception as e:
            sys.exit('ERROR: could not read {} ({})'.format(os.path.join(dir, fileb), e))

        # merge base and b
        print('Merging {} to {}'.format(fileb, base_file))
        merged = a.merge(b, how='left', on=on)

        # collect labels; choose values; fix N/A; convert to dtype
        for k in b.keys():
            if k != on:
                xy = [k + '_x', k + '_y']
                for label in xy:
                    if label in merged.columns:
                        merged[k] = choose(merged[xy], axis=1)
                        if label not in labels:
                            labels.append(label)
                # forward fill NaN. First row = 0
                if np.isnan(merged.iloc[0, merged.columns.get_loc(k)]):
                    merged.iloc[0, merged.columns.get_loc(k)] = 0
                # .ffill() replaces the deprecated fillna(method='ffill');
                # astype honors the dtype parameter (was hard-coded to int)
                merged[k] = merged[k].ffill().astype(dtype)

        a = merged
    # drop _x and _y labels
    a.drop(labels=labels, axis=1, inplace=True)

    return a
|
||
def cleanup_processing(dir):
    """Remove everything inside dir (files, symlinks, subtrees); dir itself stays."""
    print('Cleaning up processing')
    for entry in (os.path.join(dir, child) for child in os.listdir(dir)):
        try:
            # unlink plain files and symlinks; recurse only into real directories
            if os.path.isfile(entry) or os.path.islink(entry):
                os.unlink(entry)
            elif os.path.isdir(entry):
                shutil.rmtree(entry)
        except Exception as e:
            print('ERROR: could not delete {}. ({})'.format(entry, e))
|
||
if __name__ == '__main__':
    # Pipeline: download the NICE JSON endpoints, archive the raw payloads,
    # convert each to a CSV, merge them into one dated table, write it to
    # data/, then delete the intermediate per-endpoint CSVs.

    # download jsons
    data = download_json(NICE_URLS)

    # dump jsons to raw_data, timestamped per run
    json_dir = 'raw_data/nice/'
    dump_json(data, json_dir)

    # write jsons to csv in a scratch dir (removed again at the end)
    processing_dir = 'raw_data/nice/processing/'
    Path(processing_dir).mkdir(parents=True, exist_ok=True)
    write_json_to_csv(data, processing_dir)

    # list of csvs to merge (join); names match the download_json keys
    merge_list = ('ic-count.csv', 'intake-count.csv', 'intake-cumulative.csv',
                  'new-intake.csv', 'died-cumulative.csv', 'survived-cumulative.csv')

    # join files on
    on = 'date'

    # numpy function for choosing which value, in case of duplicate columns
    choose = np.max

    # merge the CSVs
    new_csv = merge_csvs(merge_list,processing_dir,on,choose)

    # fix 0s in intakeCumulative: replace all with NaN, ffill, replace first NaN with 0
    new_csv['intakeCumulative'] = new_csv.intakeCumulative.replace(0, np.nan)
    new_csv['intakeCumulative'] = new_csv.intakeCumulative.fillna(method='ffill')
    new_csv['intakeCumulative'] = new_csv.intakeCumulative.fillna(0).astype(int)

    # clean up columns. diedCumulative and icCumulative never have data, so drop
    # them, then promote diedCumulativeNew (split off in download_json) to
    # diedCumulative.
    drop_columns = ['diedCumulative', 'icCumulative']
    new_csv.drop(labels=drop_columns, axis=1, inplace=True)
    new_csv.rename(columns={'diedCumulativeNew': 'diedCumulative'}, inplace=True)

    # final output file
    data_dir = 'data/'
    filename = 'nice_ic_by_day.csv'
    file_new = data_dir+filename

    # write to csv
    print('Writing merged data to {}'.format(file_new))
    try:
        new_csv.to_csv(file_new,index=False)
    except Exception as e:
        print('ERROR: could not write new csv to {} ({})'.format(file_new,e))

    # clean up processing
    cleanup_processing(processing_dir)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
[{"date": "2020-03-08", "diedCumulativeNew": 2}, {"date": "2020-03-11", "diedCumulativeNew": 3}, {"date": "2020-03-12", "diedCumulativeNew": 4}, {"date": "2020-03-15", "diedCumulativeNew": 6}, {"date": "2020-03-16", "diedCumulativeNew": 10}, {"date": "2020-03-17", "diedCumulativeNew": 12}, {"date": "2020-03-18", "diedCumulativeNew": 15}, {"date": "2020-03-19", "diedCumulativeNew": 21}, {"date": "2020-03-20", "diedCumulativeNew": 25}, {"date": "2020-03-21", "diedCumulativeNew": 35}, {"date": "2020-03-22", "diedCumulativeNew": 41}, {"date": "2020-03-23", "diedCumulativeNew": 54}, {"date": "2020-03-24", "diedCumulativeNew": 61}, {"date": "2020-03-25", "diedCumulativeNew": 71}, {"date": "2020-03-26", "diedCumulativeNew": 87}, {"date": "2020-03-27", "diedCumulativeNew": 100}, {"date": "2020-03-28", "diedCumulativeNew": 118}, {"date": "2020-03-29", "diedCumulativeNew": 141}, {"date": "2020-03-30", "diedCumulativeNew": 160}, {"date": "2020-03-31", "diedCumulativeNew": 191}, {"date": "2020-04-01", "diedCumulativeNew": 209}, {"date": "2020-04-02", "diedCumulativeNew": 229}, {"date": "2020-04-03", "diedCumulativeNew": 258}, {"date": "2020-04-04", "diedCumulativeNew": 274}, {"date": "2020-04-05", "diedCumulativeNew": 292}, {"date": "2020-04-06", "diedCumulativeNew": 295}] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
[{"date": "2020-02-12", "newIntake": 0, "diedCumulative": 0, "intakeCount": 1, "intakeCumulative": 0, "icCount": 1, "icCumulative": 0}, {"date": "2020-02-21", "newIntake": 0, "diedCumulative": 0, "intakeCount": 2, "intakeCumulative": 0, "icCount": 2, "icCumulative": 0}, {"date": "2020-02-25", "newIntake": 0, "diedCumulative": 0, "intakeCount": 3, "intakeCumulative": 0, "icCount": 3, "icCumulative": 0}, {"date": "2020-02-28", "newIntake": 0, "diedCumulative": 0, "intakeCount": 3, "intakeCumulative": 0, "icCount": 3, "icCumulative": 0}, {"date": "2020-02-29", "newIntake": 0, "diedCumulative": 0, "intakeCount": 4, "intakeCumulative": 0, "icCount": 3, "icCumulative": 0}, {"date": "2020-03-01", "newIntake": 0, "diedCumulative": 0, "intakeCount": 5, "intakeCumulative": 0, "icCount": 4, "icCumulative": 0}, {"date": "2020-03-03", "newIntake": 0, "diedCumulative": 0, "intakeCount": 6, "intakeCumulative": 0, "icCount": 5, "icCumulative": 0}, {"date": "2020-03-04", "newIntake": 0, "diedCumulative": 0, "intakeCount": 9, "intakeCumulative": 0, "icCount": 7, "icCumulative": 0}, {"date": "2020-03-05", "newIntake": 0, "diedCumulative": 0, "intakeCount": 9, "intakeCumulative": 0, "icCount": 7, "icCumulative": 0}, {"date": "2020-03-06", "newIntake": 0, "diedCumulative": 0, "intakeCount": 12, "intakeCumulative": 0, "icCount": 10, "icCumulative": 0}, {"date": "2020-03-07", "newIntake": 0, "diedCumulative": 0, "intakeCount": 14, "intakeCumulative": 0, "icCount": 11, "icCumulative": 0}, {"date": "2020-03-08", "newIntake": 0, "diedCumulative": 0, "intakeCount": 18, "intakeCumulative": 0, "icCount": 15, "icCumulative": 0}, {"date": "2020-03-09", "newIntake": 0, "diedCumulative": 0, "intakeCount": 22, "intakeCumulative": 0, "icCount": 17, "icCumulative": 0}, {"date": "2020-03-10", "newIntake": 0, "diedCumulative": 0, "intakeCount": 30, "intakeCumulative": 0, "icCount": 17, "icCumulative": 0}, {"date": "2020-03-11", "newIntake": 0, "diedCumulative": 0, "intakeCount": 39, "intakeCumulative": 
0, "icCount": 20, "icCumulative": 0}, {"date": "2020-03-12", "newIntake": 0, "diedCumulative": 0, "intakeCount": 60, "intakeCumulative": 0, "icCount": 26, "icCumulative": 0}, {"date": "2020-03-13", "newIntake": 0, "diedCumulative": 0, "intakeCount": 70, "intakeCumulative": 0, "icCount": 29, "icCumulative": 0}, {"date": "2020-03-14", "newIntake": 0, "diedCumulative": 0, "intakeCount": 87, "intakeCumulative": 0, "icCount": 37, "icCumulative": 0}, {"date": "2020-03-15", "newIntake": 0, "diedCumulative": 0, "intakeCount": 112, "intakeCumulative": 0, "icCount": 46, "icCumulative": 0}, {"date": "2020-03-16", "newIntake": 0, "diedCumulative": 0, "intakeCount": 140, "intakeCumulative": 0, "icCount": 53, "icCumulative": 0}, {"date": "2020-03-17", "newIntake": 0, "diedCumulative": 0, "intakeCount": 169, "intakeCumulative": 0, "icCount": 55, "icCumulative": 0}, {"date": "2020-03-18", "newIntake": 0, "diedCumulative": 0, "intakeCount": 220, "intakeCumulative": 0, "icCount": 59, "icCumulative": 0}, {"date": "2020-03-19", "newIntake": 0, "diedCumulative": 0, "intakeCount": 272, "intakeCumulative": 0, "icCount": 65, "icCumulative": 0}, {"date": "2020-03-20", "newIntake": 0, "diedCumulative": 0, "intakeCount": 352, "intakeCumulative": 0, "icCount": 66, "icCumulative": 0}, {"date": "2020-03-21", "newIntake": 0, "diedCumulative": 0, "intakeCount": 419, "intakeCumulative": 0, "icCount": 67, "icCumulative": 0}, {"date": "2020-03-22", "newIntake": 0, "diedCumulative": 0, "intakeCount": 466, "intakeCumulative": 0, "icCount": 67, "icCumulative": 0}, {"date": "2020-03-23", "newIntake": 0, "diedCumulative": 0, "intakeCount": 559, "intakeCumulative": 0, "icCount": 68, "icCumulative": 0}, {"date": "2020-03-24", "newIntake": 0, "diedCumulative": 0, "intakeCount": 638, "intakeCumulative": 0, "icCount": 70, "icCumulative": 0}, {"date": "2020-03-25", "newIntake": 0, "diedCumulative": 0, "intakeCount": 733, "intakeCumulative": 0, "icCount": 70, "icCumulative": 0}, {"date": "2020-03-26", 
"newIntake": 0, "diedCumulative": 0, "intakeCount": 816, "intakeCumulative": 0, "icCount": 70, "icCumulative": 0}, {"date": "2020-03-27", "newIntake": 0, "diedCumulative": 0, "intakeCount": 903, "intakeCumulative": 0, "icCount": 71, "icCumulative": 0}, {"date": "2020-03-28", "newIntake": 0, "diedCumulative": 0, "intakeCount": 970, "intakeCumulative": 0, "icCount": 70, "icCumulative": 0}, {"date": "2020-03-29", "newIntake": 0, "diedCumulative": 0, "intakeCount": 1022, "intakeCumulative": 0, "icCount": 71, "icCumulative": 0}, {"date": "2020-03-30", "newIntake": 0, "diedCumulative": 0, "intakeCount": 1105, "intakeCumulative": 0, "icCount": 70, "icCumulative": 0}, {"date": "2020-03-31", "newIntake": 0, "diedCumulative": 0, "intakeCount": 1165, "intakeCumulative": 0, "icCount": 71, "icCumulative": 0}, {"date": "2020-04-01", "newIntake": 0, "diedCumulative": 0, "intakeCount": 1173, "intakeCumulative": 0, "icCount": 72, "icCumulative": 0}, {"date": "2020-04-02", "newIntake": 0, "diedCumulative": 0, "intakeCount": 1220, "intakeCumulative": 0, "icCount": 72, "icCumulative": 0}, {"date": "2020-04-03", "newIntake": 0, "diedCumulative": 0, "intakeCount": 1253, "intakeCumulative": 0, "icCount": 71, "icCumulative": 0}, {"date": "2020-04-04", "newIntake": 0, "diedCumulative": 0, "intakeCount": 1248, "intakeCumulative": 0, "icCount": 71, "icCumulative": 0}, {"date": "2020-04-05", "newIntake": 0, "diedCumulative": 0, "intakeCount": 1252, "intakeCumulative": 0, "icCount": 71, "icCumulative": 0}, {"date": "2020-04-06", "newIntake": 0, "diedCumulative": 0, "intakeCount": 1216, "intakeCumulative": 0, "icCount": 71, "icCumulative": 0}] |
Oops, something went wrong.