# Setup

# Before running, set the parameters at the two places marked with the numbers 1 and 2 in the code comments below

In [4]:
import json
import yaml
import requests
import pandas as pd

In [5]:
# to visualize progress bars
# (tqdm.auto chooses between the notebook widget and the plain console bar)
from tqdm.auto import tqdm
# hooks tqdm into pandas (adds the progress_* variants such as progress_apply)
tqdm.pandas()

In [6]:
# we need a github access token to perform a large number of requests (rate limit)
# config.yml is expected to hold the token under the top-level key `access_token`
with open("config.yml", 'r') as ymlfile:
    cfg = yaml.safe_load(ymlfile)

# An empty config file makes safe_load return None, and a missing or blank token would
# otherwise only show up later as failing API requests, so stop here with a clear message.
if not isinstance(cfg, dict) or not cfg.get('access_token'):
    raise ValueError("config.yml must define a non-empty 'access_token' entry")

# strip stray whitespace/newlines so the Authorization header stays well-formed
token = str(cfg['access_token']).strip()
headers = {'Authorization': 'token ' + token}

In [7]:
# Sanity check only: never display the token itself, because cell outputs are saved
# (and shared) together with the notebook.
'access token loaded' if token else 'access token missing'

'<redacted: a GitHub access token was displayed here; revoke and rotate it, and keep tokens out of saved outputs>'

In [8]:
# 1, choose the data source by setting SOURCE (below) to one of the keys of SOURCES

# Every entry maps a source label to (repo_name, branch, file_path):
#   repo_name - GitHub repository in the form 'owner/repository'
#   branch    - branch whose commit history is scanned
#   file_path - path of the estimates file inside the repository
SOURCES = {
    'ETHZ Germany': ('covid-19-Re/dailyRe-Data', 'master', 'DEU-estimates.csv'),
    'ETHZ Austria': ('covid-19-Re/dailyRe-Data', 'master', 'AUT-estimates.csv'),
    'ETHZ Switzerland': ('covid-19-Re/dailyRe-Data', 'master', 'CHE-estimates.csv'),
    'TU Ilmenau Germany': ('Stochastik-TU-Ilmenau/COVID-19', 'gh-pages', 'data/estimates/Germany_RKI_R.csv'),
    'TU Ilmenau Austria': ('Stochastik-TU-Ilmenau/COVID-19', 'gh-pages', 'data/estimates/Austria_JHU_R.csv'),
    'TU Ilmenau Switzerland': ('Stochastik-TU-Ilmenau/COVID-19', 'gh-pages', 'data/estimates/Switzerland_JHU_R.csv'),
    # Note that OWID reports reproduction_rate only since 2020-11-13, but this script will
    # also download older data. This older data has to be removed from the data-raw folder
    # before running the "process_historic_data_owid.r" script.
    'OWID': ('owid/covid-19-data', 'master', 'public/data/owid-covid-data.csv'),
    'globalrt': ('crondonm/TrackingR', 'main', 'Estimates-Database/database.csv'),
    'epiforecast': ('epiforecasts/covid-rt-estimates', 'master', 'national/cases/summary/rt.csv'),
    'zidatalab': ('zidatalab/covid19dashboard', 'master', 'data/plotdata/rwert_bund.json'),
}

# the source to download; an unknown label fails right here with a KeyError
SOURCE = 'zidatalab'

# The download cell (number 2) still has to be switched by hand to the reader,
# result_path and filter that belong to the chosen source.
repo_name, branch, file_path = SOURCES[SOURCE]


# Get commit history

In [9]:
# retrieve information about all commits that modified the file we want
# (the commits endpoint is queried page by page, restricted to `branch` and `file_path`)
all_commits = []

page = 0
while True:
    page += 1
    r = requests.get('https://api.github.com/repos/{}/commits'.format(repo_name),
                     # per_page=100 is the API maximum and keeps the number of requests low
                     params={'sha': branch, 'path': file_path, 'page': page, 'per_page': 100},
                     headers=headers,
                     timeout=30)

    # Fail loudly on HTTP errors (invalid token, rate limit, unknown repository):
    # silently leaving the loop would produce a truncated or empty commit history.
    r.raise_for_status()

    commits_on_page = r.json()
    # an empty page means we are past the last commit
    if not commits_on_page:
        break

    all_commits += commits_on_page

In [10]:
# Peek at the first commit record only: displaying the whole list would store hundreds
# of records (including author e-mail addresses) in the saved notebook.
all_commits[:1]

[{'sha': '8c5fcb2b2b3938a4de30065130194517beb9db2d',
  'node_id': 'MDY6Q29tbWl0MjU5MjM0ODU5OjhjNWZjYjJiMmIzOTM4YTRkZTMwMDY1MTMwMTk0NTE3YmViOWRiMmQ=',
  'commit': {'author': {'name': 'Edgar Steiger',
    'email': 'esteiger@zi.de',
    'date': '2021-09-14T03:33:13Z'},
   'committer': {'name': 'Edgar Steiger',
    'email': 'esteiger@zi.de',
    'date': '2021-09-14T03:33:13Z'},
   'message': 'Auto update Dashboard',
   'tree': {'sha': '83693cc09070882ca95669f7731f0dc1169573db',
    'url': 'https://api.github.com/repos/zidatalab/covid19dashboard/git/trees/83693cc09070882ca95669f7731f0dc1169573db'},
   'url': 'https://api.github.com/repos/zidatalab/covid19dashboard/git/commits/8c5fcb2b2b3938a4de30065130194517beb9db2d',
   'comment_count': 0,
   'verification': {'verified': False,
    'reason': 'unsigned',
    'signature': None,
    'payload': None}},
  'url': 'https://api.github.com/repos/zidatalab/covid19dashboard/commits/8c5fcb2b2b3938a4de30065130194517beb9db2d',
  'html_url': 'https://git

In [11]:
# pull the sha hash out of every commit record, in the order the records were collected
all_shas = list(entry['sha'] for entry in all_commits)

In [12]:
# author timestamp of every commit, parsed into pandas datetimes
commit_dates = list(map(pd.to_datetime,
                        (entry['commit']['author']['date'] for entry in all_commits)))

In [13]:
# one row per commit: its sha next to its timestamp
df = pd.DataFrame(dict(sha=all_shas, commit_date=commit_dates))

In [14]:
# calendar day of each commit (time of day dropped)
df['date'] = df['commit_date'].dt.date

In [15]:
# for every calendar day keep just the row holding the latest commit timestamp
latest_per_day = df.groupby('date')['commit_date'].idxmax()
df = df.loc[latest_per_day]

In [16]:
# show the resulting table: one row (the latest commit) per calendar day
df

Unnamed: 0,sha,commit_date,date
297,0b4ddcccfa9d12a382ef49cc8db8ee00206f40db,2020-11-17 16:08:09+00:00,2020-11-17
296,cd0befa33789f07f3a3ec24c6ae498c29ccbead1,2020-11-18 09:18:33+00:00,2020-11-18
295,e99943e4569be67f132903ddfb21cc35993bd0f1,2020-11-19 09:00:54+00:00,2020-11-19
294,842bc846112169bc4b705acf094506f48dd6f3b2,2020-11-20 15:18:25+00:00,2020-11-20
293,3cf1c2104248f295160f5b71496b1ce90f2f4032,2020-11-21 06:06:58+00:00,2020-11-21
...,...,...,...
4,136dd5f8db02ccad6046dd2a64b61707e881586f,2021-09-10 03:32:16+00:00,2021-09-10
3,dc8cbe34632dbb605b55ab30a4129c28d48c2276,2021-09-11 03:32:42+00:00,2021-09-11
2,f3cbc1f35e2b7110d3b536d1cde9e724d75d5d4c,2021-09-12 03:34:18+00:00,2021-09-12
1,e57f37a4e1d29cc4d0dd9b11a787009055183f90,2021-09-13 03:33:27+00:00,2021-09-13


# Download and save files

In [18]:
# walk through the selected commits and store the file as it existed at each of them
raw_url = 'https://raw.githubusercontent.com/{}/{}/{}'
for commit in tqdm(df.itertuples(index=False), total=len(df)):
    # address of the file at exactly this commit, and the commit day used in the file name
    file_url = raw_url.format(repo_name, commit.sha, file_path)
    day = str(commit.date)

    # 2, change comments where needed (reader, result_path, filter and writer)

    # reader for the csv sources:
    # try:      # try-except added only for globalrt because HTTP-Error occurred
    # df_temp = pd.read_csv(file_url, low_memory=False)
    # except:
        # pass
    # reader for zidatalab only:
    df_temp = pd.read_json(file_url)

    # result_path = '../data-raw/ETHZ DEU/' + day + '_ethz_deu_raw.csv'
    # result_path = '../data-raw/ETHZ AUT/' + day + '_ethz_aut_raw.csv'
    # result_path = '../data-raw/ETHZ CHE/' + day + '_ethz_che_raw.csv'
    # result_path = '../data-raw/Ilmenau DEU/' + day + '_ilmenau_deu_raw.csv'
    # result_path = '../data-raw/Ilmenau AUT/' + day + '_ilmenau_aut_raw.csv'
    # result_path = '../data-raw/Ilmenau CHE/' + day + '_ilmenau_che_raw.csv'
    # result_path = '../data-raw/owid/' + day + '_owid_raw.csv'
    # result_path = '../data-raw/globalrt/' + day + '_globalrt_raw.csv'
    # result_path = '../data-raw/epiforecast_au_ch_ger/' + day + '_epiforecast_raw.csv'
    result_path = '../data-raw/zidatalab/' + day + '_zidatalab_raw.json'

    # for owid only: filter only data from Germany:
    # df_temp = df_temp.loc[df_temp['iso_code'] == 'DEU']
    # for globalrt only: filter only data from Germany, Austria and Switzerland:
    # df_temp = df_temp.loc[(df_temp['Country/Region'] == 'Germany') | (df_temp['Country/Region'] == 'Austria') | (df_temp['Country/Region'] == 'Switzerland')]
    # for epiforecast:
    # df_temp = df_temp[(df_temp.iloc[:, 0] == 'Germany') | (df_temp.iloc[:, 0] == 'Austria') | (df_temp.iloc[:, 0] == 'Switzerland')]

    # writer for the csv sources:
    # df_temp.to_csv(result_path, index=False)
    # writer for zidatalab only:
    df_temp.to_json(result_path)

100%|██████████| 290/290 [02:03<00:00,  2.35it/s]
