# Setup

# Before running, set the parameters in the two cells marked `# 1` (data source) and `#2` (output path).

In [127]:
import json
import yaml
import requests
import pandas as pd

In [128]:
# to visualize progress bars
from tqdm.auto import tqdm
# hook tqdm into pandas; per the tqdm documentation this adds the progress_apply family of methods
tqdm.pandas()

In [129]:
# GitHub rate-limits API requests, so a personal access token is read from config.yml
# to be able to send a large number of them
with open("config.yml", 'r') as config_file:
    cfg = yaml.safe_load(config_file)

# the token is sent along in the Authorization header of the API requests
token = cfg['access_token']
headers = dict(Authorization='token ' + token)

In [130]:
# only confirm that a token was loaded; echoing the secret itself would store it in the saved notebook output
'<redacted>' if token else '<missing>'

'<redacted>'

In [141]:
# 1, select the data source by setting SOURCE_KEY to one of the keys of SOURCES
#    (the matching result_path still has to be selected at #2)

# For every supported source: the GitHub repository, the branch and the path of the
# estimates file inside that repository.
SOURCES = {
    # ETHZ Germany / Austria / Switzerland
    'ethz_deu': {'repo_name': 'covid-19-Re/dailyRe-Data',
                 'branch': 'master',
                 'file_path': 'DEU-estimates.csv'},
    'ethz_aut': {'repo_name': 'covid-19-Re/dailyRe-Data',
                 'branch': 'master',
                 'file_path': 'AUT-estimates.csv'},
    'ethz_che': {'repo_name': 'covid-19-Re/dailyRe-Data',
                 'branch': 'master',
                 'file_path': 'CHE-estimates.csv'},

    # TU Ilmenau Germany / Austria / Switzerland
    'ilmenau_deu': {'repo_name': 'Stochastik-TU-Ilmenau/COVID-19',
                    'branch': 'gh-pages',
                    'file_path': 'data/estimates/Germany_RKI_R.csv'},
    'ilmenau_aut': {'repo_name': 'Stochastik-TU-Ilmenau/COVID-19',
                    'branch': 'gh-pages',
                    'file_path': 'data/estimates/Austria_JHU_R.csv'},
    'ilmenau_che': {'repo_name': 'Stochastik-TU-Ilmenau/COVID-19',
                    'branch': 'gh-pages',
                    'file_path': 'data/estimates/Switzerland_JHU_R.csv'},

    # OWID
    # Note that OWID reports reproduction_rate only since 2020-11-13, but this script also
    # downloads older data. This older data has to be removed from the data-raw folder
    # before running the "process_historic_data_owid.r" script.
    'owid': {'repo_name': 'owid/covid-19-data',
             'branch': 'master',
             'file_path': 'public/data/owid-covid-data.csv'},

    # globalrt
    'globalrt': {'repo_name': 'crondonm/TrackingR',
                 'branch': 'main',
                 'file_path': 'Estimates-Database/database.csv'},

    # epiforecast
    'epiforecast': {'repo_name': 'epiforecasts/covid-rt-estimates',
                    'branch': 'master',
                    'file_path': 'national/cases/summary/rt.csv'},

    # zidatalab
    'zidatalab': {'repo_name': 'zidatalab/covid19dashboard',
                  'branch': 'master',
                  'file_path': 'data/plotdata/rwert_bund.json'},

    # RKI
    'rki': {'repo_name': 'robert-koch-institut/SARS-CoV-2-Nowcasting_und_-R-Schaetzung',
            'branch': 'main',
            'file_path': 'Nowcast_R_aktuell.csv'},
}

# the source whose history is downloaded in this run
SOURCE_KEY = 'rki'

# the names below are the ones the rest of the notebook works with
repo_name = SOURCES[SOURCE_KEY]['repo_name']
branch = SOURCES[SOURCE_KEY]['branch']
file_path = SOURCES[SOURCE_KEY]['file_path']


# Get commit history

In [142]:
# retrieve information about all commits that modified the file we want
# (the result is requested page by page until a page comes back empty)
all_commits = []

page = 0
while True:
    page += 1
    r = requests.get('https://api.github.com/repos/{}/commits'.format(repo_name),
                     params={'sha': branch, 'path': file_path,
                             'per_page': 100,  # largest page size, keeps the number of requests low
                             'page': page},
                     headers=headers,
                     timeout=30)  # do not hang forever on a stalled connection

    # fail loudly (bad token, rate limit, wrong repository, ...) instead of silently
    # continuing with an incomplete or empty commit history
    r.raise_for_status()

    commits_on_page = r.json()
    if not commits_on_page: break

    all_commits += commits_on_page

In [143]:
# preview only the first few entries; displaying the whole list floods the notebook output
all_commits[:3]

[{'sha': '0b807f9793f3be4162b04a87936b8dcc40b540c5',
  'node_id': 'C_kwDOFSsWgNoAKDBiODA3Zjk3OTNmM2JlNDE2MmIwNGE4NzkzNmI4ZGNjNDBiNTQwYzU',
  'commit': {'author': {'name': 'anderHeidenM',
    'email': 'AnderHeidenM@rki.de',
    'date': '2021-10-04T23:47:15Z'},
   'committer': {'name': 'Opendata',
    'email': 'wuenscheh@rki.de',
    'date': '2021-10-04T23:47:15Z'},
   'message': 'Update 2021-10-05',
   'tree': {'sha': '7e6fe3ec32f9935ab23a514834a873a2ddbc6fa8',
    'url': 'https://api.github.com/repos/robert-koch-institut/SARS-CoV-2-Nowcasting_und_-R-Schaetzung/git/trees/7e6fe3ec32f9935ab23a514834a873a2ddbc6fa8'},
   'url': 'https://api.github.com/repos/robert-koch-institut/SARS-CoV-2-Nowcasting_und_-R-Schaetzung/git/commits/0b807f9793f3be4162b04a87936b8dcc40b540c5',
   'comment_count': 0,
   'verification': {'verified': False,
    'reason': 'unsigned',
    'signature': None,
    'payload': None}},
  'url': 'https://api.github.com/repos/robert-koch-institut/SARS-CoV-2-Nowcasting_und_-R-

In [144]:
# collect the hash stored under 'sha' for every commit
all_shas = [commit['sha'] for commit in all_commits]

In [145]:
# author timestamp of every commit, parsed with pandas
raw_dates = [commit['commit']['author']['date'] for commit in all_commits]
commit_dates = list(map(pd.to_datetime, raw_dates))

In [146]:
# one row per commit: its hash next to its timestamp
df = pd.DataFrame(dict(sha=all_shas, commit_date=commit_dates))

In [147]:
# calendar day of each commit (time of day dropped)
# NOTE(review): the displayed commit_date values carry a +00:00 offset, so this looks like
# the UTC day - confirm that this is the intended day for commits made late in the evening
df['date'] = df['commit_date'].dt.date

In [148]:
# keep a single commit per day: the one with the latest timestamp
latest_per_day = df.groupby('date')['commit_date'].idxmax()
df = df.loc[latest_per_day]

In [149]:
# inspect the table of commits (sha, commit_date, date) that the download loop iterates over
df

Unnamed: 0,sha,commit_date,date
167,fd111bf04a5ff9c7cf4de24016d39ccc3bf705c2,2021-04-21 09:02:21+00:00,2021-04-21
166,bc83f162905b3f42cb704c66f1aaac558311c1ac,2021-04-22 06:51:34+00:00,2021-04-22
165,3ffce79ca26daa7c3dd9dda76d069b0706d4cfe3,2021-04-23 07:35:14+00:00,2021-04-23
164,fe4d431f287fcf182132a903bf5afce544ac4031,2021-04-24 09:50:36+00:00,2021-04-24
163,128c87a9e55f23d0e7f9ee161aa6847fd4d36198,2021-04-25 10:41:47+00:00,2021-04-25
...,...,...,...
4,06bda48473a514564a80f9c6be877ceaee3df649,2021-09-30 23:44:07+00:00,2021-09-30
3,6a8ef4bfc0e48db0b0ff7535bf3b3416bc5b4033,2021-10-01 23:48:22+00:00,2021-10-01
2,c6f88e3b99f4b0221bff287be69359080a4eaf52,2021-10-02 23:47:49+00:00,2021-10-02
1,1b0be38cbc5b1e44432d53b362e03d697a5c2b00,2021-10-03 23:47:03+00:00,2021-10-03


# Download and save files

In [151]:
# iterate over the rows of df and download the corresponding file
skipped_dates = []  # days whose file could not be downloaded, kept for inspection afterwards
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    # the file as it looked at this commit
    file_url = 'https://raw.githubusercontent.com/{}/{}/{}'.format(repo_name, row['sha'], file_path)
    try:      # try-except added only for globalrt because HTTP-Error occurred
        df_temp = pd.read_csv(file_url, low_memory=False)
    except OSError as err:
        # urllib's HTTPError/URLError are subclasses of OSError.
        # Skip this day completely: carrying on would save the df_temp of the previous
        # iteration under the current date (or raise a NameError on the very first row).
        skipped_dates.append(row['date'])
        tqdm.write('skipping {}: {}'.format(row['date'], err))
        continue
    # for zidatalab only (use instead of the read_csv call above):
    #df_temp = pd.read_json(file_url)
#2
    # result_path =  '../data-raw/ETHZ DEU/' + str(row['date']) + '_ethz_deu_raw.csv'
    # result_path =  '../data-raw/ETHZ AUT/' + str(row['date']) + '_ethz_aut_raw.csv'
    # result_path =  '../data-raw/ETHZ CHE/' + str(row['date']) + '_ethz_che_raw.csv'
    # result_path =  '../data-raw/Ilmenau DEU/' + str(row['date']) + '_ilmenau_deu_raw.csv'
    # result_path =  '../data-raw/Ilmenau AUT/' + str(row['date']) + '_ilmenau_aut_raw.csv'
    # result_path =  '../data-raw/Ilmenau CHE/' + str(row['date']) + '_ilmenau_che_raw.csv'
    # result_path =  '../data-raw/owid/' + str(row['date']) + '_owid_raw.csv'
    # result_path =  '../data-raw/globalrt/' + str(row['date']) + '_globalrt_raw.csv'
    # result_path = '../data-raw/epiforecast_au_ch_ger/' + str(row['date']) + '_epiforecast_raw.csv'
    # result_path = '../data-raw/zidatalab/' + str(row['date']) + '_zidatalab_raw.json'
    # NOTE(review): unlike every other pattern above, this file name has no '_' between the date
    # and the suffix - left unchanged because other scripts may rely on the name; confirm
    result_path = '../data-raw/rki-historic/' + str(row['date']) + 'rki_historic_raw.csv'
    # for owid only: filter only data from Germany:
    # df_temp = df_temp.loc[df_temp['iso_code'] == 'DEU']
    # for globalrt only: filter only data from Germany, Austria and Switzerland:
    # df_temp = df_temp.loc[(df_temp['Country/Region'] == 'Germany') | (df_temp['Country/Region'] == 'Austria') | (df_temp['Country/Region'] == 'Switzerland')]
    # for epiforecast:
    # df_temp = df_temp[(df_temp.iloc[:, 0] == 'Germany') | (df_temp.iloc[:, 0] == 'Austria') | (df_temp.iloc[:, 0] == 'Switzerland')]
    df_temp.to_csv(result_path, index=False)
    # for zidatalab only:
    # df_temp.to_json(result_path)

100%|██████████| 165/165 [00:34<00:00,  4.80it/s]
