# Setup

In [1]:
import json
import os

import pandas as pd
import requests
import yaml

In [2]:
# to visualize progress bars
from tqdm.auto import tqdm
# registers tqdm's progress_* methods (e.g. progress_apply) on pandas objects
tqdm.pandas()

In [45]:
# authenticated requests are needed to stay within the GitHub API rate limit,
# so the personal access token is read from the local config.yml file
with open("config.yml", 'r') as config_file:
    cfg = yaml.safe_load(config_file)

token = cfg['access_token']
headers = dict(Authorization='token ' + token)

In [46]:
# sanity check only: never echo the token itself, because cell outputs are stored in the notebook file
# (the token that used to be displayed here must be treated as leaked and revoked on GitHub)
bool(token)

In [3]:
# optional override: prefer a token supplied via the GITHUB_TOKEN environment variable
# never paste the token into the notebook: cells and their outputs are saved (and committed) with the file
# the token that used to be hardcoded here must be treated as leaked and revoked on GitHub
env_token = os.environ.get('GITHUB_TOKEN')
if env_token:
    token = env_token
    headers = {'Authorization': 'token ' + token}

In [4]:
# 1, change comments where needed: uncomment the block of the source to download and comment out all others

# for ETHZ Germany:
# repo_name = 'covid-19-Re/dailyRe-Data'
# branch = 'master'
# file_path = 'DEU-estimates.csv'

# for ETHZ Austria:
# repo_name = 'covid-19-Re/dailyRe-Data'
# branch = 'master'
# file_path = 'AUT-estimates.csv'

# for ETHZ Switzerland:
# repo_name = 'covid-19-Re/dailyRe-Data'
# branch = 'master'
# file_path = 'CHE-estimates.csv'

# for TU Ilmenau Germany
# repo_name = 'Stochastik-TU-Ilmenau/COVID-19'
# branch = 'gh-pages'
# file_path = 'data/estimates/Germany_RKI_R.csv'

# for TU Ilmenau Austria
# repo_name = 'Stochastik-TU-Ilmenau/COVID-19'
# branch = 'gh-pages'
# file_path = 'data/estimates/Austria_JHU_R.csv'

# for TU Ilmenau Switzerland
# repo_name = 'Stochastik-TU-Ilmenau/COVID-19'
# branch = 'gh-pages'
# file_path = 'data/estimates/Switzerland_JHU_R.csv'

# for OWID:
# repo_name = 'owid/covid-19-data'
# branch = 'master'
# file_path = 'public/data/owid-covid-data.csv'
# Note: OWID has reported reproduction_rate only since 2020-11-13, but this script also downloads older data. The older files have to be removed from the data-raw folder before running the "process_historic_data_owid.r" script.

# for globalrt (currently the active selection):
# NOTE: globalrt files are downloaded by the dedicated loop in the "globalrt" section, which splits them by country
repo_name = 'crondonm/TrackingR'
branch = 'main'
file_path = 'Estimates-Database/database.csv'

# for epiforecast:
# repo_name = 'epiforecasts/covid-rt-estimates'
# branch = 'master'
# file_path = 'national/cases/summary/rt.csv'

# for zidatalab:
# repo_name = 'zidatalab/covid19dashboard'
# branch = 'master'
# file_path = 'data/plotdata/rwert_bund.json'

# for RKI:
# repo_name = 'robert-koch-institut/SARS-CoV-2-Nowcasting_und_-R-Schaetzung'
# branch = 'main'
# file_path = 'Nowcast_R_aktuell.csv'

# for Swiss Data Science Center (SDSC): download the files manually from the GitLab repo: https://renkulab.io/gitlab/covid-19/covid-19-forecast/-/tree/master/data/JHU/prediction
# the repo is very large, so try to download only the prediction folder: https://bytenota.com/git-how-to-clone-a-specific-directory-from-a-git-repository/

# Get commit history

In [5]:
# retrieve information about all commits on `branch` that modified `file_path`
# the GitHub API returns the commits in pages; keep requesting pages until an empty one comes back
all_commits = []

page = 1
while True:
    r = requests.get('https://api.github.com/repos/{}/commits'.format(repo_name),
                     # per_page=100 is the maximum page size and keeps the number of requests low
                     params={'sha': branch, 'path': file_path, 'per_page': 100, 'page': page},
                     headers=headers,
                     timeout=30)

    # fail loudly on HTTP errors (bad token, rate limit, wrong repo name, ...):
    # silently stopping here would leave an incomplete commit history behind
    r.raise_for_status()

    commits_page = r.json()
    if not commits_page:
        # an empty page means that all commits have been retrieved
        break

    all_commits.extend(commits_page)
    page += 1

In [6]:
# collect the sha of every commit, in the order the commits were retrieved
all_shas = list(map(lambda commit: commit['sha'], all_commits))
all_shas

['a6dac89653452dff1584d984e73757509770e379',
 '341c7b8b97cfdb016d656343073131aac8221e6f',
 'df8d5f1e452ceb5e8e495e566c92c0a6f381637c',
 '494c9f456853d8df8d98412bd300994f68a27f84',
 'dc83828c33abe9e057078aa43139d5d261f97b53',
 'f56c0bdc48d6c5fef0d901053d47e9d4268e5cae',
 '5e6c20a0a8ab22f023225a780697c3f8f0d58d08',
 '15b8d1c5089499b2925ef810af803e2b1b672c34',
 '0688fdc5569afd13a296c59cbd1d944367d7d16b',
 '76e1fb6f560b5931c5d039697e8afde38a4964bf',
 '6547b6da71e62ec8e74646edd05b76ade58b25a8',
 'cc05ad95fc5ad90b5fabdbb36e07f988820f8b1f',
 '0abea4bab1fb1ec7b02d9eb46e1c527d49202f43',
 '06c81c054766df227481b23ae43c6a569abf227b',
 'e48dd2aff36ff4560ae10c799397563a72d8451b',
 '64c545bf2f7ad1b23e7115cb81fd29b53e7ddcd6',
 '8e560f081c73a75ba40b5861007615ca300d410b',
 '70b2884d4cef0f0ab989390560a645e61b8b7730',
 '6e5b002c1426e7e1668500de5020672b8691270f',
 'e880ab37cef07204e41ff0f0e616b2f1496b5e47',
 'ab0f71a1d7fb5b32e14ed335e6b575c00fe4f423',
 '40c2d030bccb3e6feb18aaf50ac6b65bdef7367a',
 '3994feda

# Download and save files

## All but globalrt

In [51]:
# author timestamp of each commit, parsed with pandas
commit_dates = []
for commit in all_commits:
    author_date = commit['commit']['author']['date']
    commit_dates.append(pd.to_datetime(author_date))

In [52]:
# one row per commit: its sha and its timestamp
df = pd.DataFrame(dict(sha=all_shas, commit_date=commit_dates))

In [53]:
# calendar day of each commit (time of day dropped)
df['date'] = df['commit_date'].dt.date

In [54]:
# keep a single row per calendar day: the commit with the latest timestamp on that day
latest_per_day = df.groupby('date')['commit_date'].idxmax()
df = df.loc[latest_per_day]

In [55]:
# preview the selected commits (one row per day)
df

Unnamed: 0,sha,commit_date,date
434,0b4ddcccfa9d12a382ef49cc8db8ee00206f40db,2020-11-17 16:08:09+00:00,2020-11-17
433,cd0befa33789f07f3a3ec24c6ae498c29ccbead1,2020-11-18 09:18:33+00:00,2020-11-18
432,e99943e4569be67f132903ddfb21cc35993bd0f1,2020-11-19 09:00:54+00:00,2020-11-19
431,842bc846112169bc4b705acf094506f48dd6f3b2,2020-11-20 15:18:25+00:00,2020-11-20
430,3cf1c2104248f295160f5b71496b1ce90f2f4032,2020-11-21 06:06:58+00:00,2020-11-21
...,...,...,...
4,e99494fc123b1c648924531caacd2bf2c9ea6c76,2022-01-27 03:47:47+00:00,2022-01-27
3,e5c0c262bfe1f63e2a5c1756baaa22070a665ffd,2022-01-28 03:45:55+00:00,2022-01-28
2,b19e98fddf3e03af4addc79a854d4f352075c3b2,2022-01-29 04:47:11+00:00,2022-01-29
1,fb7476f54374be7787bbc4dedc6b09fe0eeb6c90,2022-01-30 03:51:48+00:00,2022-01-30


In [56]:
# iterate over the rows of df and download the file as it was at each selected commit
# the input/output format follows the extension of file_path: zidatalab publishes JSON, every other source CSV
is_json = file_path.endswith('.json')

for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    url = 'https://raw.githubusercontent.com/{}/{}/{}'.format(repo_name, row['sha'], file_path)
    if is_json:
        df_temp = pd.read_json(url)
    else:
        df_temp = pd.read_csv(url, low_memory=False)

    # 2, change comments where needed: the output location must match the source selected in step 1
    # result_path =  '../data-raw/ETHZ DEU/' + str(row['date']) + '_ethz_deu_raw.csv'
    # result_path =  '../data-raw/ETHZ AUT/' + str(row['date']) + '_ethz_aut_raw.csv'
    # result_path =  '../data-raw/ETHZ CHE/' + str(row['date']) + '_ethz_che_raw.csv'
    # result_path =  '../data-raw/Ilmenau DEU/' + str(row['date']) + '_ilmenau_deu_raw.csv'
    # result_path =  '../data-raw/Ilmenau AUT/' + str(row['date']) + '_ilmenau_aut_raw.csv'
    # result_path =  '../data-raw/Ilmenau CHE/' + str(row['date']) + '_ilmenau_che_raw.csv'
    # result_path =  '../data-raw/owid/' + str(row['date']) + '_owid_raw.csv'
    # result_path = '../data-raw/epiforecast_au_ch_ger/' + str(row['date']) + '_epiforecast_raw.csv'
    result_path = '../data-raw/zidatalab/' + str(row['date']) + '_zidatalab_raw.json'
    # result_path = '../data-raw/rki-historic/' + str(row['date']) + 'rki_historic_raw.csv'

    # guard against a result_path that was not switched together with the source
    assert result_path.endswith('.json') == is_json, 'result_path does not match the format of file_path'

    # for owid only: filter only data from Germany:
    # df_temp = df_temp.loc[df_temp['iso_code'] == 'DEU']
    # for globalrt only: filter only data from Germany, Austria and Switzerland:
    # df_temp = df_temp.loc[(df_temp['Country/Region'] == 'Germany') | (df_temp['Country/Region'] == 'Austria') | (df_temp['Country/Region'] == 'Switzerland')]
    # for epiforecast:
    # df_temp = df_temp[(df_temp.iloc[:, 0] == 'Germany') | (df_temp.iloc[:, 0] == 'Austria') | (df_temp.iloc[:, 0] == 'Switzerland')]

    # make sure the output folder exists before writing into it
    os.makedirs(os.path.dirname(result_path), exist_ok=True)

    # write exactly once, in the same format that was read
    # (previously a CSV was written first and then overwritten by the JSON export)
    if is_json:
        df_temp.to_json(result_path)
    else:
        df_temp.to_csv(result_path, index=False)

100%|██████████| 426/426 [03:36<00:00,  1.97it/s]


## globalrt

In [None]:
# iterate over all commits and store, per country, the rows of the globalrt database at that commit
# the output file is named after the (unique) last_updated value found in the data of that country
country_codes = {"Germany": "DE", "Austria": "AT", "Switzerland": "CH"}

for i, sha in enumerate(tqdm(all_shas)):
    # NOTE(review): position 269 was skipped in the original code without explanation; all_shas is
    # indexed by position, so this index may refer to a different commit whenever the history
    # changes - confirm which commit was meant and skip it by sha instead
    if i == 269:
        continue

    url = "https://raw.githubusercontent.com/{}/{}/{}".format(repo_name, sha, file_path)
    try:
        df_temp = pd.read_csv(url, low_memory=False)
    except Exception as err:
        # skip this commit entirely: carrying on would silently re-use the data of the previous
        # commit (or fail with a NameError if the very first download fails)
        print("download failed for commit: sha = " + sha + " (" + str(err) + ")")
        continue

    for country, code in country_codes.items():
        df_country = df_temp.loc[df_temp["Country/Region"] == country]
        dates_country = df_country.loc[:, "last_updated"].unique()
        if len(dates_country) == 1:
            country_path = "../data-raw/globalrt " + code + "/" + dates_country[0] + "_globalrt_raw.csv"
            # make sure the output folder exists before writing into it
            os.makedirs(os.path.dirname(country_path), exist_ok=True)
            df_country.to_csv(country_path, index=False)
        else:
            # zero or several last_updated values: the file name would be ambiguous, so nothing is written
            print("last_updated for " + country + " not unique in commit: sha = " + sha)
            print(dates_country)