# Setup

In [1]:
import json
import os

import pandas as pd
import requests
import yaml

In [2]:
# progress bars for long-running loops; tqdm.pandas() additionally registers
# tqdm's progress_* methods (e.g. progress_apply) on pandas objects
from tqdm.auto import tqdm
tqdm.pandas()

In [None]:
# The GitHub API rate-limits requests, and this notebook issues a large number of
# them, so an access token is read from config.yml (key: access_token).
with open("config.yml") as ymlfile:
    cfg = yaml.safe_load(ymlfile)

token = cfg['access_token']
# authorization header sent with every GitHub API request
headers = dict(Authorization='token ' + token)

In [46]:
# Confirm that a token is available WITHOUT echoing it: cell outputs are stored in
# the notebook file, so displaying the secret leaks it to everyone who can read the file.
bool(token)

True

In [4]:
# Optional override: take the token from the GITHUB_TOKEN environment variable.
# Never paste a token into the notebook. The token that used to be hard-coded in
# this cell is part of the file history and must be revoked on GitHub.
env_token = os.environ.get('GITHUB_TOKEN')
if env_token:
    token = env_token
    headers = {'Authorization': 'token ' + token}

In [36]:
# 1. Choose the data source by setting SOURCE (below) to one of the keys of SOURCES.
#    Each entry names the GitHub repository, the branch and the path of the file
#    whose commit history is downloaded.
SOURCES = {
    # ETHZ
    'ethz_deu': {'repo_name': 'covid-19-Re/dailyRe-Data',
                 'branch': 'master',
                 'file_path': 'DEU-estimates.csv'},
    'ethz_aut': {'repo_name': 'covid-19-Re/dailyRe-Data',
                 'branch': 'master',
                 'file_path': 'AUT-estimates.csv'},
    'ethz_che': {'repo_name': 'covid-19-Re/dailyRe-Data',
                 'branch': 'master',
                 'file_path': 'CHE-estimates.csv'},

    # TU Ilmenau
    'ilmenau_deu': {'repo_name': 'Stochastik-TU-Ilmenau/COVID-19',
                    'branch': 'gh-pages',
                    'file_path': 'data/estimates/Germany_RKI_R.csv'},
    'ilmenau_aut': {'repo_name': 'Stochastik-TU-Ilmenau/COVID-19',
                    'branch': 'gh-pages',
                    'file_path': 'data/estimates/Austria_JHU_R.csv'},
    'ilmenau_che': {'repo_name': 'Stochastik-TU-Ilmenau/COVID-19',
                    'branch': 'gh-pages',
                    'file_path': 'data/estimates/Switzerland_JHU_R.csv'},

    # OWID
    # Note that OWID reports reproduction_rate only since 2020-11-13, but this
    # notebook also downloads older data. This older data has to be removed from the
    # data-raw folder before running the "process_historic_data_owid.r" script.
    'owid': {'repo_name': 'owid/covid-19-data',
             'branch': 'master',
             'file_path': 'public/data/owid-covid-data.csv'},

    # globalrt
    'globalrt': {'repo_name': 'crondonm/TrackingR',
                 'branch': 'main',
                 'file_path': 'Estimates-Database/database.csv'},

    # epiforecast
    'epiforecast': {'repo_name': 'epiforecasts/covid-rt-estimates',
                    'branch': 'master',
                    'file_path': 'national/cases/summary/rt.csv'},

    # zidatalab (the only JSON source)
    'zidatalab': {'repo_name': 'zidatalab/covid19dashboard',
                  'branch': 'master',
                  'file_path': 'data/plotdata/rwert_bund.json'},

    # RKI
    'rki': {'repo_name': 'robert-koch-institut/SARS-CoV-2-Nowcasting_und_-R-Schaetzung',
            'branch': 'main',
            'file_path': 'Nowcast_R_aktuell.csv'},
}

# Swiss Data Science Center (SDSC) is hosted on GitLab, so it is not covered here.
# Download the files manually from:
# https://renkulab.io/gitlab/covid-19/covid-19-forecast/-/tree/master/data/JHU/prediction
# Try to download only the prediction folder since the repo is very large:
# https://bytenota.com/git-how-to-clone-a-specific-directory-from-a-git-repository/

SOURCE = 'rki'

# names used by the rest of the notebook
repo_name = SOURCES[SOURCE]['repo_name']
branch = SOURCES[SOURCE]['branch']
file_path = SOURCES[SOURCE]['file_path']

# Get commit history

In [37]:
# retrieve information about all commits that modified the file we want
# (the GitHub API is paginated, so request page after page until one comes back empty)
all_commits = []

page = 1
while True:
    r = requests.get('https://api.github.com/repos/{}/commits'.format(repo_name),
                     params={'sha': branch, 'path': file_path,
                             'per_page': 100,  # API maximum; fewer requests than the default page size
                             'page': page},
                     headers=headers,
                     timeout=30)

    # Fail loudly on rate limiting, bad credentials or a wrong repository name.
    # Stopping silently here would leave an incomplete commit history and, later on,
    # an incomplete set of downloaded files without any warning.
    r.raise_for_status()

    commits_page = r.json()
    if not commits_page:  # empty page: we are past the last commit
        break

    all_commits += commits_page
    page += 1

In [38]:
# extract sha of each commit
all_shas = [c['sha'] for c in all_commits]
# preview only: the full list has one entry per commit and would flood the saved output
all_shas[:5]

['b27bcf781636688953a5219afb328f5e7389ca72',
 '520c7c354530db72a63425d0da81b76489ff4c1d',
 '0fe90dd0fc629d51d4642c6c8ffe2fea114922bd',
 '5ed0f3b894681e99e98e4782b68b4d5c7d8dad51',
 'c4a24e1a982da37e847067b8004c430f8656e67d']

# Download and save files

## Default

In [39]:
# author timestamp of every commit, parsed with pandas
commit_dates = list(map(pd.to_datetime,
                        (entry['commit']['author']['date'] for entry in all_commits)))

In [40]:
# one row per commit: its sha and its timestamp
df = pd.DataFrame(dict(sha=all_shas, commit_date=commit_dates))

In [41]:
# calendar date of each commit (time of day dropped)
df['date'] = df['commit_date'].dt.date

In [42]:
# keep a single row per calendar day: the commit with the latest timestamp
latest_per_day = df.groupby('date')['commit_date'].idxmax()
df = df.loc[latest_per_day]

In [43]:
# inspect the result: the last commit of each day, with its sha and timestamp
df

Unnamed: 0,sha,commit_date,date
394,fd111bf04a5ff9c7cf4de24016d39ccc3bf705c2,2021-04-21 09:02:21+00:00,2021-04-21
393,bc83f162905b3f42cb704c66f1aaac558311c1ac,2021-04-22 06:51:34+00:00,2021-04-22
392,3ffce79ca26daa7c3dd9dda76d069b0706d4cfe3,2021-04-23 07:35:14+00:00,2021-04-23
391,fe4d431f287fcf182132a903bf5afce544ac4031,2021-04-24 09:50:36+00:00,2021-04-24
390,128c87a9e55f23d0e7f9ee161aa6847fd4d36198,2021-04-25 10:41:47+00:00,2021-04-25
...,...,...,...
4,c4a24e1a982da37e847067b8004c430f8656e67d,2022-05-19 03:24:12+00:00,2022-05-19
3,5ed0f3b894681e99e98e4782b68b4d5c7d8dad51,2022-05-20 10:04:58+00:00,2022-05-20
2,0fe90dd0fc629d51d4642c6c8ffe2fea114922bd,2022-05-21 03:19:13+00:00,2022-05-21
1,520c7c354530db72a63425d0da81b76489ff4c1d,2022-05-22 03:04:17+00:00,2022-05-22


In [49]:
# preview the file as of the first commit in all_shas to check its columns
preview_url = 'https://raw.githubusercontent.com/{}/{}/{}'.format(repo_name, all_shas[0], file_path)
df_temp = pd.read_csv(preview_url, low_memory=False)
df_temp

Unnamed: 0,Datum,PS_COVID_Faelle,UG_PI_COVID_Faelle,OG_PI_COVID_Faelle,PS_COVID_Faelle_ma4,UG_PI_COVID_Faelle_ma4,OG_PI_COVID_Faelle_ma4,PS_7_Tage_R_Wert,UG_PI_7_Tage_R_Wert,OG_PI_7_Tage_R_Wert
0,2020-03-02,303,292,318,225,213,238,,,
1,2020-03-03,321,304,336,261,248,275,,,
2,2020-03-04,449,430,469,326,311,342,,,
3,2020-03-05,503,483,524,394,377,412,,,
4,2020-03-06,757,735,782,507,488,528,2.34,2.28,2.39
...,...,...,...,...,...,...,...,...,...,...
804,2022-05-15,43924,39486,47688,44892,42255,47503,0.83,0.82,0.85
805,2022-05-16,43194,38214,47523,44450,40949,47770,0.79,0.77,0.82
806,2022-05-17,38435,32661,43384,42885,38411,46872,0.75,0.72,0.78
807,2022-05-18,27960,20847,34777,38379,32802,43343,0.71,0.68,0.75


In [56]:
# iterate over the rows of df and download the file as it was at each commit
# (globalrt is not handled here; it has its own section below)

# 2. Output location for each source, keyed by (repo_name, file_path).
#    '{}' is replaced by the commit date.
RESULT_PATHS = {
    ('covid-19-Re/dailyRe-Data', 'DEU-estimates.csv'):
        '../data-raw/ETHZ DEU/{}_ethz_deu_raw.csv',
    ('covid-19-Re/dailyRe-Data', 'AUT-estimates.csv'):
        '../data-raw/ETHZ AUT/{}_ethz_aut_raw.csv',
    ('covid-19-Re/dailyRe-Data', 'CHE-estimates.csv'):
        '../data-raw/ETHZ CHE/{}_ethz_che_raw.csv',
    ('Stochastik-TU-Ilmenau/COVID-19', 'data/estimates/Germany_RKI_R.csv'):
        '../data-raw/Ilmenau DEU/{}_ilmenau_deu_raw.csv',
    ('Stochastik-TU-Ilmenau/COVID-19', 'data/estimates/Austria_JHU_R.csv'):
        '../data-raw/Ilmenau AUT/{}_ilmenau_aut_raw.csv',
    ('Stochastik-TU-Ilmenau/COVID-19', 'data/estimates/Switzerland_JHU_R.csv'):
        '../data-raw/Ilmenau CHE/{}_ilmenau_che_raw.csv',
    ('owid/covid-19-data', 'public/data/owid-covid-data.csv'):
        '../data-raw/owid/{}_owid_raw.csv',
    ('epiforecasts/covid-rt-estimates', 'national/cases/summary/rt.csv'):
        '../data-raw/epiforecast_au_ch_ger/{}_epiforecast_raw.csv',
    ('zidatalab/covid19dashboard', 'data/plotdata/rwert_bund.json'):
        '../data-raw/zidatalab/{}_zidatalab_raw.json',
    # NOTE(review): unlike the other sources there is no '_' after the date here.
    # Kept as it was, since downstream scripts may rely on the name -- confirm.
    ('robert-koch-institut/SARS-CoV-2-Nowcasting_und_-R-Schaetzung', 'Nowcast_R_aktuell.csv'):
        '../data-raw/rki-historic/{}rki_historic_raw.csv',
}

# Sources that are reduced to the countries of interest before saving.
COUNTRIES_OF_INTEREST = ['Germany', 'Austria', 'Switzerland']
ROW_FILTERS = {
    # owid: only data from Germany
    'owid/covid-19-data':
        lambda data: data.loc[data['iso_code'] == 'DEU'],
    # epiforecast: the first column holds the country name
    'epiforecasts/covid-rt-estimates':
        lambda data: data[data.iloc[:, 0].isin(COUNTRIES_OF_INTEREST)],
}

if (repo_name, file_path) not in RESULT_PATHS:
    raise ValueError('no result path configured for {} / {}'.format(repo_name, file_path))

result_template = RESULT_PATHS[(repo_name, file_path)]
# Read and write in the format of the configured file, so the cell can no longer
# get out of sync with the selected source (zidatalab is JSON, the others CSV).
is_json = file_path.endswith('.json')
keep_rows = ROW_FILTERS.get(repo_name)

for sha, date in tqdm(zip(df['sha'], df['date']), total=df.shape[0]):
    url = 'https://raw.githubusercontent.com/{}/{}/{}'.format(repo_name, sha, file_path)
    try:
        df_temp = pd.read_json(url) if is_json else pd.read_csv(url, low_memory=False)
    except OSError as err:
        # HTTP/network errors are OSError subclasses; report the commit and move on
        # instead of aborting the whole download
        tqdm.write('download failed for {} (sha = {}): {}'.format(date, sha, err))
        continue

    if keep_rows is not None:
        df_temp = keep_rows(df_temp)

    result_path = result_template.format(date)
    # write exactly once, in the matching format
    if is_json:
        df_temp.to_json(result_path)
    else:
        df_temp.to_csv(result_path, index=False)

100%|██████████| 426/426 [03:36<00:00,  1.97it/s]


## globalrt

In [None]:
# iterate over all commits; for every country save that commit's rows in a file
# named after the (single) "last_updated" value found in them

# NOTE(review): commit 269 was excluded in the original loop without a recorded
# reason -- presumably the file is unusable at that commit; confirm.
SKIPPED_COMMIT_INDEX = 269

# country name in the "Country/Region" column -> suffix of the output folder
GLOBALRT_COUNTRIES = {"Germany": "DE", "Austria": "AT", "Switzerland": "CH"}

for i, sha in enumerate(tqdm(all_shas)):
    if i == SKIPPED_COMMIT_INDEX:
        continue

    url = "https://raw.githubusercontent.com/{}/{}/{}".format(repo_name, sha, file_path)
    try:
        df_temp = pd.read_csv(url, low_memory=False)
    except Exception as err:
        # Skip this commit. Carrying on would silently re-use the data frame of the
        # previous commit (or fail with a NameError on the very first one).
        tqdm.write("download failed for commit: sha = {} ({})".format(sha, err))
        continue

    for country, code in GLOBALRT_COUNTRIES.items():
        df_country = df_temp.loc[df_temp["Country/Region"] == country]
        dates = df_country["last_updated"].unique()
        if len(dates) == 1:
            result_path = "../data-raw/globalrt {}/{}_globalrt_raw.csv".format(code, dates[0])
            df_country.to_csv(result_path, index=False)
        else:
            # zero or several dates: no unambiguous file name, so nothing is written
            tqdm.write("last_updated for {} not unique in commit: sha = {}".format(country, sha))
            tqdm.write(str(dates))