# Setup

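# Before running, set the parameters at the two places marked 1 and 2 in the code comments: (1) the source repository, branch and file path, and (2) the output path and country filter in the download loop.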

In [None]:
import json
import yaml
import requests
import pandas as pd

In [None]:
# progress bars for long-running loops
from tqdm.auto import tqdm
# hook tqdm into pandas (presumably for progress_apply-style calls; none are visible in this part of the notebook)
tqdm.pandas()

In [None]:
# GitHub rate-limits API requests, and this notebook issues a large number of them,
# so read a personal access token from the local config file and build the
# authorization header from it
with open("config.yml") as config_file:
    cfg = yaml.safe_load(config_file)

token = cfg['access_token']
headers = dict(Authorization='token ' + token)

In [None]:
# confirm that a token was loaded without echoing the secret itself into the
# saved notebook output (a bare `token` expression would store it in the file)
bool(token)

In [None]:
# 1, change comments where needed
# (keep exactly one source's three settings uncommented)

# for ETHZ:
# repo_name = 'covid-19-Re/dailyRe-Data'
# branch = 'master'
# file_path = 'DEU-estimates.csv'

# for TU Ilmenau:
# repo_name = 'Stochastik-TU-Ilmenau/COVID-19'
# branch = 'gh-pages'
# file_path = 'data/estimates/Germany_RKI_R.csv'

# for OWID:
# repo_name = 'owid/covid-19-data'
# branch = 'master'
# file_path = 'public/data/owid-covid-data.csv'
# Note: OWID reports reproduction_rate only since 2020-11-13, but this script
# also downloads older data. That older data has to be removed from the
# data-raw folder before running the "process_historic_data_owid.r" script.

# for globalrt (currently active):
repo_name = 'crondonm/TrackingR'
branch = 'main'
file_path = 'Estimates-Database/database.csv'


# Get commit history

In [None]:
# retrieve information about all commits that modified the file we want
# (the commits endpoint is paginated, so keep requesting pages until one comes back empty)
all_commits = []

page = 1
while True:
    r = requests.get('https://api.github.com/repos/{}/commits'.format(repo_name),
                     # per_page=100 is the API maximum and cuts the number of requests
                     params={'sha': branch, 'path': file_path, 'per_page': 100, 'page': page},
                     headers=headers,
                     timeout=30)

    # fail loudly on HTTP errors (bad token, rate limit, wrong repo/branch);
    # silently stopping here would leave a truncated commit history
    r.raise_for_status()

    commits_page = r.json()
    if not commits_page:
        # an empty page means we are past the last commit
        break

    all_commits.extend(commits_page)
    page += 1

In [None]:
# peek at the first records only; echoing the whole list would dump every
# commit's full API record into the saved notebook output
print('{} commits found'.format(len(all_commits)))
all_commits[:2]

In [8]:
# collect the hash of every commit, in the order the API returned them
all_shas = [commit['sha'] for commit in all_commits]

In [9]:
# author timestamp of every commit, parsed into a pandas timestamp
commit_dates = []
for commit in all_commits:
    author_info = commit['commit']['author']
    commit_dates.append(pd.to_datetime(author_info['date']))

In [10]:
# assemble the two parallel lists into one table, one row per commit
df = pd.DataFrame(dict(sha=all_shas, commit_date=commit_dates))

In [11]:
# calendar day of each commit (time of day dropped)
df['date'] = df['commit_date'].dt.date

In [12]:
# keep a single row per calendar day: the one with the latest commit timestamp
latest_per_day = df.groupby('date')['commit_date'].idxmax()
df = df.loc[latest_per_day]

In [13]:
# preview the first rows of the per-day commit table
df.head()

Unnamed: 0,sha,commit_date,date
128,00136f39adfc338dbc61905bdea4ec6528006e01,2021-02-10 13:03:37+00:00,2021-02-10
127,26a022695dc5a02b267a35e46c56f9b5e0753dd6,2021-02-11 15:32:41+00:00,2021-02-11
126,99be4e19a115b391df2f9ae22589034bcfbf6c46,2021-02-12 11:17:17+00:00,2021-02-12
125,316c5740dc6a120764841d94c0328951cb271d8b,2021-02-14 11:35:35+00:00,2021-02-14
123,6473a8370e031e0ba07e41981820cd9d143f2a45,2021-02-15 23:28:34+00:00,2021-02-15


# Download and save files

In [15]:
# iterate over the rows of df and download the corresponding file
failed_downloads = []  # (date, sha) of every snapshot that could not be fetched
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    url = 'https://raw.githubusercontent.com/{}/{}/{}'.format(repo_name, row['sha'], file_path)
    try:
        df_temp = pd.read_csv(url, low_memory=False)
    except Exception as err:
        # Skip this snapshot. Carrying on would save the previous iteration's
        # df_temp under this row's date (or hit a NameError on the first row).
        # Catching Exception rather than everything keeps the loop interruptible.
        failed_downloads.append((row['date'], row['sha']))
        tqdm.write('skipping {} ({}): {}'.format(row['date'], row['sha'], err))
        continue

    # 2, change comments where needed
    # result_path =  '../data-raw/ETH Zürich/' + str(row['date']) + '_zürich_raw.csv'
    # result_path =  '../data-raw/TU Ilmenau/' + str(row['date']) + '_Ilmenau_raw.csv'
    # result_path =  '../data-raw/owid/' + str(row['date']) + '_owid_raw.csv'
    result_path = '../data-raw/globalrt/' + str(row['date']) + '_globalrt_raw.csv'

    # for owid only: filter only data from Germany:
    # df_temp = df_temp.loc[df_temp['iso_code'] == 'DEU']
    # for globalrt only: filter only data from Germany:
    df_temp = df_temp.loc[df_temp['Country/Region'] == 'Germany']

    df_temp.to_csv(result_path, index=False)

# make skipped snapshots visible instead of silently producing an incomplete data set
if failed_downloads:
    print('{} of {} snapshots could not be downloaded'.format(len(failed_downloads), df.shape[0]))

100%|██████████| 128/128 [21:20<00:00, 10.01s/it]
