# Setup

# Before running, set the parameters at the two places marked `1` (data source) and `2` (output path and filters) in the code below.

In [43]:
import json
import os

import pandas as pd
import requests
import yaml

In [44]:
# to visualize progress bars
from tqdm.auto import tqdm
# NOTE(review): presumably registers tqdm's progress_* helpers on pandas objects;
# none of the visible cells use them — confirm before removing this call
tqdm.pandas()

In [45]:
# we need a github access token to perform a large number of requests (rate limit);
# it is read from config.yml so that it does not have to be typed into the notebook
with open("config.yml", 'r') as ymlfile:
    # an empty file parses to None; fall back to an empty mapping so the check below applies
    cfg = yaml.safe_load(ymlfile) or {}

token = cfg.get('access_token')
if not token:
    # fail here with a clear message instead of a bare KeyError/TypeError further down
    raise KeyError("config.yml must define a non-empty 'access_token' entry")

# authorization header sent with every GitHub API request below
headers = {'Authorization': 'token ' + token}

In [46]:
# Never echo the token itself: cell outputs are saved with the notebook and
# would leak the credential. Only confirm that a token was loaded.
# NOTE: a token that was ever displayed in a saved output must be revoked.
bool(token)

In [47]:
# 1, set SOURCE_KEY to the data source whose file history should be downloaded

# All supported sources:
# key -> (GitHub repository, branch, path of the CSV file inside the repository)
SOURCES = {
    # ETHZ Germany / Austria / Switzerland
    'ethz_deu': ('covid-19-Re/dailyRe-Data', 'master', 'DEU-estimates.csv'),
    'ethz_aut': ('covid-19-Re/dailyRe-Data', 'master', 'AUT-estimates.csv'),
    'ethz_che': ('covid-19-Re/dailyRe-Data', 'master', 'CHE-estimates.csv'),

    # TU Ilmenau Germany / Austria / Switzerland
    'ilmenau_deu': ('Stochastik-TU-Ilmenau/COVID-19', 'gh-pages', 'data/estimates/Germany_RKI_R.csv'),
    'ilmenau_aut': ('Stochastik-TU-Ilmenau/COVID-19', 'gh-pages', 'data/estimates/Austria_JHU_R.csv'),
    'ilmenau_che': ('Stochastik-TU-Ilmenau/COVID-19', 'gh-pages', 'data/estimates/Switzerland_JHU_R.csv'),

    # OWID
    # Note that OWID reports reproduction_rate only since 2020-11-13, but this script
    # also downloads older data. That older data has to be removed from the data-raw
    # folder before running the "process_historic_data_owid.r" script.
    'owid': ('owid/covid-19-data', 'master', 'public/data/owid-covid-data.csv'),

    # globalrt
    'globalrt': ('crondonm/TrackingR', 'main', 'Estimates-Database/database.csv'),

    # epiforecast
    'epiforecast': ('epiforecasts/covid-rt-estimates', 'master', 'national/cases/summary/rt.csv'),
}

# the source used for this run (one of the keys of SOURCES)
SOURCE_KEY = 'ilmenau_che'

if SOURCE_KEY not in SOURCES:
    # catch typos here rather than as a failed request later on
    raise ValueError('unknown SOURCE_KEY {!r}; choose one of {}'.format(SOURCE_KEY, sorted(SOURCES)))

# names used by the cells below
repo_name, branch, file_path = SOURCES[SOURCE_KEY]


# Get commit history

In [48]:
# retrieve information about all commits that modified the file we want
# (the commits endpoint is paginated: request page after page until one comes back empty)
all_commits = []

commits_url = 'https://api.github.com/repos/{}/commits'.format(repo_name)
page = 0
while True:
    page += 1
    r = requests.get(commits_url,
                     # per_page=100 is the largest page size the API accepts -> fewer requests
                     params={'sha': branch, 'path': file_path,
                             'per_page': 100, 'page': page},
                     headers=headers,
                     # do not hang forever on a stalled connection
                     timeout=30)

    # fail loudly on HTTP errors (bad token, rate limit, unknown repository) instead of
    # silently continuing with a truncated or empty commit history
    r.raise_for_status()

    page_commits = r.json()
    if not page_commits:
        # an empty page means we are past the last commit
        break

    all_commits += page_commits

In [49]:
# peek at the first commit record only; the full list holds one dict per commit
all_commits[:1]

[{'sha': 'fca658d3e4c8c4d74a2594efeed3276b47bdb3a1',
  'node_id': 'MDY6Q29tbWl0MjUwNDc1MzQ3OmZjYTY1OGQzZTRjOGM0ZDc0YTI1OTRlZmVlZDMyNzZiNDdiZGIzYTE=',
  'commit': {'author': {'name': 'Maurice-Bot',
    'email': 'statconsult@tu-ilmenau.de',
    'date': '2021-08-11T07:06:15Z'},
   'committer': {'name': 'Maurice-Bot',
    'email': 'statconsult@tu-ilmenau.de',
    'date': '2021-08-11T07:06:15Z'},
   'message': 'Maurice build 2021-08-11T09:06+02:00',
   'tree': {'sha': '9f3272114399e5e94dd4d29148a489e931590b01',
    'url': 'https://api.github.com/repos/Stochastik-TU-Ilmenau/COVID-19/git/trees/9f3272114399e5e94dd4d29148a489e931590b01'},
   'url': 'https://api.github.com/repos/Stochastik-TU-Ilmenau/COVID-19/git/commits/fca658d3e4c8c4d74a2594efeed3276b47bdb3a1',
   'comment_count': 0,
   'verification': {'verified': False,
    'reason': 'unsigned',
    'signature': None,
    'payload': None}},
  'url': 'https://api.github.com/repos/Stochastik-TU-Ilmenau/COVID-19/commits/fca658d3e4c8c4d74a2594ef

In [50]:
# collect the hash (sha) of every retrieved commit, in the order returned by the API
all_shas = [commit_info['sha'] for commit_info in all_commits]

In [51]:
# date of each commit (author timestamp), parsed in one vectorised call;
# utc=True keeps the result timezone-aware even when all_commits is empty,
# so the `.dt` access on the commit_date column further down still works
commit_dates = pd.to_datetime([c['commit']['author']['date'] for c in all_commits], utc=True)

In [52]:
# one row per commit: its hash next to its timestamp
df = pd.DataFrame(data=dict(sha=all_shas, commit_date=commit_dates))

In [53]:
# calendar day of each commit (time of day dropped)
df['date'] = df['commit_date'].dt.date

In [54]:
# for every calendar day keep just the row of the most recent commit
latest_commit_per_day = df.groupby('date')['commit_date'].idxmax()
df = df.loc[latest_commit_per_day]

In [55]:
# show the selected commits: one row per calendar day (the latest commit of that day)
df

Unnamed: 0,sha,commit_date,date
268,d0c0182913df1e75587c773ca9fda0a8278f2abc,2020-11-02 10:02:11+00:00,2020-11-02
267,b9d2f8875052724234af5d124113cd203adf079f,2020-11-16 09:31:51+00:00,2020-11-16
266,bcf79f73c799700a332d525cb39f9e575ad2ecf5,2020-11-18 08:03:36+00:00,2020-11-18
265,3156059f3001bfa1d14bedda174cc283b64af074,2020-11-19 08:03:37+00:00,2020-11-19
264,23446e9ebb5abed5ebfc5aacdaafdadb965e7e6a,2020-11-20 08:04:06+00:00,2020-11-20
...,...,...,...
4,cbdba84cbbbf6886e5576c893205894f013dd451,2021-08-07 07:06:09+00:00,2021-08-07
3,b23128926907100a4f01cd82f8476fd3b2db2b45,2021-08-08 07:06:11+00:00,2021-08-08
2,a6f83722b4606f95a032cc3453f773a19a8eba0f,2021-08-09 07:06:11+00:00,2021-08-09
1,22933462277e7f9e89845f34879ee1060dafc9d7,2021-08-10 07:06:11+00:00,2021-08-10


# Download and save files

In [56]:
# iterate over the selected commits and download the file as it was at each commit
raw_url = 'https://raw.githubusercontent.com/{}/{}/{}'
for sha, commit_day in tqdm(zip(df['sha'], df['date']), total=df.shape[0]):
    # try:      # try-except added only for globalrt because HTTP-Error occurred
    df_temp = pd.read_csv(raw_url.format(repo_name, sha, file_path),
                          low_memory=False)
    # except Exception:
        # continue   # skip this commit; a bare `pass` would re-save the previous df_temp under this date
#2, change comments where needed (must match the source selected at 1)
    # result_path =  '../data-raw/ETHZ DEU/' + str(commit_day) + '_ethz_deu_raw.csv'
    # result_path =  '../data-raw/ETHZ AUT/' + str(commit_day) + '_ethz_aut_raw.csv'
    # result_path =  '../data-raw/ETHZ CHE/' + str(commit_day) + '_ethz_che_raw.csv'
    # result_path =  '../data-raw/Ilmenau DEU/' + str(commit_day) + '_ilmenau_deu_raw.csv'
    # result_path =  '../data-raw/Ilmenau AUT/' + str(commit_day) + '_ilmenau_aut_raw.csv'
    result_path =  '../data-raw/Ilmenau CHE/' + str(commit_day) + '_ilmenau_che_raw.csv'
    # result_path =  '../data-raw/owid/' + str(commit_day) + '_owid_raw.csv'
    # result_path =  '../data-raw/globalrt/' + str(commit_day) + '_globalrt_raw.csv'
    # result_path = '../data-raw/epiforecast_au_ch_ger/' + str(commit_day) + '_epiforecast_raw.csv'
    # for owid only: filter only data from Germany:
    # df_temp = df_temp.loc[df_temp['iso_code'] == 'DEU']
    # for globalrt only: filter only data from Germany, Austria and Switzerland:
    # df_temp = df_temp.loc[(df_temp['Country/Region'] == 'Germany') | (df_temp['Country/Region'] == 'Austria') | (df_temp['Country/Region'] == 'Switzerland')]
    # for epiforecast:
    # df_temp = df_temp[(df_temp.iloc[:, 0] == 'Germany') | (df_temp.iloc[:, 0] == 'Austria') | (df_temp.iloc[:, 0] == 'Switzerland')]
    # create the target folder on first use so that saving does not fail on a fresh checkout
    os.makedirs(os.path.dirname(result_path), exist_ok=True)
    df_temp.to_csv(result_path, index=False)

100%|██████████| 265/265 [01:16<00:00,  3.48it/s]
