# Setup

# Before running, set the parameters in the two cells marked with the numbers 1 and 2.

In [None]:
import json
import os

import pandas as pd
import requests
import yaml

In [None]:
# Progress bars for long-running loops; the name `tqdm` is used by the download loop.
from tqdm.auto import tqdm
# Enable tqdm's pandas integration (progress_apply and friends) for this session.
tqdm.pandas()

In [None]:
# The GitHub API rate-limits requests, so every call is authenticated with an
# access token read from the local config.yml (key: access_token).
with open("config.yml") as config_file:
    cfg = yaml.safe_load(config_file)

token = cfg['access_token']
headers = dict(Authorization='token ' + token)

In [None]:
# Confirm that a token was loaded without echoing it: cell outputs are stored
# inside the notebook file and would leak the secret if it is shared or committed.
bool(token)

In [None]:
# 1, choose the data source: keep exactly one of the three blocks below active

# ETHZ
# repo_name = 'covid-19-Re/dailyRe-Data'
# branch = 'master'
# file_path = 'DEU-estimates.csv'

# TU Ilmenau
# repo_name = 'Stochastik-TU-Ilmenau/COVID-19'
# branch = 'gh-pages'
# file_path = 'data/estimates/Germany_RKI_R.csv'

# OWID
# Note: OWID has reported reproduction_rate only since 2020-11-13, but this script
# also downloads older versions of the file. Remove that older data from the
# data-raw folder before running the "process_historic_data_owid.r" script.
repo_name = 'owid/covid-19-data'
branch = 'master'
file_path = 'public/data/owid-covid-data.csv'


# Get commit history

In [None]:
# Retrieve information about all commits that modified the file we want.
# The result is paginated, so pages are requested one after another until an
# empty page signals the end of the history.
all_commits = []

page = 0
while True:
    page += 1
    r = requests.get('https://api.github.com/repos/{}/commits'.format(repo_name),
                     params={'sha': branch, 'path': file_path,
                             'page': page, 'per_page': 100},  # 100 per page -> fewer requests
                     headers=headers,
                     timeout=30)

    # Fail loudly on HTTP errors (bad token, rate limit, wrong repo name) instead of
    # silently stopping, which would leave an incomplete commit history.
    r.raise_for_status()

    commits_page = r.json()
    if not commits_page:
        break

    all_commits += commits_page

In [None]:
# Show only the number of commits and the first entry; displaying the whole
# list would store every commit record in the notebook output.
len(all_commits), all_commits[:1]

In [33]:
# commit hash of every entry in the history
all_shas = [commit['sha'] for commit in all_commits]

In [34]:
# author timestamp of every commit, parsed with pandas
commit_dates = [
    pd.to_datetime(commit['commit']['author']['date'])
    for commit in all_commits
]

In [35]:
# one row per commit: its hash and its timestamp
df = pd.DataFrame(dict(sha=all_shas, commit_date=commit_dates))

In [36]:
# calendar day of the commit (time of day dropped), used for grouping below
df['date'] = df['commit_date'].dt.date

In [37]:
# keep a single row per day: the one with the latest commit timestamp
latest_per_day = df.groupby('date')['commit_date'].idxmax()
df = df.loc[latest_per_day]

In [38]:
# first rows of the per-day commit table
df.head(5)

Unnamed: 0,sha,commit_date,date
1722,697cc89c03de8622d50c50074edeb842e35ba33c,2020-04-16 20:24:09+00:00,2020-04-16
1719,a9355803a303312ea889d31f5b3c5a8f95fee267,2020-04-17 17:15:09+00:00,2020-04-17
1717,348f83eeff6acc5fc58a3b55e08398c096bc22bf,2020-04-18 17:15:12+00:00,2020-04-18
1715,9ee33ac73942b2e37eb04014bf2a7a17a83998cf,2020-04-19 14:37:16+00:00,2020-04-19
1713,8222d828a8af35147af3a9ad983a424f22d2037b,2020-04-20 21:30:07+00:00,2020-04-20


# Download and save files

In [41]:
# Iterate over the rows of df and download the corresponding file: one request per
# selected commit, saved as one CSV per day.
for _index, row in tqdm(df.iterrows(), total=df.shape[0]):
    # the commit sha is placed in the raw URL so the file is read as of that commit
    df_temp = pd.read_csv('https://raw.githubusercontent.com/{}/{}/{}'.format(repo_name, row['sha'], file_path),
                          low_memory=False)
    # 2, change comments where needed (must match the source chosen at 1)
    # result_path =  '../data-raw/ETH Zürich/' + str(row['date']) + '_zürich_raw.csv'
    # result_path =  '../data-raw/TU Ilmenau/' + str(row['date']) + '_Ilmenau_raw.csv'
    result_path =  '../data-raw/owid/' + str(row['date']) + '_owid_raw.csv'
    # for owid only: filter only data from Germany:
    df_temp = df_temp.loc[df_temp['iso_code'] == 'DEU']
    #
    # to_csv does not create missing folders, so make sure the target folder exists
    os.makedirs(os.path.dirname(result_path), exist_ok=True)
    df_temp.to_csv(result_path, index=False)

100%|██████████| 454/454 [20:04<00:00,  2.65s/it]
