# Setup

In [None]:
import json
import yaml
import requests
import pandas as pd

In [None]:
# to visualize progress bars
from tqdm.auto import tqdm
# hook tqdm into pandas (enables the progress_* variants of apply/map)
tqdm.pandas()

In [None]:
# Read the GitHub access token from the local config file. Requests are sent
# with this token because we issue a large number of them (rate limit).
with open("config.yml") as ymlfile:
    cfg = yaml.safe_load(ymlfile)

token = cfg['access_token']

# authorization header attached to the GitHub API requests
headers = dict(Authorization='token ' + token)

In [None]:
# Confirm that a token was loaded WITHOUT echoing the secret itself: cell
# outputs are saved inside the notebook file, so displaying the raw token
# would leak the credential to anyone the notebook is shared with.
'access token loaded' if token else 'access token is empty'

In [None]:
# What to download: the repository (as "owner/name"), the branch whose
# history is inspected, and the path of the file inside the repository.
repo_name = "covid-19-Re/dailyRe-Data"
branch = "master"
file_path = "DEU-estimates.csv"

# Get commit history

In [None]:
# Retrieve information about all commits on `branch` that modified the file
# we want. The GitHub commits endpoint is paginated, so request page after
# page until an empty page signals the end of the history.
all_commits = []

page = 1
while True:
    r = requests.get('https://api.github.com/repos/{}/commits'.format(repo_name),
                     # per_page=100 is the API maximum and keeps the number
                     # of requests (and rate-limit usage) low
                     params={'sha': branch, 'path': file_path,
                             'per_page': 100, 'page': page},
                     headers=headers,
                     timeout=30)

    # Fail loudly on HTTP errors (invalid token, rate limit, unknown repo, ...)
    # instead of silently stopping with a truncated commit list.
    r.raise_for_status()

    commits = r.json()
    if not commits:
        break

    all_commits.extend(commits)
    page += 1

In [None]:
# display the collected commit metadata for inspection
all_commits

In [None]:
# collect the hash (sha) of every commit in the history
all_shas = [commit['sha'] for commit in all_commits]

In [None]:
# author timestamp of every commit, parsed with pandas
commit_dates = [
    pd.to_datetime(commit['commit']['author']['date'])
    for commit in all_commits
]

In [None]:
# one row per commit: its hash and its author timestamp
df = pd.DataFrame(dict(sha=all_shas, commit_date=commit_dates))

In [None]:
# calendar day of each commit (time of day dropped)
df['date'] = df['commit_date'].dt.date

In [None]:
# keep a single row per calendar day: the commit with the latest timestamp
latest_per_day = df.groupby('date')['commit_date'].idxmax()
df = df.loc[latest_per_day]

In [None]:
# preview the first rows of the per-day commit table
df.head()

# Download and save files

In [None]:
# Iterate over the rows of df, download the file as it existed at each
# selected commit, and save one CSV snapshot per day.
import os

# to_csv does not create missing directories, so make sure the target
# directory exists before the first write.
output_dir = '../data-raw/ETH Zürich/'
os.makedirs(output_dir, exist_ok=True)

for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    # the raw-content URL addresses the file at a specific commit sha
    df_temp = pd.read_csv('https://raw.githubusercontent.com/{}/{}/{}'.format(repo_name, row['sha'], file_path),
                          low_memory=False)

    # file name is prefixed with the commit's calendar day
    result_path = output_dir + str(row['date']) + '_zürich_raw.csv'
    df_temp.to_csv(result_path, index=False)