# Setup

In [23]:
import json
import yaml
import requests
import pandas as pd

In [24]:
# to visualize progress bars
from tqdm.auto import tqdm
tqdm.pandas()

In [25]:
# we need a github access token to perform a large number of requests (rate limit)
with open("config.yml", 'r') as ymlfile:
    cfg = yaml.safe_load(ymlfile)

token = cfg['access_token']
headers = {'Authorization': 'token ' + token}

In [26]:
token

'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'

In [46]:
repo_name = 'covid-19-Re/dailyRe-Data'
branch = 'master'
file_path = 'DEU-estimates.csv'

# Get commit history

In [47]:
# retrieve information about all commits that modified the file we want
all_commits = []

page = 0
while True:
    page += 1
    r = requests.get('https://api.github.com/repos/{}/commits'.format(repo_name),
                     params = {'sha': branch, 'path': file_path, 'page': str(page)},
                     headers=headers)

    if (not r.ok) or (r.text == '[]'): break

    all_commits += json.loads(r.text or r.content)

In [48]:
all_commits

s://api.github.com/repos/covid-19-Re/dailyRe-Data/commits/4c99f12a6ed074f6ba6144c28e0cd34a118a5fb6',
    'html_url': 'https://github.com/covid-19-Re/dailyRe-Data/commit/4c99f12a6ed074f6ba6144c28e0cd34a118a5fb6'}]},
 {'sha': '4c99f12a6ed074f6ba6144c28e0cd34a118a5fb6',
  'node_id': 'MDY6Q29tbWl0Mjg4MDk4MDkzOjRjOTlmMTJhNmVkMDc0ZjZiYTYxNDRjMjhlMGNkMzRhMTE4YTVmYjY=',
  'commit': {'author': {'name': 'data uploader',
    'email': 'covid19',
    'date': '2020-08-17T21:43:32Z'},
   'committer': {'name': 'data uploader',
    'email': 'covid19',
    'date': '2020-08-17T21:43:32Z'},
   'message': 'update data',
   'tree': {'sha': 'f96ceeebee20ffb4feb0847d5f6d45d047129b56',
    'url': 'https://api.github.com/repos/covid-19-Re/dailyRe-Data/git/trees/f96ceeebee20ffb4feb0847d5f6d45d047129b56'},
   'url': 'https://api.github.com/repos/covid-19-Re/dailyRe-Data/git/commits/4c99f12a6ed074f6ba6144c28e0cd34a118a5fb6',
   'comment_count': 0,
   'verification': {'verified': False,
    'reason': 'unsigned',
  

In [49]:
# extract sha of each commit
all_shas = [c['sha'] for c in all_commits]

In [50]:
# date of each commit
commit_dates = [pd.to_datetime(c['commit']['author']['date']) for c in all_commits]

In [51]:
# combine into dataframe
df = pd.DataFrame({'sha': all_shas, 'commit_date': commit_dates})

In [52]:
# date without time
df['date'] = df.commit_date.dt.date

In [53]:
# only consider last commit made each day
df = df.loc[df.groupby('date')['commit_date'].idxmax()]

In [54]:
df.head()

Unnamed: 0,sha,commit_date,date
296,b36a3127e79d754b0d31261e21e326dc0370181d,2020-08-17 23:43:06+00:00,2020-08-17
272,b274ead828de23a80e3c4e1518fbe4c86f094e19,2020-08-18 23:42:26+00:00,2020-08-18
257,1ee171acc1630e4192d95e1c3029643fd314cfd2,2020-08-19 13:42:53+00:00,2020-08-19
256,b401d81775f1eaa112951fa0c3b74bbfbeb3ea01,2020-08-20 08:11:14+00:00,2020-08-20
255,1d359ed5085adfb91b2d1fa89bc189882266c329,2020-08-21 11:11:17+00:00,2020-08-21


# Download and save files

In [55]:
# iterate over the rows of df and download the corresponding file
for index, row in tqdm(df.iterrows(), total=df.shape[0]):        
    df_temp = pd.read_csv('https://raw.githubusercontent.com/{}/{}/{}'.format(repo_name, row['sha'], file_path),
                          low_memory=False)

    result_path =  '../data-raw/ETH Zürich/' + str(row['date']) + '_zürich_raw.csv'
    df_temp.to_csv(result_path, index=False)

100%|██████████| 258/258 [01:27<00:00,  2.94it/s]
