# Setup

In [1]:
import json
from pathlib import Path

import pandas as pd
import requests
import yaml

In [2]:
# to visualize progress bars
from tqdm.auto import tqdm
# registers the progress_* methods (e.g. progress_apply) on pandas objects
tqdm.pandas()

In [4]:
# The GitHub API rate-limits requests, so every call is authenticated with a
# personal access token that is read from a local YAML config file.
with open("config.yml", 'r') as config_file:
    cfg = yaml.safe_load(config_file)

token = cfg['access_token']
headers = dict(Authorization='token ' + token)

In [5]:
# sanity-check that a token was loaded WITHOUT echoing the secret: cell outputs are
# stored inside the notebook file and leak as soon as it is shared or committed.
# NOTE: a token that was ever displayed in a saved output should be revoked and re-issued.
token[:4] + '*' * (len(token) - 4)

'ghp_************************************'

In [9]:
# repository ("owner/name") whose commit history is queried through the GitHub API
repo_name = 'Stochastik-TU-Ilmenau/COVID-19'
# branch whose history is inspected (sent as the `sha` query parameter)
branch = 'gh-pages'
# path of the tracked file inside the repository; it is used both to filter the
# commit list and to build the raw download URL
file_path = 'data/estimates/Germany_RKI_R.csv'

# Get commit history

In [10]:
# retrieve information about all commits that modified the file we want
# (the endpoint is paginated, so keep requesting pages until an empty one comes back)
all_commits = []

page = 0
while True:
    page += 1
    r = requests.get('https://api.github.com/repos/{}/commits'.format(repo_name),
                     # per_page=100 is the API maximum and cuts the number of requests
                     params={'sha': branch, 'path': file_path,
                             'per_page': 100, 'page': page},
                     headers=headers,
                     timeout=30)

    # fail loudly on HTTP errors (bad token, rate limit, ...) instead of silently
    # stopping with a truncated commit history
    r.raise_for_status()

    commits_page = r.json()
    if not commits_page:
        # an empty page means we are past the last commit
        break

    all_commits += commits_page

In [11]:
# inspect the raw commit records returned by the API (this displays the complete list)
all_commits

repos',
   'events_url': 'https://api.github.com/users/Maurice-Bot/events{/privacy}',
   'received_events_url': 'https://api.github.com/users/Maurice-Bot/received_events',
   'type': 'User',
   'site_admin': False},
  'committer': {'login': 'Maurice-Bot',
   'id': 63242777,
   'node_id': 'MDQ6VXNlcjYzMjQyNzc3',
   'avatar_url': 'https://avatars.githubusercontent.com/u/63242777?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/Maurice-Bot',
   'html_url': 'https://github.com/Maurice-Bot',
   'followers_url': 'https://api.github.com/users/Maurice-Bot/followers',
   'following_url': 'https://api.github.com/users/Maurice-Bot/following{/other_user}',
   'gists_url': 'https://api.github.com/users/Maurice-Bot/gists{/gist_id}',
   'starred_url': 'https://api.github.com/users/Maurice-Bot/starred{/owner}{/repo}',
   'subscriptions_url': 'https://api.github.com/users/Maurice-Bot/subscriptions',
   'organizations_url': 'https://api.github.com/users/Maurice-Bot/orgs',
   'repos_ur

In [12]:
# collect the sha hash that identifies each commit
all_shas = [commit['sha'] for commit in all_commits]

In [13]:
# author timestamp of each commit, parsed into a pandas timestamp
commit_dates = [
    pd.to_datetime(commit['commit']['author']['date'])
    for commit in all_commits
]

In [14]:
# one row per commit: its sha and the time it was made
df = pd.DataFrame(dict(sha=all_shas, commit_date=commit_dates))

In [15]:
# calendar date of the commit (time of day dropped)
df['date'] = df['commit_date'].dt.date

In [16]:
# keep a single row per day: the one with the latest commit timestamp
latest_per_day = df.groupby('date')['commit_date'].idxmax()
df = df.loc[latest_per_day]

In [38]:
# preview the first rows of the commit table
df.head()

Unnamed: 0,sha,commit_date,date
1455,697cc89c03de8622d50c50074edeb842e35ba33c,2020-04-16 20:24:09+00:00,2020-04-16
1452,a9355803a303312ea889d31f5b3c5a8f95fee267,2020-04-17 17:15:09+00:00,2020-04-17
1450,348f83eeff6acc5fc58a3b55e08398c096bc22bf,2020-04-18 17:15:12+00:00,2020-04-18
1448,9ee33ac73942b2e37eb04014bf2a7a17a83998cf,2020-04-19 14:37:16+00:00,2020-04-19
1446,8222d828a8af35147af3a9ad983a424f22d2037b,2020-04-20 21:30:07+00:00,2020-04-20


# Download and save files

In [17]:
# iterate over the rows of df and download the corresponding file
output_dir = Path('../data-raw/TU Ilmenau')
# a fresh checkout may not contain the target folder yet; to_csv would fail without it
output_dir.mkdir(parents=True, exist_ok=True)

for index, row in tqdm(df.iterrows(), total=len(df)):
    # the URL is pinned to the commit sha, so it returns the file as of that commit
    df_temp = pd.read_csv('https://raw.githubusercontent.com/{}/{}/{}'.format(repo_name, row['sha'], file_path),
                          low_memory=False)
    # one snapshot per day, named after the commit date
    result_path = output_dir / '{}_Ilmenau_raw.csv'.format(row['date'])
    df_temp.to_csv(result_path, index=False)

100%|██████████| 189/189 [00:53<00:00,  3.55it/s]
