In [1]:
import json
from typing_extensions import TypedDict

class DatasetItem(TypedDict):
    """Typed view of one record loaded from ``dataset.json``."""

    type: str
    url: str
    content: dict[str, str]
    codes: list[str]


# Load every record of the dataset into memory.
with open('dataset.json', 'r') as dataset_file:
    dataset: list[DatasetItem] = json.load(dataset_file)

# Issues are out of scope here: keep only the records whose type is 'commit'.
dataset = [entry for entry in dataset if entry['type'] == 'commit']

In [2]:
from github import Github, Auth

# The access token is the first line of TOKEN.txt, with surrounding
# whitespace (including the trailing newline) stripped.
with open('TOKEN.txt', 'r') as token_file:
    token = token_file.readline().strip()

# Authenticated client used by the rest of the notebook.
gh = Github(auth=Auth.Token(token))

# Show which account the token resolves to.
print('Using GitHub as', gh.get_user().login)

Using GitHub as InputUsername


In [14]:
from github.Repository import Repository

from datetime import datetime, UTC
import os
import time

def _write_json_atomic(path: str, payload: object) -> None:
    """Serialize ``payload`` as JSON into ``path``, replacing previous content.

    The data is first written to ``<path>.tmp`` and then moved over ``path``
    with ``os.replace``, so the target always holds exactly one complete JSON
    document, even if the run is interrupted while writing.
    """
    tmp_path = f'{path}.tmp'
    with open(tmp_path, 'w') as tmp_file:
        json.dump(payload, tmp_file)
    os.replace(tmp_path, path)


# Repository objects that were fetched successfully, keyed by 'owner/name'.
repos: dict[str, Repository] = {}

# Resume from an earlier run when a non-empty checkpoint file exists.
commit_diffs: list[dict[str, object]]
if os.path.exists('diffs.json') and os.path.getsize('diffs.json') > 0:
    with open('diffs.json', 'r') as diffs_file:
        commit_diffs = json.load(diffs_file)
else:
    commit_diffs = []

# URLs that already have an entry in commit_diffs (constant-time skip check).
collected_urls = {diff_item['url'] for diff_item in commit_diffs}

# Errors of this run only, keyed by commit URL.
commit_errors: dict[str, str] = {}

for i, item in enumerate(dataset):
    commit_url = item['url']
    request_attempted = False

    # Skip already collected commits
    if commit_url not in collected_urls:
        try:
            # Expected shape: https://github.com/<owner>/<name>/commit/<sha>.
            # Parsed inside the try so a malformed URL is recorded as an error
            # instead of aborting the whole run.
            _, _, _, owner, name, _, sha = commit_url.split('/')
            repo_id = f'{owner}/{name}'

            request_attempted = True

            # Cache repo objects
            if repo_id not in repos:
                repos[repo_id] = gh.get_repo(repo_id)
            repo = repos[repo_id]

            commit = repo.get_commit(sha)

            # A 'link' header is treated as a sign that the commit's file list
            # is paginated; such commits are recorded as errors.
            if 'link' in commit.raw_headers:
                raise Exception('Commit is paginated')

            files = [{
                'filename': f.filename,
                'additions': f.additions,
                'deletions': f.deletions,
                'changes': f.changes,
                'status': f.status,
                'patch': f.patch,
            } for f in commit.files]

            commit_diffs.append({
                'url': commit_url,
                'files': files,
                'existing_codes': item['codes'],
                'codes': [],
                'notes': '',
            })
            collected_urls.add(commit_url)
        except Exception as e:
            # Best effort: note the failure and continue with the next commit.
            print(f'Error fetching commit {commit_url}:', e)
            commit_errors[commit_url] = str(e)

        # Checkpoint after every processed commit. Each file is rewritten in
        # full so it never accumulates several concatenated JSON documents.
        _write_json_atomic('diffs.json', commit_diffs)
        _write_json_atomic('errors.json', commit_errors)

    # Only look at the rate limit after an iteration that tried to contact
    # GitHub. NOTE(review): gh.rate_limiting presumably reflects the most
    # recent response, so checking it after a cached skip would act on stale
    # numbers and sleep again - confirm against the PyGithub docs.
    if request_attempted and gh.rate_limiting[0] <= 1:
        now = datetime.now(tz=UTC)
        reset = datetime.fromtimestamp(gh.rate_limiting_resettime, tz=UTC)

        # If we hit the rate limit, sleep until the reset time (with a 60s buffer).
        # total_seconds() covers the whole interval; clamping at zero avoids a
        # bogus wait when the reset time has already passed.
        time.sleep(max(0.0, (reset - now).total_seconds()) + 60)

    if i % 10 == 0:
        print(f'{i}/{len(dataset)} commits processed, rate limit:', gh.rate_limiting, 'reset:', gh.rate_limiting_resettime)

# Make sure both files exist and are current even if every commit was skipped.
_write_json_atomic('diffs.json', commit_diffs)
_write_json_atomic('errors.json', commit_errors)

print('Done')

0/538 commits processed, rate limit: (3949, 5000) reset: 1712235520
Error fetching commit https://github.com/blinkist/terraform-aws-airship-ecs-cluster/commit/d7aa659971bee1be873d3dda92e30443556f52df: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
10/538 commits processed, rate limit: (3948, 5000) reset: 1712235520
20/538 commits processed, rate limit: (3948, 5000) reset: 1712235520
30/538 commits processed, rate limit: (3948, 5000) reset: 1712235520
40/538 commits processed, rate limit: (3948, 5000) reset: 1712235520
50/538 commits processed, rate limit: (3948, 5000) reset: 1712235520
60/538 commits processed, rate limit: (3948, 5000) reset: 1712235520
70/538 commits processed, rate limit: (3948, 5000) reset: 1712235520
80/538 commits processed, rate limit: (3948, 5000) reset: 1712235520
Error fetching commit https://github.com/generation-org/tech-foundations-labs/commit/88f50c92b92f5c3ab8259902a56932f295fecce7: 404 {"mes