In [1]:
import json
from typing_extensions import TypedDict

class DatasetItem(TypedDict):
    """Shape of one record in dataset.json."""

    type: str
    url: str
    content: dict[str, str]
    codes: list[str]


# Read the complete dataset from disk.
with open('dataset.json') as dataset_file:
    dataset: list[DatasetItem] = json.load(dataset_file)

# Keep only the commit records; issue records are not needed here.
dataset = [item for item in dataset if item['type'] == 'commit']

In [2]:
from github import Github, Auth

# The access token is the first line of TOKEN.txt, without surrounding whitespace.
with open('TOKEN.txt') as token_file:
    token = token_file.readline().strip()

# Authenticated client used by the rest of the notebook.
gh = Github(auth=Auth.Token(token))

# Show which account the token belongs to.
print('Using GitHub as', gh.get_user().login)

Using GitHub as InputUsername


In [9]:
from github.Repository import Repository

from collections import OrderedDict
from datetime import datetime, UTC
import os
import time

# Key of the diff cache and of the error log: the string '<owner>/<name>/<sha>'.
# It has to be a string (not a tuple) because both mappings are written to
# JSON files, and JSON object keys are strings.
CommitID = str

# GitHub repository handles, keyed by (owner, name).
repos: dict[tuple[str, str], Repository] = {}

# Fetched commits: commit id -> {file name: patch}. Restored from diffs.json
# when that file exists, so commits stored by an earlier run are reused.
commit_diffs: OrderedDict[CommitID, dict[str, str]]
if os.path.exists('diffs.json'):
    with open('diffs.json', 'r') as diffs_file:
        commit_diffs = json.load(diffs_file, object_pairs_hook=OrderedDict)
else:
    commit_diffs = OrderedDict()

# Commits that could not be fetched: commit id -> error message.
commit_errors: dict[CommitID, str] = {}

# Upper bound on the number of dataset items processed per run.
# NOTE(review): this replaces an unconditional `break` at the end of the loop
# body, which looked like a debugging leftover. The value 1 keeps that
# behaviour; set it to None to process the whole dataset.
MAX_COMMITS: int | None = 1

for item in dataset[:MAX_COMMITS]:
    # The URL is expected to split into exactly seven '/'-separated parts,
    # e.g. https://github.com/<owner>/<name>/commit/<sha>.
    _, _, _, owner, name, _, sha = item['url'].split('/')

    commit_id = f'{owner}/{name}/{sha}'

    try:
        # Commits already present in the cache need no API request at all.
        if commit_id not in commit_diffs:
            # Look up each repository once and reuse the handle afterwards.
            # The cache key is the (owner, name) tuple for both the lookup
            # and the store.
            repo_key = (owner, name)
            if repo_key not in repos:
                repos[repo_key] = gh.get_repo(f'{owner}/{name}')
            repo = repos[repo_key]

            commit = repo.get_commit(sha)

            # A 'link' response header is treated as "the commit is
            # paginated"; such commits are recorded as errors, not stored.
            if 'link' in commit.raw_headers:
                raise Exception('Commit is paginated')

            commit_diffs[commit_id] = {file.filename: file.patch for file in commit.files}
    except Exception as e:
        # Best effort: log the failure, remember it, and continue.
        print(f'Error fetching commit {commit_id}:', e)
        commit_errors[commit_id] = str(e)

    # When the remaining request quota is (almost) used up, wait for the reset.
    if gh.rate_limiting[0] <= 1:
        now = datetime.now(UTC)
        reset = datetime.fromtimestamp(gh.rate_limiting_resettime, UTC)

        # total_seconds() clamped at zero: timedelta.seconds is only the
        # seconds field of a normalised delta, so a reset time that already
        # passed would otherwise turn into a sleep of almost a full day.
        time.sleep(max(0.0, (reset - now).total_seconds()))

# Write the diff cache and the error log to disk as indented JSON.
for out_path, payload in (('diffs.json', commit_diffs), ('errors.json', commit_errors)):
    with open(out_path, 'w') as out_file:
        json.dump(payload, out_file, indent=2)

# Echo the in-memory state for a quick visual check.
for state in (repos, commit_diffs, commit_errors):
    print(state)

{}
OrderedDict([('tkhoa2711/terraform-digitalocean/a86d89369aaf5a20c1e4d8415a8a771aa7de7d10', {'main.tf': '@@ -0,0 +1,59 @@\n+provider "digitalocean" {\n+  token = "${var.do_token}"\n+}\n+\n+data "digitalocean_ssh_key" "personal" {\n+  name = "${var.do_ssh_key_name}"\n+}\n+\n+data "digitalocean_sizes" "web" {\n+  filter {\n+    key    = "vcpus"\n+    values = [1]\n+  }\n+\n+  filter {\n+    key    = "memory"\n+    values = [1024]\n+  }\n+\n+  filter {\n+    key    = "regions"\n+    values = ["sgp1"]\n+  }\n+\n+  sort {\n+    key       = "price_monthly"\n+    direction = "asc"\n+  }\n+}\n+\n+resource "digitalocean_droplet" "web" {\n+  ssh_keys = ["${data.digitalocean_ssh_key.personal.id}"]\n+  image    = "ubuntu-18-04-x64"\n+  name     = "web"\n+  region   = "sgp1"\n+\n+  # https://developers.digitalocean.com/documentation/v2/#list-all-sizes\n+  # size = "s-1vcpu-1gb" # $5.0/month\n+  size = "${data.digitalocean_sizes.web.sizes.0.slug}"\n+\n+  provisioner "remote-exec" {\n+    inline = 