In [1]:
%load_ext autoreload
%autoreload 2

from coeditor.common import *
import os

import requests
import shutil
import random

os.chdir(proj_root())

In [25]:
import requests
import dateparser
from coeditor.git import GitRepo
import warnings
import time


def request_page(page: int, license: str, n_items: int = 10):
    if Path("config/github_token.txt").exists():
        token = Path("config/github_token.txt").read_text().strip()
        headers = {
            "Authorization": f"Bearer {token}"
        }
    else:
        headers = None
    return requests.get(
        f"https://api.github.com/search/repositories?q=NOT+interview+NOT+reference+NOT+course+NOT+cheatsheet+created%3A>2018-01-01+stars%3A>100+size%3A<20000+language%3APython+license%3A{license}&sort=stars&order=desc&per_page={n_items}&page={page}",
        headers=headers,
    ).json()


def fetch_python_repos(license2counts: dict[str, int]):
    n_repos = sum(license2counts.values())
    repos = dict[str, GitRepo]()
    with tqdm(total=n_repos) as pbar:
        for license, n_repos in license2counts.items():
            for i in range(1, n_repos // 100 + 1):
                page = request_page(i, n_items=100, license=license)
                if (msg := page.get("message", "")) and msg.startswith(
                    "API rate limit exceeded"
                ):
                    print("API rate limit exceeded, now wait for 1 min")
                    time.sleep(60)
                    continue
                if not page.get("items"):
                    print("Fetching page failed:")
                    print(page)
                    break
                for item in page["items"]:
                    r = GitRepo.from_github_item(item)
                    if not r.archived:
                        if r.authorname() in repos:
                            print(f"[warning] {r.authorname()} already in repos")
                        repos[r.authorname()] = r
                pbar.update(len(page["items"]))
    return [repos[k] for k in list(repos)]


In [10]:
{
    l: int(request_page(0, l, n_items=1)["total_count"])
    for l in ["mit", "apache-2.0", "bsd-3-clause", "bsd-2-clause"]
}


{'mit': 7386, 'apache-2.0': 2809, 'bsd-3-clause': 523, 'bsd-2-clause': 149}

In [23]:
license2counts = {
    "mit": 1000,
    "apache-2.0": 1000,
    "bsd-3-clause": 500,
}

all_repos = fetch_python_repos(license2counts)
print("Repos:", len(all_repos))

100%|██████████| 2500/2500 [01:17<00:00, 32.10it/s]

Repos: 2445





In [26]:
dataset_name = "perm2K"  # permissive licensed 2K repos
repos_dir = get_dataset_dir(dataset_name)
(repos_dir / "downloading").mkdir(exist_ok=True, parents=True)
(repos_dir / "downloaded").mkdir(exist_ok=True, parents=True)

downloaded = pmap(
    GitRepo.download,
    all_repos,
    key_args={"repos_dir": repos_dir, "full_history": True},
    desc="downloading repos",
    max_workers=4,
    chunksize=1,
)

print("Successfully downloaded:", sum(downloaded))
downloaded_repos = [r for r, d in zip(all_repos, downloaded) if d]


downloading repos: 100%|██████████| 2445/2445 [22:13<00:00,  1.83it/s]

Successfully downloaded: 2444





In [27]:
print("Successfully downloaded:", len(downloaded_repos))

Successfully downloaded: 2444


In [32]:
# now filter out repos with less than 50 commits
filtered_repos = [r for r in tqdm(downloaded_repos) if r.count_commits(repos_dir) >= 50]
print("After filtering by commits:", len(filtered_repos))

100%|██████████| 2444/2444 [00:31<00:00, 77.33it/s]

After filtering by commits: 1664





In [38]:
from coeditor.dataset import get_repo_signature

repo_paths = [repos_dir / "downloaded" / r.authorname() for r in filtered_repos]
sigs = pmap(get_repo_signature, repo_paths, desc="getting repo signatures", chunksize=1)
sig_groups = groupby(enumerate(sigs), lambda x: x[1])

duplicates = set[str]()
for sig, group in sig_groups.items():
    if len(group) > 1:
        print(f"{len(group)} repos have the same signature {sig}:")
        for i, _ in group:
            print(f"  {downloaded_repos[i].authorname()}")
        for i, _ in group[1:]:
            duplicates.add(downloaded_repos[i].authorname())

print("Totoal duplicates:", len(duplicates))
filtered_repos = [r for r in filtered_repos if r.authorname() not in duplicates]
print("After filtering duplicates:", len(filtered_repos))


Totoal duplicates: 15
After filtering duplicates: 1650


In [35]:
n_test = 50
n_valid = 50
n_train = len(filtered_repos) - n_test - n_valid
print(f"n_test={n_test}, n_valid={n_valid}, n_train={n_train}")

random.seed(42)
filtered_repos.sort(key=lambda r: r.authorname())
random.shuffle(filtered_repos)

split = {
    "test": filtered_repos[:n_test],
    "valid": filtered_repos[n_test : n_test + n_valid],
    "train": filtered_repos[n_test + n_valid :][:n_train],
}

pickle_dump(repos_dir / "repos_split.pkl", split)


n_test=50, n_valid=50, n_train=1550


In [36]:
# move downloaded repos to their split group
for group, rs in split.items():
    for repo in tqdm(rs, desc=f"moving {group}"):
        dest = repos_dir / "repos" / group
        dest.mkdir(exist_ok=True, parents=True)
        shutil.move(repos_dir / "downloaded" / repo.authorname(), dest)


moving test: 100%|██████████| 50/50 [00:00<00:00, 670.37it/s]
moving valid: 100%|██████████| 50/50 [00:00<00:00, 716.50it/s]
moving train: 100%|██████████| 1550/1550 [00:02<00:00, 686.52it/s]
