In [4]:
%load_ext autoreload
%autoreload 2

from coeditor.common import *
import os

import requests
import shutil
import random

# Switch the working directory to whatever proj_root() returns, so relative
# paths used by later cells resolve from there.
# NOTE(review): proj_root comes from the star-import of coeditor.common — presumably the repository root; confirm.
os.chdir(proj_root())

In [5]:
import requests
import dateparser
from spot.data import GitRepo
import warnings
import time


def request_page(page: int, n_items: int = 10, timeout: float = 30.0):
    """Fetch one page of the GitHub repository search and return the decoded JSON.

    The query is fixed: Python repositories created after 2018-01-01 with more
    than 1000 stars and size below 20000, excluding the keywords "interview",
    "reference", "course" and "cheatsheet", sorted by stars in descending order.

    Args:
        page: Page number passed as the ``page`` query parameter.
        n_items: Number of results per page (``per_page`` query parameter).
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout a stalled connection would block
            the whole crawl indefinitely.

    Returns:
        The parsed JSON body of the response. Error responses (e.g. rate
        limiting) are returned as-is, so callers must inspect the payload.
    """
    url = (
        "https://api.github.com/search/repositories"
        "?q=NOT+interview+NOT+reference+NOT+course+NOT+cheatsheet"
        "+created%3A>2018-01-01+stars%3A>1000+size%3A<20000+language%3APython"
        f"&sort=stars&order=desc&per_page={n_items}&page={page}"
    )
    return requests.get(url, timeout=timeout).json()


def fetch_top_python_repos(n_repos: int):
    """Collect up to ``n_repos`` non-archived repositories from the GitHub search.

    Pages of 100 results are requested through ``request_page`` until enough
    repositories have been gathered, or until a page comes back without any
    items (in which case the response is printed and the crawl stops early).
    When the response reports that the API rate limit was exceeded, the same
    page is retried after a one-minute pause.

    Args:
        n_repos: Maximum number of repositories to return.

    Returns:
        A list of at most ``n_repos`` ``GitRepo`` objects, keyed internally by
        ``authorname()`` so each repository appears once, in first-seen order.
        The list can be shorter than ``n_repos`` if the search runs dry.
    """
    repos = dict[str, GitRepo]()
    page_idx = 1
    with tqdm(total=n_repos) as pbar:
        while len(repos) < n_repos:
            page = request_page(page_idx, n_items=100)
            # Small pause between requests to stay gentle on the API.
            time.sleep(0.5)
            if (msg := page.get("message", "")) and msg.startswith("API rate limit exceeded"):
                print("API rate limit exceeded, now wait for 1 min")
                time.sleep(60)
                # Retry the same page: page_idx is intentionally not advanced.
                continue
            if not page.get("items"):
                print("Fetching page failed:")
                print(page)
                break
            n_before = len(repos)
            for item in page["items"]:
                r = GitRepo(
                    author=item["owner"]["login"],
                    name=item["name"],
                    url=item["html_url"],
                    description=item["description"],
                    stars=item["stargazers_count"],
                    forks=item["forks_count"],
                    archived=item["archived"],
                    # Drop the timezone so the timestamp is stored as a naive datetime.
                    last_update=dateparser.parse(item["pushed_at"]).replace(tzinfo=None),
                )
                if not r.archived:
                    if r.authorname() in repos:
                        warnings.warn(f"{r.authorname()} already in repos", UserWarning)
                    repos[r.authorname()] = r
            # Advance the bar by the number of repositories actually kept, not by
            # the raw page size: archived and duplicate entries do not count
            # towards n_repos, and the bar must never run past its total.
            pbar.update(min(len(repos), n_repos) - n_before)
            page_idx += 1
    return list(repos.values())[:n_repos]


In [21]:
# Ask for 1000 repositories; the crawl stops early if the search runs out of
# results, so report how many were actually collected.
all_repos = fetch_top_python_repos(n_repos=1000)
print("Repos:", len(all_repos))


100%|█████████▉| 997/1000 [00:37<00:00, 30.96it/s]

API rate limit exceeded, now wait for 2 min


100%|██████████| 1000/1000 [02:38<00:00,  6.29it/s]

Fetching page failed:
{'message': 'Only the first 1000 search results are available', 'documentation_url': 'https://docs.github.com/v3/search/'}
Repos: 978





In [8]:
# Download every fetched repository into the dataset directory.
dataset_name = "xl"
repos_dir = get_dataset_dir(dataset_name)
# Make sure both staging folders exist before any worker starts.
for stage_dir in ("downloading", "downloaded"):
    (repos_dir / stage_dir).mkdir(parents=True, exist_ok=True)

n_to_download = len(all_repos)
# One argument list per positional parameter of GitRepo.download, aligned with
# all_repos. NOTE(review): the meaning of the constant True flag is not visible
# here — check GitRepo.download.
downloaded = pmap(
    GitRepo.download,
    all_repos,
    [repos_dir] * n_to_download,
    [True] * n_to_download,
    desc="downloading repos",
    max_workers=4,
)

print("Successfully downloaded:", sum(downloaded))
# Keep only the repositories whose download result is truthy.
downloaded_repos = [repo for repo, ok in zip(all_repos, downloaded) if ok]


downloading repos: 100%|██████████| 978/978 [10:31<00:00,  1.55it/s]

Successfully downloaded: 978





In [16]:
# Resume shortcut: when a `split` mapping is already present in the kernel
# (it is built by the train/test/valid split cell further down), rebuild the
# flat repository list from it. On a fresh kernel `split` does not exist yet,
# so keep the `downloaded_repos` produced by the download cell instead of
# failing with a NameError during Restart & Run All.
if "split" in globals():
    downloaded_repos = [r for rs in split.values() for r in rs]
len(downloaded_repos)

978

In [17]:
from coeditor.dataset import get_repo_signature

# Compute one signature per downloaded repository and bucket the
# (index, signature) pairs by signature.
repo_paths = [repos_dir / "downloaded" / r.authorname() for r in downloaded_repos]
sigs = pmap(get_repo_signature, repo_paths, desc="getting repo signatures")
sig_groups = groupby(enumerate(sigs), lambda x: x[1])

# Within every bucket holding more than one repository, keep the first entry
# and mark all the others as duplicates.
duplicates = set[str]()
for sig, group in sig_groups.items():
    if len(group) <= 1:
        continue
    names = [downloaded_repos[idx].authorname() for idx, _sig in group]
    print(f"{len(names)} repos have the same signature {sig}:")
    for name in names:
        print(f"  {name}")
    duplicates.update(names[1:])

print("Totoal duplicates:", len(duplicates))
downloaded_repos = [r for r in downloaded_repos if r.authorname() not in duplicates]
print("Remaining repos:", len(downloaded_repos))

getting repo signatures: 100%|██████████| 978/978 [03:53<00:00,  4.19it/s]

2 repos have the same signature ('add tensorboard logger', 'README modify heading', 'README add acknowlegement', 'check before release', 'update README', 'update README', 'updata README', 'update README', 'update README', 'clean others', 'clean data loader', 'add wgan-gp and other matters', 'add extract subimages based on var', 'add discriminator arch', 'SRGAN modify loss action and dataset ref', 'change dataset opt', 'add LRHRRef dataloader', 'srgan initial', 'format logger', 'update discriminator and vggfeatureextractor network', 'merge generator, discriminator and perceptual network to architecture', 'modify GANloss', 'add perceptual network(VGG feature extractor)', 'README', 'update README', 'add GAN loss', 'json with leading comma', 'add Degradation Net', 'first commit', 'Initial commit'):
  XPixelGroup?BasicSR
  xinntao?EDVR
2 repos have the same signature ('Update README.md', 'Update README.md', 'Update README.md', 'Update README.md', 'Update README.md', 'Update README.md', 'Upd




In [18]:
# Fixed-size test and validation sets; everything left over is training data.
n_test = 50
n_valid = 50
n_train = len(downloaded_repos) - n_test - n_valid
print(f"n_test={n_test}, n_valid={n_valid}, n_train={n_train}")

# Sort first so the seeded shuffle yields the same permutation regardless of
# the order in which the repositories were collected.
random.seed(42)
downloaded_repos.sort(key=lambda r: r.authorname())
random.shuffle(downloaded_repos)

# The shuffled list is laid out as [test | valid | train].
valid_start = n_test
train_start = n_test + n_valid
split = {
    "train": downloaded_repos[train_start:][:n_train],
    "test": downloaded_repos[:valid_start],
    "valid": downloaded_repos[valid_start:train_start],
}

# Persist the assignment next to the dataset.
pickle_dump(repos_dir / "repos_split.pkl", split)


n_test=50, n_valid=50, n_train=875


In [19]:
# Move downloaded repos to their split group: downloaded/<repo> -> repos/<group>/<repo>.
for group, rs in split.items():
    # The destination only depends on the group, so create it once per group
    # instead of once per repository.
    dest = repos_dir / "repos" / group
    dest.mkdir(exist_ok=True, parents=True)
    for repo in tqdm(rs, desc=f"moving {group}"):
        src = repos_dir / "downloaded" / repo.authorname()
        # Re-running this cell must not crash on repositories that a previous
        # run already moved: skip them when the source is gone and the target
        # is in place. Any other missing source still raises from shutil.move.
        if not src.exists() and (dest / src.name).exists():
            continue
        shutil.move(src, dest)


moving train: 100%|██████████| 875/875 [00:01<00:00, 805.73it/s]
moving test: 100%|██████████| 50/50 [00:00<00:00, 802.17it/s]
moving valid: 100%|██████████| 50/50 [00:00<00:00, 797.69it/s]
