In [2]:
%load_ext autoreload
%autoreload 2

from coeditor.common import *
import os

import requests
import shutil
import random

# Switch the process working directory to whatever proj_root() returns, so relative
# paths in later cells resolve from there.
# NOTE(review): proj_root comes in through the star import from coeditor.common and is
# presumably the repository root — confirm there.
os.chdir(proj_root())

In [3]:
import requests
import dateparser
from spot.data import GitRepo
import warnings


def request_page(page: int, n_items: int = 10):
    """Fetch one page of the GitHub repository search and return the decoded JSON.

    The query is hard-coded in the URL: Python repositories created after
    2018-01-01 with more than 1000 stars and a size below 20000, excluding the
    keywords interview/reference/course/cheatsheet, sorted by stars descending.

    Args:
        page: 1-based page index passed as the `page` query parameter.
        n_items: Number of results requested per page (`per_page`).

    Returns:
        The parsed JSON body. The HTTP status is intentionally not checked here:
        on an API error the error payload is returned as-is so the caller can
        inspect and print it.

    Raises:
        requests.exceptions.Timeout: if the server does not respond in time.
    """
    return requests.get(
        f"https://api.github.com/search/repositories?q=NOT+interview+NOT+reference+NOT+course+NOT+cheatsheet+created%3A>2018-01-01+stars%3A>1000+size%3A<20000+language%3APython&sort=stars&order=desc&per_page={n_items}&page={page}",
        # Without a timeout requests waits indefinitely on a stalled connection,
        # which would hang the notebook cell with no feedback.
        timeout=30,
    ).json()


def fetch_top_python_repos(n_repos: int):
    """Collect up to `n_repos` non-archived repositories from the search results.

    Result pages are requested one after another (100 items each) until enough
    repositories have been gathered or a page comes back without items, in
    which case the offending response is printed and collection stops.

    Archived repositories are skipped. Repositories are keyed by
    `GitRepo.authorname()`; a repeated key triggers a `UserWarning` and the
    newer entry replaces the stored one (keeping its original position).

    Returns:
        A list of at most `n_repos` `GitRepo` objects in first-seen order.
    """
    collected = dict[str, GitRepo]()
    page_no = 1
    while len(collected) < n_repos:
        response = request_page(page_no, n_items=100)
        if not response.get("items"):
            # Empty or error payload: show it and give up on further pages.
            print("Fetching page failed:")
            print(response)
            break
        for entry in response["items"]:
            candidate = GitRepo(
                author=entry["owner"]["login"],
                name=entry["name"],
                url=entry["html_url"],
                description=entry["description"],
                stars=entry["stargazers_count"],
                forks=entry["forks_count"],
                archived=entry["archived"],
                # drop the timezone so the stored timestamp is naive
                last_update=dateparser.parse(entry["pushed_at"]).replace(tzinfo=None),
            )
            if candidate.archived:
                continue
            key = candidate.authorname()
            if key in collected:
                warnings.warn(f"{key} already in repos", UserWarning)
            collected[key] = candidate
        page_no += 1
    # A full page may overshoot the target, so trim to the requested count.
    return list(collected.values())[:n_repos]


In [4]:
# Query GitHub for the candidate repositories and report how many were collected.
repos = fetch_top_python_repos(n_repos=200)
print("Repos:", len(repos))


Repos: 200


In [11]:
# Dataset being assembled; get_dataset_dir turns the name into the directory used below.
dataset_name = "medium"
repos_dir = get_dataset_dir(dataset_name)

# Make sure both staging folders exist before any download is attempted.
for staging in ("downloading", "downloaded"):
    (repos_dir / staging).mkdir(exist_ok=True, parents=True)

# pmap applies GitRepo.download across the repos, called with max_workers=4.
# Besides the repo itself, every call receives repos_dir and True.
# NOTE(review): the meaning of the True flag is defined by GitRepo.download — confirm there.
repo_count = len(repos)
downloaded = pmap(
    GitRepo.download,
    repos,
    [repos_dir] * repo_count,
    [True] * repo_count,
    desc="downloading repos",
    max_workers=4,
)

# Each result is used as a success flag, so the sum is the number of successful downloads.
print("Successfully downloaded:", sum(downloaded))


downloading repos: 100%|██████████| 200/200 [01:07<00:00,  2.97it/s]

Successfully downloaded: 200





In [8]:
# Target size of each split.
n_test = 20
n_valid = 20
n_train = 100

# Keep only the repos whose download result was truthy.
downloaded_repos = [repo for repo, ok in zip(repos, downloaded) if ok]

# Fixed seed so the shuffle (and therefore the split) is reproducible.
random.seed(42)
random.shuffle(downloaded_repos)

# Layout of the shuffled list: [test | valid | train | unused remainder].
n_heldout = n_test + n_valid
split = {
    "train": downloaded_repos[n_heldout : n_heldout + n_train],
    "test": downloaded_repos[:n_test],
    "valid": downloaded_repos[n_test:n_heldout],
}

# Persist the assignment next to the repos.
pickle_dump(repos_dir / "repos_split.pkl", split)


In [13]:
# Move every downloaded repo into the directory of its split group:
#   <repos_dir>/downloaded/<authorname>  ->  <repos_dir>/repos/<group>/
#
# The loop variable is deliberately not named `repos`: that name holds the full
# fetched list consumed by the download and split cells, and rebinding it here
# would make those cells operate on the last split group if they are re-run.
for group, group_repos in split.items():
    # All repos of a group share one destination, so create it once per group
    # (this also creates the directory for a group that happens to be empty).
    dest = repos_dir / "repos" / group
    dest.mkdir(exist_ok=True, parents=True)
    for repo in tqdm(group_repos, desc=f"moving {group}"):
        # dest already exists as a directory, so the repo folder is moved inside it.
        shutil.move(repos_dir / "downloaded" / repo.authorname(), dest)


moving train: 100%|██████████| 170/170 [00:00<00:00, 717.00it/s]
moving test: 100%|██████████| 20/20 [00:00<00:00, 725.43it/s]
moving valid: 100%|██████████| 10/10 [00:00<00:00, 716.47it/s]
