In [1]:
import os
import json
from dotenv import load_dotenv
import requests
from pathlib import Path
import tarfile
import re

# Populate the process environment from a local .env file (python-dotenv), so
# the GitHub token does not have to be hardcoded in the notebook.
load_dotenv()
# Token interpolated into the "Authorization: Bearer ..." value of the download
# cell. os.getenv returns None when the variable is not set.
GITHUB_API_KEY = os.getenv("GITHUB_API_KEY")

# Directory whose files are read line by line; each line is parsed as a JSON
# record with a "url" key.
CODESEARCHNET_DIR = "../dataset/codesearchnet-java-decompressed"
# Root for downloaded repository tarballs, one sub-directory per source file.
REPO_TARBALLS_DIR = "./repo_tarballs"
# Root for the per-repository JSON files holding the extracted .java sources.
REPO_JAVA_FILES_DIR = "./repo_java_files"

In [2]:

def get_contents(directory=None):
    """Yield ``(file_name, line)`` for every line of every file in a directory.

    Args:
        directory: Directory to read. Defaults to ``CODESEARCHNET_DIR`` when
            omitted, which keeps existing zero-argument callers working.

    Yields:
        Tuples ``(file_name, line)`` where ``file_name`` is the bare name of the
        file and ``line`` is the raw text line, trailing newline included.
    """
    if directory is None:
        directory = CODESEARCHNET_DIR

    # Sort so repeated runs visit files in the same order; os.listdir makes no
    # ordering guarantee.
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)

        # Sub-directories (e.g. .ipynb_checkpoints) cannot be opened as files.
        if not os.path.isfile(file_path):
            continue

        print(f"=== reading file {file_name}")

        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale default, which is not UTF-8 everywhere.
        with open(file_path, "r", encoding="utf-8") as f_in:
            for line in f_in:
                yield file_name, line


def get_repos(contents=None):
    """Yield repository coordinates parsed from the ``url`` of each JSON record.

    Each input line is a JSON object with a ``url`` key shaped like
    ``.../<owner>/<repo>/blob/<sha>/<path>``. The first ``blob`` path segment
    is used as the anchor: the two segments before it form the repository
    name and the segment after it is the commit/tree SHA.

    Args:
        contents: Iterable of ``(source_file, json_line)`` pairs. Defaults to
            ``get_contents()`` when omitted.

    Yields:
        Tuples ``(source_file, repo_url, tree_sha, repo_name)`` where
        ``repo_url`` is the URL truncated right after the SHA and
        ``repo_name`` is ``"<owner>/<repo>"``.

    Records whose URL has no usable ``blob`` segment are reported and skipped.
    Previously such a record either raised on unbound locals or silently
    re-emitted the values parsed from the preceding record.
    """
    if contents is None:
        contents = get_contents()

    for source_file, line in contents:
        record = json.loads(line)
        url = record["url"]
        url_split = url.split("/")

        try:
            idx = url_split.index("blob")
        except ValueError:
            idx = -1

        # Need two segments before "blob" (owner, repo) and one after (sha).
        if idx < 2 or idx + 1 >= len(url_split):
            print(f"Skipping unrecognised url in {source_file}: {url}")
            continue

        repo_url = "/".join(url_split[: idx + 2])
        tree_sha = url_split[idx + 1]
        repo_name = "/".join(url_split[idx - 2 : idx])

        yield source_file, repo_url, tree_sha, repo_name


In [3]:
# Download one tarball per (repository, sha) referenced by the dataset.
# Existing files are skipped, so the cell can be re-run to resume.

# Target paths whose download returned a non-200 status during this run; kept
# so the same failing repository is not requested once per dataset record.
bad_repos = set()

# Seconds to wait on the GitHub API before giving up on a single request.
# Without a timeout, one stalled connection would hang the cell forever.
DOWNLOAD_TIMEOUT_SECONDS = 60

github_headers = {
    "Accept": "application/vnd.github+json",
    "X-GitHub-Api-Version": "2022-11-28",
}
# Only authenticate when a token is configured; "Bearer None" is rejected.
if GITHUB_API_KEY:
    github_headers["Authorization"] = f"Bearer {GITHUB_API_KEY}"

for source_file, repo_url, tree_sha, repo_name in get_repos():
    dir_name = f"{REPO_TARBALLS_DIR}/{source_file}"
    file_path = f"{dir_name}/{repo_name.replace('/', '_')}_{tree_sha}.tar.gz"

    if os.path.exists(file_path) or file_path in bad_repos:
        continue

    # HTTPS so the bearer token is never sent in clear text.
    url = f"https://api.github.com/repos/{repo_name}/tarball/{tree_sha}"

    try:
        # The dict must be passed as headers=. The second positional argument
        # of requests.get is `params`, which would put the token in the query
        # string and leave the request unauthenticated.
        res = requests.get(
            url,
            headers=github_headers,
            timeout=DOWNLOAD_TIMEOUT_SECONDS,
        )

        if res.status_code != 200:
            print(f"Error {res.status_code} when downloading {url}")
            bad_repos.add(file_path)
            continue

        Path(dir_name).mkdir(parents=True, exist_ok=True)

        with open(file_path, "wb") as f:
            f.write(res.content)
    except Exception as e:
        # Best effort: report and move on; the repository is retried the next
        # time it is encountered.
        print(f"Error when downloading {url}: {e}")


=== reading file java_test_0.jsonl
Error 404 when downloading http://api.github.com/repos/streamsets/datacollector/tarball/ea63245ea14d59d5229248387f0628f46131eae5
Error 404 when downloading http://api.github.com/repos/deeplearning4j/nd4j/tarball/8f005bcecb240d1fbb83b9d390ad801d1d3b6933
Error 404 when downloading http://api.github.com/repos/signalfx/appd-integration/tarball/f87d4d75f28a0e8f4722f4b3087522cedf63cf22
Error 404 when downloading http://api.github.com/repos/shopping24/solr-jdbc/tarball/3806370f2a5e26db2194f4f63b98edf2fee5efd8
Error 404 when downloading http://api.github.com/repos/haducloc/appslandia-common/tarball/41dc9b0e1dd4fa86a9ad03627f70cbee61e74061
Error 404 when downloading http://api.github.com/repos/demidenko05/beigesoft-replicator/tarball/d3cb848c366d2f6fbf6c2fb5066f0abe577a24bf
Error 404 when downloading http://api.github.com/repos/mortezaadi/GoodbyDao/tarball/0a365f8b96add274e2a64ddc885e4804b07021c6
=== reading file java_train_0.jsonl
=== reading file java_train_

In [3]:
def get_tarball(root=None):
    """Yield every readable ``.tar.gz`` archive found one level below a root.

    Args:
        root: Directory containing one sub-directory per source file, each
            holding repository tarballs. Defaults to ``REPO_TARBALLS_DIR``
            when omitted.

    Yields:
        Tuples ``(dir_name, file_name, tar)`` where ``tar`` is an open
        ``tarfile.TarFile``. The archive is closed when the generator is
        advanced or closed, so it must be consumed before requesting the
        next item.

    Stray files in ``root``, entries that are not ``.tar.gz`` files, and
    archives that cannot be opened are skipped (the last with a message)
    instead of aborting the whole run.
    """
    if root is None:
        root = REPO_TARBALLS_DIR

    # Sorted for a reproducible processing order across runs.
    for dir_name in sorted(os.listdir(root)):
        dir_path = os.path.join(root, dir_name)
        if not os.path.isdir(dir_path):
            continue

        print(f"=== reading dir {dir_name}")

        for file_name in sorted(os.listdir(dir_path)):
            if not file_name.endswith(".tar.gz"):
                continue

            archive_path = os.path.join(dir_path, file_name)
            try:
                tar = tarfile.open(archive_path, encoding="utf-8")
            except (tarfile.TarError, OSError, EOFError) as e:
                print(f"Error when opening {archive_path}: {e}")
                continue

            # Keep the archive open only while the consumer is using it.
            with tar:
                yield dir_name, file_name, tar


In [12]:
# For every downloaded tarball, collect its .java files that declare a package
# and store them as one JSON list per repository. Existing outputs are skipped,
# so the cell can be re-run to resume.

# A Java package declaration at the start of a line. Any whitespace is accepted
# after the keyword and before the semicolon, so "package\tfoo;" and
# "package foo ;" are recognised too. Compiled once rather than once per line.
PACKAGE_DECLARATION_RE = re.compile(r"^\s*package\s+([a-zA-Z0-9_.\-]+)\s*;")

for dir_name, file_name, tar in get_tarball():
    out_dir = f"{REPO_JAVA_FILES_DIR}/{dir_name}"
    file_path = f"{out_dir}/{file_name.removesuffix('.tar.gz')}.json"

    if os.path.exists(file_path):
        continue

    repo_java_files = []

    for member in tar:
        if not member.isfile() or not member.name.endswith(".java"):
            continue

        with tar.extractfile(member) as javafile:
            # Undecodable bytes are dropped rather than failing the whole repo.
            javafile_text = javafile.read().decode(encoding="utf-8", errors="ignore")

        # First line that carries a package declaration, if any.
        package_match = next(
            (
                m
                for m in map(PACKAGE_DECLARATION_RE.match, javafile_text.splitlines())
                if m
            ),
            None,
        )

        if package_match is None:
            # Files in the default package are reported and left out.
            print(
                f"No package declaration found for {out_dir} {file_name} {member.name}"
            )
            continue

        repo_java_files.append(
            {
                "file_name": member.name.split("/")[-1],
                "dir_hierarchy": package_match.group(1).split("."),
                "text": javafile_text,
            }
        )

    Path(out_dir).mkdir(parents=True, exist_ok=True)

    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(repo_java_files, f)


=== reading dir ./repo_tarballs
No package declaration found for ./repo_java_files/java_test_0.jsonl blademainer_common_utils_ef6baf0367c16de95a28caab67a7d5842a6d13db.tar.gz blademainer-common_utils-ef6baf0/common_helper/src/test/java/AtomicTest.java
No package declaration found for ./repo_java_files/java_test_0.jsonl blademainer_common_utils_ef6baf0367c16de95a28caab67a7d5842a6d13db.tar.gz blademainer-common_utils-ef6baf0/common_helper/src/test/java/LinkList.java
No package declaration found for ./repo_java_files/java_test_0.jsonl blademainer_common_utils_ef6baf0367c16de95a28caab67a7d5842a6d13db.tar.gz blademainer-common_utils-ef6baf0/common_helper/src/test/java/LongTest.java
No package declaration found for ./repo_java_files/java_test_0.jsonl blademainer_common_utils_ef6baf0367c16de95a28caab67a7d5842a6d13db.tar.gz blademainer-common_utils-ef6baf0/common_helper/src/test/java/UserAuthentication.java
No package declaration found for ./repo_java_files/java_test_0.jsonl blademainer_common_