In [1]:
import requests, os, zipfile, shutil
from pathlib import Path
import javalang
import pandas as pd

In [3]:
# GitHub token is read from the environment so it never appears in the notebook.
TOKEN = os.environ.get("GITHUB_TOKEN")
if not TOKEN:
    raise SystemExit("❌ Please set GITHUB_TOKEN in your environment.")

# Request headers for authenticated GitHub API calls.
HEADERS = {
    "Authorization": f"Bearer {TOKEN}",
    "Accept": "application/vnd.github+json",
}

# Local directory that receives downloaded repository archives.
DATA_DIR = Path("java_repos")

In [None]:
def search_java_repos(n=5, min_star=50):
    """
    Search Java repos by stars with pagination support.

    Results are requested in descending star order with a fixed page size,
    so consecutive pages never overlap.

    * Arguments:
        - n: number of repos to fetch; values above 1000 are capped because
          the GitHub search API only exposes the first 1000 results of a query
        - min_star: star threshold; only repos with strictly more stars than
          this value are matched (the query uses `stars:>min_star`)
    * Returns:
        - list of at most n repo dicts as returned by GitHub API
    * Raises:
        - requests.HTTPError if GitHub answers with an error status
    """
    # GitHub search serves at most this many results per query; pages beyond
    # that limit are rejected, so never ask for more.
    search_cap = 1000
    wanted = min(n, search_cap)
    if wanted <= 0:
        return []

    url = "https://api.github.com/search/repositories"
    # The page size must stay constant: the API derives the offset from
    # (page - 1) * per_page, so shrinking per_page on later pages would
    # re-fetch earlier results and skip the ones still needed.
    per_page = min(100, wanted)
    repos, page = [], 1
    while len(repos) < wanted and (page - 1) * per_page < search_cap:
        params = {
            "q": f"language:Java stars:>{min_star}",
            "sort": "stars",
            "order": "desc",
            "per_page": per_page,
            "page": page,
        }
        r = requests.get(url, headers=HEADERS, params=params, timeout=30)
        r.raise_for_status()
        items = r.json().get("items", [])
        if not items:
            # Fewer matches exist than were requested.
            break
        repos.extend(items)
        page += 1
    return repos[:wanted]


def get_repo_files(owner, repo, branch="main", extension=".java"):
    """
    List the files of a repo branch whose path ends with a given extension.

    The recursive git tree of the branch is requested from the GitHub API and
    only `blob` entries (files) are kept.

    * Arguments:
        - owner: repo owner
        - repo: repo name
        - branch: branch name (default: main)
        - extension: path suffix a file must have to be listed (default: .java)
    * Returns:
        - list of file paths (strings)
    """
    response = requests.get(
        f"https://api.github.com/repos/{owner}/{repo}/git/trees/{branch}?recursive=1",
        headers=HEADERS,
        timeout=30,
    )
    response.raise_for_status()

    matching_paths = []
    for entry in response.json().get("tree", []):
        # Skip directories and other non-file entries.
        if entry["type"] != "blob":
            continue
        if entry["path"].endswith(extension):
            matching_paths.append(entry["path"])
    return matching_paths


def get_last_commit(owner, repo, filepath):
    """
    Look up the SHA of the most recent commit that touched a file.

    A file can contain several methods and commits are not tracked per
    method, so the file's latest commit is used for every method in it.

    * Arguments:
        - owner: repo owner
        - repo: repo name
        - filepath: path to the file in the repo
    * Returns:
        - commit SHA (string) or None if not found
    """
    response = requests.get(
        f"https://api.github.com/repos/{owner}/{repo}/commits",
        headers=HEADERS,
        # One result is enough: only the first commit in the list is used.
        params={"path": filepath, "per_page": 1},
        timeout=30,
    )
    response.raise_for_status()

    history = response.json()
    if not history:
        return None
    return history[0]["sha"]


def download_and_extract_repo(owner, repo, branch, dest):
    """
    Download repo as zipball and extract it locally.

    The archive is saved as `<owner>-<repo>-<branch>.zip` inside dest and
    unpacked into a folder of the same name (without `.zip`). Any previous
    extraction folder of that name is removed first. The zip file itself is
    left on disk.

    * Arguments:
        - owner: repo owner
        - repo: repo name
        - branch: branch (or other ref) to download
        - dest: pathlib.Path of the directory that receives the archive
    * Returns:
        - Path of the top-level folder inside the extracted archive, or the
          extraction folder itself when the archive holds no folder
    * Raises:
        - requests.HTTPError if GitHub answers with an error status
    """
    dest.mkdir(parents=True, exist_ok=True)
    # Branch names may contain "/" (e.g. "release/1.0"). Flatten it so the
    # archive and the extraction folder stay directly inside dest instead of
    # pointing into a sub-directory that does not exist.
    local_name = f"{owner}-{repo}-{branch}".replace("/", "_")
    zip_path = dest / f"{local_name}.zip"
    extract_dir = dest / local_name

    if extract_dir.exists():
        shutil.rmtree(extract_dir)

    url = f"https://api.github.com/repos/{owner}/{repo}/zipball/{branch}"
    # Using the streamed response as a context manager releases the
    # connection even when the download fails part-way.
    with requests.get(url, headers=HEADERS, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(zip_path, "wb") as f:
            for chunk in r.iter_content(1024 * 256):
                f.write(chunk)

    with zipfile.ZipFile(zip_path, "r") as z:
        z.extractall(extract_dir)

    # GitHub zip includes a single top-level folder, return that. Only
    # directories are considered, sorted so the choice is deterministic.
    subfolders = sorted(p for p in extract_dir.iterdir() if p.is_dir())
    return subfolders[0] if subfolders else extract_dir


# Conventional ordering of Java modifiers, used to render signatures
# deterministically (javalang stores modifiers in an unordered set).
_JAVA_MODIFIER_ORDER = (
    "public", "protected", "private", "abstract", "default", "static", "final",
    "transient", "volatile", "synchronized", "native", "strictfp",
)


def _java_type_name(type_node):
    """Render a javalang type node as Java source text (`void` when absent)."""
    if type_node is None:
        return "void"
    parts = []
    current = type_node
    while current is not None:
        parts.append(current.name)
        # Qualified names (e.g. java.util.List) are chained through sub_type.
        current = getattr(current, "sub_type", None)
    dimensions = getattr(type_node, "dimensions", None) or []
    return ".".join(parts) + "[]" * len(dimensions)


def _method_end_line(tokens, first):
    """Return the line of the brace closing the method that starts at tokens[first].

    Returns None when the declaration ends with `;` before any body opens, or
    when no matching closing brace is found.
    """
    paren_depth = 0
    brace_depth = 0
    for idx in range(first, len(tokens)):
        tok = tokens[idx]
        # String/char literal tokens keep their quotes in `value`, so only
        # real separator tokens compare equal to the bare characters below.
        value = tok.value
        if brace_depth == 0:
            # Still in the method header: braces inside parentheses belong to
            # annotation arguments, not to the body.
            if value == "(":
                paren_depth += 1
            elif value == ")":
                paren_depth -= 1
            elif paren_depth == 0 and value == ";":
                return None
            elif paren_depth == 0 and value == "{":
                brace_depth = 1
        elif value == "{":
            brace_depth += 1
        elif value == "}":
            brace_depth -= 1
            if brace_depth == 0:
                return tok.position.line
    return None


def extract_methods_from_file(filepath):
    """
    Extract methods from a Java file using javalang.

    * Arguments:
        - filepath: path of the Java source file to parse
    * Returns:
        - list of dicts, one per method declaration, with the keys
          method_name, start_line, end_line (line of the closing brace),
          signature (modifiers, return type, name and parameter types),
          original_code (source lines start_line..end_line) and code_tokens
          (original_code split on whitespace).
          An empty list is returned when the file cannot be read or parsed.
          Methods without a body get end_line None and empty original_code.
    """
    from bisect import bisect_left  # only needed by this function

    try:
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            code = f.read()
        tree = javalang.parse.parse(code)
        tokens = list(javalang.tokenizer.tokenize(code))
        token_positions = [(t.position.line, t.position.column) for t in tokens]
    except Exception:
        # Deliberate best-effort: unreadable or unparsable files are skipped
        # so that one bad file does not abort a whole crawl.
        return []

    results = []
    lines = code.splitlines()
    for _, node in tree.filter(javalang.tree.MethodDeclaration):
        method_name = node.name
        start_line = node.position.line if node.position else None

        end_line = None
        if start_line is not None and node.body is not None:
            # Match braces from the method's first token so the extracted
            # code includes the closing brace, not just the start of the
            # last statement.
            first = bisect_left(token_positions, (node.position.line, node.position.column))
            end_line = _method_end_line(tokens, first)
            if end_line is None and node.body and node.body[-1].position:
                # Fall back to the last statement's line if matching failed.
                end_line = node.body[-1].position.line

        modifiers = sorted(
            node.modifiers,
            key=lambda m: (
                _JAVA_MODIFIER_ORDER.index(m) if m in _JAVA_MODIFIER_ORDER else len(_JAVA_MODIFIER_ORDER),
                m,
            ),
        )
        param_types = ", ".join(
            _java_type_name(p.type) + ("..." if getattr(p, "varargs", False) else "")
            for p in node.parameters
        )
        head = " ".join(modifiers + [_java_type_name(node.return_type), method_name])
        signature = f"{head}({param_types})"

        original_code = "\n".join(lines[start_line-1:end_line]) if start_line and end_line else ""
        code_tokens = original_code.split()  # simple whitespace tokenization
        results.append({
            "method_name": method_name,
            "start_line": start_line,
            "end_line": end_line,
            "signature": signature,
            "original_code": original_code,
            "code_tokens": code_tokens,
        })
    return results


def build_dataset(n_repos=2, min_star=500, max_files=50, output_csv="java_functions_dataset.csv"):
    """
    Build a per-method dataset from starred Java repos and save it as CSV.

    For each repo returned by search_java_repos, the list of .java files is
    fetched, the repo is downloaded and extracted under DATA_DIR, and the
    first max_files files are parsed. Every extracted method becomes one row
    carrying the repo name/url, the file path, the file's last commit SHA and
    the fields produced by extract_methods_from_file.

    * Arguments:
        - n_repos: number of repos to process
        - min_star: star threshold passed to search_java_repos
        - max_files: maximum number of .java files parsed per repo
        - output_csv: path of the CSV file to write
    * Returns:
        - pandas DataFrame with one row per method (also written to output_csv)
    """
    all_results = []
    repos = search_java_repos(n=n_repos, min_star=min_star)

    for repo in repos:
        owner, name = repo["full_name"].split("/")
        branch = repo["default_branch"]
        repo_url = repo["html_url"]

        print(f"\n🔹 Processing repo: {repo['full_name']} (branch={branch})")

        files = get_repo_files(owner, name, branch)
        print(f"   Found {len(files)} .java files")

        local_repo = download_and_extract_repo(owner, name, branch, DATA_DIR)

        for rel_path in files[:max_files]:  # avoid crawling too many files
            file_path = local_repo / rel_path
            if not file_path.exists():
                continue

            methods = extract_methods_from_file(file_path)
            if not methods:
                continue

            # Look up the commit only for files that actually contribute
            # rows: each lookup is an API request counted against the rate
            # limit, so skip it for missing files and files without methods.
            commit_sha = get_last_commit(owner, name, rel_path)

            all_results.extend(
                {
                    "repo_name": repo["full_name"],
                    "repo_url": repo_url,
                    "commit_sha": commit_sha,
                    "file_path": rel_path,
                    **m
                }
                for m in methods
            )

    df = pd.DataFrame(all_results)
    df.to_csv(output_csv, index=False)
    return df


In [None]:
# Sample run: 3 repos above the star threshold, at most 30 .java files each.
df = build_dataset(
    n_repos=3,
    min_star=50,
    max_files=30,
)
print(df.head())

In [None]:
# Save an extra copy of the dataset as methods.csv; build_dataset has already
# written the same frame to its own output_csv path.
df.to_csv("methods.csv", index=False)

In [1]:
import requests
import os

# Use the token from the environment when present; otherwise query anonymously.
token = os.environ.get("GITHUB_TOKEN")
headers = {}
if token:
    headers["Authorization"] = f"Bearer {token}"

# Print the current GitHub API rate-limit status for these credentials.
url = "https://api.github.com/rate_limit"
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()
print(r.json())


{'resources': {'core': {'limit': 5000, 'used': 5000, 'remaining': 0, 'reset': 1758148849}, 'search': {'limit': 30, 'used': 0, 'remaining': 30, 'reset': 1758147425}, 'graphql': {'limit': 5000, 'used': 4, 'remaining': 4996, 'reset': 1758147532}, 'integration_manifest': {'limit': 5000, 'used': 0, 'remaining': 5000, 'reset': 1758150965}, 'source_import': {'limit': 100, 'used': 0, 'remaining': 100, 'reset': 1758147425}, 'code_scanning_upload': {'limit': 5000, 'used': 5000, 'remaining': 0, 'reset': 1758148849}, 'code_scanning_autofix': {'limit': 10, 'used': 0, 'remaining': 10, 'reset': 1758147425}, 'actions_runner_registration': {'limit': 10000, 'used': 0, 'remaining': 10000, 'reset': 1758150965}, 'scim': {'limit': 15000, 'used': 0, 'remaining': 15000, 'reset': 1758150965}, 'dependency_snapshots': {'limit': 100, 'used': 0, 'remaining': 100, 'reset': 1758147425}, 'dependency_sbom': {'limit': 100, 'used': 0, 'remaining': 100, 'reset': 1758147425}, 'audit_log': {'limit': 1750, 'used': 0, 'remai