# Description

In this notebook, I will use Python to craw 20_000 java method.

In [1]:
import argparse
import csv
import json
import os
import re
import sys
import time
from pathlib import Path
from typing import Dict, Iterable, List, Optional
import requests
import zipfile
import pandas as pd
from pathlib import Path
import javalang

In [2]:
TOKEN = os.getenv("GITHUB_TOKEN")
if not TOKEN:
    raise SystemExit("❌ Please set GITHUB_TOKEN in your environment.")

HEADERS = {"Authorization": f"Bearer {TOKEN}",
           "Accept": "application/vnd.github+json"}

# 1. Download Java repos

In [3]:
def search_java_repos(n=5, min_star=50):
    """Search Java repos (top starred)."""
    url = "https://api.github.com/search/repositories"
    params = {"q": f"language:Java stars:>{min_star}", "sort": "stars", "order": "desc", "per_page": n}
    r = requests.get(url, headers=HEADERS, params=params, timeout=30)
    r.raise_for_status()
    return r.json()["items"]

def download_zip(owner, repo, branch, dest):
    """Download repo as zipball."""
    url = f"https://api.github.com/repos/{owner}/{repo}/zipball/{branch}"
    r = requests.get(url, headers=HEADERS, stream=True, timeout=60)
    r.raise_for_status()
    path = dest / f"{owner}-{repo}.zip"
    with open(path, "wb") as f:
        for chunk in r.iter_content(1024 * 256):
            f.write(chunk)
    return path

def unzip_file(zip_path, dest):
    extract_dir = dest / zip_path.stem
    with zipfile.ZipFile(zip_path, "r") as z:
        z.extractall(extract_dir)
    return extract_dir

Download top repos with highest star

In [4]:
MAX_REPOS = 5
MIN_NUM_STARS = 50

base_dir = Path("java_repos")
base_dir.mkdir(exist_ok=True)

In [5]:
repos = search_java_repos(MAX_REPOS, MIN_NUM_STARS) 

In [7]:
# Save repos infor to a json file
with open(base_dir / "repos.json", "w") as f:
    json.dump(repos, f, indent=2)
print(f"Found {len(repos)} repos.")

Found 5 repos.


In [None]:
%%time
repos = search_java_repos(MAX_REPOS, MIN_NUM_STARS) 

for repo in repos:
    owner = repo["owner"]["login"]
    name = repo["name"]
    branch = repo["default_branch"]
    print(f"Downloading {owner}/{name}...")
    zip_path = download_zip(owner, name, branch, base_dir)
    folder = unzip_file(zip_path, base_dir)
    print(f"Saved {owner}/{name} into {folder}\n")
    time.sleep(1)  

# 2. Parse java method from repos

In [35]:
ROOT = Path("java_repos")          # parent folder containing repos
OUT  = Path("methods.csv")         # output file

In [36]:
def type_name(t):
    if t is None: return "void"
    name = getattr(t, "name", str(t))
    if getattr(t, "dimensions", None):
        name += "[]" * len(t.dimensions)
    return name

def params_sig(params):
    parts = []
    for p in params or []:
        ty = type_name(p.type)
        if getattr(p, "varargs", False): ty += "..."
        parts.append(f"{ty} {p.name}")
    return ", ".join(parts)

def find_method_code(src, start_line):
    """Grab code lines for a method (rough, brace-based)."""
    lines = src.splitlines()
    if not start_line or start_line > len(lines):
        return start_line, start_line, ""
    i = start_line - 1
    # find first '{'
    while i < len(lines) and '{' not in lines[i]:
        i += 1
    if i >= len(lines):
        return start_line, start_line, ""
    depth, j = 0, i
    code_lines = []
    while j < len(lines):
        line = lines[j]
        if "{" in line: depth += line.count("{")
        if "}" in line: depth -= line.count("}")
        code_lines.append(line)
        if depth == 0 and "}" in line:
            break
        j += 1
    return start_line, j + 1, "\n".join(code_lines)

def parse_file(java_file, writer, repo_name, repo_url):
    src = java_file.read_text(encoding="utf-8", errors="ignore")
    try:
        tree = javalang.parse.parse(src)
    except:
        return
    for _, m in tree.filter(javalang.tree.MethodDeclaration):
        sig = f"{' '.join(sorted(m.modifiers or []))} {type_name(m.return_type)} {m.name}({params_sig(m.parameters)})".strip()
        start, end, code = find_method_code(src, getattr(m.position, "line", None))
        writer.writerow([
            repo_name, repo_url, "", str(java_file),
            m.name, start or "", end or "", sig, code.strip()
        ])


In [37]:
with OUT.open("w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow([
        "repo_name","repo_url","commit_sha","file_path",
        "method_name","start_line","end_line","signature","original_code"
    ])
    for repo_dir in ROOT.iterdir():
        if repo_dir.is_dir():
            repo_name = repo_dir.name
            repo_url  = f"https://github.com/{repo_name.replace('-', '/')}"
            for java_file in repo_dir.rglob("*.java"):
                parse_file(java_file, writer, repo_name, repo_url)

print(f"✅ Done. Wrote {OUT}")

✅ Done. Wrote methods.csv


Test the craw file

In [39]:
df = pd.read_csv("methods.csv")
print(f"Total methods: {len(df)}")
df.head()

Total methods: 53016


Unnamed: 0,repo_name,repo_url,commit_sha,file_path,method_name,start_line,end_line,signature,original_code
0,spring-projects-spring-boot,https://github.com/spring/projects/spring/boot,,java_repos/spring-projects-spring-boot/spring-...,load,37,41,static Changelog load(),static Changelog load() {\n\t\tConfigurationMe...
1,spring-projects-spring-boot,https://github.com/spring/projects/spring/boot,,java_repos/spring-projects-spring-boot/spring-...,load,43,50,private static ConfigurationMetadataRepository...,private static ConfigurationMetadataRepository...
2,spring-projects-spring-boot,https://github.com/spring/projects/spring/boot,,java_repos/spring-projects-spring-boot/spring-...,generateChangeLog,44,54,void generateChangeLog(),void generateChangeLog() throws IOException {\...
3,spring-projects-spring-boot,https://github.com/spring/projects/spring/boot,,java_repos/spring-projects-spring-boot/spring-...,addJar,56,66,"private void addJar(File directory, String fil...","private void addJar(File directory, String fil..."
4,spring-projects-spring-boot,https://github.com/spring/projects/spring/boot,,java_repos/spring-projects-spring-boot/spring-...,diffContainsDifferencesBetweenLeftAndRightInputs,37,66,void diffContainsDifferencesBetweenLeftAndRigh...,void diffContainsDifferencesBetweenLeftAndRigh...
