Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 26 additions & 26 deletions build_ast_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,13 +70,13 @@

_VERBOSE_STDERR_LOCK = threading.Lock()

_PASS1_START = "[pass1] starting · parsing Java files under source root"
_PASS2_START = "[pass2] starting · emitting EXTENDS / IMPLEMENTS / DECLARES rows"
_PASS3_START = "[pass3] starting · call resolution (outgoing calls per site)"
_PASS4_START = "[pass4] starting · route and EXPOSES extraction"
_PASS5_START = "[pass5] starting · imperative HTTP_CALLS / ASYNC_CALLS edges"
_PASS6_START = "[pass6] starting · cross-service call-edge matching"
_WRITE_START = "[write] starting · writing Kuzu graph to disk"
_PASS1_START = "[graph] pass 1 · parsing Java files"
_PASS2_START = "[graph] pass 2 · emitting EXTENDS / IMPLEMENTS / DECLARES rows"
_PASS3_START = "[graph] pass 3 · call resolution (outgoing calls per site)"
_PASS4_START = "[graph] pass 4 · route and EXPOSES extraction"
_PASS5_START = "[graph] pass 5 · imperative HTTP_CALLS / ASYNC_CALLS edges"
_PASS6_START = "[graph] pass 6 · cross-service call-edge matching"
_WRITE_START = "[graph] writing · Kuzu graph to disk"


def _verbose_stderr_line(content: str) -> None:
Expand Down Expand Up @@ -104,7 +104,7 @@ def worker() -> None:
t0 = time.monotonic()
while not stop.wait(timeout=5.0):
elapsed = int(time.monotonic() - t0)
_verbose_stderr_line(f"{tag} running … {elapsed}s elapsed")
_verbose_stderr_line(f"{tag} · {elapsed}s elapsed")

self._thr = threading.Thread(target=worker, name=f"hb-{tag}", daemon=True)
self._thr.start()
Expand Down Expand Up @@ -476,7 +476,7 @@ def pass1_parse(root: Path, tables: GraphTables, *, verbose: bool) -> dict[str,
slow_sec = float(raw_slow)
except ValueError:
slow_sec = 0.0
with _VerbosePassHeartbeats("[pass1]", verbose=verbose):
with _VerbosePassHeartbeats("[graph] pass 1", verbose=verbose):
if verbose and slow_sec > 0:
time.sleep(slow_sec)
for p in iter_java_source_files(root, ignore=ignore):
Expand Down Expand Up @@ -521,7 +521,7 @@ def pass1_parse(root: Path, tables: GraphTables, *, verbose: bool) -> dict[str,
if verbose:
elapsed = time.time() - t0
_verbose_stderr_line(
f"[pass1] parsed {n_files} files in {elapsed:.2f}s: "
f"[graph] pass 1 · parsed {n_files} files in {elapsed:.2f}s: "
f"{len(tables.types)} types, {len(tables.members)} members, "
f"{tables.parse_errors} parse errors, {tables.skipped_files} skipped",
)
Expand Down Expand Up @@ -759,7 +759,7 @@ def pass2_edges(tables: GraphTables, asts: dict[str, JavaFileAst], *, verbose: b
seen_inj: set[tuple[str, str, str, str]] = set()
if verbose:
_verbose_stderr_line(_PASS2_START)
with _VerbosePassHeartbeats("[pass2]", verbose=verbose):
with _VerbosePassHeartbeats("[graph] pass 2", verbose=verbose):
for fqn, entry in tables.types.items():
ast = asts.get(entry.file_path)
if ast is None:
Expand All @@ -769,7 +769,7 @@ def pass2_edges(tables: GraphTables, asts: dict[str, JavaFileAst], *, verbose: b
if verbose:
elapsed = time.time() - t0
_verbose_stderr_line(
f"[pass2] emitted {len(tables.extends_rows)} EXTENDS, "
f"[graph] pass 2 · emitted {len(tables.extends_rows)} EXTENDS, "
f"{len(tables.implements_rows)} IMPLEMENTS, "
f"{len(tables.injects_rows)} INJECTS, "
f"{len(tables.phantoms)} phantoms in {elapsed:.2f}s",
Expand Down Expand Up @@ -1432,7 +1432,7 @@ def pass3_calls(tables: GraphTables, asts: dict[str, JavaFileAst], *, verbose: b
_verbose_stderr_line(_PASS3_START)
_build_member_indexes(tables)
stats = CallResolutionStats()
with _VerbosePassHeartbeats("[pass3]", verbose=verbose):
with _VerbosePassHeartbeats("[graph] pass 3", verbose=verbose):
for rel_path, file_ast in asts.items():
try:
_process_file_calls(file_ast, rel_path, tables, stats)
Expand All @@ -1455,7 +1455,7 @@ def pass3_calls(tables: GraphTables, asts: dict[str, JavaFileAst], *, verbose: b
)
log.info(msg)
if verbose:
_verbose_stderr_line(f"[pass3] {msg}")
_verbose_stderr_line(f"[graph] pass 3 · {msg}")


_PATH_VAR_SEG = re.compile(r"^\{([^:{}]+)(?::([^}]*))?\}$") # whole path segment
Expand Down Expand Up @@ -1586,7 +1586,7 @@ def pass4_routes(
meta_chain = collect_annotation_meta_chain(prs)
if verbose:
_verbose_stderr_line(_PASS4_START)
with _VerbosePassHeartbeats("[pass4]", verbose=verbose):
with _VerbosePassHeartbeats("[graph] pass 4", verbose=verbose):

for ast in asts.values():
stats.routes_skipped_unresolved += ast.routes_skipped_unresolved
Expand Down Expand Up @@ -1710,7 +1710,7 @@ def pass4_routes(
)
log.info(msg)
if verbose:
_verbose_stderr_line(f"[pass4] {msg}")
_verbose_stderr_line(f"[graph] pass 4 · {msg}")


def pass5_imperative_edges(
Expand Down Expand Up @@ -1763,7 +1763,7 @@ def _phantom_async_route_id(call: OutgoingCallDecl) -> str:

if verbose:
_verbose_stderr_line(_PASS5_START)
with _VerbosePassHeartbeats("[pass5]", verbose=verbose):
with _VerbosePassHeartbeats("[graph] pass 5", verbose=verbose):
for member in sorted(tables.members, key=lambda x: x.node_id):
if member.decl.is_constructor:
continue
Expand Down Expand Up @@ -2018,7 +2018,7 @@ def _phantom_async_route_id(call: OutgoingCallDecl) -> str:
http_strategy = dict(sorted(tables.call_edge_stats.http_calls_by_strategy.items()))
async_strategy = dict(sorted(tables.call_edge_stats.async_calls_by_strategy.items()))
_verbose_stderr_line(
f"[pass5] HTTP_CALLS: {len(tables.http_call_rows)} edges, "
f"[graph] pass 5 · HTTP_CALLS: {len(tables.http_call_rows)} edges, "
f"ASYNC_CALLS: {len(tables.async_call_rows)} edges; "
f"http_by_client_kind={http_client}, async_by_client_kind={async_client}, "
f"http_by_strategy={http_strategy}, async_by_strategy={async_strategy}",
Expand Down Expand Up @@ -2165,7 +2165,7 @@ def _micro_factor(member: MemberEntry | None) -> float:

if verbose:
_verbose_stderr_line(_PASS6_START)
with _VerbosePassHeartbeats("[pass6]", verbose=verbose):
with _VerbosePassHeartbeats("[graph] pass 6", verbose=verbose):
for row in tables.http_call_rows:
if row.match != "unresolved":
continue
Expand Down Expand Up @@ -2317,14 +2317,14 @@ def _micro_factor(member: MemberEntry | None) -> float:
first_http = ", ".join(suppressed_auto_cross_http)
first_async = ", ".join(suppressed_auto_cross_async)
_verbose_stderr_line(
f"[pass6] cross_service_resolution=brownfield_only:\n"
f"[graph] pass 6 · cross_service_resolution=brownfield_only:\n"
f" {n_bf} cross_service edges from brownfield layers,\n"
f" {suppressed_auto_cross_count} auto-cross-service candidates suppressed -> unresolved\n"
f" (first 5 http: {first_http})\n"
f" (first 5 async: {first_async})",
)
_verbose_stderr_line(
f"[pass6] http_match={dict(sorted(tables.call_edge_stats.http_calls_match_breakdown.items()))}, "
f"[graph] pass 6 · http_match={dict(sorted(tables.call_edge_stats.http_calls_match_breakdown.items()))}, "
f"async_match={dict(sorted(tables.call_edge_stats.async_calls_match_breakdown.items()))}, "
f"cross_service_calls_total={tables.call_edge_stats.cross_service_calls_total}",
)
Expand Down Expand Up @@ -3004,7 +3004,7 @@ def write_kuzu(
)
if verbose:
_verbose_stderr_line(_WRITE_START)
with _VerbosePassHeartbeats("[write]", verbose=verbose):
with _VerbosePassHeartbeats("[graph] writing", verbose=verbose):
db_path.parent.mkdir(parents=True, exist_ok=True)
db = kuzu.Database(str(db_path))
conn = kuzu.Connection(db)
Expand All @@ -3018,17 +3018,17 @@ def write_kuzu(
meta_chain=meta_chain,
)
if verbose:
_verbose_stderr_line(f"[write] nodes written in {time.time() - t0:.2f}s")
_verbose_stderr_line(f"[graph] writing · nodes written in {time.time() - t0:.2f}s")
_populate_declares_rows(tables)
_populate_overrides_rows(tables)
t1 = time.time()
_write_edges(conn, tables)
if verbose:
_verbose_stderr_line(f"[write] edges written in {time.time() - t1:.2f}s")
_verbose_stderr_line(f"[graph] writing · edges written in {time.time() - t1:.2f}s")
t2 = time.time()
_write_routes_and_exposes(conn, tables)
if verbose:
_verbose_stderr_line(f"[write] routes/exposes written in {time.time() - t2:.2f}s")
_verbose_stderr_line(f"[graph] writing · routes/exposes written in {time.time() - t2:.2f}s")
_write_meta(conn, tables, source_root)
conn.close()

Expand Down Expand Up @@ -3073,7 +3073,7 @@ def main() -> int:
pass6_match_edges(tables, verbose=args.verbose)
write_kuzu(kuzu_path, tables, source_root=root, verbose=args.verbose)
if args.verbose:
_verbose_stderr_line(f"[done] kuzu at {kuzu_path}")
_verbose_stderr_line(f"[graph] done · kuzu at {kuzu_path}")
return 0


Expand Down
63 changes: 41 additions & 22 deletions java_codebase_rag/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,19 +100,25 @@ def _emit_reprocess_outcome(payload: dict[str, Any], *, selective_tty_mode: str


def _pipeline_header(subcommand: str, cfg: ResolvedOperatorConfig) -> None:
    """Print the pipeline start banner for *subcommand* to stderr.

    The banner is a single bold line naming the subcommand, the resolved
    source root and the resolved index directory, separated by
    ``_PIPELINE_SEP``.

    Args:
        subcommand: CLI subcommand name shown in the banner.
        cfg: Resolved operator configuration; ``source_root`` and
            ``index_dir`` are resolved and shown in the banner.
    """
    # Function-scope import, as in the original code.
    from java_codebase_rag.cli_format import bold

    root = cfg.source_root.resolve()
    idx = cfg.index_dir.resolve()
    # Exactly one positional argument: passing the plain string next to the
    # bold one would make print() emit the banner text twice.
    print(
        bold(f"java-codebase-rag {subcommand} {_PIPELINE_SEP} source={root} {_PIPELINE_SEP} index={idx}"),
        file=sys.stderr,
        flush=True,
    )


def _pipeline_footer(subcommand: str, started: float, exit_code: int) -> None:
    """Print the pipeline completion line for *subcommand* to stderr.

    The line starts with a check marker when ``exit_code`` is 0 and a cross
    marker otherwise, followed by the bold "finished in …s" summary. The
    ``(exit=N)`` suffix is appended only for non-zero exit codes.

    Args:
        subcommand: CLI subcommand name shown in the footer.
        started: ``time.perf_counter()`` reading taken when the pipeline
            started; the elapsed time is measured against it.
        exit_code: Exit code of the pipeline run.
    """
    # Function-scope import, as in the original code.
    from java_codebase_rag.cli_format import bold, styled_check, styled_cross

    elapsed = time.perf_counter() - started
    marker = styled_check() if exit_code == 0 else styled_cross()
    summary = bold(f"java-codebase-rag {subcommand} {_PIPELINE_SEP} finished in {elapsed:.2f}s")
    suffix = f" (exit={exit_code})" if exit_code != 0 else ""
    # Exactly one positional argument: a second, plain footer string would be
    # printed alongside the styled one and always show the exit code.
    print(
        f"{marker} {summary}{suffix}",
        file=sys.stderr,
        flush=True,
    )
Expand Down Expand Up @@ -205,6 +211,22 @@ def _add_index_embedding_flags(p: argparse.ArgumentParser) -> None:
p.add_argument("--embedding-device", type=str, default=None, help="Override SBERT_DEVICE / YAML embedding.device")


def _add_verbosity_flags(p: argparse.ArgumentParser) -> None:
g = p.add_mutually_exclusive_group()
g.add_argument(
"--quiet", "-q",
action="store_true",
dest="quiet",
help="Suppress stderr progress relay; stdout payload unchanged.",
)
g.add_argument(
"--verbose", "-v",
action="store_true",
dest="verbose",
help="Show full subprocess output (Lance warnings, brownfield events, progress bars).",
)


def _cmd_init(args: argparse.Namespace) -> int:
cfg = _resolved_from_ns(args)
_startup_hints(cfg)
Expand All @@ -227,10 +249,12 @@ def _cmd_init(args: argparse.Namespace) -> int:

def work() -> int:
env = cfg.subprocess_env()
verbose = bool(args.verbose)
coco = run_cocoindex_update(
env,
full_reprocess=False,
quiet=bool(args.quiet),
verbose=verbose,
lance_project_root=None if args.quiet else cfg.source_root,
)
if coco.returncode != 0:
Expand All @@ -244,10 +268,13 @@ def work() -> int:
}
)
return 1
if not args.quiet:
print(file=sys.stderr, flush=True)
g = run_build_ast_graph(
source_root=cfg.source_root,
kuzu_path=cfg.kuzu_path,
verbose=not args.quiet,
verbose=verbose,
quiet=bool(args.quiet),
env=env,
)
if g.returncode != 0:
Expand Down Expand Up @@ -279,6 +306,7 @@ def work() -> int:
env,
full_reprocess=False,
quiet=bool(args.quiet),
verbose=bool(args.verbose),
lance_project_root=None if args.quiet else cfg.source_root,
)
if coco.returncode != 0:
Expand All @@ -305,11 +333,12 @@ def _cmd_reprocess(args: argparse.Namespace) -> int:

def work() -> int:
env = cfg.subprocess_env()
verbose = bool(args.verbose)
vectors_only = bool(getattr(args, "vectors_only", False))
graph_only = bool(getattr(args, "graph_only", False))

if vectors_only:
coco = run_cocoindex_update(env, full_reprocess=True, quiet=bool(args.quiet))
coco = run_cocoindex_update(env, full_reprocess=True, quiet=bool(args.quiet), verbose=verbose)
if _is_cocoindex_preflight_blocker(coco):
payload: dict[str, Any] = {
"success": False,
Expand Down Expand Up @@ -345,7 +374,8 @@ def work() -> int:
g = run_build_ast_graph(
source_root=cfg.source_root,
kuzu_path=cfg.kuzu_path,
verbose=not args.quiet,
verbose=verbose,
quiet=bool(args.quiet),
env=env,
)
if _is_graph_preflight_blocker(g):
Expand Down Expand Up @@ -381,7 +411,7 @@ def work() -> int:

import server # lazy: pulls sentence_transformers/torch/lancedb/kuzu

result = asyncio.run(server.run_refresh_pipeline(quiet=bool(args.quiet)))
result = asyncio.run(server.run_refresh_pipeline(quiet=bool(args.quiet), verbose=verbose))
payload = result.model_dump()
_emit_reprocess_outcome(payload)
return _reprocess_exit_code(payload)
Expand Down Expand Up @@ -614,11 +644,7 @@ def build_parser() -> argparse.ArgumentParser:
),
)
_add_index_embedding_flags(init)
init.add_argument(
"--quiet",
action="store_true",
help="Suppress stderr progress relay; stdout payload unchanged.",
)
_add_verbosity_flags(init)
init.set_defaults(handler=_cmd_init)

increment = subparsers.add_parser(
Expand All @@ -627,11 +653,7 @@ def build_parser() -> argparse.ArgumentParser:
description="Runs cocoindex catch-up (no full reprocess). Does not rebuild Kuzu; see stderr warning.",
)
_add_index_embedding_flags(increment)
increment.add_argument(
"--quiet",
action="store_true",
help="Suppress stderr progress relay; stdout payload unchanged.",
)
_add_verbosity_flags(increment)
increment.set_defaults(handler=_cmd_increment)

reprocess = subparsers.add_parser(
Expand All @@ -643,11 +665,7 @@ def build_parser() -> argparse.ArgumentParser:
),
)
_add_index_embedding_flags(reprocess)
reprocess.add_argument(
"--quiet",
action="store_true",
help="Suppress stderr progress relay; stdout payload unchanged.",
)
_add_verbosity_flags(reprocess)
_rex = reprocess.add_mutually_exclusive_group()
_rex.add_argument(
"--vectors-only",
Expand All @@ -669,8 +687,9 @@ def build_parser() -> argparse.ArgumentParser:
_add_index_embedding_flags(erase)
erase.add_argument("--yes", action="store_true", help="Confirm destructive deletion (required in CI)")
erase.add_argument(
"--quiet",
"--quiet", "-q",
action="store_true",
dest="quiet",
help="Suppress stderr progress relay; stdout payload unchanged.",
)
erase.set_defaults(handler=_cmd_erase)
Expand Down
Loading
Loading