Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,22 @@ services:
timeout: 5s
retries: 20

jina-reranker:
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
platform: linux/amd64
container_name: jina-reranker
command: --model-id BAAI/bge-reranker-v2-m3 --max-client-batch-size 64 --max-batch-tokens 65536
volumes:
- reranker_cache:/data
ports:
- "8088:80"
healthcheck:
test: ["CMD-SHELL", "curl -f http://localhost:80/health || exit 1"]
interval: 15s
timeout: 5s
retries: 5
start_period: 360s

code-search:
build: .
platform: linux/amd64
Expand All @@ -37,14 +53,19 @@ services:
environment:
EMBEDDINGS_URL: http://jina-embeddings:80
QDRANT_URL: http://qdrant:6333
RERANKER_URL: http://jina-reranker:80
RERANKER_ENABLED: "true"
depends_on:
qdrant:
condition: service_healthy
jina-embeddings:
condition: service_healthy
jina-reranker:
condition: service_healthy
volumes:
- ./config.yaml:/app/config.yaml:ro

volumes:
qdrant_data:
embeddings_cache:
reranker_cache:
Empty file added eval/__init__.py
Empty file.
57 changes: 57 additions & 0 deletions eval/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from __future__ import annotations

import argparse
import asyncio
import logging
from datetime import datetime
from pathlib import Path

# Root logger configuration: emit WARNING and above as "LEVEL logger-name: message".
logging.basicConfig(level=logging.WARNING, format="%(levelname)s %(name)s: %(message)s")


def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the evaluation entry point."""
    cli = argparse.ArgumentParser(
        description="Evaluate search_code retrieval quality against a golden dataset."
    )
    cli.add_argument("--dataset", required=True, help="Path to a YAML query dataset file")
    cli.add_argument(
        "--mode",
        default="baseline",
        choices=["baseline", "rerank"],
        help="baseline = vector search only; rerank = vector + cross-encoder",
    )
    cli.add_argument(
        "--output",
        default=None,
        help="Path to write JSON results (auto-generated if omitted)",
    )
    cli.add_argument(
        "--top-k",
        nargs="+",
        type=int,
        default=[1, 3, 5, 10],
        metavar="K",
        help="K values for Recall@K (default: 1 3 5 10)",
    )
    return cli


def main() -> None:
    """Parse the command line and run one evaluation pass.

    When ``--output`` is not given, the results path is derived from the
    dataset file stem, the selected mode and the current local timestamp,
    and placed under ``eval/results/``.
    """
    options = _build_arg_parser().parse_args()

    if options.output is not None:
        destination = options.output
    else:
        stamp = datetime.now().strftime("%Y-%m-%d-%H%M%S")
        dataset_stem = Path(options.dataset).stem
        destination = f"eval/results/{dataset_stem}-{options.mode}-{stamp}.json"

    # Imported inside the function, after argument parsing has succeeded.
    from eval.runner import run_evaluation

    evaluation = run_evaluation(
        dataset_path=options.dataset,
        mode=options.mode,
        top_k_values=options.top_k,
        output_path=destination,
    )
    asyncio.run(evaluation)


if __name__ == "__main__":
    main()
124 changes: 124 additions & 0 deletions eval/datasets/queries.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# Evaluation dataset — primary golden query set
#
# Each case has:
# id: unique string (used in reports)
# query: natural language search query
# filters: optional Qdrant filters (language, service, symbol_type)
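# expected: list of acceptable correct answers — OR semantics across entries;
# within a single entry, every key that is present must match (AND)
# symbol_name: substring match (case-insensitive)
# file_path_contains: substring match on the stored file path (case-insensitive)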
# notes: human-readable explanation of what the case tests
#
# A result is correct if ANY expected entry matches the hit's payload.
#
# Run: uv run python -m eval --dataset eval/datasets/queries.yaml

# ---------------------------------------------------------------------------
# Java / Spring — auth-server, gateway (GoodbyePlanet/spring-cg-bff)
# ---------------------------------------------------------------------------

- id: java-webauthn-controller
query: "REST controller for WebAuthn passkey authentication"
filters: { language: java }
expected:
- { symbol_name: "WebAuthnController", file_path_contains: "WebAuthnController" }
notes: "Tests @RestController detection in auth-server"

- id: java-entity-authorization
query: "JPA entity that stores OAuth2 authorization data"
filters: { language: java }
expected:
- { symbol_name: "Authorization", file_path_contains: "entity/authorization" }
notes: "Tests @Entity extraction — Authorization table"

- id: java-entity-consent
query: "JPA entity for OAuth2 authorization consent"
filters: { language: java }
expected:
- { symbol_name: "AuthorizationConsent", file_path_contains: "authorizationconsent" }
notes: "Tests @Entity extraction — AuthorizationConsent table"

- id: java-service-begin-auth
query: "service method that initiates the WebAuthn authentication flow"
filters: { language: java, symbol_type: method }
expected:
- { symbol_name: "beginAuthentication" }
notes: "Tests method-in-class chunking (parent WebAuthnService)"

- id: java-check-leaked-password
query: "service method that checks whether a password has been leaked or compromised"
filters: { language: java, symbol_type: method }
expected:
- { symbol_name: "isPasswordLeaked" }
notes: "Tests semantic retrieval of a boolean check method"

- id: java-webauthn-exception
query: "custom runtime exception thrown when WebAuthn authentication fails"
filters: { language: java }
expected:
- { symbol_name: "WebAuthnException", file_path_contains: "WebAuthnException" }
notes: "Tests exception class extraction"

- id: java-login-controller
query: "controller that serves the login page view"
filters: { language: java }
expected:
- { symbol_name: "LoginPageController", file_path_contains: "LoginPageController" }
notes: "Tests @Controller (non-REST) detection"

# ---------------------------------------------------------------------------
# TypeScript / React — fe-client (GoodbyePlanet/spring-cg-bff)
# ---------------------------------------------------------------------------

- id: ts-react-app-component
query: "main React application component that manages authentication state"
filters: { language: typescript, symbol_type: react_component }
expected:
- { symbol_name: "App", file_path_contains: "App.tsx" }
notes: "Tests react_component detection for root component"

- id: ts-interface-passkey
query: "TypeScript interface representing a registered passkey with a name and creation date"
filters: { language: typescript }
expected:
- { symbol_name: "RegisteredPasskey" }
notes: "Tests interface extraction from App.tsx"

# ---------------------------------------------------------------------------
# Go — leaked-passwords-api and passkey-service (GoodbyePlanet repos)
# ---------------------------------------------------------------------------

- id: go-struct-user
query: "Go struct representing a user with WebAuthn credentials"
filters: { language: go }
expected:
- { symbol_name: "User", file_path_contains: "user" }
notes: "Tests Go struct extraction — passkey-service models"

- id: go-struct-credential
query: "Go struct storing WebAuthn credential and authenticator data"
filters: { language: go }
expected:
- { symbol_name: "Credential", file_path_contains: "user" }
notes: "Tests Go struct extraction — Credential model"

- id: go-handler-check-password
query: "HTTP handler function that checks if a password has been leaked"
filters: { language: go }
expected:
- { symbol_name: "CheckPasswordHandler" }
notes: "Tests Go HTTP handler detection in leaked-passwords-api"

- id: go-handler-get-by-hash
query: "HTTP handler that looks up a password hash and returns breach count"
filters: { language: go }
expected:
- { symbol_name: "GetByHashHandler" }
notes: "Tests Go function extraction — hash lookup handler"

- id: go-service-password
query: "Go struct that wraps the password service with store and logger dependencies"
filters: { language: go, symbol_type: struct }
expected:
- { symbol_name: "PasswordService" }
notes: "Tests Go struct with dependency fields"
125 changes: 125 additions & 0 deletions eval/datasets/queries_html_css.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
# Evaluation dataset — HTML and CSS specific cases
#
# These cases exist specifically to track retrieval quality for the HTML/CSS parsers,
# where the embedding model (Jina Code V2) is trained primarily on programming languages
# and its behaviour on CSS selectors / HTML structure is unverified.
#
# CSS symbol names are the selector string (e.g. ".btn-primary", "#header", "nav > ul").
# HTML symbol names are the tag name or heading text (for headings) or id/class value
# (for elements with id/class attributes).
#
# Run: uv run python -m eval --dataset eval/datasets/queries_html_css.yaml

# ---------------------------------------------------------------------------
# CSS examples — replace selectors with ones from your actual indexed files
# ---------------------------------------------------------------------------

- id: css-primary-button
query: "primary button styles"
filters: { language: css }
expected:
- { symbol_name: "btn-primary" }
- { symbol_name: ".btn-primary" }
- { symbol_name: ".button--primary" }
notes: "Button component CSS; tests selector-as-symbol-name pattern"

- id: css-hover-state
query: "hover state styles for navigation links"
filters: { language: css }
expected:
- { symbol_name: "nav" }
- { symbol_name: "a:hover" }
- { file_path_contains: "nav" }
notes: "Tests pseudo-class selector embedding quality"

- id: css-mobile-responsive
query: "mobile responsive breakpoint styles"
filters: { language: css }
expected:
- { file_path_contains: "responsive" }
- { file_path_contains: "mobile" }
- { symbol_name: "@media" }
notes: "Tests @media rule detection"

- id: css-grid-layout
query: "CSS grid layout for the main content area"
filters: { language: css }
expected:
- { symbol_name: ".container" }
- { symbol_name: ".main" }
- { symbol_name: ".grid" }
notes: "Tests layout-related class naming"

- id: css-form-input
query: "form input field styling"
filters: { language: css }
expected:
- { symbol_name: "input" }
- { symbol_name: ".form-control" }
- { symbol_name: ".input" }
notes: "Tests element + class selector disambiguation"

- id: css-error-state
query: "error state styling for form validation"
filters: { language: css }
expected:
- { symbol_name: ".error" }
- { symbol_name: ".is-invalid" }
- { symbol_name: ".form-error" }
notes: "Tests semantic understanding of validation-related CSS"

- id: css-card-component
query: "card component with shadow and border radius"
filters: { language: css }
expected:
- { symbol_name: ".card" }
- { symbol_name: ".card-container" }
notes: "Tests component-level CSS matching"

- id: css-color-variables
query: "CSS custom properties for brand colors"
filters: { language: css }
expected:
- { symbol_name: ":root" }
- { symbol_name: "--primary" }
notes: "Tests CSS custom property / variable detection"

# ---------------------------------------------------------------------------
# HTML examples
# ---------------------------------------------------------------------------

- id: html-login-form
query: "login form with username and password fields"
filters: { language: html }
expected:
- { file_path_contains: "login" }
- { symbol_name: "login" }
- { symbol_name: "login-form" }
notes: "Tests HTML form element extraction"

- id: html-nav-menu
query: "navigation menu with links"
filters: { language: html }
expected:
- { symbol_name: "nav" }
- { symbol_name: "navbar" }
- { symbol_name: "navigation" }
notes: "Tests nav element / id extraction"

- id: html-page-header
query: "page header with site title"
filters: { language: html }
expected:
- { symbol_name: "header" }
- { symbol_name: "site-header" }
- { symbol_name: "page-header" }
notes: "Tests heading and landmark element detection"

- id: html-product-card
query: "product card showing image, name and price"
filters: { language: html }
expected:
- { symbol_name: "product-card" }
- { symbol_name: "card" }
- { file_path_contains: "product" }
notes: "Tests class-named element extraction"
26 changes: 26 additions & 0 deletions eval/matching.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from __future__ import annotations

from qdrant_client.models import ScoredPoint


def find_first_correct_rank(
    hits: list[ScoredPoint],
    expected: list[dict],
) -> int | None:
    """Locate the earliest hit that counts as a correct answer.

    Hits are examined in the order given. A hit is correct as soon as its
    payload satisfies at least one entry of ``expected``.

    Args:
        hits: Search results, best match first.
        expected: Acceptable answers; satisfying any single entry is enough.

    Returns:
        The 1-based position of the first correct hit, or ``None`` when no
        hit satisfies any expected entry.
    """
    position = 0
    for candidate in hits:
        position += 1
        for criteria in expected:
            if _matches(candidate.payload, criteria):
                return position
    return None


def _matches(payload: dict, expected: dict) -> bool:
if "symbol_name" in expected:
hit_name = (payload.get("symbol_name") or "").lower()
if expected["symbol_name"].lower() not in hit_name:
return False
if "file_path_contains" in expected:
hit_path = (payload.get("file_path") or "").lower()
if expected["file_path_contains"].lower() not in hit_path:
return False
return True
15 changes: 15 additions & 0 deletions eval/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from __future__ import annotations


def recall_at_k(ranks: list[int | None], k: int) -> float:
    """Compute Recall@K over a set of evaluated cases.

    Args:
        ranks: Per-case rank of the first correct hit, or ``None`` when the
            case had no correct hit.
        k: Cut-off rank (inclusive).

    Returns:
        The share of cases whose first correct hit is at rank ``k`` or
        better; ``0.0`` when there are no cases.
    """
    if ranks:
        within_cutoff = [rank for rank in ranks if rank is not None and rank <= k]
        return len(within_cutoff) / len(ranks)
    return 0.0


def mrr(ranks: list[int | None]) -> float:
    """Compute the Mean Reciprocal Rank over a set of evaluated cases.

    Args:
        ranks: Per-case rank of the first correct hit, or ``None`` when the
            case had no correct hit. A ``None`` case adds nothing to the sum
            but still counts in the denominator.

    Returns:
        The sum of ``1 / rank`` over cases with a correct hit, divided by
        the total number of cases; ``0.0`` when there are no cases.
    """
    if ranks:
        reciprocal_ranks = [1 / rank for rank in ranks if rank is not None]
        return sum(reciprocal_ranks) / len(ranks)
    return 0.0
Empty file added eval/results/.gitkeep
Empty file.
Loading