Netis · vaderyang · May 20, 2026 · May 20, 2026 · May 20, 2026
diff --git a/.github/workflows/pr-review-probe.yml b/.github/workflows/pr-review-probe.yml
@@ -0,0 +1,72 @@
+name: pr-review-probe
+
+# One-off probe that verifies the self-hosted runner has the
+# prerequisites the pr-review workflow needs. Run manually before
+# merging the pr-review feature, or any time the runner is reimaged.
+# Reports each check as a step so failure points are obvious in the
+# workflow log.
+
+on:
+  workflow_dispatch:  # manual trigger only; no other event starts this probe
+
+jobs:
+  probe:
+    runs-on: [self-hosted, tokenscope]  # same runner labels as the pr-review job
+    timeout-minutes: 5
+    env:
+      ANTHROPIC_BASE_URL: http://172.16.103.81:4200  # endpoint checked by the "LiteLLM reachable" step
+      ANTHROPIC_API_KEY: dummy  # placeholder; presumably not validated by LiteLLM — TODO confirm
+      ANTHROPIC_MODEL: claude-3-5-sonnet-20241022  # passed to `claude --model` in the round-trip step
+    steps:
+      - name: Runner host
+        # Record which machine, account and working directory the job landed on.
+        run: |
+          printf 'hostname: %s\n' "$(hostname)"
+          printf 'user: %s\npwd: %s\n' "$(whoami)" "$(pwd)"
+          printf 'uname: %s\n' "$(uname -a)"
+
+      - name: PATH
+        run: printf '%s\n' "$PATH" | tr ':' '\n'  # one PATH entry per line
+
+      - name: Claude Code CLI present
+        run: |
+          if ! command -v claude >/dev/null 2>&1; then
+            echo "::error::claude CLI not found on PATH"
+            echo "install with: npm i -g @anthropic-ai/claude-code"
+            exit 1
+          fi
+          # Past the guard the binary exists; a failing --version is tolerated.
+          echo "claude: $(which claude)"
+          claude --version || echo "(no --version)"
+
+      - name: gh CLI authenticated
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          # A missing binary is reported with an explicit error annotation.
+          command -v gh >/dev/null 2>&1 || { echo "::error::gh CLI not found on PATH"; exit 1; }
+          # Informational only: a non-zero status from this command is ignored.
+          gh auth status || true
+          # The actual check: this API call must succeed for the step to pass.
+          gh api repos/${{ github.repository }} --jq .full_name
+
+      - name: Python 3 + envsubst
+        run: |
+          python3 --version  # prints the interpreter version
+          if ! command -v envsubst; then echo "::error::envsubst missing (apt install gettext-base)"; exit 1; fi
+
+      - name: LiteLLM reachable
+        run: |
+          MODELS_URL="$ANTHROPIC_BASE_URL/v1/models"  # endpoint used for all three requests
+          if ! curl -fsS --max-time 5 "$MODELS_URL" >/dev/null; then
+            echo "::error::LiteLLM unreachable at $ANTHROPIC_BASE_URL"
+            curl -v --max-time 5 "$MODELS_URL" || true
+            exit 1
+          fi
+          curl -s "$MODELS_URL" | head -c 400; echo
+
+      - name: Round-trip claude → LiteLLM → GLM-5
+        run: |
+          OUT="$(timeout 60 claude --print --model "$ANTHROPIC_MODEL" --max-turns 1 --output-format text "say PONG, nothing else" 2>&1)" && echo "$OUT" | grep -qi PONG || FAILED=1
+          echo "agent response: $OUT"  # shown even when claude fails or times out (an and-or list does not trip bash -e)
+          [ -z "${FAILED:-}" ] || { echo "::error::claude failed or no PONG in agent output"; exit 1; }
diff --git a/.github/workflows/pr-review.yml b/.github/workflows/pr-review.yml
@@ -0,0 +1,107 @@
+name: pr-review
+
+# Headless code-review agent. Fires AFTER the `ci` workflow completes
+# successfully on a PR — that gate keeps the agent from wasting GLM-5
+# cycles on PRs that don't even compile, and ensures the review the
+# author sees is "tests are green, here's what to look at next".
+#
+# Manual `workflow_dispatch` is kept as a fallback for re-runs.
+#
+# Runs on the wuneng self-hosted runner. Reaches LiteLLM at
+# 172.16.103.81:4200 over the internal libvirt bridge; LiteLLM
+# rewrites the Anthropic-shaped request onto the GLM-5 backend
+# (SGLang :9000). See docs/pr-review-agent.md.
+
+on:
+  workflow_run:
+    workflows: ["ci"]
+    types: [completed]  # fires for any conclusion; the job-level `if` keeps only successes
+  workflow_dispatch:
+    inputs:
+      pr_number:
+        description: PR number to review
+        required: true
+        type: number
+
+concurrency:
+  # One review per PR. A re-trigger cancels an in-flight review of
+  # the same PR so we don't post conflicting comments.
+  group: pr-review-${{ github.event.workflow_run.pull_requests[0].number || inputs.pr_number }}  # upstream run's first PR, else the manual input
+  cancel-in-progress: true
+
+jobs:
+  review:
+    # Gate the workflow_run path on:
+    #   * CI actually succeeded (we never review red builds)
+    #   * the upstream run was triggered by a PR (push events have no
+    #     PR to comment on)
+    #   * pull_requests[] is populated (empty for fork PRs — by design
+    #     we only review same-repo PRs from this internal repo)
+    if: >-
+      github.event_name == 'workflow_dispatch' ||
+      (
+        github.event.workflow_run.conclusion == 'success' &&
+        github.event.workflow_run.event == 'pull_request' &&
+        github.event.workflow_run.pull_requests[0] != null
+      )
+    runs-on: [self-hosted, tokenscope]
+    timeout-minutes: 35  # headroom over the agent's documented 1800 s timeout so "Post review" can still run
+    env:
+      # Agent → LiteLLM → GLM-5. LiteLLM is configured to rewrite
+      # claude-3-5-sonnet-20241022 onto the GLM-5 backend.
+      ANTHROPIC_BASE_URL: http://172.16.103.81:4200
+      ANTHROPIC_API_KEY: dummy
+      ANTHROPIC_MODEL: claude-3-5-sonnet-20241022
+    steps:
+      - name: Resolve PR + head SHA
+        id: pr
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          # Expression values arrive as env data and never become script text.
+          PR_NUMBER: ${{ github.event.workflow_run.pull_requests[0].number || inputs.pr_number }}
+        run: |
+          case "$PR_NUMBER" in ''|null|*[!0-9]*) echo "::error::could not resolve PR number"; exit 1 ;; esac
+          # Read each field as plain data instead of eval-ing gh output: a branch
+          # name may contain shell metacharacters. A failed gh call aborts the step.
+          HEAD_SHA="$(gh pr view "$PR_NUMBER" --repo "$GITHUB_REPOSITORY" --json headRefOid --jq .headRefOid)"
+          BASE_REF="$(gh pr view "$PR_NUMBER" --repo "$GITHUB_REPOSITORY" --json baseRefName --jq .baseRefName)"
+          if [ -z "$HEAD_SHA" ] || [ -z "$BASE_REF" ]; then
+            echo "::error::could not resolve head SHA / base ref for PR #$PR_NUMBER"
+            exit 1
+          fi
+          {
+            echo "pr_number=$PR_NUMBER"
+            echo "head_sha=$HEAD_SHA"
+            echo "base_ref=$BASE_REF"
+          } >> "$GITHUB_OUTPUT"
+
+      - name: Checkout PR head
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ steps.pr.outputs.head_sha }}
+          fetch-depth: 0
+
+      - name: Fetch base for diff
+        env:
+          BASE_REF: ${{ steps.pr.outputs.base_ref }}
+        run: git fetch origin "$BASE_REF" --depth 200
+
+      - name: Run review agent
+        id: review  # `steps.review.outcome` is forwarded to "Post review" as AGENT_EXIT
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PR_NUMBER: ${{ steps.pr.outputs.pr_number }}
+        run: bash scripts/pr-review/run_review.sh "$PR_NUMBER"  # the PR number is the script's only argument
+
+      - name: Post review
+        if: ${{ !cancelled() && steps.pr.outputs.pr_number != '' }}  # still runs after agent failure; skipped when superseded or PR unresolved
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PR_NUMBER: ${{ steps.pr.outputs.pr_number }}
+          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+          AGENT_EXIT: ${{ steps.review.outcome }}
+        run: python3 scripts/pr-review/post_review.py "$PR_NUMBER"
+
+      - name: Cleanup transient files
+        if: always()  # run regardless of how earlier steps ended
+        run: rm -f /tmp/pr-review-*.md /tmp/pr-review-*.log /tmp/pr-review-*.json || true  # globs are not scoped to one PR number
diff --git a/docs/pr-review-agent.md b/docs/pr-review-agent.md
@@ -0,0 +1,152 @@
+# PR review agent
+
+A headless code-review agent that fires after CI passes and before
+a human picks up the PR. The goal isn't to replace human review — it's
+to surface the obvious-in-hindsight stuff (schema mirror drift,
+body-column scans, missing queryKey deps, classifier rules sensitive
+to window width, etc.) so the human reviewer arrives at a PR with the
+"easy 80%" already triaged.
+
+## Architecture
+
+```
+GitHub                                  wuneng VM tokenscope-ci
+┌──────────────┐  ci passes (workflow_run)  ┌─────────────────────────┐
+│  PR opened   │ ─────────────────────────► │ self-hosted GH runner   │
+│  PR sync     │                            │  ┌───────────────────┐  │
+│              │                            │  │ pr-review.yml     │  │
+│              │                            │  │  ↓                │  │
+│              │                            │  │ run_review.sh ────┼──┼─► LiteLLM :4200
+│              │                            │  │   claude -p       │  │     ↓
+│              │                            │  │   read-only tools │  │   SGLang GLM-5 :9000
+│              │                            │  │  ↓                │  │
+│              │  gh pr review              │  │ post_review.py    │  │
+│              │ ◄──────────────────────────┤  └───────────────────┘  │
+└──────────────┘                            └─────────────────────────┘
+```
+
+## Components
+
+* `.github/workflows/pr-review.yml`
+  Trigger: `workflow_run` on the `ci` workflow's `completed` event,
+  gated on `conclusion == 'success'` and `event == 'pull_request'`.
+  Also accepts `workflow_dispatch` for manual re-runs (`gh workflow
+  run pr-review.yml -f pr_number=27`).
+* `scripts/pr-review/run_review.sh`
+  Substitutes `PR_NUMBER` / `HEAD_SHA` / `BASE_REF` into the prompt
+  template, pre-flights LiteLLM, runs `claude -p` with the read-only
+  tool allowlist + 1800 s outer timeout, writes the model's stdout to
+  `/tmp/pr-review-${N}-out.md`.
+* `scripts/pr-review/prompt.md`
+  Instructional prompt. Encodes repo facts the agent has to know
+  before reading the diff (crate map, schema mirror rules, repo's
+  history of footguns) and the strict output format the parser
+  expects.
+* `scripts/pr-review/allowed_tools.txt`
+  Explicit allowlist — `Read`, `Grep`, `Glob`, and a few inspection
+  Bash patterns. No `Edit`, no `Write`, no unrestricted `Bash(*)`.
+* `scripts/pr-review/post_review.py`
+  Reads the agent's output, picks the review event from the section
+  population (`Blocking` → REQUEST_CHANGES, `Suggestions`/`Questions`
+  → COMMENT, none → APPROVE), and hands it to `gh pr review`. Falls
+  back to a plain `gh pr comment` if the bot can't review the PR
+  (e.g. it authored it).
+
+## Trigger sequence
+
+```
+1. PR opened / synchronize
+2. `ci` workflow runs (cargo test, console build, …)
+3. `ci` completes with success
+4. `workflow_run` fires `pr-review` (this workflow)
+5. pr-review checks out the PR head, runs the agent
+6. agent posts a single PR review (APPROVE / COMMENT / REQUEST_CHANGES)
+```
+
+If CI fails, the review agent never runs. That's intentional — there's
+no value in spending GPU time on a review of a PR that won't build.
+
+## Manual re-run
+
+```
+gh workflow run pr-review.yml -f pr_number=27
+```
+
+The `concurrency` block ensures a manual re-trigger cancels any
+in-flight review of the same PR — no duplicate comments.
+
+## Self-hosted runner expectations
+
+The `tokenscope` self-hosted runner on wuneng's `tokenscope-ci` VM
+needs:
+
+1. **Claude Code CLI** installed and on `$PATH`:
+   ```
+   npm i -g @anthropic-ai/claude-code
+   ```
+2. **GitHub CLI** authenticated:
+   ```
+   gh auth login         # one-time, as the `tokenscope-review-bot` account
+   ```
+3. **Python 3** (default `python3` is fine — `post_review.py` uses
+   stdlib only).
+4. **Network path** to LiteLLM at `172.16.103.81:4200` (the VM is on
+   wuneng's libvirt bridge, so this works out of the box).
+
+The workflow exports `ANTHROPIC_BASE_URL` / `ANTHROPIC_API_KEY` /
+`ANTHROPIC_MODEL` per-job. LiteLLM rewrites
+`claude-3-5-sonnet-20241022` onto GLM-5.
+
+## Cost / latency
+
+GLM-5 runs on-prem (GPUs 4-7 of wuneng, served by SGLang). No
+per-request cost; the constraint is GPU minutes.
+
+| PR size | Files | Input tokens | Output tokens | Wall clock |
+|---|---|---|---|---|
+| Small | 1–2 | 20–40 K | 3–6 K | 2–3 min |
+| Medium | 5–10 | 60–150 K | 8–15 K | 5–8 min |
+| Large | 30+ | 250–500 K | 15–25 K | 15–25 min |
+
+Concurrency cap: GH Actions `concurrency` is per-PR, but the runner
+itself is single-tenant (one job at a time). Multiple PRs serialize
+naturally. If we hit a "many PRs at once" pattern we can raise the
+runner's job-slot count, but two concurrent reviews is the ceiling
+before we start crowding training jobs on the same box.
+
+## Tuning the prompt
+
+The "Things this repo has been bitten by" section in `prompt.md` is
+the most valuable knob. Every time the agent misses a class of bug
+the human reviewer catches, add a one-line entry. Every time the
+agent flags a non-issue often enough to be annoying, refine or
+remove the corresponding entry.
+
+The prompt is intentionally repo-specific. A vanilla "review this
+diff" prompt produces generic style notes; the value of a per-repo
+agent is encoded prior knowledge about the repo's own historic
+footguns.
+
+## Failure modes
+
+| Failure | What happens | Mitigation |
+|---|---|---|
+| LiteLLM down | Pre-flight curl fails, `run_review.sh` exits 2 | `post_review.py` posts a brief "agent failed" comment with link to workflow log; PR is not blocked |
+| Agent loops | `timeout 1800` kills the agent | Same: failure comment, no block |
+| GLM-5 returns garbage / no `### Summary` | `run_review.sh` appends a warning to the output | `post_review.py` still posts it as COMMENT — the agent's broken output is visible, which is signal |
+| Bot can't `--approve` its own PR | `gh pr review` rc != 0 | `post_review.py` falls back to `gh pr comment` |
+| Schema mirror miss inside the agent | Agent under-reports | Add the missed signature to `prompt.md` § "Things this repo has been bitten by" — encode the lesson |
+
+## Phasing
+
+* **Phase 1 (this PR)**: scaffolding. `workflow_run` trigger gated on
+  CI success. Manual `workflow_dispatch` for re-runs. Test on a few
+  real PRs.
+* **Phase 2**: collect a calibration set of past PRs + human review
+  comments. Tune the prompt to converge with reviewer judgment. Add
+  a nightly self-check workflow that re-runs against a canonical
+  test PR and alerts on schema drift.
+* **Phase 3**: structured inline comments (via the REST review API;
+  `gh pr review` only posts a top-level body) once we trust the line
+  numbers in the agent's output. Today the agent emits `file:line`
+  references in markdown and reviewers click through manually — fine for v1.
diff --git a/scripts/pr-review/allowed_tools.txt b/scripts/pr-review/allowed_tools.txt
@@ -0,0 +1,17 @@
+Read
+Grep
+Glob
+Bash(gh pr diff:*)
+Bash(gh pr view:*)
+Bash(gh pr checks:*)
+Bash(git diff:*)
+Bash(git log:*)
+Bash(git show:*)
+Bash(git blame:*)
+Bash(rg:*)
+Bash(find:*)
+Bash(wc:*)
+Bash(head:*)
+Bash(tail:*)
+Bash(cat:*)
+Bash(ls:*)