diff --git a/.github/workflows/ci-chaos.yml b/.github/workflows/ci-chaos.yml new file mode 100644 index 0000000..5eb83a0 --- /dev/null +++ b/.github/workflows/ci-chaos.yml @@ -0,0 +1,37 @@ +name: Chaos Tests (L6) + +on: + schedule: + # Weekly: Sunday 06:00 UTC + - cron: "0 6 * * 0" + workflow_dispatch: + # Manual trigger from GitHub UI + +jobs: + chaos-tests: + name: Chaos / Destructive Tests + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install package with test dependencies + run: | + pip install -e ".[test]" + pip install pytest pytest-timeout responses PyYAML + + - name: Run chaos tests + run: | + python -m pytest tests/chaos/ -m chaos -v --tb=long --timeout=120 \ + --junit-xml=chaos-results.xml + + - name: Upload test results + if: always() + uses: actions/upload-artifact@v4 + with: + name: chaos-test-results + path: chaos-results.xml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..0dccc57 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,114 @@ +# CI — Simplified Pipeline +# Runs on push/PR to master. All tests run (no -x early exit). + +name: CI + +on: + push: + branches: [main, master] + paths-ignore: + - "**.md" + - "docs/**" + pull_request: + branches: [main, master] + +env: + PYTHONIOENCODING: utf-8 + +jobs: + # --------------------------------------------------------------------------- + # L0 — Version consistency check + # --------------------------------------------------------------------------- + version-check: + name: L0 — Version Sync + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + - name: Install package + run: pip install -e . 
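+      # Presumably verifies that the plugin manifest and the Python package
+      # declare the same version (the match contract the README calls out).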
+ - name: Check version consistency + run: python scripts/check_version_sync.py + + # --------------------------------------------------------------------------- + # L1 — Unit tests (3 OS x 1 Python) + # --------------------------------------------------------------------------- + unit-tests: + name: L1 — Unit Tests (${{ matrix.os }}, py3.11) + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + - name: Install package with test deps + run: pip install -e ".[test]" + - name: Run unit tests + shell: bash + run: | + python -m pytest tests/ \ + --ignore=tests/sandbox \ + --ignore=tests/cli \ + --ignore=tests/e2e \ + --ignore=tests/journey \ + --ignore=tests/chaos \ + --ignore=tests/audit \ + -v --tb=short --timeout=60 + + # --------------------------------------------------------------------------- + # L3 — Plugin tests (Vitest) + # --------------------------------------------------------------------------- + plugin-tests: + name: L3 — Plugin Tests + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: + node-version: 20 + cache: "npm" + cache-dependency-path: paperforge/plugin/package-lock.json + - run: npm ci + working-directory: paperforge/plugin + - run: npx vitest run --reporter=verbose + working-directory: paperforge/plugin + + # --------------------------------------------------------------------------- + # L4 — E2E + Audit + # --------------------------------------------------------------------------- + e2e-tests: + name: L4 — E2E + Audit + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + - name: Install package with test deps + run: pip install -e ".[test]" + - name: Run E2E tests + run: python -m pytest tests/e2e/ -m e2e -v --tb=short --timeout=120 + - name: Run audit tests + run: python -m pytest tests/audit/ -m audit -v --tb=short --timeout=120 + + # --------------------------------------------------------------------------- + # Merge gate + # --------------------------------------------------------------------------- + alls-green: + name: All Checks Passed + if: always() + needs: + - unit-tests + - plugin-tests + runs-on: ubuntu-latest + steps: + - uses: re-actors/alls-green@v1.2.2 + with: + allowed-skips: version-check + jobs: ${{ toJSON(needs) }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..c8aab79 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,42 @@ +# Auto-release Obsidian plugin on tag push +# Triggered by tags like v1.4.18. Runs plugin tests, then creates a GitHub Release +# with the 4 required Obsidian plugin files. 
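+# (The four are main.js, styles.css, manifest.json, versions.json; see the files list below.)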
+ +name: Release + +on: + push: + tags: + - "v*" + +jobs: + release: + name: Release Plugin + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-node@v4 + with: + node-version: 20 + cache: "npm" + cache-dependency-path: paperforge/plugin/package-lock.json + + - name: Run plugin tests + working-directory: paperforge/plugin + run: | + npm ci + npx vitest run --reporter=verbose + + - name: Create Release + uses: softprops/action-gh-release@v2 + with: + name: ${{ github.ref_name }} + generate_release_notes: true + files: | + paperforge/plugin/main.js + paperforge/plugin/styles.css + paperforge/plugin/manifest.json + paperforge/plugin/versions.json diff --git a/.planning/phases/agent-context-review/REVIEW.md b/.planning/phases/agent-context-review/REVIEW.md new file mode 100644 index 0000000..2ac8ac1 --- /dev/null +++ b/.planning/phases/agent-context-review/REVIEW.md @@ -0,0 +1,247 @@ +--- +phase: agent-context-plan-review +reviewed: 2026-05-12T00:00:00Z +depth: deep +files_reviewed: 6 +files_reviewed_list: + - docs/superpowers/plans/2026-05-12-agent-context.md + - docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md + - paperforge/cli.py + - paperforge/commands/__init__.py + - paperforge/core/result.py + - paperforge/memory/schema.py +findings: + critical: 1 + warning: 3 + info: 2 + total: 6 +status: issues_found +--- + +# Phase: agent-context Plan Review + +**Reviewed:** 2026-05-12 +**Depth:** deep (cross-file analysis — spec vs plan vs existing CLI conventions vs schema) +**Files Reviewed:** 6 +**Status:** issues_found + +## Summary + +Reviewed the `agent-context` implementation plan against the Phase 2-5 design spec (Feature 1), existing `cli.py` conventions, the `PFResult` contract, and the `papers` table schema. The plan follows established CLI dispatch patterns and all SQL column names correctly match the schema. However, one **BLOCKER** output-structure violation was found (collections at wrong JSON path), along with several warnings about error handling and spec fidelity. + +--- + +## Critical Issues + +### CR-01: `collections` output at wrong JSON path — spec/plan contract violation + +**File:** `docs/superpowers/plans/2026-05-12-agent-context.md:121-128` and `docs/superpowers/plans/2026-05-12-agent-context.md:239-248` + +**Issue:** The spec defines `collections` as a top-level key under `data`, sibling to `library`: + +```json +// Spec (lines 44-47 of design spec): +"data": { + "library": { ... }, + "collections": [ ... ], // ← top-level under data + "commands": { ... }, + "rules": [ ... ] +} +``` + +But the plan nests `collections` *inside* `library`: + +```python +# Plan: get_agent_context() returns (line 121-128): +return { + "paper_count": total, + "domain_counts": domains, + "lifecycle_counts": lifecycle_counts, + "ocr_counts": ocr_counts, + "deep_reading_counts": deep_counts, + "collections": collections, # ← inside library dict +} + +# Plan: CLI wrapper constructs (line 239-248): +data = { + "library": library, # ← library includes collections + ... +} +# No separate "collections" key at data level! +``` + +Result: `data.library.collections` instead of spec's `data.collections`. Any downstream agent or plugin that follows the spec contract and accesses `data.collections` will get nothing / `undefined`. 
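+
+A minimal consumer-side sketch of the breakage (hypothetical agent code; `cli_output` stands in for the captured stdout of `paperforge agent-context --json`):
+
+```python
+import json
+
+payload = json.loads(cli_output)
+collections = payload["data"].get("collections")  # spec-contract path
+# With the plan's nesting, collections is None here: the list actually
+# sits at payload["data"]["library"]["collections"].
+```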
+ +**Fix:** Either: + +**Option A (move to spec location):** Remove `collections` from `get_agent_context()` return value, and set it at the `data` level in the CLI wrapper: + +```python +# In get_agent_context(), remove "collections": +return { + "paper_count": total, + "domain_counts": domains, + "lifecycle_counts": lifecycle_counts, + "ocr_counts": ocr_counts, + "deep_reading_counts": deep_counts, +} +# In CLI run(), add collections at data level: +data = { + "paperforge": {...}, + "library": library, + "collections": _build_collection_tree_from_conn(vault), # separate call + "commands": COMMANDS, + "rules": RULES, +} +``` + +**Option B (update spec):** If nesting is intentional, update the spec JSON example to show `library.collections` instead of `data.collections`. The non-json output code in the plan (line 264) already reads `lib.get("collections", [])` which matches the nested location. + +--- + +## Warnings + +### WR-01: Blanket `except Exception` silently swallows all query errors + +**File:** `docs/superpowers/plans/2026-05-12-agent-context.md:129-130` + +**Issue:** The `get_agent_context()` function has: + +```python +try: + ... + return {...} +except Exception: + return None +``` + +This catches *everything* — corrupt DB, permission errors, schema mismatch, disk I/O errors — and returns `None`. The caller then reports: + +> "Memory database not found. Run paperforge memory build." + +This message is **wrong** for non-missing-DB failures. A corrupt database or a permission error is not fixed by rebuilding the database. The real exception is lost entirely, making debugging impossible. + +**Fix:** At minimum, log the exception before returning `None`. Better: distinguish between "DB missing" and "DB query failed": + +```python +import logging +logger = logging.getLogger(__name__) + +def get_agent_context(vault: Path) -> dict: + db_path = get_memory_db_path(vault) + if not db_path.exists(): + return None + + conn = get_connection(db_path, read_only=True) + try: + ... + return {...} + except Exception as exc: + logger.exception("Failed to query agent context from %s", db_path) + return None + finally: + conn.close() +``` + +Or propagate the exception upward and let the CLI layer construct a more accurate `PFError` with `ErrorCode.INTERNAL_ERROR` and the actual error message (matching how `search.py` handles exceptions at line 62-66). + +### WR-02: Docstring is misleading about return conditions + +**File:** `docs/superpowers/plans/2026-05-12-agent-context.md:78-81` + +**Issue:** + +```python +def get_agent_context(vault: Path) -> dict: + """Build agent bootstrap context from paperforge.db. + + Returns None if DB is missing. + """ +``` + +The docstring says "Returns None if DB is missing" but the function returns `None` on *any* exception (DB missing, corrupt, permission denied, etc.). Inaccurate docstrings mislead future maintainers. + +**Fix:** Update to: + +```python +"""Build agent bootstrap context from paperforge.db. + +Returns None if the DB file does not exist or a query fails. +""" +``` + +### WR-03: Search command usage string missing `--year-to` flag + +**File:** `docs/superpowers/plans/2026-05-12-agent-context.md:189-192` + +**Issue:** The plan's `COMMANDS` dict lists: + +```python +"search": { + "usage": "paperforge search --json [--collection NAME] [--domain NAME] [--ocr done|pending] [--year-from N] [--limit N]", + ... +} +``` + +But the spec (line 54) and the actual CLI parser (`cli.py:279`) both include `[--year-to N]`. The plan omits it. 
The spec usage string is: + +``` +paperforge search --json [--collection NAME] [--domain NAME] [--ocr done|pending] [--year-from N] [--year-to N] [--limit N] +``` + +An agent that reads the plan's command catalog may not know `--year-to` is available. + +**Fix:** Add `[--year-to N]` to the search usage string in `COMMANDS`: + +```python +"search": { + "usage": "paperforge search --json [--collection NAME] [--domain NAME] [--ocr done|pending] [--year-from N] [--year-to N] [--limit N]", + "purpose": "Full-text search with optional collection/domain/lifecycle filters", +}, +``` + +--- + +## Info + +### IN-01: Minimal test coverage — no integration-level test + +**File:** `docs/superpowers/plans/2026-05-12-agent-context.md:137-147` + +**Issue:** The single test only covers the `None` return when DB is absent: + +```python +def test_get_agent_context_returns_none_when_no_db(): + assert get_agent_context(Path("/nonexistent/vault")) is None +``` + +There are no tests for: +- Successful query of a populated DB +- Empty DB (0 papers) +- Collection tree with multi-level pipe-separated paths +- Collection tree with empty/whitespace-only paths +- Domain/lifecycle/OCR/deep-reading counts + +**Fix:** Consider adding a fixture-based test using an in-memory SQLite DB with sample data. + +### IN-02: Redundant `_COMMAND_REGISTRY` entry + +**File:** `docs/superpowers/plans/2026-05-12-agent-context.md:290-293` and `paperforge/cli.py:431-571` + +**Issue:** The plan adds `agent-context` to `_COMMAND_REGISTRY` *and* adds direct `if args.command == "agent-context"` dispatch in `cli.py`. The existing `cli.py` main() function already uses direct dispatch for most commands (`paper-status`, `search`, `context`, `dashboard`, etc.) and only uses `_COMMAND_REGISTRY` for `memory` subcommand dispatch. Adding to both is harmless but inconsistent — either use the registry or use direct dispatch, not both. + +Since the plan already adds to `_COMMAND_REGISTRY`, you could use it for dispatch instead: + +```python +if args.command == "agent-context": + mod = get_command_module("agent-context") + return mod.run(args) +``` + +Or keep the direct dispatch and skip `_COMMAND_REGISTRY` (matching `search`, `paper-status`, `context` patterns). Either is fine — just pick one. + +--- + +_Reviewed: 2026-05-12_ +_Reviewer: the agent (gsd-code-reviewer)_ +_Depth: deep_ diff --git a/README.en.md b/README.en.md deleted file mode 100644 index 5c7e1ce..0000000 --- a/README.en.md +++ /dev/null @@ -1,294 +0,0 @@ -

- PaperForge banner -

- -# PaperForge - -[![Version](https://img.shields.io/github/v/release/LLLin000/PaperForge?style=for-the-badge&label=version)](https://github.com/LLLin000/PaperForge/releases) -[![Python](https://img.shields.io/pypi/pyversions/paperforge?style=for-the-badge&logo=python&logoColor=white&color=3775A9)](https://python.org) -[![License](https://img.shields.io/badge/license-CC%20BY--NC--SA%204.0-lightgreen?style=for-the-badge)](LICENSE) - -[简体中文](README.md) · **English** - -> **铸知识为器,启洞见之明。 — Forge Knowledge, Empower Insight.** - -PaperForge brings your Zotero library into Obsidian. Sync papers, run OCR, extract figures, and do AI-assisted deep reading — all inside a single vault. - ---- - -## 0. What PaperForge Is - -PaperForge is **not just an Obsidian plugin**. It has two parts: - -| Part | What | Does | Where | -|------|------|------|-------| -| Obsidian Plugin | `main.js` + `manifest.json` + `styles.css` | Dashboard, buttons, settings UI | `.obsidian/plugins/paperforge/` in your vault | -| Python Package | `paperforge` | Sync, OCR, Doctor, repair | Your system Python (`pip install`) | - -The plugin is the **interface**. The Python package is the **engine**. Every button you click in the plugin actually runs a Python command behind the scenes. - -**After installing the plugin, you MUST verify that the Python package is also installed and version-matched.** - ---- - -## 1. Install the Obsidian Plugin - -### Option A: BRAT (Recommended) - -1. Install **BRAT** from the Obsidian community plugin browser -2. Open BRAT settings → `Add Beta Plugin` -3. Enter: `https://github.com/LLLin000/PaperForge` -4. BRAT downloads the latest `main.js`, `manifest.json`, and `styles.css` and installs them -5. Settings → Community Plugins → enable PaperForge - -> BRAT auto-detects GitHub Release updates. No manual downloads needed. - -### Option B: Manual Download - -1. Go to [Releases](https://github.com/LLLin000/PaperForge/releases) -2. Download the three files: `main.js`, `manifest.json`, `styles.css` -3. Create `.obsidian/plugins/paperforge/` in your vault -4. Put the three files there -5. Restart Obsidian → Settings → Community Plugins → enable PaperForge - -> Manual install does not auto-update. You'll need to re-download for each new version. - ---- - -## 2. Install the Python Package - -After enabling the plugin, open the PaperForge settings tab. You'll see a **Runtime Status** section: - -``` -Plugin v1.5.0 → Python Package v1.5.0 ✓ Matched -``` - -- If it says "Not installed" → click **Open Wizard** to re-run the setup process -- If it says "Mismatch" → the Python package auto-updates when the plugin updates. If it didn't succeed, click **Update Runtime** to manually trigger - ---- - -## 3. How Python Interpreter Resolution Works - -PaperForge needs to find a working Python on your system. It searches in this order: - -| Priority | Source | Description | -|----------|--------|-------------| -| 1 | **Manual override** | Settings → `Custom Python Path`, enter the full path (e.g., `C:\Users\you\...\python.exe`). **This is the most reliable method.** | -| 2 | **venv auto-detect** | Scans `.paperforge-test-venv`, `.venv`, `venv` under your vault root | -| 3 | **System auto-detect** | Tries `py -3`, `python`, `python3` in order, verifies with `--version` | -| 4 | **Fallback** | Defaults to `python` if nothing else works | - -> If you have multiple Python installations (e.g., system 3.9 + self-installed 3.11), **strongly recommend setting a manual path** in settings to avoid hitting the wrong one. 
-> -> The **Validate** button in settings immediately tests the resolved interpreter and shows its version. - ---- - -## 4. Setup Wizard — What Each Step Means - -Open the plugin settings panel (`Settings` → `Community plugins` → `PaperForge`) and click the **Open Wizard** button. The wizard walks you through configuration. Here's what every step does. - -### 4.1 Vault Path - -Your Obsidian vault root. Auto-detected, usually no need to change. - -### 4.2 AI Agent Platform - -PaperForge's deep reading features run through an AI Agent. The core mechanism is **trigger phrases**, not registered plugins: you type `/pf-deep ` directly into the Agent chat, and the Agent recognizes the trigger and loads the `literature-qa` Skill automatically. - -The setup wizard deploys Skill files to the correct location: - -| Agent | Skill location | Trigger example | -|-------|---------------|-----------------| -| **OpenCode** | `.opencode/skills/` + `.opencode/command/` | `/pf-deep ` | -| **Claude Code** | `.claude/skills/` | `/pf-deep ` | -| **Cursor** | `.cursor/skills/` | `/pf-deep ` | -| **GitHub Copilot** | `.github/skills/` | `/pf-deep ` | -| **Windsurf** | `.windsurf/skills/` | `/pf-deep ` | -| **Codex** | `.codex/skills/` | `$pf-deep ` | -| **Cline** | `.clinerules/` | `/pf-deep ` | - -> **Key concept**: `/pf-deep` is NOT a plugin you install on the Agent platform — it's a Skill file deployed inside your Vault. Once the setup wizard copies the files into place, the Agent auto-discovers the triggers on startup. You type the trigger phrase just like any other chat input. - -### 4.3 Directory Names - -The wizard asks what to name several directories. These are for organizing files inside your vault. **Defaults work for most users.** - -| Parameter | Default | Purpose | -|-----------|---------|---------| -| `system_dir` | `System` | Root for PaperForge internal data. Contains `exports/` (Zotero JSON exports), `ocr/` (OCR results), `config/`. You rarely need to open this manually. | -| `resources_dir` | `Resources` | Resources root. Your formal literature notes live under this directory, inside `literature_dir`. | -| `literature_dir` | `Literature` | Formal literature notes directory. `paperforge sync` generates frontmatter `.md` notes here. | -| `base_dir` | `Bases` | Obsidian Base view definitions. Dashboard filters ("Pending OCR", "Ready to Read", etc.) are stored here. | - -### 4.4 PaddleOCR API Token - -OCR requires a PaddleOCR API key. Configured in `.env`: - -``` -PADDLEOCR_API_TOKEN=your-api-key -``` - -The wizard guides you through setting this. You can also edit `.env` later. The OCR URL usually stays at the default. - -### 4.5 Zotero Data Directory - -PaperForge creates a junction (Windows) or symlink (macOS/Linux) linking your Zotero data directory into the vault. This is how Obsidian wikilinks resolve to PDF files. - -The wizard auto-detects your Zotero installation. If detection fails, manually enter the path to your Zotero data directory — the folder that contains the `storage/` subdirectory (not the Zotero executable). - -### 4.6 What Happens During Setup - -After confirming your choices, the wizard automatically: -- Creates all needed directory structures -- Deploys Agent command files to the correct locations -- Installs Obsidian plugin files -- Creates the Zotero junction/symlink -- Writes `paperforge.json` and `.env` - -The process is **incremental** — if files already exist in the chosen directories, the wizard only adds what's missing and never deletes existing content. 
- ---- - -## 5. First-Time Setup Checklist - -1. **Version match**: Settings → Runtime Status → confirm plugin and Python package match -2. **Python path**: Settings → Validate button → confirm it's the Python you want -3. **Setup wizard**: Settings → PaperForge → Open Wizard -4. **PaddleOCR key**: Enter your API token in `.env` (wizard guides this) -5. **Export from Zotero**: Right-click your library → `Export...` → format `Better BibTeX JSON` → check `Keep updated` → save to `/PaperForge/exports/` -6. **Run Doctor**: Dashboard → `Run Doctor` → all checks should pass - ---- - -## 6. Daily Use - -### Dashboard (Three-Mode Views) - -`Ctrl+P` → `PaperForge: Open Dashboard` opens the control panel with three views: - -| View | Purpose | -|------|---------| -| **Global** | System homepage: run Sync, OCR, Doctor, and other mechanical operations | -| **Collection** | Batch workspace: browse paper queues by domain, batch tagging | -| **Per-paper** | Reading companion: `do_ocr` / `analyze` toggle checkboxes, discussion record cards | - -> PDF files in the Dashboard automatically switch to Per-paper mode — no manual switching needed. - -### AI Deep Reading & Q&A (Requires Agent) - -Launch your Agent app and type commands into its chat input. **The more specific you are about the paper (Zotero Key, title, DOI), the faster the Agent locates it.** - -| Route | Command | Does | Trigger examples | Prerequisites | -|-------|---------|------|-----------------|--------------| -| Deep Read | `/pf-deep ` | Keshav three-pass deep reading, writes to formal note | `deep read XX`, `walk me through`, `journal club` | OCR done, analyze: true | -| Q&A | `/pf-paper ` | Interactive paper Q&A, OCR not required | `take a look at XX`, `what does this paper say` | Formal note exists | -| Archive | `/pf-end` | Save current `/pf-paper` Q&A session | `save`, `end discussion` | During `/pf-paper` session | - -### `/pf-end` Details - -- `/pf-end` only applies to `/pf-paper` Q&A sessions. Deep reading (`/pf-deep`) writes directly to the formal note and does not need `/pf-end`. -- When executed, two files are created in the paper's workspace: - - `discussion.md` — human-readable Q&A discussion record - - `discussion.json` — structured Q&A data (with timestamps, source tags) -- Dashboard **Per-paper** view automatically displays these as discussion record cards - -> Command prefixes vary by platform (mostly `/`, Codex uses `$`). - ---- - -## 7. Full Workflow - -``` -Add paper to Zotero - ↓ Better BibTeX auto-exports JSON to exports/ -Dashboard → Sync Library - ↓ Generates formal note (in Literature/, with frontmatter metadata) -Set do_ocr: true in the note's frontmatter - ↓ -Dashboard → Run OCR - ↓ PaddleOCR extracts full text + figures → ocr/ directory -Set analyze: true in the note's frontmatter - ↓ -Open Agent → type /pf-deep - ↓ Agent performs three-pass deep reading -## 🔍 Deep Reading section appears in the note - ↓ (for additional Q&A) -Open Agent → type /pf-paper - ↓ Interactive Q&A -Type /pf-end to save the discussion record - ↓ -Dashboard Per-paper view shows discussion cards -``` - ---- - -## 8. 
Troubleshooting - -### Plugin fails to load - -- Confirm `.obsidian/plugins/paperforge/` has `main.js`, `manifest.json`, `styles.css` -- If upgrading via BRAT from an old version: delete the entire `paperforge` plugin folder and let BRAT re-download -- Open Developer Console (`Ctrl+Shift+I`) and check the red errors - -### "Sync Runtime" doesn't update the version - -- The plugin may be calling a different Python than your terminal. Check Settings → Python path -- Try with `--no-cache-dir` to bypass pip cache -- Confirm `https://github.com/LLLin000/PaperForge` is reachable - -### OCR stays pending - -- Confirm `.env` has `PADDLEOCR_API_TOKEN` -- Run `paperforge ocr --diagnose` to check API connectivity -- PDF paths may be broken: run `paperforge repair --fix-paths` - -### No notes generated after sync - -- Is Better BibTeX auto-export configured in Zotero? Are JSON files in `exports/`? -- Run `paperforge doctor` to find which step failed - -### /pf-deep command does nothing - -- Make sure you're running it in your Agent app, not a terminal -- Confirm OCR is done (`ocr_status: done`) -- Confirm `analyze` is set to `true` - ---- - -## 9. Updating - -BRAT auto-detects plugin updates. For the Python package: - -```bash -paperforge update -# or -pip install --upgrade paperforge -``` - ---- - -## 10. Architecture - -``` -paperforge/ -├── core/ Contract layer — PFResult/ErrorCode/state machine -├── adapters/ Adapter layer — BBT parsing, paths, frontmatter I/O -├── services/ Service layer — SyncService orchestration -├── worker/ Worker layer — OCR, status, repair -├── commands/ CLI dispatch -├── setup/ Setup wizard (directories, agent deployment, Zotero linking) -├── plugin/ Obsidian plugin (Dashboard, settings panel) -└── schema/ Field registry -``` - ---- - -## License - -[CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/). Non-commercial use only. - -## Acknowledgments - -Built on [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR), [Obsidian](https://obsidian.md), [Better BibTeX for Zotero](https://retorque.re/zotero-better-bibtex/), and other great open-source projects. diff --git a/README.md b/README.md index 493b86b..19f8edb 100644 --- a/README.md +++ b/README.md @@ -8,98 +8,106 @@ [![Python](https://img.shields.io/pypi/pyversions/paperforge?style=for-the-badge&logo=python&logoColor=white&color=3775A9)](https://python.org) [![License](https://img.shields.io/badge/license-CC%20BY--NC--SA%204.0-lightgreen?style=for-the-badge)](LICENSE) -**简体中文** · [English](README.en.md) +[简体中文](README.zh.md) · **English** > **铸知识为器,启洞见之明。 — Forge Knowledge, Empower Insight.** -PaperForge 让你在 Obsidian 里管理 Zotero 文献。同步、OCR 全文提取、图表解析、AI 精读,全在一个 Vault 里完成。 +PaperForge brings your Zotero library into Obsidian. Sync papers, run OCR, extract figures, and do AI-assisted deep reading — all inside a single vault. --- -## 0. 先理解它是什么 +## 0. What PaperForge Is -PaperForge **不是一个纯 Obsidian 插件**。它有两部分: +PaperForge is **not just an Obsidian plugin**. 
It has two parts: -| 部分 | 是什么 | 干什么 | 装在哪 | -|------|--------|--------|--------| -| Obsidian 插件 | `main.js` + `manifest.json` + `styles.css` | Dashboard、按钮、设置界面 | Vault 的 `.obsidian/plugins/paperforge/` | -| Python 包 | `paperforge` | 同步、OCR、Doctor、修复 | 系统 Python 环境 (`pip install`) | +| Part | What | Does | Where | +|------|------|------|-------| +| Obsidian Plugin | `main.js` + `manifest.json` + `styles.css` | Dashboard, buttons, settings UI | `.obsidian/plugins/paperforge/` in your vault | +| Python Package | `paperforge` | Sync, OCR, Doctor, repair | Your system Python (`pip install`) | -插件是**壳**,Python 包是**引擎**。插件里的按钮点了之后,实际是调用 Python 命令行去干活。 +The plugin is the **interface**. The Python package is the **engine**. Every button you click in the plugin actually runs a Python command behind the scenes. -**所以装完插件之后,必须在设置里确认 Python 包也已安装,并且版本一致。** +**After installing the plugin, you MUST verify that the Python package is also installed and version-matched.** --- -## 1. 安装 Obsidian 插件 +## 1. Install the Obsidian Plugin -### 方式一:BRAT(推荐) +### Option A: Community Plugin Browser (Recommended) -1. 在 Obsidian 社区插件市场搜索安装 **BRAT**(Beta Reviewer's Auto-update Tester) -2. 打开 BRAT 设置 → `Add Beta Plugin` -3. 填入仓库地址:`https://github.com/LLLin000/PaperForge` -4. BRAT 会自动下载最新 Release 的 `main.js`、`manifest.json`、`styles.css` 并安装 -5. 在 Obsidian 设置 → 社区插件 → 启用 PaperForge +1. Open Obsidian → `Settings` → `Community plugins` → `Browse` +2. Search for **PaperForge** +3. Click `Install`, then `Enable` -> BRAT 能自动检测 GitHub Release 更新,不需要手动下载。 +> Community plugins auto-update through Obsidian. No extra steps needed. -### 方式二:手动下载 +### Option B: BRAT -1. 打开 [Releases](https://github.com/LLLin000/PaperForge/releases) 页面 -2. 下载最新版本的三个文件:`main.js`、`manifest.json`、`styles.css` -3. 在 Vault 里创建文件夹 `.obsidian/plugins/paperforge/` -4. 把三个文件放进去 -5. 重启 Obsidian → 设置 → 社区插件 → 启用 PaperForge +If you need beta versions or the plugin hasn't appeared in search yet: -> 手动安装不会自动更新,每次新版本需要重新下载替换。 +1. Install **BRAT** from the Obsidian community plugin browser +2. Open BRAT settings → `Add Beta Plugin` +3. Enter: `https://github.com/LLLin000/PaperForge` +4. Enable PaperForge in Settings → Community Plugins + +### Option C: Manual Download + +1. Go to [Releases](https://github.com/LLLin000/PaperForge/releases) +2. Download the three files: `main.js`, `manifest.json`, `styles.css` +3. Create `.obsidian/plugins/paperforge/` in your vault +4. Put the three files there +5. Restart Obsidian → Settings → Community Plugins → enable PaperForge + +> Manual install does not auto-update. You'll need to re-download for each new version. --- -## 2. 安装 Python 包 +## 2. Install the Python Package -插件装好后,打开 PaperForge 设置页面。你会看到 **运行时状态** 区域: +After enabling the plugin, open the PaperForge settings tab. You'll see a **Runtime Status** section: ``` -插件 v1.5.0 → Python 包 v1.5.0 ✓ 匹配 +Plugin v1.5.0 → Python Package v1.5.0 ✓ Matched ``` -- 如果显示"未安装" → 在设置里确认 Python 解释器路径,然后点击 **验证** 重新检测 -- 如果显示"版本不匹配" → 插件更新时 Python 包会自动同步升级,如果没成功,点 **更新运行时** 手动触发 +- If it says "Not installed" → click **Open Wizard** to re-run the setup process +- If it says "Mismatch" → the Python package auto-updates when the plugin updates. If it didn't succeed, click **Update Runtime** to manually trigger --- -## 3. Python 解释器识别逻辑 +## 3. How Python Interpreter Resolution Works -PaperForge 需要找到你系统里的 Python。它按以下顺序查找,找到就用: +PaperForge needs to find a working Python on your system. 
It searches in this order: -| 优先级 | 来源 | 说明 | -|--------|------|------| -| 1 | **你手动指定** | 设置 → `自定义 Python 路径`,填入完整路径(如 `C:\Users\你的用户名\AppData\Local\Programs\Python\Python311\python.exe`)。**这是最可靠的方式。** | -| 2 | **venv 自动检测** | 自动扫描 Vault 根目录下的 `.paperforge-test-venv`、`.venv`、`venv` 里的 Python | -| 3 | **系统自动检测** | 依次尝试 `py -3`、`python`、`python3`,用 `--version` 验证,挑第一个能用的 | -| 4 | **兜底** | 以上都找不到,回退到 `python` | +| Priority | Source | Description | +|----------|--------|-------------| +| 1 | **Manual override** | Settings → `Custom Python Path`, enter the full path (e.g., `C:\Users\you\...\python.exe`). **This is the most reliable method.** | +| 2 | **venv auto-detect** | Scans `.paperforge-test-venv`, `.venv`, `venv` under your vault root | +| 3 | **System auto-detect** | Tries `py -3`, `python`, `python3` in order, verifies with `--version` | +| 4 | **Fallback** | Defaults to `python` if nothing else works | -> 如果你系统里有多个 Python(比如系统自带的 3.9 + 自己装的 3.11),**强烈建议在设置里手动指定路径**,避免跑错环境。 +> If you have multiple Python installations (e.g., system 3.9 + self-installed 3.11), **strongly recommend setting a manual path** in settings to avoid hitting the wrong one. > -> 设置里的 **验证** 按钮会立即测试当前选中的解释器,显示它能不能用、是什么版本。 +> The **Validate** button in settings immediately tests the resolved interpreter and shows its version. --- -## 4. 配置说明 +## 4. Setup Wizard — What Each Step Means + +Open the plugin settings panel (`Settings` → `Community plugins` → `PaperForge`) and click the **Open Wizard** button. The wizard walks you through configuration. Here's what every step does. -以下参数在**插件设置页面**中配置(设置 → 第三方插件 → PaperForge → 打开安装向导)。首次安装时基础配置已是正确默认值,一般不需要手动改。以下解释供你了解每个参数的作用: +### 4.1 Vault Path -### 4.1 Vault 路径 -你当前打开的 Obsidian Vault 根目录。安装向导自动检测,一般不用改。 +Your Obsidian vault root. Auto-detected, usually no need to change. -### 4.2 AI Agent 平台 +### 4.2 AI Agent Platform -PaperForge 的精读功能通过 AI Agent 执行。核心机制是 **触发词** 而非注册插件:你直接在 Agent 对话里输入 `/pf-deep `,Agent 识别到触发词后自动加载 `literature-qa` Skill 来定位论文并执行精读。 +PaperForge's deep reading features run through an AI Agent. The core mechanism is **trigger phrases**, not registered plugins: you type `/pf-deep ` directly into the Agent chat, and the Agent recognizes the trigger and loads the `literature-qa` Skill automatically. -安装向导会把 Skill 文件部署到对应位置: +The setup wizard deploys Skill files to the correct location: -| Agent | Skill 安装位置 | 触发词示例 | -|-------|---------------|-----------| +| Agent | Skill location | Trigger example | +|-------|---------------|-----------------| | **OpenCode** | `.opencode/skills/` + `.opencode/command/` | `/pf-deep ` | | **Claude Code** | `.claude/skills/` | `/pf-deep ` | | **Cursor** | `.cursor/skills/` | `/pf-deep ` | @@ -108,193 +116,188 @@ PaperForge 的精读功能通过 AI Agent 执行。核心机制是 **触发词** | **Codex** | `.codex/skills/` | `$pf-deep ` | | **Cline** | `.clinerules/` | `/pf-deep ` | -> **关键理解**:`/pf-deep` 不是 Agent 平台的插件,而是部署在 Vault 里的 Skill 文件。安装向导把文件拷过去之后,Agent 启动时自动发现并识别这些触发词。你不需要在 Agent 平台里做任何"安装插件"的操作。 +> **Key concept**: `/pf-deep` is NOT a plugin you install on the Agent platform — it's a Skill file deployed inside your Vault. Once the setup wizard copies the files into place, the Agent auto-discovers the triggers on startup. You type the trigger phrase just like any other chat input. -### 4.3 目录命名 +### 4.3 Directory Names -安装向导会问你几个目录叫什么名字。这些都是给你自己看的,用来组织 Vault 里的文件结构。**大部分情况用默认值就行。** +The wizard asks what to name several directories. These are for organizing files inside your vault. 
**Defaults work for most users.** -| 参数 | 默认值 | 作用 | -|------|--------|------| -| `system_dir` | `System` | PaperForge 内部数据的总目录。下面会有 `exports/`(Zotero 导出的 JSON)、`ocr/`(OCR 结果)、`config/` 等子目录。你一般不需要手动进去看。 | -| `resources_dir` | `Resources` | 资源根目录。你的正式文献笔记就放在这里下面的 `literature_dir` 里。 | -| `literature_dir` | `Literature` | 正式文献笔记的目录。`paperforge sync` 生成的带 frontmatter 的 `.md` 笔记在这里。你日常阅读、编辑笔记都在这个目录。 | -| `base_dir` | `Bases` | Obsidian Base 视图文件目录。Dashboard 里的筛选视图("待 OCR"、"待精读"等)存在这里。 | +| Parameter | Default | Purpose | +|-----------|---------|---------| +| `system_dir` | `System` | Root for PaperForge internal data. Contains `exports/` (Zotero JSON exports), `ocr/` (OCR results), `config/`. You rarely need to open this manually. | +| `resources_dir` | `Resources` | Resources root. Your formal literature notes live under this directory, inside `literature_dir`. | +| `literature_dir` | `Literature` | Formal literature notes directory. `paperforge sync` generates frontmatter `.md` notes here. | +| `base_dir` | `Bases` | Obsidian Base view definitions. Dashboard filters ("Pending OCR", "Ready to Read", etc.) are stored here. | ### 4.4 PaddleOCR API Token -OCR 功能需要 PaddleOCR 的 API。在 `.env` 文件里配置: +OCR requires a PaddleOCR API key. Configured in `.env`: ``` -PADDLEOCR_API_TOKEN=你的API密钥 +PADDLEOCR_API_TOKEN=your-api-key ``` -安装向导会引导你填写,也可以之后手动在 Vault 根目录的 `.env` 文件里加。OCR URL 一般不需要改。 +The wizard guides you through setting this. You can also edit `.env` later. The OCR URL usually stays at the default. -### 4.5 Zotero 数据目录 +### 4.5 Zotero Data Directory -PaperForge 会创建一个 junction(Windows)或 symlink(macOS/Linux),把 Zotero 的数据目录连接到 Vault 里。这样 Obsidian 的 wikilink 才能找到 PDF 文件。 +PaperForge creates a junction (Windows) or symlink (macOS/Linux) linking your Zotero data directory into the vault. This is how Obsidian wikilinks resolve to PDF files. -安装向导会自动检测 Zotero 的安装位置。如果检测失败,你需要手动指定 Zotero 数据目录的路径——也就是包含 `storage` 子目录的那个文件夹(不是 Zotero 程序本身)。 +The wizard auto-detects your Zotero installation. If detection fails, manually enter the path to your Zotero data directory — the folder that contains the `storage/` subdirectory (not the Zotero executable). -### 4.6 安装过程 +### 4.6 What Happens During Setup -确认配置后,安装向导会自动: -- 创建所有需要的目录结构 -- 把 Agent 命令文件部署到对应位置 -- 把 Obsidian 插件文件安装到位 -- 创建 Zotero junction/symlink -- 写入 `paperforge.json` 和 `.env` +After confirming your choices, the wizard automatically: +- Creates all needed directory structures +- Deploys Agent command files to the correct locations +- Installs Obsidian plugin files +- Creates the Zotero junction/symlink +- Writes `paperforge.json` and `.env` -整个过程是**增量的** — 如果你选的目录里已经有文件,安装向导只会补充缺失的,不会删除已有内容。 +The process is **incremental** — if files already exist in the chosen directories, the wizard only adds what's missing and never deletes existing content. --- -## 5. 首次使用 +## 5. First-Time Setup Checklist -1. **确认版本一致**:设置 → 运行时状态 → 确保插件和 Python 包版本一致 -2. **确认 Python 正确**:设置 → 验证按钮,确认连接的是你想要的 Python -3. **配置 PaddleOCR**:在 Vault 根目录 `.env` 里填入 API Token -4. **在 Zotero 里导出文献**:右键要同步的文献库 → `导出...` → 格式选 `Better BibTeX JSON` → 勾选 `Keep updated` → 保存到 `/PaperForge/exports/` -5. **运行 Doctor**:Dashboard → `Run Doctor`,确认所有检查通过 +1. **Version match**: Settings → Runtime Status → confirm plugin and Python package match +2. **Python path**: Settings → Validate button → confirm it's the Python you want +3. **Setup wizard**: Settings → PaperForge → Open Wizard +4. **PaddleOCR key**: Enter your API token in `.env` (wizard guides this) +5. 
**Export from Zotero**: Right-click your library → `Export...` → format `Better BibTeX JSON` → check `Keep updated` → save to `/PaperForge/exports/` +6. **Run Doctor**: Dashboard → `Run Doctor` → all checks should pass --- -## 6. 日常使用 +## 6. Daily Use -### Dashboard(三模式视图) +### Dashboard (Three-Mode Views) -`Ctrl+P` → `PaperForge: Open Dashboard` 打开控制面板,包含三种视图: +`Ctrl+P` → `PaperForge: Open Dashboard` opens the control panel with three views: -| 视图 | 用途 | -|------|------| -| **Global** | 系统首页:运行 Sync、OCR、Doctor 等机械操作 | -| **Collection** | 批量工作台:按领域查看文献队列、批量标记 | -| **Per-paper** | 单篇阅读伴侣:`do_ocr` / `analyze` 切换复选框,讨论记录卡片 | +| View | Purpose | +|------|---------| +| **Global** | System homepage: run Sync, OCR, Doctor, and other mechanical operations | +| **Collection** | Batch workspace: browse paper queues by domain, batch tagging | +| **Per-paper** | Reading companion: `do_ocr` / `analyze` toggle checkboxes, discussion record cards | -> Dashboard 里的 PDF 文件会自动进入 Per-paper 模式,无需手动切换。 +> PDF files in the Dashboard automatically switch to Per-paper mode — no manual switching needed. -### AI 精读与问答(需 Agent) +### AI Deep Reading & Q&A (Requires Agent) -打开 Agent 应用,直接输入触发词即可。Agent 识别到触发词后会自动加载 `literature-qa` Skill,按标准化流程定位论文并执行操作。 +Launch your Agent app and type commands into its chat input. **The more specific you are about the paper (Zotero Key, title, DOI), the faster the Agent locates it.** -**你对文献描述得越具体(Zotero Key、标题、DOI),Agent 定位越快。** +| Route | Command | Does | Trigger examples | Prerequisites | +|-------|---------|------|-----------------|--------------| +| Deep Read | `/pf-deep ` | Keshav three-pass deep reading, writes to formal note | `deep read XX`, `walk me through`, `journal club` | OCR done, analyze: true | +| Q&A | `/pf-paper ` | Interactive paper Q&A, OCR not required | `take a look at XX`, `what does this paper say` | Formal note exists | +| Archive | `/pf-end` | Save current `/pf-paper` Q&A session | `save`, `end discussion` | During `/pf-paper` session | -| 路由 | 触发词 | 做什么 | 前置条件 | -|------|--------|--------|---------| -| 精读 | `/pf-deep ` 或 `精读 ` | Keshav 三阶段组会式精读,结果写入 formal note | OCR 完成、analyze 为 true | -| 问答 | `/pf-paper ` 或 `文献问答 ` | 交互式论文 Q&A,不强制 OCR | 已有正式笔记 | -| 存档 | `/pf-end` 或 `结束讨论` | 保存本次 `/pf-paper` 问答记录 | `/pf-paper` 会话中 | +### `/pf-end` Details -> **两种触发方式等效**:你可以用 Agent 原生命令 `/pf-deep ABC12345`,也可以用自然语言 `精读 ABC12345`。Agent 识别到触发词后会自动加载 `literature-qa` Skill。 +- `/pf-end` only applies to `/pf-paper` Q&A sessions. Deep reading (`/pf-deep`) writes directly to the formal note and does not need `/pf-end`. +- When executed, two files are created in the paper's workspace: + - `discussion.md` — human-readable Q&A discussion record + - `discussion.json` — structured Q&A data (with timestamps, source tags) +- Dashboard **Per-paper** view automatically displays these as discussion record cards -> `/pf-deep` 和 `/pf-paper` **不是终端命令**,也不是 Agent 平台的注册插件。它们是部署在 Vault 里的 Skill 文件的触发词。安装向导把 Skill 文件放到正确位置后,Agent 启动时自动发现。使用方式就是打开 Agent 对话,输入触发词 —— 和你在终端敲 `ls` 一样直接。 - -### `/pf-end` 详解 - -- `/pf-end` 仅对 `/pf-paper` 问答会话生效。精读(`/pf-deep`)的内容直接写入 formal note,不需要 `/pf-end`。 -- 执行后会在论文 workspace 下生成两个文件: - - `discussion.md` — 人类可读的 Q&A 讨论记录 - - `discussion.json` — 结构化 Q&A 数据(含时间戳、来源标记) -- Dashboard 的 **Per-paper** 视图会自动以讨论记录卡片形式展示这些记录 - -> 不同 Agent 的命令前缀可能不同(大部分是 `/`,Codex 是 `$`)。 +> Command prefixes vary by platform (mostly `/`, Codex uses `$`). --- -## 7. 完整工作流 +## 7. 
Full Workflow ``` -Zotero 添加论文 - ↓ Better BibTeX 自动导出 JSON 到 exports/ 目录 +Add paper to Zotero + ↓ Better BibTeX auto-exports JSON to exports/ Dashboard → Sync Library - ↓ 生成正式笔记(Literature/ 目录下,带 frontmatter 元数据) -在笔记 frontmatter 里把 do_ocr 设为 true + ↓ Generates formal note (in Literature/, with frontmatter metadata) +Set do_ocr: true in the note's frontmatter ↓ Dashboard → Run OCR - ↓ PaddleOCR 提取全文 + 图表 → ocr/ 目录 -在笔记 frontmatter 里把 analyze 设为 true + ↓ PaddleOCR extracts full text + figures → ocr/ directory +Set analyze: true in the note's frontmatter ↓ -打开 Agent → 输入 /pf-deep - ↓ Agent 识别触发词 → 加载 literature-qa Skill → 三阶段精读 -笔记里出现 ## 🔍 精读 区域 - ↓(如需额外问答) -打开 Agent → 输入 /pf-paper - ↓ 交互式 Q&A -输入 /pf-end 保存讨论记录 +Open Agent → type /pf-deep + ↓ Agent performs three-pass deep reading +## 🔍 Deep Reading section appears in the note + ↓ (for additional Q&A) +Open Agent → type /pf-paper + ↓ Interactive Q&A +Type /pf-end to save the discussion record ↓ -Dashboard Per-paper 视图展示讨论卡片 +Dashboard Per-paper view shows discussion cards ``` --- -## 8. 常见问题 +## 8. Troubleshooting -### 插件加载失败(Cannot find module) +### Plugin fails to load -- 确认 `.obsidian/plugins/paperforge/` 下有 `main.js`、`manifest.json`、`styles.css` 三个文件 -- 如果 BRAT 从旧版升级后出问题:删除整个 `paperforge` 插件文件夹,让 BRAT 重新下载 -- 打开 Developer Console(`Ctrl+Shift+I`)看红色报错 +- Confirm `.obsidian/plugins/paperforge/` has `main.js`, `manifest.json`, `styles.css` +- If upgrading from an old version: delete the entire `paperforge` plugin folder and reinstall via the community plugin browser +- Open Developer Console (`Ctrl+Shift+I`) and check the red errors -### "同步运行时" 点了还是旧版本 +### "Sync Runtime" doesn't update the version -- 插件调用的 Python 可能和你终端用的是不同环境。检查设置 → Python 解释器路径 -- pip 缓存问题,用 `--no-cache-dir` 重装 -- 确认 `https://github.com/LLLin000/PaperForge` 网络能通 +- The plugin may be calling a different Python than your terminal. Check Settings → Python path +- Try with `--no-cache-dir` to bypass pip cache +- Confirm `https://github.com/LLLin000/PaperForge` is reachable -### OCR 一直 pending +### OCR stays pending -- 确认 `.env` 里有 `PADDLEOCR_API_TOKEN` -- 终端运行 `paperforge ocr --diagnose` 检查 API 连通性 -- PDF 路径可能不对:运行 `paperforge repair --fix-paths` +- Confirm `.env` has `PADDLEOCR_API_TOKEN` +- Run `paperforge ocr --diagnose` to check API connectivity +- PDF paths may be broken: run `paperforge repair --fix-paths` -### 同步后没有生成笔记 +### No notes generated after sync -- Zotero Better BibTeX 是否配置了自动导出?JSON 是否在 `exports/` 目录? -- 运行 `paperforge doctor` 看具体哪一步失败 -- 运行 `paperforge status` 查看系统状态总览 +- Is Better BibTeX auto-export configured in Zotero? Are JSON files in `exports/`? +- Run `paperforge doctor` to find which step failed -### /pf-deep 触发词没反应 +### /pf-deep command does nothing -- 确认你在 **Agent 应用** 里输入,不是在终端 -- 确认安装向导已运行,Skill 文件已部署到正确的 Vault 目录 -- 确认 OCR 已完成(ocr_status: done) -- 确认 analyze 已设为 true +- Make sure you're running it in your Agent app, not a terminal +- Confirm OCR is done (`ocr_status: done`) +- Confirm `analyze` is set to `true` --- -## 9. 更新 +## 9. Updating -BRAT 会自动检测插件更新。Python 包更新: +The Obsidian plugin auto-updates through the community plugin browser. For the Python package: ```bash paperforge update -# 或 +# or pip install --upgrade paperforge ``` +If you installed via BRAT, it also auto-detects GitHub Release updates. + --- -## 10. 架构 +## 10. 
Architecture ``` paperforge/ -├── core/ 契约层 — PFResult/ErrorCode/状态机 -├── adapters/ 适配器层 — BBT 解析、路径、frontmatter -├── services/ 服务层 — SyncService 编排 -├── worker/ 工人层 — OCR、状态、修复 -├── commands/ CLI 分发 -├── setup/ 安装向导(目录创建、Agent 部署、Zotero 链接) -├── plugin/ Obsidian 插件(Dashboard、设置面板) -└── schema/ 字段注册表 +├── core/ Contract layer — PFResult/ErrorCode/state machine +├── adapters/ Adapter layer — BBT parsing, paths, frontmatter I/O +├── services/ Service layer — SyncService orchestration +├── worker/ Worker layer — OCR, status, repair +├── commands/ CLI dispatch +├── setup/ Setup wizard (directories, agent deployment, Zotero linking) +├── plugin/ Obsidian plugin (Dashboard, settings panel) +└── schema/ Field registry ``` --- -## 协议 +## License -[CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/)。仅限非商业使用。 +[CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/). Non-commercial use only. -## 致谢 +## Acknowledgments -基于 [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)、[Obsidian](https://obsidian.md)、[Better BibTeX for Zotero](https://retorque.re/zotero-better-bibtex/) 等开源项目构建。 +Built on [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR), [Obsidian](https://obsidian.md), [Better BibTeX for Zotero](https://retorque.re/zotero-better-bibtex/), and other great open-source projects. diff --git a/README.zh.md b/README.zh.md new file mode 100644 index 0000000..75f9ff3 --- /dev/null +++ b/README.zh.md @@ -0,0 +1,300 @@ +

+ PaperForge banner +

+ +# PaperForge + +[![Version](https://img.shields.io/github/v/release/LLLin000/PaperForge?style=for-the-badge&label=version)](https://github.com/LLLin000/PaperForge/releases) +[![Python](https://img.shields.io/pypi/pyversions/paperforge?style=for-the-badge&logo=python&logoColor=white&color=3775A9)](https://python.org) +[![License](https://img.shields.io/badge/license-CC%20BY--NC--SA%204.0-lightgreen?style=for-the-badge)](LICENSE) + +**简体中文** · [English](README.md) + +> **铸知识为器,启洞见之明。 — Forge Knowledge, Empower Insight.** + +PaperForge 让你在 Obsidian 里管理 Zotero 文献。同步、OCR 全文提取、图表解析、AI 精读,全在一个 Vault 里完成。 + +--- + +## 0. 先理解它是什么 + +PaperForge **不是一个纯 Obsidian 插件**。它有两部分: + +| 部分 | 是什么 | 干什么 | 装在哪 | +|------|--------|--------|--------| +| Obsidian 插件 | `main.js` + `manifest.json` + `styles.css` | Dashboard、按钮、设置界面 | Vault 的 `.obsidian/plugins/paperforge/` | +| Python 包 | `paperforge` | 同步、OCR、Doctor、修复 | 系统 Python 环境 (`pip install`) | + +插件是**壳**,Python 包是**引擎**。插件里的按钮点了之后,实际是调用 Python 命令行去干活。 + +**所以装完插件之后,必须在设置里确认 Python 包也已安装,并且版本一致。** + +--- + +## 1. 安装 Obsidian 插件 + +### 方式一:BRAT(推荐) + +1. 在 Obsidian 社区插件市场搜索安装 **BRAT**(Beta Reviewer's Auto-update Tester) +2. 打开 BRAT 设置 → `Add Beta Plugin` +3. 填入仓库地址:`https://github.com/LLLin000/PaperForge` +4. BRAT 会自动下载最新 Release 的 `main.js`、`manifest.json`、`styles.css` 并安装 +5. 在 Obsidian 设置 → 社区插件 → 启用 PaperForge + +> BRAT 能自动检测 GitHub Release 更新,不需要手动下载。 + +### 方式二:手动下载 + +1. 打开 [Releases](https://github.com/LLLin000/PaperForge/releases) 页面 +2. 下载最新版本的三个文件:`main.js`、`manifest.json`、`styles.css` +3. 在 Vault 里创建文件夹 `.obsidian/plugins/paperforge/` +4. 把三个文件放进去 +5. 重启 Obsidian → 设置 → 社区插件 → 启用 PaperForge + +> 手动安装不会自动更新,每次新版本需要重新下载替换。 + +--- + +## 2. 安装 Python 包 + +插件装好后,打开 PaperForge 设置页面。你会看到 **运行时状态** 区域: + +``` +插件 v1.5.0 → Python 包 v1.5.0 ✓ 匹配 +``` + +- 如果显示"未安装" → 在设置里确认 Python 解释器路径,然后点击 **验证** 重新检测 +- 如果显示"版本不匹配" → 插件更新时 Python 包会自动同步升级,如果没成功,点 **更新运行时** 手动触发 + +--- + +## 3. Python 解释器识别逻辑 + +PaperForge 需要找到你系统里的 Python。它按以下顺序查找,找到就用: + +| 优先级 | 来源 | 说明 | +|--------|------|------| +| 1 | **你手动指定** | 设置 → `自定义 Python 路径`,填入完整路径(如 `C:\Users\你的用户名\AppData\Local\Programs\Python\Python311\python.exe`)。**这是最可靠的方式。** | +| 2 | **venv 自动检测** | 自动扫描 Vault 根目录下的 `.paperforge-test-venv`、`.venv`、`venv` 里的 Python | +| 3 | **系统自动检测** | 依次尝试 `py -3`、`python`、`python3`,用 `--version` 验证,挑第一个能用的 | +| 4 | **兜底** | 以上都找不到,回退到 `python` | + +> 如果你系统里有多个 Python(比如系统自带的 3.9 + 自己装的 3.11),**强烈建议在设置里手动指定路径**,避免跑错环境。 +> +> 设置里的 **验证** 按钮会立即测试当前选中的解释器,显示它能不能用、是什么版本。 + +--- + +## 4. 
配置说明 + +以下参数在**插件设置页面**中配置(设置 → 第三方插件 → PaperForge → 打开安装向导)。首次安装时基础配置已是正确默认值,一般不需要手动改。以下解释供你了解每个参数的作用: + +### 4.1 Vault 路径 +你当前打开的 Obsidian Vault 根目录。安装向导自动检测,一般不用改。 + +### 4.2 AI Agent 平台 + +PaperForge 的精读功能通过 AI Agent 执行。核心机制是 **触发词** 而非注册插件:你直接在 Agent 对话里输入 `/pf-deep `,Agent 识别到触发词后自动加载 `literature-qa` Skill 来定位论文并执行精读。 + +安装向导会把 Skill 文件部署到对应位置: + +| Agent | Skill 安装位置 | 触发词示例 | +|-------|---------------|-----------| +| **OpenCode** | `.opencode/skills/` + `.opencode/command/` | `/pf-deep ` | +| **Claude Code** | `.claude/skills/` | `/pf-deep ` | +| **Cursor** | `.cursor/skills/` | `/pf-deep ` | +| **GitHub Copilot** | `.github/skills/` | `/pf-deep ` | +| **Windsurf** | `.windsurf/skills/` | `/pf-deep ` | +| **Codex** | `.codex/skills/` | `$pf-deep ` | +| **Cline** | `.clinerules/` | `/pf-deep ` | + +> **关键理解**:`/pf-deep` 不是 Agent 平台的插件,而是部署在 Vault 里的 Skill 文件。安装向导把文件拷过去之后,Agent 启动时自动发现并识别这些触发词。你不需要在 Agent 平台里做任何"安装插件"的操作。 + +### 4.3 目录命名 + +安装向导会问你几个目录叫什么名字。这些都是给你自己看的,用来组织 Vault 里的文件结构。**大部分情况用默认值就行。** + +| 参数 | 默认值 | 作用 | +|------|--------|------| +| `system_dir` | `System` | PaperForge 内部数据的总目录。下面会有 `exports/`(Zotero 导出的 JSON)、`ocr/`(OCR 结果)、`config/` 等子目录。你一般不需要手动进去看。 | +| `resources_dir` | `Resources` | 资源根目录。你的正式文献笔记就放在这里下面的 `literature_dir` 里。 | +| `literature_dir` | `Literature` | 正式文献笔记的目录。`paperforge sync` 生成的带 frontmatter 的 `.md` 笔记在这里。你日常阅读、编辑笔记都在这个目录。 | +| `base_dir` | `Bases` | Obsidian Base 视图文件目录。Dashboard 里的筛选视图("待 OCR"、"待精读"等)存在这里。 | + +### 4.4 PaddleOCR API Token + +OCR 功能需要 PaddleOCR 的 API。在 `.env` 文件里配置: + +``` +PADDLEOCR_API_TOKEN=你的API密钥 +``` + +安装向导会引导你填写,也可以之后手动在 Vault 根目录的 `.env` 文件里加。OCR URL 一般不需要改。 + +### 4.5 Zotero 数据目录 + +PaperForge 会创建一个 junction(Windows)或 symlink(macOS/Linux),把 Zotero 的数据目录连接到 Vault 里。这样 Obsidian 的 wikilink 才能找到 PDF 文件。 + +安装向导会自动检测 Zotero 的安装位置。如果检测失败,你需要手动指定 Zotero 数据目录的路径——也就是包含 `storage` 子目录的那个文件夹(不是 Zotero 程序本身)。 + +### 4.6 安装过程 + +确认配置后,安装向导会自动: +- 创建所有需要的目录结构 +- 把 Agent 命令文件部署到对应位置 +- 把 Obsidian 插件文件安装到位 +- 创建 Zotero junction/symlink +- 写入 `paperforge.json` 和 `.env` + +整个过程是**增量的** — 如果你选的目录里已经有文件,安装向导只会补充缺失的,不会删除已有内容。 + +--- + +## 5. 首次使用 + +1. **确认版本一致**:设置 → 运行时状态 → 确保插件和 Python 包版本一致 +2. **确认 Python 正确**:设置 → 验证按钮,确认连接的是你想要的 Python +3. **配置 PaddleOCR**:在 Vault 根目录 `.env` 里填入 API Token +4. **在 Zotero 里导出文献**:右键要同步的文献库 → `导出...` → 格式选 `Better BibTeX JSON` → 勾选 `Keep updated` → 保存到 `/PaperForge/exports/` +5. **运行 Doctor**:Dashboard → `Run Doctor`,确认所有检查通过 + +--- + +## 6. 
日常使用 + +### Dashboard(三模式视图) + +`Ctrl+P` → `PaperForge: Open Dashboard` 打开控制面板,包含三种视图: + +| 视图 | 用途 | +|------|------| +| **Global** | 系统首页:运行 Sync、OCR、Doctor 等机械操作 | +| **Collection** | 批量工作台:按领域查看文献队列、批量标记 | +| **Per-paper** | 单篇阅读伴侣:`do_ocr` / `analyze` 切换复选框,讨论记录卡片 | + +> Dashboard 里的 PDF 文件会自动进入 Per-paper 模式,无需手动切换。 + +### AI 精读与问答(需 Agent) + +打开 Agent 应用,直接输入触发词即可。Agent 识别到触发词后会自动加载 `literature-qa` Skill,按标准化流程定位论文并执行操作。 + +**你对文献描述得越具体(Zotero Key、标题、DOI),Agent 定位越快。** + +| 路由 | 触发词 | 做什么 | 前置条件 | +|------|--------|--------|---------| +| 精读 | `/pf-deep ` 或 `精读 ` | Keshav 三阶段组会式精读,结果写入 formal note | OCR 完成、analyze 为 true | +| 问答 | `/pf-paper ` 或 `文献问答 ` | 交互式论文 Q&A,不强制 OCR | 已有正式笔记 | +| 存档 | `/pf-end` 或 `结束讨论` | 保存本次 `/pf-paper` 问答记录 | `/pf-paper` 会话中 | + +> **两种触发方式等效**:你可以用 Agent 原生命令 `/pf-deep ABC12345`,也可以用自然语言 `精读 ABC12345`。Agent 识别到触发词后会自动加载 `literature-qa` Skill。 + +> `/pf-deep` 和 `/pf-paper` **不是终端命令**,也不是 Agent 平台的注册插件。它们是部署在 Vault 里的 Skill 文件的触发词。安装向导把 Skill 文件放到正确位置后,Agent 启动时自动发现。使用方式就是打开 Agent 对话,输入触发词 —— 和你在终端敲 `ls` 一样直接。 + +### `/pf-end` 详解 + +- `/pf-end` 仅对 `/pf-paper` 问答会话生效。精读(`/pf-deep`)的内容直接写入 formal note,不需要 `/pf-end`。 +- 执行后会在论文 workspace 下生成两个文件: + - `discussion.md` — 人类可读的 Q&A 讨论记录 + - `discussion.json` — 结构化 Q&A 数据(含时间戳、来源标记) +- Dashboard 的 **Per-paper** 视图会自动以讨论记录卡片形式展示这些记录 + +> 不同 Agent 的命令前缀可能不同(大部分是 `/`,Codex 是 `$`)。 + +--- + +## 7. 完整工作流 + +``` +Zotero 添加论文 + ↓ Better BibTeX 自动导出 JSON 到 exports/ 目录 +Dashboard → Sync Library + ↓ 生成正式笔记(Literature/ 目录下,带 frontmatter 元数据) +在笔记 frontmatter 里把 do_ocr 设为 true + ↓ +Dashboard → Run OCR + ↓ PaddleOCR 提取全文 + 图表 → ocr/ 目录 +在笔记 frontmatter 里把 analyze 设为 true + ↓ +打开 Agent → 输入 /pf-deep + ↓ Agent 识别触发词 → 加载 literature-qa Skill → 三阶段精读 +笔记里出现 ## 🔍 精读 区域 + ↓(如需额外问答) +打开 Agent → 输入 /pf-paper + ↓ 交互式 Q&A +输入 /pf-end 保存讨论记录 + ↓ +Dashboard Per-paper 视图展示讨论卡片 +``` + +--- + +## 8. 常见问题 + +### 插件加载失败(Cannot find module) + +- 确认 `.obsidian/plugins/paperforge/` 下有 `main.js`、`manifest.json`、`styles.css` 三个文件 +- 如果 BRAT 从旧版升级后出问题:删除整个 `paperforge` 插件文件夹,让 BRAT 重新下载 +- 打开 Developer Console(`Ctrl+Shift+I`)看红色报错 + +### "同步运行时" 点了还是旧版本 + +- 插件调用的 Python 可能和你终端用的是不同环境。检查设置 → Python 解释器路径 +- pip 缓存问题,用 `--no-cache-dir` 重装 +- 确认 `https://github.com/LLLin000/PaperForge` 网络能通 + +### OCR 一直 pending + +- 确认 `.env` 里有 `PADDLEOCR_API_TOKEN` +- 终端运行 `paperforge ocr --diagnose` 检查 API 连通性 +- PDF 路径可能不对:运行 `paperforge repair --fix-paths` + +### 同步后没有生成笔记 + +- Zotero Better BibTeX 是否配置了自动导出?JSON 是否在 `exports/` 目录? +- 运行 `paperforge doctor` 看具体哪一步失败 +- 运行 `paperforge status` 查看系统状态总览 + +### /pf-deep 触发词没反应 + +- 确认你在 **Agent 应用** 里输入,不是在终端 +- 确认安装向导已运行,Skill 文件已部署到正确的 Vault 目录 +- 确认 OCR 已完成(ocr_status: done) +- 确认 analyze 已设为 true + +--- + +## 9. 更新 + +BRAT 会自动检测插件更新。Python 包更新: + +```bash +paperforge update +# 或 +pip install --upgrade paperforge +``` + +--- + +## 10. 
架构 + +``` +paperforge/ +├── core/ 契约层 — PFResult/ErrorCode/状态机 +├── adapters/ 适配器层 — BBT 解析、路径、frontmatter +├── services/ 服务层 — SyncService 编排 +├── worker/ 工人层 — OCR、状态、修复 +├── commands/ CLI 分发 +├── setup/ 安装向导(目录创建、Agent 部署、Zotero 链接) +├── plugin/ Obsidian 插件(Dashboard、设置面板) +└── schema/ 字段注册表 +``` + +--- + +## 协议 + +[CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/)。仅限非商业使用。 + +## 致谢 + +基于 [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)、[Obsidian](https://obsidian.md)、[Better BibTeX for Zotero](https://retorque.re/zotero-better-bibtex/) 等开源项目构建。 diff --git a/docs/superpowers/plans/2026-05-12-agent-context.md b/docs/superpowers/plans/2026-05-12-agent-context.md new file mode 100644 index 0000000..17f5850 --- /dev/null +++ b/docs/superpowers/plans/2026-05-12-agent-context.md @@ -0,0 +1,329 @@ +# agent-context — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) +> or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax. + +**Goal:** Add `paperforge agent-context --json` command that gives agents a library overview, command catalog, collection map, and behavior rules in one call. + +**Architecture:** New `paperforge/memory/context.py` queries paperforge.db for aggregated stats. CLI wrapper in `paperforge/commands/agent_context.py`. Pure read-only, no file scanning. + +**Tech Stack:** Python stdlib `sqlite3`, existing `paperforge.memory.db`, `paperforge.core.result.PFResult`. + +**Spec:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md` + +**Prerequisites:** Memory Layer Phase 1 + FTS5 already implemented on `feature/memory` branch. + +--- + +## File Structure + +``` +Create: + paperforge/memory/context.py — get_agent_context(vault) -> dict + paperforge/commands/agent_context.py — CLI run(args) -> int + tests/unit/memory/test_context.py — unit tests + +Modify: + paperforge/cli.py — add "agent-context" subparser + dispatch + paperforge/commands/__init__.py — add to _COMMAND_REGISTRY +``` + +--- + +### Task 1: `paperforge/memory/context.py` + +**Files:** +- Create: `paperforge/memory/context.py` +- Create: `tests/unit/memory/test_context.py` + +- [ ] **Step 1: Write `paperforge/memory/context.py`** + +```python +from __future__ import annotations + +from pathlib import Path + +from paperforge.memory.db import get_connection, get_memory_db_path + + +def _build_collection_tree(conn) -> list[dict]: + """Build collection hierarchy from papers.collection_path. + + Each collection_path is pipe-separated, e.g. "骨科 | 骨折". + Returns flat list of top-level collections with sub-collections. + """ + rows = conn.execute( + "SELECT collection_path, COUNT(*) as cnt FROM papers " + "WHERE collection_path != '' " + "GROUP BY collection_path ORDER BY cnt DESC" + ).fetchall() + top: dict[str, dict] = {} + for row in rows: + parts = [p.strip() for p in row["collection_path"].split("|") if p.strip()] + if not parts: + continue + root = parts[0] + if root not in top: + top[root] = {"name": root, "count": 0, "sub": []} + top[root]["count"] += row["cnt"] + if len(parts) > 1: + sub_name = parts[-1] + if sub_name not in top[root]["sub"]: + top[root]["sub"].append(sub_name) + for c in top.values(): + c["sub"] = sorted(c["sub"]) + return sorted(top.values(), key=lambda x: -x["count"]) + + +def get_agent_context(vault: Path) -> dict | None: + """Build agent context from paperforge.db — library stats + collection tree. 
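+
+    The result has two keys: "library" (paper_count plus the domain /
+    lifecycle / OCR / deep-reading count maps) and "collections" (the
+    top-level tree built by _build_collection_tree).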
+
+    Returns None if DB is missing or query fails.
+    """
+    db_path = get_memory_db_path(vault)
+    if not db_path.exists():
+        return None
+
+    conn = get_connection(db_path, read_only=True)
+    try:
+        total = conn.execute("SELECT COUNT(*) FROM papers").fetchone()[0]
+
+        domains = {
+            r["domain"]: r["cnt"]
+            for r in conn.execute(
+                "SELECT domain, COUNT(*) as cnt FROM papers GROUP BY domain ORDER BY cnt DESC"
+            ).fetchall()
+        }
+
+        lifecycle_counts = {
+            r["lifecycle"]: r["cnt"]
+            for r in conn.execute(
+                "SELECT lifecycle, COUNT(*) as cnt FROM papers GROUP BY lifecycle"
+            ).fetchall()
+        }
+
+        ocr_counts = {
+            r["ocr_status"]: r["cnt"]
+            for r in conn.execute(
+                "SELECT ocr_status, COUNT(*) as cnt FROM papers GROUP BY ocr_status"
+            ).fetchall()
+        }
+
+        deep_counts = {
+            r["deep_reading_status"]: r["cnt"]
+            for r in conn.execute(
+                "SELECT deep_reading_status, COUNT(*) as cnt FROM papers GROUP BY deep_reading_status"
+            ).fetchall()
+        }
+
+        collections = _build_collection_tree(conn)
+
+        return {
+            "library": {
+                "paper_count": total,
+                "domain_counts": domains,
+                "lifecycle_counts": lifecycle_counts,
+                "ocr_counts": ocr_counts,
+                "deep_reading_counts": deep_counts,
+            },
+            "collections": collections,
+        }
+    except Exception:
+        return None
+    finally:
+        conn.close()
+```
+
+- [ ] **Step 2: Write `tests/unit/memory/test_context.py`**
+
+```python
+from __future__ import annotations
+
+from pathlib import Path
+
+from paperforge.memory.context import get_agent_context
+
+
+def test_get_agent_context_returns_none_when_no_db():
+    assert get_agent_context(Path("/nonexistent/vault")) is None
+```
+
+- [ ] **Step 3: Run tests**
+
+```bash
+python -m pytest tests/unit/memory/test_context.py -v
+```
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add paperforge/memory/context.py tests/unit/memory/test_context.py
+git commit -m "feat(memory): add agent context query module"
+```
+
+---
+
+### Task 2: `paperforge/commands/agent_context.py`
+
+**Files:**
+- Create: `paperforge/commands/agent_context.py`
+- Modify: `paperforge/cli.py` (add parser + dispatch)
+- Modify: `paperforge/commands/__init__.py` (register)
+
+- [ ] **Step 1: Write `paperforge/commands/agent_context.py`**
+
+```python
+from __future__ import annotations
+
+import argparse
+import sys
+
+from paperforge.core.errors import ErrorCode
+from paperforge.core.result import PFError, PFResult
+from paperforge.memory.context import get_agent_context
+from paperforge import __version__ as PF_VERSION
+
+COMMANDS = {
+    "paper-status": {
+        "usage": "paperforge paper-status <query> --json",
+        "purpose": "Look up one paper's full status and recommended next action",
+    },
+    "search": {
+        "usage": "paperforge search <query> --json [--collection NAME] [--domain NAME] [--ocr done|pending] [--year-from N] [--limit N]",
+        "purpose": "Full-text search with optional collection/domain/lifecycle filters",
+    },
+    "retrieve": {
+        "usage": "paperforge retrieve <query> --json [--limit N]",
+        "purpose": "Search OCR fulltext chunks for evidence paragraphs (coming soon)",
+    },
+    "deep": {
+        "usage": "/pf-deep <zotero_key>",
+        "purpose": "Full three-pass deep reading with chart analysis",
+    },
+    "ocr": {
+        "usage": "/pf-ocr",
+        "purpose": "Run OCR on papers marked do_ocr:true",
+    },
+    "sync": {
+        "usage": "/pf-sync",
+        "purpose": "Sync Zotero and regenerate formal notes + index",
+    },
+}
+
+RULES = [
+    "Use paperforge.db via CLI commands before reading individual files.",
+    "Do not infer paper state from stale frontmatter when memory status is fresh.",
+    "Read source files only after resolving candidates via 
paper-status or search.", + "To locate a paper: start with collection scope if known, then expand to full library search.", +] + + +def run(args: argparse.Namespace) -> int: + vault = args.vault_path + + library = get_agent_context(vault) + if library is None: + result = PFResult( + ok=False, + command="agent-context", + version=PF_VERSION, + error=PFError( + code=ErrorCode.PATH_NOT_FOUND, + message="Memory database not found or query failed. Run paperforge memory build.", + ), + ) + if args.json: + print(result.to_json()) + else: + print(f"Error: {result.error.message}", file=sys.stderr) + return 1 + + data = { + "paperforge": { + "version": PF_VERSION, + "vault": str(vault), + "memory_db": "ready", + }, + "library": library["library"], + "collections": library["collections"], + "commands": COMMANDS, + "rules": RULES, + } + + result = PFResult( + ok=True, + command="agent-context", + version=PF_VERSION, + data=data, + ) + + if args.json: + print(result.to_json()) + else: + lib = data["library"] + print(f"Papers: {lib['paper_count']} total") + print(f"Domains: {lib['domain_counts']}") + print(f"Lifecycle: {lib['lifecycle_counts']}") + for c in data.get("collections", []): + subs = f" ({len(c['sub'])} sub)" if c["sub"] else "" + print(f" [{c['count']:3}] {c['name']}{subs}") + + return 0 if result.ok else 1 +``` + +- [ ] **Step 2: Register CLI parser in `paperforge/cli.py`** + +In `build_parser()`, after the search parser, add: + +```python + p_ac = sub.add_parser("agent-context", help="Generate agent bootstrap context") + p_ac.add_argument("--json", action="store_true", help="Output as JSON") +``` + +In `main()` dispatch, after the search dispatch, add: + +```python + if args.command == "agent-context": + from paperforge.commands.agent_context import run + return run(args) +``` + +- [ ] **Step 3: Update `paperforge/commands/__init__.py`** + +Add to `_COMMAND_REGISTRY`: +```python + "agent-context": "paperforge.commands.agent_context", +``` + +- [ ] **Step 4: Verify** + +```bash +python -m paperforge agent-context --help +python -m pytest tests/unit/ -q --no-header +``` + +- [ ] **Step 5: Commit** + +```bash +git add paperforge/commands/agent_context.py paperforge/cli.py paperforge/commands/__init__.py +git commit -m "feat(cli): add agent-context command for agent bootstrap" +``` + +--- + +### Task 3: Integration test + install + +- [ ] **Step 1: Reinstall + test on test vault** + +```bash +pip install --force-reinstall --no-deps . # from feature/memory +python -m paperforge --vault "D:\L\Med\test1" agent-context --json +``` + +Expected: full PFResult with library overview and collection tree. 
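+
+If you want to sanity-check the envelope from a script, a minimal sketch follows (the vault path is the same sample vault as above; it assumes the `--json` run above succeeds):
+
+```python
+# Hypothetical smoke check: parse the agent-context envelope printed above.
+import json
+import subprocess
+
+out = subprocess.run(
+    ["python", "-m", "paperforge", "--vault", r"D:\L\Med\test1",
+     "agent-context", "--json"],
+    capture_output=True, text=True, check=True,
+).stdout
+envelope = json.loads(out)
+assert envelope["ok"] is True
+# Keys as assembled in run() above.
+data = envelope["data"]
+print(data["library"]["paper_count"], len(data["collections"]))
+```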
+ +- [ ] **Step 2: Verify all existing tests still pass** + +```bash +python -m pytest tests/unit/ -q --no-header +``` diff --git a/docs/superpowers/plans/2026-05-12-memory-layer-REVIEW-v3.md b/docs/superpowers/plans/2026-05-12-memory-layer-REVIEW-v3.md new file mode 100644 index 0000000..5b2cd7b --- /dev/null +++ b/docs/superpowers/plans/2026-05-12-memory-layer-REVIEW-v3.md @@ -0,0 +1,101 @@ +--- +phase: memory-layer-plan-v3-quick-check +reviewed: 2026-05-12T09:25:08Z +depth: standard +files_reviewed: 1 +files_reviewed_list: + - docs/superpowers/plans/2026-05-12-memory-layer.md +findings: + critical: 0 + warning: 1 + info: 0 + total: 1 +status: issues_found +--- + +# Phase: Memory Layer Plan v3 Quick Check + +**Reviewed:** 2026-05-12T09:25:08Z +**Depth:** standard (plan-only, cross-referenced against codebase for `--key` validation) +**Files Reviewed:** 1 +**Status:** ISSUES_FOUND (1 WARNING remaining) + +--- + +## Summary + +Quick final check of the implementation plan after v3 review fixes. All 5 named issues from the prior review are **confirmed fixed**. The plan incorporates all 14 fixes from the original v1 deep review (5 CR + 5 WR + 4 IN). One new WARNING-level issue identified in the `_entry_from_row` function. + +--- + +## Named Issue Verification + +| Issue | Status | Evidence | +|-------|--------|----------| +| **N-BLKR-01**: hash query inside try block | **FIXED** | Lines 713-716 — `stored_hash_row = conn.execute(...)` is inside the `try:` block at line 708 | +| **N-BLKR-02**: NameError on `status` in memory.py | **FIXED** | Line 985 — `if result.ok:` guards access to `status` on line 986. `result` is always assigned in both try/except branches | +| **N-WRN-01**: paper_status empty fields for unresolved | **FIXED** | Line 1055 — `if data.get("resolved"):` guards detailed field printing | +| **N-INFO-01**: private `_compute_hash` renamed to `compute_hash` | **FIXED** | Line 451 — `def compute_hash(...)` (public). Line 683 — `from paperforge.memory.builder import compute_hash` | +| **N-INFO-02**: JSON decode logged with `logging.warning` | **FIXED** | Lines 762-764 — `logging.warning("Corrupted JSON in column %s for paper %s", key, ...)` | + +All 5 named issues from the prior review are resolved in the plan. 
+
+---
+
+## Original v1 Review Issue Verification (bonus)
+
+Cross-checked all 14 issues from `2026-05-12-memory-layer-REVIEW.md`:
+
+| Issue | Status |
+|-------|--------|
+| CR-01: `make_result` import | **FIXED** — line 910 imports only `PFError, PFResult` |
+| CR-02: hash not checked | **FIXED** — lines 713-746 compare stored hash vs computed |
+| CR-03: legacy format crash in builder | **FIXED** — lines 475-480 handle `isinstance(envelope, list)` |
+| CR-04: legacy format crash in query | **FIXED** — lines 725-733 handle bare list |
+| CR-05: Windows-path URI bug | **FIXED** — line 122 uses `db_path.as_posix()` |
+| WR-01: `--force` flag | **FIXED** — removed from CLI parser (lines 1080-1082) |
+| WR-02: ambiguous query returns full status | **FIXED** — lines 823-838 return candidates only when >1 |
+| WR-03: recommended_action missing | **FIXED** — lines 846-855 compute concrete action strings |
+| WR-04: zero test coverage | **REMAINS** — plan still has only 4 schema + 3 hash tests |
+| WR-05: CLI dispatch pattern | **FIXED** — lines 1089-1099 use simple dispatch |
+| IN-01: unused compute_health import | **FIXED** — removed from builder imports (lines 417-422) |
+| IN-02: _COMMAND_REGISTRY not consumed | **REMAINS** — still present, but INFO severity only |
+| IN-03: compute_hash .get vs direct | **FIXED** — line 452 uses `e["zotero_key"]` (direct access) |
+| IN-04: fragile rstrip("_json") | **FIXED** — line 760 uses `key[:-5]` instead of `rstrip` |
+
+---
+
+## Warnings
+
+### WR-V3-01: Data silently lost when JSON decode fails in `_entry_from_row`
+
+**File:** `docs/superpowers/plans/2026-05-12-memory-layer.md:759-760`
+**Issue:** When `json.loads()` raises `JSONDecodeError`, `entry.pop(key)` has already executed — the original `_json` column value is removed from the result dict and never restored. The field disappears silently from query output.
+
+```python
+# Current (plan line 759-760)
+try:
+    entry[key[:-5]] = json.loads(entry.pop(key))  # pop() happens BEFORE json.loads()
+except json.JSONDecodeError:
+    logging.warning(...) 
# original value already lost +``` + +**Fix:** +```python +# Pop first, then try to decode, restore on failure +raw = entry.pop(key) +try: + entry[key[:-5]] = json.loads(raw) +except json.JSONDecodeError: + entry[key] = raw # keep original JSON string visible + logging.warning( + "Corrupted JSON in column %s for paper %s", + key, entry.get("zotero_key", "?"), + ) +``` + +--- + +_Reviewed: 2026-05-12T09:25:08Z_ +_Reviewer: VT-OS/OPENCODE (gsd-code-reviewer)_ +_Depth: standard_ diff --git a/docs/superpowers/plans/2026-05-12-memory-layer-REVIEW.md b/docs/superpowers/plans/2026-05-12-memory-layer-REVIEW.md new file mode 100644 index 0000000..39c6269 --- /dev/null +++ b/docs/superpowers/plans/2026-05-12-memory-layer-REVIEW.md @@ -0,0 +1,377 @@ +--- +phase: memory-layer-plan-review +reviewed: 2026-05-12T18:30:00Z +depth: deep +files_reviewed: 9 +files_reviewed_list: + - docs/superpowers/plans/2026-05-12-memory-layer.md + - docs/superpowers/specs/2026-05-12-memory-layer-design.md + - paperforge/config.py + - paperforge/cli.py + - paperforge/commands/__init__.py + - paperforge/core/result.py + - paperforge/core/errors.py + - paperforge/worker/asset_state.py + - paperforge/worker/asset_index.py +findings: + critical: 5 + warning: 5 + info: 4 + total: 14 +status: issues_found +--- + +# Phase: Memory Layer Plan Review + +**Reviewed:** 2026-05-12T18:30:00Z +**Depth:** deep (cross-file analysis with import graph tracing) +**Files Reviewed:** 9 +**Status:** ISSUES_FOUND + +## Verdict: ISSUES_FOUND + +5 BLOCKER, 5 WARNING, 4 INFO issues detected. Plan must not be executed until BLOCKER items are resolved. + +--- + +## Summary + +The plan maps spec requirements to tasks with reasonable granularity, and the overall architecture (SQLite under `paperforge/memory/`, derived from `formal-library.json`, PFResult-enveloped CLI) is sound. However, the cross-file trace against the actual codebase reveals **five BLOCKER defects** — a non-existent import, a missing spec-critical hash check, two crash-on-legacy-format scenarios, and a Windows-path URI bug. Five WARNING-level issues include an unimplemented `--force` flag, a behavioral divergence from spec for ambiguous queries, a missing `recommended_action` field, near-zero test coverage for business logic, and an inconsistent CLI dispatch pattern. + +--- + +## Critical Issues + +### CR-01: Import of non-existent `make_result` in `memory.py` + +**File:** Plan Task 6, Step 1 (`paperforge/commands/memory.py` line 4) +**Issue:** The plan code imports `make_result` from `paperforge.core.result`: +```python +from paperforge.core.result import PFError, PFResult, make_result +``` +`make_result` is **not defined anywhere** in the codebase. Verified by grep of the entire `paperforge/` tree — zero matches. `core/result.py` (lines 1-79) exports only `PFError` and `PFResult`. This would cause `ImportError` at runtime on every invocation of `paperforge memory`. + +**Fix:** +```python +# Remove make_result from the import line — it's never used in the function body either. 
+from paperforge.core.result import PFError, PFResult +``` + +--- + +### CR-02: `get_memory_status` does not check `canonical_index_hash` + +**File:** Plan Task 5, Step 1 (`paperforge/memory/query.py`, `get_memory_status()`) +**Issue:** The spec (Design Spec lines 221-226) explicitly requires `memory status` to verify `canonical_index_hash` against the SHA-256 of the current `formal-library.json`: +> - `canonical_index_hash` matches computed hash of current `formal-library.json` → `fresh: bool` + +The plan's implementation (lines 678-717) computes `fresh` as only: +```python +result["fresh"] = result["schema_ok"] and result["count_match"] +``` +The `canonical_index_hash` stored in `meta` during build is never read back and never compared. The status command will report `fresh: true` even when the canonical index has changed since the last build — giving a falsely green "fresh" signal that causes stale paper-status results. + +**Fix:** In `get_memory_status()` after the read-only connection is opened, add: +```python +# Read stored hash from meta +stored_hash_row = conn.execute( + "SELECT value FROM meta WHERE key = 'canonical_index_hash'" +).fetchone() +stored_hash = stored_hash_row["value"] if stored_hash_row else "" + +# Recompute hash from current index +envelope = read_index(vault) +items = envelope.get("items", []) if isinstance(envelope, dict) else [] +from paperforge.memory.builder import _compute_hash +current_hash = _compute_hash(items) if items else "" + +result["hash_match"] = stored_hash == current_hash +result["fresh"] = result["schema_ok"] and result["count_match"] and result["hash_match"] +``` + +--- + +### CR-03: `build_from_index` crashes on legacy-format (bare list) index + +**File:** Plan Task 4, Step 1 (`paperforge/memory/builder.py`, line 467-471) +**Issue:** `read_index(vault)` in `asset_index.py` (line 160-176) can return a **bare list** (legacy pre-v1.6 format). The `build_from_index` function only checks for `None`: +```python +envelope = read_index(vault) +if envelope is None: + raise FileNotFoundError(...) +items = envelope.get("items", []) # <-- CRASH: list has no .get() +``` +If the vault has a legacy-format `formal-library.json` (not yet migrated by a sync run), `envelope` is a `list`, and `envelope.get(...)` raises `AttributeError`. The existing codebase has `is_legacy_format()` and `migrate_legacy_index()` in `asset_index.py` (lines 178-212) specifically for this case. + +**Fix:** Add legacy format detection after the `None` check: +```python +envelope = read_index(vault) +if envelope is None: + raise FileNotFoundError( + "Canonical index not found. Run paperforge sync --rebuild-index." + ) +from paperforge.worker.asset_index import is_legacy_format +if is_legacy_format(envelope): + raise FileNotFoundError( + "Canonical index is in legacy (bare-list) format. " + "Run paperforge sync --rebuild-index to migrate." + ) +items = envelope.get("items", []) +generated_at = envelope.get("generated_at", "") +``` + +--- + +### CR-04: `get_memory_status` crashes on legacy-format index + +**File:** Plan Task 5, Step 1 (`paperforge/memory/query.py`, line 708-713) +**Issue:** Same legacy-format crash as CR-03, but in the read path: +```python +envelope = read_index(vault) +if envelope: + result["paper_count_index"] = envelope.get("paper_count", 0) # CRASH on list +``` +A bare-list envelope causes `AttributeError`. 
+ +**Fix:** Add the same `is_legacy_format` guard: +```python +envelope = read_index(vault) +if envelope and isinstance(envelope, dict): + result["paper_count_index"] = envelope.get("paper_count", 0) + ... +``` + +--- + +### CR-05: Windows-path URI incompatibility in `get_connection` read-only mode + +**File:** Plan Task 2, Step 2 (`paperforge/memory/db.py`, line 122-123) +**Issue:** +```python +uri = f"file:{db_path}?mode=ro" if read_only else str(db_path) +conn = sqlite3.connect(uri, uri=read_only) +``` +On Windows, `db_path` contains backslashes (e.g., `D:\Vault\System\PaperForge\indexes\paperforge.db`). The constructed URI `file:D:\Vault\...?mode=ro` is NOT a valid [RFC 8089 file URI](https://datatracker.ietf.org/doc/html/rfc8089). SQLite's URI parser requires either `file:///D:/...` (authority path) or `file:D:/...` (local path with forward slashes). With backslashes, `sqlite3.connect(..., uri=True)` may fail with `sqlite3.OperationalError: unable to open database file` or silently misinterpret the path. + +**Fix:** Normalize the path to use forward slashes before constructing the URI: +```python +def get_connection(db_path: Path, read_only: bool = False) -> sqlite3.Connection: + if read_only: + # Windows-safe: convert to forward slashes for SQLite URI parser + posix_path = str(db_path.resolve()).replace("\\", "/") + uri = f"file:{posix_path}?mode=ro" + else: + uri = str(db_path) + conn = sqlite3.connect(uri, uri=read_only) + conn.row_factory = sqlite3.Row + if not read_only: + conn.execute("PRAGMA journal_mode=WAL;") + conn.execute("PRAGMA foreign_keys=ON;") + return conn +``` + +--- + +## Warnings + +### WR-01: `--force` flag on `memory build` defined but never implemented + +**File:** Plan Task 6, Step 3 (cli.py parser) + Task 4 builder +**Issue:** The CLI parser adds `--force` to `memory build`: +```python +p_memory_build.add_argument("--force", action="store_true", help="Force rebuild") +``` +Neither `memory.run()` nor `build_from_index()` checks `args.force`. The builder always deletes all paper data and rebuilds (lines 486-488), making `--force` redundant for the current logic. However, a future optimization that caches unchanged entries would make `--force` meaningful. Either implement the flag or remove it — dead CLI interfaces degrade user experience and create maintenance debt. + +**Fix:** Either (a) remove the `--force` argument entirely from the parser, or (b) wire it through: +```python +# In builder.py: add force parameter +def build_from_index(vault: Path, force: bool = False) -> dict: + ... + if force: + drop_all_tables(conn) + ... +# In memory.py: +counts = build_from_index(vault, force=getattr(args, "force", False)) +``` + +--- + +### WR-02: Ambiguous query (>1 results) returns full status instead of candidate list only + +**File:** Plan Task 5 (`paperforge/memory/query.py`, `get_paper_status()`, lines 775-793) +**Issue:** The spec (Design Spec line 243) states: +> **>1 results:** Candidate list only (no full status details) + +The plan returns full status for the first match PLUS the candidate list: +```python +entry = entries[0] +assets = get_paper_assets(conn, entry["zotero_key"]) +entry["health"] = compute_health(entry) # Full status details computed +entry["candidates"] = entries if len(entries) > 1 else None +entry["assets"] = assets +return entry +``` +And the CLI output (paper_status.py lines 986-991) always prints title/year/lifecycle/next_step — even when multiple candidates exist. 
This violates the spec's "candidate list only" requirement for ambiguous queries. + +**Fix:** When `len(entries) > 1`, return candidate summary only: +```python +if len(entries) > 1: + return { + "candidates": [ + { + "zotero_key": e.get("zotero_key", ""), + "title": e.get("title", ""), + "year": e.get("year", ""), + "doi": e.get("doi", ""), + "domain": e.get("domain", ""), + } + for e in entries + ], + "candidate_count": len(entries), + } +``` + +--- + +### WR-03: `recommended_action` field missing from paper-status output + +**File:** Plan Task 5 (`paperforge/memory/query.py`, `get_paper_status()`) + Task 6 (`paper_status.py`) +**Issue:** The spec (Design Spec lines 252-253) requires: +> `recommended_action`: e.g., `"/pf-deep ABCDEFG"` or `"paperforge sync"` or `"paperforge ocr"` + +The plan only returns `entry["next_step"]` (e.g., `"/pf-deep"`) in the output but never computes a concrete `recommended_action` like `"/pf-deep ABCDEFG"`. The spec implies this should be a ready-to-use command string with the paper key substituted in. + +**Fix:** In `get_paper_status()`, after computing health, add: +```python +step = entry.get("next_step", "") +zkey = entry.get("zotero_key", "") +action_map = { + "/pf-deep": f"/pf-deep {zkey}", + "ocr": f"paperforge ocr --key {zkey}", + "sync": "paperforge sync", + "repair": "paperforge repair", + "ready": "Ready — no action needed", +} +entry["recommended_action"] = action_map.get(step, step) +``` + +--- + +### WR-04: Core business logic functions have zero test coverage + +**File:** Plan Tasks 4-5 (test files) +**Issue:** The plan specifies 8 tests total: +- 4 schema tests (table creation/deletion/schema version) — good +- 3 builder tests — but ALL three test only `_compute_hash`, a 10-line helper. `build_from_index()` (~150 lines) has **zero tests**. +- 1 query test — only tests `get_memory_status()` with a nonexistent vault path. `lookup_paper()`, `get_paper_assets()`, `get_paper_status()`, and `_entry_from_row()` have **zero tests**. + +Untested edge cases include: empty items list, schema version mismatch trigger, corrupt JSON in authors/collections, exact zotero_key lookup, DOI lookup, title substring search, no-results path, asset reconstruction with None values. + +**Fix:** Add at minimum: +- `test_build_from_index_empty_items()` — ensure handles empty index gracefully +- `test_build_from_index_schema_mismatch()` — verify drop+rebuild on version change +- `test_build_from_index_populates_correctly()` — build from a mock envelope, verify paper count/asset count +- `test_lookup_paper_by_key()` — exact zotero_key match +- `test_lookup_paper_by_doi()` — DOI lookup +- `test_lookup_paper_by_title_substring()` — LIKE match +- `test_lookup_paper_no_results()` — returns empty list +- `test_get_paper_status_returns_none_for_missing()` — paper not found +- `test_entry_from_row_handles_null_fields()` — None values don't crash + +--- + +### WR-05: CLI dispatch pattern inconsistent with existing codebase + +**File:** Plan Task 6, Step 3 (cli.py dispatch blocks) +**Issue:** The plan adds verbose-index carving logic in the dispatch blocks: +```python +if args.command == "memory": + argv = sys.argv.copy() + try: + idx = argv.index("memory") + args.verbose = "--verbose" in argv[idx:] or "-v" in argv[idx:] + except ValueError: + pass + from paperforge.commands import memory + return memory.run(args) +``` +No other command dispatch in `cli.py` (lines 407-533) uses this pattern. All 15 existing command dispatches simply import and call `run(args)`. 
The `--verbose` flag is already a top-level argument parsed by argparse (cli.py lines 132-136), and `configure_logging(verbose=...)` is called at line 402 BEFORE any dispatch. This carving code is redundant and adds 14 lines of unnecessary complexity per command. + +**Fix:** Follow the existing pattern — just import and dispatch: +```python +if args.command == "memory": + from paperforge.commands import memory + return memory.run(args) + +if args.command == "paper-status": + from paperforge.commands import paper_status + return paper_status.run(args) +``` + +--- + +## Info + +### IN-01: Unused `compute_health` import in `builder.py` + +**File:** Plan Task 4, Step 1 (`paperforge/memory/builder.py`, line 414) +**Issue:** `compute_health` is imported but never called in `build_from_index()`. Per the spec (line 141), health dimensions are computed at query time only, so this import is conceptually correct to exclude. The dead import is harmless but clutters the import block. + +**Fix:** Remove `compute_health` from the builder import: +```python +from paperforge.worker.asset_state import ( + compute_lifecycle, + compute_maturity, + compute_next_step, +) +``` + +--- + +### IN-02: `_COMMAND_REGISTRY` entries not consumed by `cli.py` dispatch + +**File:** Plan Task 6, Step 4 (`paperforge/commands/__init__.py`) +**Issue:** The plan adds `"memory"` and `"paper-status"` to `_COMMAND_REGISTRY`, which powers `get_command_module()` for dynamic dispatch. However, `cli.py` uses hard-coded `if/elif` chains (not `get_command_module()`), so these registry entries are unused by the primary dispatch path. The entries are only consumed if some other code path calls `get_command_module("memory")`. + +**Fix:** Not critical for Phase 1, but either (a) use `get_command_module()` in cli.py dispatch to reduce duplication, or (b) document that the registry exists for future dynamic-dispatch migration. + +--- + +### IN-03: `_compute_hash` uses `.get()` instead of direct key access per spec + +**File:** Plan Task 4, Step 1 (`paperforge/memory/builder.py`, line 448-449) +**Issue:** The spec (line 202) explicitly says: +> `sorted(items, key=lambda e: e["zotero_key"])` + +The plan uses `e.get("zotero_key", "")` — a safe-access variant. This is arguably more robust (it won't crash on malformed entries), but the spec's direct-access was an intentional design choice to fail-loud on corrupt data rather than silently producing a different hash. Decide which contract you want. + +**Fix:** Either align with spec (remove `.get()` for loud failure) or update the spec to accept safe access. + +--- + +### IN-04: `_entry_from_row` uses fragile `.rstrip("_json")` + +**File:** Plan Task 5, Step 1 (`paperforge/memory/query.py`, line 729) +**Issue:** +```python +entry[key.rstrip("_json")] = json.loads(entry.pop(key)) +``` +`rstrip("_json")` removes any trailing characters in the set `{'_', 'j', 's', 'o', 'n'}`, not the literal substring `"_json"`. For `"authors_json"` this produces `"authors"` (correct), and for `"collections_json"` it produces `"collections"` (correct). But if future columns with names like `"version_json"` or `"annotation_json"` were added, this would produce `"versi"` or `"annotati"` — silently wrong. The fix is trivial and prevents future bugs. 
+ +**Fix:** +```python +for key in ("authors_json", "collections_json"): + if key in entry and entry[key]: + try: + clean_key = key[:-5] # strip "_json" suffix (exactly 5 chars) + entry[clean_key] = json.loads(entry.pop(key)) + except json.JSONDecodeError: + pass +``` + +--- + +_Reviewed: 2026-05-12T18:30:00Z_ +_Reviewer: VT-OS/OPENCODE (gsd-code-reviewer)_ +_Depth: deep_ diff --git a/docs/superpowers/plans/2026-05-12-memory-layer.md b/docs/superpowers/plans/2026-05-12-memory-layer.md new file mode 100644 index 0000000..aa03cf5 --- /dev/null +++ b/docs/superpowers/plans/2026-05-12-memory-layer.md @@ -0,0 +1,1235 @@ +# Memory Layer Phase 1 — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) +> or superpowers:executing-plans to implement this plan task-by-task. +> Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add a SQLite-backed Memory Layer with `memory build`, `memory status`, and `paper-status` commands. + +**Architecture:** New `paperforge/memory/` package with connection, schema, builder, and query modules. +Commands follow the existing CLI pattern (parser registration + `commands/` module dispatch + PFResult envelope). + +**Tech Stack:** Python stdlib `sqlite3`, `hashlib`, existing `paperforge.core.result`, `paperforge.worker.asset_index`, `paperforge.worker.asset_state`. + +**Spec:** `docs/superpowers/specs/2026-05-12-memory-layer-design.md` + +--- + +## File Structure Map + +``` +Create: + paperforge/memory/__init__.py — package init, re-export key types + paperforge/memory/db.py — get_connection(), get_memory_db_path() + paperforge/memory/schema.py — CURRENT_SCHEMA_VERSION, CREATE TABLE SQL, drop/create tables + paperforge/memory/builder.py — build_from_index() — reads formal-library.json, populates SQLite + paperforge/memory/query.py — lookup_paper(), get_paper_status(), get_memory_status() + paperforge/commands/memory.py — CLI run() for "memory build" and "memory status" + paperforge/commands/paper_status.py — CLI run() for "paper-status" + + tests/unit/memory/__init__.py + tests/unit/memory/test_schema.py + tests/unit/memory/test_builder.py + tests/unit/memory/test_query.py + +Modify: + paperforge/config.py:330-339 — add "memory_db" path key + paperforge/cli.py:258-259 — register "memory" and "paper-status" subcommands + paperforge/commands/__init__.py:4-13 — add to _COMMAND_REGISTRY +``` + +--- + +### Task 1: Register `memory_db` path in config + +**Files:** +- Modify: `paperforge/config.py:330-339` + +- [ ] **Step 1: Add `memory_db` key to `paperforge_paths()` return dict** + +```python +# At paperforge/config.py, after line 338 ("index": ...): +"memory_db": paperforge / "indexes" / "paperforge.db", +``` + +- [ ] **Step 2: Verify** + +```bash +python -c "from paperforge.config import paperforge_paths; p=paperforge_paths(); print(p.get('memory_db'), p.get('index'))" +``` + +Expected: both paths point under `.../PaperForge/indexes/`. 
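+
+Optionally, pin the invariant down with a quick unit test (hypothetical file name, not part of this plan's test list; assumes `paperforge_paths()` can be called with no arguments, as in the snippet above):
+
+```python
+# tests/unit/test_config_paths.py: hypothetical sanity test for the new key.
+from paperforge.config import paperforge_paths
+
+
+def test_memory_db_sits_next_to_canonical_index():
+    paths = paperforge_paths()
+    assert paths["memory_db"].name == "paperforge.db"
+    # Spec: paperforge.db lives in the same indexes/ dir as formal-library.json.
+    assert paths["memory_db"].parent == paths["index"].parent
+```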
+ +- [ ] **Step 3: Commit** + +```bash +git add paperforge/config.py +git commit -m "feat(config): add memory_db path key for Memory Layer" +``` + +--- + +### Task 2: `paperforge/memory/__init__.py` and `db.py` + +**Files:** +- Create: `paperforge/memory/__init__.py` +- Create: `paperforge/memory/db.py` +- Test: `tests/unit/memory/test_schema.py` (write later) + +- [ ] **Step 1: Write `__init__.py`** + +```python +from __future__ import annotations + +from paperforge.memory.db import get_connection, get_memory_db_path +from paperforge.memory.schema import ensure_schema, drop_all_tables + +__all__ = [ + "get_connection", + "get_memory_db_path", + "ensure_schema", + "drop_all_tables", +] +``` + +- [ ] **Step 2: Write `db.py`** + +```python +from __future__ import annotations + +import sqlite3 +from pathlib import Path + +from paperforge.config import paperforge_paths + + +def get_memory_db_path(vault: Path) -> Path: + """Return the absolute path to paperforge.db.""" + paths = paperforge_paths(vault) + db_path = paths.get("memory_db") + if not db_path: + raise FileNotFoundError("memory_db path not configured") + return db_path + + +def get_connection(db_path: Path, read_only: bool = False) -> sqlite3.Connection: + """Open a SQLite connection to paperforge.db with WAL mode. + + Args: + db_path: Path to paperforge.db. + read_only: If True, open in read-only mode (for queries). + """ + if read_only: + uri = "file:" + db_path.as_posix() + "?mode=ro" + conn = sqlite3.connect(uri, uri=True) + else: + db_path.parent.mkdir(parents=True, exist_ok=True) + conn = sqlite3.connect(str(db_path)) + conn.row_factory = sqlite3.Row + if not read_only: + conn.execute("PRAGMA journal_mode=WAL;") + conn.execute("PRAGMA foreign_keys=ON;") + return conn +``` + +- [ ] **Step 3: Run a manual import check** + +```bash +python -c "from paperforge.memory import get_connection, get_memory_db_path; print('OK')" +``` + +- [ ] **Step 4: Commit** + +```bash +git add paperforge/memory/__init__.py paperforge/memory/db.py +git commit -m "feat(memory): add db.py with connection and path resolution" +``` + +--- + +### Task 3: `paperforge/memory/schema.py` + +**Files:** +- Create: `paperforge/memory/schema.py` +- Create: `tests/unit/memory/__init__.py` +- Create: `tests/unit/memory/test_schema.py` + +- [ ] **Step 1: Write `schema.py` with SQL definitions** + +```python +from __future__ import annotations + +import sqlite3 + +CURRENT_SCHEMA_VERSION = 1 + +CREATE_META = """ +CREATE TABLE IF NOT EXISTS meta ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL +); +""" + +CREATE_PAPERS = """ +CREATE TABLE IF NOT EXISTS papers ( + zotero_key TEXT PRIMARY KEY, + citation_key TEXT NOT NULL DEFAULT '', + title TEXT NOT NULL, + year TEXT, + doi TEXT, + pmid TEXT, + journal TEXT, + first_author TEXT, + authors_json TEXT, + abstract TEXT, + domain TEXT, + collection_path TEXT, + collections_json TEXT, + has_pdf INTEGER NOT NULL DEFAULT 0, + do_ocr INTEGER, + analyze INTEGER, + ocr_status TEXT, + deep_reading_status TEXT, + ocr_job_id TEXT, + impact_factor REAL, + lifecycle TEXT, + maturity_level INTEGER, + maturity_name TEXT, + next_step TEXT, + pdf_path TEXT, + note_path TEXT, + main_note_path TEXT, + paper_root TEXT, + fulltext_path TEXT, + ocr_md_path TEXT, + ocr_json_path TEXT, + ai_path TEXT, + deep_reading_md_path TEXT, + updated_at TEXT +); +""" + +CREATE_ASSETS = """ +CREATE TABLE IF NOT EXISTS paper_assets ( + paper_id TEXT NOT NULL, + asset_type TEXT NOT NULL, + path TEXT NOT NULL, + exists_on_disk INTEGER NOT NULL DEFAULT 0, + PRIMARY 
KEY (paper_id, asset_type),
+    FOREIGN KEY (paper_id) REFERENCES papers(zotero_key)
+);
+"""
+
+CREATE_ALIASES = """
+CREATE TABLE IF NOT EXISTS paper_aliases (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    paper_id TEXT NOT NULL,
+    alias TEXT NOT NULL,
+    alias_norm TEXT NOT NULL,
+    alias_type TEXT NOT NULL,
+    FOREIGN KEY (paper_id) REFERENCES papers(zotero_key)
+);
+"""
+
+INDEX_SQL = [
+    "CREATE INDEX IF NOT EXISTS idx_papers_doi ON papers(doi);",
+    "CREATE INDEX IF NOT EXISTS idx_papers_citation_key ON papers(citation_key);",
+    "CREATE INDEX IF NOT EXISTS idx_papers_domain ON papers(domain);",
+    "CREATE INDEX IF NOT EXISTS idx_papers_year ON papers(year);",
+    "CREATE INDEX IF NOT EXISTS idx_papers_ocr_status ON papers(ocr_status);",
+    "CREATE INDEX IF NOT EXISTS idx_papers_deep_status ON papers(deep_reading_status);",
+    "CREATE INDEX IF NOT EXISTS idx_papers_lifecycle ON papers(lifecycle);",
+    "CREATE INDEX IF NOT EXISTS idx_papers_next_step ON papers(next_step);",
+]
+
+# Drop order matters with PRAGMA foreign_keys=ON: dropping a referenced parent
+# performs an implicit DELETE that violates child FKs, so drop children first.
+ALL_TABLES = ["paper_aliases", "paper_assets", "papers", "meta"]
+
+
+def ensure_schema(conn: sqlite3.Connection) -> None:
+    """Create tables and indexes if they don't exist."""
+    conn.execute(CREATE_META)
+    conn.execute(CREATE_PAPERS)
+    conn.execute(CREATE_ASSETS)
+    conn.execute(CREATE_ALIASES)
+    for idx_sql in INDEX_SQL:
+        conn.execute(idx_sql)
+    conn.commit()
+
+
+def drop_all_tables(conn: sqlite3.Connection) -> None:
+    """Drop all Memory Layer tables (for rebuild), children before parents."""
+    for table in ALL_TABLES:
+        conn.execute(f"DROP TABLE IF EXISTS {table};")
+    conn.commit()
+
+
+def get_schema_version(conn: sqlite3.Connection) -> int:
+    """Read the stored schema version from meta table, or 0 if not found."""
+    try:
+        row = conn.execute(
+            "SELECT value FROM meta WHERE key = 'schema_version'"
+        ).fetchone()
+        return int(row["value"]) if row else 0
+    except sqlite3.OperationalError:
+        return 0
+```
+
+- [ ] **Step 2: Write `tests/unit/memory/test_schema.py`**
+
+```python
+from __future__ import annotations
+
+import sqlite3
+import tempfile
+from pathlib import Path
+
+from paperforge.memory.schema import (
+    ALL_TABLES,
+    ensure_schema,
+    drop_all_tables,
+    get_schema_version,
+    CURRENT_SCHEMA_VERSION,
+)
+from paperforge.memory.db import get_connection
+
+
+def test_ensure_schema_creates_all_tables():
+    with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
+        db_path = Path(tmp.name)
+    try:
+        conn = get_connection(db_path)
+        ensure_schema(conn)
+        cursor = conn.execute(
+            "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
+        )
+        tables = {row["name"] for row in cursor.fetchall()}
+        for table in ALL_TABLES:
+            assert table in tables, f"Missing table: {table}"
+        conn.close()
+    finally:
+        db_path.unlink(missing_ok=True)
+
+
+def test_drop_all_tables_clears_all():
+    with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
+        db_path = Path(tmp.name)
+    try:
+        conn = get_connection(db_path)
+        ensure_schema(conn)
+        drop_all_tables(conn)
+        cursor = conn.execute(
+            "SELECT name FROM sqlite_master WHERE type='table'"
+        )
+        tables = {row["name"] for row in cursor.fetchall()}
+        assert tables == set()
+        conn.close()
+    finally:
+        db_path.unlink(missing_ok=True)
+
+
+def test_get_schema_version_returns_zero_when_no_meta():
+    with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
+        db_path = Path(tmp.name)
+    try:
+        conn = get_connection(db_path)
+        ensure_schema(conn)
+        assert get_schema_version(conn) == 0
+        conn.close()
+    finally:
+        db_path.unlink(missing_ok=True)
+
+
+def 
test_get_schema_version_returns_stored_value(): + with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp: + db_path = Path(tmp.name) + try: + conn = get_connection(db_path) + ensure_schema(conn) + conn.execute( + "INSERT INTO meta (key, value) VALUES ('schema_version', '1')" + ) + conn.commit() + assert get_schema_version(conn) == 1 + conn.close() + finally: + db_path.unlink(missing_ok=True) + + +def test_schema_version_mismatch_triggers_rebuild_semantics(): + """When stored version != CURRENT, get_schema_version returns a different int.""" + with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp: + db_path = Path(tmp.name) + try: + conn = get_connection(db_path) + ensure_schema(conn) + conn.execute( + "INSERT INTO meta (key, value) VALUES ('schema_version', '99')" + ) + conn.commit() + stored = get_schema_version(conn) + assert stored != CURRENT_SCHEMA_VERSION + conn.close() + finally: + db_path.unlink(missing_ok=True) +``` + +- [ ] **Step 3: Run tests and verify they pass** + +```bash +python -m pytest tests/unit/memory/test_schema.py -v +``` + +- [ ] **Step 4: Commit** + +```bash +git add paperforge/memory/schema.py tests/unit/memory/ +git commit -m "feat(memory): add schema module with table definitions and tests" +``` + +--- + +### Task 4: `paperforge/memory/builder.py` + +**Files:** +- Create: `paperforge/memory/builder.py` +- Create: `tests/unit/memory/test_builder.py` +- Modify: (none) + +- [ ] **Step 1: Write `builder.py`** + +```python +from __future__ import annotations + +import hashlib +import json +import logging +from datetime import datetime, timezone +from pathlib import Path + +from paperforge import __version__ as PF_VERSION +from paperforge.memory.db import get_connection, get_memory_db_path +from paperforge.memory.schema import ( + CURRENT_SCHEMA_VERSION, + ensure_schema, + drop_all_tables, + get_schema_version, +) +from paperforge.worker.asset_index import read_index +from paperforge.worker.asset_state import ( + compute_lifecycle, + compute_maturity, + compute_next_step, +) + +logger = logging.getLogger(__name__) + +PAPER_COLUMNS = [ + "zotero_key", "citation_key", "title", "year", "doi", "pmid", + "journal", "first_author", "authors_json", "abstract", "domain", + "collection_path", "collections_json", + "has_pdf", "do_ocr", "analyze", "ocr_status", "deep_reading_status", + "ocr_job_id", "impact_factor", + "lifecycle", "maturity_level", "maturity_name", "next_step", + "pdf_path", "note_path", "main_note_path", "paper_root", + "fulltext_path", "ocr_md_path", "ocr_json_path", "ai_path", + "deep_reading_md_path", "updated_at", +] + +ASSET_FIELDS = [ + ("pdf", "pdf_path"), + ("formal_note", "note_path"), + ("main_note", "main_note_path"), + ("ocr_fulltext", "fulltext_path"), + ("ocr_meta", "ocr_json_path"), + ("deep_reading", "main_note_path"), + ("ai_dir", "ai_path"), +] + +ALIAS_TYPES = ["zotero_key", "citation_key", "title", "doi"] + + +def compute_hash(items: list[dict]) -> str: + sorted_items = sorted(items, key=lambda e: e["zotero_key"]) + raw = json.dumps(sorted_items, sort_keys=True, ensure_ascii=False) + return hashlib.sha256(raw.encode("utf-8")).hexdigest() + + +def _resolve_vault_path(vault: Path, rel_path: str) -> Path: + if not rel_path: + return Path() + p = vault / rel_path + return p.resolve() if p.exists() else p + + +def build_from_index(vault: Path) -> dict: + """Read formal-library.json and build/rebuild paperforge.db. + + Returns a dict with counts for reporting. 
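+    Raises FileNotFoundError if the canonical index is missing.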
+ """ + envelope = read_index(vault) + if envelope is None: + raise FileNotFoundError( + "Canonical index not found. Run paperforge sync --rebuild-index." + ) + # Legacy format: bare list of entries (pre-envelope) + if isinstance(envelope, list): + items = envelope + generated_at = "" + else: + items = envelope.get("items", []) + generated_at = envelope.get("generated_at", "") + if isinstance(items, list) and items and isinstance(items[0], dict): + canonical_hash = compute_hash(items) + else: + canonical_hash = "" + + db_path = get_memory_db_path(vault) + conn = get_connection(db_path, read_only=False) + try: + stored_version = get_schema_version(conn) + if stored_version != CURRENT_SCHEMA_VERSION: + drop_all_tables(conn) + ensure_schema(conn) + + conn.execute("DELETE FROM paper_aliases;") + conn.execute("DELETE FROM paper_assets;") + conn.execute("DELETE FROM papers;") + + now_utc = datetime.now(timezone.utc).isoformat() + papers_count = 0 + assets_count = 0 + aliases_count = 0 + + for entry in items: + zotero_key = entry.get("zotero_key", "") + if not zotero_key: + continue + + lifecycle = str(compute_lifecycle(entry)) + maturity = compute_maturity(entry) + next_step = str(compute_next_step(entry)) + + paper_values = {} + for col in PAPER_COLUMNS: + if col == "authors_json": + paper_values[col] = json.dumps( + entry.get("authors", []), ensure_ascii=False + ) + elif col == "collections_json": + paper_values[col] = json.dumps( + entry.get("collections", []), ensure_ascii=False + ) + elif col == "lifecycle": + paper_values[col] = lifecycle + elif col == "maturity_level": + paper_values[col] = maturity.get("level", 1) + elif col == "maturity_name": + paper_values[col] = maturity.get("level_name", "") + elif col == "next_step": + paper_values[col] = next_step + elif col == "updated_at": + paper_values[col] = generated_at + elif col in ("do_ocr", "analyze"): + val = entry.get(col) + paper_values[col] = 1 if val else 0 + elif col == "has_pdf": + paper_values[col] = 1 if entry.get("has_pdf") else 0 + else: + paper_values[col] = entry.get(col, "") + + placeholders = ", ".join([f":{c}" for c in PAPER_COLUMNS]) + cols = ", ".join(PAPER_COLUMNS) + conn.execute( + f"INSERT OR REPLACE INTO papers ({cols}) VALUES ({placeholders})", + paper_values, + ) + papers_count += 1 + + for asset_type, entry_field in ASSET_FIELDS: + path_val = entry.get(entry_field, "") + if not path_val: + continue + rel_path = str(path_val).replace("\\", "/") + abs_path = _resolve_vault_path(vault, rel_path) + exists = 1 if abs_path.exists() else 0 + + if asset_type == "deep_reading": + if abs_path.exists(): + try: + content = abs_path.read_text(encoding="utf-8") + exists = 1 if "## 🔍 精读" in content else 0 + except Exception: + exists = 0 + + conn.execute( + """INSERT OR REPLACE INTO paper_assets + (paper_id, asset_type, path, exists_on_disk) + VALUES (?, ?, ?, ?)""", + (zotero_key, asset_type, rel_path, exists), + ) + assets_count += 1 + + for alias_type in ALIAS_TYPES: + raw_val = entry.get(alias_type, "") + if not raw_val: + continue + raw_str = str(raw_val) + conn.execute( + """INSERT OR REPLACE INTO paper_aliases + (paper_id, alias, alias_norm, alias_type) + VALUES (?, ?, ?, ?)""", + ( + zotero_key, + raw_str, + raw_str.lower().strip(), + alias_type, + ), + ) + aliases_count += 1 + + meta_upserts = [ + ("schema_version", str(CURRENT_SCHEMA_VERSION)), + ("paperforge_version", PF_VERSION), + ("created_at", now_utc), + ("last_full_build_at", now_utc), + ("canonical_index_hash", canonical_hash), + 
("canonical_index_generated_at", generated_at), + ] + for key, value in meta_upserts: + conn.execute( + """INSERT OR REPLACE INTO meta (key, value) VALUES (?, ?)""", + (key, value), + ) + + conn.commit() + + return { + "db_path": str(db_path), + "papers_indexed": papers_count, + "assets_indexed": assets_count, + "aliases_indexed": aliases_count, + "schema_version": str(CURRENT_SCHEMA_VERSION), + } + except Exception: + conn.rollback() + raise + finally: + conn.close() +``` + +- [ ] **Step 2: Write the test `tests/unit/memory/test_builder.py`** + +Note: This test needs an actual `formal-library.json` fixture. Use the existing test vault. + +```python +from __future__ import annotations + +import tempfile +from pathlib import Path +from unittest.mock import patch, MagicMock + +from paperforge.memory.builder import build_from_index, compute_hash + + +def test_compute_hash_deterministic(): + items1 = [{"zotero_key": "A"}, {"zotero_key": "B"}] + items2 = [{"zotero_key": "B"}, {"zotero_key": "A"}] + assert compute_hash(items1) == compute_hash(items2) + + +def test_compute_hash_different_for_different_data(): + items1 = [{"zotero_key": "A", "title": "X"}] + items2 = [{"zotero_key": "A", "title": "Y"}] + assert compute_hash(items1) != compute_hash(items2) + + +def test_compute_hash_handles_empty(): + assert compute_hash([]) == compute_hash([]) + assert len(compute_hash([])) == 64 # SHA-256 hex +``` + +- [ ] **Step 3: Run tests** + +```bash +python -m pytest tests/unit/memory/test_builder.py -v +``` + +- [ ] **Step 4: Commit** + +```bash +git add paperforge/memory/builder.py tests/unit/memory/test_builder.py +git commit -m "feat(memory): add builder module that populates SQLite from formal-library.json" +``` + +--- + +### Task 5: `paperforge/memory/query.py` + +**Files:** +- Create: `paperforge/memory/query.py` +- Create: `tests/unit/memory/test_query.py` + +- [ ] **Step 1: Write `query.py`** + +```python +from __future__ import annotations + +import json +import logging +from pathlib import Path + +from paperforge.memory.db import get_connection, get_memory_db_path +from paperforge.memory.schema import get_schema_version, CURRENT_SCHEMA_VERSION +from paperforge.memory.builder import compute_hash +from paperforge.worker.asset_state import compute_health +from paperforge.worker.asset_index import read_index + + +def get_memory_status(vault: Path) -> dict: + """Check paperforge.db health and staleness. + + Returns a dict with: db_exists, schema_ok, fresh, count_match, + paper_count_db, paper_count_index, needs_rebuild. 
+ """ + db_path = get_memory_db_path(vault) + result = { + "db_exists": db_path.exists(), + "schema_ok": False, + "fresh": False, + "count_match": False, + "paper_count_db": 0, + "paper_count_index": 0, + "needs_rebuild": True, + } + if not db_path.exists(): + return result + + conn = get_connection(db_path, read_only=True) + try: + stored_version = get_schema_version(conn) + result["schema_ok"] = stored_version == CURRENT_SCHEMA_VERSION + row = conn.execute("SELECT COUNT(*) as cnt FROM papers").fetchone() + result["paper_count_db"] = row["cnt"] if row else 0 + stored_hash_row = conn.execute( + "SELECT value FROM meta WHERE key = 'canonical_index_hash'" + ).fetchone() + stored_hash = stored_hash_row["value"] if stored_hash_row else "" + except Exception: + return result + finally: + conn.close() + + envelope = read_index(vault) + if envelope is not None: + # Handle legacy format (bare list) + if isinstance(envelope, list): + items = envelope + paper_count = len(items) + index_hash = compute_hash(items) + else: + items = envelope.get("items", []) + paper_count = envelope.get("paper_count", 0) + index_hash = compute_hash(items) + result["paper_count_index"] = paper_count + + # Compare stored hash with computed hash + result["hash_match"] = stored_hash == index_hash + + result["count_match"] = ( + result["paper_count_db"] == result["paper_count_index"] + ) + + result["fresh"] = ( + result["schema_ok"] + and result["count_match"] + and result.get("hash_match", False) + ) + result["needs_rebuild"] = not result["fresh"] + return result + + +def _entry_from_row(row) -> dict: + """Reconstruct an entry dict from a papers row (sqlite3.Row).""" + entry = {k: row[k] for k in row.keys()} + for key in ("has_pdf", "do_ocr", "analyze"): + if key in entry and entry[key] is not None: + entry[key] = bool(entry[key]) + for key in ("authors_json", "collections_json"): + if key in entry and entry[key]: + try: + entry[key[:-5]] = json.loads(entry[key]) + del entry[key] + except json.JSONDecodeError: + logging.warning( + "Corrupted JSON in column %s for paper %s", + key, entry.get("zotero_key", "?"), + ) + return entry + + +def lookup_paper(conn, query: str) -> list[dict]: + """Multi-strategy lookup. Returns list of matching paper dicts.""" + q = query.strip() + results = [] + + for lookup_col in ("zotero_key", "citation_key", "doi"): + row = conn.execute( + f"SELECT * FROM papers WHERE LOWER({lookup_col}) = LOWER(?)", + (q,), + ).fetchone() + if row: + return [_entry_from_row(row)] + + rows = conn.execute( + """SELECT * FROM papers + WHERE LOWER(title) LIKE '%' || LOWER(?) || '%' + LIMIT 20""", + (q,), + ).fetchall() + if rows: + return [_entry_from_row(r) for r in rows] + + rows = conn.execute( + """SELECT p.* FROM papers p + JOIN paper_aliases a ON a.paper_id = p.zotero_key + WHERE a.alias_norm LIKE '%' || LOWER(?) || '%' + LIMIT 20""", + (q,), + ).fetchall() + return [_entry_from_row(r) for r in rows] + + +def get_paper_assets(conn, zotero_key: str) -> list[dict]: + rows = conn.execute( + "SELECT asset_type, path, exists_on_disk FROM paper_assets WHERE paper_id = ?", + (zotero_key,), + ).fetchall() + return [dict(r) for r in rows] + + +def get_paper_status(vault: Path, query: str) -> dict | None: + """Full paper status lookup. Returns dict or None if not found. + + If multiple candidates found, returns a candidate list without full status. 
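+    A unique match also carries health, assets, and recommended_action.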
+ """ + db_path = get_memory_db_path(vault) + if not db_path.exists(): + return None + + conn = get_connection(db_path, read_only=True) + try: + entries = lookup_paper(conn, query) + if not entries: + return None + + # Multiple candidates → return candidate list only (no full status) + if len(entries) > 1: + return { + "resolved": False, + "candidates": [ + { + "zotero_key": e.get("zotero_key"), + "title": e.get("title"), + "year": e.get("year"), + "citation_key": e.get("citation_key"), + "lifecycle": e.get("lifecycle"), + } + for e in entries + ], + } + + entry = entries[0] + assets = get_paper_assets(conn, entry["zotero_key"]) + entry["health"] = compute_health(entry) + entry["assets"] = assets + entry["resolved"] = True + + next_step = entry.get("next_step", "") + zk = entry.get("zotero_key", "") + if next_step == "/pf-deep": + entry["recommended_action"] = f"/pf-deep {zk}" + elif next_step == "ocr": + entry["recommended_action"] = f"paperforge ocr --key {zk}" + elif next_step == "sync": + entry["recommended_action"] = "paperforge sync" + else: + entry["recommended_action"] = None + + return entry + finally: + conn.close() +``` + +- [ ] **Step 2: Write `tests/unit/memory/test_query.py`** + +```python +from __future__ import annotations + +from paperforge.memory.query import get_memory_status + + +def test_get_memory_status_returns_needs_rebuild_when_no_db(): + from pathlib import Path + result = get_memory_status(Path("/nonexistent/vault")) + assert result["db_exists"] is False + assert result["needs_rebuild"] is True +``` + +- [ ] **Step 3: Run tests** + +```bash +python -m pytest tests/unit/memory/test_query.py -v +``` + +- [ ] **Step 4: Commit** + +```bash +git add paperforge/memory/query.py tests/unit/memory/test_query.py +git commit -m "feat(memory): add query module for paper lookup and status check" +``` + +--- + +### Task 6: CLI commands — `memory.py` and `paper_status.py` + +**Files:** +- Create: `paperforge/commands/memory.py` +- Create: `paperforge/commands/paper_status.py` +- Modify: `paperforge/cli.py:258-259` (register parsers) +- Modify: `paperforge/commands/__init__.py:4-13` (register in command dispatch) + +- [ ] **Step 1: Write `paperforge/commands/memory.py`** + +```python +from __future__ import annotations + +import argparse +import sys + +from paperforge.core.errors import ErrorCode +from paperforge.core.result import PFError, PFResult +from paperforge.memory.builder import build_from_index +from paperforge.memory.query import get_memory_status +from paperforge import __version__ as PF_VERSION + + +def run(args: argparse.Namespace) -> int: + vault = args.vault_path + sub_cmd = args.memory_subcommand + + if sub_cmd == "build": + try: + counts = build_from_index(vault) + result = PFResult( + ok=True, + command="memory build", + version=PF_VERSION, + data=counts, + ) + except FileNotFoundError: + result = PFResult( + ok=False, + command="memory build", + version=PF_VERSION, + error=PFError( + code=ErrorCode.PATH_NOT_FOUND, + message="Canonical index not found. 
Run paperforge sync --rebuild-index.", + ), + next_actions=[ + { + "command": "paperforge sync --rebuild-index", + "reason": "Generate formal-library.json first", + } + ], + ) + except Exception as exc: + result = PFResult( + ok=False, + command="memory build", + version=PF_VERSION, + error=PFError( + code=ErrorCode.INTERNAL_ERROR, + message=str(exc), + ), + ) + if args.json: + print(result.to_json()) + else: + if result.ok: + print(f"Memory built: {result.data}") + else: + print(f"Error: {result.error.message}", file=sys.stderr) + return 0 if result.ok else 1 + + if sub_cmd == "status": + try: + status = get_memory_status(vault) + result = PFResult( + ok=True, + command="memory status", + version=PF_VERSION, + data=status, + ) + except Exception as exc: + result = PFResult( + ok=False, + command="memory status", + version=PF_VERSION, + error=PFError( + code=ErrorCode.INTERNAL_ERROR, + message=str(exc), + ), + ) + if args.json: + print(result.to_json()) + else: + if result.ok: + for k, v in status.items(): + print(f" {k}: {v}") + else: + print(f"Error: {result.error.message}", file=sys.stderr) + return 0 if result.ok else 1 + + print(f"Unknown memory subcommand: {sub_cmd}", file=sys.stderr) + return 1 +``` + +- [ ] **Step 2: Write `paperforge/commands/paper_status.py`** + +```python +from __future__ import annotations + +import argparse +import sys + +from paperforge.core.errors import ErrorCode +from paperforge.core.result import PFError, PFResult +from paperforge.memory.query import get_paper_status +from paperforge import __version__ as PF_VERSION + + +def run(args: argparse.Namespace) -> int: + vault = args.vault_path + query = args.query + + try: + status = get_paper_status(vault, query) + if status is None: + result = PFResult( + ok=False, + command="paper-status", + version=PF_VERSION, + error=PFError( + code=ErrorCode.PATH_NOT_FOUND, + message=f"No paper found for: {query}", + ), + next_actions=[ + { + "command": "paperforge search", + "reason": "Search for papers by keyword", + } + ], + ) + else: + result = PFResult( + ok=True, + command="paper-status", + version=PF_VERSION, + data=status, + ) + except Exception as exc: + result = PFResult( + ok=False, + command="paper-status", + version=PF_VERSION, + error=PFError( + code=ErrorCode.INTERNAL_ERROR, + message=str(exc), + ), + ) + + if args.json: + print(result.to_json()) + else: + if result.ok: + data = result.data + if data.get("resolved"): + print(f"Zotero Key: {data.get('zotero_key', '')}") + print(f"Title: {data.get('title', '')}") + print(f"Year: {data.get('year', '')}") + print(f"Lifecycle: {data.get('lifecycle', '')}") + print(f"Next Step: {data.get('next_step', '')}") + if data.get("candidates"): + print(f"\nMultiple candidates: {len(data['candidates'])}") + for c in data["candidates"]: + print(f" - {c['zotero_key']}: {c['title']} ({c['year']})") + else: + print(f"Error: {result.error.message}", file=sys.stderr) + + return 0 if result.ok else 1 +``` + +- [ ] **Step 3: Register in `cli.py`** + +In `paperforge/cli.py`, at `build_parser()` after line 259 (`p_dash`), add: + +```python + # Memory Layer commands + p_memory = sub.add_parser("memory", help="Manage the Memory Layer") + p_memory_sp = p_memory.add_subparsers(dest="memory_subcommand", required=True) + p_memory_build = p_memory_sp.add_parser("build", help="Build the memory database from canonical index") + p_memory_build.add_argument("--json", action="store_true", help="Output as JSON") + p_memory_status = p_memory_sp.add_parser("status", help="Check memory database 
+    p_memory_status.add_argument("--json", action="store_true", help="Output as JSON")
+
+    p_paper_status = sub.add_parser("paper-status", help="Look up a paper's status")
+    p_paper_status.add_argument("query", help="Paper identifier (zotero_key, DOI, title, alias)")
+    p_paper_status.add_argument("--json", action="store_true", help="Output as JSON")
+```
+
+In `main()`, after `if args.command == "dashboard": ...` (around line 468, in the command dispatch section), add:
+
+```python
+    if args.command == "memory":
+        from paperforge.commands.memory import run
+        return run(args)
+
+    if args.command == "paper-status":
+        from paperforge.commands.paper_status import run
+        return run(args)
+```
+
+(Follow the existing dispatch pattern — see how "dashboard" dispatches.)
+
+- [ ] **Step 4: Register in `commands/__init__.py`**
+
+In `paperforge/commands/__init__.py`, add to `_COMMAND_REGISTRY`:
+
+```python
+    "memory": "paperforge.commands.memory",
+    "paper-status": "paperforge.commands.paper_status",
+```
+
+- [ ] **Step 5: Verify CLI registration**
+
+```bash
+paperforge --help
+```
+Expected: `memory` and `paper-status` appear in the subcommand list.
+
+```bash
+paperforge memory --help
+```
+Expected: shows `build` and `status` subcommands.
+
+```bash
+paperforge memory status --help
+```
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add paperforge/commands/memory.py paperforge/commands/paper_status.py paperforge/cli.py paperforge/commands/__init__.py
+git commit -m "feat(cli): add memory build/status and paper-status commands"
+```
+
+---
+
+### Task 7: Integration test
+
+**Files:**
+- Create: `tests/integration/test_memory_workflow.py`
+
+- [ ] **Step 1: Write integration test**
+
+```python
+from __future__ import annotations
+
+import pytest
+from pathlib import Path
+
+
+@pytest.mark.integration
+def test_memory_build_and_status_with_test_vault(test_vault: Path):
+    """End-to-end: sync → memory build → memory status → paper-status."""
+    import subprocess
+    import json
+
+    pf = ["python", "-m", "paperforge", "--vault", str(test_vault)]
+
+    # 1. Sync to ensure formal-library.json exists
+    result = subprocess.run(pf + ["sync", "--json"], capture_output=True, text=True)
+    # If sync fails, skip (test vault may not have exports)
+    if result.returncode != 0:
+        pytest.skip("Sync failed — test vault may lack export files")
+
+    # 2. Memory build
+    result = subprocess.run(pf + ["memory", "build", "--json"], capture_output=True, text=True)
+    assert result.returncode == 0
+    data = json.loads(result.stdout)
+    assert data["ok"] is True
+    assert data["data"]["papers_indexed"] > 0
+
+    # 3. Memory status
+    result = subprocess.run(pf + ["memory", "status", "--json"], capture_output=True, text=True)
+    assert result.returncode == 0
+    data = json.loads(result.stdout)
+    assert data["data"]["fresh"] is True
+    assert data["data"]["needs_rebuild"] is False
+```
+
+- [ ] **Step 2: Run integration test** (requires test vault)
+
+```bash
+python -m pytest tests/integration/test_memory_workflow.py -v -m integration
+```
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add tests/integration/test_memory_workflow.py
+git commit -m "test(memory): add integration test for memory build/status workflow"
+```
+
+---
+
+### Task 8: Final verification — run full test suite
+
+- [ ] **Step 1: Run all tests**
+
+```bash
+python -m pytest tests/unit/ tests/integration/ -q --tb=short
+```
+
+Expected: All tests pass, no regressions.
+
+- [ ] **Step 2: Run ruff lint**
+
+```bash
+ruff check paperforge/memory/ paperforge/commands/memory.py paperforge/commands/paper_status.py --fix && ruff format paperforge/memory/ paperforge/commands/memory.py paperforge/commands/paper_status.py
+```
+
+- [ ] **Step 3: Manual smoke test with real vault**
+
+```bash
+paperforge memory build --json
+paperforge memory status --json
+paperforge paper-status "aaronStimulationGrowthFactor2004" --json
+```
+
+Expected: Real data flows through; paper status shows lifecycle, next_step, assets.
+
+---
+
+## Summary
+
+| Task | Files Created | Files Modified | Tests |
+|------|--------------|----------------|-------|
+| 1. Config path | — | `config.py` | manual |
+| 2. db.py | `memory/__init__.py`, `memory/db.py` | — | manual |
+| 3. schema.py | `memory/schema.py` | — | `test_schema.py` (4 tests) |
+| 4. builder.py | `memory/builder.py` | — | `test_builder.py` (3 tests) |
+| 5. query.py | `memory/query.py` | — | `test_query.py` (1 test) |
+| 6. CLI | `commands/memory.py`, `commands/paper_status.py` | `cli.py`, `commands/__init__.py` | — |
+| 7. Integration | `tests/integration/test_memory_workflow.py` | — | 1 test |
+| 8. Verification | — | — | full suite + lint |
diff --git a/docs/superpowers/specs/2026-05-12-memory-layer-design.md b/docs/superpowers/specs/2026-05-12-memory-layer-design.md
new file mode 100644
index 0000000..afb80fc
--- /dev/null
+++ b/docs/superpowers/specs/2026-05-12-memory-layer-design.md
@@ -0,0 +1,279 @@
+# Memory Layer — Design Spec
+
+> **Status:** Approved | **Date:** 2026-05-12
+> **Review:** Passed (v2 — 5 BLOCKER, 3 MAJOR, 6 MINOR resolved)
+
+## Goal
+
+Add a SQLite-backed Memory Layer to PaperForge as a derived, rebuildable global index that serves
+the dashboard, resolver, agent-context, and search commands.
+
+## Architecture
+
+```
+Zotero/BetterBibTeX → exports/*.json
+    ↓
+formal-library.json (Canonical Index — source of truth, already exists)
+    ↓
+paperforge.db (Memory Layer — derived, rebuildable SQLite index)
+    ↓
+paper-status / dashboard / agent-context / search / retrieve
+```
+
+**Core principle:** `paperforge.db` is a derived index, not the source of truth.
+It can be safely deleted and rebuilt from `formal-library.json` at any time.
+
+## Phase 1 Scope
+
+**Tables:** `meta`, `papers`, `paper_assets`, `paper_aliases`
+
+**Commands:**
+- `paperforge memory build --json`
+- `paperforge memory status --json`
+- `paperforge paper-status <query> --json`
+
+**NOT in Phase 1:** FTS5, chunk retrieval, embedding, `paperforge.db → Markdown` writes,
+agent-context, dashboard integration.
+
+## SQLite Location
+
+```
+<vault>/PaperForge/indexes/paperforge.db
+```
+(same directory as `formal-library.json`)
+
+Register a new path key `"memory_db"` in `config.py:paperforge_paths()` pointing to
+`paperforge / "indexes" / "paperforge.db"`. Do not reuse the existing `"index"` key.
+
+## Schema
+
+### Connection settings
+
+- `PRAGMA journal_mode=WAL;` — allow concurrent reads during rebuild
+- `PRAGMA foreign_keys=ON;`
+
+### meta
+
+```sql
+CREATE TABLE IF NOT EXISTS meta (
+    key TEXT PRIMARY KEY,
+    value TEXT NOT NULL
+);
+```
+
+Stores: `schema_version` (integer), `paperforge_version`, `created_at`, `last_full_build_at`,
+`canonical_index_hash`, `canonical_index_generated_at`.
+
+### Schema versioning strategy
+
+On `paperforge memory build`, if the stored `schema_version` in `meta` does not match the
+current version, DROP all tables and rebuild from scratch. `paperforge.db` is a derived index
+— full rebuild is always safe. This mirrors `formal-library.json`'s schema-version-check
+pattern in `asset_index.py:475-480`.
+
+Initial schema version: `1`.
+
+### papers
+
+One row per paper. Columns directly map to `_build_entry()` entry dict fields.
+`asset_state.py` pure functions (`compute_lifecycle`, `compute_health`, `compute_maturity`,
+`compute_next_step`) are called at **build time** on each entry dict to populate derived columns.
+
+```sql
+CREATE TABLE IF NOT EXISTS papers (
+    zotero_key TEXT PRIMARY KEY,
+    citation_key TEXT NOT NULL DEFAULT '',
+    title TEXT NOT NULL,
+    year TEXT,
+    doi TEXT,
+    pmid TEXT,
+    journal TEXT,
+    first_author TEXT,
+    authors_json TEXT,      -- json.dumps(entry["authors"], ensure_ascii=False)
+    abstract TEXT,
+    domain TEXT,
+    collection_path TEXT,
+    collections_json TEXT,  -- json.dumps(entry["collections"], ensure_ascii=False)
+    has_pdf INTEGER NOT NULL DEFAULT 0,
+    do_ocr INTEGER,
+    analyze INTEGER,
+    ocr_status TEXT,
+    deep_reading_status TEXT,
+    ocr_job_id TEXT,
+    impact_factor REAL,
+    lifecycle TEXT,         -- compute_lifecycle(entry) → "indexed"|"pdf_ready"|"fulltext_ready"|"deep_read_done"
+    maturity_level INTEGER, -- compute_maturity(entry)["level"] → 1-4
+    maturity_name TEXT,     -- compute_maturity(entry)["level_name"]
+    next_step TEXT,         -- compute_next_step(entry) → "sync"|"ocr"|"/pf-deep"|"ready"
+    pdf_path TEXT,
+    note_path TEXT,
+    main_note_path TEXT,
+    paper_root TEXT,
+    fulltext_path TEXT,
+    ocr_md_path TEXT,
+    ocr_json_path TEXT,
+    ai_path TEXT,
+    deep_reading_md_path TEXT,
+    updated_at TEXT         -- envelope["generated_at"] from formal-library.json
+);
+```
+
+Indexes:
+```sql
+CREATE INDEX IF NOT EXISTS idx_papers_zotero_key ON papers(zotero_key);
+CREATE INDEX IF NOT EXISTS idx_papers_citation_key ON papers(citation_key);
+CREATE INDEX IF NOT EXISTS idx_papers_doi ON papers(doi);
+CREATE INDEX IF NOT EXISTS idx_papers_domain ON papers(domain);
+CREATE INDEX IF NOT EXISTS idx_papers_year ON papers(year);
+CREATE INDEX IF NOT EXISTS idx_papers_ocr_status ON papers(ocr_status);
+CREATE INDEX IF NOT EXISTS idx_papers_deep_status ON papers(deep_reading_status);
+CREATE INDEX IF NOT EXISTS idx_papers_lifecycle ON papers(lifecycle);
+CREATE INDEX IF NOT EXISTS idx_papers_next_step ON papers(next_step);
+```
+
+**Important notes about column→entry mapping:**
+
+- `maturity_level` = `compute_maturity(entry)["level"]` (scalar 1-4, not the full dict)
+- `updated_at` = the envelope's `generated_at` timestamp from `formal-library.json` (shared across all papers in a build)
+- `lifecycle` values: `"indexed"`, `"pdf_ready"`, `"fulltext_ready"`, `"deep_read_done"` — these are NOT all members of the `Lifecycle` enum in `core/state.py` (which has `OCR_READY`, `ANALYZE_READY`, `ERROR_STATE` that are never produced). Use plain string comparison, not enum membership.
+- `ai_context_ready` is a pre-seeded zero in `summarize_index()` (`asset_index.py:644`) but is never produced by `compute_lifecycle()`. Keep the zero bucket for Phase 3 compatibility but document it as reserved.
+
+**Health dimensions** (`pdf_health`, `ocr_health`, `note_health`, `asset_health`) are NOT stored in the papers table. They are computed at query time via `asset_state.compute_health(entry_dict)`. The `paper-status` command reconstructs the entry dict from SQLite columns, then calls `compute_health()` in-process.
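+
+A minimal sketch of that query-time reconstruction (`_row_to_entry()` is a hypothetical helper name; it assumes `sqlite3.Row` rows and that `asset_state` lives under `paperforge.worker`, which this spec does not pin down):
+
+```python
+import json
+import sqlite3
+
+from paperforge.worker.asset_state import compute_health  # import path assumed
+
+
+def _row_to_entry(row: sqlite3.Row) -> dict:
+    """Rebuild the entry-dict shape the asset_state pure functions expect."""
+    entry = dict(row)
+    # Unpack JSON-encoded columns into native lists before compute_health() sees them
+    for packed, plain in (("authors_json", "authors"), ("collections_json", "collections")):
+        raw = entry.pop(packed, None)
+        entry[plain] = json.loads(raw) if raw else []
+    return entry
+
+
+# Usage with a read-only connection:
+# conn.row_factory = sqlite3.Row
+# row = conn.execute("SELECT * FROM papers WHERE zotero_key = ?", (key,)).fetchone()
+# entry = _row_to_entry(row)
+# entry["health"] = compute_health(entry)
+```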
+ +### paper_assets + +```sql +CREATE TABLE IF NOT EXISTS paper_assets ( + paper_id TEXT NOT NULL, + asset_type TEXT NOT NULL, + path TEXT NOT NULL, + exists_on_disk INTEGER NOT NULL DEFAULT 0, + PRIMARY KEY (paper_id, asset_type), + FOREIGN KEY (paper_id) REFERENCES papers(zotero_key) +); +``` + +Asset types and their source fields: + +| asset_type | source in entry dict | notes | +| -------------- | -------------------------- | --------------------------------------------------------- | +| `pdf` | `pdf_path` | wiki-link; check existence via filesystem | +| `formal_note` | `note_path` | relative vault path | +| `main_note` | `main_note_path` | workspace `{key}.md` | +| `ocr_fulltext` | `fulltext_path` | copied from `ocr/{key}/fulltext.md` | +| `ocr_meta` | derived from `ocr_json_path` | `ocr/{key}/meta.json` | +| `deep_reading` | `main_note_path` | checks for `## 🔍 精读` section within main note (NOT a separate file; `deep_reading_path` is deprecated and always empty) | +| `ai_dir` | `ai_path` | workspace `ai/` directory | + +### paper_aliases + +```sql +CREATE TABLE IF NOT EXISTS paper_aliases ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + paper_id TEXT NOT NULL, + alias TEXT NOT NULL, + alias_norm TEXT NOT NULL, + alias_type TEXT NOT NULL, + FOREIGN KEY (paper_id) REFERENCES papers(zotero_key) +); +``` + +Alias types (Phase 1): + +| alias_type | source | normalized to lowercase | +| ------------- | ----------------- | --------------------------- | +| `zotero_key` | `entry["zotero_key"]` | as-is (uppercase) | +| `citation_key` | `entry["citation_key"]` | as-is (case-sensitive) | +| `title` | `entry["title"]` | `.lower().strip()` | +| `doi` | `entry["doi"]` | `.lower().strip()` | + +## Commands + +### `paperforge memory build --json` + +1. Resolve vault path +2. Read `formal-library.json` (canonical index envelope) via `read_index(vault)` +3. If index is `None` or missing → return `PFResult(ok=False, error=PFError(code=PATH_NOT_FOUND, message="Canonical index not found. Run paperforge sync --rebuild-index."))` +4. Extract `items` list and envelope metadata +5. Create/open `paperforge.db` (WAL mode) +6. If stored `schema_version` != current → DROP all tables +7. Create tables if not exist +8. Upsert `meta` rows: `schema_version`, `paperforge_version`, `created_at`, `last_full_build_at`, `canonical_index_generated_at` +9. Compute `canonical_index_hash` = SHA-256 of `json.dumps(sorted(items, key=lambda e: e["zotero_key"]), sort_keys=True, ensure_ascii=False)`; store in `meta` +10. For each entry in `items`: + - Insert/upsert into `papers` + - Insert/upsert into `paper_assets` (check `exists_on_disk` via `Path.exists()`) + - Insert/upsert into `paper_aliases` +11. Return `PFResult(ok=True, data={...})` with `papers_indexed`, `assets_indexed`, `aliases_indexed` counts + +**PFResult.next_actions format** (must match `core/result.py:26` — `list[dict]`): +```json +{ + "next_actions": [ + {"command": "paperforge paper-status --json", "reason": "Look up a specific paper"} + ] +} +``` + +### `paperforge memory status --json` + +Check: +- `paperforge.db` exists → `db_exists: bool` +- `schema_version` matches current → `schema_ok: bool` +- `canonical_index_hash` matches computed hash of current `formal-library.json` → `fresh: bool` +- Paper count matches `envelope["paper_count"]` → `count_match: bool` +- Any check fails → `needs_rebuild: true` + +Return `PFResult(ok=True, data={...})`. + +### `paperforge paper-status --json` + +**Resolution is short-circuit:** stop at the first step that returns ≥1 result. 
+ +Resolution order: +1. Exact match on `zotero_key` (case-insensitive) +2. Exact match on `citation_key` (case-insensitive) +3. Exact match on `doi` (case-insensitive) +4. LIKE match on `title_norm` or normalized alias (`%%`) +5. Fallback: search `paper_aliases.alias_norm` + +Behavior by result count: +- **0 results:** `PFResult(ok=False, error=NOT_FOUND, next_actions=[{"command": "paperforge search", ...}])` +- **1 result:** Full status with paper metadata, assets, lifecycle, next_step, recommended action +- **>1 results:** Candidate list only (no full status details) + +Full status response includes: +- Paper metadata (title, year, authors, doi, journal, domain, abstract) +- Asset status (exists_on_disk for each asset type) +- Lifecycle state +- Health dimensions (computed at query time via `compute_health()`) +- Maturity level and name +- `next_step` with recommended action +- `recommended_action`: e.g., `"/pf-deep ABCDEFG"` or `"paperforge sync"` or `"paperforge ocr"` + +## Integration Points + +### With sync +After `paperforge sync` completes, optionally refresh memory for changed keys. +Not automatic in Phase 1. + +### With dashboard +Dashboard should prefer `paperforge.db` for stats, fallback to file scanning. +This integration is deferred to Phase 2. + +### With agent +Agent skill bootstrap runs `paperforge agent-context --compact --json` first. +This command is deferred to Phase 3. + +## Constraints + +1. `paperforge.db` is a derived index — deletable, rebuildable +2. No SQLite → Markdown writes in Phase 1 +3. Reuse `asset_state.py` pure functions (compute_lifecycle, compute_health, compute_maturity, compute_next_step) +4. Health dimensions are computed at query time via `compute_health()`, not stored in SQLite +5. All `--json` output uses PFResult envelope (respecting `next_actions: list[dict]` contract) +6. SQLite connection uses WAL mode for concurrent reads +7. No external database services — only Python stdlib `sqlite3` +8. No PDF/image binary storage +9. No embedding or vector DB +10. Schema version mismatch → full drop-and-rebuild (derived index, always safe) diff --git a/docs/superpowers/specs/2026-05-12-memory-layer-perf-optimization.md b/docs/superpowers/specs/2026-05-12-memory-layer-perf-optimization.md new file mode 100644 index 0000000..308fdd6 --- /dev/null +++ b/docs/superpowers/specs/2026-05-12-memory-layer-perf-optimization.md @@ -0,0 +1,157 @@ +# Memory Layer Performance Optimization — v1.5.8 + +> **Branch:** `feature/memory` | **From audit:** Round 3 | **All 10 items** + +## P0: Critical Efficiency Bugs + +### 1. refresh_paper: O(N) scan per single-paper update + +**File:** `paperforge/memory/refresh.py:25` + +**Problem:** `refresh_paper()` calls `read_index(vault)` which parses the full 5-10 MB `formal-library.json` every time, then does a linear scan to find one paper. + +**Fix:** Add a lightweight `read_index_entry(vault, key) -> dict | None` function that: +- Opens `formal-library.json` +- Uses `ijson` or streaming parse, OR +- Loads only the `items` list and does a dict lookup by `zotero_key` + +Alternative (simpler): change `refresh_paper()` signature to accept the entry dict directly from the caller. Caller already has the entry (from sync or OCR completion event). Don't re-read the file. 
+
+```python
+# New signature
+def refresh_paper(vault: Path, entry: dict) -> bool:
+    # entry is already resolved by caller, skip read_index
+```
+
+Callers updated:
+- `sync_service.run()` → after `_build_entry()`, call `refresh_paper(vault, entry)`
+- `commands/ocr.py` → after OCR completes, get entry from OCR context, call `refresh_paper(vault, entry)`
+- `commands/finalize.py` → after deep-finalize, use the entry dict
+
+### 2. FTS Double-Insert
+
+**File:** `paperforge/memory/builder.py:97,150-158` + `paperforge/memory/schema.py:104`
+
+**Problem:** `papers_ai` trigger fires on `INSERT OR REPLACE INTO papers`, writing a row to `paper_fts`. Then the manual `INSERT INTO paper_fts` (line 150) tries to write AGAIN — IntegrityError caught silently.
+
+**Fix:** In `build_from_index()`:
+- Before the paper loop: `conn.execute("DROP TRIGGER IF EXISTS papers_ai")`
+- After the paper loop: re-create the trigger from schema
+- Remove the manual `INSERT INTO paper_fts` (lines 150-158)
+
+Similarly in `refresh.py`:
+- Drop trigger before upsert, re-create after (or just use manual FTS + no trigger)
+
+### 3. _autoRebuild does full build on every change
+
+**File:** `paperforge/plugin/main.js:_autoRebuild()`
+
+**Problem:** Runs `memory build` (full rebuild) on ANY export change. One new paper = 150 papers re-indexed.
+
+**Fix:** Never trigger full `memory build` from auto-poll. Instead:
+- On export change: run `paperforge sync --auto` (incremental sync only)
+- The sync command already calls `refresh_paper()` internally for new/changed papers
+- Only run `memory build` on first install or when the user explicitly requests it
+
+Change `_autoRebuild()` to `_autoSync()`:
+```javascript
+const cmd = `"${pyResult.path}" -m paperforge --vault "${vaultPath}" sync`;
+```
+
+## P1: Redundancy Elimination
+
+### 4. Frontmatter read 3 times per paper
+
+**File:** `paperforge/worker/asset_index.py:325-343`
+
+**Problem:** `_build_entry()` calls `read_frontmatter()` 3 separate times for `do_ocr`, `analyze`, `deep_reading_status` from the same file.
+
+**Fix:** Add a helper at module level:
+```python
+def _get_frontmatter_values(note_path: Path) -> dict:
+    """Read frontmatter once, return {do_ocr, analyze, deep_reading_status}."""
+    fm = read_frontmatter(note_path)
+    return {
+        "do_ocr": fm.get("do_ocr"),
+        "analyze": fm.get("analyze"),
+        "deep_reading_status": fm.get("deep_reading_status"),
+    }
+```
+
+### 5. Duplicate PAPER_COLUMNS logic
+
+**File:** `paperforge/memory/builder.py:109-139` + `paperforge/memory/refresh.py:52-74`
+
+**Problem:** Identical ~30 lines of column-value mapping.
+
+**Fix:** Extract to `paperforge/memory/_columns.py`:
+```python
+def build_paper_row(entry: dict, generated_at: str) -> dict:
+    # single source of truth for papers table columns
+    ...
+```
+Import from both builder.py and refresh.py.
+
+### 6. Dashboard 6 SELECTs → 2
+
+**File:** `paperforge/commands/dashboard.py:65-75`
+
+**Fix:**
+```sql
+-- Query 1: combined pdf+ocr health
+SELECT has_pdf,
+       CASE WHEN ocr_status = 'done' THEN 'done'
+            WHEN ocr_status IN ('failed','blocked') THEN 'failed'
+            ELSE 'pending' END as ocr_state,
+       COUNT(*) as cnt
+FROM papers GROUP BY has_pdf, ocr_state;
+
+-- Query 2: domain counts (unchanged)
+SELECT domain, COUNT(*) FROM papers GROUP BY domain;
+```
+
+## P2: General Optimization
+
+### 7. Per-row INSERT → executemany
+
+**File:** `paperforge/memory/builder.py:143`
+
+**Fix:** Collect paper rows, asset rows, alias rows in lists. Use `executemany()`:
+```python
+conn.executemany("INSERT OR REPLACE INTO papers (...) VALUES (...)", paper_rows)
+conn.executemany("INSERT OR REPLACE INTO paper_assets (...) VALUES (...)", asset_rows)
+```
+
+### 8. Poll interval 30s → 120s
+
+**File:** `paperforge/plugin/main.js:3920`
+
+**Fix:** Change `setInterval(..., 30000)` to `setInterval(..., 120000)`.
+
+### 9. _build_entry: 10 file reads per paper
+
+**File:** `paperforge/worker/asset_index.py:_build_entry()`
+
+**Problem:** Multiple `.exists()`, `.read_text()`, frontmatter reads per paper.
+
+**Fix:** Combine the `_legacy_control_flags` + `do_ocr` + `analyze` + `deep_reading_status` reads into one pass. Don't check `note_path.exists()` when `main_note_path.exists()` — if main exists, use it; only fall back to note_path when main doesn't exist.
+
+### 10. formal-library.json read-once pipeline
+
+**Problem:** `formal-library.json` is parsed by 5+ different modules during a sync→build→dashboard cycle.
+
+**Fix:** Not urgent — each module reads it independently for isolation. This is acceptable for reliability. Could optimize later with an in-memory cache, but at the risk of staleness.
+
+## Implementation Order
+
+1. P0 #1: refresh_paper accepts entry dict (changes refresh.py + callers)
+2. P0 #2: FTS trigger removal + manual-only insert
+3. P0 #3: _autoRebuild → _autoSync
+4. P1 #5: Extract PAPER_COLUMNS helper
+5. P1 #4: Single frontmatter parse
+6. P1 #6: Dashboard query merge
7. P2 #7: executemany batching
+8. P2 #8: Poll interval
+9. P2 #9: File read consolidation
+
+Each step: modify → test → commit.
diff --git a/docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-REVIEW.md b/docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-REVIEW.md
new file mode 100644
index 0000000..93cf0c4
--- /dev/null
+++ b/docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-REVIEW.md
@@ -0,0 +1,381 @@
+---
+phase: memory-layer-2-5-spec-review
+reviewed: 2026-05-12T00:00:00Z
+depth: deep
+files_reviewed: 7
+files_reviewed_list:
+  - docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md
+  - paperforge/memory/query.py
+  - paperforge/memory/builder.py
+  - paperforge/memory/schema.py
+  - paperforge/commands/dashboard.py
+  - paperforge/memory/fts.py
+  - paperforge/cli.py
+findings:
+  critical: 5
+  warning: 8
+  info: 7
+  total: 20
+status: issues_found
+---
+
+# Spec Review: Memory Layer Phase 2-5 Design
+
+**Reviewed:** 2026-05-12
+**Depth:** deep (cross-file analysis with import graph and call-chain tracing)
+**Files Reviewed:** 7 (1 spec + 6 existing source files)
+**Status:** issues_found
+
+## Summary
+
+Cross-referenced the Phase 2-5 design spec against the existing Memory Layer codebase (`memory/query.py`, `memory/builder.py`, `memory/schema.py`, `memory/fts.py`), plus the CLI dispatcher (`cli.py`) and dashboard command (`commands/dashboard.py`).
+
+The design correctly reuses existing infrastructure (`compute_hash`, `PAPER_COLUMNS`, `ASSET_FIELDS`, `read_index`) and follows the layered architecture (memory lib → commands module → CLI dispatch). However, five blocker-level issues were found involving **contract violations**, **missing schema migrations**, and **incomplete PFResult compliance** that must be resolved before implementation begins.
+ +--- + +## Critical Issues + +### CR-01: Dashboard return format contract violation + +**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:188-200` + +**Issue:** The spec states "Dashboard output format must NOT change (plugin depends on it)" (line 204), yet `_dashboard_from_db()` adds a new top-level key `_source` to the return dict. The existing `_gather_dashboard_data()` returns `{"stats": {...}, "permissions": {...}}` with exactly two top-level keys. Adding `_source` is a format change that will break any plugin consumer that iterates over top-level keys or destructures the response. + +**Fix:** Either: +1. Nest `_source` inside `stats` (e.g., `stats._source`), preserving the two-key top-level structure, OR +2. Explicitly acknowledge the format change and version the dashboard response schema, coordinating with the plugin team. + +```python +# Option 1 — nest inside stats: +return { + "stats": { + "papers": total, + "pdf_health": {...}, + "ocr_health": {...}, + "domain_counts": domain_counts, + "_source": "paperforge.db" # nested, not top-level + }, + "permissions": permissions, +} +``` + +--- + +### CR-02: Schema version not bumped for new tables + +**File:** `paperforge/memory/schema.py:5` (CURRENT_SCHEMA_VERSION = 1) +**Cross-ref:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:282-304` + +**Issue:** The spec introduces two new tables (`paper_chunks`, `chunk_fts`) but does not mention incrementing `CURRENT_SCHEMA_VERSION` (currently `1`). The existing `get_memory_status()` in `query.py:38` compares stored schema version against `CURRENT_SCHEMA_VERSION` to detect staleness. If the version stays at 1, existing databases won't be detected as stale, and `ensure_schema()` won't know to create the new tables on upgrade. + +Additionally, `build_from_index()` in `builder.py:88-90` drops and recreates all tables when the stored version differs from `CURRENT_SCHEMA_VERSION`: +```python +if stored_version != CURRENT_SCHEMA_VERSION: + drop_all_tables(conn) +ensure_schema(conn) +``` +Without a version bump, upgrading users will never get the new tables. + +**Fix:** Bump `CURRENT_SCHEMA_VERSION` to `2` in `schema.py:5`. The spec should explicitly state this. + +--- + +### CR-03: FTS virtual table naming violates existing convention + +**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:295-303` +**Cross-ref:** `paperforge/memory/schema.py:86-101` + +**Issue:** The existing FTS virtual table is named `paper_fts` (schema.py line 86). The spec names the new content-sync table `chunk_fts`. The established naming convention is `{entity}_fts` where `{entity}` is the base table name. Since the entity table is `paper_chunks` (not `chunks`), the FTS table should be `paper_chunk_fts` for consistency and to avoid collision with any future `chunks` table from another subsystem. + +**Fix:** Rename `chunk_fts` to `paper_chunk_fts` throughout the spec. 
+ +```sql +CREATE VIRTUAL TABLE IF NOT EXISTS paper_chunk_fts USING fts5( + chunk_id UNINDEXED, + paper_id UNINDEXED, + source_type, + section_title, + chunk_text, + content='paper_chunks', + content_rowid='rowid' +); +``` + +--- + +### CR-04: `agent-context` output format not wrapped in PFResult + +**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:26-81` +**Cross-ref:** `paperforge/core/result.py:18-27` (PFResult dataclass) + +**Issue:** Every CLI command in the existing codebase returns output via `PFResult.to_json()` (see `paper_status.py:35-40`, `search.py:61`, `dashboard.py:38-40`). The PFResult contract includes fields `ok`, `command`, `version`, `data`, `error`, `warnings`, `next_actions`. The spec's `agent-context` output shows a raw dict structure mimicking PFResult but it is ambiguous whether the implementation will actually use the `PFResult` dataclass. + +If this command bypasses PFResult, it breaks the contract that all `--json` outputs conform to the same envelope format, making it impossible for downstream consumers (plugin, agents) to parse responses uniformly. + +**Fix:** The spec should state explicitly: +```python +result = PFResult( + ok=True, + command="agent-context", + version=PF_VERSION, + data={...}, # the full context dict +) +print(result.to_json()) +``` + +--- + +### CR-05: `get_agent_context()` has no error handling for SQL failures + +**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:88-107` +**Cross-ref:** `paperforge/memory/schema.py:150-158` (get_schema_version catches OperationalError) + +**Issue:** The spec's `get_agent_context()` opens a connection and executes queries but wraps only the connection lifecycle in try/finally (for close). It does not wrap the individual SQL queries in try/except. If the DB exists but has a corrupted schema (wrong column count, missing table), the query will raise `sqlite3.OperationalError` which propagates unhandled up to the CLI command, producing a raw traceback instead of a clean PFResult error. + +Compare with `get_memory_status()` in `query.py:46-49` which wraps all DB reads in `try/except Exception` and returns a safe fallback dict. + +**Fix:** +```python +def get_agent_context(vault: Path) -> dict: + conn = get_connection(get_memory_db_path(vault), read_only=True) + try: + total = conn.execute("SELECT COUNT(*) FROM papers").fetchone()[0] + # ... + except sqlite3.Error as exc: + return {"ok": False, "error": f"DB read failed: {exc}"} + finally: + conn.close() +``` + +--- + +## Warnings + +### WR-01: `agent-context` re-derives freshness instead of delegating + +**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:88-107` +**Cross-ref:** `paperforge/memory/query.py:16-77` (get_memory_status) + +**Issue:** The spec's `get_agent_context()` manually queries `SELECT COUNT(*)` and `GROUP BY domain` but does not use the existing `get_memory_status()` function which already computes `fresh`, `needs_rebuild`, `hash_match`, and `count_match`. The `"memory_db": "ready"` field is hardcoded — it doesn't reflect whether the DB is actually fresh. Calling `get_memory_status()` would provide a canonical freshness signal that can gate whether the agent can trust the DB. + +**Fix:** Add a call to `get_memory_status(vault)` at the top of `get_agent_context()` and use `result["fresh"]` to set the `memory_db` field to `"ready"` or `"stale"`. 
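+
+A minimal sketch of that delegation, assuming the `get_memory_status()` return shape described elsewhere in this review (the rest of `get_agent_context()` is elided):
+
+```python
+from pathlib import Path
+
+from paperforge.memory.query import get_memory_status
+
+
+def get_agent_context(vault: Path) -> dict:
+    status = get_memory_status(vault)
+    # Canonical freshness signal instead of a hardcoded "ready"
+    memory_db = "ready" if status.get("fresh") else "stale"
+    ...  # build the rest of the context dict around memory_db
+```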
+
+---
+
+### WR-02: `pdf_health` via `lifecycle` is lossy — misses `path_error` states
+
+**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:163-167`
+**Cross-ref:** `paperforge/commands/dashboard.py:84-107` (path_error regex detection)
+**Cross-ref:** `paperforge/memory/builder.py:28-38` (PAPER_COLUMNS — no path_error column)
+
+**Issue:** The spec computes `pdf_healthy` as `r["lifecycle"] != "indexed"`. A paper with `lifecycle == "pdf_ready"` has a PDF, but that PDF could be broken (permission denied, file missing). The existing file-scanning code in `dashboard.py:99-107` uses a `path_error` regex to detect these cases and counts them separately as `broken`. The DB schema (`PAPER_COLUMNS`) has no `path_error` column, so the DB-based dashboard cannot distinguish between "healthy PDF" and "broken PDF." The hardcoded `"broken": 0` is misleading.
+
+**Fix:** Either:
+1. Add a `path_error` column to the `papers` table and populate it during `build_from_index()`, OR
+2. Document this as a known limitation and note that `broken` counts require file-system scanning.
+
+---
+
+### WR-03: `refresh_paper()` linear O(n) scan through formal-library.json
+
+**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:228-234`
+
+**Issue:** The spec searches for the target entry by iterating through all items:
+```python
+for e in items:
+    if e.get("zotero_key") == zotero_key:
+        entry = e; break
+```
+For 283 papers this is negligible, but for larger libraries (10K+ entries), this becomes a performance concern. The spec should at minimum acknowledge this limitation and note that an index lookup or dictionary-based approach should be considered for scale.
+
+**Fix:** Build a lookup dict keyed by `zotero_key`:
+```python
+index_map = {e.get("zotero_key"): e for e in items if e.get("zotero_key")}
+entry = index_map.get(zotero_key)
+```
+
+---
+
+### WR-04: `refresh_paper()` silent skip on stale index is indistinguishable from success
+
+**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:268`
+
+**Issue:** The spec says "If formal-library.json is stale (entry not found), skip silently" and `refresh_paper()` returns `False`. However, the integration points (sync, ocr, deep-finalize, repair) call `refresh_paper()` after modifying state. If the index hasn't been regenerated yet, the refresh silently fails and the DB is now out of sync with the ground truth. The caller has no way to distinguish "refresh succeeded" from "entry not in index yet — DB unchanged."
+
+This is most acute after `paperforge ocr`, where OCR status changes but sync hasn't re-run — the DB will show stale OCR status.
+
+**Fix:** Return a richer result:
+```python
+return {"action": "refreshed", "key": zotero_key}
+# vs
+return {"action": "skipped", "key": zotero_key, "reason": "not_in_index"}
+```
+Or raise a distinguishable exception that callers can catch and handle (e.g., trigger a full rebuild).
+
+---
+
+### WR-05: `retrieve` chunk output doesn't specify JOIN to get `title`
+
+**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:319-339`
+**Cross-ref:** `paperforge/memory/fts.py:41-51` (search_papers JOIN pattern)
+
+**Issue:** The `retrieve` output (lines 327-338) shows `zotero_key` and `title` fields per chunk, but `paper_chunks` stores only `paper_id` (not `zotero_key` or `title`). The existing `search_papers()` in `fts.py:41-51` demonstrates the correct pattern: JOIN `paper_fts f` → `papers p ON p.rowid = f.rowid` to get metadata. The spec's `retrieve` query is unspecified — it must JOIN `chunk_fts` → `paper_chunks` → `papers` to produce the output format shown.
+
+**Fix:** Specify the query:
+```sql
+SELECT c.chunk_id, c.paper_id, c.source_type, c.section_title,
+       c.page_number, c.chunk_text, p.title, p.zotero_key, rank
+FROM paper_chunk_fts f
+JOIN paper_chunks c ON c.rowid = f.rowid
+JOIN papers p ON p.zotero_key = c.paper_id
+WHERE paper_chunk_fts MATCH ?
+ORDER BY rank LIMIT ?
+```
+
+---
+
+### WR-06: `agent-context` advertises `--collection` flag that doesn't exist
+
+**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:54`
+**Cross-ref:** `paperforge/cli.py:273-283` (search subparser — no --collection flag)
+
+**Issue:** The `agent-context` output lists:
+```
+"search": {
+  "usage": "paperforge search <query> --json [--collection NAME] [--domain NAME] ..."
+}
+```
+But the existing `search` subparser (cli.py lines 273-283) defines `--domain`, `--year-from`, `--year-to`, `--ocr`, `--deep`, `--lifecycle`, `--next-step` — **no `--collection` filter**. If an agent reads the `agent-context` output and tries `--collection`, the command will fail with an unrecognized argument error.
+
+**Fix:** Either add `--collection` to the search subparser (requires adding a `collection_path` filter to `search_papers()` in fts.py), or remove it from the agent-context output until it's implemented.
+
+---
+
+### WR-07: FTS triggers for `paper_chunks` / `paper_chunk_fts` not specified
+
+**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:295-303`
+**Cross-ref:** `paperforge/memory/schema.py:103-118` (FTS_TRIGGERS for papers)
+
+**Issue:** The existing `paper_fts` table uses `content='papers'` (a content-sync external content FTS5 table) and relies on INSERT/UPDATE/DELETE triggers on the `papers` table to keep the FTS index in sync (schema.py lines 103-118). The spec's `chunk_fts` also uses `content='paper_chunks'` with `content_rowid='rowid'` — the same content-sync pattern. But the spec does not mention the required triggers on the `paper_chunks` table. Without them, inserts/deletes into `paper_chunks` won't update the FTS index.
+
+**Fix:** Add trigger definitions to the spec:
+```sql
+CREATE TRIGGER IF NOT EXISTS paper_chunks_ai AFTER INSERT ON paper_chunks BEGIN
+  INSERT INTO paper_chunk_fts(rowid, chunk_id, paper_id, source_type, section_title, chunk_text)
+  VALUES (new.rowid, new.chunk_id, new.paper_id, new.source_type, new.section_title, new.chunk_text);
+END;
+CREATE TRIGGER IF NOT EXISTS paper_chunks_ad AFTER DELETE ON paper_chunks BEGIN
+  INSERT INTO paper_chunk_fts(paper_chunk_fts, rowid, chunk_id, paper_id, source_type, section_title, chunk_text)
+  VALUES ('delete', old.rowid, old.chunk_id, old.paper_id, old.source_type, old.section_title, old.chunk_text);
+END;
+CREATE TRIGGER IF NOT EXISTS paper_chunks_au AFTER UPDATE ON paper_chunks BEGIN
+  INSERT INTO paper_chunk_fts(paper_chunk_fts, rowid, chunk_id, paper_id, source_type, section_title, chunk_text)
+  VALUES ('delete', old.rowid, old.chunk_id, old.paper_id, old.source_type, old.section_title, old.chunk_text);
+  INSERT INTO paper_chunk_fts(rowid, chunk_id, paper_id, source_type, section_title, chunk_text)
+  VALUES (new.rowid, new.chunk_id, new.paper_id, new.source_type, new.section_title, new.chunk_text);
+END;
+```
+
+---
+
+### WR-08: DB dashboard hardcodes `broken: 0` — data regression from file scanner
+
+**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:191`
+**Cross-ref:** `paperforge/commands/dashboard.py:78,98-105` (pdf_broken tracking)
+**Cross-ref:** `paperforge/memory/builder.py:28-38` (PAPER_COLUMNS — no path_error)
+
+**Issue:** The existing file-scanning code tracks three PDF states: `healthy`, `broken`, and `missing`. The DB-based approach hardcodes `"broken": 0` because the `papers` table has no column for path_error. This means:
+- A PDF file deleted after sync will show as `healthy` (lifecycle unchanged in DB) but is actually broken.
+- The user sees 0 broken PDFs in the dashboard when they may have several.
+
+The fallback to file scanning when the DB is stale partially mitigates this, but a fresh DB can also have stale path information for any paper whose PDF was moved/deleted after the last `memory build`.
+
+**Fix:** Either add a `broken_pdf_count` computation that cross-checks `pdf_path` existence on disk (lightweight stat call), or document that the DB dashboard shows "index-time PDF health" and the file scanner shows "current PDF health."
+
+---
+
+## Info
+
+### IN-01: Command naming inconsistency — `agent-context` vs existing `paper-status`
+
+**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:121`
+**Cross-ref:** `paperforge/cli.py:269-271` (paper-status subparser)
+
+**Issue:** Existing commands use descriptive noun phrases: `paper-status`, `deep-reading`, `base-refresh`. The new command `agent-context` follows a different pattern. While the purposes differ (paper-level vs. system-level), the inconsistency is worth noting for CLI discoverability.
+
+**Suggestion:** Consider `context` (shorter) or `memory-context` (follows the `memory build`/`memory status` pattern). No change required — just noting.
+
+---
+
+### IN-02: `ALL_TABLES` and `drop_all_tables()` not updated in spec
+
+**File:** `paperforge/memory/schema.py:120,137-141`
+**Cross-ref:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:282-303`
+
+**Issue:** The `ALL_TABLES` list in `schema.py:120` controls which tables `drop_all_tables()` removes on rebuild. The spec introduces `paper_chunks` and `chunk_fts` but doesn't mention updating this list. If `drop_all_tables()` is called during a full rebuild (e.g., schema version mismatch), the old tables won't be dropped, potentially leaving orphaned data.
+
+**Suggestion:** The spec should note that `ALL_TABLES` must be updated to include the new tables.
+
+---
+
+### IN-03: `ensure_schema()` not mentioned in spec
+
+**File:** `paperforge/memory/schema.py:123-134`
+**Cross-ref:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:282-303`
+
+**Issue:** The spec defines `CREATE TABLE` statements for `paper_chunks` and `chunk_fts` but doesn't mention that `ensure_schema()` must be updated to execute these statements. Both `build_from_index()` and `refresh_paper()` rely on `ensure_schema()` to guarantee tables exist.
+
+**Suggestion:** Add a note: "Update `ensure_schema()` in `schema.py` to execute `CREATE TABLE IF NOT EXISTS paper_chunks` and `CREATE VIRTUAL TABLE IF NOT EXISTS paper_chunk_fts`."
+
+---
+
+### IN-04: `retrieve` command name vs `search` — discoverability concern
+
+**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:57-60,315-317`
+**Cross-ref:** `paperforge/cli.py:273` (search subparser)
+
+**Issue:** The spec introduces `paperforge retrieve` for OCR fulltext searching alongside the existing `paperforge search` for metadata searching. The names don't make the distinction self-evident. New users won't know whether to `search` or `retrieve`.
+
+**Suggestion:** Consider `paperforge fulltext` or `paperforge search-content` to make the purpose clearer. Alternatively, add a `--fulltext` flag to the existing `search` command that switches to `chunk_fts` when specified. No blocker — naming preference.
+
+---
+
+### IN-05: `agent-context` requires `--json` flag but always outputs JSON
+
+**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:121,127`
+**Cross-ref:** `paperforge/cli.py` (all commands gate JSON output on --json flag)
+
+**Issue:** The spec says "Always outputs `--json` format; no human-readable mode needed" (line 127), yet the CLI spec shows `paperforge agent-context --json` (line 121). If the command always outputs JSON, the `--json` flag is either redundant (confusing) or incorrectly documented (the command should work without `--json` for human-readable output, like `paper-status` does in `paper_status.py:52-68`).
+
+**Suggestion:** Either:
+1. Make `--json` required/default and remove it from the usage (always JSON), or
+2. Add a human-readable mode like `paper-status` and keep `--json` as optional.
+
+---
+
+### IN-06: Field name `paper_status` (underscore) vs `paper-status` (hyphen) in `agent-context` output
+
+**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:49`
+**Cross-ref:** `paperforge/cli.py:269` (paper-status subparser name)
+
+**Issue:** The `agent-context` output uses `"paper-status"` as the command key (correct, matches the CLI name). However, the next_actions pattern in existing code uses the command name as-is. Minor — no bug, just noting for consistency review.
+
+---
+
+### IN-07: Chunking strategy — `max 500 tokens` underspecified
+
+**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:306-311`
+
+**Issue:** The spec says "Max 500 tokens per chunk" and "max 3 paragraphs per chunk" but doesn't specify:
+- What constitutes a "token" (word-based? `tiktoken`? character count / 4?)
+- Whether the token limit or paragraph limit takes precedence
+- What happens when a single paragraph exceeds 500 tokens (split mid-paragraph? truncate? keep as oversized chunk?)
+
+**Suggestion:** Clarify tokenization method and tie-breaking rules. For example: "Use `len(text.split())` as a word-count proxy for tokens. If a single paragraph exceeds 500 words, split at sentence boundaries."
+
+---
+
+_Reviewed: 2026-05-12_
+_Reviewer: VT-OS/OPENCODE Terminal (gsd-code-reviewer)_
+_Depth: deep_
diff --git a/docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md b/docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md
new file mode 100644
index 0000000..6587af3
--- /dev/null
+++ b/docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md
@@ -0,0 +1,360 @@
+# Memory Layer Phase 2-5 — Complete Spec
+
+> **Date:** 2026-05-12 | **Depends on:** Phase 1 (metadata DB) + FTS5 search
+
+## Overview
+
+Four remaining features to complete the Memory Layer, in priority order:
+
+| # | Feature | Purpose |
+|---|---------|---------|
+| 1 | **agent-context** | Agent 启动路由器:library 概览 + commands 清单 + collection 地图 + rules |
+| 2 | **Dashboard SQLite** | 仪表盘从文件扫描切换到读 paperforge.db |
+| 3 | **Incremental refresh** | sync/ocr/deep-finalize 后单篇刷新 memory,不重建全库 |
+| 4 | **Chunk retrieve** | OCR 全文 + figure caption 片段检索,返回带引用的证据 paragraph |
+
+---
+
+## Feature 1: agent-context
+
+### Design
+
+纯只读路由命令。Agent 拿到后知道:库里有什么、能调用什么、从哪开始。
+
+### Output structure
+
+```json
+{
+  "ok": true,
+  "command": "agent-context",
+  "version": "1.6.0",
+  "data": {
+    "paperforge": {
+      "version": "1.6.0",
+      "vault": "/path/to/vault",
+      "memory_db": "ready"
+    },
+    "library": {
+      "paper_count": 283,
+      "domain_counts": {"骨科": 120, "运动医学": 80, "其他": 83},
+      "lifecycle_counts": {"indexed": 2, "pdf_ready": 260, "fulltext_ready": 18, "deep_read_done": 3},
+      "ocr_counts": {"done": 21, "pending": 262},
+      "deep_reading_counts": {"done": 3, "pending": 280}
+    },
+    "collections": [
+      {"name": "骨科", "count": 120, "sub": ["骨折", "软骨", "韧带"]},
+      {"name": "运动医学", "count": 80}
+    ],
+    "commands": {
+      "paper-status": {
+        "usage": "paperforge paper-status <query> --json",
+        "purpose": "Look up one paper's full status and recommended next action"
+      },
+      "search": {
+        "usage": "paperforge search <query> --json [--collection NAME] [--domain NAME] [--ocr done|pending] [--year-from N] [--year-to N] [--limit N]",
+        "purpose": "Full-text search with optional collection/domain/lifecycle filters"
+      },
+      "retrieve": {
+        "usage": "paperforge retrieve <query> --json [--limit N]",
+        "purpose": "Search OCR fulltext chunks for evidence paragraphs (coming soon)"
+      },
+      "deep": {
+        "usage": "/pf-deep <key>",
+        "purpose": "Full three-pass deep reading with chart analysis"
+      },
+      "ocr": {
+        "usage": "/pf-ocr",
+        "purpose": "Run OCR on papers marked do_ocr:true"
+      },
+      "sync": {
+        "usage": "/pf-sync",
+        "purpose": "Sync Zotero and regenerate formal notes + index"
+      }
+    },
+    "rules": [
+      "Use paperforge.db via CLI commands before reading individual files.",
+      "Do not infer paper state from stale frontmatter when memory status is fresh.",
+      "Read source files only after resolving candidates via paper-status or search.",
+      "To locate a paper: start with collection scope if known, then expand to full library search."
+ ] + } +} +``` + +### Implementation + +**File:** `paperforge/memory/context.py` + +```python +def get_agent_context(vault: Path) -> dict: + """Build agent bootstrap context from paperforge.db.""" + conn = get_connection(get_memory_db_path(vault), read_only=True) + try: + # Library overview + total = conn.execute("SELECT COUNT(*) FROM papers").fetchone()[0] + domains = {r["domain"]: r["cnt"] for r in conn.execute( + "SELECT domain, COUNT(*) as cnt FROM papers GROUP BY domain ORDER BY cnt DESC" + ).fetchall()} + lifecycles = ... # same GROUP BY pattern + ocr = ... + deep = ... + + # Collection tree + collections = _build_collection_tree(conn) + + return {...} # full structure above + finally: + conn.close() + +def _build_collection_tree(conn) -> list[dict]: + """Build nested collection hierarchy from papers.collection_path.""" + rows = conn.execute( + "SELECT collection_path, COUNT(*) as cnt FROM papers " + "WHERE collection_path != '' GROUP BY collection_path ORDER BY cnt DESC" + ).fetchall() + # Parse pipe-separated paths into tree + # "骨科 | 骨折" -> nested under 骨科 +``` + +**File:** `paperforge/commands/agent_context.py` — CLI wrapper with `--json` flag. + +**CLI:** `paperforge agent-context --json` + +### Constraints +- Pure read-only on paperforge.db +- If DB missing: return error with message "Run paperforge memory build" +- All SQL queries wrapped in try/except with graceful error handling +- Output wrapped in PFResult dataclass (matches all other CLI commands) + +**Schema version:** `CURRENT_SCHEMA_VERSION` bumped to `2` when `paper_chunks` and `paper_chunk_fts` tables are added (Feature 4). On version mismatch, `memory build` performs full drop-and-rebuild as per existing strategy. + +--- + +## Feature 2: Dashboard SQLite Integration + +### Design + +`dashboard.py` currently scans all `.md` files with regex frontmatter parsing. Replace with SQLite queries. Keep fallback to file scanning if DB is missing or stale. 
+ +### Change + +**File:** `paperforge/commands/dashboard.py` + +The `_gather_dashboard_data()` function currently at lines 54-163 will be refactored: + +```python +def _gather_dashboard_data(vault: Path) -> dict: + db_path = get_memory_db_path(vault) + if db_path.exists(): + try: + return _dashboard_from_db(vault, db_path) + except Exception: + pass # fall through to file scanning + return _dashboard_from_files(vault) # existing logic, renamed +``` + +New function `_dashboard_from_db()`: +```python +def _dashboard_from_db(vault, db_path) -> dict: + conn = get_connection(db_path, read_only=True) + try: + total = conn.execute("SELECT COUNT(*) FROM papers").fetchone()[0] + + # PDF health + pdf_rows = conn.execute( + "SELECT lifecycle FROM papers" + ).fetchall() + pdf_healthy = sum(1 for r in pdf_rows if r["lifecycle"] != "indexed") + pdf_missing = total - pdf_healthy + + # OCR health + ocr_done = conn.execute( + "SELECT COUNT(*) FROM papers WHERE ocr_status='done'" + ).fetchone()[0] + ocr_pending = conn.execute( + "SELECT COUNT(*) FROM papers WHERE ocr_status NOT IN ('done','failed')" + ).fetchone()[0] + ocr_failed = conn.execute( + "SELECT COUNT(*) FROM papers WHERE ocr_status='failed'" + ).fetchone()[0] + + # Domain counts + domain_counts = {r["domain"]: r["cnt"] for r in conn.execute( + "SELECT domain, COUNT(*) as cnt FROM papers GROUP BY domain" + ).fetchall()} + + # Permissions (unchanged — still checks file existence) + permissions = _check_permissions(vault) + + return { + "stats": { + "papers": total, + "pdf_health": {"healthy": pdf_healthy, "missing": pdf_missing, "broken": 0}, + "ocr_health": {"pending": ocr_pending, "done": ocr_done, "failed": ocr_failed}, + "domain_counts": domain_counts, + "_source": "paperforge.db" + }, + "permissions": permissions + } + finally: + conn.close() +``` + +### Constraints +- Keep existing `_dashboard_from_files()` as fallback, rename from current `_gather_dashboard_data()` +- Dashboard output format must NOT change (plugin depends on it) +- Add `_source` field so plugin can display data freshness +- If DB is stale (`memory status` shows needs_rebuild), fall back to file scanning + +--- + +## Feature 3: Incremental Refresh + +### Design + +After `sync`, `ocr`, or `deep-finalize` modifies one paper, refresh only that paper's entries in SQLite instead of full `memory build`. 
+ +### Implementation + +**File:** `paperforge/memory/refresh.py` + +```python +def refresh_paper(vault: Path, zotero_key: str) -> bool: + """Incrementally refresh one paper in paperforge.db from formal-library.json.""" + envelope = read_index(vault) + if not envelope: + return False + items = envelope if isinstance(envelope, list) else envelope.get("items", []) + + # Find the matching entry + entry = None + for e in items: + if e.get("zotero_key") == zotero_key: + entry = e + break + if not entry: + return False + + db_path = get_memory_db_path(vault) + conn = get_connection(db_path, read_only=False) + try: + # Upsert paper row (same logic as builder) + _upsert_paper(conn, entry, envelope.get("generated_at", "")) + # Replace assets for this key + conn.execute("DELETE FROM paper_assets WHERE paper_id=?", (zotero_key,)) + _insert_assets(conn, entry, vault) + # Replace aliases for this key + conn.execute("DELETE FROM paper_aliases WHERE paper_id=?", (zotero_key,)) + _insert_aliases(conn, entry) + conn.commit() + return True + except Exception: + conn.rollback() + raise + finally: + conn.close() +``` + +### Integration points + +Trigger `refresh_paper(vault, key)` after: +- `paperforge sync` — for each updated paper +- `paperforge ocr` — after OCR completes for a paper +- `paperforge deep-finalize ` — after marking deep reading done +- `paperforge repair --fix` — after repairing state + +### Constraints +- Reuse `_build_entry()` logic from builder.py (extract shared helpers) +- Only refresh if paperforge.db exists (no auto-build) +- If formal-library.json is stale (entry not found), skip silently +- Transactional: all-or-nothing per paper + +--- + +## Feature 4: Chunk Retrieval + +### Design + +Split OCR fulltext into paragraph-level chunks, store in `paper_chunks` table, index with FTS5. Figure captions from `figure-map.json` included as a chunk source type. + +### Schema + +```sql +CREATE TABLE IF NOT EXISTS paper_chunks ( + chunk_id TEXT PRIMARY KEY, + paper_id TEXT NOT NULL, + source_type TEXT NOT NULL, -- 'ocr_fulltext' | 'figure_caption' | 'abstract' | 'formal_note' + section_title TEXT, -- e.g., "Methods", "Results", "Figure 3" + page_number INTEGER, + chunk_index INTEGER, + chunk_text TEXT NOT NULL, + token_estimate INTEGER, + content_hash TEXT, + FOREIGN KEY (paper_id) REFERENCES papers(zotero_key) +); + +CREATE VIRTUAL TABLE IF NOT EXISTS paper_chunk_fts USING fts5( + chunk_id UNINDEXED, + paper_id UNINDEXED, + source_type, + section_title, + chunk_text, + content='paper_chunks', + content_rowid='rowid' +); +``` + +### Chunking strategy + +- **OCR fulltext**: Split by `` markers, then by double-newline paragraphs within each page. Max 500 tokens per chunk. +- **Figure captions**: Read `figure-map.json` from `ocr//`, one chunk per figure entry. +- **Abstract**: One chunk per paper (source_type='abstract'). +- **Formal note**: Optional — split `## 🔍 精读` sections into chunks. 
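+
+A sketch of the paragraph chunker following the word-count proxy suggested in IN-07 of the companion review (the page-marker regex is a placeholder, since the exact marker text is not reproduced here, and `chunk_page` is a hypothetical helper):
+
+```python
+import re
+
+PAGE_MARKER = re.compile(r"^<!-- page: (\d+) -->$", re.MULTILINE)  # placeholder pattern
+MAX_WORDS = 500  # word count used as a rough token proxy
+MAX_PARAS = 3
+
+
+def chunk_page(page_text: str, page_number: int) -> list[dict]:
+    """Greedily pack double-newline paragraphs into chunks of at most MAX_PARAS paragraphs / MAX_WORDS words."""
+    chunks: list[dict] = []
+    current: list[str] = []
+    count = 0
+    for para in (p.strip() for p in page_text.split("\n\n") if p.strip()):
+        words = len(para.split())
+        if current and (count + words > MAX_WORDS or len(current) == MAX_PARAS):
+            chunks.append({"page_number": page_number, "chunk_text": "\n\n".join(current)})
+            current, count = [], 0
+        current.append(para)  # a single oversized paragraph becomes its own chunk
+        count += words
+    if current:
+        chunks.append({"page_number": page_number, "chunk_text": "\n\n".join(current)})
+    return chunks
+```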
+ +### Command + +``` +paperforge retrieve --json [--limit N] [--source ocr_fulltext|figure_caption|all] +``` + +Output: +```json +{ + "ok": true, + "command": "retrieve", + "data": { + "query": "PEMF dose response chondrocyte", + "chunks": [ + { + "zotero_key": "ABC123", + "title": "...", + "source_type": "ocr_fulltext", + "section_title": "Results", + "page_number": 6, + "chunk_text": "At 24h post-stimulation, chondrocyte proliferation increased...", + "rank": -2.5 + } + ] + } +} +``` + +### Constraints +- Chunks populated during `memory build` (full) or `memory refresh --key X` (incremental) +- Only for papers with `ocr_status == "done"` +- Figure-map.json must exist for figure caption chunks +- Max 3 paragraphs per chunk; overlap = 0 +- `paper_chunks` and `paper_chunk_fts` added to `ALL_TABLES` and `ensure_schema()` +- FTS content sync triggers added for `paper_chunks` ↔ `paper_chunk_fts` +- `CURRENT_SCHEMA_VERSION` bumped to `2` + +--- + +## Implementation Order + +1. **agent-context** — highest value for agent workflow +2. **Dashboard integration** — unify data sources +3. **Incremental refresh** — performance improvement +4. **Chunk retrieval** — most complex, depends on OCR pipeline + +Each feature gets its own plan → execute cycle within this spec. diff --git a/docs/superpowers/specs/2026-05-12-memory-layer-phase6-logging.md b/docs/superpowers/specs/2026-05-12-memory-layer-phase6-logging.md new file mode 100644 index 0000000..0421f31 --- /dev/null +++ b/docs/superpowers/specs/2026-05-12-memory-layer-phase6-logging.md @@ -0,0 +1,142 @@ +# Memory Layer Phase 6+ — Reading Events, Logs, Vector Retrieval + +> **Date:** 2026-05-12 | **Depends on:** Memory Layer Phase 1-5 + +## Feature 1: paper_events — Reading Log Backend + +### Schema + +```sql +CREATE TABLE IF NOT EXISTS paper_events ( + event_id INTEGER PRIMARY KEY AUTOINCREMENT, + paper_id TEXT NOT NULL, + event_type TEXT NOT NULL, -- 'reading_note', 'ocr_done', 'sync_updated', 'deep_done' + created_at TEXT NOT NULL DEFAULT (datetime('now')), + payload_json TEXT, -- flexible per event_type + FOREIGN KEY (paper_id) REFERENCES papers(zotero_key) +); +``` + +### reading_note payload + +```json +{ + "excerpt": "the fundamental disjunction between materials science and biology", + "section": "Section 7-8", + "page": "P29", + "usage": "F 段核心论点", + "note": "与 DDGMQ7RW 独立诊断同一问题" +} +``` + +### Integration + +Agent 在 `/pf-deep` 精读完一个段落后自动调用: +``` +paper_events INSERT (paper_id, 'reading_note', payload_json) +``` + +或通过 CLI: +```bash +paperforge reading-log --write LQZ2FWIW \ + --section "Discussion P12" \ + --excerpt "magnetoelectric 被定位为压电的增强/补偿" \ + --usage "F 段 Liang 定位" +``` + +--- + +## Feature 2: reading-log / working-log — Export & Slash Commands + +### reading-log export + +```bash +paperforge reading-log --output Project//reading-log.md [--since DATE] +``` + +按 `created_at DESC` 导出所有 `reading_note` events,格式: + +```markdown +## 2026-05-12 + +### LQZ2FWIW — Alvarez-Lorenzo et al. 2023 +- **Discussion P12**:"magnetoelectric 被定位为压电的增强/补偿" + → 用途: F 段 Liang 定位的文献支撑 +``` + +### Slash command: `/pf-log-reading` + +嵌入式 prompt(在 agent skill 或 slash command 定义中): + +``` +读完当前段落或章节后,记录以下信息到 paper_events: +- 来源: zotero_key + section + page +- 信息内容: 原文关键句(逐字引用) +- 用途: 这个信息支持当前写作的哪个论点 +- 备注: 任何交叉验证/矛盾/注意事项 + +执行: paperforge reading-log --write KEY --section "..." --excerpt "..." --usage "..." 
+
+```
+
+### Slash command: `/pf-log-session`
+
+```
+会话结束前回顾本次所有决策节点,按以下格式追加到 Project/<project>/working-log.md:
+
+## <日期> — <小节名>
+
+### 核心决策
+- 做了什么、为什么
+
+### 弯路与修正
+- 错误方向 → 用户纠正 → 最终方案
+
+### 可复用方法论
+- 本段的 pattern 是什么
+
+### 待办
+- [ ] ...
+
+格式参考: Project/综述写作/working-log.md
+```
+
+---
+
+## Feature 3: Vector Retrieval (Deferred)
+
+| 特性 | 方案 |
+|------|------|
+| 模型 | 本地 `all-MiniLM-L6-v2`(80MB,CPU 可跑) |
+| API 备选 | OpenAI `text-embedding-3-small` |
+| 向量库 | ChromaDB |
+| 构建 | `paperforge embed build` |
+| 增量 | `refresh_paper()` 自动 re-embed |
+| 检索 | `paperforge retrieve <query> --json` |
+
+### Command output
+
+```json
+{
+  "chunks": [
+    {
+      "zotero_key": "ABC123",
+      "title": "...",
+      "page": 6,
+      "section_title": "Results",
+      "chunk_text": "At 24h post-stimulation, chondrocyte proliferation...",
+      "score": 0.92
+    }
+  ]
+}
+```
+
+Agent 流程不变:retrieve → 候选段落(带论文身份) → paper-status → 读 fulltext 验证。
+
+---
+
+## Implementation Order
+
+1. paper_events table + reading-log write/export
+2. `/pf-log-reading` + `/pf-log-session` slash commands
+3. Working-log template (embedded in slash command prompt)
+4. Vector retrieval (deferred, start when library > 500)
diff --git a/docs/superpowers/specs/2026-05-12-memory-layer-round3.md b/docs/superpowers/specs/2026-05-12-memory-layer-round3.md
new file mode 100644
index 0000000..652a2c6
--- /dev/null
+++ b/docs/superpowers/specs/2026-05-12-memory-layer-round3.md
@@ -0,0 +1,263 @@
+# PaperForge v1.5.7 — Memory Layer Round 3
+
+> **Branch:** `feature/memory` | **Date:** 2026-05-12
+
+## Feature 1: Logging Skill — Strict Markdown Template
+
+**Problem:** Agent-written reading-log.md may not parse reliably if the format varies.
+
+**Solution:** SKILL.md instructs the agent to use a strict template format.
+
+### File: `paperforge/skills/logging/SKILL.md`
+
+Update the reading-log route section to require this exact format:
+
+```markdown
+## ABCDEFGH — Author Last Name et al. Year
+**Title:** Full Paper Title
+
+### Section Name — Page NN or line NN-NN
+**Info:** "verbatim excerpt from paper"
+**Use:** how this supports current writing task
+**Note:** optional cross-validation note
+
+### Another Section — Page NN
+**Info:** "..."
+**Use:** ...
+**Note:** (optional)
+```
+
+### Parsing Rules (for --validate and --import):
+
+```
+paper format:   ^## [A-Z0-9]{8} — .+$ (key is 8 uppercase alphanumeric)
+title format:   ^\*\*Title:\*\* .+$
+section format: ^### .+$
+info format:    ^\*\*Info:\*\* .+$
+use format:     ^\*\*Use:\*\* .+$
+note format:    ^\*\*Note:\*\* .+$ (optional)
+```
+
+Constraint: `info` and `use` are mandatory for every section entry. `note` is optional.
+
+### CLI Changes
+
+Update the `reading-log` parser in `cli.py` to add `--validate` and `--import` subcommands under a shared parser.
+
+## Feature 2: reading-log --validate
+
+**File:** `paperforge/commands/reading_log.py`
+
+```
+paperforge reading-log --validate path/to/reading-log.md
+```
+
+Function: `validate_reading_log(filepath: Path) -> dict`
+
+Returns:
+```json
+{
+  "ok": true,
+  "file": "Project/综述写作/reading-log.md",
+  "errors": [],
+  "papers_found": 3,
+  "entries_found": 7
+}
+```
+
+On failure:
+```json
+{
+  "ok": false,
+  "errors": [
+    {"line": 15, "field": "info", "message": "missing **Info:** after section header"},
+    {"line": 23, "field": "key", "message": "paper key must match ^[A-Z0-9]{8}$"}
+  ]
+}
+```
+
+Validation algorithm:
+1. Parse into papers by `## KEY — Author` headers
+2. For each paper: verify `**Title:**` follows
+3. For each section `### ...`: verify `**Info:**` and `**Use:**` follow
+4. Report all errors at once, not stop-at-first
+
+## Feature 3: reading-log --import
+
+**File:** `paperforge/commands/reading_log.py` + `paperforge/memory/events.py`
+
+```
+paperforge reading-log --import path/to/reading-log.md
+```
+
+Function: `import_reading_log(vault: Path, filepath: Path) -> dict`
+
+Returns:
+```json
+{
+  "ok": true,
+  "papers_imported": 3,
+  "entries_imported": 7,
+  "skipped": 0
+}
+```
+
+Algorithm:
+1. Call `validate_reading_log(filepath)` — abort if errors
+2. Parse the valid file into paper-level entries
+3. For each entry, call `write_reading_note(vault, paper_id, section, excerpt, usage, note)`
+4. Each write INSERTs a new row — safe for accumulative use
+
+### Add to `paperforge/memory/events.py`:
+
+```python
+def import_reading_log(vault: Path, filepath: Path) -> dict:
+    """Parse a reading-log.md and bulk-write to paper_events."""
+    # Parse, validate, write
+    ...
+```
+
+## Feature 4: reading-log --lookup KEY
+
+**File:** `paperforge/commands/reading_log.py`
+
+```
+paperforge reading-log --lookup KEY [--json]
+```
+
+Function: `lookup_paper_events(vault: Path, key: str) -> dict`
+
+Returns all accumulated paper_events for a paper, ordered by created_at DESC:
+```json
+{
+  "ok": true,
+  "zotero_key": "ABCDEFGH",
+  "title": "...",
+  "entries": [
+    {
+      "created_at": "2026-05-12 14:30",
+      "section": "Results P6",
+      "excerpt": "...",
+      "usage": "F 段参数数据",
+      "note": "与 Lippiello 对比"
+    }
+  ],
+  "count": 5,
+  "projects": ["综述写作", "数据分析"]
+}
+```
+
+## Feature 5: /methodology Skill
+
+**File:** `paperforge/skills/methodology/SKILL.md`
+
+Pure-prompt skill, no Python code. Same universal pattern as grill-me.
+
+```yaml
+---
+name: methodology
+description: >
+  Project methodology extraction. Triggered by:
+  methodology, /methodology, 提取方法论, 存档写作规律,
+  总结本项目方法, 提取可复用规则.
+source: paperforge
+---
+```
+
+### Agent workflow:
+
+1. Ask user which project to extract from (or detect from context)
+2. Read `Project/<project>/working-log.md`
+3. Identify extractable patterns:
+   - Sections marked as "方法论" or "复用"
+   - Wrong turns + corrections (弯路 + 修正)
+   - Final logic flows (最终逻辑: XX 段)
+   - Review feedback patterns (审阅修正)
+   - Cross-study audit methodology
+4. Classify into categories:
+   - `review-writing.md` — 综述写作相关
+   - `data-analysis.md` — 数据分析相关
+   - `general-methods.md` — 通用方法
+5. Present draft to user for confirmation
+6. Write to `<vault>/PaperForge/methodologies/<category>.md`
+
+### Methodology file format:
+
+```markdown
+---
+project: 综述写作
+extracted: 2026-05-12
+category: review-writing
+---
+
+# [Method Name]
+
+## Source
+From working-log.md Section [X.Y]
+
+## Pattern
+[Extracted reusable methodology]
+
+## Example
+[Concrete example from the project]
+```
+
+## Feature 6: Dashboard → SQLite Migration
+
+**File:** `paperforge/commands/dashboard.py`
+
+Current `_gather_dashboard_data()` does file scanning with regex. Migrate to:
+
+```python
+def _gather_dashboard_data(vault: Path) -> dict:
+    # Try DB first
+    data = _dashboard_from_db(vault)
+    if data is not None:
+        data["permissions"] = _check_permissions(vault)
+        return data
+    # Fallback to file scanning
+    return _dashboard_from_files(vault)
+```
+
+`_dashboard_from_db()` should read from paperforge.db:
+- Paper count: `SELECT COUNT(*) FROM papers`
+- Domain counts: `SELECT domain, COUNT(*) FROM papers GROUP BY domain`
+- PDF/OCR health: from the papers table `ocr_status`, `has_pdf` columns
+- Remove the `_source` key (was added in an earlier iteration but caused contract issues)
+
+**Keep the permissions check** (`_check_permissions`) separate and lightweight.
+
+## Feature 7: Bootstrap Update
+
+**File:** `paperforge/skills/literature-qa/scripts/pf_bootstrap.py`
+
+If not already done, ensure the `memory_layer` field is in the bootstrap output. Already implemented in earlier harness work — verify status.
+
+## Refactoring: Memory Layer No Longer Optional
+
+**File:** `paperforge/plugin/main.js` ✓ DONE
+
+Removed the Easy Memory Layer toggle. Status display always shown. Memory layer is always on — SQLite is lightweight enough to not need a toggle.
+
+## Implementation Order
+
+1. Logging SKILL.md format update
+2. reading-log --validate CLI
+3. reading-log --import CLI + events.py
+4. reading-log --lookup CLI
+5. /methodology SKILL.md
+6. Dashboard SQLite migration
+7. Integration test + deploy
+
+## Cross-File Impact
+
+| File | Action | Features |
+|------|--------|----------|
+| `paperforge/skills/logging/SKILL.md` | Modify | Feature 1 |
+| `paperforge/commands/reading_log.py` | Modify | Features 2, 3, 4 |
+| `paperforge/memory/events.py` | Modify | Feature 3 |
+| `paperforge/cli.py` | Modify | Features 2, 3, 4 |
+| `paperforge/skills/methodology/SKILL.md` | Create | Feature 5 |
+| `paperforge/skills/methodology/scripts/pf_bootstrap.py` | Copy | Feature 5 (same bootstrap) |
+| `paperforge/commands/dashboard.py` | Modify | Feature 6 |
+| `paperforge/plugin/main.js` | ✓ DONE | Refactoring |
diff --git a/docs/superpowers/specs/2026-05-12-plugin-settings-redesign-REVIEW.md b/docs/superpowers/specs/2026-05-12-plugin-settings-redesign-REVIEW.md
new file mode 100644
index 0000000..efda2f7
--- /dev/null
+++ b/docs/superpowers/specs/2026-05-12-plugin-settings-redesign-REVIEW.md
@@ -0,0 +1,205 @@
+---
+phase: settings-redesign-spec-review
+reviewed: 2026-05-12T12:00:00Z
+depth: deep
+files_reviewed: 5
+files_reviewed_list:
+  - docs/superpowers/specs/2026-05-12-plugin-settings-redesign.md
+  - paperforge/plugin/main.js
+  - paperforge/services/skill_deploy.py
+  - paperforge/skills/literature-qa/SKILL.md
+  - paperforge/skills/literature-logging/SKILL.md
+findings:
+  critical: 1
+  warning: 3
+  info: 3
+  total: 7
+status: issues_found
+---
+
+# Spec Review: Plugin Settings Redesign
+
+**Reviewed:** 2026-05-12
+**Depth:** deep (cross-file analysis across plugin codebase, skill deploy, SKILL.md frontmatter)
+**Files Reviewed:** 5
+**Status:** ISSUES_FOUND — one BLOCKER must be resolved before implementation
+
+## Summary
+
+Cross-referenced the proposed 2-tab settings redesign against the current PaperForge plugin codebase (`paperforge/plugin/main.js`), the `skill_deploy.py` AGENT_SKILL_DIRS mapping, and the two existing SKILL.md files. The spec's architecture (Claudian tab pattern, DOM-based tab switching, `disable-model-invocation` toggle) is well-reasoned and compatible.
However, one data persistence issue is a BLOCKER, and several gaps/ambiguities need resolution before implementation proceeds. + +--- + +## Critical Issues + +### CR-01: `saveSettings()` will silently discard all new `data.json` keys + +**File:** `paperforge/plugin/main.js:3534-3542` +**Issue:** The spec proposes storing new feature-toggle data in Obsidian's plugin `data.json` under keys like `features`, `vector_db_mode`, `vector_db_model`, `vector_db_api_key`, and `frozen_skills`. However, the current `saveSettings()` method explicitly filters out any key not present in `DEFAULT_SETTINGS`: + +```js +async saveSettings() { + // Only persist non-path settings to plugin data.json + const dataToSave = {}; + for (const key of Object.keys(DEFAULT_SETTINGS)) { // ← whitelist filter + if (key in this.settings) { + dataToSave[key] = this.settings[key]; + } + } + await this.saveData(dataToSave); +} +``` + +`DEFAULT_SETTINGS` (line 547-556) currently contains only: +- `vault_path`, `setup_complete`, `auto_update`, `agent_platform`, `language`, `paddleocr_api_key`, `zotero_data_dir`, `python_path` + +Any new key (`features`, `vector_db_mode`, `frozen_skills`, etc.) will be **silently discarded** on every save. Toggling a feature, changing a vector DB mode, or freezing a skill would appear to work until the user re-opens settings — at which point `loadData()` returns the stale (or default) values. + +**Fix:** One of two approaches: + +**Option A — Extend DEFAULT_SETTINGS (simpler, less risky):** +```js +const DEFAULT_SETTINGS = { + vault_path: '', + setup_complete: false, + auto_update: true, + agent_platform: 'opencode', + language: '', + paddleocr_api_key: '', + zotero_data_dir: '', + python_path: '', + // NEW: Feature toggles + features: { + fts_search: true, + agent_context: true, + reading_log: true, + vector_db: false, + }, + vector_db_mode: 'local', + vector_db_model: 'all-MiniLM-L6-v2', + vector_db_api_key: '', + frozen_skills: {}, +}; +``` + +**Option B — Change save logic to whitelist exclusions rather than inclusions:** +```js +// Persist everything except internal/temporary fields +const EXCLUDE_KEYS = new Set(['_python_path_stale', '_saveTimeout', '_pfConfig']); +const dataToSave = {}; +for (const key of Object.keys(this.settings)) { + if (!EXCLUDE_KEYS.has(key) && typeof this.settings[key] !== 'function') { + dataToSave[key] = this.settings[key]; + } +} +``` + +Option A is recommended as it preserves the defensive posture of the existing code. + +--- + +## Warnings + +### WR-01: `source` field missing — system skills will be mis-categorized as user skills + +**File:** `paperforge/skills/literature-qa/SKILL.md`, `paperforge/skills/literature-logging/SKILL.md` +**Issue:** The spec uses `source: paperforge` frontmatter to identify system-managed skills (with update/freeze controls) vs user skills (`source: user` or no `source` field → toggle only). Neither existing SKILL.md has a `source` field: + +```yaml +# literature-qa/SKILL.md (current) +name: literature-qa +description: > + 学术文献库操作:精读、问答、检索、批量阅读... + +# literature-logging/SKILL.md (current) +name: literature-logging +description: > + Literature reading and working log management... +``` + +Per the spec's rules: skills without `source` are treated as **user** skills (toggle only, no update button). This means the two PaperForge system skills would show up without update/freeze controls on first install — users would need the implementation to retroactively add `source: paperforge` to detect them correctly. 
+ +**Fix:** Add `source: paperforge` to both SKILL.md files as part of this implementation, and include it in the `deploy_skills()` copytree operation. Also add a `version` field (currently absent from both) since the spec expects it for GitHub semver comparison. + +```yaml +# Proposed addition to both SKILL.md frontmatter blocks +source: paperforge +version: 1.5.5 +``` + +### WR-02: Feature toggle enforcement is missing — no code that reads `features.*` to gate CLI commands + +**File:** Spec section "Section 2: Feature Toggles" +**Issue:** The spec states "When a feature is disabled, the corresponding CLI command returns a clear error message." However, the spec only covers the **settings UI** side — there is no corresponding mechanism described for the Python CLI (`cli.py`) or workers to read `features.*` from `data.json` (which lives in the vault's `.obsidian/plugins/paperforge/` directory, inaccessible at runtime unless the plugin passes the values through `paperforge.json` or an env var). + +Either: +- The plugin needs to write toggles into `paperforge.json` so the Python runtime can read them, OR +- The plugin needs to pass toggles as CLI flags/arguments when invoking commands, OR +- The enforcement lives only in the plugin's command palette (never calls CLI for disabled features) + +**Fix:** Add explicit documentation of the enforcement mechanism. Recommended: the plugin writes a `feature_toggles` block to `paperforge.json` during `saveSettings()`, mirroring the existing `vault_config` block pattern (see `savePaperforgeJson()` at line 3455). This way both plugin and Python runtime have a single source of truth. + +### WR-03: `_debouncedSave()` calls `saveSettings()` — both save paths need updating + +**File:** `paperforge/plugin/main.js:2573-2576` +**Issue:** The settings tab has two save pathways: +1. Direct calls: `this.plugin.saveSettings()` (line 2268 in the Python path onChange handler) +2. Debounced calls: `this._debouncedSave()` (lines 2214, 2223) which calls `this.plugin.saveSettings()` after 500ms + +Both flow through the same `saveSettings()` method. If CR-01 is fixed (extending DEFAULT_SETTINGS), this is not an additional bug — but it means **any new Setting added to the features tab must use the same save mechanism**. The spec doesn't mention this constraint. + +**Fix:** Document in implementation notes that all new toggle handlers should call `this._debouncedSave()` (for inputs) or `this.plugin.saveSettings()` (for immediate actions). For the skill toggle that writes to SKILL.md frontmatter (not data.json), a separate write path is needed — this is handled correctly by the spec but should be called out. + +--- + +## Info + +### IN-01: JSON key nesting inconsistency between architecture diagram and data storage section + +**File:** `docs/superpowers/specs/2026-05-12-plugin-settings-redesign.md:29-30 vs 170-185` +**Issue:** The architecture diagram nests vector DB config under a `向量数据库` section with `开关`, `模式`, `本地`, `API` as sub-items. The JSON storage section places `features.vector_db` (the master toggle) nested under `features`, but places `vector_db_mode`, `vector_db_model`, and `vector_db_api_key` at the **top level** of data.json — not under `features.vector_db.*`. This is structurally valid but the flat/grouped inconsistency between the spec's labeled options and the flat JSON may cause confusion during implementation. 
+ +```json +// Spec proposes: +{ + "features": { "vector_db": false }, // master toggle nested + "vector_db_mode": "local", // implementation detail at top level + "vector_db_model": "...", // ... + "vector_db_api_key": "" // ... +} +``` + +**Fix:** Consider either fully nesting (`features.vector_db.enabled`, `features.vector_db.mode`, etc.) or fully flattening (`features_vector_db`, `features_vector_db_mode`, etc.). The current mixed approach works but adds mental overhead. The nested approach is cleaner for future feature additions. + +### IN-02: Vector DB panel gaps — model detection and error handling unspecified + +**File:** `docs/superpowers/specs/2026-05-12-plugin-settings-redesign.md:161-164` +**Issue:** The vector DB panel design leaves several implementation details ambiguous: + +1. **Model installation state detection**: The status badge `● 已安装 / ○ 未安装` has no specified detection logic. `sentence-transformers` models download on first use (triggered by `SentenceTransformer('all-MiniLM-L6-v2')`), not by a distinct install step. How does the UI know the model is installed? Checking for cached files in `~/.cache/torch/sentence_transformers/`? Running a probe import? + +2. **Installation is async but not cancellable**: "`pip install` is async — show progress bar" — but what happens if the user closes Obsidian or switches vaults mid-install? Is there a cancel mechanism? + +3. **No network error handling**: What does the UI show if `pip install` fails due to network issues, disk space, or permissions? + +**Fix:** Add implementation details for each of these edge cases to the spec, or document them as deferred design decisions. + +### IN-03: Tab state is DOM-preserved between switches but NOT across re-opens + +**File:** Spec section "Tab Implementation" vs `paperforge/plugin/main.js:2206-2208` +**Issue:** The spec correctly follows the Claudian pattern (all tabs exist in DOM, CSS toggles visibility). This preserves form field state when switching between 安装 and 功能 tabs within a single settings session. However, Obsidian calls `display()` on every settings tab open, which runs `containerEl.empty()` (line 2208) and rebuilds the entire UI from scratch. This is standard Obsidian behavior, but means: +- Switching tabs: state preserved (correct) +- Closing and reopening settings: state lost (standard, acceptable) +- Running "Sync Runtime" → calls `this.display()` (line 2553, 2562): **entire settings rebuilt, active tab resets to default** + +The Sync Runtime action at line 2553/2562 explicitly calls `this.display()` which would reset any partially filled forms or the active tab selection back to default. This shouldn't block the spec, but should be noted: the sync runtime action should either: +- Preserve `this.activeTab` before calling `this.display()`, or +- Not call `this.display()` at all (re-render only the runtime health section) + +**Fix:** Add a note to the implementation plan: `this.display()` reset is acceptable for a settings reopen, but the sync runtime flow should preserve `this.activeTab` across the re-render. 
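+
+A minimal sketch of that flow, assuming a `this.activeTab` field and the spec's `switchTab()` helper (the sync method name here is illustrative, not shipped code):
+
+```js
+// Sketch only: preserve the active tab across the full re-render
+// that display() performs.
+async _onSyncRuntime() {
+  const previousTab = this.activeTab;   // remember current selection
+  await this.plugin.syncRuntime();      // illustrative: the actual sync work
+  this.display();                       // rebuilds the settings UI from scratch
+  this.switchTab(previousTab);          // restore the previously active tab
+}
+```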
+ +--- + +_Reviewed: 2026-05-12T12:00:00Z_ +_Reviewer: VT-OS/OPENCODE (gsd-code-reviewer)_ +_Depth: deep_ diff --git a/docs/superpowers/specs/2026-05-12-plugin-settings-redesign.md b/docs/superpowers/specs/2026-05-12-plugin-settings-redesign.md new file mode 100644 index 0000000..1761f3a --- /dev/null +++ b/docs/superpowers/specs/2026-05-12-plugin-settings-redesign.md @@ -0,0 +1,195 @@ +# Plugin Settings Redesign — Tabbed Settings + Feature Toggles + +> **Date:** 2026-05-12 | **Research ref:** Claudian + obsidian-skills-manager + +## Architecture + +``` +Settings → PaperForge + ┌─────────────────┬───────────────────────────────┐ + │ [安装] │ [功能] │ + ├─────────────────┼───────────────────────────────┤ + │ Python 路径 │ Skills │ + │ PaddleOCR Key │ ├─ 系统技能 (per-agent dir) │ + │ Zotero 数据目录 │ │ ├─ 开关: toggle frontmatter│ + │ Agent 平台 │ │ ├─ 更新: GitHub semver │ + │ Agent Config路径 │ │ └─ 冻结: 锁版本 │ + │ ... │ └─ 用户技能 (自定义目录) │ + │ │ └─ 开关: toggle frontmatter│ + │ │ │ + │ │ Memory Layer │ + │ │ ├─ FTS5 搜索 │ + │ │ ├─ agent-context │ + │ │ └─ reading-log │ + │ │ │ + │ │ 向量数据库 │ + │ │ ├─ 开关: 启用/禁用 │ + │ │ ├─ 模式: 本地 • API │ + │ │ ├─ 本地: [安装模型] + 模型名 │ + │ │ └─ API: API Key │ + └─────────────────┴───────────────────────────────┘ +``` + +## Tab Implementation + +Follow Claudian pattern: custom tab bar with class-toggle, all content divs exist in DOM simultaneously. + +```typescript +// PaperforgeSettingsTab.ts +type SettingsTabId = 'setup' | 'features'; + +// -- render() -- +// 1. Tab bar +const tabBar = containerEl.createDiv({ cls: 'paperforge-settings-tabs' }); +const tabButtons = new Map(); +const tabContents = new Map(); + +// 2. For each tab: create button + content div +for (const [id, label] of [['setup','安装'], ['features','功能']]) { + const btn = tabBar.createEl('button', { cls: 'paperforge-settings-tab', text: label }); + btn.addEventListener('click', () => switchTab(id)); // toggles --active class + tabButtons.set(id, btn); + + const content = containerEl.createDiv({ cls: 'paperforge-settings-tab-content' }); + tabContents.set(id, content); +} + +// 3. Render each tab +renderSetupTab(tabContents.get('setup')); +renderFeaturesTab(tabContents.get('features')); + +// 4. Activate default tab +switchTab(this.activeTab); +``` + +CSS: `.paperforge-settings-tab-content { display: none; }` `.paperforge-settings-tab-content--active { display: block; }` + +## Section 1: Skills Management + +### System Skill Detection + +Scan vault-local agent skill directories (from `AGENT_SKILL_DIRS` mapping): + +``` +{vault}/.opencode/skills/literature-qa/SKILL.md +{vault}/.opencode/skills/literature-logging/SKILL.md (new) +{vault}/.claude/skills/literature-qa/SKILL.md +{vault}/.codex/skills/literature-qa/SKILL.md +... 
+```
+
+Each skill identified by `SKILL.md` frontmatter:
+```yaml
+name: literature-qa
+description: 学术文献库操作
+version: 1.5.5
+source: paperforge
+```
+
+### UI per skill row
+
+```
+┌─────────────────────────────────────────────────────┐
+│ [✓] literature-qa  v1.5.5            [更新] [冻结]  │
+│     学术文献库操作:精读、问答、检索                 │
+├─────────────────────────────────────────────────────┤
+│ [✓] literature-logging  v1.0.0       [更新] [冻结]  │
+│     阅读日志与工作日志管理                          │
+└─────────────────────────────────────────────────────┘
+```
+
+- **开关** (`[✓]`): writes `disable-model-invocation: true/false` into the `SKILL.md` frontmatter (same mechanism as obsidian-skills-manager)
+- **更新** (update): GitHub API `GET /repos/LLLin000/PaperForge/releases?per_page=25` → semver comparison → show the `[更新]` button when a newer release exists → re-download the skill files
+- **冻结** (freeze): written to plugin `data.json` → `frozen_skills: { "literature-qa": true }` → frozen skills show no update prompt
+
+### User Skill Detection
+
+```
+{vault}/.claude/skills/  (configurable path)
+```
+
+User skills identified by `SKILL.md` frontmatter field `source: user` (or no `source` field). Features:
+- **开关**: same `disable-model-invocation` toggle
+- No update/freeze controls
+
+### Source attribution
+
+Frontmatter discriminator for system vs user:
+```yaml
+# System skill
+source: paperforge    # → managed by plugin, has update button
+
+# User skill
+source: user          # → toggle only, no update
+# (or no source field) # → treated as user
+```
+
+**Implementation note:** Both existing SKILL.md files (`literature-qa`, `literature-logging`) must add `source: paperforge` to their frontmatter.
+
+## Section 2: Feature Toggles
+
+Memory Layer features as simple Obsidian toggles in plugin `data.json`:
+
+| Key | Default | Effect |
+|-----|---------|--------|
+| `features.fts_search` | `true` | whether `paperforge memory build` creates the FTS index |
+| `features.agent_context` | `true` | whether the `agent-context` command is allowed |
+| `features.reading_log` | `true` | whether the paper_events table is enabled |
+| `features.vector_db` | `false` | whether the vector retrieval module is enabled |
+
+When a feature is disabled, the corresponding CLI command returns a clear error message.
+
+**Implementation note:** CLI commands read `data.json` to check feature toggles (a sketch of this check follows the Data Storage section). If `data.json` is missing (user runs CLI outside Obsidian), features default to `true` (opt-out, not opt-in).
+
+## Section 3: Vector Database
+
+```
+┌─────────────────────────────────────────┐
+│ 向量数据库                     [启用]   │
+│                                         │
+│ 模式:  ● 本地   ○ API                  │
+│                                         │
+│ 本地模型: all-MiniLM-L6-v2              │
+│ 模型大小: 80 MB                         │
+│ 状态: ● 已安装 / ○ 未安装               │
+│ [安装模型]                              │
+│ [重新安装]                              │
+│                                         │
+│ API Key: ┌─────────────────────────────┐│
+│          │ sk-...                      ││
+│          └─────────────────────────────┘│
+│ 模型: text-embedding-3-small            │
+└─────────────────────────────────────────┘
+```
+
+Implementation notes:
+- Model installation: `pip install sentence-transformers` + trigger model download
+- Model path: stored in `data.json` under `features.vector_db.model_path`
+- API: uses existing `.env` PaddleOCR Key pattern, add `VECTOR_API_KEY`
+- `pip install` is async — show progress bar
+
+## Data Storage
+
+Plugin `data.json`:
+```json
+{
+  "features": {
+    "fts_search": true,
+    "agent_context": true,
+    "reading_log": true,
+    "vector_db": false
+  },
+  "vector_db_mode": "local",
+  "vector_db_model": "all-MiniLM-L6-v2",
+  "vector_db_api_key": "",
+  "frozen_skills": {}
+}
+```
+
+**Critical:** All new keys MUST be added to `DEFAULT_SETTINGS` in `main.js` (currently a whitelist of 8 keys in `saveSettings()`). Without this, toggles appear to work but vanish on vault reopen. 
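+
+For the CLI side, a hedged sketch of the toggle check from Section 2's implementation note (the `data.json` path follows the plugin layout above; `feature_enabled` is an illustrative name, not an existing helper):
+
+```python
+import json
+from pathlib import Path
+
+def feature_enabled(vault: Path, name: str) -> bool:
+    """Sketch: read a feature toggle from the plugin's data.json.
+
+    Defaults to True when data.json is absent or unreadable, per the
+    opt-out rule in Section 2 (CLI used outside Obsidian).
+    """
+    data_json = vault / ".obsidian" / "plugins" / "paperforge" / "data.json"
+    try:
+        settings = json.loads(data_json.read_text(encoding="utf-8"))
+    except (OSError, json.JSONDecodeError):
+        return True
+    return bool(settings.get("features", {}).get(name, True))
+```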
+
+Skill disable state in `SKILL.md` frontmatter (standard Agent Skills spec):
+```yaml
+disable-model-invocation: true
+```
diff --git a/docs/superpowers/specs/2026-05-12-vector-retrieval.md b/docs/superpowers/specs/2026-05-12-vector-retrieval.md
new file mode 100644
index 0000000..147ffc5
--- /dev/null
+++ b/docs/superpowers/specs/2026-05-12-vector-retrieval.md
@@ -0,0 +1,197 @@
+# Phase 7 — Vector Retrieval
+
+> **Date:** 2026-05-12 | **Depends on:** Memory Layer Phase 1-6
+
+## Overview
+
+Add semantic vector retrieval for OCR fulltext, built on ChromaDB with local embedding models.
+Optional module, disabled by default. Activated by user via plugin settings toggle.
+
+## Architecture
+
+```
+fulltext.md
+  ↓ drop ![[*]] image-link lines
+  ↓ replace inline image links with [Figure N]
+  ↓ split into pages at page markers
+  ↓ split each page into paragraphs at double newlines
+  ↓ group 2-3 paragraphs → 300-400 tokens/chunk, 1-paragraph overlap
+  ↓ section detection (rule-match IMRaD + Figure/Table)
+  ↓ embed with bge-small-en-v1.5 (384d)
+  ↓
+ChromaDB @ indexes/vectors/
+  ↓ paperforge retrieve "PEMF dose response" --json
+  ↓ top-5 chunks + 1 chunk before/after each (context padding)
+  ↓
+{ chunks: [{ paper_id, title, section, page, text, score }] }
+```
+
+## Dependencies
+
+```
+pip install chromadb sentence-transformers
+```
+
+Local model auto-downloads on first use (~130 MB for `bge-small-en-v1.5`).
+API mode uses `openai` package (already in deps).
+
+## Section Detection (Rule-based)
+
+Scan each paragraph for known section keywords:
+
+```
+Case-insensitive match, must appear as standalone short line (< 80 chars):
+
+Introduction | Methods | Materials | Results | Discussion
+Conclusion | Abstract | Background | References | Supplementary
+Figure \d+ | Fig\.? \d+ | Table \d+
+```
+
+Rules (priority order):
+1. Exact keyword match → section = matched text
+2. ALL CAPS short line → probable section title
+3. Short line, no period, surrounded by blank lines → probable section title
+4. Fallback: inherit from previous chunk in same page
+5. Default: "Text" (unclassified)
+
+## Local Model Options
+
+| Model ID | Dim | Size | Chinese | Speed |
+| -------------------------- | ---- | ----- | ------- | ----- |
+| `BAAI/bge-small-en-v1.5` | 384 | 130MB | [*] | Fast |
+| `sentence-transformers/all-MiniLM-L6-v2` | 384 | 80MB | — | Fast |
+| `BAAI/bge-base-en-v1.5` | 768 | 440MB | [*] | Medium |
+| `sentence-transformers/all-mpnet-base-v2` | 768 | 420MB | — | Medium |
+
+Model selection stored in `data.json` → `vector_db_model`.
+
+## API Mode
+
+```python
+# When vector_db_mode == "api":
+from openai import OpenAI
+client = OpenAI(api_key=api_key)
+embedding = client.embeddings.create(
+    model="text-embedding-3-small",
+    input=text
+)
+```
+
+API key from `data.json` → `vector_db_api_key` or fallback to `.env` `OPENAI_API_KEY`.
+Max 8191 tokens per call — chunking ensures we stay under limit.
+
+## ChromaDB Storage
+
+```
+/PaperForge/indexes/vectors/
+  ├── chroma.sqlite3
+  └── <collection-uuid>/  (Chroma internal)
+```
+
+Collection name: `paperforge_fulltext`.
+Metadata stored per chunk: `paper_id, citation_key, title, year, section, page, chunk_index, token_estimate`.
+
+## Commands
+
+### `paperforge embed build [--force]`
+
+1. Check `data.json` for `features.vector_db == true`
+2. Read `formal-library.json` for all papers with `ocr_status == "done"`
+3. For each paper: read `fulltext.md`, chunk (see the sketch below), embed, insert into ChromaDB
+4. If `--force`: delete existing collection, rebuild from scratch
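+
+A non-normative sketch of the chunking stage in step 3 (grouping and overlap follow the Architecture diagram; the ~4-chars-per-token estimate and the function name are assumptions):
+
+```python
+def chunk_page(paragraphs: list[str], target_tokens: int = 350) -> list[str]:
+    """Sketch: group 2-3 paragraphs per chunk with 1-paragraph overlap."""
+    chunks: list[str] = []
+    group: list[str] = []
+    fresh = 0  # paragraphs in `group` not yet emitted in any chunk
+    for para in paragraphs:
+        group.append(para)
+        fresh += 1
+        est_tokens = sum(len(p) for p in group) // 4  # rough ~4 chars/token
+        if len(group) == 3 or (len(group) >= 2 and est_tokens >= target_tokens):
+            chunks.append("\n\n".join(group))
+            group = group[-1:]  # keep last paragraph as overlap
+            fresh = 0
+    if fresh:  # flush only if the tail holds unemitted paragraphs
+        chunks.append("\n\n".join(group))
+    return chunks
+```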
+
+Returns PFResult:
+```json
+{
+  "ok": true,
+  "data": {
+    "papers_embedded": 21,
+    "chunks_embedded": 420,
+    "model": "BAAI/bge-small-en-v1.5",
+    "mode": "local"
+  }
+}
+```
+
+### `paperforge retrieve "<query>" --json [--limit N] [--no-expand]`
+
+1. Embed query with same model
+2. Query ChromaDB, get top-N chunks
+3. If expansion is enabled (the default; pass `--no-expand` to disable): fetch adjacent chunks (±1) for context
+4. Join with papers table for metadata
+
+Returns:
+```json
+{
+  "ok": true,
+  "data": {
+    "query": "PEMF dose response chondrocyte",
+    "chunks": [
+      {
+        "paper_id": "ABC123",
+        "citation_key": "aaronStimulation2004",
+        "title": "Stimulation of growth factor synthesis...",
+        "year": 2004,
+        "section": "Results",
+        "page": 6,
+        "chunk_text": "At 24h post-stimulation, chondrocyte proliferation increased...\n\n...",
+        "adjacent_before": "... (previous chunk, if expanded)",
+        "adjacent_after": "... (next chunk, if expanded)",
+        "score": 0.92
+      }
+    ],
+    "count": 5,
+    "model": "BAAI/bge-small-en-v1.5"
+  }
+}
+```
+
+### `paperforge embed status --json`
+
+Returns: db exists, collection exists, chunk count, model name, last build time.
+
+## Integration with Memory Layer
+
+### Memory build
+
+`paperforge memory build` does NOT trigger embed build. Vector DB is separate, user-controlled.
+
+### Incremental refresh
+
+`refresh_paper()` extended:
+```python
+def refresh_paper(vault, zotero_key):
+    # existing SQLite refresh...
+
+    # If vector DB enabled:
+    if vector_db_enabled(vault):
+        # Delete old chunks for this paper
+        collection.delete(where={"paper_id": zotero_key})
+        # Re-embed this paper
+        _embed_paper(vault, zotero_key)
+```
+
+Triggered after OCR completes (fulltext changes) or deep-finalize.
+
+## Files
+
+```
+Create:
+  paperforge/memory/vector_db.py   — ChromaDB init, embed, query, delete
+  paperforge/memory/chunker.py     — fulltext → chunks (rule-based)
+  paperforge/commands/embed.py     — CLI: embed build/status
+  paperforge/commands/retrieve.py  — CLI: retrieve
+
+Modify:
+  paperforge/memory/refresh.py     — add vector refresh hook
+  paperforge/cli.py                — register embed + retrieve
+```
+
+## Constraints
+
+1. Optional — disabled until user enables in settings
+2. Requires `pip install chromadb sentence-transformers` (user installs or plugin offers button)
+3. Windows compatible (ChromaDB embedded mode works on Windows)
+4. `paperforge.db` remains source of truth; ChromaDB is deletable and rebuildable
+5. No GPU required; CPU embedding for 150 papers takes ~30 seconds
+6. API mode: respects rate limits, batches chunks to minimize API calls
diff --git a/docs/superpowers/specs/2026-05-14-dashboard-copy-metadata-design.md b/docs/superpowers/specs/2026-05-14-dashboard-copy-metadata-design.md
new file mode 100644
index 0000000..9d40e07
--- /dev/null
+++ b/docs/superpowers/specs/2026-05-14-dashboard-copy-metadata-design.md
@@ -0,0 +1,164 @@
+# Dashboard Copy Interaction + Per-Paper Metadata Enhancement
+
+> **Status:** Spec complete, awaiting implementation
+> **Date:** 2026-05-14
+> **Scope:** Plugin JS + CSS only (main.js + styles.css)
+
+## Goal
+
+Three UX improvements to the per-paper dashboard view:
+1. **Click-to-copy** for discrete metadata fields (single click → clipboard)
+2. **Text-selectable** for prose content areas (normal browser selection + copy)
+3. 
**Metadata enhancement** — add Journal / DOI / Zotero Key / Collection Path in a compact inline row below authors/year + +--- + +## Design + +### Per-Paper View Layout (after changes) + +``` +┌────────────────────────────────────────────────────────────────┐ +│ │ +│ Efficacy of TXA in Reducing Blood Loss... [📋] │ ← Title, click-to-copy, copy icon on hover +│ Tianli Xia, Hiroyasu Konno, Jeonghyun Ahn · 2016 │ ← Authors (click-to-copy) · Year +│ │ +│ Cancer Research · DOI: 10.1158/... · Zotero: ABCDEFG 🔍 │ ← NEW meta-line (Zotero-style) +│ 📂 Orthopedics / Spine │ ← NEW collection path (click-to-copy) +│ │ +│ [PDF] [Fulltext] [OCR done] [Deep-read pending] │ ← existing status pills + file buttons +│ │ +│ ## 🔍 精读 (article overview — text-selectable) │ ← existing, keep selectable +│ ## 💬 Discussion (Q&A — text-selectable) │ ← existing, keep selectable +│ ▶ Technical Details (health, paths — click-to-copy fields) │ ← existing, add copy-on-click +│ │ +└────────────────────────────────────────────────────────────────┘ +``` + +### Meta-Line CSS (from user-provided reference) + +```css +/* Metadata inline row — Zotero style */ +.paperforge-meta-line { + margin-top: 8px; + font-size: 13px; + color: var(--text-muted); + display: flex; + flex-wrap: wrap; + gap: 6px 10px; + align-items: center; +} + +.paperforge-meta-item { + white-space: nowrap; +} + +.paperforge-meta-key { + color: var(--text-faint); + margin-right: 4px; +} + +.paperforge-meta-value { + color: var(--text-muted); +} + +.paperforge-meta-value.mono { + font-family: var(--font-monospace); +} + +/* Clickable fields */ +.paperforge-meta-value.clickable, +.paperforge-click-copy { + cursor: pointer; + border-bottom: 1px dashed var(--text-faint); + transition: color 0.15s, border-color 0.15s; +} + +.paperforge-meta-value.clickable:hover, +.paperforge-click-copy:hover { + color: var(--text-accent); + border-bottom-color: var(--text-accent); +} + +/* Feedback flash on copy */ +.paperforge-copied { + color: var(--text-accent) !important; + border-bottom-color: var(--text-success) !important; +} +``` + +### Interaction Rules + +| Field | Type | Click behavior | +| ------------------ | -------- | -------------------------------------------------- | +| Title | Copy | Click → copy full title; copy icon appears on hover | +| Authors | Copy | Click → copy author string | +| Journal | Display | Read-only, no copy | +| DOI | Copy | Click → copy DOI; also link icon to doi.org | +| Zotero Key | Copy | Click → copy key (monospace) | +| Collection Path | Copy | Click → copy pipe-joined path | +| PMID (if present) | Copy | Click → copy PMID | +| Note Path | Copy | Inside Technical Details; click → copy path | +| Fulltext Path | Copy | Inside Technical Details; click → copy path | +| Article Overview | Select | Normal text selection, no click-to-copy | +| Recent Discussion | Select | Normal text selection, no click-to-copy | +| Technical Details | Mixed | Paths are click-to-copy; status text is selectable | + +### Copy Feedback + +On click → `navigator.clipboard.writeText(value)`. Brief inline feedback: text briefly turns accent color (1s), then restores. No icon, no tooltip. 
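+
+A minimal sketch of the helper behind this interaction (Obsidian's `addClass`/`removeClass`/`setText` element helpers assumed; the name matches Task 3 below, and the "Copied!" swap follows that task):
+
+```js
+// Sketch: click-to-copy with a brief flash, per the feedback rule above.
+_makeClickCopy(el, value, displayText) {
+  el.addClass('paperforge-click-copy');
+  el.addEventListener('click', async () => {
+    await navigator.clipboard.writeText(value);
+    el.setText('Copied!');              // transient label (see Task 3)
+    el.addClass('paperforge-copied');   // accent flash via the CSS above
+    setTimeout(() => {
+      el.setText(displayText);          // restore the original text
+      el.removeClass('paperforge-copied');
+    }, 1000);
+  });
+}
+```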
+ +--- + +## Implementation Tasks + +### Task 1: Add CSS to styles.css +- Add `.paperforge-meta-line`, `.paperforge-meta-item`, `.paperforge-meta-key`, `.paperforge-meta-value` rules +- Add `.paperforge-click-copy` + `.paperforge-copy-icon` hover rules +- Ensure existing content areas have no `user-select: none` + +### Task 2: Render meta-line in _renderPaperMode +- File: `paperforge/plugin/main.js`, in `PaperForgeStatusView._renderPaperMode()` +- After authors/year rendering (~line 1591), insert meta-line div +- Fields: Journal · DOI: xxx · Zotero: xxx · Collection: xxx +- Source data: `entry.journal`, `entry.doi`, `entry.zotero_key`, `entry.collection_path`, `entry.pmid` +- Add `paperforge-meta-value mono clickable` class to DOI, Zotero Key, PMID + +### Task 3: Implement click-to-copy helper +- Add `_makeClickCopy(el, value, displayText)` method to `PaperForgeStatusView` + - Sets cursor:pointer, dashed border, onclick handler + - On click: copy value, change text to "Copied!", setTimeout restore displayText +- Apply to: title, authors, DOI, zotero_key, collection_path, pmid + +### Task 4: Apply click-to-copy to Technical Details +- File: `paperforge/plugin/main.js`, `_renderPaperTechnicalDetails()` +- Make Note Path and Fulltext Path clickable +- Use same `_makeClickCopy` helper + +### Task 5: Verify text selection behavior +- Confirm article overview, recent discussion, and tech details body text are NOT `user-select: none` +- Remove any existing `user-select: none` from content areas (but keep on buttons/toggles) + +--- + +## Files to Modify + +| File | Changes | +| ------------------------- | ---------------------------------------------------------- | +| `paperforge/plugin/styles.css` | ~40 lines: meta-line + click-copy + copy-icon CSS | +| `paperforge/plugin/main.js` | ~60 lines: meta-line rendering + _makeClickCopy() + wiring | + +--- + +## Acceptance Criteria + +- [ ] Meta-line appears below authors/year in per-paper view: Journal · DOI · Zotero · Collection +- [ ] DOI and Zotero Key are monospace, dashed-underline on hover, click to copy +- [ ] Title dashed-underline on hover, click to copy +- [ ] Authors click to copy +- [ ] Collection path click to copy +- [ ] Note Path / Fulltext Path in Technical Details click to copy +- [ ] Article overview and Recent Discussion text remains freely selectable +- [ ] Brief color-flash feedback on copy (no icon, no tooltip) +- [ ] No regressions in global or collection modes +- [ ] Works in both light and dark Obsidian themes diff --git a/fixtures/methodology/METHODOLOGY_COMPACT.md b/fixtures/methodology/METHODOLOGY_COMPACT.md new file mode 100644 index 0000000..eb45f4f --- /dev/null +++ b/fixtures/methodology/METHODOLOGY_COMPACT.md @@ -0,0 +1,20 @@ +# PaperForge Methodology Compact + +## General +- Separate source fact, interpretation, and intended use. +- Prior reading-log is not verified fact; re-check source before reuse. +- When user corrects a judgment, record the correction if relevant. + +## Literature work +- Do not collapse heterogeneous studies without comparing model, parameter, endpoint, and measurement layer. +- Distinguish device-level settings from local biological exposure. +- Confirm within-study internal chain (material->output->effect) before making cross-study claims. + +## Clinical research +- Separate candidate variables, selected variables, final model variables, and sensitivity variables. +- Do not infer causality from predictive variables. + +## Writing +- Do not write unsupported claims. 
Every factual claim must have a source reference. +- Prefer bounded conclusions over broad overclaims. +- Distinguish "the paper says X" from "I infer Y from X". diff --git a/fixtures/snapshots/formal_note_frontmatter/orthopedic_article.yaml b/fixtures/snapshots/formal_note_frontmatter/orthopedic_article.yaml index 503aa72..f44f7d1 100644 --- a/fixtures/snapshots/formal_note_frontmatter/orthopedic_article.yaml +++ b/fixtures/snapshots/formal_note_frontmatter/orthopedic_article.yaml @@ -1,4 +1,5 @@ zotero_key: FIXT0001 +citation_key: FIXT0001 domain: orthopedic title: "Biomechanical Comparison of Suture Anchor Fixations in Rotator Cuff Repair" year: "2024" diff --git a/manifest.json b/manifest.json index b535c32..21ba972 100644 --- a/manifest.json +++ b/manifest.json @@ -1,9 +1,9 @@ { "id": "paperforge", "name": "PaperForge", - "version": "1.5.5", + "version": "1.5.6rc1", "minAppVersion": "1.9.0", - "description": "PaperForge — Zotero literature pipeline. Sync PDFs, run OCR, and read with AI-assisted deep reading.", + "description": "Zotero literature pipeline for Obsidian. Sync PDFs, run OCR, and read with AI-assisted deep reading.", "author": "Lin Zhaoxuan", "authorUrl": "https://github.com/LLLin000", "isDesktopOnly": true diff --git a/paperforge/__init__.py b/paperforge/__init__.py index 8f670b8..85369c3 100644 --- a/paperforge/__init__.py +++ b/paperforge/__init__.py @@ -1,3 +1,3 @@ """paperforge — PaperForge package.""" -__version__ = "1.5.5" +__version__ = "1.5.6rc1" diff --git a/paperforge/adapters/bbt.py b/paperforge/adapters/bbt.py index 7aac2d9..155d3c8 100644 --- a/paperforge/adapters/bbt.py +++ b/paperforge/adapters/bbt.py @@ -154,6 +154,25 @@ def resolve_item_collection_paths(item: dict, collection_lookup: dict) -> list[s return sorted({path for path in paths if path}, key=lambda value: (-value.count("/"), value)) +def extract_citation_key(item: dict) -> str: + """Extract the Better BibTeX citation key from a BBT JSON item. + + BBT stores the generated citation key as a top-level ``citationKey`` field, + e.g. ``aaronStimulationGrowthFactor2004``. Falls back to the Extra field. 
+    """
+    ck = item.get("citationKey", "")
+    if ck:
+        return ck
+    extra = item.get("extra", "")
+    if not extra:
+        return ""
+    for line in extra.splitlines():
+        stripped = line.strip()
+        if stripped.lower().startswith("citation key:"):
+            return stripped.split(":", 1)[1].strip()
+    return ""
+
+
 def load_export_rows(path: Path) -> list[dict]:
     data = read_json(path)
     if isinstance(data, list):
@@ -199,6 +218,7 @@
             "creators": item.get("creators", []),
             "abstract": item.get("abstractNote", ""),
             "journal": item.get("publicationTitle", ""),
+            "citation_key": extract_citation_key(item),
             "extra": item.get("extra", ""),
             "year": extract_year(item.get("date", "")),
             "date": item.get("date", ""),
diff --git a/paperforge/cli.py b/paperforge/cli.py
index d46b5e9..e289d44 100644
--- a/paperforge/cli.py
+++ b/paperforge/cli.py
@@ -258,6 +258,83 @@ def build_parser() -> argparse.ArgumentParser:
     p_dash = sub.add_parser("dashboard", help="Aggregated stats and permissions for the plugin dashboard")
     p_dash.add_argument("--json", action="store_true", help="Output as PFResult JSON")
 
+    # Vector DB
+    p_embed = sub.add_parser("embed", help="Vector embedding operations")
+    p_embed_sp = p_embed.add_subparsers(dest="embed_subcommand", required=True)
+    p_embed_build = p_embed_sp.add_parser("build", help="Build vector index from OCR fulltext")
+    p_embed_build.add_argument("--json", action="store_true")
+    p_embed_build.add_argument("--force", action="store_true")
+    p_embed_status = p_embed_sp.add_parser("status", help="Check vector DB status")
+    p_embed_status.add_argument("--json", action="store_true")
+
+    p_retrieve = sub.add_parser("retrieve", help="Semantic search across OCR fulltext")
+    p_retrieve.add_argument("query", help="Search query")
+    p_retrieve.add_argument("--json", action="store_true")
+    p_retrieve.add_argument("--limit", type=int, default=5)
+    p_retrieve.add_argument("--expand", action=argparse.BooleanOptionalAction, default=True, help="Fetch adjacent chunks for context (disable with --no-expand)")
+
+    # Memory Layer commands
+    p_memory = sub.add_parser("memory", help="Manage the Memory Layer")
+    p_memory_sp = p_memory.add_subparsers(dest="memory_subcommand", required=True)
+    p_memory_build = p_memory_sp.add_parser("build", help="Build the memory database from canonical index")
+    p_memory_build.add_argument("--json", action="store_true", help="Output as JSON")
+    p_memory_status = p_memory_sp.add_parser("status", help="Check memory database status")
+    p_memory_status.add_argument("--json", action="store_true", help="Output as JSON")
+
+    p_paper_status = sub.add_parser("paper-status", help="Look up a paper's status")
+    p_paper_status.add_argument("query", help="Paper identifier (zotero_key, DOI, title, alias)")
+    p_paper_status.add_argument("--json", action="store_true", help="Output as JSON")
+
+    p_pc = sub.add_parser("paper-context", help="Get full context for a paper (metadata + reading notes + corrections)")
+    p_pc.add_argument("key", help="Zotero key")
+    p_pc.add_argument("--json", action="store_true", help="Output as JSON")
+
+    p_rl = sub.add_parser("reading-log", help="Record or export reading notes")
+    p_rl.add_argument("--write", dest="paper_id", help="Write note for this zotero_key")
+    p_rl.add_argument("--section", help="Section (e.g. 
Discussion P12)") + p_rl.add_argument("--excerpt", help="Quoted excerpt") + p_rl.add_argument("--usage", help="How this supports the current writing") + p_rl.add_argument("--note", help="Optional cross-validation note") + p_rl.add_argument("--context", help="Full paragraph containing excerpt") + p_rl.add_argument("--tags", help="Comma-separated tags") + p_rl.add_argument("--project", help="Associated project name") + p_rl.add_argument("--render", action="store_true", help="Render reading-log.md for one or all projects") + p_rl.add_argument("--correct", dest="correct_id", help="ID of prior reading note to correct") + p_rl.add_argument("--correction", help="Correction text") + p_rl.add_argument("--reason", help="Reason for correction (e.g. 'Rechecked figure legend')") + p_rl.add_argument("--since", help="Export notes since date (YYYY-MM-DD)") + p_rl.add_argument("--limit", type=int, default=50, help="Max notes to export") + p_rl.add_argument("--output", help="Write markdown to file") + p_rl.add_argument("--validate", help="Validate a reading-log.md file") + p_rl.add_argument("--import", dest="import_file", help="Import reading-log.md into paper_events") + p_rl.add_argument("--lookup", help="Look up all reading notes for a paper key") + p_rl.add_argument("--json", action="store_true", help="Output as JSON") + + p_pl = sub.add_parser("project-log", help="Record or render project work logs") + p_pl.add_argument("--write", action="store_true", help="Write a new project log entry") + p_pl.add_argument("--payload", help="JSON payload for the entry") + p_pl.add_argument("--project", help="Project name (required for write/list/render)") + p_pl.add_argument("--list", action="store_true", help="List all entries for a project") + p_pl.add_argument("--render", action="store_true", help="Render project-log.md") + p_pl.add_argument("--limit", type=int, default=50, help="Max entries to list") + p_pl.add_argument("--json", action="store_true", help="Output as PFResult JSON") + + p_search = sub.add_parser("search", help="Full-text search across the library") + p_search.add_argument("query", help="Search query (supports FTS5 syntax)") + p_search.add_argument("--json", action="store_true", help="Output as JSON") + p_search.add_argument("--limit", type=int, default=20, help="Max results") + p_search.add_argument("--domain", help="Filter by domain") + p_search.add_argument("--year-from", type=int, help="Filter by year (inclusive)") + p_search.add_argument("--year-to", type=int, help="Filter by year (inclusive)") + p_search.add_argument("--ocr", choices=["done","pending","failed","processing"], help="Filter by OCR status") + p_search.add_argument("--deep", choices=["done","pending"], help="Filter by deep reading status") + p_search.add_argument("--lifecycle", choices=["indexed","pdf_ready","fulltext_ready","deep_read_done"], help="Filter by lifecycle") + p_search.add_argument("--next-step", choices=["sync","ocr","/pf-deep","ready"], help="Filter by next step") + + # agent-context + p_ac = sub.add_parser("agent-context", help="Generate agent bootstrap context") + p_ac.add_argument("--json", action="store_true", help="Output as JSON") + # base-refresh p_base = sub.add_parser("base-refresh", help="Refresh Obsidian Base view files") p_base.add_argument( @@ -470,6 +547,51 @@ def main(argv: list[str] | None = None) -> int: return dashboard.run(args) + if args.command == "memory": + from paperforge.commands.memory import run + + return run(args) + + if args.command == "embed": + from paperforge.commands.embed import run 
+ + return run(args) + + if args.command == "retrieve": + from paperforge.commands.retrieve import run + + return run(args) + + if args.command == "paper-status": + from paperforge.commands.paper_status import run + + return run(args) + + if args.command == "paper-context": + from paperforge.commands.paper_context import run + + return run(args) + + if args.command == "reading-log": + from paperforge.commands.reading_log import run + + return run(args) + + if args.command == "project-log": + from paperforge.commands.project_log import run + + return run(args) + + if args.command == "search": + from paperforge.commands.search import run + + return run(args) + + if args.command == "agent-context": + from paperforge.commands.agent_context import run + + return run(args) + if args.command == "base-refresh": force = getattr(args, "force", False) paths = args.paths @@ -544,11 +666,11 @@ def _cmd_paths(vault: Path, args: argparse.Namespace) -> int: if args.json: # Output only the keys required by D-Path Output contract - output_keys = {"vault", "worker_script", "ld_deep_script"} + output_keys = {"vault", "worker_script", "pf_deep_script"} filtered = {k: v for k, v in all_paths.items() if k in output_keys} filtered["vault"] = str(vault.resolve()) filtered["worker_script"] = str(paths["worker_script"].resolve()) - filtered["ld_deep_script"] = str(paths["ld_deep_script"].resolve()) + filtered["pf_deep_script"] = str(paths["pf_deep_script"].resolve()) print(json.dumps(filtered, ensure_ascii=False, indent=2)) else: for key, path_str in sorted(all_paths.items()): diff --git a/paperforge/command_files/pf-log-reading.md b/paperforge/command_files/pf-log-reading.md new file mode 100644 index 0000000..f93745a --- /dev/null +++ b/paperforge/command_files/pf-log-reading.md @@ -0,0 +1,33 @@ +# /pf-log-reading — Record a reading note + +> 读完当前段落或章节后自动记录到 paperforge.db 的 paper_events 表。 + +## Agent Workflow + +1. 确定 zotero_key (从上下文或 formal note 中获取) +2. 提取以下信息: + - **section**: 文献中的位置 (e.g. "Discussion P12", "Results Fig.3") + - **excerpt**: 逐字引用的原文关键句 + - **usage**: 这个信息支持当前写作的哪个论点 + - **note**: 任何交叉验证/矛盾/注意事项 (optional) + +3. 执行: +```bash +paperforge reading-log --write \ + --section "Discussion P12" \ + --excerpt "the fundamental disjunction between materials science and biology" \ + --usage "F 段 gap 论点" \ + --note "与 DDGMQ7RW 独立诊断同一问题" +``` + +## Prompt Injection + +After reading a section or paragraph from a paper: + +**Record a reading note.** Determine the zotero_key of the paper you just read. Extract the section name (e.g. "Discussion P12", "Results Fig.3"), a verbatim excerpt of the key sentence, how this supports the current writing task, and any cross-validation notes. Then run: + +``` +paperforge --vault {vault_path} reading-log --write KEY --section "..." --excerpt "..." --usage "..." --note "..." +``` + +If the user's vault path is unknown, ask before running. diff --git a/paperforge/command_files/pf-log-session.md b/paperforge/command_files/pf-log-session.md new file mode 100644 index 0000000..335b8b0 --- /dev/null +++ b/paperforge/command_files/pf-log-session.md @@ -0,0 +1,54 @@ +# /pf-log-session — Summarize session decisions to working-log + +> 会话结束时回顾本次所有决策节点,追加到 working-log.md。 + +## Agent Workflow + +1. 回顾本次会话中所有关键节点: + - 用户纠正了什么 + - 方案怎么变的 + - 有什么弯路和教训 + - 可复用的方法论 + +2. 按以下格式生成 markdown,追加到 working-log.md: + +```markdown +## — <小节名> + +### 核心决策 +- 做了什么、为什么 + +### 弯路与修正 +- 错误方向 → 用户纠正 → 最终方案 + +### 可复用方法论 +- 本段的 pattern,后续段落能怎么用 + +### 待办 +- [ ] ... +``` + +3. 
询问用户确认,然后写入到 `Project//working-log.md` + +## Prompt Injection + +At the end of this session, before saying goodbye: + +**Write the working-log entry.** Review all decision points, corrections, dead ends, and methodological insights from this session. Ask the user: "Should I write the working-log entry now?" If yes, generate the entry in the format below and append it to the appropriate working-log.md in the user's project directory. Ask the user to confirm the project path if unsure. + +Format: +``` +## YYYY-MM-DD — Section Name + +### Core Decisions +- What happened and why + +### Dead Ends & Corrections +- Wrong direction -> User correction -> Final approach + +### Reusable Methodology +- Patterns that apply to later sections + +### TODO +- [ ] ... +``` diff --git a/paperforge/commands/__init__.py b/paperforge/commands/__init__.py index 63dc3ad..e60656a 100644 --- a/paperforge/commands/__init__.py +++ b/paperforge/commands/__init__.py @@ -10,6 +10,12 @@ "context": "paperforge.commands.context", "dashboard": "paperforge.commands.dashboard", "finalize": "paperforge.commands.finalize", + "memory": "paperforge.commands.memory", + "embed": "paperforge.commands.embed", + "retrieve": "paperforge.commands.retrieve", + "paper-status": "paperforge.commands.paper_status", + "agent-context": "paperforge.commands.agent_context", + "reading-log": "paperforge.commands.reading_log", } diff --git a/paperforge/commands/agent_context.py b/paperforge/commands/agent_context.py new file mode 100644 index 0000000..4ad294b --- /dev/null +++ b/paperforge/commands/agent_context.py @@ -0,0 +1,96 @@ +from __future__ import annotations + +import argparse +import sys + +from paperforge.core.errors import ErrorCode +from paperforge.core.result import PFError, PFResult +from paperforge.memory.context import get_agent_context +from paperforge import __version__ as PF_VERSION + +COMMANDS = { + "paper-status": { + "usage": "paperforge paper-status --json", + "purpose": "Look up one paper's full status and recommended next action", + }, + "search": { + "usage": "paperforge search --json [--collection NAME] [--domain NAME] [--ocr done|pending] [--year-from N] [--year-to N] [--limit N]", + "purpose": "Full-text search with optional collection/domain/lifecycle filters", + }, + "retrieve": { + "usage": "paperforge retrieve --json [--limit N]", + "purpose": "Search OCR fulltext chunks for evidence paragraphs (coming soon)", + }, + "deep": { + "usage": "/pf-deep ", + "purpose": "Full three-pass deep reading with chart analysis", + }, + "ocr": { + "usage": "/pf-ocr", + "purpose": "Run OCR on papers marked do_ocr:true", + }, + "sync": { + "usage": "/pf-sync", + "purpose": "Sync Zotero and regenerate formal notes + index", + }, +} + +RULES = [ + "Use paperforge.db via CLI commands before reading individual files.", + "Do not infer paper state from stale frontmatter when memory status is fresh.", + "Read source files only after resolving candidates via paper-status or search.", + "To locate a paper: start with collection scope if known, then expand to full library search.", +] + + +def run(args: argparse.Namespace) -> int: + vault = args.vault_path + + context = get_agent_context(vault) + if context is None: + result = PFResult( + ok=False, + command="agent-context", + version=PF_VERSION, + error=PFError( + code=ErrorCode.PATH_NOT_FOUND, + message="Memory database not found or query failed. 
Run paperforge memory build.", + ), + ) + if args.json: + print(result.to_json()) + else: + print(f"Error: {result.error.message}", file=sys.stderr) + return 1 + + data = { + "paperforge": { + "version": PF_VERSION, + "vault": str(vault), + "memory_db": "ready", + }, + "library": context["library"], + "collections": context["collections"], + "commands": COMMANDS, + "rules": RULES, + } + + result = PFResult( + ok=True, + command="agent-context", + version=PF_VERSION, + data=data, + ) + + if args.json: + print(result.to_json()) + else: + lib = data["library"] + print(f"Papers: {lib['paper_count']} total") + print(f"Domains: {lib['domain_counts']}") + print(f"Lifecycle: {lib['lifecycle_counts']}") + for c in data.get("collections", []): + subs = f" ({len(c['sub'])} sub)" if c["sub"] else "" + print(f" [{c['count']:3}] {c['name']}{subs}") + + return 0 if result.ok else 1 diff --git a/paperforge/commands/dashboard.py b/paperforge/commands/dashboard.py index 68b0e79..bbf8771 100644 --- a/paperforge/commands/dashboard.py +++ b/paperforge/commands/dashboard.py @@ -51,12 +51,85 @@ def run(args) -> int: return 1 -def _gather_dashboard_data(vault: Path) -> dict: - """Gather stats and permissions for dashboard display.""" +def _dashboard_from_db(vault: Path) -> dict | None: + """Build dashboard stats from paperforge.db. Returns None if DB unavailable.""" + from pathlib import Path as _P + db_path = vault / "System" / "PaperForge" / "indexes" / "paperforge.db" + if not db_path.exists(): + return None + try: + import sqlite3 + conn = sqlite3.connect(str(db_path)) + conn.row_factory = sqlite3.Row + # Aggregate stats via single GROUP BY + rows = conn.execute(""" + SELECT has_pdf, + CASE WHEN ocr_status='done' THEN 'done' + WHEN ocr_status IN ('failed','blocked') THEN 'failed' + ELSE 'pending' END as ocr, + COUNT(*) as cnt + FROM papers GROUP BY has_pdf, ocr + """).fetchall() + total = sum(r["cnt"] for r in rows) + pdf_healthy = sum(r["cnt"] for r in rows if r["has_pdf"] == 1 and r["ocr"] != "failed") + pdf_missing = sum(r["cnt"] for r in rows if r["has_pdf"] == 0) + pdf_broken = total - pdf_healthy - pdf_missing + ocr_done = sum(r["cnt"] for r in rows if r["ocr"] == "done") + ocr_failed = sum(r["cnt"] for r in rows if r["ocr"] == "failed") + ocr_pending = total - ocr_done - ocr_failed + # Domain counts + rows = conn.execute("SELECT domain, COUNT(*) as cnt FROM papers GROUP BY domain").fetchall() + domain_counts = {r["domain"]: r["cnt"] for r in rows} + conn.close() + return { + "stats": { + "papers": total, + "pdf_health": {"healthy": pdf_healthy, "missing": pdf_missing, "broken": pdf_broken}, + "ocr_health": {"pending": ocr_pending, "done": ocr_done, "failed": ocr_failed}, + "domain_counts": domain_counts, + }, + } + except Exception: + return None + + +def _check_permissions(vault: Path) -> dict: + """Check sync/OCR/context permissions (lightweight filesystem check).""" + cfg = load_vault_config(vault) + paths = paperforge_paths(vault, cfg) + + export_files = sorted(paths["exports"].glob("*.json")) if paths["exports"].exists() else [] + can_sync = len(export_files) > 0 + + paddle_token = ( + os.environ.get("PADDLEOCR_API_TOKEN") or os.environ.get("PADDLEOCR_API_KEY") or os.environ.get("OCR_TOKEN") + ) + can_ocr = bool(paddle_token) + + can_copy_context = False + pf_dir = paths.get("paperforge", vault / cfg["system_dir"] / "PaperForge") + if pf_dir.exists(): + try: + pf_dir.parent.mkdir(parents=True, exist_ok=True) + test_file = pf_dir / ".write_test" + test_file.touch() + test_file.unlink() + 
can_copy_context = True + except (OSError, PermissionError): + pass + + return { + "can_sync": can_sync, + "can_ocr": can_ocr, + "can_copy_context": can_copy_context, + } + + +def _dashboard_from_files(vault: Path) -> dict: + """Gather stats and permissions by scanning literature files.""" cfg = load_vault_config(vault) paths = paperforge_paths(vault, cfg) - # ── Papers / formal note count ── _skip_names = {"fulltext.md", "deep-reading.md", "discussion.md"} record_count = 0 if paths["literature"].exists(): @@ -64,7 +137,6 @@ def _gather_dashboard_data(vault: Path) -> dict: if p.name not in _skip_names: record_count += 1 - # ── Domain counts (first-level subdirs under literature) ── domain_counts: dict[str, int] = {} if paths["literature"].exists(): for domain_dir in sorted(paths["literature"].iterdir()): @@ -73,7 +145,6 @@ def _gather_dashboard_data(vault: Path) -> dict: if count > 0: domain_counts[domain_dir.name] = count - # ── PDF health & OCR health from frontmatter ── pdf_healthy = 0 pdf_broken = 0 pdf_missing = 0 @@ -95,7 +166,6 @@ def _gather_dashboard_data(vault: Path) -> dict: except Exception: continue - # PDF health path_error_m = _path_error_pat.search(text) if path_error_m: error_type = path_error_m.group(1) @@ -106,7 +176,6 @@ def _gather_dashboard_data(vault: Path) -> dict: elif _pdf_path_pat.search(text): pdf_healthy += 1 - # OCR health ocr_status_m = _ocr_status_pat.search(text) if ocr_status_m: status = ocr_status_m.group(1).strip().lower().strip('"') @@ -119,27 +188,6 @@ def _gather_dashboard_data(vault: Path) -> dict: elif _do_ocr_pat.search(text): ocr_pending += 1 - # ── Permissions ── - export_files = sorted(paths["exports"].glob("*.json")) if paths["exports"].exists() else [] - can_sync = len(export_files) > 0 - - paddle_token = ( - os.environ.get("PADDLEOCR_API_TOKEN") or os.environ.get("PADDLEOCR_API_KEY") or os.environ.get("OCR_TOKEN") - ) - can_ocr = bool(paddle_token) - - can_copy_context = False - pf_dir = paths.get("paperforge", vault / cfg["system_dir"] / "PaperForge") - if pf_dir.exists(): - try: - pf_dir.parent.mkdir(parents=True, exist_ok=True) - test_file = pf_dir / ".write_test" - test_file.touch() - test_file.unlink() - can_copy_context = True - except (OSError, PermissionError): - pass - return { "stats": { "papers": record_count, @@ -155,9 +203,17 @@ def _gather_dashboard_data(vault: Path) -> dict: }, "domain_counts": domain_counts, }, - "permissions": { - "can_sync": can_sync, - "can_ocr": can_ocr, - "can_copy_context": can_copy_context, - }, + "permissions": _check_permissions(vault), } + + +def _gather_dashboard_data(vault: Path) -> dict: + # Try DB first + data = _dashboard_from_db(vault) + if data is not None: + data["permissions"] = _check_permissions(vault) + return data + # Fallback to file scanning + data = _dashboard_from_files(vault) + data["permissions"] = _check_permissions(vault) + return data diff --git a/paperforge/commands/embed.py b/paperforge/commands/embed.py new file mode 100644 index 0000000..c114550 --- /dev/null +++ b/paperforge/commands/embed.py @@ -0,0 +1,114 @@ +from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +from paperforge.core.errors import ErrorCode +from paperforge.core.result import PFError, PFResult +from paperforge.memory.chunker import chunk_fulltext +from paperforge.memory.vector_db import ( + delete_paper_vectors, + embed_paper, + get_embed_status, + get_vector_db_path, +) +from paperforge.worker.asset_index import read_index +from paperforge.worker.vector_db import 
_preflight_check +from paperforge import __version__ as PF_VERSION + + +def run(args: argparse.Namespace) -> int: + vault = args.vault_path + sub = getattr(args, "embed_subcommand", "build") + + if sub == "status": + status = get_embed_status(vault) + result = PFResult(ok=True, command="embed status", version=PF_VERSION, data=status) + if args.json: + print(result.to_json()) + else: + for k, v in status.items(): + print(f" {k}: {v}") + return 0 + + # Build + + # Read plugin settings for preflight + settings: dict = {} + dc_json = vault / ".obsidian" / "plugins" / "paperforge" / "data.json" + if dc_json.exists(): + try: + import json + + settings = json.loads(dc_json.read_text(encoding="utf-8")) + except Exception: + pass + + preflight = _preflight_check(vault, settings) + if not preflight["ok"]: + result = PFResult( + ok=False, + command="embed-build", + version=PF_VERSION, + error=PFError(code=ErrorCode.VALIDATION_ERROR, message=preflight["error"]), + data={"fix": preflight.get("fix", "")}, + ) + if args.json: + print(result.to_json()) + else: + print(f"Error: {preflight['error']}", file=sys.stderr) + print(f"Fix: {preflight['fix']}", file=sys.stderr) + return 1 + + envelope = read_index(vault) + if not envelope: + result = PFResult(ok=False, command="embed build", version=PF_VERSION, + error=PFError(code=ErrorCode.PATH_NOT_FOUND, + message="Canonical index not found. Run paperforge sync first.")) + print(result.to_json() if args.json else result.error.message, file=sys.stderr if not args.json else sys.stdout) + return 1 + + items = envelope if isinstance(envelope, list) else envelope.get("items", []) + done_papers = [e for e in items if e.get("ocr_status") == "done"] + + if args.force: + db_path = get_vector_db_path(vault) + if db_path.exists(): + import shutil + shutil.rmtree(str(db_path), ignore_errors=True) + + papers_embedded = 0 + chunks_embedded = 0 + for entry in done_papers: + key = entry.get("zotero_key") + fulltext_rel = entry.get("fulltext_path", "") + if not fulltext_rel: + continue + fulltext_path = vault / fulltext_rel + chunks = chunk_fulltext(fulltext_path) + if not chunks: + continue + try: + delete_paper_vectors(vault, key) + n = embed_paper(vault, key, chunks) + chunks_embedded += n + papers_embedded += 1 + except Exception as e: + result = PFResult(ok=False, command="embed build", version=PF_VERSION, + error=PFError(code=ErrorCode.INTERNAL_ERROR, message=str(e))) + print(result.to_json() if args.json else result.error.message, file=sys.stderr if not args.json else sys.stdout) + return 1 + + data = { + "papers_embedded": papers_embedded, + "chunks_embedded": chunks_embedded, + "model": get_embed_status(vault)["model"], + "mode": get_embed_status(vault)["mode"], + } + result = PFResult(ok=True, command="embed build", version=PF_VERSION, data=data) + if args.json: + print(result.to_json()) + else: + print(f"Embedded {papers_embedded} papers ({chunks_embedded} chunks)") + return 0 diff --git a/paperforge/commands/memory.py b/paperforge/commands/memory.py new file mode 100644 index 0000000..a0e8d65 --- /dev/null +++ b/paperforge/commands/memory.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +import argparse +import sys + +from paperforge import __version__ as PF_VERSION +from paperforge.core.errors import ErrorCode +from paperforge.core.result import PFError, PFResult +from paperforge.memory.builder import build_from_index +from paperforge.memory.query import get_memory_status + + +def run(args: argparse.Namespace) -> int: + vault = args.vault_path + sub_cmd = 
args.memory_subcommand + + if sub_cmd == "build": + try: + counts = build_from_index(vault) + result = PFResult( + ok=True, + command="memory build", + version=PF_VERSION, + data=counts, + ) + except FileNotFoundError: + result = PFResult( + ok=False, + command="memory build", + version=PF_VERSION, + error=PFError( + code=ErrorCode.PATH_NOT_FOUND, + message="Canonical index not found. Run paperforge sync --rebuild-index.", + ), + next_actions=[ + { + "command": "paperforge sync --rebuild-index", + "reason": "Generate formal-library.json first", + } + ], + ) + except Exception as exc: + result = PFResult( + ok=False, + command="memory build", + version=PF_VERSION, + error=PFError( + code=ErrorCode.INTERNAL_ERROR, + message=str(exc), + ), + ) + if args.json: + print(result.to_json()) + else: + if result.ok: + print(f"Memory built: {result.data}") + else: + print(f"Error: {result.error.message}", file=sys.stderr) + return 0 if result.ok else 1 + + if sub_cmd == "status": + try: + status = get_memory_status(vault) + result = PFResult( + ok=True, + command="memory status", + version=PF_VERSION, + data=status, + ) + except Exception as exc: + result = PFResult( + ok=False, + command="memory status", + version=PF_VERSION, + error=PFError( + code=ErrorCode.INTERNAL_ERROR, + message=str(exc), + ), + ) + if args.json: + print(result.to_json()) + else: + if result.ok: + for k, v in status.items(): + print(f" {k}: {v}") + else: + print(f"Error: {result.error.message}", file=sys.stderr) + return 0 if result.ok else 1 + + print(f"Unknown memory subcommand: {sub_cmd}", file=sys.stderr) + return 1 diff --git a/paperforge/commands/paper_context.py b/paperforge/commands/paper_context.py new file mode 100644 index 0000000..193d82c --- /dev/null +++ b/paperforge/commands/paper_context.py @@ -0,0 +1,133 @@ +from __future__ import annotations + +import argparse +import json +import sys + +from paperforge import __version__ as PF_VERSION +from paperforge.core.errors import ErrorCode +from paperforge.core.result import PFError, PFResult +from paperforge.memory.db import get_connection, get_memory_db_path +from paperforge.memory.permanent import get_corrections_for_paper, get_reading_notes_for_paper + + +def _build_paper_context(vault, key: str) -> dict | None: + """Build full context for a paper: metadata + reading notes + corrections.""" + + db_path = get_memory_db_path(vault) + if not db_path.exists(): + return None + + conn = get_connection(db_path, read_only=True) + try: + row = conn.execute( + """SELECT zotero_key, citation_key, title, year, doi, journal, + first_author, domain, collection_path, has_pdf, + ocr_status, analyze, deep_reading_status, lifecycle, + next_step, pdf_path, note_path, fulltext_path, paper_root + FROM papers WHERE zotero_key = ?""", + (key,), + ).fetchone() + + if not row: + return None + + paper = dict(row) + + prior_notes = get_reading_notes_for_paper(vault, key) + + corrections = [] + corr_rows = conn.execute( + """SELECT created_at, payload_json + FROM paper_events + WHERE paper_id = ? 
AND event_type = 'correction_note' + ORDER BY created_at DESC""", + (key,), + ).fetchall() + seen_ids: set[str] = set() + for cr in corr_rows: + payload = json.loads(cr["payload_json"]) + orig_id = payload.get("original_id", "") + corrections.append({ + "created_at": cr["created_at"], + "previous_note_id": orig_id, + "correction": payload.get("correction", ""), + "reason": payload.get("reason", ""), + }) + if orig_id: + seen_ids.add(orig_id) + + jsonl_corrections = get_corrections_for_paper(vault, key) + for c in jsonl_corrections: + cid = c.get("original_id", "") + if cid and cid in seen_ids: + continue + corrections.append({ + "created_at": c.get("created_at", ""), + "previous_note_id": cid, + "correction": c.get("correction", ""), + "reason": c.get("reason", ""), + }) + if cid: + seen_ids.add(cid) + + recheck_targets = [] + for n in prior_notes: + if not n.get("verified", False): + recheck_targets.append( + f"{n.get('section', 'unknown')}: {n.get('excerpt', '')[:80]}..." + ) + + return { + "warning": "Prior reading notes are not verified facts. Re-check source before reuse.", + "paper": paper, + "prior_notes": prior_notes, + "corrections": corrections, + "recheck_targets": recheck_targets, + } + finally: + conn.close() + + +def run(args: argparse.Namespace) -> int: + vault = args.vault_path + key = args.key + + context = _build_paper_context(vault, key) + + if context is None: + result = PFResult( + ok=False, + command="paper-context", + version=PF_VERSION, + error=PFError( + code=ErrorCode.PATH_NOT_FOUND, + message=f"No paper found for key: {key}", + ), + ) + else: + result = PFResult( + ok=True, + command="paper-context", + version=PF_VERSION, + data=context, + ) + + if args.json: + print(result.to_json()) + else: + if result.ok: + p = result.data["paper"] + print(f"Paper: {p.get('title', key)}") + print(f" Key: {p.get('zotero_key', '')}") + print(f" OCR: {p.get('ocr_status', 'unknown')}") + print(f" Lifecycle: {p.get('lifecycle', '')}") + notes = result.data.get("prior_notes", []) + print(f" Reading notes: {len(notes)}") + print(f" Corrections: {len(result.data.get('corrections', []))}") + if result.data.get("recheck_targets"): + print(f" Recheck targets: {len(result.data['recheck_targets'])}") + else: + print(f"Error: {result.error.message}", file=sys.stderr) + + return 0 if result.ok else 1 diff --git a/paperforge/commands/paper_status.py b/paperforge/commands/paper_status.py new file mode 100644 index 0000000..34b38aa --- /dev/null +++ b/paperforge/commands/paper_status.py @@ -0,0 +1,70 @@ +from __future__ import annotations + +import argparse +import sys + +from paperforge import __version__ as PF_VERSION +from paperforge.core.errors import ErrorCode +from paperforge.core.result import PFError, PFResult +from paperforge.memory.query import get_paper_status + + +def run(args: argparse.Namespace) -> int: + vault = args.vault_path + query = args.query + + try: + status = get_paper_status(vault, query) + if status is None: + result = PFResult( + ok=False, + command="paper-status", + version=PF_VERSION, + error=PFError( + code=ErrorCode.PATH_NOT_FOUND, + message=f"No paper found for: {query}", + ), + next_actions=[ + { + "command": "paperforge search", + "reason": "Search for papers by keyword", + } + ], + ) + else: + result = PFResult( + ok=True, + command="paper-status", + version=PF_VERSION, + data=status, + ) + except Exception as exc: + result = PFResult( + ok=False, + command="paper-status", + version=PF_VERSION, + error=PFError( + code=ErrorCode.INTERNAL_ERROR, + 
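# Sketch of the merge rule used by _build_paper_context above: DB-indexed
# corrections win; JSONL corrections are appended only if their original_id
# was not already seen (ids here are hypothetical).
db_corrections = [{"original_id": "n1", "correction": "dose was 10 mg"}]
jsonl_corrections = [
    {"original_id": "n1", "correction": "dose was 10 mg"},   # duplicate -> skipped
    {"original_id": "n2", "correction": "figure 2, not 3"},  # new -> kept
]
seen = {c["original_id"] for c in db_corrections if c["original_id"]}
merged = db_corrections + [c for c in jsonl_corrections if not (c.get("original_id") and c["original_id"] in seen)]
assert [c["original_id"] for c in merged] == ["n1", "n2"]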
message=str(exc), + ), + ) + + if args.json: + print(result.to_json()) + else: + if result.ok: + data = result.data + if data.get("resolved"): + print(f"Zotero Key: {data.get('zotero_key', '')}") + print(f"Title: {data.get('title', '')}") + print(f"Year: {data.get('year', '')}") + print(f"Lifecycle: {data.get('lifecycle', '')}") + print(f"Next Step: {data.get('next_step', '')}") + if data.get("candidates"): + print(f"\nMultiple candidates: {len(data['candidates'])}") + for c in data["candidates"]: + print(f" - {c['zotero_key']}: {c['title']} ({c['year']})") + else: + print(f"Error: {result.error.message}", file=sys.stderr) + + return 0 if result.ok else 1 diff --git a/paperforge/commands/project_log.py b/paperforge/commands/project_log.py new file mode 100644 index 0000000..5f26d0a --- /dev/null +++ b/paperforge/commands/project_log.py @@ -0,0 +1,183 @@ +from __future__ import annotations + +import argparse +import json +import sys +from collections import Counter +from pathlib import Path + +from paperforge import __version__ as PF_VERSION +from paperforge.config import paperforge_paths +from paperforge.core.errors import ErrorCode +from paperforge.core.result import PFError, PFResult +from paperforge.memory.permanent import ( + append_project_entry, + get_project_entries, + read_all_project_entries, +) + + +def _render_project_log_md(vault: Path, project: str) -> None: + """Render project-log.md from JSONL.""" + entries = get_project_entries(vault, project) + if not entries: + return + + lines = [f"# Project Log — {project}", ""] + lines.append("> Auto-generated from project-log.jsonl. Do not edit manually.") + lines.append("") + + for entry in sorted(entries, key=lambda x: x.get("created_at", ""), reverse=True): + lines.append(f"## {entry.get('date', '')} — {entry.get('title', '(untitled)')}") + lines.append(f"**Type:** {entry.get('type', '')}") + lines.append("") + + if entry.get("decisions"): + lines.append("### Core Decisions") + for d in entry["decisions"]: + lines.append(f"- {d}") + lines.append("") + + if entry.get("detours"): + lines.append("### Detours & Corrections") + for dt in entry["detours"]: + if isinstance(dt, dict): + lines.append(f"- **Wrong:** {dt.get('wrong', '')}") + lines.append(f" **Correction:** {dt.get('correction', '')}") + lines.append(f" **Resolution:** {dt.get('resolution', '')}") + else: + lines.append(f"- {dt}") + lines.append("") + + if entry.get("reusable"): + lines.append("### Reusable Methods") + for r in entry["reusable"]: + lines.append(f"- {r}") + lines.append("") + + if entry.get("todos"): + lines.append("### Todos") + for t in entry["todos"]: + done = "x" if t.get("done", False) else " " + lines.append(f"- [{done}] {t.get('content', '')}") + lines.append("") + + if entry.get("tags"): + lines.append(f"**Tags:** {', '.join(entry['tags'])}") + + lines.append("---") + lines.append("") + + paths = paperforge_paths(vault) + resource_dir = paths.get("resources") + if resource_dir: + output_dir = resource_dir / "Projects" / project + else: + output_dir = vault / "Projects" / project + output_dir.mkdir(parents=True, exist_ok=True) + output_path = output_dir / "project-log.md" + output_path.write_text("\n".join(lines), encoding="utf-8") + + +def run(args: argparse.Namespace) -> int: + vault = args.vault_path + + if getattr(args, "write", False): + project = getattr(args, "project", "") + payload_str = getattr(args, "payload", "") + + if not project: + result = PFResult(ok=False, command="project-log", version=PF_VERSION, + 
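# Based on the fields consumed by _render_project_log_md above, a --write
# payload might look like this (hypothetical values; every key is optional):
payload = {
    "date": "2025-01-15",
    "type": "decision",
    "title": "Switch OCR provider",
    "decisions": ["Use the PaddleOCR API for scanned PDFs"],
    "detours": [{"wrong": "local OCR", "correction": "too slow", "resolution": "cloud API"}],
    "reusable": ["batch OCR retry wrapper"],
    "todos": [{"content": "re-run failed papers", "done": False}],
    "tags": ["ocr", "infra"],
}
# e.g.: paperforge project-log --write --project my-project --payload '<the JSON above>'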
error=PFError(code=ErrorCode.VALIDATION_ERROR, message="--project is required for --write")) + if getattr(args, "json", False): + print(result.to_json()) + else: + print(f"Error: {result.error.message}", file=sys.stderr) + return 1 + + if not payload_str: + result = PFResult(ok=False, command="project-log", version=PF_VERSION, + error=PFError(code=ErrorCode.VALIDATION_ERROR, message="--payload is required for --write")) + if getattr(args, "json", False): + print(result.to_json()) + else: + print(f"Error: {result.error.message}", file=sys.stderr) + return 1 + + try: + entry = json.loads(payload_str) + entry["project"] = project + result_data = append_project_entry(vault, entry) + + _render_project_log_md(vault, project) + + result = PFResult(ok=True, command="project-log", version=PF_VERSION, data=result_data) + except json.JSONDecodeError as e: + result = PFResult(ok=False, command="project-log", version=PF_VERSION, + error=PFError(code=ErrorCode.VALIDATION_ERROR, message=f"Invalid JSON: {e}")) + + if getattr(args, "json", False): + print(result.to_json()) + else: + print("Written." if result.ok else f"Error: {result.error.message}") + return 0 if result.ok else 1 + + if getattr(args, "list", False): + project = getattr(args, "project", "") + if not project: + result = PFResult(ok=False, command="project-log", version=PF_VERSION, + error=PFError(code=ErrorCode.VALIDATION_ERROR, message="--project is required for --list")) + if getattr(args, "json", False): + print(result.to_json()) + else: + print(f"Error: {result.error.message}", file=sys.stderr) + return 1 + + entries = get_project_entries(vault, project) + data = {"project": project, "entries": entries[:getattr(args, "limit", 50)], "count": len(entries)} + result = PFResult(ok=True, command="project-log", version=PF_VERSION, data=data) + + if getattr(args, "json", False): + print(result.to_json()) + else: + print(f"{len(entries)} entries for project '{project}'") + for e in entries[:5]: + print(f" [{e.get('date', '')}] {e.get('type', '')}: {e.get('title', '')}") + return 0 + + if getattr(args, "render", False): + project = getattr(args, "project", "") + if not project: + result = PFResult(ok=False, command="project-log", version=PF_VERSION, + error=PFError(code=ErrorCode.VALIDATION_ERROR, message="--project is required for --render")) + if getattr(args, "json", False): + print(result.to_json()) + else: + print(f"Error: {result.error.message}", file=sys.stderr) + return 1 + + _render_project_log_md(vault, project) + result = PFResult(ok=True, command="project-log", version=PF_VERSION, + data={"rendered": True, "project": project}) + if getattr(args, "json", False): + print(result.to_json()) + else: + print(f"Rendered project-log.md for '{project}'") + return 0 + + # Default: show all projects with entry counts + all_entries = read_all_project_entries(vault) + project_counts = Counter(e["project"] for e in all_entries if e.get("project")) + + result = PFResult(ok=True, command="project-log", version=PF_VERSION, + data={"projects": dict(project_counts)}) + if getattr(args, "json", False): + print(result.to_json()) + else: + if project_counts: + print("Projects with log entries:") + for proj, cnt in project_counts.most_common(): + print(f" {proj}: {cnt} entries") + else: + print("No project log entries found.") + return 0 diff --git a/paperforge/commands/reading_log.py b/paperforge/commands/reading_log.py new file mode 100644 index 0000000..0e19abb --- /dev/null +++ b/paperforge/commands/reading_log.py @@ -0,0 +1,540 @@ +from __future__ 
import annotations + +import argparse +import datetime +import json +import re +from pathlib import Path + +from paperforge import __version__ as PF_VERSION +from paperforge.config import paperforge_paths +from paperforge.core.errors import ErrorCode +from paperforge.core.result import PFError, PFResult +from paperforge.memory.db import get_connection, get_memory_db_path +from paperforge.memory.events import write_correction_note +from paperforge.memory.permanent import ( + append_correction, + append_reading_note, + get_reading_notes_for_paper, + read_all_reading_notes, +) + +_HEADER_RE = re.compile(r"^## ([A-Z0-9]{8}) \u2014 .+ \d{4}$") +_TITLE_RE = re.compile(r"^\*\*Title:\*\* (.+)") +_SECTION_RE = re.compile(r"^### (.+)") +_HR_RE = re.compile(r"^-{3,}$") +_FIELD_RE = re.compile(r"^\*\*([^:]+):\*\*") + +_LABEL_INFO = frozenset({"Info", "信息"}) +_LABEL_USE = frozenset({"Use", "用途"}) +_LABEL_NOTE = frozenset({"Note", "备注"}) + + +def _strip_quotes(s: str) -> str: + if s.startswith('"') and s.endswith('"'): + return s[1:-1] + if len(s) >= 2 and s[0] == '\u201c' and s[-1] == '\u201d': + return s[1:-1] + return s + + +def _parse_reading_log(filepath: Path) -> dict: + if not filepath.exists(): + return {"ok": False, "papers": [], "errors": [{"line": 0, "field": "file", "message": "File not found"}]} + + content = filepath.read_text(encoding="utf-8") + lines = content.splitlines() + + papers: list[dict] = [] + errors: list[dict] = [] + + current_paper: dict | None = None + current_section: str | None = None + current_fields: dict = {} + active_field: str | None = None + + def _flush_section(ln: int = 0): + nonlocal current_section, current_fields, active_field + if current_paper is not None and current_section is not None: + info_val = current_fields.get("info", "") + use_val = current_fields.get("use", "") + if not info_val: + errors.append({"line": ln, "field": "entry.info", "message": f"Missing **Info:** in section '{current_section}'"}) + if not use_val: + errors.append({"line": ln, "field": "entry.use", "message": f"Missing **Use:** in section '{current_section}'"}) + current_paper["sections"].append({ + "section_name": current_section, + "info": info_val, + "use": use_val, + "note": current_fields.get("note", ""), + }) + current_section = None + current_fields = {} + active_field = None + + def _flush_paper(ln: int = 0): + nonlocal current_paper + _flush_section(ln) + if current_paper: + papers.append(current_paper) + current_paper = None + + for i, line in enumerate(lines): + ln = i + 1 + stripped = line.strip() + if not stripped: + continue + + m = _HEADER_RE.match(stripped) + if m: + _flush_paper(ln) + current_paper = {"paper_key": m.group(1), "title": "", "sections": []} + continue + + m = _TITLE_RE.match(stripped) + if m and current_paper is not None: + current_paper["title"] = m.group(1) + continue + + m = _SECTION_RE.match(stripped) + if m: + if current_paper is not None: + _flush_section(ln) + current_section = m.group(1) + continue + + if current_paper is None or current_section is None: + continue + + if _HR_RE.match(stripped): + active_field = None + continue + + fm = _FIELD_RE.match(stripped) + if fm: + label = fm.group(1) + rest = stripped[fm.end():].strip() + if label in _LABEL_INFO: + active_field = "info" + current_fields["info"] = _strip_quotes(rest) + continue + if label in _LABEL_USE: + active_field = "use" + current_fields["use"] = _strip_quotes(rest) if rest else "" + continue + if label in _LABEL_NOTE: + active_field = "note" + current_fields["note"] = 
_strip_quotes(rest) if rest else "" + continue + if label == "Title": + active_field = None + continue + active_field = None + continue + + if active_field: + existing = current_fields.get(active_field, "") + if existing: + current_fields[active_field] = existing + "\n" + stripped + else: + current_fields[active_field] = stripped + + _flush_paper(len(lines) + 1) + + return {"ok": len(errors) == 0, "papers": papers, "errors": errors} + + +def validate_reading_log(filepath: Path) -> dict: + """Parse a reading-log.md with strict format rules and return validation result.""" + parsed = _parse_reading_log(filepath) + return { + "ok": parsed["ok"], + "file": str(filepath), + "errors": parsed["errors"], + "papers_found": len(parsed["papers"]), + "entries_found": sum(len(p["sections"]) for p in parsed["papers"]), + } + + +def import_reading_log(vault: Path, filepath: Path) -> dict: + """Validate and import a reading-log.md into reading-log.jsonl source of truth.""" + parsed = _parse_reading_log(filepath) + if not parsed["ok"]: + return {"ok": False, "errors": parsed["errors"], "papers_imported": 0, "entries_imported": 0} + + papers_set: set[str] = set() + entries_imported = 0 + errors: list[dict] = [] + + for paper in parsed["papers"]: + for section in paper["sections"]: + info = section.get("info", "") + use = section.get("use", "") + if info and use: + res = append_reading_note( + vault, + paper["paper_key"], + section["section_name"], + excerpt=info, + usage=use, + note=section.get("note", "") or "", + ) + if res.get("ok"): + entries_imported += 1 + papers_set.add(paper["paper_key"]) + else: + errors.append({ + "paper_key": paper["paper_key"], + "section": section["section_name"], + "error": res.get("error", "unknown"), + }) + + return { + "ok": len(errors) == 0, + "papers_imported": len(papers_set), + "entries_imported": entries_imported, + "errors": errors, + } + + +def lookup_paper_events(vault: Path, key: str) -> dict: + """Look up all reading notes for a paper from JSONL.""" + notes = get_reading_notes_for_paper(vault, key) + notes.sort(key=lambda n: n.get("created_at", ""), reverse=True) + + title = "" + db_path = get_memory_db_path(vault) + if db_path.exists(): + conn = get_connection(db_path, read_only=True) + try: + row = conn.execute( + "SELECT title FROM papers WHERE zotero_key = ?", (key,), + ).fetchone() + if row: + title = row["title"] or "" + finally: + conn.close() + + entries = [] + for n in notes: + entries.append({ + "created_at": n.get("created_at", ""), + "section": n.get("section", ""), + "excerpt": n.get("excerpt", ""), + "usage": n.get("usage", ""), + "note": n.get("note", ""), + }) + + return { + "ok": True, + "zotero_key": key, + "title": title, + "entries": entries, + "count": len(entries), + } + + +def _export_from_jsonl(vault: Path, since: str = "", limit: int = 50) -> list[dict]: + """Export reading notes from JSONL, enriched with paper metadata from DB.""" + all_notes = read_all_reading_notes(vault) + all_notes.sort(key=lambda n: n.get("created_at", ""), reverse=True) + + if since: + all_notes = [n for n in all_notes if n.get("created_at", "") >= since] + all_notes = all_notes[:limit] + + db_path = get_memory_db_path(vault) + paper_meta: dict[str, dict] = {} + if db_path.exists(): + paper_ids = list(set(n.get("paper_id", "") for n in all_notes if n.get("paper_id"))) + if paper_ids: + conn = get_connection(db_path, read_only=True) + try: + placeholders = ",".join("?" 
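# Putting the regexes above together, a minimal file that _parse_reading_log
# accepts looks like this (the 8-char key, author, and year are hypothetical):
SAMPLE = """\
## ABCD1234 — Smith et al. 2023
**Title:** Example Paper Title

### Methods
**Info:** "Excerpt copied from the paper"
**Use:** Why this excerpt matters for the project
**Note:** Optional free-form remark
---
"""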
* len(paper_ids)) + rows = conn.execute( + f"SELECT zotero_key, citation_key, title, year, first_author " + f"FROM papers WHERE zotero_key IN ({placeholders})", + paper_ids, + ).fetchall() + for row in rows: + paper_meta[row["zotero_key"]] = { + "citation_key": row["citation_key"], + "title": row["title"], + "year": row["year"], + "first_author": row["first_author"], + } + finally: + conn.close() + + results = [] + for n in all_notes: + pid = n.get("paper_id", "") + meta = paper_meta.get(pid, {}) + results.append({ + "created_at": n.get("created_at", ""), + "paper_id": pid, + "citation_key": meta.get("citation_key", pid), + "title": meta.get("title", ""), + "year": meta.get("year", ""), + "first_author": meta.get("first_author", ""), + "section": n.get("section", ""), + "excerpt": n.get("excerpt", ""), + "usage": n.get("usage", ""), + "note": n.get("note", ""), + }) + + return results + + +def _render_reading_log_md(vault: Path, project: str = "") -> None: + """Render reading-log.md from JSONL source of truth. + + Groups notes by paper_id and writes a formatted markdown file. + If project is specified, writes to /Projects//reading-log.md. + Otherwise writes to /logs/rendered/reading-log.md. + """ + paths = paperforge_paths(vault) + notes = read_all_reading_notes(vault) + + if not notes: + print("No reading notes to render.") + return + + if project: + notes = [n for n in notes if n.get("project") == project] + + if not notes: + print(f"No reading notes found{' for project ' + project if project else ''}.") + return + + grouped: dict[str, list[dict]] = {} + for n in notes: + pid = n.get("paper_id", "unknown") + grouped.setdefault(pid, []).append(n) + + lines: list[str] = [] + heading = f"Reading Log \u2014 {project}" if project else "Reading Log \u2014 All Projects" + lines.append(f"# {heading}\n") + lines.append(f"*Generated: {datetime.date.today().isoformat()} | Total entries: {len(notes)}*\n") + + for pid, entries in sorted(grouped.items()): + lines.append(f"## {pid}\n") + for entry in sorted(entries, key=lambda e: (e.get("section", ""), e.get("created_at", ""))): + section = entry.get("section", "Untitled") + lines.append(f"### {section}") + lines.append(f"> {entry.get('excerpt', '')}") + if entry.get("context"): + lines.append(">") + lines.append(f"> {entry.get('context')}") + lines.append("") + if entry.get("usage"): + lines.append(f"- **Usage:** {entry.get('usage')}") + if entry.get("note"): + lines.append(f"- **Note:** {entry.get('note')}") + tag_list = entry.get("tags", []) + if tag_list: + lines.append(f"- **Tags:** {', '.join(tag_list)}") + verified = entry.get("verified", False) + lines.append(f"- **Verified:** {'Yes' if verified else 'No'}") + lines.append("") + lines.append("---\n") + + if project: + output_dir = paths["resources"] / "Projects" / project + output_dir.mkdir(parents=True, exist_ok=True) + output_path = output_dir / "reading-log.md" + else: + output_dir = paths["paperforge"] / "logs" / "rendered" + output_dir.mkdir(parents=True, exist_ok=True) + output_path = output_dir / "reading-log.md" + + output_path.write_text("\n".join(lines), encoding="utf-8") + print(f"Rendered {len(notes)} entries to {output_path}") + + +def run(args: argparse.Namespace) -> int: + vault = args.vault_path + + if args.validate: + data = validate_reading_log(Path(args.validate)) + result = PFResult( + ok=data["ok"], command="reading-log", version=PF_VERSION, data=data, + ) + if args.json: + print(result.to_json()) + else: + if data["ok"]: + print(f"Valid. 
{data['papers_found']} papers, {data['entries_found']} entries.") + else: + print(f"{len(data['errors'])} error(s):") + for e in data["errors"]: + print(f" line {e['line']}: [{e['field']}] {e['message']}") + return 0 if data["ok"] else 1 + + if args.import_file: + data = import_reading_log(vault, Path(args.import_file)) + result = PFResult( + ok=data.get("ok", True), command="reading-log", version=PF_VERSION, data=data, + ) + if args.json: + print(result.to_json()) + else: + if data["ok"]: + print(f"Imported {data['entries_imported']} entries from {data['papers_imported']} papers.") + else: + print(f"Validation failed with {len(data.get('errors', []))} error(s).") + return 0 if data["ok"] else 1 + + if args.lookup: + data = lookup_paper_events(vault, args.lookup) + result = PFResult( + ok=data["ok"], command="reading-log", version=PF_VERSION, data=data, + ) + if args.json: + print(result.to_json()) + else: + if data["ok"]: + print(f"Paper: {data['title']} ({data['zotero_key']})") + print(f" {data['count']} reading notes:") + for e in data["entries"]: + print(f" [{e['created_at']}] {e['section']}: \"{e['excerpt']}\"") + if e["usage"]: + print(f" -> Usage: {e['usage']}") + if e["note"]: + print(f" -> Note: {e['note']}") + else: + print(f"No entries found for key: {args.lookup}") + return 0 + + if args.render: + _render_reading_log_md(vault, args.project or "") + return 0 + + if args.correct_id: + if not args.correction: + result = PFResult( + ok=False, command="reading-log", version=PF_VERSION, + data={}, + error=PFError(code=ErrorCode.VALIDATION_ERROR, + message="--correction is required with --correct"), + ) + if args.json: + print(result.to_json()) + else: + print("Error: --correction is required with --correct") + return 1 + + all_notes = read_all_reading_notes(vault) + original = next((n for n in all_notes if n.get("id") == args.correct_id), None) + paper_id = original.get("paper_id", "") if original else "" + if not paper_id: + result = PFResult( + ok=False, command="reading-log", version=PF_VERSION, + data={}, + error=PFError(code=ErrorCode.PATH_NOT_FOUND, + message=f"Original entry {args.correct_id} not found in JSONL"), + ) + if args.json: + print(result.to_json()) + else: + print(f"Error: Original entry {args.correct_id} not found in reading-log.jsonl") + return 1 + + # Write to JSONL (source of truth) + jsonl_result = append_correction( + vault, paper_id, args.correct_id, + args.correction, args.reason or "", + ) + + # Also write to paper_events for FTS (best effort) + db_ok = write_correction_note( + vault, paper_id, args.correct_id, + args.correction, args.reason or "", + ) + + ok = bool(jsonl_result.get("ok")) + result = PFResult( + ok=ok, + command="reading-log", + version=PF_VERSION, + data={ + "written": ok, + "jsonl_id": jsonl_result.get("id", ""), + "db_indexed": db_ok, + }, + error=PFError(code=ErrorCode.INTERNAL_ERROR, + message="Correction write failed") if not ok else None, + ) + if args.json: + print(result.to_json()) + else: + if ok: + print(f"Correction written ({jsonl_result.get('id', '')})." + f"{' DB indexed.' 
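# A usage sketch of the correction path above (entry id hypothetical): the
# JSONL append is authoritative; the paper_events row is best-effort indexing.
jsonl_result = append_correction(vault, paper_id, "01HXXXXX", "dose was 10 mg, not 100 mg", "misread Table 2")
db_ok = write_correction_note(vault, paper_id, "01HXXXXX", "dose was 10 mg, not 100 mg", "misread Table 2")
# jsonl_result["ok"] gates the exit code; db_ok only toggles "db_indexed" in the output.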
if db_ok else ''}") + else: + print("Failed.") + return 0 if ok else 1 + + if args.paper_id and args.excerpt: + tags_list = [t.strip() for t in args.tags.split(",") if t.strip()] if args.tags else None + + jsonl_result = append_reading_note( + vault, args.paper_id, args.section or "", + args.excerpt, args.usage or "", args.context or "", + args.note or "", args.project or "", tags_list, + ) + + ok = jsonl_result.get("ok", False) + result = PFResult( + ok=ok, + command="reading-log", + version=PF_VERSION, + data={"written": ok, "id": jsonl_result.get("id"), "path": jsonl_result.get("path")}, + error=PFError(code=ErrorCode.INTERNAL_ERROR, + message=jsonl_result.get("error", "Failed to write")) if not ok else None, + ) + if args.json: + print(result.to_json()) + else: + if ok: + print(f"Written. ID: {jsonl_result.get('id', 'unknown')}") + else: + print(f"Failed: {jsonl_result.get('error', 'unknown')}") + + if ok and args.project: + _render_reading_log_md(vault, args.project) + + return 0 if ok else 1 + + notes = _export_from_jsonl(vault, since=args.since or "", limit=args.limit or 50) + result = PFResult( + ok=True, + command="reading-log", + version=PF_VERSION, + data={"notes": notes, "count": len(notes)}, + ) + + if args.json: + print(result.to_json()) + elif args.output: + lines = [] + last_date = None + for n in notes: + date_str = n["created_at"][:10] + if date_str != last_date: + last_date = date_str + lines.append(f"\n## {date_str}") + author = (n["first_author"] or "").split()[-1] if n["first_author"] else "" + lines.append(f"\n### {n['citation_key']} \u2014 {author} et al. {n['year']}") + lines.append(f"- **{n['section']}**\uff1a\"{n['excerpt']}\"") + if n["usage"]: + lines.append(f" \u2192 \u7528\u9014: {n['usage']}") + if n["note"]: + lines.append(f" \u2192 \u5907\u6ce8: {n['note']}") + output_path = Path(args.output) + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text("\n".join(lines).strip() + "\n", encoding="utf-8") + print(f"Exported {len(notes)} notes to {args.output}") + else: + print(f"{len(notes)} reading notes.") + return 0 diff --git a/paperforge/commands/retrieve.py b/paperforge/commands/retrieve.py new file mode 100644 index 0000000..7d4bb48 --- /dev/null +++ b/paperforge/commands/retrieve.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +import argparse +import sys +import json + +from paperforge.core.errors import ErrorCode +from paperforge.core.result import PFError, PFResult +from paperforge.memory.db import get_connection, get_memory_db_path +from paperforge.memory.vector_db import retrieve_chunks +from paperforge import __version__ as PF_VERSION + + +def run(args: argparse.Namespace) -> int: + vault = args.vault_path + query = args.query + limit = args.limit or 5 + + # Check if vector index exists + from paperforge.worker.vector_db import get_embed_status + status = get_embed_status(vault) + if status.get("chunk_count", 0) == 0: + result = PFResult( + ok=False, + command="retrieve", + version=PF_VERSION, + error=PFError( + code=ErrorCode.PATH_NOT_FOUND, + message="Vector index is empty. 
Run paperforge embed build first.", + ), + data={"next_action": "paperforge embed build"}, + ) + if args.json: + print(result.to_json()) + else: + print(f"Error: {result.error.message}", file=sys.stderr) + return 1 + + try: + chunks = retrieve_chunks(vault, query, limit=limit, expand=args.expand) + except Exception as e: + result = PFResult(ok=False, command="retrieve", version=PF_VERSION, + error=PFError(code=ErrorCode.INTERNAL_ERROR, message=str(e))) + print(result.to_json() if args.json else result.error.message, file=sys.stderr if not args.json else sys.stdout) + return 1 + + # Enrich with paper metadata from memory DB + if chunks: + db_path = get_memory_db_path(vault) + if db_path.exists(): + conn = get_connection(db_path, read_only=True) + try: + for c in chunks: + row = conn.execute( + "SELECT citation_key, title, year, first_author FROM papers WHERE zotero_key=?", + (c["paper_id"],) + ).fetchone() + if row: + c["citation_key"] = row["citation_key"] + c["title"] = row["title"] + c["year"] = row["year"] + c["first_author"] = row["first_author"] + finally: + conn.close() + + data = {"query": query, "chunks": chunks, "count": len(chunks)} + result = PFResult(ok=True, command="retrieve", version=PF_VERSION, data=data) + + if args.json: + print(result.to_json()) + else: + print(f"{len(chunks)} chunks for: {query}") + for c in chunks: + print(f" [{c.get('section','')}] {c.get('citation_key','')} p{c.get('page_number',0)}: {c['chunk_text'][:80]}...") + return 0 diff --git a/paperforge/commands/search.py b/paperforge/commands/search.py new file mode 100644 index 0000000..0de275a --- /dev/null +++ b/paperforge/commands/search.py @@ -0,0 +1,81 @@ +from __future__ import annotations + +import argparse +import sys + +from paperforge.core.errors import ErrorCode +from paperforge.core.result import PFError, PFResult +from paperforge.memory.db import get_connection, get_memory_db_path +from paperforge.memory.fts import search_papers +from paperforge import __version__ as PF_VERSION + + +def run(args: argparse.Namespace) -> int: + vault = args.vault_path + query = args.query + + db_path = get_memory_db_path(vault) + if not db_path.exists(): + result = PFResult( + ok=False, + command="search", + version=PF_VERSION, + error=PFError( + code=ErrorCode.PATH_NOT_FOUND, + message="Memory database not found. 
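# Sketch of the enrichment step in retrieve.py above, given a vault Path, an
# existing memory DB, and a hypothetical retrieval hit; chunks without a
# matching paper simply stay un-enriched.
from paperforge.memory.db import get_connection, get_memory_db_path

chunk = {"paper_id": "ABCD1234", "section": "Methods", "page_number": 3,
         "chunk_text": "We randomized 120 patients..."}
conn = get_connection(get_memory_db_path(vault), read_only=True)
row = conn.execute(
    "SELECT citation_key, title, year, first_author FROM papers WHERE zotero_key=?",
    (chunk["paper_id"],),
).fetchone()
if row:
    chunk.update(citation_key=row["citation_key"], title=row["title"],
                 year=row["year"], first_author=row["first_author"])
conn.close()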
Run paperforge memory build.", + ), + ) + if args.json: + print(result.to_json()) + else: + print(f"Error: {result.error.message}", file=sys.stderr) + return 1 + + conn = get_connection(db_path, read_only=True) + try: + results = search_papers( + conn, query, + limit=args.limit, + domain=args.domain or "", + year_from=args.year_from or 0, + year_to=args.year_to or 0, + ocr_status=args.ocr or "", + deep_status=args.deep or "", + lifecycle=args.lifecycle or "", + next_step=args.next_step or "", + ) + data = { + "query": query, + "matches": results, + "count": len(results), + "filters_applied": { + "domain": args.domain, + "year_from": args.year_from, + "year_to": args.year_to, + "ocr": args.ocr, + "deep": args.deep, + "lifecycle": args.lifecycle, + "next_step": args.next_step, + }, + } + result = PFResult(ok=True, command="search", version=PF_VERSION, data=data) + except Exception as exc: + result = PFResult( + ok=False, command="search", version=PF_VERSION, + error=PFError(code=ErrorCode.INTERNAL_ERROR, message=str(exc)), + ) + finally: + conn.close() + + if args.json: + print(result.to_json()) + else: + if result.ok: + matches = result.data["matches"] + print(f"Found {len(matches)} results for: {query}") + for m in matches: + rank_val = m.get("rank", "") + print(f" [{m['lifecycle']:16}] {m['zotero_key']} | {m['year']} | {m['first_author']} | {m['title'][:60]}") + else: + print(f"Error: {result.error.message}", file=sys.stderr) + return 0 if result.ok else 1 diff --git a/paperforge/config.py b/paperforge/config.py index 8f63456..4ea3559 100644 --- a/paperforge/config.py +++ b/paperforge/config.py @@ -279,7 +279,7 @@ def paperforge_paths( - bases: / - worker_script: pipeline/worker/scripts/literature_pipeline.py - skill_dir: / - - ld_deep_script: /literature-qa/scripts/ld_deep.py + - pf_deep_script: /paperforge/scripts/pf_deep.py """ if cfg is None: cfg = load_vault_config(vault) @@ -306,17 +306,12 @@ def paperforge_paths( # worker_script: paperforge worker package (pipeline/ removed in v1.3) worker_script = Path(__file__).parent / "worker" / "__init__.py" - # ld_deep_script: look relative to skill_dir first, then repo paperforge/skills for dev - ld_deep_script = skill_path / "literature-qa" / "scripts" / "ld_deep.py" - if not ld_deep_script.exists(): - repo_skill = Path(__file__).parent / "skills" / "literature-qa" / "scripts" / "ld_deep.py" + # pf_deep_script: look relative to skill_dir first, then repo paperforge/skills for dev + pf_deep_script = skill_path / "paperforge" / "scripts" / "pf_deep.py" + if not pf_deep_script.exists(): + repo_skill = Path(__file__).parent / "skills" / "paperforge" / "scripts" / "pf_deep.py" if repo_skill.exists(): - ld_deep_script = repo_skill - else: - # Backward compat: old skills/ location during transition - old_repo_skill = Path(__file__).parent.parent / "skills" / "literature-qa" / "scripts" / "ld_deep.py" - if old_repo_skill.exists(): - ld_deep_script = old_repo_skill + pf_deep_script = repo_skill return { "vault": vault, @@ -332,10 +327,11 @@ def paperforge_paths( "bases": bases, "worker_script": worker_script, "skill_dir": skill_path, - "ld_deep_script": ld_deep_script, + "pf_deep_script": pf_deep_script, # ── v2.2: canonical locations below paperforge/ ── "config": paperforge / "config" / "domain-collections.json", "index": paperforge / "indexes" / "formal-library.json", + "memory_db": paperforge / "indexes" / "paperforge.db", } diff --git a/paperforge/memory/__init__.py b/paperforge/memory/__init__.py new file mode 100644 index 0000000..5585cd6 --- 
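# With the config change above, callers resolve the memory DB and deep-reading
# script without hard-coding the vault layout (vault path hypothetical):
from pathlib import Path
from paperforge.config import paperforge_paths

paths = paperforge_paths(Path("/path/to/vault"))
db_path = paths["memory_db"]           # <paperforge>/indexes/paperforge.db
deep_script = paths["pf_deep_script"]  # skill copy if present, else repo fallback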
/dev/null +++ b/paperforge/memory/__init__.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from paperforge.memory.db import get_connection, get_memory_db_path +from paperforge.memory.schema import ensure_schema, drop_all_tables + +__all__ = [ + "get_connection", + "get_memory_db_path", + "ensure_schema", + "drop_all_tables", +] diff --git a/paperforge/memory/_columns.py b/paperforge/memory/_columns.py new file mode 100644 index 0000000..61c0ce9 --- /dev/null +++ b/paperforge/memory/_columns.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +import json + + +PAPER_COLUMNS = [ + "zotero_key", "citation_key", "title", "year", "doi", "pmid", + "journal", "first_author", "authors_json", "abstract", "domain", + "collection_path", "collections_json", + "has_pdf", "do_ocr", "analyze", "ocr_status", "deep_reading_status", + "ocr_job_id", "impact_factor", + "lifecycle", "maturity_level", "maturity_name", "next_step", + "pdf_path", "note_path", "main_note_path", "paper_root", + "fulltext_path", "ocr_md_path", "ocr_json_path", "ai_path", + "deep_reading_md_path", "updated_at", +] + + +def build_paper_row(entry: dict, generated_at: str) -> dict: + row = {} + for col in PAPER_COLUMNS: + if col == "authors_json": + row[col] = json.dumps(entry.get("authors", []), ensure_ascii=False) + elif col == "collections_json": + row[col] = json.dumps(entry.get("collections", []), ensure_ascii=False) + elif col == "lifecycle": + row[col] = entry.get("lifecycle", "") + elif col == "maturity_level": + row[col] = entry.get("maturity", {}).get("level", 1) + elif col == "maturity_name": + row[col] = entry.get("maturity", {}).get("level_name", "") + elif col == "next_step": + row[col] = entry.get("next_step", "") + elif col == "updated_at": + row[col] = generated_at + elif col in ("do_ocr", "analyze"): + val = entry.get(col) + row[col] = 1 if val else 0 + elif col == "has_pdf": + row[col] = 1 if entry.get("has_pdf") else 0 + else: + row[col] = entry.get(col, "") + return row diff --git a/paperforge/memory/builder.py b/paperforge/memory/builder.py new file mode 100644 index 0000000..d2b8567 --- /dev/null +++ b/paperforge/memory/builder.py @@ -0,0 +1,271 @@ +from __future__ import annotations + +import hashlib +import json +import logging +from datetime import datetime, timezone +from pathlib import Path + +from paperforge import __version__ as PF_VERSION +from paperforge.memory._columns import PAPER_COLUMNS, build_paper_row +from paperforge.memory.db import get_connection, get_memory_db_path +from paperforge.memory.schema import ( + CURRENT_SCHEMA_VERSION, + PAPERS_AI_TRIGGER, + clear_fts, + drop_all_tables, + ensure_schema, + get_schema_version, +) +from paperforge.worker.asset_index import read_index +from paperforge.worker.asset_state import ( + compute_lifecycle, + compute_maturity, + compute_next_step, +) + +logger = logging.getLogger(__name__) + +ASSET_FIELDS = [ + ("pdf", "pdf_path"), + ("formal_note", "note_path"), + ("main_note", "main_note_path"), + ("ocr_fulltext", "fulltext_path"), + ("ocr_meta", "ocr_json_path"), + ("deep_reading", "main_note_path"), + ("ai_dir", "ai_path"), +] + +ALIAS_TYPES = ["zotero_key", "citation_key", "title", "doi"] + + +def compute_hash(items: list[dict]) -> str: + sorted_items = sorted(items, key=lambda e: e["zotero_key"]) + raw = json.dumps(sorted_items, sort_keys=True, ensure_ascii=False) + return hashlib.sha256(raw.encode("utf-8")).hexdigest() + + +def _resolve_vault_path(vault: Path, rel_path: str) -> Path: + if not rel_path: + return Path() + p = vault / 
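# Worked example of build_paper_row above (hypothetical entry; note the
# JSON-encoded and flattened fields):
entry = {
    "zotero_key": "ABCD1234",
    "title": "Example Paper",
    "authors": ["A. Smith", "B. Jones"],
    "maturity": {"level": 2, "level_name": "skimmed"},
    "has_pdf": True,
    "do_ocr": False,
}
row = build_paper_row(entry, "2025-01-15T00:00:00+00:00")
assert row["authors_json"] == '["A. Smith", "B. Jones"]'
assert row["maturity_level"] == 2 and row["maturity_name"] == "skimmed"
assert row["has_pdf"] == 1 and row["do_ocr"] == 0
assert row["updated_at"] == "2025-01-15T00:00:00+00:00"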
rel_path + return p.resolve() if p.exists() else p + + +def _import_reading_log(conn, vault: Path) -> int: + """Import reading-log.jsonl into reading_log table. Returns count.""" + from paperforge.memory.permanent import read_all_reading_notes + + notes = read_all_reading_notes(vault) + conn.execute("DELETE FROM reading_log") + count = 0 + for note in notes: + conn.execute( + """INSERT INTO reading_log (id, paper_id, project, section, excerpt, context, usage, note, tags_json, created_at, agent, verified) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", + ( + note["id"], note["paper_id"], + note.get("project", ""), + note["section"], note["excerpt"], + note.get("context", ""), note["usage"], + note.get("note", ""), + json.dumps(note.get("tags", []), ensure_ascii=False), + note["created_at"], + note.get("agent", ""), + 1 if note.get("verified") else 0, + ), + ) + count += 1 + return count + + +def _import_project_log(conn, vault: Path) -> int: + """Import project-log.jsonl into project_log table. Returns count.""" + from paperforge.memory.permanent import read_all_project_entries + + entries = read_all_project_entries(vault) + conn.execute("DELETE FROM project_log") + count = 0 + for entry in entries: + conn.execute( + """INSERT INTO project_log (id, project, date, type, title, decisions_json, detours_json, reusable_json, todos_json, related_papers_json, tags_json, created_at, agent) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", + ( + entry["id"], entry["project"], + entry.get("date", ""), entry["type"], entry["title"], + json.dumps(entry.get("decisions", []), ensure_ascii=False), + json.dumps(entry.get("detours", []), ensure_ascii=False), + json.dumps(entry.get("reusable", []), ensure_ascii=False), + json.dumps(entry.get("todos", []), ensure_ascii=False), + json.dumps(entry.get("related_papers", []), ensure_ascii=False), + json.dumps(entry.get("tags", []), ensure_ascii=False), + entry.get("created_at", ""), + entry.get("agent", ""), + ), + ) + count += 1 + return count + + +def _import_correction_log(conn, vault: Path) -> int: + """Import correction-log.jsonl into paper_events for FTS search. Returns count.""" + from paperforge.memory.permanent import read_all_corrections + + corrections = read_all_corrections(vault) + count = 0 + for c in corrections: + payload = { + "original_id": c.get("original_id", ""), + "correction": c.get("correction", ""), + "reason": c.get("reason", ""), + } + conn.execute( + "INSERT INTO paper_events (paper_id, event_type, payload_json) VALUES (?, 'correction_note', ?)", + (c["paper_id"], json.dumps(payload, ensure_ascii=False)), + ) + count += 1 + return count + + +def build_from_index(vault: Path) -> dict: + """Read formal-library.json and build/rebuild paperforge.db. + + Returns a dict with counts for reporting. + """ + envelope = read_index(vault) + if envelope is None: + raise FileNotFoundError( + "Canonical index not found. Run paperforge sync --rebuild-index." 
+ ) + # Legacy format: bare list of entries (pre-envelope) + if isinstance(envelope, list): + items = envelope + generated_at = "" + else: + items = envelope.get("items", []) + generated_at = envelope.get("generated_at", "") + canonical_hash = compute_hash(items) if isinstance(items, list) and items and isinstance(items[0], dict) else "" + + db_path = get_memory_db_path(vault) + conn = get_connection(db_path, read_only=False) + try: + stored_version = get_schema_version(conn) + if stored_version != CURRENT_SCHEMA_VERSION: + drop_all_tables(conn) + ensure_schema(conn) + + conn.execute("DELETE FROM paper_aliases;") + conn.execute("DELETE FROM paper_assets;") + conn.execute("DELETE FROM papers;") + + clear_fts(conn) + + conn.execute("DROP TRIGGER IF EXISTS papers_ai") + + now_utc = datetime.now(timezone.utc).isoformat() + paper_rows: list[dict] = [] + asset_rows: list[tuple] = [] + alias_rows: list[tuple] = [] + + placeholders = ", ".join([f":{c}" for c in PAPER_COLUMNS]) + cols = ", ".join(PAPER_COLUMNS) + paper_sql = f"INSERT OR REPLACE INTO papers ({cols}) VALUES ({placeholders})" + + for entry in items: + zotero_key = entry.get("zotero_key", "") + if not zotero_key: + continue + + entry["lifecycle"] = str(compute_lifecycle(entry)) + entry["maturity"] = compute_maturity(entry) + entry["next_step"] = str(compute_next_step(entry)) + paper_rows.append(build_paper_row(entry, generated_at)) + + for asset_type, entry_field in ASSET_FIELDS: + path_val = entry.get(entry_field, "") + if not path_val: + continue + rel_path = str(path_val).replace("\\", "/") + abs_path = _resolve_vault_path(vault, rel_path) + exists = 1 if abs_path.exists() else 0 + + if asset_type == "deep_reading" and abs_path.exists(): + try: + content = abs_path.read_text(encoding="utf-8") + exists = 1 if "## 🔍 精读" in content else 0 + except Exception: + exists = 0 + + asset_rows.append((zotero_key, asset_type, rel_path, exists)) + + for alias_type in ALIAS_TYPES: + raw_val = entry.get(alias_type, "") + if not raw_val: + continue + raw_str = str(raw_val) + alias_rows.append((zotero_key, raw_str, raw_str.lower().strip(), alias_type)) + + conn.executemany(paper_sql, paper_rows) + conn.executemany( + """INSERT OR REPLACE INTO paper_assets + (paper_id, asset_type, path, exists_on_disk) + VALUES (?, ?, ?, ?)""", + asset_rows, + ) + conn.executemany( + """INSERT OR REPLACE INTO paper_aliases + (paper_id, alias, alias_norm, alias_type) + VALUES (?, ?, ?, ?)""", + alias_rows, + ) + + conn.execute("""INSERT INTO paper_fts(rowid, zotero_key, citation_key, title, first_author, authors_json, abstract, journal, domain, collection_path, collections_json) + SELECT rowid, zotero_key, citation_key, title, first_author, authors_json, abstract, journal, domain, collection_path, collections_json + FROM papers""") + conn.execute(PAPERS_AI_TRIGGER) + + reading_count = _import_reading_log(conn, vault) + logger.info("Imported %d reading notes from JSONL", reading_count) + + project_count = _import_project_log(conn, vault) + logger.info("Imported %d project log entries from JSONL", project_count) + + conn.execute("DELETE FROM paper_events WHERE event_type = 'correction_note';") + correction_count = _import_correction_log(conn, vault) + logger.info("Imported %d corrections from JSONL", correction_count) + + conn.execute( + "DELETE FROM paper_events WHERE event_type != 'correction_note';" + ) + + meta_upserts = [ + ("schema_version", str(CURRENT_SCHEMA_VERSION)), + ("paperforge_version", PF_VERSION), + ("created_at", now_utc), + ("last_full_build_at", 
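# For illustration, the two index shapes accepted above (entries hypothetical):
legacy = [{"zotero_key": "ABCD1234"}]                    # pre-envelope bare list
envelope = {"generated_at": "2025-01-15T00:00:00+00:00",
            "items": [{"zotero_key": "ABCD1234"}]}       # current envelope format
for raw in (legacy, envelope):
    items = raw if isinstance(raw, list) else raw.get("items", [])
    generated_at = "" if isinstance(raw, list) else raw.get("generated_at", "")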
now_utc), + ("canonical_index_hash", canonical_hash), + ("canonical_index_generated_at", generated_at), + ] + for key, value in meta_upserts: + conn.execute( + """INSERT OR REPLACE INTO meta (key, value) VALUES (?, ?)""", + (key, value), + ) + + conn.commit() + + return { + "db_path": str(db_path), + "papers_indexed": len(paper_rows), + "assets_indexed": len(asset_rows), + "aliases_indexed": len(alias_rows), + "reading_notes_imported": reading_count, + "project_entries_imported": project_count, + "corrections_imported": correction_count, + "schema_version": str(CURRENT_SCHEMA_VERSION), + } + except Exception: + conn.rollback() + raise + finally: + conn.close() diff --git a/paperforge/memory/chunker.py b/paperforge/memory/chunker.py new file mode 100644 index 0000000..7011fa4 --- /dev/null +++ b/paperforge/memory/chunker.py @@ -0,0 +1,102 @@ +from __future__ import annotations + +import re +from pathlib import Path + +# Section detection keywords (case-insensitive, must appear as short standalone line) +SECTION_PATTERNS = [ + re.compile(r'^\s*(introduction|methods|materials|results|discussion|conclusion|abstract|background|references|supplementary|acknowledgments?)\s*$', re.IGNORECASE), + re.compile(r'^\s*(figure\s*\d+|fig\.?\s*\d+|table\s*\d+)\s*$', re.IGNORECASE), +] + +def _detect_section(line: str) -> str: + """Try to identify a section title from a line.""" + stripped = line.strip() + if len(stripped) > 80: + return "" + for pat in SECTION_PATTERNS: + m = pat.match(stripped) + if m: + return m.group(0) + # Heuristic: ALL CAPS short line, no period + if stripped.isupper() and len(stripped) > 2 and '.' not in stripped: + return stripped + # Fallback heuristic: short line that does not end like a sentence + if len(stripped) < 80 and '.' not in stripped[-5:]: + return stripped + return "" + + +def _clean_text(text: str) -> str: + """Remove image links and clean text for embedding.""" + # Remove standalone image links: ![[path]] + text = re.sub(r'^!\[\[.*\]\]\s*$', '', text, flags=re.MULTILINE) + # Replace inline images with placeholder + text = re.sub(r'!\[\[.*?\]\]', '[Figure]', text) + # Collapse multiple blank lines + text = re.sub(r'\n{3,}', '\n\n', text) + return text.strip() + + +def chunk_fulltext(fulltext_path: Path) -> list[dict]: + """Chunk a fulltext.md into embeddable segments. + + Returns list of dicts with: text, section, page_number, chunk_index, token_estimate. + """ + if not fulltext_path.exists(): + return [] + + text = _clean_text(fulltext_path.read_text(encoding="utf-8")) + + # Split by page markers; the "<!-- page N -->" marker format is an assumption about the OCR output + pages = re.split(r'<!--\s*page[:\s]*(\d+)\s*-->', text, flags=re.IGNORECASE) + # pages[0] = before first marker, pages[1] = page num, pages[2] = content, pages[3] = page num, ... 
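# Worked example, assuming the "<!-- page N -->" marker format noted above:
#   re.split(r'<!--\s*page[:\s]*(\d+)\s*-->', "intro <!-- page 1 --> A <!-- page 2 --> B")
#   -> ["intro ", "1", " A ", "2", " B"]
# i.e. odd indices are captured page numbers, even indices from 2 on are page bodies.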
+ + current_section = "Text" + parts = [] + + if len(pages) > 1 and not pages[1].strip().isdigit(): + # No page marker found, treat whole text as one page + parts = [(1, text)] + else: + for j in range(1, len(pages), 2): + if j + 1 < len(pages): + try: + page_num = int(pages[j].strip()) + page_content = pages[j + 1] + parts.append((page_num, page_content)) + except ValueError: + continue + + if not parts and text.strip(): + parts = [(1, text)] + + chunks = [] + chunk_index = 0 + for page_num, page_text in parts: + # Split page into paragraphs by double newlines + paragraphs = [p.strip() for p in re.split(r'\n\s*\n', page_text) if p.strip()] + + # Detect section headers + for para in paragraphs: + section = _detect_section(para) + if section: + current_section = section + + # Group 2-3 paragraphs per chunk with 1-paragraph overlap + i = 0 + while i < len(paragraphs): + chunk_paras = paragraphs[i:i+3] + chunk_text = "\n\n".join(chunk_paras) + token_estimate = len(chunk_text.split()) # rough: 1 token ≈ 1 word + chunks.append({ + "text": chunk_text, + "section": current_section, + "page_number": page_num, + "chunk_index": chunk_index, + "token_estimate": token_estimate, + }) + chunk_index += 1 + i += max(1, len(chunk_paras) - 1) # advance but leave 1 overlap + + return chunks diff --git a/paperforge/memory/context.py b/paperforge/memory/context.py new file mode 100644 index 0000000..01f6f7b --- /dev/null +++ b/paperforge/memory/context.py @@ -0,0 +1,93 @@ +from __future__ import annotations + +from pathlib import Path + +from paperforge.memory.db import get_connection, get_memory_db_path + + +def _build_collection_tree(conn) -> list[dict]: + """Build collection hierarchy from papers.collection_path. + + Each collection_path is pipe-separated, e.g. "骨科 | 骨折". + Returns flat list of top-level collections with sub-collections. + """ + rows = conn.execute( + "SELECT collection_path, COUNT(*) as cnt FROM papers " + "WHERE collection_path != '' " + "GROUP BY collection_path ORDER BY cnt DESC" + ).fetchall() + top: dict[str, dict] = {} + for row in rows: + parts = [p.strip() for p in row["collection_path"].split("|") if p.strip()] + if not parts: + continue + root = parts[0] + if root not in top: + top[root] = {"name": root, "count": 0, "sub": []} + top[root]["count"] += row["cnt"] + if len(parts) > 1: + sub_name = parts[-1] + if sub_name not in top[root]["sub"]: + top[root]["sub"].append(sub_name) + for c in top.values(): + c["sub"] = sorted(c["sub"]) + return sorted(top.values(), key=lambda x: -x["count"]) + + +def get_agent_context(vault: Path) -> dict | None: + """Build agent context from paperforge.db — library stats + collection tree. + + Returns None if DB is missing or query fails. 
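    Shape sketch (hypothetical values):
        {"library": {"paper_count": 120,
                     "domain_counts": {"骨科": 80},
                     "lifecycle_counts": {...}, "ocr_counts": {...},
                     "deep_reading_counts": {...}},
         "collections": [{"name": "骨科", "count": 80, "sub": ["骨折"]}]}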
+ """ + db_path = get_memory_db_path(vault) + if not db_path.exists(): + return None + + conn = get_connection(db_path, read_only=True) + try: + total = conn.execute("SELECT COUNT(*) FROM papers").fetchone()[0] + + domains = { + r["domain"]: r["cnt"] + for r in conn.execute( + "SELECT domain, COUNT(*) as cnt FROM papers GROUP BY domain ORDER BY cnt DESC" + ).fetchall() + } + + lifecycle_counts = { + r["lifecycle"]: r["cnt"] + for r in conn.execute( + "SELECT lifecycle, COUNT(*) as cnt FROM papers GROUP BY lifecycle" + ).fetchall() + } + + ocr_counts = { + r["ocr_status"]: r["cnt"] + for r in conn.execute( + "SELECT ocr_status, COUNT(*) as cnt FROM papers GROUP BY ocr_status" + ).fetchall() + } + + deep_counts = { + r["deep_reading_status"]: r["cnt"] + for r in conn.execute( + "SELECT deep_reading_status, COUNT(*) as cnt FROM papers GROUP BY deep_reading_status" + ).fetchall() + } + + collections = _build_collection_tree(conn) + + return { + "library": { + "paper_count": total, + "domain_counts": domains, + "lifecycle_counts": lifecycle_counts, + "ocr_counts": ocr_counts, + "deep_reading_counts": deep_counts, + }, + "collections": collections, + } + except Exception: + return None + finally: + conn.close() diff --git a/paperforge/memory/db.py b/paperforge/memory/db.py new file mode 100644 index 0000000..19dbf83 --- /dev/null +++ b/paperforge/memory/db.py @@ -0,0 +1,35 @@ +from __future__ import annotations + +import sqlite3 +from pathlib import Path + +from paperforge.config import paperforge_paths + + +def get_memory_db_path(vault: Path) -> Path: + """Return the absolute path to paperforge.db.""" + paths = paperforge_paths(vault) + db_path = paths.get("memory_db") + if not db_path: + raise FileNotFoundError("memory_db path not configured") + return db_path + + +def get_connection(db_path: Path, read_only: bool = False) -> sqlite3.Connection: + """Open a SQLite connection to paperforge.db with WAL mode. + + Args: + db_path: Path to paperforge.db. + read_only: If True, open in read-only mode (for queries). + """ + if read_only: + uri = "file:" + db_path.as_posix() + "?mode=ro" + conn = sqlite3.connect(uri, uri=True) + else: + db_path.parent.mkdir(parents=True, exist_ok=True) + conn = sqlite3.connect(str(db_path)) + conn.row_factory = sqlite3.Row + if not read_only: + conn.execute("PRAGMA journal_mode=WAL;") + conn.execute("PRAGMA foreign_keys=ON;") + return conn diff --git a/paperforge/memory/events.py b/paperforge/memory/events.py new file mode 100644 index 0000000..9ef3d75 --- /dev/null +++ b/paperforge/memory/events.py @@ -0,0 +1,96 @@ +from __future__ import annotations + +import json +from pathlib import Path + +from paperforge.memory.db import get_connection, get_memory_db_path + + +def write_reading_note(vault: Path, paper_id: str, section: str, + excerpt: str, usage: str = "", note: str = "", + context: str = "", project: str = "", + tags: list[str] | None = None) -> bool: + """DEPRECATED: Wraps append_reading_note(). Use permanent.py directly. + + Kept for backward compatibility. Does NOT write to paper_events anymore. 
+ """ + from paperforge.memory.permanent import append_reading_note + result = append_reading_note( + vault, paper_id, section, excerpt, + usage=usage, context=context, note=note, + project=project, tags=tags, + ) + return bool(result.get("ok")) + + +def export_reading_log(vault: Path, since: str = "", limit: int = 50) -> list[dict]: + """Export reading notes from JSONL (source of truth).""" + from paperforge.memory.permanent import read_all_reading_notes + + notes = read_all_reading_notes(vault) + + # Optionally enrich with papers metadata from DB + db_path = get_memory_db_path(vault) + paper_cache = {} + if db_path.exists(): + conn = get_connection(db_path, read_only=True) + try: + rows = conn.execute( + "SELECT zotero_key, citation_key, title, year, first_author FROM papers" + ).fetchall() + for r in rows: + paper_cache[r["zotero_key"]] = dict(r) + finally: + conn.close() + + results = [] + for n in notes: + created = n.get("created_at", "") + if since and created < since: + continue + pid = n.get("paper_id", "") + meta = paper_cache.get(pid, {}) + results.append({ + "created_at": created, + "paper_id": pid, + "citation_key": meta.get("citation_key", ""), + "title": meta.get("title", ""), + "year": meta.get("year", ""), + "first_author": meta.get("first_author", ""), + "section": n.get("section", ""), + "excerpt": n.get("excerpt", ""), + "usage": n.get("usage", ""), + "note": n.get("note", ""), + }) + + # Sort DESC by created_at, apply limit + results.sort(key=lambda x: x["created_at"], reverse=True) + return results[:limit] + + +def write_correction_note(vault: Path, paper_id: str, original_id: str, + correction: str, reason: str = "") -> bool: + """Record a correction note for a prior reading_note event.""" + db_path = get_memory_db_path(vault) + if not db_path.exists(): + return False + + payload = { + "original_id": original_id, + "correction": correction, + "reason": reason, + } + conn = get_connection(db_path, read_only=False) + try: + conn.execute( + """INSERT INTO paper_events (paper_id, event_type, payload_json) + VALUES (?, 'correction_note', ?)""", + (paper_id, json.dumps(payload, ensure_ascii=False)), + ) + conn.commit() + return True + except Exception: + conn.rollback() + return False + finally: + conn.close() diff --git a/paperforge/memory/fts.py b/paperforge/memory/fts.py new file mode 100644 index 0000000..4a760b0 --- /dev/null +++ b/paperforge/memory/fts.py @@ -0,0 +1,99 @@ +from __future__ import annotations + +import re +import sqlite3 + + +def tokenize_for_fts(q: str) -> str: + """Extract alphanumeric + CJK tokens and quote for safe FTS.""" + tokens = re.findall(r"[\w\u4e00-\u9fff]+", q) + if not tokens: + return q + return " OR ".join(f'"{t}"' for t in tokens) + + +def search_papers(conn: sqlite3.Connection, query: str, limit: int = 20, + domain: str = "", year_from: int = 0, year_to: int = 0, + ocr_status: str = "", deep_status: str = "", + lifecycle: str = "", next_step: str = "") -> list[dict]: + """Full-text search with safe fallback for special characters.""" + + filter_conditions = [] + filter_params = [] + + if domain: + filter_conditions.append("p.domain = ?") + filter_params.append(domain) + if year_from: + filter_conditions.append("CAST(p.year AS INTEGER) >= ?") + filter_params.append(year_from) + if year_to: + filter_conditions.append("CAST(p.year AS INTEGER) <= ?") + filter_params.append(year_to) + if ocr_status: + filter_conditions.append("p.ocr_status = ?") + filter_params.append(ocr_status) + if deep_status: + 
filter_conditions.append("p.deep_reading_status = ?") + filter_params.append(deep_status) + if lifecycle: + filter_conditions.append("p.lifecycle = ?") + filter_params.append(lifecycle) + if next_step: + filter_conditions.append("p.next_step = ?") + filter_params.append(next_step) + + filter_clause = (" AND " + " AND ".join(filter_conditions)) if filter_conditions else "" + + # Level 1: Raw FTS + try: + return _fts_query(conn, query, filter_clause, filter_params, limit) + except sqlite3.OperationalError: + pass + + # Level 2: Quoted token FTS + token_query = tokenize_for_fts(query) + if token_query != query: + try: + return _fts_query(conn, token_query, filter_clause, filter_params, limit) + except sqlite3.OperationalError: + pass + + # Level 3: LIKE fallback + return _like_query(conn, query, filter_clause, filter_params, limit) + + +def _fts_query(conn, query, filter_clause, filter_params, limit): + sql = f""" + SELECT p.zotero_key, p.citation_key, p.title, p.year, p.doi, + p.first_author, p.journal, p.domain, p.lifecycle, + p.ocr_status, p.deep_reading_status, p.next_step, + substr(p.abstract, 1, 300) as abstract, + rank + FROM paper_fts f + JOIN papers p ON p.rowid = f.rowid + WHERE paper_fts MATCH ?{filter_clause} + ORDER BY rank + LIMIT ? + """ + conn.row_factory = sqlite3.Row + rows = conn.execute(sql, [query] + filter_params + [limit]).fetchall() + return [dict(r) for r in rows] + + +def _like_query(conn, query, filter_clause, filter_params, limit): + like_param = f"%{query}%" + sql = f""" + SELECT p.zotero_key, p.citation_key, p.title, p.year, p.doi, + p.first_author, p.journal, p.domain, p.lifecycle, + p.ocr_status, p.deep_reading_status, p.next_step, + substr(p.abstract, 1, 300) as abstract, + 0 as rank + FROM papers p + WHERE (p.title LIKE ? OR p.abstract LIKE ? OR p.doi LIKE ? OR p.citation_key LIKE ?){filter_clause} + ORDER BY p.year DESC + LIMIT ? 
+ """ + conn.row_factory = sqlite3.Row + rows = conn.execute(sql, [like_param, like_param, like_param, like_param] + filter_params + [limit]).fetchall() + return [dict(r) for r in rows] diff --git a/paperforge/memory/permanent.py b/paperforge/memory/permanent.py new file mode 100644 index 0000000..cf033fc --- /dev/null +++ b/paperforge/memory/permanent.py @@ -0,0 +1,217 @@ +from __future__ import annotations + +import json +import datetime +import logging +import secrets +from pathlib import Path + +from paperforge.config import paperforge_paths + +logger = logging.getLogger(__name__) + + +def _logs_dir(vault: Path) -> Path: + paths = paperforge_paths(vault) + return paths["paperforge"] / "logs" + + +def _ensure_logs_dir(vault: Path) -> Path: + log_dir = _logs_dir(vault) + log_dir.mkdir(parents=True, exist_ok=True) + return log_dir + + +# ── Reading Log ──────────────────────────────────────────────────────────── + + +def get_reading_log_path(vault: Path) -> Path: + return _logs_dir(vault) / "reading-log.jsonl" + + +def append_reading_note( + vault: Path, + paper_id: str, + section: str, + excerpt: str, + usage: str = "", + context: str = "", + note: str = "", + project: str = "", + tags: list[str] | None = None, + agent: str = "", +) -> dict: + if not paper_id: + return {"ok": False, "error": "paper_id is required"} + if not excerpt: + return {"ok": False, "error": "excerpt is required"} + + date_str = datetime.date.today().strftime("%Y%m%d") + entry_id = f"rln_{date_str}_{secrets.token_hex(4)}" + now = datetime.datetime.now(datetime.timezone.utc).isoformat() + + entry: dict[str, object] = { + "id": entry_id, + "created_at": now, + "paper_id": paper_id, + "section": section, + "excerpt": excerpt, + "usage": usage, + "context": context, + "note": note, + "project": project, + "tags": tags or [], + "agent": agent, + "verified": False, + } + + log_dir = _ensure_logs_dir(vault) + filepath = log_dir / "reading-log.jsonl" + + try: + with filepath.open("a", encoding="utf-8") as f: + f.write(json.dumps(entry, ensure_ascii=False) + "\n") + except OSError as e: + return {"ok": False, "error": str(e)} + + return {"ok": True, "id": entry_id, "path": str(filepath)} + + +def _read_jsonl(filepath: Path) -> list[dict]: + if not filepath.exists(): + return [] + entries: list[dict] = [] + with filepath.open("r", encoding="utf-8") as f: + for line_no, line in enumerate(f, 1): + stripped = line.strip() + if not stripped: + continue + try: + entries.append(json.loads(stripped)) + except json.JSONDecodeError: + logger.warning( + "Skipping malformed JSON line %d in %s", line_no, filepath + ) + return entries + + +def read_all_reading_notes(vault: Path) -> list[dict]: + filepath = get_reading_log_path(vault) + return _read_jsonl(filepath) + + +def get_reading_notes_for_paper(vault: Path, paper_id: str) -> list[dict]: + all_notes = read_all_reading_notes(vault) + return [n for n in all_notes if n.get("paper_id") == paper_id] + + +# ── Project Log ──────────────────────────────────────────────────────────── + + +def get_project_log_path(vault: Path) -> Path: + return _logs_dir(vault) / "project-log.jsonl" + + +def append_project_entry(vault: Path, entry: dict) -> dict: + date_str = datetime.date.today().strftime("%Y%m%d") + entry_id = f"plog_{date_str}_{secrets.token_hex(4)}" + now = datetime.datetime.now(datetime.timezone.utc).isoformat() + + record: dict[str, object] = { + "id": entry_id, + "created_at": now, + "project": entry.get("project", ""), + "date": entry.get("date", ""), + "type": entry.get("type", ""), 
+ "title": entry.get("title", ""), + "decisions": entry.get("decisions", []), + "detours": entry.get("detours", []), + "reusable": entry.get("reusable", []), + "todos": entry.get("todos", []), + "related_papers": entry.get("related_papers", []), + "tags": entry.get("tags", []), + "agent": entry.get("agent", ""), + } + + log_dir = _ensure_logs_dir(vault) + filepath = log_dir / "project-log.jsonl" + + try: + with filepath.open("a", encoding="utf-8") as f: + f.write(json.dumps(record, ensure_ascii=False) + "\n") + except OSError as e: + return {"ok": False, "error": str(e)} + + return {"ok": True, "id": entry_id, "path": str(filepath)} + + +def read_all_project_entries(vault: Path) -> list[dict]: + filepath = get_project_log_path(vault) + return _read_jsonl(filepath) + + +def get_project_entries(vault: Path, project: str) -> list[dict]: + all_entries = read_all_project_entries(vault) + return [e for e in all_entries if e.get("project") == project] + + +# ── Correction Log ────────────────────────────────────────────────────────── + + +def get_correction_log_path(vault: Path) -> Path: + return _logs_dir(vault) / "correction-log.jsonl" + + +def append_correction( + vault: Path, + paper_id: str, + original_id: str, + correction: str, + reason: str = "", + agent: str = "", +) -> dict: + """Append a correction record to correction-log.jsonl.""" + if not paper_id: + return {"ok": False, "error": "paper_id is required"} + if not original_id: + return {"ok": False, "error": "original_id is required"} + if not correction: + return {"ok": False, "error": "correction is required"} + + date_str = datetime.date.today().strftime("%Y%m%d") + entry_id = f"corr_{date_str}_{secrets.token_hex(4)}" + now = datetime.datetime.now(datetime.timezone.utc).isoformat() + + entry: dict[str, object] = { + "id": entry_id, + "event_type": "correction", + "created_at": now, + "paper_id": paper_id, + "original_id": original_id, + "correction": correction, + "reason": reason, + "agent": agent, + } + + log_dir = _ensure_logs_dir(vault) + filepath = log_dir / "correction-log.jsonl" + + try: + with filepath.open("a", encoding="utf-8") as f: + f.write(json.dumps(entry, ensure_ascii=False) + "\n") + except OSError as e: + return {"ok": False, "error": str(e)} + + return {"ok": True, "id": entry_id, "path": str(filepath)} + + +def read_all_corrections(vault: Path) -> list[dict]: + """Read all correction entries from correction-log.jsonl.""" + filepath = get_correction_log_path(vault) + return _read_jsonl(filepath) + + +def get_corrections_for_paper(vault: Path, paper_id: str) -> list[dict]: + """Get all corrections for a specific paper.""" + all_corrections = read_all_corrections(vault) + return [c for c in all_corrections if c.get("paper_id") == paper_id] diff --git a/paperforge/memory/query.py b/paperforge/memory/query.py new file mode 100644 index 0000000..5e0df91 --- /dev/null +++ b/paperforge/memory/query.py @@ -0,0 +1,188 @@ +from __future__ import annotations + +import json +import logging +from pathlib import Path + +from paperforge.memory.builder import compute_hash +from paperforge.memory.db import get_connection, get_memory_db_path +from paperforge.memory.schema import CURRENT_SCHEMA_VERSION, get_schema_version +from paperforge.worker.asset_index import read_index +from paperforge.worker.asset_state import compute_health + +logger = logging.getLogger(__name__) + + +def get_memory_status(vault: Path) -> dict: + """Check paperforge.db health and staleness. 
+ + Returns a dict with: db_exists, schema_ok, fresh, count_match, + paper_count_db, paper_count_index, needs_rebuild. + """ + db_path = get_memory_db_path(vault) + result = { + "db_exists": db_path.exists(), + "schema_ok": False, + "fresh": False, + "hash_match": False, + "count_match": False, + "paper_count_db": 0, + "paper_count_index": 0, + "needs_rebuild": True, + } + if not db_path.exists(): + return result + + conn = get_connection(db_path, read_only=True) + try: + stored_version = get_schema_version(conn) + result["schema_ok"] = stored_version == CURRENT_SCHEMA_VERSION + row = conn.execute("SELECT COUNT(*) as cnt FROM papers").fetchone() + result["paper_count_db"] = row["cnt"] if row else 0 + stored_hash_row = conn.execute( + "SELECT value FROM meta WHERE key = 'canonical_index_hash'" + ).fetchone() + stored_hash = stored_hash_row["value"] if stored_hash_row else "" + except Exception: + return result + finally: + conn.close() + + envelope = read_index(vault) + if envelope is not None: + # Handle legacy format (bare list) + if isinstance(envelope, list): + items = envelope + paper_count = len(items) + index_hash = compute_hash(items) + else: + items = envelope.get("items", []) + paper_count = envelope.get("paper_count", 0) + index_hash = compute_hash(items) + result["paper_count_index"] = paper_count + + # Compare stored hash with computed hash + result["hash_match"] = stored_hash == index_hash + + result["count_match"] = ( + result["paper_count_db"] == result["paper_count_index"] + ) + + result["fresh"] = ( + result["schema_ok"] + and result["count_match"] + and result.get("hash_match", False) + ) + result["needs_rebuild"] = not result["fresh"] + return result + + +def _entry_from_row(row) -> dict: + """Reconstruct an entry dict from a papers row (sqlite3.Row).""" + entry = {k: row[k] for k in row.keys()} + for key in ("has_pdf", "do_ocr", "analyze"): + if key in entry and entry[key] is not None: + entry[key] = bool(entry[key]) + for key in ("authors_json", "collections_json"): + if key in entry and entry[key]: + try: + entry[key[:-5]] = json.loads(entry[key]) + del entry[key] + except json.JSONDecodeError: + logger.warning( + "Corrupted JSON in column %s for paper %s", + key, entry.get("zotero_key", "?"), + ) + return entry + + +def lookup_paper(conn, query: str) -> list[dict]: + """Multi-strategy lookup. Returns list of matching paper dicts.""" + q = query.strip() + + for lookup_col in ("zotero_key", "citation_key", "doi"): + row = conn.execute( + f"SELECT * FROM papers WHERE LOWER({lookup_col}) = LOWER(?)", + (q,), + ).fetchone() + if row: + return [_entry_from_row(row)] + + rows = conn.execute( + """SELECT * FROM papers + WHERE LOWER(title) LIKE '%' || LOWER(?) || '%' + LIMIT 20""", + (q,), + ).fetchall() + if rows: + return [_entry_from_row(r) for r in rows] + + rows = conn.execute( + """SELECT p.* FROM papers p + JOIN paper_aliases a ON a.paper_id = p.zotero_key + WHERE a.alias_norm LIKE '%' || LOWER(?) || '%' + LIMIT 20""", + (q,), + ).fetchall() + return [_entry_from_row(r) for r in rows] + + +def get_paper_assets(conn, zotero_key: str) -> list[dict]: + rows = conn.execute( + "SELECT asset_type, path, exists_on_disk FROM paper_assets WHERE paper_id = ?", + (zotero_key,), + ).fetchall() + return [dict(r) for r in rows] + + +def get_paper_status(vault: Path, query: str) -> dict | None: + """Full paper status lookup. Returns dict or None if not found. + + If multiple candidates found, returns a candidate list without full status. 
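+
+    Sketch of the two return shapes (keys as set below; values are
+    hypothetical):
+
+        >>> get_paper_status(vault, "ABCD1234")["resolved"]  # doctest: +SKIP
+        True
+        >>> get_paper_status(vault, "transformer")           # doctest: +SKIP
+        {'resolved': False, 'candidates': [...]}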
+ """ + db_path = get_memory_db_path(vault) + if not db_path.exists(): + return None + + conn = get_connection(db_path, read_only=True) + try: + entries = lookup_paper(conn, query) + if not entries: + return None + + # Multiple candidates -> return candidate list only (no full status) + if len(entries) > 1: + return { + "resolved": False, + "candidates": [ + { + "zotero_key": e.get("zotero_key"), + "title": e.get("title"), + "year": e.get("year"), + "citation_key": e.get("citation_key"), + "lifecycle": e.get("lifecycle"), + } + for e in entries + ], + } + + entry = entries[0] + assets = get_paper_assets(conn, entry["zotero_key"]) + entry["health"] = compute_health(entry) + entry["assets"] = assets + entry["resolved"] = True + + next_step = entry.get("next_step", "") + zk = entry.get("zotero_key", "") + if next_step == "/pf-deep": + entry["recommended_action"] = f"/pf-deep {zk}" + elif next_step == "ocr": + entry["recommended_action"] = f"paperforge ocr --key {zk}" + elif next_step == "sync": + entry["recommended_action"] = "paperforge sync" + else: + entry["recommended_action"] = None + + return entry + finally: + conn.close() diff --git a/paperforge/memory/refresh.py b/paperforge/memory/refresh.py new file mode 100644 index 0000000..280b90c --- /dev/null +++ b/paperforge/memory/refresh.py @@ -0,0 +1,128 @@ +from __future__ import annotations + +from datetime import datetime, timezone +from pathlib import Path + +from paperforge.memory._columns import PAPER_COLUMNS, build_paper_row +from paperforge.memory.builder import ( + ALIAS_TYPES, + ASSET_FIELDS, + _resolve_vault_path, +) +from paperforge.memory.db import get_connection, get_memory_db_path +from paperforge.memory.schema import PAPERS_AI_TRIGGER, ensure_schema +from paperforge.worker.asset_state import ( + compute_lifecycle, + compute_maturity, + compute_next_step, +) + + +def refresh_paper(vault: Path, entry: dict) -> bool: + """Upsert a single paper into memory DB. 
Entry is from _build_entry() output.""" + zotero_key = entry.get("zotero_key", "") + if not zotero_key: + return False + + generated_at = datetime.now(timezone.utc).isoformat() + + db_path = get_memory_db_path(vault) + if not db_path.exists(): + return False + + conn = get_connection(db_path, read_only=False) + try: + ensure_schema(conn) + + conn.execute("DROP TRIGGER IF EXISTS papers_ai") + + entry["lifecycle"] = str(compute_lifecycle(entry)) + entry["maturity"] = compute_maturity(entry) + entry["next_step"] = str(compute_next_step(entry)) + paper_values = build_paper_row(entry, generated_at) + + # Step 1: Get old rowid before papers upsert (rowid may change on REPLACE) + old = conn.execute( + "SELECT rowid FROM papers WHERE zotero_key = ?", + (zotero_key,), + ).fetchone() + + # Step 2: Delete old FTS row BEFORE papers changes + if old: + conn.execute( + "DELETE FROM paper_fts WHERE rowid = ?", + (old["rowid"],), + ) + + # Step 3: Upsert papers + placeholders = ", ".join([f":{c}" for c in PAPER_COLUMNS]) + cols = ", ".join(PAPER_COLUMNS) + conn.execute( + f"INSERT OR REPLACE INTO papers ({cols}) VALUES ({placeholders})", + paper_values, + ) + + conn.execute("DELETE FROM paper_assets WHERE paper_id = ?", (zotero_key,)) + for asset_type, entry_field in ASSET_FIELDS: + path_val = entry.get(entry_field, "") + if not path_val: + continue + rel_path = str(path_val).replace("\\", "/") + abs_path = _resolve_vault_path(vault, rel_path) + exists = 1 if abs_path.exists() else 0 + if asset_type == "deep_reading" and abs_path.exists(): + try: + content = abs_path.read_text(encoding="utf-8") + exists = 1 if "## \U0001f52d \u7cbe\u8bfb" in content else 0 + except Exception: + exists = 0 + conn.execute( + "INSERT OR REPLACE INTO paper_assets (paper_id, asset_type, path, exists_on_disk) VALUES (?, ?, ?, ?)", + (zotero_key, asset_type, rel_path, exists), + ) + + conn.execute("DELETE FROM paper_aliases WHERE paper_id = ?", (zotero_key,)) + for alias_type in ALIAS_TYPES: + raw_val = entry.get(alias_type, "") + if not raw_val: + continue + raw_str = str(raw_val) + conn.execute( + "INSERT OR REPLACE INTO paper_aliases (paper_id, alias, alias_norm, alias_type) VALUES (?, ?, ?, ?)", + (zotero_key, raw_str, raw_str.lower().strip(), alias_type), + ) + + # Step 4: Get new rowid after upsert + new = conn.execute( + "SELECT rowid FROM papers WHERE zotero_key = ?", + (zotero_key,), + ).fetchone() + + # Step 5: Insert new FTS row + if new: + conn.execute( + "INSERT INTO paper_fts(rowid, zotero_key, citation_key, title, first_author, authors_json, abstract, journal, domain, collection_path, collections_json) " + "VALUES ((SELECT rowid FROM papers WHERE zotero_key = ?), ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + ( + zotero_key, + zotero_key, + entry.get("citation_key", ""), + entry.get("title", ""), + entry.get("first_author", ""), + paper_values["authors_json"], + entry.get("abstract", ""), + entry.get("journal", ""), + entry.get("domain", ""), + entry.get("collection_path", ""), + paper_values["collections_json"], + ), + ) + conn.execute(PAPERS_AI_TRIGGER) + + conn.commit() + return True + except Exception: + conn.rollback() + raise + finally: + conn.close() diff --git a/paperforge/memory/schema.py b/paperforge/memory/schema.py new file mode 100644 index 0000000..9989ffa --- /dev/null +++ b/paperforge/memory/schema.py @@ -0,0 +1,218 @@ +from __future__ import annotations + +import sqlite3 + +CURRENT_SCHEMA_VERSION = 2 # Bump from 1 for reading_log + project_log tables + +CREATE_META = """ +CREATE TABLE IF NOT EXISTS meta ( + 
key TEXT PRIMARY KEY, + value TEXT NOT NULL +); +""" + +CREATE_PAPERS = """ +CREATE TABLE IF NOT EXISTS papers ( + zotero_key TEXT PRIMARY KEY, + citation_key TEXT NOT NULL DEFAULT '', + title TEXT NOT NULL, + year TEXT, + doi TEXT, + pmid TEXT, + journal TEXT, + first_author TEXT, + authors_json TEXT, + abstract TEXT, + domain TEXT, + collection_path TEXT, + collections_json TEXT, + has_pdf INTEGER NOT NULL DEFAULT 0, + do_ocr INTEGER, + analyze INTEGER, + ocr_status TEXT, + deep_reading_status TEXT, + ocr_job_id TEXT, + impact_factor REAL, + lifecycle TEXT, + maturity_level INTEGER, + maturity_name TEXT, + next_step TEXT, + pdf_path TEXT, + note_path TEXT, + main_note_path TEXT, + paper_root TEXT, + fulltext_path TEXT, + ocr_md_path TEXT, + ocr_json_path TEXT, + ai_path TEXT, + deep_reading_md_path TEXT, + updated_at TEXT +); +""" + +CREATE_ASSETS = """ +CREATE TABLE IF NOT EXISTS paper_assets ( + paper_id TEXT NOT NULL, + asset_type TEXT NOT NULL, + path TEXT NOT NULL, + exists_on_disk INTEGER NOT NULL DEFAULT 0, + PRIMARY KEY (paper_id, asset_type), + FOREIGN KEY (paper_id) REFERENCES papers(zotero_key) +); +""" + +CREATE_ALIASES = """ +CREATE TABLE IF NOT EXISTS paper_aliases ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + paper_id TEXT NOT NULL, + alias TEXT NOT NULL, + alias_norm TEXT NOT NULL, + alias_type TEXT NOT NULL, + FOREIGN KEY (paper_id) REFERENCES papers(zotero_key) +); +""" + +INDEX_SQL = [ + "CREATE INDEX IF NOT EXISTS idx_papers_doi ON papers(doi);", + "CREATE INDEX IF NOT EXISTS idx_papers_citation_key ON papers(citation_key);", + "CREATE INDEX IF NOT EXISTS idx_papers_domain ON papers(domain);", + "CREATE INDEX IF NOT EXISTS idx_papers_year ON papers(year);", + "CREATE INDEX IF NOT EXISTS idx_papers_ocr_status ON papers(ocr_status);", + "CREATE INDEX IF NOT EXISTS idx_papers_deep_status ON papers(deep_reading_status);", + "CREATE INDEX IF NOT EXISTS idx_papers_lifecycle ON papers(lifecycle);", + "CREATE INDEX IF NOT EXISTS idx_papers_next_step ON papers(next_step);", +] + +CREATE_PAPER_FTS = """ +CREATE VIRTUAL TABLE IF NOT EXISTS paper_fts USING fts5( + zotero_key, + citation_key, + title, + first_author, + authors_json, + abstract, + journal, + domain, + collection_path, + collections_json, + content='papers', + content_rowid='rowid' +); +""" + +PAPERS_AI_TRIGGER = """CREATE TRIGGER IF NOT EXISTS papers_ai AFTER INSERT ON papers BEGIN + INSERT INTO paper_fts(rowid, zotero_key, citation_key, title, first_author, authors_json, abstract, journal, domain, collection_path, collections_json) + VALUES (new.rowid, new.zotero_key, new.citation_key, new.title, new.first_author, new.authors_json, new.abstract, new.journal, new.domain, new.collection_path, new.collections_json); +END;""" + +FTS_TRIGGERS = [ + PAPERS_AI_TRIGGER, + """CREATE TRIGGER IF NOT EXISTS papers_ad AFTER DELETE ON papers BEGIN + INSERT INTO paper_fts(paper_fts, rowid, zotero_key, citation_key, title, first_author, authors_json, abstract, journal, domain, collection_path, collections_json) + VALUES ('delete', old.rowid, old.zotero_key, old.citation_key, old.title, old.first_author, old.authors_json, old.abstract, old.journal, old.domain, old.collection_path, old.collections_json); + END;""", + """CREATE TRIGGER IF NOT EXISTS papers_au AFTER UPDATE ON papers BEGIN + INSERT INTO paper_fts(paper_fts, rowid, zotero_key, citation_key, title, first_author, authors_json, abstract, journal, domain, collection_path, collections_json) + VALUES ('delete', old.rowid, old.zotero_key, old.citation_key, old.title, 
old.first_author, old.authors_json, old.abstract, old.journal, old.domain, old.collection_path, old.collections_json); + INSERT INTO paper_fts(rowid, zotero_key, citation_key, title, first_author, authors_json, abstract, journal, domain, collection_path, collections_json) + VALUES (new.rowid, new.zotero_key, new.citation_key, new.title, new.first_author, new.authors_json, new.abstract, new.journal, new.domain, new.collection_path, new.collections_json); + END;""", +] + +CREATE_EVENTS = """ +CREATE TABLE IF NOT EXISTS paper_events ( + event_id INTEGER PRIMARY KEY AUTOINCREMENT, + paper_id TEXT NOT NULL, + event_type TEXT NOT NULL, + created_at TEXT NOT NULL DEFAULT (datetime('now')), + payload_json TEXT, + FOREIGN KEY (paper_id) REFERENCES papers(zotero_key) +); +""" + +EVENT_INDEX_SQL = [ + "CREATE INDEX IF NOT EXISTS idx_events_paper ON paper_events(paper_id);", + "CREATE INDEX IF NOT EXISTS idx_events_type ON paper_events(event_type);", + "CREATE INDEX IF NOT EXISTS idx_events_time ON paper_events(created_at);", +] + +CREATE_READING_LOG = """ +CREATE TABLE IF NOT EXISTS reading_log ( + id TEXT PRIMARY KEY, + paper_id TEXT NOT NULL, + project TEXT DEFAULT '', + section TEXT NOT NULL, + excerpt TEXT NOT NULL, + context TEXT DEFAULT '', + usage TEXT NOT NULL, + note TEXT DEFAULT '', + tags_json TEXT DEFAULT '[]', + created_at TEXT NOT NULL, + agent TEXT DEFAULT '', + verified INTEGER DEFAULT 0, + FOREIGN KEY (paper_id) REFERENCES papers(zotero_key) +); +""" + +CREATE_PROJECT_LOG = """ +CREATE TABLE IF NOT EXISTS project_log ( + id TEXT PRIMARY KEY, + project TEXT NOT NULL, + date TEXT NOT NULL, + type TEXT NOT NULL, + title TEXT NOT NULL, + decisions_json TEXT DEFAULT '[]', + detours_json TEXT DEFAULT '[]', + reusable_json TEXT DEFAULT '[]', + todos_json TEXT DEFAULT '[]', + related_papers_json TEXT DEFAULT '[]', + tags_json TEXT DEFAULT '[]', + created_at TEXT NOT NULL, + agent TEXT DEFAULT '' +); +""" + +ALL_TABLES = ["paper_fts", "papers", "paper_assets", "paper_aliases", "meta", "paper_events", "reading_log", "project_log"] + + +def ensure_schema(conn: sqlite3.Connection) -> None: + """Create tables and indexes if they don't exist.""" + conn.execute(CREATE_META) + conn.execute(CREATE_PAPERS) + conn.execute(CREATE_ASSETS) + conn.execute(CREATE_ALIASES) + conn.execute(CREATE_PAPER_FTS) + conn.execute(CREATE_EVENTS) + conn.execute(CREATE_READING_LOG) + conn.execute(CREATE_PROJECT_LOG) + for idx_sql in INDEX_SQL: + conn.execute(idx_sql) + for idx_sql in EVENT_INDEX_SQL: + conn.execute(idx_sql) + for trigger_sql in FTS_TRIGGERS: + conn.execute(trigger_sql) + conn.commit() + + +def drop_all_tables(conn: sqlite3.Connection) -> None: + """Drop all Memory Layer tables (for rebuild).""" + for table in ALL_TABLES: + conn.execute(f"DROP TABLE IF EXISTS {table};") + conn.commit() + + +def clear_fts(conn: sqlite3.Connection) -> None: + """Delete all FTS index entries (before rebuild).""" + conn.execute("DELETE FROM paper_fts;") + conn.commit() + + +def get_schema_version(conn: sqlite3.Connection) -> int: + """Read the stored schema version from meta table, or 0 if not found.""" + try: + row = conn.execute( + "SELECT value FROM meta WHERE key = 'schema_version'" + ).fetchone() + return int(row["value"]) if row else 0 + except sqlite3.OperationalError: + return 0 diff --git a/paperforge/memory/vector_db.py b/paperforge/memory/vector_db.py new file mode 100644 index 0000000..08e8e80 --- /dev/null +++ b/paperforge/memory/vector_db.py @@ -0,0 +1,305 @@ +from __future__ import annotations + +import json 
+import logging +import os +from pathlib import Path + +logger = logging.getLogger(__name__) + +# Lazy imports to avoid requiring chromadb unless actually used +_chroma = None +_ST = None + +def _get_chroma(): + global _chroma + if _chroma is None: + import chromadb + _chroma = chromadb + return _chroma + +def _get_st(): + global _ST + if _ST is None: + from sentence_transformers import SentenceTransformer + _ST = SentenceTransformer + return _ST + + +def _read_plugin_settings(vault: Path) -> dict: + """Read plugin data.json for vector_db settings.""" + data_path = vault / ".obsidian" / "plugins" / "paperforge" / "data.json" + if data_path.exists(): + return json.loads(data_path.read_text(encoding="utf-8")) + return {} + + +def get_vector_db_path(vault: Path) -> Path: + """Return the ChromaDB persistence directory.""" + from paperforge.config import paperforge_paths + paths = paperforge_paths(vault) + return (paths.get("memory_db", paths.get("index", vault / "System" / "PaperForge"))).parent / "vectors" + + +def get_collection(vault: Path): + """Get or create the ChromaDB collection for paperforge.""" + chroma = _get_chroma() + db_path = get_vector_db_path(vault) + db_path.mkdir(parents=True, exist_ok=True) + client = chroma.PersistentClient(path=str(db_path)) + # Delete and recreate if schema changed + try: + return client.get_or_create_collection( + name="paperforge_fulltext", + metadata={"hnsw:space": "cosine"}, + ) + except Exception: + client.delete_collection("paperforge_fulltext") + return client.create_collection( + name="paperforge_fulltext", + metadata={"hnsw:space": "cosine"}, + ) + + +_cached_model = None +_cached_model_name = None + + +def get_embedding_model(vault: Path): + """Load the embedding model based on plugin settings or default. Cached after first load.""" + global _cached_model, _cached_model_name + settings = _read_plugin_settings(vault) + mode = settings.get("vector_db_mode", "local") + + if mode == "api": + return None + + model_name = settings.get("vector_db_model", "BAAI/bge-small-en-v1.5") + + if _cached_model is not None and _cached_model_name == model_name: + return _cached_model + + ST = _get_st() + logger.info("Loading embedding model: %s", model_name) + + hf_endpoint = settings.get("vector_db_hf_endpoint", "") or os.environ.get("HF_ENDPOINT", "") + + if hf_endpoint: + local_path = _download_model_via_mirror(model_name, hf_endpoint) + if local_path and (local_path / "modules.json").exists(): + logger.info("Loading from local mirror copy: %s", local_path) + _cached_model = ST(str(local_path)) + _cached_model_name = model_name + return _cached_model + + _cached_model = ST(model_name) + _cached_model_name = model_name + return _cached_model + + +def _download_model_via_mirror(model_name: str, mirror: str) -> Path | None: + """Download model files from a mirror URL to a local cache directory. 
+    Bypasses huggingface_hub entirely by using urllib directly."""
+    try:
+        import urllib.request
+    except Exception:
+        return None
+
+    mirror = mirror.rstrip("/")
+    base_url = f"{mirror}/{model_name}/resolve/main"
+    local_dir = Path.home() / ".cache" / "paperforge" / "models" / model_name.replace("/", "--")
+
+    files = [
+        "config.json", "modules.json", "config_sentence_transformers.json",
+        "sentence_bert_config.json", "special_tokens_map.json",
+        "tokenizer.json", "tokenizer_config.json", "vocab.txt",
+        "model.safetensors", "pytorch_model.bin",
+        "1_Pooling/config.json",
+    ]
+
+    local_dir.mkdir(parents=True, exist_ok=True)
+
+    # Build headers from HF_TOKEN
+    hf_token = os.environ.get("HF_TOKEN", "")
+    headers = {}
+    if hf_token:
+        headers["Authorization"] = f"Bearer {hf_token}"
+
+    for f in files:
+        dest = local_dir / f
+        if dest.exists() and dest.stat().st_size > 0:
+            continue
+        dest.parent.mkdir(parents=True, exist_ok=True)
+        url = f"{base_url}/{f}"
+        try:
+            req = urllib.request.Request(url, headers=headers)
+            with urllib.request.urlopen(req, timeout=600) as resp:
+                with open(dest, "wb") as out:
+                    while True:
+                        chunk = resp.read(8192)
+                        if not chunk:
+                            break
+                        out.write(chunk)
+        except Exception:
+            pass
+
+    # Return path only if core files exist
+    has_weights = (local_dir / "model.safetensors").exists() or (local_dir / "pytorch_model.bin").exists()
+    has_config = (local_dir / "modules.json").exists() and (local_dir / "config.json").exists()
+    return local_dir if has_weights and has_config else None
+
+
+def embed_paper(vault: Path, zotero_key: str, chunks: list[dict]) -> int:
+    """Embed chunks for one paper and insert into ChromaDB. Returns count."""
+    collection = get_collection(vault)
+    model = get_embedding_model(vault)
+
+    if model is None:
+        # API mode
+        return _embed_paper_api(vault, zotero_key, chunks, collection)
+
+    # Local mode
+    texts = [c["text"] for c in chunks]
+    ids = [f"{zotero_key}_{c['chunk_index']}" for c in chunks]
+    metadatas = [
+        {
+            "paper_id": zotero_key,
+            "section": c["section"],
+            "page_number": c["page_number"],
+            "chunk_index": c["chunk_index"],
+            "token_estimate": c["token_estimate"],
+        }
+        for c in chunks
+    ]
+
+    embeddings = model.encode(texts, show_progress_bar=False).tolist()
+
+    collection.add(
+        ids=ids,
+        embeddings=embeddings,
+        documents=texts,
+        metadatas=metadatas,
+    )
+    return len(chunks)
+
+
+def _embed_paper_api(vault, zotero_key, chunks, collection) -> int:
+    """Embed using OpenAI API."""
+    settings = _read_plugin_settings(vault)
+    api_key = settings.get("vector_db_api_key", "")
+    if not api_key:
+        env_file = vault / ".env"
+        if env_file.exists():
+            for line in env_file.read_text(encoding="utf-8").splitlines():
+                if line.startswith("OPENAI_API_KEY="):
+                    api_key = line.split("=", 1)[1].strip().strip('"').strip("'")
+    if not api_key:
+        raise ValueError("No API key configured for vector DB")
+
+    texts = [c["text"] for c in chunks]
+    ids = [f"{zotero_key}_{c['chunk_index']}" for c in chunks]
+    metadatas = [
+        {"paper_id": zotero_key, "section": c["section"],
+         "page_number": c["page_number"], "chunk_index": c["chunk_index"],
+         "token_estimate": c["token_estimate"]}
+        for c in chunks
+    ]
+
+    from openai import OpenAI
+    api_model = os.environ.get("VECTOR_DB_API_MODEL", "") or settings.get("vector_db_api_model", "text-embedding-3-small")
+    api_base = os.environ.get("VECTOR_DB_API_BASE", "") or settings.get("vector_db_api_base", None) or None
+    api_key = os.environ.get("VECTOR_DB_API_KEY", "") or api_key
+    logger.info("API
mode: base_url=%s, model=%s", api_base or "(default OpenAI)", api_model) + client = OpenAI(api_key=api_key, base_url=api_base) + response = client.embeddings.create(model=api_model, input=texts) + embeddings = [e.embedding for e in response.data] + + collection.add(ids=ids, embeddings=embeddings, documents=texts, metadatas=metadatas) + return len(chunks) + + +def delete_paper_vectors(vault: Path, zotero_key: str) -> int: + """Delete all chunks for a paper from ChromaDB.""" + collection = get_collection(vault) + try: + results = collection.get(where={"paper_id": zotero_key}) + ids = results.get("ids", []) + if ids: + collection.delete(ids=ids) + return len(ids) + except Exception: + return 0 + + +def retrieve_chunks(vault: Path, query: str, limit: int = 5, expand: bool = True) -> list[dict]: + """Search for chunks matching the query. Returns list with adjacent context.""" + collection = get_collection(vault) + model = get_embedding_model(vault) + + if model is None: + # API mode + settings = _read_plugin_settings(vault) + api_key = settings.get("vector_db_api_key", "") + env_file = vault / ".env" + if not api_key and env_file.exists(): + for line in env_file.read_text(encoding="utf-8").splitlines(): + if line.startswith("OPENAI_API_KEY="): + api_key = line.split("=", 1)[1].strip().strip('"').strip("'") + if not api_key: + raise ValueError("No API key configured for vector DB") + from openai import OpenAI + api_base = os.environ.get("VECTOR_DB_API_BASE", "") or settings.get("vector_db_api_base", None) or None + api_key = os.environ.get("VECTOR_DB_API_KEY", "") or api_key + client = OpenAI(api_key=api_key, base_url=api_base) + api_model = os.environ.get("VECTOR_DB_API_MODEL", "") or settings.get("vector_db_api_model", "text-embedding-3-small") + response = client.embeddings.create(model=api_model, input=query) + query_embedding = response.data[0].embedding + else: + query_embedding = model.encode(query).tolist() + + results = collection.query( + query_embeddings=[query_embedding], + n_results=limit * 3 if expand else limit, + include=["documents", "metadatas", "distances"], + ) + + chunks = [] + for i, (doc, meta, dist) in enumerate(zip( + results["documents"][0], + results["metadatas"][0], + results["distances"][0], + )): + chunks.append({ + "paper_id": meta["paper_id"], + "section": meta.get("section", "Text"), + "page_number": meta.get("page_number", 1), + "chunk_index": meta.get("chunk_index", 0), + "chunk_text": doc, + "score": round(1.0 - dist, 4), # cosine distance → similarity + }) + + return chunks + + +def get_embed_status(vault: Path) -> dict: + """Get vector DB status.""" + db_path = get_vector_db_path(vault) + exists = db_path.exists() + chunk_count = 0 + if exists: + try: + collection = get_collection(vault) + chunk_count = collection.count() + except Exception: + pass + + settings = _read_plugin_settings(vault) + mode = settings.get("vector_db_mode", "local") + model = settings.get("vector_db_api_model", "text-embedding-3-small") if mode == "api" else settings.get("vector_db_model", "BAAI/bge-small-en-v1.5") + return { + "db_exists": exists, + "chunk_count": chunk_count, + "model": model, + "mode": mode, + } diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 4695a57..c36fc4c 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -76,9 +76,9 @@ function checkRuntimeVersion(pythonExe, pluginVersion, cwd, timeout, _execFile) } else { resolve({ status: "mismatch", pyVersion: pyVer, pluginVersion, error: null }); } - }); - }); -} + }); + }); 
+ } function classifyError(errorCode) { const code = String(errorCode); @@ -198,9 +198,8 @@ function runSubprocess(pythonExe, args, cwd, timeout, _spawn, env) { exitCode: -1, elapsed: Date.now() - startTime }); }); }); -} - +} // ── Cross-platform Python and BBT detection (macOS/Linux) ── @@ -489,6 +488,70 @@ Object.assign(LANG.en, { ocr_privacy_title: 'OCR Privacy Notice', ocr_privacy_warning: 'OCR will upload PDFs to the PaddleOCR API. Do not upload sensitive or confidential documents.', ocr_understand: 'I understand, continue', + + /* ── Tabbed Settings ── */ + tab_setup: 'Installation', + tab_features: 'Features', + /* ── Features tab descriptions ── */ + feat_skills_desc: 'Manage and enable/disable agent skills installed in your vault. Each row corresponds to a SKILL.md file — toggle off to prevent the agent from auto-invoking that skill.', + feat_skills_system: 'System Skills ship with PaperForge and are updated alongside PaperForge.', + feat_skills_user: 'User Skills are custom skills you install from community or create yourself.', + feat_memory_desc: 'The Memory Layer is the core data engine of PaperForge, powered by SQLite. It integrates all literature metadata (papers, assets, aliases, reading events), provides FTS5 full-text search across titles/abstracts/authors/collections, and enables the agent-context and paper-status commands. Always active — no toggle needed.', + feat_vector_desc: 'Vector Database enables semantic search across OCR-extracted fulltext using embedding models. Documents are split into chunks, embedded into vector space, and stored in ChromaDB. Supports local models (free, CPU) or OpenAI API (paid, faster).', + feat_vector_config_label: 'Advanced Configuration', + feat_agent_platform: 'Agent Platform', + feat_agent_platform_desc: 'Select which agent platform to manage skills for.', + feat_vector_enable: 'Enable Vector Retrieval', + feat_vector_enable_desc: 'Semantic search across OCR fulltext. Requires: pip install chromadb sentence-transformers openai (~500MB).', + feat_hf_mirror: 'HF Mirror / Endpoint', + feat_hf_mirror_desc: 'Model download source. Try official if mirror fails. Custom: type any URL.', + feat_custom_endpoint: 'Custom Endpoint', + feat_custom_endpoint_desc: 'Enter a custom HuggingFace mirror URL.', + feat_hf_token: 'HF Token', + feat_hf_token_desc: 'HuggingFace access token (optional, helps with rate limits and gated models).', + feat_model: 'Model', + feat_embed_mode: 'Embedding Mode', + feat_embed_mode_local: 'Local (free, CPU)', + feat_embed_mode_api: 'API (OpenAI, paid)', + feat_openai_key: 'OpenAI API Key', + feat_openai_key_desc: 'Used for text-embedding-3-small (1536d).', + feat_verify: 'Verify', + feat_checking: 'Checking...', + feat_rebuild_vectors: 'Rebuild Vectors', + feat_rebuild_vectors_desc: 'Rebuild all OCR fulltext vectors. Required after model or mode change.', + feat_rebuild_vectors_changed: 'Model changed — rebuild to update all vectors.', + feat_install_deps: 'Install Dependencies', + feat_install_deps_desc: 'pip install chromadb sentence-transformers openai (~500MB).', + feat_model_bge_small: 'Best balance — fast, accurate, recommended for most users (384d, 130MB)', + feat_model_minilm: 'Lightest & fastest — lower accuracy, minimal disk (384d, 80MB)', + feat_model_bge_base: 'Highest accuracy — slower, large disk footprint (768d, 440MB)', + feat_api_base_url: 'API Base URL', + feat_api_base_url_desc: 'Custom OpenAI-compatible API endpoint. 
Leave empty for default.', + feat_api_model: 'API Model', + feat_api_model_desc: 'Embedding model name for this endpoint.', + feat_deps_missing: 'Dependencies not installed. Required: chromadb, sentence-transformers, openai.', + feat_deps_checking: 'Checking dependencies...', + feat_no_python: 'No Python found. Check Installation tab.', + feat_rebuild_btn: 'Rebuild', + feat_build_btn: 'Build', + feat_building: 'Building...', + feat_installing: 'Installing...', + feat_install_btn: 'Install', + feat_retry_btn: 'Retry', + feat_removing: 'Removing...', + feat_not_cached: 'Not cached', + feat_uninstall_btn: 'Uninstall', + feat_verify_btn: 'Verify', + feat_checking_btn: 'Checking...', + feat_valid_key: 'API key valid.', + feat_key_rejected: 'API key rejected.', + feat_enter_key: 'Enter a valid OpenAI API key.', + feat_network_error: 'Network error: ', + feat_build_complete: 'Vector build complete.', + feat_build_failed: 'Build failed. See terminal output.', + feat_output_copied: 'Output copied to clipboard.', + feat_install_done: 'Dependencies installed. Building vectors...', + feat_install_failed: 'Install failed: ', }); /* ── LANG.zh: v1.12 runtime health, OCR queue, pf-deep, dashboard translations ── */ @@ -524,6 +587,70 @@ Object.assign(LANG.zh, { install_validating: '正在校验安装环境…', install_bootstrapping: '未检测到 PaperForge Python 包,正在自动安装…', wizard_safety: '安全说明:如果你选择的目录里已经有文件,安装向导会保留已有内容,只补充缺失的 PaperForge 文件和目录。', + + /* ── Tabbed Settings ── */ + tab_setup: '安装', + tab_features: '功能', + /* ── 功能介绍的描述文本 ── */ + feat_skills_desc: '管理 Vault 中已安装的 Agent 技能。每行对应一个 SKILL.md 文件,关闭开关可阻止 Agent 自动调用该技能。', + feat_skills_system: '系统技能随 PaperForge 一同发布,会跟随 PaperForge 版本更新。', + feat_skills_user: '用户技能是你自行安装或创建的自定义技能。', + feat_memory_desc: '记忆层是 PaperForge 的核心数据引擎,基于 SQLite 构建。它整合了所有文献元数据(论文、资源文件、别名、阅读事件),支持 FTS5 全文检索(可搜索标题、摘要、作者、分类),并为 agent-context 和 paper-status 命令提供数据支撑。始终运行,无需手动开启。', + feat_vector_desc: '向量数据库通过嵌入模型实现 OCR 全文的语义搜索。文档被切分为文本块(chunk),编码为向量存入 ChromaDB。支持本地模型(免费,CPU 运行)或 OpenAI API(付费,更快速)。', + feat_vector_config_label: '高级配置', + feat_agent_platform: 'Agent 平台', + feat_agent_platform_desc: '选择要管理的 Agent 平台。', + feat_vector_enable: '启用向量检索', + feat_vector_enable_desc: '对 OCR 全文进行语义搜索。需安装: pip install chromadb sentence-transformers openai (~500MB)。', + feat_hf_mirror: 'HF 镜像站 / 端点', + feat_hf_mirror_desc: '模型下载源。镜像不可用时尝试官方源。自定义:输入任意 URL。', + feat_custom_endpoint: '自定义端点', + feat_custom_endpoint_desc: '输入自定义 HuggingFace 镜像 URL。', + feat_hf_token: 'HF Token', + feat_hf_token_desc: 'HuggingFace 访问令牌(可选,有助于解除限速和下载受限模型)。', + feat_model: '模型', + feat_embed_mode: '嵌入模式', + feat_embed_mode_local: '本地(免费,CPU)', + feat_embed_mode_api: 'API(OpenAI,付费)', + feat_openai_key: 'OpenAI API Key', + feat_openai_key_desc: '用于 text-embedding-3-small(1536 维)。', + feat_verify: '验证', + feat_checking: '检测中…', + feat_rebuild_vectors: '重建向量', + feat_rebuild_vectors_desc: '重建所有 OCR 全文向量。更换模型或模式后需要重建。', + feat_rebuild_vectors_changed: '模型已更换 — 需要重建向量。', + feat_install_deps: '安装依赖', + feat_install_deps_desc: 'pip install chromadb sentence-transformers openai (~500MB)。', + feat_model_bge_small: '最佳平衡 — 快速、准确,推荐大多数用户使用 (384d, 130MB)', + feat_model_minilm: '最轻最快 — 精度略低,磁盘占用最小 (384d, 80MB)', + feat_model_bge_base: '最高精度 — 较慢,磁盘占用大 (768d, 440MB)', + feat_api_base_url: 'API 地址', + feat_api_base_url_desc: '自定义 OpenAI 兼容 API 端点。留空使用默认地址。', + feat_api_model: 'API 模型', + feat_api_model_desc: '该端点使用的嵌入模型名称。', + feat_deps_missing: '依赖未安装。需要:chromadb, sentence-transformers, openai。', + feat_deps_checking: '正在检测依赖…', + feat_no_python: 
'未找到 Python。请查看安装标签页。', + feat_rebuild_btn: '重建', + feat_build_btn: '构建', + feat_building: '构建中…', + feat_installing: '安装中…', + feat_install_btn: '安装', + feat_retry_btn: '重试', + feat_removing: '删除中…', + feat_not_cached: '未缓存', + feat_uninstall_btn: '卸载', + feat_verify_btn: '验证', + feat_checking_btn: '检测中…', + feat_valid_key: 'API Key 有效。', + feat_key_rejected: 'API Key 被拒绝。', + feat_enter_key: '请输入有效的 OpenAI API Key。', + feat_network_error: '网络错误:', + feat_build_complete: '向量构建完成。', + feat_build_failed: '构建失败。请查看终端输出。', + feat_output_copied: '输出已复制到剪贴板。', + feat_install_done: '依赖已安装。正在构建向量…', + feat_install_failed: '安装失败:', }); function langFromApp(app) { @@ -553,6 +680,21 @@ const DEFAULT_SETTINGS = { paddleocr_api_key: '', zotero_data_dir: '', python_path: '', + // Feature toggles + features: { + memory_layer: true, + vector_db: false, + }, + selected_skill_platform: 'opencode', + vector_db_mode: 'local', + vector_db_model: 'BAAI/bge-small-en-v1.5', + vector_db_api_key: '', + vector_db_api_base: '', + vector_db_api_model: 'text-embedding-3-small', + vector_db_hf_endpoint: 'https://hf-mirror.com', + vector_db_hf_token: '', + vector_db_last_model: '', + frozen_skills: {}, }; // ACTIONS, resolvePythonExecutable extracted to src/ modules (Plan 53-001) @@ -1169,7 +1311,7 @@ class PaperForgeStatusView extends ItemView { const filePath = file.path; if (ext === 'base') { - return { mode: 'collection', filePath, key: null, domain: file.basename }; + return { mode: 'collection', filePath, key: null, domain: file.basename.trim() }; } if (ext === 'md') { @@ -1376,9 +1518,19 @@ class PaperForgeStatusView extends ItemView { hubBtn.createEl('span', { text: 'Open Literature Hub' }); hubBtn.addEventListener('click', () => { const baseDir = plugin?.settings?.base_dir || 'Bases'; - const baseFile = this.app.vault.getAbstractFileByPath(baseDir); - if (baseFile) { - this.app.workspace.revealLeaf().setViewState({ type: 'file-explorer', active: true }); + const baseFolder = this.app.vault.getAbstractFileByPath(baseDir); + if (baseFolder) { + // Find first .base file in the base directory + let baseFile = null; + if (baseFolder.children) { + baseFile = baseFolder.children.find(f => f.extension === 'base'); + } + if (baseFile) { + const leaf = this.app.workspace.getLeaf(false); + if (leaf) leaf.openFile(baseFile); + } else { + new Notice('[!!] No .base file found in ' + baseDir, 6000); + } } else { new Notice('[!!] Base directory not found: ' + baseDir, 6000); } @@ -1810,13 +1962,14 @@ class PaperForgeStatusView extends ItemView { const domain = this._currentDomain || 'Unknown'; const domainItems = this._filterByDomain(domain); - const view = this._contentEl.createEl('div', { cls: 'paperforge-collection-view' }); - if (domainItems.length === 0) { - this._renderEmptyState(view, 'No papers found in domain "' + domain + '". Sync some papers first.'); + // Fall back to global mode if no papers match this domain (e.g. 
"Literature Hub") + this._renderGlobalMode(); return; } + const view = this._contentEl.createEl('div', { cls: 'paperforge-collection-view' }); + // ── Single-pass aggregation ── const totalPapers = domainItems.length; let hasPdf = 0, ocrDone = 0, analyzeReady = 0, deepRead = 0; @@ -2196,6 +2349,12 @@ class PaperForgeSettingTab extends PluginSettingTab { this.plugin = plugin; this._saveTimeout = null; this._pfConfig = null; // cached paperforge.json config + this._lastSyncTime = null; + this._memoryStatusText = null; // null = not checked yet, string = cached result + this._vectorDepsOk = null; // null = not checked, bool = cached + this._embedStatusText = null; + this._skillsCollapsed = { user: true }; // User skills collapsed by default + this.activeTab = 'setup'; } /** Reload path config from paperforge.json */ @@ -2208,6 +2367,61 @@ class PaperForgeSettingTab extends PluginSettingTab { containerEl.empty(); this._refreshPfConfig(); + // Inject tab CSS once + if (!document.getElementById('paperforge-tab-styles')) { + const style = document.createElement('style'); + style.id = 'paperforge-tab-styles'; + style.textContent = ` + .paperforge-settings-tabs { display: flex; gap: 4px; margin-bottom: 16px; border-bottom: 1px solid var(--background-modifier-border); } + .paperforge-settings-tab { padding: 6px 16px; border: none; background: none; cursor: pointer; border-bottom: 2px solid transparent; font-size: 14px; color: var(--text-muted); } + .paperforge-settings-tab--active { color: var(--text-accent); border-bottom-color: var(--text-accent); } + .paperforge-tab-content { display: none; } + .paperforge-tab-content--active { display: block; } + .paperforge-skills-collapse-header { display: flex !important; align-items: center; cursor: pointer; padding: 6px 0 !important; margin: 0 !important; } + .paperforge-skills-collapse-header h4 { margin: 0 !important; } + .paperforge-skills-collapse-content { margin: 0 !important; padding: 0 !important; } + .paperforge-skills-group { margin-bottom: 10px; } + .paperforge-skills-group:last-child { margin-bottom: 0; } + .vertical-tab-content-container { overflow-y: scroll !important; } + `; + document.head.appendChild(style); + } + + // --- Tab bar --- + const tabBar = containerEl.createDiv({ cls: 'paperforge-settings-tabs' }); + const tabs = [ + { id: 'setup', label: t('tab_setup') || 'Installation' }, + { id: 'features', label: t('tab_features') || 'Features' }, + ]; + const tabContents = {}; + + tabs.forEach(tab => { + const btn = tabBar.createEl('button', { + cls: 'paperforge-settings-tab' + (tab.id === this.activeTab ? ' paperforge-settings-tab--active' : ''), + text: tab.label, + }); + btn.addEventListener('click', () => { + this.activeTab = tab.id; + this.display(); // re-render with new active tab + }); + }); + + // --- Tab content containers --- + tabs.forEach(tab => { + tabContents[tab.id] = containerEl.createDiv({ + cls: 'paperforge-tab-content' + (tab.id === this.activeTab ? 
' paperforge-tab-content--active' : ''), + }); + }); + + // --- Render active tab --- + if (this.activeTab === 'setup') { + this._renderSetupTab(tabContents.setup); + } else { + this._renderFeaturesTab(tabContents.features); + } + } + + _renderSetupTab(containerEl) { const vaultPath = this.app.vault.adapter.basePath; if (!this.plugin.settings.vault_path) { this.plugin.settings.vault_path = vaultPath; @@ -2413,6 +2627,719 @@ class PaperForgeSettingTab extends PluginSettingTab { } } + _execMemoryStatus(pythonPath, vp, callback) { + const { exec } = require('child_process'); + exec(`"${pythonPath}" -m paperforge --vault "${vp}" memory status --json`, { encoding: 'utf-8', timeout: 15000 }, (err, stdout) => { + if (err) { callback('Status unavailable'); return; } + try { + const data = JSON.parse(stdout); + if (data.ok) { + const s = data.data; + const freshness = s.fresh ? 'fresh' : 'stale'; + callback(`Papers: ${s.paper_count_db} | ${freshness}${s.needs_rebuild ? ' - needs rebuild' : ''}`); + } else { + callback('DB not found. Run paperforge memory build.'); + } + } catch(e) { callback('Could not parse status.'); } + }); + } + + _execEmbedStatus(pythonPath, vp, callback) { + const { exec } = require('child_process'); + exec(`"${pythonPath}" -m paperforge --vault "${vp}" embed status --json`, { encoding: 'utf-8', timeout: 15000 }, (err, stdout) => { + if (err) { callback('Status unavailable'); return; } + try { + const data = JSON.parse(stdout); + if (data.ok) { + callback(`Chunks: ${data.data.chunk_count} | ${data.data.model} | ${data.data.mode}`); + } else { + callback('Could not parse status.'); + } + } catch(e) { callback('Could not parse status.'); } + }); + } + + _renderMemoryStatusText(el, text, extraInfo) { + el.innerHTML = ''; + el.createEl('span', { text: text, cls: 'paperforge-memory-text' }).style.cssText = 'flex:1;'; + + if (extraInfo === 'syncing') { + const syncEl = el.createEl('span', { text: 'Syncing...', cls: 'paperforge-sync-status' }); + syncEl.style.cssText = 'opacity:0.7; margin-right:8px;'; + } else if (extraInfo) { + const timeEl = el.createEl('span', { text: extraInfo, cls: 'paperforge-sync-status' }); + timeEl.style.cssText = 'opacity:0.7; margin-right:8px;'; + } + + const refreshBtn = el.createEl('button', { cls: 'paperforge-refresh-btn', text: '\u21BB' }); + refreshBtn.style.cssText = 'margin-left:auto; border:none; background:none; cursor:pointer; font-size:16px; padding:0 4px;'; + refreshBtn.title = 'Sync now'; + refreshBtn.onclick = () => { + this._memoryStatusText = null; + this._runManualSync(); + }; + } + + _getBuildCommand(settings) { + const vp = this.app.vault.adapter.basePath; + const pyResult = resolvePythonExecutable(vp, settings); + if (!pyResult.path) return null; + return `"${pyResult.path}" -m paperforge --vault "${vp}" sync`; + } + + _runManualSync() { + const vp = this.app.vault.adapter.basePath; + const pyResult = resolvePythonExecutable(vp, this.plugin.settings); + if (!pyResult.path) return; + + const statusRow = document.querySelector('.paperforge-memory-status'); + if (statusRow) { + this._renderMemoryStatusText(statusRow, 'Checking...', 'syncing'); + } + + this.plugin._autoSyncRunning = true; + const { exec } = require('child_process'); + exec(`"${pyResult.path}" -m paperforge --vault "${vp}" sync`, { timeout: 120000, encoding: 'utf-8' }, (err) => { + this.plugin._autoSyncRunning = false; + this._memoryStatusText = null; + if (!err) { + this._lastSyncTime = new Date().toLocaleTimeString(); + this.plugin._lastSyncTime = this._lastSyncTime; + } 
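+      // _memoryStatusText was reset above, so the re-render below
+      // kicks off a fresh `memory status` query.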
+ this.display(); // re-render + }); + } + + _renderFeaturesTab(containerEl) { + // --- Section: Skills --- + containerEl.createEl('h3', { text: 'Skills' }); + const skillsDescEl = containerEl.createEl('div', { cls: 'paperforge-desc-box' }); + skillsDescEl.style.cssText = 'padding:8px 12px; margin:0 0 12px; background:var(--background-secondary); border-radius:4px; font-size:12px; color:var(--text-muted); line-height:1.5;'; + skillsDescEl.setText(t('feat_skills_desc')); + skillsDescEl.createEl('br'); + skillsDescEl.createEl('span', { text: t('feat_skills_system'), cls: '' }).style.opacity = '0.7'; + + // Agent platform selector + const agentPlatforms = { + 'opencode': 'OpenCode', + 'claude': 'Claude Code', + 'codex': 'Codex', + 'cursor': 'Cursor', + 'windsurf': 'Windsurf', + 'github_copilot': 'GitHub Copilot', + }; + const agentDirs = { + 'opencode': '.opencode/skills', + 'claude': '.claude/skills', + 'codex': '.codex/skills', + 'cursor': '.cursor/skills', + 'windsurf': '.windsurf/skills', + 'github_copilot': '.github/skills', + }; + + const vaultPath = this.app.vault.adapter.basePath; + const fs = require('fs'); + const path = require('path'); + + let selectedPlatform = this.plugin.settings.selected_skill_platform || 'opencode'; + + new Setting(containerEl) + .setName(t('feat_agent_platform')) + .setDesc(t('feat_agent_platform_desc')) + .addDropdown(dropdown => { + Object.entries(agentPlatforms).forEach(([key, label]) => dropdown.addOption(key, label)); + dropdown.setValue(selectedPlatform) + .onChange(value => { + this.plugin.settings.selected_skill_platform = value; + this.plugin.saveSettings(); + this.display(); + }); + }) + .addExtraButton(btn => { + btn.setIcon('folder') + .setTooltip('Open skills folder') + .onClick(() => { + const dir = agentDirs[selectedPlatform] || '.opencode/skills'; + const fullPath = path.join(vaultPath, dir); + if (fs.existsSync(fullPath)) { + const { exec } = require('child_process'); + exec(`start "" "${fullPath}"`); + } else { + new Notice(`Skills folder not found: ${dir}`); + } + }); + }); + + // Show skills for selected platform + const skillDir = path.join(vaultPath, agentDirs[selectedPlatform]); + let systemSkills = []; + let userSkills = []; + + if (fs.existsSync(skillDir)) { + fs.readdirSync(skillDir, { withFileTypes: true }).forEach(entry => { + if (!entry.isDirectory()) return; + const skillFile = path.join(skillDir, entry.name, 'SKILL.md'); + if (!fs.existsSync(skillFile)) return; + const content = fs.readFileSync(skillFile, 'utf-8'); + const nameMatch = content.match(/^name:\s*(.+)$/m); + const lines = content.split('\n'); + const descIdx = lines.findIndex(l => /^description:/.test(l)); + let desc = ''; + if (descIdx >= 0) { + const first = lines[descIdx].match(/^description:\s*(.+)$/); + if (first && first[1] && first[1] !== '>' && first[1] !== '|-' && first[1] !== '|') { + desc = first[1].trim(); + } else { + for (let i = descIdx + 1; i < lines.length; i++) { + if (/^\s{2,}/.test(lines[i]) || lines[i].trim() === '') { + desc += lines[i].trim() + ' '; + } else break; + } + desc = desc.trim(); + } + } + const sourceMatch = content.match(/^source:\s*(.+)$/m); + const disableMatch = content.match(/^disable-model-invocation:\s*(.+)$/m); + const versionMatch = content.match(/^version:\s*(.+)$/m); + + const skill = { + name: nameMatch ? nameMatch[1].trim() : entry.name, + desc: desc, + source: sourceMatch ? sourceMatch[1].trim() : 'user', + disabled: disableMatch && disableMatch[1].trim() === 'true', + version: versionMatch ? 
versionMatch[1].trim() : '', + path: skillFile, + content: content, + dirName: entry.name, + }; + + if (skill.source === 'paperforge') { + systemSkills.push(skill); + } else { + userSkills.push(skill); + } + }); + } + + const skillsBox = containerEl.createEl('div'); + skillsBox.style.cssText = 'background:var(--background-secondary); border-radius:8px; padding:12px 12px 10px; margin:8px 0 16px;'; + + const renderCollapsibleSkills = (label, skills, isSystem) => { + if (skills.length === 0) return; + + // Group wrapper for spacing between groups + const group = skillsBox.createEl('div', { cls: 'paperforge-skills-group' }); + + // Header row with toggle arrow (created first so it appears above content) + const header = group.createEl('div', { cls: 'paperforge-skills-collapse-header' }); + + // Content wrapper + const content = group.createEl('div', { cls: 'paperforge-skills-collapse-content' }); + const arrow = header.createEl('span', { text: '\u25BC', cls: 'paperforge-skills-arrow' }); + arrow.style.cssText = 'display:inline-block; font-size:10px; margin-right:6px; transition:transform 0.2s; transform:rotate(0deg);'; + header.createEl('h4', { text: `${label} (${skills.length})`, cls: 'paperforge-skills-subheader' }); + + skills.forEach(s => { + const nameText = s.name + (s.version ? ' v' + s.version : ''); + const sourceLabel = isSystem ? ' [system]' : ' [user]'; + const descText = s.desc || ''; + + const setting = new Setting(content) + .setName(nameText + sourceLabel) + .setDesc(descText); + setting.settingEl.style.opacity = s.disabled ? '0.4' : '1'; + + setting.addToggle(toggle => { + toggle.setValue(!s.disabled) + .onChange(value => { + const newDisabled = !value; + const disableMatch = s.content.match(/^disable-model-invocation:\s*(.+)$/m); + const newContent = disableMatch + ? s.content.replace(/^disable-model-invocation:\s*.+$/m, `disable-model-invocation: ${newDisabled}`) + : s.content.replace(/^(---\r?\n)/, `$1disable-model-invocation: ${newDisabled}\n`); + fs.writeFileSync(s.path, newContent, 'utf-8'); + s.disabled = newDisabled; + s.content = newContent; + setting.settingEl.style.opacity = s.disabled ? '0.4' : '1'; + }); + }); + }); + + // Toggle with state preservation + const stateKey = isSystem ? 'system' : 'user'; + const collapsed = this._skillsCollapsed[stateKey] || false; + if (collapsed) { + content.style.display = 'none'; + arrow.style.transform = 'rotate(-90deg)'; + } + + header.addEventListener('click', () => { + const nowCollapsed = content.style.display !== 'none'; + if (nowCollapsed) { + content.style.display = 'none'; + arrow.style.transform = 'rotate(-90deg)'; + } else { + content.style.display = ''; + arrow.style.transform = 'rotate(0deg)'; + } + this._skillsCollapsed[stateKey] = content.style.display === 'none'; + }); + }; + + // System skills + renderCollapsibleSkills('System Skills', systemSkills, true); + + // User skills + renderCollapsibleSkills('User Skills', userSkills, false); + + if (systemSkills.length === 0 && userSkills.length === 0) { + skillsBox.createEl('p', { + text: `No skills found in ${agentDirs[selectedPlatform]}. 
Run setup to deploy skills.`, + cls: 'setting-item-description' + }); + } + + // --- Section: Memory Layer --- + containerEl.createEl('h3', { text: 'Memory Layer' }); + + const memoryDescEl = containerEl.createEl('div', { cls: 'paperforge-desc-box' }); + memoryDescEl.style.cssText = 'padding:8px 12px; margin:0 0 12px; background:var(--background-secondary); border-radius:4px; font-size:12px; color:var(--text-muted); line-height:1.5;'; + memoryDescEl.setText(t('feat_memory_desc')); + + // Always-on SQLite status display + const statusRow = containerEl.createEl('div', { cls: 'paperforge-memory-status' }); + statusRow.style.cssText = 'display:flex; align-items:center; padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; + + const vp = this.app.vault.adapter.basePath; + const pyResult = resolvePythonExecutable(vp, this.plugin.settings); + + if (this.plugin._lastSyncTime && !this._lastSyncTime) { + this._lastSyncTime = this.plugin._lastSyncTime; + } + + if (this._memoryStatusText !== null) { + this._renderMemoryStatusText(statusRow, this._memoryStatusText, this._lastSyncTime); + } else if (pyResult.path) { + this._renderMemoryStatusText(statusRow, 'Checking...', this._lastSyncTime); + this._execMemoryStatus(pyResult.path, vp, (text) => { + this._memoryStatusText = text; + this._renderMemoryStatusText(statusRow, text, this._lastSyncTime); + }); + } else { + this._renderMemoryStatusText(statusRow, 'No Python found.', this._lastSyncTime); + } + + this._renderVectorSection(containerEl); + } + + _renderVectorSection(containerEl) { + // --- Vector Database (within Memory Layer) --- + containerEl.createEl('h4', { text: 'Vector Database' }); + + const vecDescEl = containerEl.createEl('div', { cls: 'paperforge-desc-box' }); + vecDescEl.style.cssText = 'padding:8px 12px; margin:0 0 8px; background:var(--background-secondary); border-radius:4px; font-size:12px; color:var(--text-muted); line-height:1.5;'; + vecDescEl.setText(t('feat_vector_desc')); + + new Setting(containerEl) + .setName(t('feat_vector_enable')) + .setDesc(t('feat_vector_enable_desc')) + .addToggle(toggle => { + toggle.setValue(this.plugin.settings.features.vector_db) + .onChange(value => { + this.plugin.settings.features.vector_db = value; + this.plugin.saveSettings(); + this._vectorDepsOk = null; + this._embedStatusText = null; + this.display(); + }); + }); + + if (!this.plugin.settings.features.vector_db) return; + + const vp = this.app.vault.adapter.basePath; + + // Collapsible config section + const vecConfigHeader = containerEl.createEl('div', { cls: 'paperforge-skills-collapse-header' }); + vecConfigHeader.style.cssText = 'display:flex; align-items:center; cursor:pointer; padding:6px 0 2px; margin:0;'; + const vecArrow = vecConfigHeader.createEl('span', { text: '\u25BC' }); + vecArrow.style.cssText = 'display:inline-block; font-size:10px; margin-right:6px; transition:transform 0.2s;'; + vecConfigHeader.createEl('span', { text: t('feat_vector_config_label'), cls: '' }).style.cssText = 'font-size:12px; color:var(--text-muted);'; + const vecConfigContent = containerEl.createEl('div', { cls: 'paperforge-vector-config' }); + + let vecConfigCollapsed = false; + vecConfigHeader.addEventListener('click', () => { + vecConfigCollapsed = !vecConfigCollapsed; + vecConfigContent.style.display = vecConfigCollapsed ? 'none' : ''; + vecArrow.style.transform = vecConfigCollapsed ? 
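+        // Note: this collapse flag is local to the current render, unlike the
+        // skills groups, which persist their state in this._skillsCollapsed.
+        // Below, _vectorDepsOk acts as a tri-state flag: null = deps not probed
+        // yet, true = chromadb / sentence_transformers / openai all import,
+        // false = at least one is missing.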
'rotate(-90deg)' : 'rotate(0deg)'; + }); + + // === Resolve state === + if (this._vectorDepsOk === true && this._embedStatusText !== null) { + this._renderVectorReady(vecConfigContent, vp); + return; + } + if (this._vectorDepsOk === false) { + this._renderVectorNoDeps(vecConfigContent); + return; + } + // First check — deps unknown, run async + if (this._vectorDepsOk === null) { + const statusBox = vecConfigContent.createEl('div'); + statusBox.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; + statusBox.setText(t('feat_deps_checking')); + + const pyResult = resolvePythonExecutable(vp, this.plugin.settings); + if (!pyResult.path) { + statusBox.setText(t('feat_no_python')); + this._vectorDepsOk = false; + return; + } + const { exec } = require('child_process'); + exec(`"${pyResult.path}" -c "import chromadb; import sentence_transformers; import openai; print('ok')"`, { + encoding: 'utf-8', timeout: 15000 + }, (err, stdout) => { + const ok = !err && (stdout || '').trim() === 'ok'; + this._vectorDepsOk = ok; + if (ok) { + // Deps OK — now check embed status + this._execEmbedStatus(pyResult.path, vp, (statusText) => { + this._embedStatusText = statusText; + this.display(); + }); + } else { + this.display(); + } + }); + } + } + + _renderHfMirror(containerEl) { + const setting = new Setting(containerEl) + .setName(t('feat_hf_mirror')) + .setDesc(t('feat_hf_mirror_desc')) + .addDropdown(dropdown => { + dropdown.addOption('https://hf-mirror.com', 'hf-mirror.com (recommended)'); + dropdown.addOption('https://huggingface.co', 'huggingface.co (official)'); + dropdown.addOption('__custom__', 'Custom...'); + const current = this.plugin.settings.vector_db_hf_endpoint || 'https://hf-mirror.com'; + const isPreset = ['https://hf-mirror.com', 'https://huggingface.co'].includes(current); + dropdown.setValue(isPreset ? current : '__custom__') + .onChange(value => { + if (value !== '__custom__') { + this.plugin.settings.vector_db_hf_endpoint = value; + this.plugin.saveSettings(); + if (customInput) { customInput.settingEl.style.display = 'none'; if (this._hfCustomText) this._hfCustomText.setValue(''); } + } else { + if (customInput) customInput.settingEl.style.display = ''; + } + }); + }); + const customInput = new Setting(containerEl) + .setName(t('feat_custom_endpoint')) + .setDesc(t('feat_custom_endpoint_desc')) + .addText(text => { + this._hfCustomText = text; + const current = this.plugin.settings.vector_db_hf_endpoint || ''; + const isPreset = ['https://hf-mirror.com', 'https://huggingface.co'].includes(current); + text.setPlaceholder('https://your-mirror.com') + .setValue(isPreset ? 
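+                    // Preset endpoints leave the custom field blank; any other
+                    // saved value is echoed back for editing. The predicate used
+                    // in both places, written as a hypothetical helper (not
+                    // defined anywhere in this plugin):
+                    //   const HF_PRESETS = ['https://hf-mirror.com', 'https://huggingface.co'];
+                    //   const isPresetEndpoint = (url) => HF_PRESETS.includes(url);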
'' : current) + .onChange(value => { + this.plugin.settings.vector_db_hf_endpoint = value; + this.plugin.saveSettings(); + }); + }); + const current = this.plugin.settings.vector_db_hf_endpoint || 'https://hf-mirror.com'; + const isPreset = ['https://hf-mirror.com', 'https://huggingface.co'].includes(current); + if (isPreset) customInput.settingEl.style.display = 'none'; + + new Setting(containerEl) + .setName(t('feat_hf_token')) + .setDesc(t('feat_hf_token_desc')) + .addText(text => { + text.setPlaceholder('hf_...') + .setValue(this.plugin.settings.vector_db_hf_token || '') + .onChange(value => { + this.plugin.settings.vector_db_hf_token = value; + this.plugin.saveSettings(); + }); + }); + } + + _renderApiConfig(containerEl) { + if (this.plugin.settings.vector_db_mode !== 'api') return; + + new Setting(containerEl) + .setName(t('feat_openai_key')) + .setDesc(t('feat_openai_key_desc')) + .addText(text => { + text.setPlaceholder('sk-...') + .setValue(this.plugin.settings.vector_db_api_key || '') + .onChange(value => { + this.plugin.settings.vector_db_api_key = value; + this.plugin.saveSettings(); + }); + }); + new Setting(containerEl) + .setName(t('feat_api_base_url')) + .setDesc(t('feat_api_base_url_desc')) + .addText(text => { + text.setPlaceholder('https://api.openai.com/v1') + .setValue(this.plugin.settings.vector_db_api_base || '') + .onChange(value => { + this.plugin.settings.vector_db_api_base = value; + this.plugin.saveSettings(); + }); + }); + new Setting(containerEl) + .setName(t('feat_api_model')) + .setDesc(t('feat_api_model_desc')) + .addText(text => { + text.setPlaceholder('text-embedding-3-small') + .setValue(this.plugin.settings.vector_db_api_model || 'text-embedding-3-small') + .onChange(value => { + this.plugin.settings.vector_db_api_model = value; + this.plugin.saveSettings(); + }); + }); + } + + _renderVectorNoDeps(containerEl) { + const box = containerEl.createEl('div'); + box.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; + box.setText(t('feat_deps_missing')); + + new Setting(containerEl) + .setName(t('feat_install_deps')) + .setDesc(t('feat_install_deps_desc')) + .addButton(button => { + button.setButtonText(t('feat_install_btn')) + .setCta() + .onClick(async () => { + const vp = this.app.vault.adapter.basePath; + const pyResult = resolvePythonExecutable(vp, this.plugin.settings); + if (!pyResult.path) { new Notice('No Python found.'); return; } + button.setButtonText(t('feat_installing')); + button.setDisabled(true); + const notice = new Notice('Installing chromadb + sentence-transformers + openai...', 0); + try { + const { exec } = require('child_process'); + const env = Object.assign({}, process.env, { PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1', HF_ENDPOINT: this.plugin.settings.vector_db_hf_endpoint || 'https://hf-mirror.com', HF_TOKEN: this.plugin.settings.vector_db_hf_token || '' }); + await new Promise((resolve, reject) => { + exec(`"${pyResult.path}" -m pip install chromadb sentence-transformers openai`, { + encoding: 'utf-8', timeout: 300000, env: env, + }, (error) => { error ? reject(error) : resolve(); }); + }); + notice.hide(); + new Notice('Dependencies installed. 
Building vectors...'); + // Auto-build after install + this._vectorDepsOk = true; + this._execEmbedStatus(pyResult.path, vp, (text) => { + this._embedStatusText = text; + }); + this.display(); + } catch (e) { + notice.hide(); + new Notice('Install failed: ' + (e.stderr || e.message || e)); + button.setButtonText(t('feat_retry_btn')); + button.setDisabled(false); + } + }); + }); + } + + _renderVectorReady(containerEl, vp) { + // Status line + const statusEl = containerEl.createEl('div'); + statusEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; + statusEl.setText(this._embedStatusText || 'Loading...'); + + // Detect model mismatch + const embedInfo = this._embedStatusText ? this._parseEmbedStatus(this._embedStatusText) : null; + const currentModel = this._getCurrentModelKey(); + const lastModel = this.plugin.settings.vector_db_last_model || ''; + const modelChanged = embedInfo && embedInfo.db_exists && lastModel && lastModel !== currentModel; + + if (modelChanged) { + const warnEl = containerEl.createEl('div'); + warnEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-modifier-warning); border-radius:4px;'; + warnEl.setText(`Model changed (${lastModel} -> ${currentModel}). Existing vectors are incompatible — rebuild required.`); + } + + // Mode selector + new Setting(containerEl) + .setName(t('feat_embed_mode')) + .addDropdown(dropdown => { + dropdown.addOption('local', t('feat_embed_mode_local')); + dropdown.addOption('api', t('feat_embed_mode_api')); + dropdown.setValue(this.plugin.settings.vector_db_mode) + .onChange(value => { + this.plugin.settings.vector_db_mode = value; + this.plugin.saveSettings(); + this.display(); + }); + }); + + // Model selector (local mode) + if (this.plugin.settings.vector_db_mode === 'local') { + // HF settings only relevant for local model downloads + this._renderHfMirror(containerEl); + const modelDesc = { + 'BAAI/bge-small-en-v1.5': t('feat_model_bge_small'), + 'sentence-transformers/all-MiniLM-L6-v2': t('feat_model_minilm'), + 'BAAI/bge-base-en-v1.5': t('feat_model_bge_base'), + }; + new Setting(containerEl) + .setName(t('feat_model')) + .setDesc(modelDesc[this.plugin.settings.vector_db_model] || '') + .addDropdown(dropdown => { + dropdown.addOption('BAAI/bge-small-en-v1.5', 'bge-small (384d, 130MB)'); + dropdown.addOption('sentence-transformers/all-MiniLM-L6-v2', 'MiniLM (384d, 80MB)'); + dropdown.addOption('BAAI/bge-base-en-v1.5', 'bge-base (768d, 440MB)'); + dropdown.setValue(this.plugin.settings.vector_db_model) + .onChange(value => { + this.plugin.settings.vector_db_model = value; + this.plugin.saveSettings(); + this.display(); + }); + }) + .addButton(button => { + const model = this.plugin.settings.vector_db_model; + const cacheName = 'models--' + model.replace('/', '--'); + const fs = require('fs'); + const os = require('os'); + const path = require('path'); + const cachePath = path.join(os.homedir(), '.cache', 'huggingface', 'hub', cacheName); + + // Check integrity: directory exists AND has snapshots with files + let isCached = false; + if (fs.existsSync(cachePath)) { + const snapDir = path.join(cachePath, 'snapshots'); + if (fs.existsSync(snapDir)) { + try { + const entries = fs.readdirSync(snapDir); + isCached = entries.some(e => { + const p = path.join(snapDir, e); + return fs.statSync(p).isDirectory() && fs.readdirSync(p).length > 0; + }); + } catch (_) {} + } + } + + if (isCached) { + button.setButtonText(t('feat_uninstall_btn')).setWarning(); + } else 
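+                    // The isCached probe above assumes huggingface_hub's standard
+                    // on-disk cache layout, e.g. for one of the preset models:
+                    //   ~/.cache/huggingface/hub/models--BAAI--bge-small-en-v1.5/
+                    //       snapshots/<revision>/...   (must exist and be non-empty)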
{ + button.setButtonText(t('feat_not_cached')); + button.setDisabled(true); + } + button.onClick(async () => { + if (!isCached) return; + button.setButtonText(t('feat_removing')); + button.setDisabled(true); + try { + const pyResult = resolvePythonExecutable(vp, this.plugin.settings); + const { exec } = require('child_process'); + const env = Object.assign({}, process.env, { PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1' }); + await new Promise((resolve, reject) => { + exec(`"${pyResult.path}" -c "import shutil, os; p=os.path.join(os.path.expanduser('~/.cache/huggingface/hub'), '${cacheName}'); shutil.rmtree(p,ignore_errors=True); print('done')"`, { + encoding: 'utf-8', timeout: 30000, env: env + }, (error) => error ? reject(error) : resolve()); + }); + new Notice('Model cache removed.'); + } catch (e) { + new Notice('Failed: ' + (e.stderr || e.message || e)); + } + this.display(); + }); + }); + + // INFO: HF download notice for local mode + const infoDiv = containerEl.createDiv({ cls: 'setting-item-description' }); + infoDiv.createEl('p', { + text: 'Local mode downloads models from Hugging Face on first use. ' + + 'If inaccessible, set an HF Endpoint above (e.g. https://hf-mirror.com) or switch to API mode.', + cls: 'paperforge-settings-desc', + }); + } + + // API config (api mode) + this._renderApiConfig(containerEl); + + // Rebuild button with live terminal output + const terminalEl = containerEl.createEl('pre'); + terminalEl.style.cssText = 'display:none; background:var(--background-primary); padding:10px; border-radius:4px; border:1px solid var(--background-modifier-border); max-height:250px; overflow-y:auto; font-size:11px; font-family:var(--font-monospace); margin:8px 0; white-space:pre-wrap; word-break:break-all; opacity:0.8;'; + terminalEl.onclick = () => { + const text = terminalEl.textContent; + if (text) { navigator.clipboard.writeText(text); new Notice('Output copied to clipboard'); } + }; + + new Setting(containerEl) + .setName(t('feat_rebuild_vectors')) + .setDesc(modelChanged ? t('feat_rebuild_vectors_changed') : t('feat_rebuild_vectors_desc')) + .addButton(button => { + const label = embedInfo && embedInfo.db_exists ? 
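+              // "Rebuild" when embed status reported an existing DB, "Build" on
+              // first use. The click handler below streams `embed build --force`
+              // output into terminalEl via spawn(), forwarding HF_ENDPOINT /
+              // HF_TOKEN and the VECTOR_DB_API_* settings through the child
+              // process environment.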
t('feat_rebuild_btn') : t('feat_build_btn'); + button.setButtonText(label) + .setCta() + .onClick(async () => { + const pyResult = resolvePythonExecutable(vp, this.plugin.settings); + if (!pyResult.path) { new Notice(t('feat_no_python')); return; } + button.setButtonText(t('feat_building')); + button.setDisabled(true); + terminalEl.style.display = 'block'; + terminalEl.setText(''); + + const { spawn } = require('child_process'); + const env = Object.assign({}, process.env, { PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1', HF_ENDPOINT: this.plugin.settings.vector_db_hf_endpoint || 'https://hf-mirror.com', HF_TOKEN: this.plugin.settings.vector_db_hf_token || '', VECTOR_DB_API_KEY: this.plugin.settings.vector_db_api_key || '', VECTOR_DB_API_BASE: this.plugin.settings.vector_db_api_base || '', VECTOR_DB_API_MODEL: this.plugin.settings.vector_db_api_model || '' }); + const child = spawn(pyResult.path, ['-m', 'paperforge', '--vault', vp, 'embed', 'build', '--force'], { + env: env, stdio: ['ignore', 'pipe', 'pipe'] + }); + + const append = (text) => { + terminalEl.setText((terminalEl.getText() || '') + text); + terminalEl.scrollTop = terminalEl.scrollHeight; + }; + + child.stdout.on('data', (data) => append(data.toString())); + child.stderr.on('data', (data) => append(data.toString())); + + try { + await new Promise((resolve, reject) => { + child.on('close', (code) => code === 0 ? resolve() : reject(new Error('Exit code ' + code))); + child.on('error', reject); + }); + this.plugin.settings.vector_db_last_model = currentModel; + this.plugin.saveSettings(); + this._embedStatusText = null; + this._execEmbedStatus(pyResult.path, vp, (text) => { this._embedStatusText = text; this.display(); }); + new Notice(t('feat_build_complete')); + } catch (e) { + append('\n--- BUILD FAILED ---\n' + (e.stderr || e.message || e)); + new Notice(t('feat_build_failed')); + button.setButtonText(label); + button.setDisabled(false); + } + }); + }); + } + + _getCurrentModelKey() { + if (this.plugin.settings.vector_db_mode === 'api') return this.plugin.settings.vector_db_api_model || 'openai/text-embedding-3-small'; + return this.plugin.settings.vector_db_model || 'BAAI/bge-small-en-v1.5'; + } + + _parseEmbedStatus(text) { + // Parse " key: value" lines from paperforge embed status output + const info = {}; + if (!text) return info; + text.split('\n').forEach(line => { + const m = line.match(/^\s*([^:]+):\s*(.*)/); + if (m) info[m[1].trim()] = m[2].trim(); + }); + // Normalize bools + if (info.db_exists !== undefined) info.db_exists = info.db_exists === 'True'; + if (info.chunk_count !== undefined) info.chunk_count = parseInt(info.chunk_count, 10) || 0; + return info; + } + _getPythonDesc(pyPath, source) { if (source === 'stale') { return `[!!] ${pyPath} (stale — path no longer exists, update or clear the override below)`; @@ -3324,6 +4251,12 @@ class PaperForgeSetupModal extends Modal { module.exports = class PaperForgePlugin extends Plugin { async onload() { await this.loadSettings(); + // ── Automatic file polling state ── + this._lastExportMtime = 0; + this._lastOcrMtimes = {}; + this._autoSyncRunning = false; + this._lastSyncTime = null; + this._pollTimer = null; // Clean stale path fields from plugin data.json (migrated to paperforge.json) this.saveSettings(); T = (langFromApp(this.app) === 'zh') ? 
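+    // Polling state contract (fields initialised above, consumed by
+    // _startFilePolling / _checkExports / _checkOcr / _autoSync below):
+    //   _lastExportMtime — newest exports/*.json mtime already handled
+    //   _lastOcrMtimes   — per-paper meta.json mtimes, keyed by OCR dir name
+    //   _autoSyncRunning — mutex so only one `paperforge sync` runs at a time
+    //   _pollTimer       — setInterval handle, cleared in onunload()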
LANG.zh : LANG.en; @@ -3368,6 +4301,7 @@ module.exports = class PaperForgePlugin extends Plugin { if (this.settings.auto_update !== false && this.settings.setup_complete) { setTimeout(() => this._autoUpdate(), 3000); } + this._startFilePolling(); } _autoUpdate() { @@ -3407,6 +4341,99 @@ module.exports = class PaperForgePlugin extends Plugin { }); } + /* ── Automatic file polling for seamless memory layer ── */ + + _startFilePolling() { + const vaultPath = this.app.vault.adapter.basePath; + const fs = require('fs'); + const path = require('path'); + const { exec } = require('child_process'); + + this._pollTimer = setInterval(() => { + this._checkExports(vaultPath, fs, path, exec); + this._checkOcr(vaultPath, fs, path, exec); + }, 120000); // every 120 seconds + } + + _checkExports(vaultPath, fs, path, exec) { + if (this._autoSyncRunning) return; + const exportsDir = path.join(vaultPath, 'System', 'PaperForge', 'exports'); + if (!fs.existsSync(exportsDir)) return; + + let newestMtime = 0; + try { + fs.readdirSync(exportsDir).forEach(f => { + if (!f.endsWith('.json')) return; + const stat = fs.statSync(path.join(exportsDir, f)); + if (stat.mtimeMs > newestMtime) newestMtime = stat.mtimeMs; + }); + } catch(e) { return; } + + if (newestMtime > this._lastExportMtime) { + this._lastExportMtime = newestMtime; + this._autoSync(vaultPath, exec); + } + } + + _autoSync(vaultPath, exec) { + if (this._autoSyncRunning) return; + this._autoSyncRunning = true; + + const pyResult = resolvePythonExecutable(vaultPath, this.settings); + if (!pyResult.path) { this._autoSyncRunning = false; return; } + + const cmd = `"${pyResult.path}" -m paperforge --vault "${vaultPath}" sync`; + exec(cmd, { timeout: 120000, encoding: 'utf-8' }, (err, stdout, stderr) => { + this._autoSyncRunning = false; + this._memoryStatusText = null; // force re-check next time + if (!err) { + this._lastSyncTime = new Date().toLocaleTimeString(); + } + // Update last export mtime to avoid re-trigger during build + try { + const fs = require('fs'); + const path = require('path'); + const exportsDir = path.join(vaultPath, 'System', 'PaperForge', 'exports'); + let newest = 0; + fs.readdirSync(exportsDir).forEach(f => { + if (!f.endsWith('.json')) return; + newest = Math.max(newest, fs.statSync(path.join(exportsDir, f)).mtimeMs); + }); + this._lastExportMtime = newest; + } catch(e) {} + }); + } + + _checkOcr(vaultPath, fs, path, exec) { + if (this._autoSyncRunning) return; + const ocrDir = path.join(vaultPath, 'System', 'PaperForge', 'ocr'); + if (!fs.existsSync(ocrDir)) return; + + try { + fs.readdirSync(ocrDir, { withFileTypes: true }).forEach(entry => { + if (!entry.isDirectory()) return; + const metaPath = path.join(ocrDir, entry.name, 'meta.json'); + if (!fs.existsSync(metaPath)) return; + const stat = fs.statSync(metaPath); + const prevMtime = this._lastOcrMtimes[entry.name] || 0; + if (stat.mtimeMs <= prevMtime) return; + + this._lastOcrMtimes[entry.name] = stat.mtimeMs; + if (this._autoSyncRunning) return; + this._autoSyncRunning = true; + + const pyResult = resolvePythonExecutable(vaultPath, this.settings); + if (!pyResult.path) { this._autoSyncRunning = false; return; } + + const cmd = `"${pyResult.path}" -m paperforge --vault "${vaultPath}" sync --key "${entry.name}"`; + exec(cmd, { timeout: 30000, encoding: 'utf-8' }, () => { + this._autoSyncRunning = false; + this._memoryStatusText = null; + }); + }); + } catch(e) {} + } + /** * Read path configuration from the canonical paperforge.json file. 
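 * Illustrative shape only — the keys mirror what the plugin reads here, the
 * values are examples, not defaults:
 *   { "system_dir": "System", "python_path": "/usr/bin/python3" }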
* Falls back to Python-level DEFAULT_CONFIG values if file does not exist. @@ -3507,11 +4534,17 @@ module.exports = class PaperForgePlugin extends Plugin { } onunload() { + if (this._pollTimer) clearInterval(this._pollTimer); this.app.workspace.detachLeavesOfType(VIEW_TYPE_PAPERFORGE); } async loadSettings() { this.settings = Object.assign({}, DEFAULT_SETTINGS, await this.loadData()); + // Deep-merge nested objects (features, frozen_skills) to avoid overwrite + if (this.settings.features && DEFAULT_SETTINGS.features) { + this.settings.features = Object.assign({}, DEFAULT_SETTINGS.features, this.settings.features || {}); + } + if (!this.settings.frozen_skills) { this.settings.frozen_skills = {}; } // Path fields come from paperforge.json, not from DEFAULT_SETTINGS or plugin data.json const pfConfig = this.readPaperforgeJson(); this.settings.system_dir = pfConfig.system_dir; diff --git a/paperforge/plugin/manifest.json b/paperforge/plugin/manifest.json index b535c32..21ba972 100644 --- a/paperforge/plugin/manifest.json +++ b/paperforge/plugin/manifest.json @@ -1,9 +1,9 @@ { "id": "paperforge", "name": "PaperForge", - "version": "1.5.5", + "version": "1.5.6rc1", "minAppVersion": "1.9.0", - "description": "PaperForge — Zotero literature pipeline. Sync PDFs, run OCR, and read with AI-assisted deep reading.", + "description": "Zotero literature pipeline for Obsidian. Sync PDFs, run OCR, and read with AI-assisted deep reading.", "author": "Lin Zhaoxuan", "authorUrl": "https://github.com/LLLin000", "isDesktopOnly": true diff --git a/paperforge/plugin/styles.css b/paperforge/plugin/styles.css index 31f0725..8c0682e 100644 --- a/paperforge/plugin/styles.css +++ b/paperforge/plugin/styles.css @@ -1584,15 +1584,15 @@ } .paperforge-runtime-badge.match { background: var(--color-green, #4caf50); - color: #fff; + color: #ffffff; } .paperforge-runtime-badge.mismatch { background: var(--color-red, #f44336); - color: #fff; + color: #ffffff; } .paperforge-runtime-badge.missing { background: var(--color-orange, #ff9800); - color: #fff; + color: #ffffff; } /* ========================================================================== @@ -1600,7 +1600,7 @@ ========================================================================== */ .paperforge-drift-banner { background: var(--color-yellow, #ffc107); - color: var(--background-primary, #000); + color: var(--background-primary, #000000); padding: 8px 14px; border-radius: 6px; font-size: 0.9em; @@ -1619,8 +1619,8 @@ padding: 4px 12px; font-size: 0.85em; background: var(--interactive-normal, #e0e0e0); - color: var(--text-normal, #000); - border: 1px solid var(--background-modifier-border, #ccc); + color: var(--text-normal, #000000); + border: 1px solid var(--background-modifier-border, #cccccc); border-radius: 4px; cursor: pointer; } diff --git a/paperforge/plugin/versions.json b/paperforge/plugin/versions.json index ecd95b7..f3dcc74 100644 --- a/paperforge/plugin/versions.json +++ b/paperforge/plugin/versions.json @@ -5,5 +5,8 @@ "1.4.18": "1.9.0", "1.5.0": "1.9.0", "1.5.1": "1.9.0", - "1.5.2": "1.9.0" + "1.5.2": "1.9.0", + "1.5.3": "1.9.0", + "1.5.4": "1.9.0", + "1.5.5": "1.9.0" } \ No newline at end of file diff --git a/paperforge/schema/field_registry.yaml b/paperforge/schema/field_registry.yaml index cb664dc..119dc3b 100644 --- a/paperforge/schema/field_registry.yaml +++ b/paperforge/schema/field_registry.yaml @@ -6,6 +6,13 @@ frontmatter: description: "Zotero citation key" owner: sync introduced_in: "1.0" + citation_key: + type: str + required: false + public: 
true + description: "Better BibTeX citation key (e.g. aaronStimulationGrowthFactor2004)" + owner: sync + introduced_in: "1.5" domain: type: str required: true diff --git a/paperforge/services/skill_deploy.py b/paperforge/services/skill_deploy.py index 48782d5..12e010f 100644 --- a/paperforge/services/skill_deploy.py +++ b/paperforge/services/skill_deploy.py @@ -1,4 +1,4 @@ -"""Skill deployment service — single copytree for all platforms. +"""Skill deployment service — deploys the unified paperforge skill to the vault. Used by both setup wizard (install) and update worker (update). All deployments are vault-local only. @@ -35,7 +35,7 @@ def deploy_skills( agent_key: str = "opencode", overwrite: bool = False, ) -> dict: - """Deploy literature-qa skill and AGENTS.md to the vault. + """Deploy paperforge skill and AGENTS.md to the vault. Args: vault: Obsidian vault root. @@ -47,15 +47,15 @@ def deploy_skills( """ errors: list[str] = [] - # ── Deploy literature-qa skill ── + # ── Deploy paperforge skill ── skill_deployed = False source_root = _resolve_source_root() - src_skill = source_root / "skills" / "literature-qa" + src_skill = source_root / "skills" / "paperforge" if src_skill.exists(): skill_dir_name = AGENT_SKILL_DIRS.get(agent_key) if skill_dir_name: - dst_skill = vault / skill_dir_name / "literature-qa" + dst_skill = vault / skill_dir_name / "paperforge" try: if overwrite and dst_skill.exists(): shutil.rmtree(dst_skill, ignore_errors=True) diff --git a/paperforge/setup/agent.py b/paperforge/setup/agent.py index f6b4bd5..83c867d 100644 --- a/paperforge/setup/agent.py +++ b/paperforge/setup/agent.py @@ -1,4 +1,4 @@ -"""AgentInstaller — deploys literature-qa skill to vault-local agent config.""" +"""AgentInstaller — deploys paperforge skill to vault-local agent config.""" from __future__ import annotations @@ -10,7 +10,7 @@ class AgentInstaller: - """Deploy literature-qa skill directory to vault-local agent skills path.""" + """Deploy paperforge skill directory to vault-local agent skills path.""" def __init__(self, vault: Path, agent_type: str = "opencode"): self.vault = vault @@ -23,8 +23,8 @@ def _get_skills_dir(self) -> Path: return self.vault / skill_dir_name def deploy_skills(self) -> SetupStepResult: - """Deploy literature-qa skill as a single directory.""" - source_skills = self._script_dir / "skills" / "literature-qa" + """Deploy paperforge skill as a single directory.""" + source_skills = self._script_dir / "skills" / "paperforge" if not source_skills.exists(): return SetupStepResult( step="agent_installer", @@ -33,7 +33,7 @@ def deploy_skills(self) -> SetupStepResult: error=f"Not found: {source_skills}", ) - target_dir = self._get_skills_dir() / "literature-qa" + target_dir = self._get_skills_dir() / "paperforge" target_dir.mkdir(parents=True, exist_ok=True) try: @@ -41,7 +41,7 @@ def deploy_skills(self) -> SetupStepResult: return SetupStepResult( step="agent_installer", ok=True, - message=f"Deployed literature-qa skill to {target_dir}", + message=f"Deployed paperforge skill to {target_dir}", details={"source": str(source_skills), "target": str(target_dir)}, ) except Exception as e: diff --git a/paperforge/setup_wizard.py b/paperforge/setup_wizard.py index f3cee71..b1c3661 100644 --- a/paperforge/setup_wizard.py +++ b/paperforge/setup_wizard.py @@ -523,6 +523,7 @@ def headless_setup( pf_path / "exports", pf_path / "ocr", pf_path / "config", + pf_path / "methodology", pf_path / "worker/scripts", vault / resources_dir / literature_dir, vault / base_dir, @@ -653,7 +654,7 @@ def 
headless_setup( overwrite=True, ) if skill_result["skill_deployed"]: - print(" [OK] literature-qa skill deployed") + print(" [OK] paperforge skill deployed") for err in skill_result.get("errors", []): print(f" [WARN] {err}") diff --git a/paperforge/skills/literature-qa/SKILL.md b/paperforge/skills/literature-qa/SKILL.md deleted file mode 100644 index 166b7ce..0000000 --- a/paperforge/skills/literature-qa/SKILL.md +++ /dev/null @@ -1,158 +0,0 @@ ---- -name: literature-qa -description: > - 学术文献库操作:精读、问答、检索、批量阅读。Triggered by: - pf-deep pf-paper pf-end, - "精读", "文献问答", "结束讨论", "找文献", "搜文献", - "文献库", "文献检索", "库里有什么", "搜一下库里", "看一下文献库", - "读一下collection", "总结文献", "批量阅读", "读一下这个方向". ---- - -# Literature QA - ---- - -## 1. Bootstrap — 必须先执行 - -跑这个脚本: - -``` -python $SKILL_DIR/scripts/pf_bootstrap.py -``` - -返回 JSON。记住以下变量: - -| 变量 | 来自 JSON 的 | 用于 | -| ----------- | -------------------- | --------------------------------------- | -| `$SKILL_DIR` | skill 安装路径(平台注入) | 运行 `scripts/ld_deep.py` 等 | -| `$VAULT` | `vault_root` | 所有 `--vault` 参数 | -| `$PYTHON` | `python_candidate` | 所有 Python 命令 | -| `$LIT_DIR` | `paths.literature_dir` | 文献笔记根目录 | -| `$IDX_PATH` | `paths.index_path` | 索引文件 | -| `$OCR_DIR` | `paths.ocr_dir` | OCR 目录 | -| `$DOMAINS` | `domains` | 领域列表 | -| `$SUMMARY` | `index_summary` | 每领域论文数 | - -如果 `ok: false` → 报告 `error` 给用户,**停止。不许自己拼路径。** - ---- - -## 2. Vault 概览 - -展示: - -``` -Vault: $VAULT -文献库: - — N1 篇 - — N2 篇 -共 M 篇 -``` - -**如果用户是空输入触发的 skill**(没给任何具体指令),展示概览后加一句交互: - -``` -你可以: - [1] 精读一篇论文 → "精读 " - [2] 文献问答 → "文献问答 " - [3] 搜索文献 → "找文献 <关键词>" / "库里有没有 <关键词>" - [4] 批量阅读 → "读一下 " / "总结 <方向> 文献" - [5] 返回 -``` - -**如果用户给了具体指令**,直接进入决策树。 - ---- - -## 3. 决策树 - -``` -用户输入 - │ - ├─ 文献标识 (key/DOI/标题/作者年份) + 精读意图 - │ └─ 路由 → deep-reading.md - │ - ├─ 文献标识 (key/DOI/标题/作者年份) + 问答/讨论意图 - │ └─ 路由 → paper-qa.md - │ - ├─ 搜索意图 ("找文献"/"搜文献"/"库里有没有"/"文献检索") - │ └─ 路由 → paper-search.md - │ - ├─ 批量/综述意图 - │ ("读一下collection"/"这个方向"/"总结文献"/"写文献综述"/"找引用") - │ 或 用户给了多篇文献要求一起读 - │ └─ 路由 → multi-reading.md - │ - ├─ 结束/保存 ("结束讨论"/"保存"/"pf-end") - │ └─ 路由 → save-session.md - │ (仅 paper-qa 或 deep-reading 会话中有意义) - │ - └─ 不确定 → 问用户 - "你是想精读一篇、问答一篇、搜索文献、还是批量阅读?" -``` - ---- - -## 4. 工具使用指南 - -本 Skill 提供两类工具:**确定性命令** 和 **Agent 自查**。必须根据场景选择正确的方式。 - -### 确定性命令 — 优先使用 - -| 场景 | 命令 | 原因 | -| ---------------------- | ------------------------------------------------------------------------------------------ | ---- | -| 按 key 快速找文件 | `glob("$LIT_DIR/**/.md")` 或用 `Get-ChildItem "$LIT_DIR" -Recurse -Filter ".md"` | 不需要 $PYTHON,最快 | -| 按 key 查完整信息 | `$PYTHON -m paperforge.worker.paper_resolver resolve-key --vault "$VAULT"` | 返回 frontmatter 字段 (analyze, ocr_status 等) | -| 按 DOI 定位论文 | `$PYTHON -m paperforge.worker.paper_resolver resolve-doi "" --vault "$VAULT"` | DOI 无法用文件系统快速匹配 | -| 按字段搜索论文 | `$PYTHON -m paperforge.worker.paper_resolver search --title "..." --author "..." --year ... --domain "..." 
--vault "$VAULT"` | 结构化搜索,含相关性打分 | -| 精读 prepare | `$PYTHON "$SKILL_DIR/scripts/ld_deep.py" prepare --key --vault "$VAULT"` | -| 精读 postprocess | `$PYTHON "$SKILL_DIR/scripts/ld_deep.py" postprocess-pass2 --figures --vault "$VAULT"` | -| 精读 validate | `$PYTHON "$SKILL_DIR/scripts/ld_deep.py" validate-note --fulltext ` | -| 保存讨论 | `$PYTHON -m paperforge.worker.discussion record --vault "$VAULT" --agent pf-paper --model "" --qa-pairs ''` | - -### Agent 自查 — 当命令覆盖不到时用 - -| 场景 | 操作 | -| ------------------------ | ----------------------------------------------------------- | -| 按关键词模糊搜索全部文献 | 读 `$IDX_PATH` 的 JSON,筛 `title` / `abstract` / `journal` | -| 按 collection 筛选 | 读 `$IDX_PATH`,筛 `collection_path` 字段 | -| 读论文全文 | 已找到 `fulltext.md` 路径(glob 或 resolve-key) → 直接 read | -| 读精读笔记 | 已找到 formal note 路径 → read 的 `## 🔍 精读` 区域 | -| 遍历笔记做批量统计 | `Get-ChildItem "$LIT_DIR" -Recurse -Filter "*.md"` + 读 frontmatter 或 `find "$LIT_DIR" -name "*.md"` | -| **禁止的操作** | **根据 vault-knowledge 示例拼接路径、把目录名写死在文件路径里** | - ---- - -## 5. 路由表 - -| 路由 | 触发词 | 加载文件 | -| ------------- | ---------------------------------------------------------- | ---------------------------------------- | -| 精读 | `pf-deep `, "精读 " | [deep-reading.md](references/deep-reading.md) | -| 问答 | `pf-paper `, "文献问答 " | [paper-qa.md](references/paper-qa.md) | -| 文献检索 | "找文献", "搜文献", "文献检索", "搜一下库里", "库里有没有" | [paper-search.md](references/paper-search.md) | -| 批量阅读 | "读一下collection", "这个方向", "总结文献", "批量阅读" | [multi-reading.md](references/multi-reading.md) | -| 保存记录 | `pf-end`, "结束讨论", "保存" | [save-session.md](references/save-session.md) | -| 论文定位协议 | 所有路由共享 | [paper-resolution.md](references/paper-resolution.md) | - -> 所有路由继承 Skill 级别的 `$PYTHON` / `$VAULT` / `$LIT_DIR` 等变量。reference 文件不再重复声明。 - ---- - -## 文件结构 - -``` -literature-qa/ -├── SKILL.md ← 本文件 -├── references/ -│ ├── deep-reading.md ← 精读工作流 -│ ├── paper-qa.md ← 问答工作流 -│ ├── paper-search.md ← 文献检索工作流 -│ ├── multi-reading.md ← 批量阅读工作流 -│ ├── save-session.md ← 保存记录工作流 -│ ├── paper-resolution.md ← 论文定位协议 -│ ├── deep-subagent.md -│ └── chart-reading/ -└── scripts/ - ├── pf_bootstrap.py ← Bootstrap 入口 - └── ld_deep.py ← 精读引擎 -``` diff --git a/paperforge/skills/literature-qa/references/deep-reading.md b/paperforge/skills/literature-qa/references/deep-reading.md deleted file mode 100644 index 0f8df6e..0000000 --- a/paperforge/skills/literature-qa/references/deep-reading.md +++ /dev/null @@ -1,162 +0,0 @@ -# 三阶段精读 - -Keshav 三阶段组会式精读。触发后执行以下工作流。 - ---- - -## 前置条件检查 - -执行前确认: -- [ ] 已完成论文定位(参考 [paper-resolution.md](paper-resolution.md)),拿到 zotero_key -- [ ] 用 `glob("$LIT_DIR/**/.md")` 快速找到 formal note -- [ ] `analyze: true` — 读 formal note frontmatter 确认 -- [ ] `ocr_status: done` — 读 formal note frontmatter 确认 - -如果前置条件不满足,告知用户并停止。 - ---- - -## 执行流程 - -### Step 1: Prepare(机械操作,跑脚本) - -```bash -$PYTHON "$SKILL_DIR/scripts/ld_deep.py" prepare --key --vault "$VAULT" -``` - -返回 JSON 解析: -- `status: "ok"` → 记下 `figure_map`、`chart_type_map`、`formal_note`、`fulltext_md`、`figures`、`tables` 路径和数量 → 继续 -- `status: "error"` → 报告 `message` 给用户,停止 - -读 formal note 确认 `## 🔍 精读` 骨架已插入。 - ---- - -### Step 2: Pass 1 — 概览 - -只填 `### Pass 1: 概览` 区域。不碰 Pass 2/3。 - -**填写内容:** - -- **一句话总览**:论文类型 + 核心发现,一句话。 -- **5 Cs 快速评估**: - - **Category**(类型):RCT / 队列研究 / 病例对照 / 综述 / 基础研究 / ... 
- - **Context**(上下文):该领域当前共识,本文要解决什么问题 - - **Correctness**(合理性初判):初步直觉,逻辑是否有明显漏洞 - - **Contributions**(贡献):1-3 条 - - **Clarity**(清晰度):写作质量,图表可读性 -- **Figure 导读**(基于 fulltext.md 浏览各图 caption): - - 关键主图:列出并一句话概括每个主图要证明什么 - - 证据转折点:哪个 figure 是叙事的关键转折 - - 需要重点展开的 supplementary:如果有 - - 关键表格:列出 - -填完立即保存 formal note。 - ---- - -### Step 3: Pass 2 — 精读还原 - -填 `### Pass 2: 精读还原` 区域。**按 figure 顺序逐个处理。** - -#### 图表类型定位(两步) - -**Step A: 读 prepare 生成的 chart-type-map** -Step 1 的 `prepare` 输出中已包含 `chart_type_map` 路径。读该文件,获取每个 figure 的关键词命中结果。这只是建议。 - -**Step B: Agent 读 caption 做最终判断** - -对每个 figure: -1. 读该 figure 的 caption(来自 prepare 返回的 `fulltext_md` 或 `figure_map`) -2. 根据 caption 内容,对照 [chart-reading/INDEX.md](chart-reading/INDEX.md) 判断图表类型 -3. chart-type-map 建议和 Agent 判断不一致 → 以 Agent 判断为准 -4. 无法确定类型 → 跳过 chart guide,按通用 figure 结构分析 -5. 确定类型后,读对应的 chart-reading 指南(如 `chart-reading/条形图与误差棒.md`),按指南中的检查清单分析 - -#### 每张 Figure 的子标题(固定,不可少) - -按以下格式填入 formal note 中该 figure 的 callout block: - -``` -**图像定位与核心问题**:页码 + 要回答什么问题 -**方法与结果**:实验设计/数据来源/技术手段。核心数据、趋势、对比。 -**图表质量审查**:按 chart-reading 指南检查坐标轴、单位、误差棒、统计标注等。 -**作者解释**:作者在正文中对该图的解读 -**我的理解**:自己的理解(区分于作者解释) -**疑点/局限**:读图时发现的疑问,用 `> [!warning]` 突出 -``` - -#### 每张 Table 的子标题 - -``` -回答什么问题、关键字段/分组、主要结果、我的理解、疑点/局限 -``` - -#### 每张 figure 填完立即保存,再处理下一张。 - -#### 所有 figure/table 处理完后,填: - -**关键方法补课**:简要解释不熟悉的实验技术(1-2 项即可) - -**主要发现与新意**: -- 发现 1:...(来源:Figure X) -- 发现 2:...(来源:Figure Y / Table Z) - -保存。 - ---- - -### Step 4: Postprocess(跑校验脚本,修正问题) - -```bash -$PYTHON "$SKILL_DIR/scripts/ld_deep.py" postprocess-pass2 "$FORMAL_NOTE_PATH" --figures --format text --vault "$VAULT" -``` - -- 输出 `OK` → 继续 -- 输出错误 → 按错误提示修正(包含行号),修正后重新跑 -- 最多 3 轮修正。3 轮后仍失败 → 报告剩余错误给用户 - ---- - -### Step 5: Pass 3 — 深度理解 - -填 `### Pass 3: 深度理解` 区域。基于 Pass 1/2 已写的内容。 - -**填写内容:** - -- **假设挑战与隐藏缺陷**:隐含假设;如果放宽某个假设结论还成立吗;缺少哪些关键引用;实验/分析技术潜在问题 -- **哪些结论扎实,哪些仍存疑**: - - **较扎实**:... - - **仍存疑**:...(用 `> [!warning]`) -- **Discussion 与 Conclusion 怎么读**:作者真正完成了什么;哪些地方有拔高;哪些是推测 -- **对我的启发**:研究设计上;figure 组织上;方法组合上;未来工作想法 -- **遗留问题**:...(用 `> [!question]`) - -保存。 - ---- - -### Step 6: Final Validation - -```bash -$PYTHON "$SKILL_DIR/scripts/ld_deep.py" validate-note "$FORMAL_NOTE_PATH" --fulltext "$FULLTEXT_PATH" -``` - -- 输出 `OK` → 告知用户精读完成 -- 输出错误 → 修正缺失项,不报告成功直到通过 - ---- - -## Callout 格式规则 - -- `> [!important]`:每个 main finding -- `> [!warning]`:疑问、局限、证据边界、仍存疑条目 -- `> [!question]`:遗留问题 -- **间距:** 相邻 callout block 之间必须有空行,否则 Obsidian 会合并 - - 正确:`> [!important] A\n\n> [!important] B` - - 错误:`> [!important] A\n> [!important] B` - -## Supplementary 规则 - -- 默认不逐张展开 supplementary figure/table -- 仅在以下情况纳入:对主结论形成关键支撑、补足方法可信度、限制主文结论解释范围、作者在正文中明显依赖该补充材料 diff --git a/paperforge/skills/literature-qa/references/deep-subagent.md b/paperforge/skills/literature-qa/references/deep-subagent.md deleted file mode 100644 index 9f0de7f..0000000 --- a/paperforge/skills/literature-qa/references/deep-subagent.md +++ /dev/null @@ -1,103 +0,0 @@ -# Subagent Prompt for /pf-deep - -## Task - -Execute Keshav 3-pass journal-club style deep reading on a paper and write the results into the `## 🔍 精读` section of its formal note. - -## Input Variables - -- `{{ZOTERO_KEY}}` — Zotero citation key (e.g. `Y5KQ4JQ7`) -- `{{VAULT}}` — Vault root path -- `{{SCRIPT}}` — Path to `ld_deep.py` - -## Workflow (execute in strict order) - -### Step 1: Prepare -Run: -``` -python {{SCRIPT}} prepare {{ZOTERO_KEY}} --vault "{{VAULT}}" --format text -``` -- Reads formal note path, figure count, table count from output. 
-- If output starts with `[ERROR]`: report error to user, stop. -- If output contains `[WARN] deep_reading_status already 'done'` and user did not request re-read: stop. -- Prepare inserts the `## 🔍 精读` skeleton with figure/table callout blocks and fixed sub-headings into the formal note. Read the note to inspect its structure. - -### Step 2: Pass 1 (概览) -Fill `### Pass 1: 概览` only. Do not touch Pass 2/3. -- `**一句话总览**`: paper type + core finding in one sentence. -- `**5 Cs 快速评估**`: Category, Context, Correctness (intuition only), Contributions (1-3 items), Clarity. -- `**Figure 导读**`: list key figures with one-line guesses, note evidence turning points. -- Save immediately after writing. - -### Step 3: Pass 2 (精读还原) -Fill `### Pass 2: 精读还原`. Process figures sequentially starting from Figure 1. Each figure callout block has fixed sub-headings. Fill content under each sub-heading. Do NOT modify sub-headings, reorder blocks, or move `![[image]]` embeds. - -**Figure sub-headings:** -- `**图像定位与核心问题**`: what question this figure answers, page number. -- `**方法与结果**`: experimental design / data source / technical approach. Core data points, trends, comparisons. -- `**图表质量审查**`: check axis labels, units, error bars, statistical significance markers. Read `chart-type-map.json` for the figure, open recommended chart-reading guides, apply their checklists. -- `**作者解释**`: authors' description from the text. -- `**我的理解**`: your own analysis (distinct from author explanation). -- `**疑点/局限**`: use `> [!warning]` for concerns. - -**Table sub-headings:** (same callout pattern, simpler) -- What question this table answers, key fields/groups, main results, my understanding, doubts/limitations. - -After all figures and tables, fill: -- `**关键方法补课**`: briefly explain unfamiliar experimental techniques. -- `**主要发现与新意**`: list findings with evidence source (Figure X / Table Y). - -Save after each figure block. - -### Step 4: Postprocess -Run: -``` -python {{SCRIPT}} postprocess-pass2 --figures --format text -``` -- If output is `OK`: proceed. -- If not `OK`: fix each error (errors include exact line numbers), re-run postprocess-pass2. Max 3 fix rounds. If still failing after 3 rounds, report remaining errors to user. - -### Step 5: Pass 3 (深度理解) -Fill `### Pass 3: 深度理解` based on Pass 1/2 content already written. Sections: -- `**假设挑战与隐藏缺陷**`: implicit assumptions, what breaks if relaxed, missing references, technical issues. -- `**哪些结论扎实,哪些仍存疑**`: split into 较扎实 / 仍存疑. -- `**Discussion 与 Conclusion 怎么读**`: what authors actually accomplished vs. overclaim vs. speculation. -- `**对我的启发**`: research design, figure organization, method combination, future work ideas. -- `**遗留问题**`: open questions. -- Save. - -### Step 6: Final Validation -Run: -``` -python {{SCRIPT}} validate-note --fulltext -``` -- Report result to user. If not `OK`, list missing items and fix. - -## Callout Rules - -- `> [!important]`: each main finding entry -- `> [!warning]`: doubts, limitations, evidence boundaries, items in 仍存疑 -- `> [!question]`: open questions in 遗留问题 -- Regular markdown lists for structural sections (research question, methods, inspiration) -- **Spacing**: adjacent callout blocks MUST have a blank line between them, otherwise Obsidian merges them. -- Correct: `> [!important] A\n\n> [!important] B` -- Incorrect: `> [!important] A\n> [!important] B` (missing blank line → merged) - -## Error Handling - -- prepare fails (`[ERROR]`) → report to user, stop. 
-- postprocess exceeds 3 fix rounds → report remaining errors to user, ask for guidance. -- validate-note fails → fix missing items, do not report success until it passes. - -## Command Reference - -``` -# Prepare (insert skeleton + check preconditions) -python {{SCRIPT}} prepare {{ZOTERO_KEY}} --vault "{{VAULT}}" --format text - -# Postprocess Pass 2 (fix spacing/section issues) -python {{SCRIPT}} postprocess-pass2 --figures --format text - -# Validate final note structure -python {{SCRIPT}} validate-note --fulltext -``` diff --git a/paperforge/skills/literature-qa/references/multi-reading.md b/paperforge/skills/literature-qa/references/multi-reading.md deleted file mode 100644 index c47f15a..0000000 --- a/paperforge/skills/literature-qa/references/multi-reading.md +++ /dev/null @@ -1,144 +0,0 @@ -# 批量文献阅读 - -用户需要阅读多篇文献并总结——综述写作、找引用、研究方向调研等。 - ---- - -## 触发条件 - -- 用户给了一个 collection 名(Zotero 收藏夹) -- 用户给了模糊方向("帮我看一下骨科里关于支架材料的文章") -- 用户给了多篇文献要求一起读 -- 用户说"总结库里XXX方向的文献"、"写一段文献综述" - ---- - -## 执行流程 - -### Step 1: 确定文献范围 - -和用户确认要读哪些文献: -- 用户给了 collection 名 → 读 `$IDX_PATH`,筛 `collection_path` 包含该名称的条目 -- 用户给了关键词方向 → 用 paper_resolver search 或直接 grep `$IDX_PATH` -- 用户给了多篇 key → 直接确认 key 列表 - -列出候选让用户确认: - -``` -找到 N 篇匹配 (): - -[1] ABC12345 — Title (Author, Year, Domain, OCR: done/pending) -[2] DEF67890 — Title (Author, Year, Domain, OCR: done/pending) -... - -要全部读,还是选几篇?(输入编号如 "1,3,5" 或 "all") -``` - -### Step 2: 逐篇阅读 - -对每篇选定文献: - -1. 用 glob 找到 formal note:`glob("$LIT_DIR/**/.md")`(最快,不需要 $PYTHON) -2. 读 formal note frontmatter → 元数据 -3. 同目录下找 `fulltext.md` → 读 Abstract、Results、Discussion -4. 如果有 OCR 但 fulltext 太长 → 先读 caption + figure 描述定位关键段落 -5. 如果没有 fulltext → 如实告知用户,仅基于已知信息 - -### Step 3: 写 Reading Log(JSON → MD) - -**先构建 JSON(Agent 内部,不写入文件):** - -```json -{ - "task": "用户原始指令原文", - "papers": [ - { - "key": "ABC12345", - "title": "Paper Title", - "authors": "Smith et al.", - "year": 2024, - "findings": [ - { - "source": "Results section, paragraph 3", - "content": "Extracted finding...", - "citation_use": "可用于支撑 XXX 观点" - } - ] - } - ] -} -``` - -**再渲染为 MD,追加写入 `$VAULT/Bases/reading-log-.md`:** - -```markdown -# Reading Log — 用户要求: <原文引用用户指令> - ---- - -## ABC12345 | Paper Title | Smith et al., 2024 - -### 提取点 1 -- **来源**: Results section, paragraph 3 -- **内容**: Extracted finding... -- **引用建议**: 可用于支撑 XXX 观点 - -### 提取点 2 -- **来源**: Discussion, final paragraph -- **内容**: ... -- **引用建议**: ... - ---- - -## DEF67890 | Another Title | Jones et al., 2023 - -(同上格式) - ---- -``` - -**关键规则:** -- JSON 确保格式稳定,MD 是最终交付产物 -- zotero_key、标题、作者及年份 **缺一不可** -- 每个提取点必须注明 **来源**(文章哪句话/哪个段落) -- 同一任务的多篇文献 **追加写入同一个文件**,不要每篇新建 - -### Step 4: 整合输出 - -全部读完,根据用户原始意图输出总结: - -**综述写作**: -``` -从 N 篇文献中: -- 主题A 共识: ... -- 主题A 争议: ... -- 方法论趋势: ... -- 关键引用: - 1. "...[结论]" — ABC12345 (Author, Year), Fig.X - 2. ... -``` - -**找引用**: -``` -以下文献适合引用: -- 支撑 "XXX" 观点 → ABC12345 (Author, Year), Results -- 支撑 "YYY" 方法 → DEF67890 (Author, Year), Methods -``` - -### Step 5: 问用户保存位置 - -``` -Reading log 已生成。要保存到哪里? 
-(留空 → 默认 $VAULT/Bases/reading-log-.md) -``` - -让用户指定路径。如果用户说不清,默认放到 `$VAULT/Bases/`。 - ---- - -## 注意事项 - -- **暂时不支持多篇阅读后运行 pf-end / 结束讨论**(该功能待定) -- 如果某篇文献没有 fulltext,如实告知用户,不要捏造内容 -- Reading log 中每条提取点必须在原文中有据可查 -- JSON → MD 转换由 Agent 完成,用户只看到 MD 文件 diff --git a/paperforge/skills/literature-qa/references/paper-qa.md b/paperforge/skills/literature-qa/references/paper-qa.md deleted file mode 100644 index 626f23e..0000000 --- a/paperforge/skills/literature-qa/references/paper-qa.md +++ /dev/null @@ -1,61 +0,0 @@ -# 论文问答 - -交互式论文 Q&A 工作台。不强制要求 OCR,但 OCR 完成后回答更准确。 - ---- - -## 前置条件 - -- [ ] 已完成论文定位(参考 [paper-resolution.md](paper-resolution.md)),拿到 zotero_key 和 workspace -- [ ] OCR 完成(推荐但非强制) - ---- - -## 执行流程 - -### Step 1: 加载论文 - -1. 确认 workspace 路径 -2. 读 `fulltext.md`(如果存在)作为主要回答依据 -3. 读 formal note frontmatter 获取元数据(标题、作者、期刊、年份) -4. 如果 fulltext.md 不存在,告知用户 "OCR 文本不可用,回答将基于元数据和公开信息" - -### Step 2: 显示论文信息 - -``` -已加载论文: [title] ([year], [journal]) -作者: [authors] -Zotero Key: [key] -领域: [domain] -OCR 状态: [done / 不可用] -结束对话时说 "保存" 即可保存讨论。 -请问有什么问题? -``` - -### Step 3: 进入 Q&A 模式 - -- 等待用户提问 -- 每次回答后等待下一个问题 -- 持续到用户说 "保存"、"结束"、"完成" 等关键词 - ---- - -## 回答原则 - -- **严格基于** fulltext.md 中的文本内容回答 -- 引用原文时标注来源页码/章节(如 "第 3 页,Methods 部分") -- 用中文(简体中文)回答 -- 论文中未提及的内容,明确说明 "论文中未提及该内容" -- 需要结合论文以外知识的问题,说明 "该问题需要结合论文以外的知识" - ---- - -## 切换模式 - -用户在当前对话中可以说 "精读这篇文章" 切换到 deep-reading 模式。此时加载 [deep-reading.md](deep-reading.md) 执行精读流程。 - ---- - -## 保存记录 - -用户说 "保存"、"结束"、"完成"、"保存讨论" 时,加载 [save-session.md](save-session.md) 执行保存。不要自动保存。 diff --git a/paperforge/skills/literature-qa/references/paper-resolution.md b/paperforge/skills/literature-qa/references/paper-resolution.md deleted file mode 100644 index 044b30b..0000000 --- a/paperforge/skills/literature-qa/references/paper-resolution.md +++ /dev/null @@ -1,94 +0,0 @@ -# 论文定位协议 - -本文件定义如何将用户输入解析为论文 workspace。所有子流程公用。 - -## 核心原则 - -1. **Python 做确定性查找。** key、DOI、标题片段、作者+年份。 -2. **Agent 做理解和兜底。** 自然语言、Python 无结果时的 fallback 搜索。 -3. **路径从 `paths` 获取,不硬编码。** 禁止根据 vault-knowledge.md 的示例结构拼接路径。`ocr_dir`、`literature_dir`、`index_path` 只能从 `paper_resolver paths` 或 `paper_resolver resolve-key` 的返回 JSON 中读取。任何情况下都不要把目录名(如 `System`、`Resources`)写死在路径里。 - ---- - -## 通用命令 - -| 操作 | 命令 | -|------|------| -| 获取 vault 路径 | 已由 pf_bootstrap 完成 | -| 按 key 查 | `$PYTHON -m paperforge.worker.paper_resolver resolve-key --vault "$VAULT"` | -| 按 DOI 查 | `$PYTHON -m paperforge.worker.paper_resolver resolve-doi "" --vault "$VAULT"` | -| 按字段搜 | `$PYTHON -m paperforge.worker.paper_resolver search --title "..." --author "..." --year ... --domain "..." --vault "$VAULT"` | - ---- - -## 输入类型判断 - -### 类型 1: Zotero Key(8位字母数字组合) - -``` -$PYTHON -m paperforge.worker.paper_resolver resolve-key --vault "$VAULT" -``` - -返回 JSON 含 `key`, `title`, `domain`, `formal_note_path`, `ocr_path`, `fulltext_path`, `ocr_status` 等。所有路径由 `paperforge.json` 配置决定。 - -### 类型 2: DOI(以 `10.` 开头,可能带 URL 前缀) - -``` -$PYTHON -m paperforge.worker.paper_resolver resolve-doi "" --vault "$VAULT" -``` - -返回格式同类型 1。 - -### 类型 3: 标题片段 - -``` -$PYTHON -m paperforge.worker.paper_resolver search --title "..." --vault "$VAULT" -``` - -返回 `{"matches": [...], "count": N}`。 - -### 类型 4: 作者 + 年份 - -``` -$PYTHON -m paperforge.worker.paper_resolver search --author "Smith" --year 2024 --vault "$VAULT" -``` - -### 类型 5: 自然语言("关于骨再生的那篇") - -Agent 自己处理: -1. 读 `$IDX_PATH`(已由 pf_bootstrap 提供) -2. 读 `index_path` 指向的 `formal-library.json` -3. 在 `title`、`domain`、`journal`、`abstract` 中搜匹配 -4. 
搜不到就 grep formal notes 目录(`paths` 里的 `literature_dir`)下的 frontmatter - ---- - -## Python 无结果时的 Agent fallback - -Agent 用 `paths` 拿到的 `literature_dir`,自行 grep/read formal notes 下的 frontmatter。 - -## 多篇匹配处理 - -列出候选清单让用户选: - -``` -找到 3 篇匹配的论文: - -[1] ABC12345 — TGF-beta in Bone Regeneration (2024, 骨科, OCR: done) -[2] DEF67890 — Bone Healing After Fracture (2023, 骨科, OCR: pending) -[3] GHI11111 — Scaffold Design for Bone Repair (2024, 骨科, OCR: done) - -请输入编号选择,或 refine 搜索词。 -``` - -## Fallback 顺序 - -``` -输入 - │ - ├── 像 key/DOI/标题/作者年份? - │ └── Python paper_resolver → 有/无结果 → Agent 兜底 - │ - └── 自然语言? - └── Agent 读 formal-library.json → 搜 → 有/无 -``` diff --git a/paperforge/skills/literature-qa/references/paper-search.md b/paperforge/skills/literature-qa/references/paper-search.md deleted file mode 100644 index 57aa634..0000000 --- a/paperforge/skills/literature-qa/references/paper-search.md +++ /dev/null @@ -1,97 +0,0 @@ -# 文献检索工作流 - -轻量流程:用户想**在库里找文献**(不涉及精读或问答)。 - ---- - -## Stage 状态机 - -你必须明确知道当前在哪个 stage。每完成一个 stage 问自己:"下一步是什么?" 不要在 stage 之间来回跳跃。 - -| Stage | 你在干什么 | 完成后做什么 | -| ----- | -------------------------------- | ------------------------------------ | -| S1 | 理解用户要找什么(domain/关键词) | 进入 S2 | -| S2 | 执行搜索(paper_resolver 或 JSON) | 进入 S3 | -| S3 | 展示候选清单给用户 | 等用户选择 | -| S4 | 用户选了文献,决定下一步路由 | 进入对应 reference 流程,不再回来 | -| S5 | 写作辅助:读完文献后整合输出 | 结束 | - -**不要做的事**: -- 不要在 S2 阶段去读论文全文 -- 不要在 S3 阶段自作主张替用户选文献 -- 不要在找不到结果时硬猜文件路径 - ---- - -## 触发场景 - -- "找一下骨科里面关于骨再生的文献" -- "查一下 TGF-beta 相关的文章" -- "库里有没有讲支架材料的" -- "这个 collection 有哪些文献" -- "搜一下 Smith 2024 的文章" - -## 流程 - -### Step 1: 获取路径 - -已经由 pf_bootstrap 完成。直接用 `paths` JSON 里的 `index_path` 和 `literature_dir`。 - -### Step 2: 解析用户意图 - -从用户输入提取: -- **domain**(如果有):`骨科`、`运动医学` 等 → 对应 `literature_dir` 子目录 -- **关键词**:标题、作者、年份、期刊、主题词 -- **collection 路径**:Zotero 子分类,如 `电刺激软骨修复综述` - -### Step 3: 搜索 - -**优先:Python paper_resolver**(确定性匹配) - -``` -$PYTHON -m paperforge.worker.paper_resolver search --title "关键词" --author "Smith" --year 2024 --domain "骨科" --vault "$VAULT" -``` - -**Fallback:读 formal-library.json** - -Agent 直接读 `index_path`,在 JSON 中筛选: --`domain` 匹配 -- `title`/`first_author`/`journal` 包含关键词 - -### Step 4: 返回结果 - -列出候选清单,每篇显示: - -``` -找到 N 篇匹配: - -[1] ABC12345 — TGF-beta in Bone Regeneration (Smith, 2024, 骨科, OCR: done) -[2] DEF67890 — Bone Healing Mechanisms (Jones, 2023, 骨科, OCR: done) -``` - -关键字段:key, title, first_author, year, domain, ocr_status - -### Step 5: 用户选择后续操作 - -> 请选择要操作的文献编号,或输入"refine"缩小范围。 - -选中文献后,按用户意图自动进入对应路由: -- `精读这篇` → 进入 [deep-reading.md](deep-reading.md) 流程 -- `这篇讲了什么` → 进入 [paper-qa.md](paper-qa.md) 流程 -- 不需要继续 → 结束 - -### Step 6: 写作辅助场景 - -如果用户原始意图包含**写作/优化/参考文献/综述/引用**等,搜索结果不是终点: - -1. 提示用户圈选最相关的 3-5 篇 -2. 对每篇进入 [deep-reading.md](deep-reading.md) 或至少通读 formal note + fulltext 关键段落 -3. 读完所有选定论文后,Agent 整合知识辅助写作 - -> 示例:"我从库里 X 篇文献中提取了以下关键发现……要不要基于这些帮你写 XX 部分?" 
- -## 注意事项 - -- 如果是大型 library(>500 篇),优先用 paper_resolver 而不是全量读 JSON -- OCR status 为 `done` 的论文可以读 fulltext 内容 -- OCR status 为 `pending` 的只有 formal note frontmatter diff --git a/paperforge/skills/literature-qa/references/save-session.md b/paperforge/skills/literature-qa/references/save-session.md deleted file mode 100644 index 75c776e..0000000 --- a/paperforge/skills/literature-qa/references/save-session.md +++ /dev/null @@ -1,55 +0,0 @@ -# 保存讨论记录 - -将 paper-qa 会话中的 Q&A 记录持久化到论文工作区。 - ---- - -## 触发条件 - -- 用户显式说 "保存"、"保存记录"、"结束"、"完成讨论"、"save" -- 或显式输入 `pf-end` -- 不要自动触发 - ---- - -## 执行 - -### Step 1: 收集 Q&A 对 - -汇总本次 paper-qa 会话中所有 Q&A,序列化为 JSON 数组: - -```json -[ - { - "question": "用户的问题", - "answer": "Agent 的回答", - "source": "user_question", - "timestamp": "2026-05-10T12:00:00+08:00" - } -] -``` - -`source` 为 `"user_question"`(用户提问)或 `"agent_analysis"`(Agent 主动分析)。 - -### Step 2: 调用 discussion 模块 - -```bash -$PYTHON -m paperforge.worker.discussion record \ - --vault "$VAULT" \ - --agent pf-paper \ - --model "" \ - --qa-pairs '' -``` - -### Step 3: 确认结果 - -CLI 返回 `{"status": "ok", ...}` → 告知用户记录已保存。 - -返回 `{"status": "error"}` → 记录错误,重试一次。仍失败则告知用户。 - ---- - -## 注意事项 - -- 仅 paper-qa 会话需要记录。deep-reading 的内容直接写入 formal note,不需要通过本文件。 -- 如果无法从 formal-library.json 找到论文 domain/title,记录失败不应影响用户使用。 diff --git a/paperforge/skills/paperforge/SKILL.md b/paperforge/skills/paperforge/SKILL.md new file mode 100644 index 0000000..0b55005 --- /dev/null +++ b/paperforge/skills/paperforge/SKILL.md @@ -0,0 +1,118 @@ +--- +name: paperforge +description: > + Research Memory Runtime — 文献搜索、精读、问答、阅读笔记、 + 工作记录、方法论提取。Triggered by: + pf-deep pf-paper pf-sync pf-ocr pf-status, + "精读" "找文献" "搜文献" "文献问答" "读一下" "看看这篇" + "讨论" "记录阅读" "记录工作" "总结会话" "提取方法论". +source: paperforge +--- + +# PaperForge — Research Memory Runtime + +PaperForge 将文献、阅读痕迹、工作过程、方法论和产物 +组织成可检索、可复核、可由 agent 调用的研究记忆。 + +--- + +## 1. Bootstrap — 必须先执行 + +```bash +python $SKILL_DIR/scripts/pf_bootstrap.py --vault "$VAULT" +``` + +返回 JSON。记录以下变量(所有 workflow 文件继承,不再重复声明): + +| 变量 | JSON 字段 | 用途 | +| ------------- | ----------------------- | ------------------------------ | +| `$VAULT` | `vault_root` | 所有 `--vault` 参数 | +| `$PYTHON` | `python_candidate` | 所有 `python -m paperforge` 调用 | +| `$LIT_DIR` | `paths.literature_dir` | 文献笔记根目录 | +| `$SKILL_DIR` | 平台注入 | 脚本路径 | +| `$METHODS` | `methodology_index` | 可用方法论索引 | + +如果 `ok: false`,报告 `error` 给用户,**停止。禁止自己拼路径。** + +如果 `python_verified` 为 `false` 或 `python_candidate` 为 `null`: +依次尝试 `python` 再 `python3`。全部失败则停止,提示用户在 `paperforge.json` 中设置 `python_path`。 + +--- + +## 2. Agent Context — bootstrap 成功后执行 + +```bash +$PYTHON -m paperforge agent-context --json --vault "$VAULT" +``` + +返回 library overview、collection tree、可用命令和规则。Agent 注入为会话上下文。 + +--- + +## 3. Methodology Index — bootstrap 自动提供 + +bootstrap 已返回 `methodology_index`(从 `System/PaperForge/methodology/archive/` 扫描)。 +Agent 在需要时自行读取对应卡片(`read System/PaperForge/methodology/archive/.md`)。 + +--- + +## 4. Reading-Log Safety Rule — 全局规则,所有 workflow 必须遵守 + +Reading-log 不是事实源。它记录的是**之前的关注点、解读和预期用途**。 + +当存在 prior reading-log 时: +1. 用它决定**优先复查什么**,不是用它回答用户问题 +2. 重新打开**原文/图表/表格**,核实之前的解读 +3. 确认的,说明"已回原文复核" +4. 被推翻的,创建 correction note +5. **绝对禁止**仅根据 reading-log 内容回答事实性问题 + +--- + +## 5. 
意图路由 + +用户输入对应唯一一个 workflow 文件(打开并执行其完整流程): + +| 用户说 | 打开 | +| -------------------------------------------------------- | -------------------------------- | +| "找文献" "搜" "库里有没有XX" "collection 里关于YY" | `workflows/paper-search.md` | +| "精读 " "/pf-deep" "三阶段阅读" | `workflows/deep-reading.md` | +| "读一下" "看看" "讨论" "/pf-paper" " 这篇讲了什么" | `workflows/paper-qa.md` | +| "记一下" "记录阅读" "reading log" "读完这段记一下" | `workflows/reading-log.md` | +| "总结会话" "工作记录" "项目记录" "project log" "记决策" | `workflows/project-log.md` | +| "提取方法论" "总结规律" "存档写作规律" | `workflows/methodology.md` | +| "branch" "代码审查" "feature" "dashboard" "memory layer" "用户反馈" "报错" "安装失败" "Git" "Zotero" "BetterBibTeX" "OCR" "插件" | `workflows/project-engineering.md` | +| 不确定 / 空输入 | 问用户:搜文献、精读、问答、记笔记、记工作、提方法论? | + +路由后如用户切换意图,重新判断并打开对应 workflow。 + +--- + +## 6. 全局禁止规则 + +- **禁止自行拼接文件路径**。所有路径从 bootstrap 或 paper-context 获取。 +- **禁止绕过 CLI 直接操作文件**。搜索用 `$PYTHON -m paperforge search`,不用 glob/grep 扫库。 +- **禁止在未完成 paper-context 检查前读取原文**(适用于 deep-reading、paper-qa)。 + +--- + +## 文件结构 + +``` +paperforge/ +├── SKILL.md ← 本文件(compound:启动注入 + 路由 + 全局规则) +├── workflows/ ← molecules:原子序列 + 分支条件 +│ ├── paper-search.md +│ ├── deep-reading.md +│ ├── paper-qa.md +│ ├── reading-log.md +│ ├── project-log.md +│ ├── methodology.md +│ └── project-engineering.md +├── references/ ← 共享参考 +│ ├── chart-reading/ ← 19 种图表阅读指南 +│ └── method-card-template.md +└── scripts/ ← 脚本 atoms + ├── pf_bootstrap.py + └── pf_deep.py +``` diff --git "a/paperforge/skills/literature-qa/references/chart-reading/GSEA\345\257\214\351\233\206\345\233\276.md" "b/paperforge/skills/paperforge/references/chart-reading/GSEA\345\257\214\351\233\206\345\233\276.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/GSEA\345\257\214\351\233\206\345\233\276.md" rename to "paperforge/skills/paperforge/references/chart-reading/GSEA\345\257\214\351\233\206\345\233\276.md" diff --git a/paperforge/skills/literature-qa/references/chart-reading/INDEX.md b/paperforge/skills/paperforge/references/chart-reading/INDEX.md similarity index 100% rename from paperforge/skills/literature-qa/references/chart-reading/INDEX.md rename to paperforge/skills/paperforge/references/chart-reading/INDEX.md diff --git "a/paperforge/skills/literature-qa/references/chart-reading/ROC\344\270\216PR\346\233\262\347\272\277.md" "b/paperforge/skills/paperforge/references/chart-reading/ROC\344\270\216PR\346\233\262\347\272\277.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/ROC\344\270\216PR\346\233\262\347\272\277.md" rename to "paperforge/skills/paperforge/references/chart-reading/ROC\344\270\216PR\346\233\262\347\272\277.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/Western Blot\346\235\241\345\270\246\345\233\276.md" "b/paperforge/skills/paperforge/references/chart-reading/Western Blot\346\235\241\345\270\246\345\233\276.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/Western Blot\346\235\241\345\270\246\345\233\276.md" rename to "paperforge/skills/paperforge/references/chart-reading/Western Blot\346\235\241\345\270\246\345\233\276.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\345\205\215\347\226\253\350\215\247\345\205\211\345\256\232\351\207\217\345\233\276.md" "b/paperforge/skills/paperforge/references/chart-reading/\345\205\215\347\226\253\350\215\247\345\205\211\345\256\232\351\207\217\345\233\276.md" similarity index 100% rename from 
"paperforge/skills/literature-qa/references/chart-reading/\345\205\215\347\226\253\350\215\247\345\205\211\345\256\232\351\207\217\345\233\276.md" rename to "paperforge/skills/paperforge/references/chart-reading/\345\205\215\347\226\253\350\215\247\345\205\211\345\256\232\351\207\217\345\233\276.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\346\212\230\347\272\277\345\233\276\344\270\216\346\227\266\351\227\264\345\272\217\345\210\227.md" "b/paperforge/skills/paperforge/references/chart-reading/\346\212\230\347\272\277\345\233\276\344\270\216\346\227\266\351\227\264\345\272\217\345\210\227.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\346\212\230\347\272\277\345\233\276\344\270\216\346\227\266\351\227\264\345\272\217\345\210\227.md" rename to "paperforge/skills/paperforge/references/chart-reading/\346\212\230\347\272\277\345\233\276\344\270\216\346\227\266\351\227\264\345\272\217\345\210\227.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\346\225\243\347\202\271\345\233\276\344\270\216\346\260\224\346\263\241\345\233\276.md" "b/paperforge/skills/paperforge/references/chart-reading/\346\225\243\347\202\271\345\233\276\344\270\216\346\260\224\346\263\241\345\233\276.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\346\225\243\347\202\271\345\233\276\344\270\216\346\260\224\346\263\241\345\233\276.md" rename to "paperforge/skills/paperforge/references/chart-reading/\346\225\243\347\202\271\345\233\276\344\270\216\346\260\224\346\263\241\345\233\276.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\346\230\276\345\276\256\347\205\247\347\211\207\344\270\216SEM\345\233\276.md" "b/paperforge/skills/paperforge/references/chart-reading/\346\230\276\345\276\256\347\205\247\347\211\207\344\270\216SEM\345\233\276.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\346\230\276\345\276\256\347\205\247\347\211\207\344\270\216SEM\345\233\276.md" rename to "paperforge/skills/paperforge/references/chart-reading/\346\230\276\345\276\256\347\205\247\347\211\207\344\270\216SEM\345\233\276.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\346\235\241\345\275\242\345\233\276\344\270\216\350\257\257\345\267\256\346\243\222.md" "b/paperforge/skills/paperforge/references/chart-reading/\346\235\241\345\275\242\345\233\276\344\270\216\350\257\257\345\267\256\346\243\222.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\346\235\241\345\275\242\345\233\276\344\270\216\350\257\257\345\267\256\346\243\222.md" rename to "paperforge/skills/paperforge/references/chart-reading/\346\235\241\345\275\242\345\233\276\344\270\216\350\257\257\345\267\256\346\243\222.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\346\241\221\345\237\272\345\233\276\344\270\216\345\274\246\345\233\276.md" "b/paperforge/skills/paperforge/references/chart-reading/\346\241\221\345\237\272\345\233\276\344\270\216\345\274\246\345\233\276.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\346\241\221\345\237\272\345\233\276\344\270\216\345\274\246\345\233\276.md" rename to "paperforge/skills/paperforge/references/chart-reading/\346\241\221\345\237\272\345\233\276\344\270\216\345\274\246\345\233\276.md" diff --git 
"a/paperforge/skills/literature-qa/references/chart-reading/\346\243\256\346\236\227\345\233\276\344\270\216Meta\345\210\206\346\236\220.md" "b/paperforge/skills/paperforge/references/chart-reading/\346\243\256\346\236\227\345\233\276\344\270\216Meta\345\210\206\346\236\220.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\346\243\256\346\236\227\345\233\276\344\270\216Meta\345\210\206\346\236\220.md" rename to "paperforge/skills/paperforge/references/chart-reading/\346\243\256\346\236\227\345\233\276\344\270\216Meta\345\210\206\346\236\220.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\347\201\253\345\261\261\345\233\276\344\270\216\346\233\274\345\223\210\351\241\277\345\233\276.md" "b/paperforge/skills/paperforge/references/chart-reading/\347\201\253\345\261\261\345\233\276\344\270\216\346\233\274\345\223\210\351\241\277\345\233\276.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\347\201\253\345\261\261\345\233\276\344\270\216\346\233\274\345\223\210\351\241\277\345\233\276.md" rename to "paperforge/skills/paperforge/references/chart-reading/\347\201\253\345\261\261\345\233\276\344\270\216\346\233\274\345\223\210\351\241\277\345\233\276.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\347\203\255\345\233\276\344\270\216\350\201\232\347\261\273\345\233\276.md" "b/paperforge/skills/paperforge/references/chart-reading/\347\203\255\345\233\276\344\270\216\350\201\232\347\261\273\345\233\276.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\347\203\255\345\233\276\344\270\216\350\201\232\347\261\273\345\233\276.md" rename to "paperforge/skills/paperforge/references/chart-reading/\347\203\255\345\233\276\344\270\216\350\201\232\347\261\273\345\233\276.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\347\224\237\345\255\230\346\233\262\347\272\277.md" "b/paperforge/skills/paperforge/references/chart-reading/\347\224\237\345\255\230\346\233\262\347\272\277.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\347\224\237\345\255\230\346\233\262\347\272\277.md" rename to "paperforge/skills/paperforge/references/chart-reading/\347\224\237\345\255\230\346\233\262\347\272\277.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\347\256\261\345\274\217\345\233\276\344\270\216\345\260\217\346\217\220\347\220\264\345\233\276.md" "b/paperforge/skills/paperforge/references/chart-reading/\347\256\261\345\274\217\345\233\276\344\270\216\345\260\217\346\217\220\347\220\264\345\233\276.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\347\256\261\345\274\217\345\233\276\344\270\216\345\260\217\346\217\220\347\220\264\345\233\276.md" rename to "paperforge/skills/paperforge/references/chart-reading/\347\256\261\345\274\217\345\233\276\344\270\216\345\260\217\346\217\220\347\220\264\345\233\276.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\347\273\204\347\273\207\345\255\246\345\215\212\345\256\232\351\207\217\345\233\276.md" "b/paperforge/skills/paperforge/references/chart-reading/\347\273\204\347\273\207\345\255\246\345\215\212\345\256\232\351\207\217\345\233\276.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\347\273\204\347\273\207\345\255\246\345\215\212\345\256\232\351\207\217\345\233\276.md" 
rename to "paperforge/skills/paperforge/references/chart-reading/\347\273\204\347\273\207\345\255\246\345\215\212\345\256\232\351\207\217\345\233\276.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\347\275\221\347\273\234\345\233\276\344\270\216\351\200\232\350\267\257\345\233\276.md" "b/paperforge/skills/paperforge/references/chart-reading/\347\275\221\347\273\234\345\233\276\344\270\216\351\200\232\350\267\257\345\233\276.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\347\275\221\347\273\234\345\233\276\344\270\216\351\200\232\350\267\257\345\233\276.md" rename to "paperforge/skills/paperforge/references/chart-reading/\347\275\221\347\273\234\345\233\276\344\270\216\351\200\232\350\267\257\345\233\276.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\350\233\213\347\231\275\350\264\250\347\273\223\346\236\204\345\233\276.md" "b/paperforge/skills/paperforge/references/chart-reading/\350\233\213\347\231\275\350\264\250\347\273\223\346\236\204\345\233\276.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\350\233\213\347\231\275\350\264\250\347\273\223\346\236\204\345\233\276.md" rename to "paperforge/skills/paperforge/references/chart-reading/\350\233\213\347\231\275\350\264\250\347\273\223\346\236\204\345\233\276.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\351\231\215\347\273\264\345\233\276(PCA-tSNE-UMAP).md" "b/paperforge/skills/paperforge/references/chart-reading/\351\231\215\347\273\264\345\233\276(PCA-tSNE-UMAP).md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\351\231\215\347\273\264\345\233\276(PCA-tSNE-UMAP).md" rename to "paperforge/skills/paperforge/references/chart-reading/\351\231\215\347\273\264\345\233\276(PCA-tSNE-UMAP).md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\351\233\267\350\276\276\345\233\276\344\270\216\346\274\217\346\226\227\345\233\276.md" "b/paperforge/skills/paperforge/references/chart-reading/\351\233\267\350\276\276\345\233\276\344\270\216\346\274\217\346\226\227\345\233\276.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\351\233\267\350\276\276\345\233\276\344\270\216\346\274\217\346\226\227\345\233\276.md" rename to "paperforge/skills/paperforge/references/chart-reading/\351\233\267\350\276\276\345\233\276\344\270\216\346\274\217\346\226\227\345\233\276.md" diff --git a/paperforge/skills/paperforge/references/method-card-template.md b/paperforge/skills/paperforge/references/method-card-template.md new file mode 100644 index 0000000..0ae4add --- /dev/null +++ b/paperforge/skills/paperforge/references/method-card-template.md @@ -0,0 +1,39 @@ +# Method Card Template + +复制此模板创建新的方法论卡片。 + +--- + + + + +# <标题:简短、可搜索> + +## Use when + + +## Procedure + + +1. <步骤 1> +2. <步骤 2> +3. 
<步骤 3> + +## Watch-outs + + +- <注意 1> +- <注意 2> + +## Example + + +--- + diff --git a/paperforge/skills/literature-qa/scripts/pf_bootstrap.py b/paperforge/skills/paperforge/scripts/pf_bootstrap.py similarity index 54% rename from paperforge/skills/literature-qa/scripts/pf_bootstrap.py rename to paperforge/skills/paperforge/scripts/pf_bootstrap.py index 87bd211..a1a99d1 100644 --- a/paperforge/skills/literature-qa/scripts/pf_bootstrap.py +++ b/paperforge/skills/paperforge/scripts/pf_bootstrap.py @@ -18,7 +18,11 @@ }, "domains": ["domain1", "domain2"], "index_summary": {"domain1": 120, "domain2": 80}, - "python_candidate": "D:\\...\\python.exe" // Python that has paperforge, or null + "python_candidate": "D:\\...\\python.exe", + "methodology_index": [ + {"id": "parameter-window-audit", "description": "比较多个研究的参数和剂量反应"}, + ... + ] } If anything fails: ok=false, error explains why. @@ -50,8 +54,8 @@ def _read_pf_config(pf_json: Path) -> dict: return json.load(f) -def _find_python_with_paperforge(vault: Path, pf_cfg: dict) -> str | None: - """Find a Python executable that has paperforge installed.""" +def _find_python_with_paperforge(vault: Path, pf_cfg: dict) -> tuple[str | None, bool]: + """Find a Python executable. Returns (candidate, verified_has_paperforge).""" candidates = [] # 1. Explicit python_path in config @@ -75,10 +79,76 @@ def _find_python_with_paperforge(vault: Path, pf_cfg: dict) -> str | None: encoding="utf-8", errors="replace", ) if result.returncode == 0 and "paperforge" in result.stdout.lower(): - return str(candidate) + return (str(candidate), True) except Exception: continue - return None + + # Fallback: check system python/python3 only (no paperforge verification) + for fallback in ["python", "python3"]: + try: + result = subprocess.run( + [fallback, "--version"], + capture_output=True, text=True, timeout=10, + encoding="utf-8", errors="replace", + ) + if result.returncode == 0: + return (fallback, False) + except Exception: + continue + + return (None, False) + + +def _scan_methodology_archive(pf_root: Path) -> list[dict]: + """Scan methodology archive directory for available method cards.""" + archive_dir = pf_root / "methodology" / "archive" + if not archive_dir.exists(): + return [] + + methods = [] + for f in sorted(archive_dir.glob("*.md")): + try: + text = f.read_text(encoding="utf-8") + # Extract first heading as title, first paragraph after "Use when" as description + title = "" + description = "" + in_use_when = False + for line in text.split("\n"): + stripped = line.strip() + if stripped.startswith("# ") and not title: + title = stripped.lstrip("# ").strip() + elif stripped.startswith("## Use when"): + in_use_when = True + elif in_use_when and stripped and not stripped.startswith("#"): + description = stripped + in_use_when = False + methods.append({ + "id": f.stem, + "title": title or f.stem, + "description": description or "(no description)", + }) + except Exception: + continue + return methods + + +DEFAULTS = { + "system_dir": "System", + "resources_dir": "Resources", + "literature_dir": "Literature", + "control_dir": "LiteratureControl", + "base_dir": "Bases", +} + + +def resolve_cfg(raw: dict) -> dict: + """Resolve config with vault_config nested support and legacy flat keys.""" + cfg = DEFAULTS.copy() + nested = raw.get("vault_config", {}) + if isinstance(nested, dict): + cfg.update({k: v for k, v in nested.items() if v}) + cfg.update({k: raw[k] for k in DEFAULTS if raw.get(k)}) + return cfg def main(): @@ -115,6 +185,7 @@ def main(): json.dump(result, 
sys.stdout, ensure_ascii=False)
         sys.exit(0)
 
+    cfg = resolve_cfg(cfg)
     system_dir = cfg.get("system_dir", "System")
     resources_dir = cfg.get("resources_dir", "Resources")
     literature_dir = cfg.get("literature_dir", "Literature")
@@ -154,7 +225,40 @@ def main():
     result["index_summary"] = index_summary
 
     # --- 6. Find Python that has paperforge (best effort) ---
-    result["python_candidate"] = _find_python_with_paperforge(vault, cfg)
+    py_candidate, py_verified = _find_python_with_paperforge(vault, cfg)
+    if py_candidate:
+        result["python_candidate"] = py_candidate
+        result["python_verified"] = py_verified
+    else:
+        result["python_candidate"] = "python"
+        result["python_verified"] = False
+
+    # --- 7. Memory layer state ---
+    memory_layer = {"available": False, "paper_count": 0, "fts_search": False, "vector_search": False}
+    idx_path = Path(paths["index_path"])
+    dc_json = vault / ".obsidian" / "plugins" / "paperforge" / "data.json"
+    if idx_path.exists():
+        try:
+            with open(idx_path, encoding="utf-8") as f:
+                data = json.load(f)
+            items = data.get("items", []) if isinstance(data, dict) else data
+            memory_layer["paper_count"] = len(items)
+            memory_layer["available"] = True
+            memory_layer["fts_search"] = True
+        except Exception:
+            pass
+    if dc_json.exists():
+        try:
+            with open(dc_json, encoding="utf-8") as f:
+                plugin_data = json.load(f)
+            vector_enabled = plugin_data.get("features", {}).get("vector_db", False)
+            memory_layer["vector_search"] = vector_enabled
+        except Exception:
+            pass
+    result["memory_layer"] = memory_layer
+
+    # --- 8. Scan methodology archive ---
+    result["methodology_index"] = _scan_methodology_archive(pf_root)
 
     result["ok"] = True
     json.dump(result, sys.stdout, ensure_ascii=False, indent=2)
diff --git a/paperforge/skills/literature-qa/scripts/ld_deep.py b/paperforge/skills/paperforge/scripts/pf_deep.py
similarity index 100%
rename from paperforge/skills/literature-qa/scripts/ld_deep.py
rename to paperforge/skills/paperforge/scripts/pf_deep.py
diff --git a/paperforge/skills/paperforge/workflows/deep-reading.md b/paperforge/skills/paperforge/workflows/deep-reading.md
new file mode 100644
index 0000000..889359f
--- /dev/null
+++ b/paperforge/skills/paperforge/workflows/deep-reading.md
@@ -0,0 +1,172 @@
+# deep-reading
+
+> **Safety Rule:** Prior reading-log entries are recheck targets only, never factual answers.
+> Always verify against original source before using any reading-log content.
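To make the safety rule concrete: Step 0 below collects `recheck_targets` from `data.prior_notes`. A minimal sketch, outside the diff, of how an agent-side helper might partition those entries, assuming each entry carries the `verified` flag documented in the reading-log schema later in this PR; `split_prior_notes` is a hypothetical name, not part of the codebase:

```python
def split_prior_notes(prior_notes: list[dict]) -> tuple[list[dict], list[dict]]:
    """Partition prior reading-log entries per the safety rule.

    Entries with verified=False become recheck targets: re-open the
    original figures/sections before trusting them. Entries with
    verified=True may be cited, but labeled as previously verified
    rather than re-asserted as fresh facts.
    """
    recheck_targets = [n for n in prior_notes if not n.get("verified", False)]
    previously_verified = [n for n in prior_notes if n.get("verified", False)]
    return recheck_targets, previously_verified
```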
+
+Keshav 三阶段精读。在 formal note 中写入结构化的 `## 精读` 区域。
+
+---
+
+## 前置检查
+
+### Step 0: paper-context(必须)
+
+```bash
+$PYTHON -m paperforge paper-context <zotero_key> --json --vault "$VAULT"
+```
+
+检查返回 JSON:
+- `ok: false` → 报告 `error.message`,停止
+- `data.paper.ocr_status != "done"` → "OCR 未完成,请先运行 paperforge ocr",停止
+- `data.paper.analyze != true` → "analyze 未开启,请在 formal note frontmatter 中设为 true",停止
+
+**检查 prior_notes:**
+- 如果存在 `data.prior_notes`,逐条看 `verified` 字段
+- `verified: false` 的条目记入 recheck_targets,精读时必须回原文复核这些位置
+- `verified: true` 的条目可以信任,但标注"之前已验证"
+
+**记录关键路径:**
+- `data.paper.note_path`(formal note 路径)
+- `data.paper.fulltext_path`(fulltext 路径)
+- 记下 `recheck_targets` 列表
+
+---
+
+## 执行流程
+
+### Step 1: Prepare(跑脚本)
+
+```bash
+$PYTHON "$SKILL_DIR/scripts/pf_deep.py" prepare --key <zotero_key> --vault "$VAULT"
+```
+
+解析返回 JSON:
+- `status: "ok"` → 记下 `figure_map`、`chart_type_map`、`formal_note`、`fulltext_md`、`figures`、`tables` 的路径和数量
+- `status: "warn"` + `deep_reading_status: done` → 告知用户"该文献已精读过",确认是否重读
+- `status: "error"` → 报告 `message`,停止
+
+读 formal note,确认 `## 精读` 骨架已插入。
+
+---
+
+### Step 2: Pass 1 — 概览
+
+只填 `### Pass 1: 概览`。不碰 Pass 2/3。
+
+填写内容必须来自原文,不可推断:
+
+- **一句话总览**:论文类型 + 核心发现,一句话
+- **5 Cs 快速评估**:
+  - Category(RCT / 队列 / 综述 / 基础研究等)
+  - Context(领域共识,本文要解决什么)
+  - Correctness(初步直觉,逻辑有否明显漏洞)
+  - Contributions(1-3 条)
+  - Clarity(写作质量,图表可读性)
+- **Figure 导读**(基于 fulltext 浏览各图 caption):
+  - 关键主图:列出,一句话概括要证明什么
+  - 证据转折点:哪个 figure 是叙事关键转折
+  - 需要重点展开的 supplementary
+  - 关键表格
+
+填完立即保存。
+
+---
+
+### Step 3: Pass 2 — 精读还原
+
+填 `### Pass 2: 精读还原`。**按 figure 顺序逐个处理**。
+
+每处理完一个 figure 立即保存。
+
+#### 图表类型定位(两步)
+
+**A: 读 chart-type-map**(prepare 输出中包含该路径)。这是关键词命中建议。
+
+**B: Agent 读 caption 做最终判断**
+1. 读该 figure 的 caption(来自 fulltext)
+2. 打开 `references/chart-reading/INDEX.md`,对照 caption 内容判断图表类型
+3. chart-type-map 建议和 Agent 判断不一致时 → 以 Agent 判断为准
+4. 无法确定类型 → 跳过 chart guide,按通用结构分析
+5. 确定类型 → 读对应 chart-reading 指南,按指南中的检查清单分析
+
+#### 每张 Figure 的子标题(固定,不可跳过)
+
+```
+**图像定位与核心问题**:页码 + 要回答什么问题
+**方法与结果**:实验设计 / 数据来源 / 技术手段;核心数据、趋势、对比
+**图表质量审查**:按 chart-reading 指南检查坐标轴、单位、误差棒、统计标注
+**作者解释**:作者在正文中对该图的解读
+**我的理解**:自己的理解(必须与作者解释做明显区分)
+**疑点/局限**:用 `> [!warning]` 突出
+```
+
+#### 每张 Table 的子标题(简化版)
+
+```
+回答什么问题、关键字段/分组、主要结果、我的理解、疑点/局限
+```
+
+#### 所有 figure/table 处理完后
+
+**关键方法补课**:简要解释不熟悉的实验技术(1-2 项)
+
+**主要发现与新意**:
+- 发现 1:...(来源:Figure X)
+- 发现 2:...(来源:Table Y)
+- 每条发现必须标注来源(Figure 编号或正文段落)
+
+---
+
+### Step 4: Postprocess(跑校验,修正问题)
+
+```bash
+$PYTHON "$SKILL_DIR/scripts/pf_deep.py" postprocess-pass2 "<note_path>" --figures <N> --vault "$VAULT"
+```
+
+- 输出 `OK` → 继续 Step 5
+- 输出错误列表(含行号)→ 按提示修正,修正后重新跑
+- 最多 3 轮修正。3 轮后仍失败 → 报告剩余错误给用户
+
+---
+
+### Step 5: Pass 3 — 深度理解
+
+填 `### Pass 3: 深度理解`。基于 Pass 1/2 已写内容。
+
+- **假设挑战与隐藏缺陷**:隐含假设;放宽假设后结论还成立吗;缺少的关键引用;实验/分析技术潜在问题
+- **哪些结论扎实,哪些仍存疑**:
+  - 较扎实:...
+  - 仍存疑:...(用 `> [!warning]`)
+- **Discussion 与 Conclusion 怎么读**:作者实际完成了什么;哪些有拔高;哪些是推测
+- **对我的启发**:研究设计、figure 组织、方法组合、未来工作
+- **遗留问题**:...(用 `> [!question]`)
+
+---
+
+### Step 6: Final Validation
+
+```bash
+$PYTHON "$SKILL_DIR/scripts/pf_deep.py" validate-note "<note_path>" --fulltext "<fulltext_path>"
+```
+
+- 输出 `OK` → 告知用户精读完成
+- 输出错误 → 修正缺失项,直到通过
+
+---
+
+## Callout 格式规则
+
+- `> [!important]` — 每个 main finding
+- `> [!warning]` — 疑问、局限、证据边界、仍存疑条目
+- `> [!question]` — 遗留问题
+- **相邻 callout 之间必须有空行**(否则 Obsidian 合并):
+  - 正确:`> [!important] A\n\n> [!important] B`
+  - 错误:`> [!important] A\n> [!important] B`
+
+---
+
+## 禁止
+
+- 不要在 Pass 1 完成前碰 Pass 2/3
+- 不要把推断写成文献事实——区分"作者说了 X"和"我推断 Y"
+- 不要跨 figure 写综合判断(Pass 2 逐图,Pass 3 才做综合)
diff --git a/paperforge/skills/paperforge/workflows/methodology.md b/paperforge/skills/paperforge/workflows/methodology.md
new file mode 100644
index 0000000..f196817
--- /dev/null
+++ b/paperforge/skills/paperforge/workflows/methodology.md
@@ -0,0 +1,97 @@
+# methodology
+
+> **Scope:** Only archive methods reusable across multiple projects/tasks.
+> Session-specific progress, decisions, and todos go to project-log.
+
+从 project-log 中提取可复用方法论,按 method-card 模板写入 methodology archive。
+不 append 到大文件,每张卡片独立保存。
+
+---
+
+## 前置条件
+
+- bootstrap 已完成
+- 有 project-log 记录可读取
+
+---
+
+## 步骤
+
+### Step 1: 确定项目和来源
+
+询问用户从哪个项目提取。如用户未指定,列出有 project-log 的项目。
+
+### Step 2: 读取 project-log
+
+```bash
+$PYTHON -m paperforge project-log --list "<project>" --json --vault "$VAULT"
+```
+
+扫描其中以下信号:
+
+| log 中的信号 | 可提取为 |
+| ------------------------- | --------------------------- |
+| `detours` 中的教训 | 方法论规则 |
+| `reusable` 字段里的内容 | 直接采用 |
+| `decisions` 中的重要选择 | 决策原则 |
+| 跨文献审计/比较分析 | 审计方法论 |
+| 写作修正/审阅反馈 | 写作检查清单 |
+
+### Step 3: 识别可提取 pattern
+
+对每个 pattern 分类:
+- `review-writing` — 综述框架设计、gap 分析、跨研究审计
+- `argument-writing` — 段落写作、论证结构
+- `analysis-methods` — 文献审计、跨研究比较、参数提取
+- `general` — fallback
+
+### Step 4: 按 method-card 模板生成卡片
+
+打开 `references/method-card-template.md` 确认模板格式。
+
+对每个 pattern 生成一张卡片,展示给用户确认。格式:
+
+```markdown
+---
+id: <id>
+tags: [<tag1>, <tag2>]
+source_project: <project>
+status: active
+---
+
+# <标题>
+
+## Use when
+<什么时候应该用这个方法>
+
+## Procedure
+1. <步骤 1>
+2. <步骤 2>
+...
+
+## Watch-outs
+- <注意事项 1>
+- <注意事项 2>
+
+## Example
+<来自项目的具体例子>
+```
+
+### Step 5: 用户确认后写入
+
+将每张卡片写入:
+
+```
+System/PaperForge/methodology/archive/<id>.md
+```
+
+用 `write` 工具创建文件。如已存在同名文件,追加到末尾(用 `---` 分隔)。
+不自动覆盖已有内容。
+
+---
+
+## 禁止
+
+- 不要提取太泛的"教训"(如"多读文献")——必须有具体的 Procedure 步骤
+- 不要创建超过 4 张卡片/次——优先最可复用的
+- 不要在用户确认前写入
diff --git a/paperforge/skills/paperforge/workflows/paper-qa.md b/paperforge/skills/paperforge/workflows/paper-qa.md
new file mode 100644
index 0000000..4175efa
--- /dev/null
+++ b/paperforge/skills/paperforge/workflows/paper-qa.md
@@ -0,0 +1,108 @@
+# paper-qa
+
+> **Safety Rule:** Prior reading-log entries are recheck targets only, never factual answers.
+> Always verify against original source before using any reading-log content.
+
+交互式文献问答。不强制要求 OCR,但 OCR 完成后回答更准确。
+
+每次问答记录到 `discussion.json`(Dashboard 可见)。
+
+---
+
+## 前置条件
+
+- bootstrap 已完成(有 `$VAULT`、`$PYTHON`)
+
+---
+
+## 步骤
+
+### Step 1: 定位论文
+
+用户可能给 zotero_key、DOI、标题片段、作者+年份。按以下方式查找:
+
+**优先用 paper-context(一次拿到全部信息):**
+
+```bash
+$PYTHON -m paperforge paper-context <zotero_key> --json --vault "$VAULT"
+```
+
+返回 JSON 包含 paper 元数据、OCR 状态、prior_notes 等。
+
+**paper-context 无结果时的备选:**
+
+```bash
+$PYTHON -m paperforge search "<query>" --json --vault "$VAULT" --limit 5
+```
+
+如果多候选,列出让用户选(同 paper-search 的 Step 4-5 格式)。
+如果无结果,告知用户并停止。
+
+### Step 2: 加载文献内容
+
+1.
从 paper-context 或 formal note frontmatter 获取:标题、作者、期刊、年份、domain
+2. 读 `fulltext.md`(如果 OCR done)作为主要回答依据
+3. 如果 fulltext 不存在:"OCR 文本不可用,回答将基于元数据和公开信息"
+
+### Step 3: 展示论文信息 + 进入 Q&A
+
+```
+已加载: <title> (<year>, <journal>)
+作者: <authors> | Key: <zotero_key> | 领域: <domain>
+OCR: done / 不可用
+结束对话时说"保存"即可保存讨论。
+请问有什么问题?
+```
+
+### Step 4: Q&A 循环
+
+- 等待用户提问
+- 每次回答后等待下一个问题
+- 持续到用户说"保存"、"结束"、"完成"
+
+**回答原则:**
+- 严格基于 fulltext.md 中的文本内容
+- 引用原文时标注来源页码/章节
+- 论文未提及的内容明确说明"论文中未提及"
+- 区分"文献说了什么"和"我推断什么"
+
+### Step 5: 保存讨论
+
+用户说"保存"、"结束"、"完成"时执行。
+
+**收集 Q&A 对**,序列化为 JSON 数组:
+
+```json
+[
+  {
+    "question": "用户的问题",
+    "answer": "Agent 的回答",
+    "source": "user_question",
+    "timestamp": "2026-05-14T12:00:00+08:00"
+  }
+]
+```
+
+`source`: `"user_question"`(用户提问)或 `"agent_analysis"`(Agent 主动分析)。
+
+**调 discussion 模块:**
+
+```bash
+$PYTHON -m paperforge.worker.discussion record <zotero_key> \
+  --vault "$VAULT" \
+  --agent pf-paper \
+  --model "<current_model>" \
+  --qa-pairs '<JSON_ARRAY>'
+```
+
+- 返回 `ok` → 告知用户已保存
+- 返回 `error` → 重试一次,仍失败则告知用户
+
+**不要自动保存。** 仅用户明确要求时执行。
+
+---
+
+## 禁止
+
+- 不要捏造论文未提及的内容
+- 不要把推断写成论文事实
diff --git a/paperforge/skills/paperforge/workflows/paper-search.md b/paperforge/skills/paperforge/workflows/paper-search.md
new file mode 100644
index 0000000..285ce55
--- /dev/null
+++ b/paperforge/skills/paperforge/workflows/paper-search.md
@@ -0,0 +1,101 @@
+# paper-search
+
+从文献库中按条件检索文献,返回候选清单及每篇的可用状态。
+
+---
+
+## 前置条件
+
+- bootstrap 已完成(有 `$VAULT`、`$PYTHON`、`$LIT_DIR`)
+
+---
+
+## 步骤
+
+### Step 1: 解析用户搜索意图
+
+提取以下信息(缺什么就问用户):
+- **搜索词**:关键词、作者名、年份
+- **范围**:domain(如"骨科")、collection(如"DC")、不指定=全库
+- **过滤条件**:OCR 状态(done/pending)、年份范围(--year-from/--year-to)、lifecycle
+
+### Step 2: 执行搜索
+
+```bash
+$PYTHON -m paperforge search <query> --json --vault "$VAULT" --limit 15 \
+  [--domain "<domain>"] \
+  [--year-from <N>] [--year-to <N>] \
+  [--ocr <done|pending>] \
+  [--lifecycle <active|archived>]
+```
+
+返回 JSON 结构:
+```json
+{
+  "ok": true,
+  "data": {
+    "query": "<query>",
+    "matches": [
+      {
+        "zotero_key": "ABC12345",
+        "citation_key": "...",
+        "title": "...",
+        "year": "2024",
+        "first_author": "Smith",
+        "domain": "...",
+        "collection_path": "...",
+        "ocr_status": "done",
+        "deep_reading_status": "pending",
+        "lifecycle": "active",
+        "has_pdf": true,
+        "rank": "..."
+ } + ], + "count": 5 + } +} +``` + +- 如果 `ok: false` → 报告 `error.message`,问用户是否换搜索词 +- 如果 `data.count == 0` → 告知用户无结果,建议换词或扩大范围 +- 如果 `data.count > 0` → 进入 Step 3 + +### Step 3: 逐个确认状态(paper-context 原子) + +对每个 match,调 `paper-context` 获取更详细的可读状态: + +```bash +$PYTHON -m paperforge paper-context <zotero_key> --json --vault "$VAULT" +``` + +目的:拿到 `ocr_status`、`prior_notes` 数量、`analyze` 状态,帮助用户判断哪些可以直接读。 + +### Step 4: 展示候选清单 + +格式(每条一行): + +``` +找到 N 篇匹配 "<query>": + +[1] ABC12345 | Smith 2024 | Title Here | 骨科 | OCR: done | 精读: pending | 阅读笔记: 3 +[2] DEF67890 | Jones 2023 | Title Here | 骨科 | OCR: done | 精读: done | 阅读笔记: 0 +[3] GHI11111 | Wang 2022 | Title Here | 骨科 | OCR: pending | | 阅读笔记: 0 +``` + +关键字段:zotero_key, first_author, year, title, ocr_status, deep_reading_status, prior_notes 数量 + +### Step 5: 等用户选择后续操作 + +展示候选后不要自己决定下一步。等用户说: +- "读一下 [1]" → 路由到 paper-qa.md +- "精读 [2]" → 路由到 deep-reading.md +- "记一下 [1]" → 路由到 reading-log.md +- "缩小范围"/"refine" → 回到 Step 1,加更多过滤条件 + +--- + +## 禁止 + +- 不要在搜索结果中替用户决定读哪篇 +- 不要在搜索阶段读全文 +- 不要对 0 结果硬猜路径 diff --git a/paperforge/skills/paperforge/workflows/project-engineering.md b/paperforge/skills/paperforge/workflows/project-engineering.md new file mode 100644 index 0000000..85a269f --- /dev/null +++ b/paperforge/skills/paperforge/workflows/project-engineering.md @@ -0,0 +1,26 @@ +# Project Engineering + +When user asks about PaperForge codebase issues (branch, code review, feature, +dashboard, memory layer, user feedback, errors, installation, Git, Zotero, +BetterBibTeX, OCR, plugin): + +1. Read `AGENTS.md` and `README.md` for architecture context +2. Use `git log --oneline` and `git diff` to understand recent changes +3. Search codebase with grep/glob as needed +4. Run diagnostics: `python -m paperforge doctor` (if applicable) +5. Present findings and recommend fixes + +Do NOT modify code without user confirmation. + +## Review Dimensions + +When auditing branches, code, or user-reported issues, check: + +1. **Source of truth clarity:** Is data stored in one canonical location? +2. **Derived index rebuildability:** Can SQLite be rebuilt from JSONL? +3. **Agent routing stability:** Will the skill router pick the right workflow? +4. **Obsidian file integrity:** Are .md files still readable with valid frontmatter? +5. **User flow length:** Has the number of manual steps decreased? +6. **Cross-platform safety:** Paths use `/`, Python detection works on Win/Mac/Linux, Git is accessible. +7. **Data loss risk:** Does any operation silently drop records? +8. **Deprecation hygiene:** Are old functions properly wrapped or removed? diff --git a/paperforge/skills/paperforge/workflows/project-log.md b/paperforge/skills/paperforge/workflows/project-log.md new file mode 100644 index 0000000..585a91b --- /dev/null +++ b/paperforge/skills/paperforge/workflows/project-log.md @@ -0,0 +1,134 @@ +# project-log + +> **Scope:** Record what happened this session — decisions, detours, todos. +> For reusable cross-project methods, use methodology workflow instead. 
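Before the schema: a minimal sketch, outside the diff, of what Step 4's write call looks like when driven from Python rather than the shell. It uses only the `project-log --write` flags documented in this file; the payload fields follow the JSON schema below, and the concrete values (vault path, project name, entry content) are illustrative only:

```python
import json
import subprocess

# Illustrative payload following the project-log schema documented below.
payload = {
    "id": "plog_20260514_001",
    "project": "综述写作",
    "date": "2026-05-14",
    "type": "session_summary",
    "title": "DC 段参数窗审计",
    "decisions": ["限定参数窗为 100Hz-1kHz"],
    "todos": [{"content": "复核 Fig.3 数据", "done": False}],
}

# $PYTHON and $VAULT normally come from bootstrap; plain strings here.
subprocess.run(
    [
        "python", "-m", "paperforge", "project-log", "--write",
        "--vault", "/path/to/vault",
        "--project", payload["project"],
        "--json", json.dumps(payload, ensure_ascii=False),
    ],
    check=True,
)
```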
+ +记录研究项目的会话总结、决策、弯路修正和方法论提取。 +Agent 按 JSON schema 写入 project-log.jsonl。 +系统自动渲染对应项目的 project-log.md。 + +--- + +## 前置条件 + +- bootstrap 已完成(有 `$VAULT`、`$PYTHON`) +- 已知 project 名称 + +--- + +## 项目日志 JSON Schema + +```json +{ + "id": "plog_<YYYYMMDD>_<序号>", + "project": "综述写作", + "date": "2026-05-14", + "type": "session_summary", + "title": "DC 段参数窗审计", + "decisions": ["做了 X,因为 Y"], + "detours": [ + { + "wrong": "错误方向", + "correction": "用户如何纠正", + "resolution": "最终方案" + } + ], + "reusable": ["可复用的方法论或教训"], + "todos": [ + {"content": "待办事项", "done": false} + ], + "related_papers": ["ABC12345"], + "tags": ["DC", "参数窗", "审计"], + "agent": "opencode" +} +``` + +| 字段 | 必填 | 说明 | +| ---------------- | ---- | ------------------------------------------- | +| `id` | 是 | 自动生成 `plog_YYYYMMDD_NNN` | +| `project` | 是 | 项目名 | +| `date` | 是 | YYYY-MM-DD | +| `type` | 是 | `session_summary` / `decision` / `correction` / `milestone` / `note` | +| `title` | 是 | 本条目的简短标题 | +| `decisions` | 否 | 核心决策列表 | +| `detours` | 否 | 弯路与修正记录 | +| `reusable` | 否 | 可复用的方法论或教训 | +| `todos` | 否 | 待办事项 | +| `related_papers` | 否 | 相关 Zotero keys | +| `tags` | 否 | 分类标签 | +| `agent` | 否 | 记录者 | + +--- + +## 步骤 + +### Step 1: 确定 project + +从上下文获取。如果用户未指定,询问。 + +### Step 2: 回顾本次会话 + +回顾以下内容: +- 做了什么(核心决策) +- 用户纠正了什么(弯路与修正) +- 有什么可复用的方法论或教训 +- 待办事项 + +### Step 3: 按 Schema 组织内容,展示确认 + +展示给用户确认后再写入: + +``` +即将记录到 Project/综述写作/project-log.md: + 日期: 2026-05-14 + 类型: session_summary + 标题: DC 段参数窗审计完成 + 决策: + - 限定参数窗为 100Hz-1kHz + - 移除 AC vs DC 对比段落 + 弯路: + - 把推断当文献事实 → 用户要求逐句审计 → 5 处修正 + 可复用: + - 写完必须逐句过 source,区分"文献说了什么"和"我推断什么" + +确认写入?(y/n) +``` + +### Step 4: 写入(Atom) + +```bash +$PYTHON -m paperforge project-log --write \ + --vault "$VAULT" \ + --project "<project>" \ + --json '<payload>' +``` + +- 返回 `ok: true` → 确认写入成功。**自动渲染对应项目 markdown。** +- 返回 `ok: false` → 报告错误,重试一次 + +### Step 5: 确认渲染 + +```bash +$PYTHON -m paperforge project-log --render --project "<project>" --vault "$VAULT" +``` + +输出到 `Project/<project>/project-log.md`。 + +--- + +## type 参考 + +| type | 使用场景 | +| ----------------- | ------------------------------ | +| `session_summary` | 会话结束时的总结 | +| `decision` | 单独记录一个重要决策 | +| `correction` | 用户纠正了某个方向 | +| `milestone` | 项目里程碑 | +| `note` | 一般研究笔记 | + +--- + +## 禁止 + +- 不要在用户确认前写入 +- 不要只写"做了什么"而没有"弯路与修正"和"可复用方法论" diff --git a/paperforge/skills/paperforge/workflows/reading-log.md b/paperforge/skills/paperforge/workflows/reading-log.md new file mode 100644 index 0000000..19c1b3c --- /dev/null +++ b/paperforge/skills/paperforge/workflows/reading-log.md @@ -0,0 +1,112 @@ +# reading-log + +记录单条阅读笔记。Agent 将用户确认的信息按 JSON schema 写入 reading-log.jsonl。 +系统自动渲染对应项目的 reading-log.md(给人看)并导入 paperforge.db(可搜索)。 + +--- + +## 前置条件 + +- bootstrap 已完成(有 `$VAULT`、`$PYTHON`) +- 已知 paper_id(zotero_key) + +--- + +## 阅读笔记 JSON Schema + +每条阅读笔记必须包含以下字段: + +```json +{ + "id": "rln_<YYYYMMDD>_<序号>", + "paper_id": "ABC12345", + "project": "综述写作", + "section": "Results Fig.3", + "excerpt": "原文关键句(逐字引用)", + "context": "包含 excerpt 的完整段落(供后续回原文复核时定位)", + "usage": "这个信息在当前写作中的用途", + "note": "注意事项 / 待核查 / 可能矛盾", + "tags": ["PEMF", "dose-response"], + "verified": false +} +``` + +| 字段 | 必填 | 说明 | +| --------- | ---- | -------------------------------------------------------- | +| `id` | 是 | 自动生成,格式 `rln_YYYYMMDD_NNN` | +| `paper_id` | 是 | Zotero key(8位大写字母数字) | +| `project` | 否 | 关联的研究项目 | +| `section` | 是 | 文献中的位置(如 "Results Fig.3"、"Discussion P12") | +| `excerpt` | 是 | 逐字引用的原文关键句 | +| `context` | 是 | 包含 excerpt 的完整段落,供复核定位 | +| `usage` | 是 | 
这个信息在当前工作(写作/研究)中的用途 | +| `note` | 否 | 交叉验证、矛盾、待核查事项 | +| `tags` | 否 | 分类标签,供横切检索 | +| `verified` | 否 | 默认 false。Agent 回原文复核后应更新为 true | + +--- + +## 步骤 + +### Step 1: 确认 paper_id 和 project + +从上下文获取 zotero_key。如果用户未指定 project,询问或留空。 + +### Step 2: Agent 按 Schema 提取内容 + +从对话上下文中提取 `section`、`excerpt`、`context`、`usage`、`note`、`tags`。 + +**excerpt vs context 的区别:** +- `excerpt`:你关注的那一句(逐字引用) +- `context`:包含这句的完整段落(3-5 句),让以后的人不翻原文也能理解语境 + +### Step 3: 展示确认 + +先展示给用户确认,不要直接写入: + +``` +即将记录: + 文献: ABC12345 | Smith 2024 + 位置: Results Fig.3 + 原文: "..." + 用途: 支撑 PEMF 基质合成的论证 + 备注: 需核查是否对 DNA 归一化了 + 项目: 综述写作 + 标签: PEMF, GAG + 段落语境: "..." + +确认写入?(y/n) +``` + +### Step 4: 写入(Atom) + +```bash +$PYTHON -m paperforge reading-log --write <paper_id> \ + --vault "$VAULT" \ + --section "<section>" \ + --excerpt "<excerpt>" \ + --context "<context>" \ + --usage "<usage>" \ + --note "<note>" \ + --project "<project>" \ + --tags "<tag1>,<tag2>" +``` + +- 返回 `ok: true` → 确认写入成功。**写入后自动渲染对应项目的 markdown。** +- 返回 `ok: false` → 报告错误,重试一次 + +### Step 5: 确认渲染 + +```bash +$PYTHON -m paperforge reading-log --render --project "<project>" --vault "$VAULT" +``` + +输出到 `Project/<project>/reading-log.md`。 + +--- + +## 禁止 + +- 不要在用户确认前写入 +- 不要把推断当作 `excerpt`(必须是原文逐字引用) +- 不要让 `context` 为空(必须是完整段落) diff --git a/paperforge/worker/asset_index.py b/paperforge/worker/asset_index.py index 437284a..1c7b9a1 100644 --- a/paperforge/worker/asset_index.py +++ b/paperforge/worker/asset_index.py @@ -37,9 +37,7 @@ from paperforge import __version__ as _paperforge_version from paperforge.adapters.obsidian_frontmatter import ( _legacy_control_flags, - read_frontmatter_bool, read_frontmatter_dict, - read_frontmatter_optional_bool, ) from paperforge.config import paperforge_paths @@ -230,7 +228,10 @@ def _build_entry(item: dict, vault: Path, paths: dict, domain: str, zotero_dir: Lazy imports inside avoid circular dependencies with ``sync.py``. 
""" # Lazy imports to avoid circular deps with sync.py - from paperforge.worker._utils import read_json, slugify_filename, write_json, yaml_quote + import shutil + + from paperforge import __version__ as PAPERFORGE_VERSION + from paperforge.worker._utils import lookup_impact_factor, read_json, slugify_filename, write_json, yaml_quote from paperforge.worker.asset_state import ( compute_health, compute_lifecycle, @@ -246,9 +247,6 @@ def _build_entry(item: dict, vault: Path, paths: dict, domain: str, zotero_dir: obsidian_wikilink_for_path, obsidian_wikilink_for_pdf, ) - from paperforge.worker._utils import lookup_impact_factor - from paperforge import __version__ as PAPERFORGE_VERSION - import shutil key = item["key"] collection_meta = collection_fields(item.get("collections", [])) @@ -289,10 +287,10 @@ def _build_entry(item: dict, vault: Path, paths: dict, domain: str, zotero_dir: try: text = main_note_path.read_text(encoding="utf-8") if "aliases:" not in text[: text.find("\n---", 4)]: - alias_line = f"aliases: [{yaml_quote(item.get('title', ''))}]\n" + alias_line = f"aliases: [{yaml_quote(item.get('title', ''))}, {yaml_quote(item.get('citation_key') or item.get('key', ''))}]\n" text = re.sub( - r'(^title:.*\n)', - r'\1' + alias_line, + r"(^title:.*\n)", + r"\1" + alias_line, text, count=1, flags=re.MULTILINE, @@ -301,7 +299,6 @@ def _build_entry(item: dict, vault: Path, paths: dict, domain: str, zotero_dir: except Exception: pass # alias will be injected on next full frontmatter_note pass break # only one old file per key - deep_reading_file = workspace_dir / "deep-reading.md" target_fulltext = workspace_dir / "fulltext.md" source_fulltext = paths["ocr"] / key / "fulltext.md" @@ -312,7 +309,6 @@ def _build_entry(item: dict, vault: Path, paths: dict, domain: str, zotero_dir: logger.info("Bridged fulltext.md to workspace for %s", key) fulltext_exists = target_fulltext.exists() - deep_reading_exists = deep_reading_file.exists() # ---- entry dict ------------------------------------------------------- authors = item.get("authors", []) @@ -322,25 +318,26 @@ def _build_entry(item: dict, vault: Path, paths: dict, domain: str, zotero_dir: legacy_flags = _legacy_control_flags(paths, key) legacy_do_ocr = legacy_flags.get("do_ocr") legacy_analyze = legacy_flags.get("analyze") - note_do_ocr = read_frontmatter_optional_bool(main_note_path, "do_ocr") - if note_do_ocr is None: - note_do_ocr = read_frontmatter_optional_bool(note_path, "do_ocr") - note_analyze = read_frontmatter_optional_bool(main_note_path, "analyze") - if note_analyze is None: - note_analyze = read_frontmatter_optional_bool(note_path, "analyze") - - # deep_reading_status: frontmatter first (finalize.py sets it), body detection fallback (sync ensures it) - def _read_fm_str(fp: Path, key: str) -> str: - if not fp or not fp.exists(): - return "" - try: - fm = read_frontmatter_dict(fp.read_text(encoding="utf-8")) - return str(fm.get(key, "")).strip() - except Exception: - return "" - note_dr = _read_fm_str(main_note_path, "deep_reading_status") - if not note_dr: - note_dr = _read_fm_str(note_path, "deep_reading_status") + # Single frontmatter read for all control fields — cache text for reuse + fm = {} + fm_cached_text = "" + fm_was_main = False + for fp in (main_note_path, note_path): + if fp and fp.exists(): + try: + note_text = fp.read_text(encoding="utf-8") + fm = read_frontmatter_dict(note_text) + fm_cached_text = note_text + fm_was_main = (fp == main_note_path) + break + except Exception: + continue + + _v = fm.get("do_ocr") + 
note_do_ocr = _v if isinstance(_v, bool) else None + _v = fm.get("analyze") + note_analyze = _v if isinstance(_v, bool) else None + note_dr = str(fm.get("deep_reading_status", "")).strip() do_ocr_value = note_do_ocr if note_do_ocr is not None else legacy_do_ocr if do_ocr_value is None: @@ -350,8 +347,22 @@ def _read_fm_str(fp: Path, key: str) -> str: if analyze_value is None: analyze_value = meta.get("analyze") is True or meta.get("deep_reading_status") == "done" + # Compute deep reading status once, reusing cached text when possible. + # main_note_path is canonical — don't fall back to note_path when it exists. + _dr_status = "pending" + if note_dr == "done": + _dr_status = "done" + elif fm_was_main: + _dr_status = "done" if has_deep_reading_content(fm_cached_text) else "pending" + elif main_note_path.exists(): + try: + _dr_status = "done" if has_deep_reading_content(main_note_path.read_text(encoding="utf-8")) else "pending" + except Exception: + pass + entry = { "zotero_key": key, + "citation_key": item.get("citation_key", ""), "domain": domain, "title": item["title"], "authors": authors, @@ -376,23 +387,15 @@ def _read_fm_str(fp: Path, key: str) -> str: "ocr_job_id": meta.get("ocr_job_id", ""), "ocr_md_path": obsidian_wikilink_for_path(vault, meta.get("markdown_path", "")), "ocr_json_path": meta.get("json_path", ""), - "deep_reading_status": ( - "done" - if note_dr == "done" - else "done" - if main_note_path.exists() and has_deep_reading_content(main_note_path.read_text(encoding="utf-8")) - else "done" - if note_path.exists() and has_deep_reading_content(note_path.read_text(encoding="utf-8")) - else "pending" - ), + "deep_reading_status": _dr_status, "note_path": str((main_note_path if main_note_path.exists() else note_path).relative_to(vault)).replace( "\\", "/" ), "deep_reading_md_path": ( str(main_note_path.relative_to(vault)).replace("\\", "/") - if main_note_path.exists() and has_deep_reading_content(main_note_path.read_text(encoding="utf-8")) + if _dr_status == "done" and main_note_path.exists() else str(note_path.relative_to(vault)).replace("\\", "/") - if note_path.exists() and has_deep_reading_content(note_path.read_text(encoding="utf-8")) + if _dr_status == "done" and note_path.exists() else "" ), # Workspace path fields are only advertised when the backing files/dirs exist. 
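The hunk above replaces per-field file reads with one cached read of the note. A minimal standalone sketch of that pattern, under the assumption that frontmatter parsing is injectable; `load_control_fields` and `parse_frontmatter` are hypothetical names standing in for the module's own helpers (`read_frontmatter_dict` in the real code):

```python
from pathlib import Path
from typing import Callable


def load_control_fields(main_note: Path, legacy_note: Path,
                        parse_frontmatter: Callable[[str], dict]) -> dict:
    """Read whichever note exists once, parse its frontmatter once, and
    derive every control field from that single dict, instead of
    re-opening the file once per field."""
    fm: dict = {}
    text, is_main = "", False
    for fp in (main_note, legacy_note):
        if fp and fp.exists():
            try:
                text = fp.read_text(encoding="utf-8")
                fm = parse_frontmatter(text)
                is_main = fp == main_note
                break
            except Exception:
                continue
    do_ocr = fm.get("do_ocr") if isinstance(fm.get("do_ocr"), bool) else None
    analyze = fm.get("analyze") if isinstance(fm.get("analyze"), bool) else None
    return {
        "do_ocr": do_ocr,
        "analyze": analyze,
        "deep_reading_status": str(fm.get("deep_reading_status", "")).strip(),
        "cached_text": text,   # reused later to avoid a second read
        "from_main": is_main,  # main note is canonical when present
    }
```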
@@ -411,10 +414,10 @@ def _read_fm_str(fp: Path, key: str) -> str: # Slug already frozen above — for existing notes, update frontmatter only (preserve body) if main_note_path.exists(): - text = main_note_path.read_text(encoding="utf-8") + text = fm_cached_text if fm_was_main else main_note_path.read_text(encoding="utf-8") fm_close = text.find("---\n", 4) # closing --- after opening --- if fm_close != -1: - body = text[fm_close + 4:] # everything after frontmatter + body = text[fm_close + 4 :] # everything after frontmatter new_full = frontmatter_note(entry, "") new_fm_close = new_full.find("---\n", 4) if new_fm_close != -1: @@ -425,15 +428,51 @@ def _read_fm_str(fp: Path, key: str) -> str: else: main_note_path.write_text(frontmatter_note(entry, text), encoding="utf-8") else: - existing_text = note_path.read_text(encoding="utf-8") if note_path.exists() else "" + existing_text = fm_cached_text if not fm_was_main and fm_cached_text else ( + note_path.read_text(encoding="utf-8") if note_path.exists() else "" + ) main_note_path.write_text(frontmatter_note(entry, existing_text), encoding="utf-8") # Write per-workspace paper-meta.json (Phase 37: internal state outside frontmatter) write_paper_meta(workspace_dir, entry, paperforge_version=PAPERFORGE_VERSION) + # Auto-embed vectors if this paper just completed OCR + _vec_auto_embed_if_new(vault, entry) + return entry +def _vec_auto_embed_if_new(vault: Path, entry: dict) -> None: + """Auto-embed a paper into vector DB if OCR is done and vectors missing.""" + if entry.get("ocr_status") != "done": + return + fulltext_rel = entry.get("fulltext_path", "") + if not fulltext_rel: + return + fulltext_path = vault / fulltext_rel + if not fulltext_path.exists(): + return + # Check if vector DB is enabled and set up + try: + from paperforge.memory.vector_db import ( + _read_plugin_settings, + chunk_fulltext, + embed_paper, + get_vector_db_path, + ) + settings = _read_plugin_settings(vault) + if not settings.get("features", {}).get("vector_db", False): + return + db_path = get_vector_db_path(vault) + if not db_path.exists(): + return + chunks = chunk_fulltext(fulltext_path) + if not chunks: + return + embed_paper(vault, entry["zotero_key"], chunks) + except Exception: + pass # ChromaDB / model not installed — silently skip + # --------------------------------------------------------------------------- # Full index build # --------------------------------------------------------------------------- @@ -491,6 +530,12 @@ def build_index(vault: Path, verbose: bool = False) -> int: for item in export_rows: entry = _build_entry(item, vault, paths, domain, zotero_dir) index_rows.append(entry) + try: + from paperforge.memory.refresh import refresh_paper + + refresh_paper(vault, entry) + except Exception: + pass # memory DB refresh is best-effort # Atomically write the envelope-wrapped index index_path = paths["index"] @@ -571,6 +616,13 @@ def refresh_index_entry(vault: Path, key: str) -> bool: # Build single entry and update the items list new_entry = _build_entry(found_item, vault, paths, found_domain, zotero_dir) + try: + from paperforge.memory.refresh import refresh_paper + + refresh_paper(vault, new_entry) + except Exception: + pass # memory DB refresh is best-effort + replaced = False for i, existing_entry in enumerate(items): if existing_entry.get("zotero_key") == key: diff --git a/paperforge/worker/ocr.py b/paperforge/worker/ocr.py index e01d4e2..3bf54ee 100644 --- a/paperforge/worker/ocr.py +++ b/paperforge/worker/ocr.py @@ -190,6 +190,8 @@ def 
sync_ocr_queue(paths: dict[str, Path], target_rows: list[dict]) -> list[dict status = str(meta.get("ocr_status", "pending") or "pending").strip().lower() if status in {"done", "blocked"}: continue + if status == "nopdf": + status = "pending" synced = dict(row) synced["has_pdf"] = bool(target.get("has_pdf")) synced["pdf_path"] = target.get("pdf_path", "") @@ -210,6 +212,9 @@ def sync_ocr_queue(paths: dict[str, Path], target_rows: list[dict]) -> list[dict status = str(meta.get("ocr_status", "pending") or "pending").strip().lower() if status in {"done", "blocked"}: continue + if status == "nopdf": + status = "pending" + continue synced_queue.append( { "zotero_key": key, @@ -1571,13 +1576,19 @@ def run_ocr(vault: Path, verbose: bool = False, no_progress: bool = False) -> in for row in target_rows: key = row["zotero_key"] meta = ensure_ocr_meta(vault, row) - if str(meta.get("ocr_status", "") or "").strip().lower() == "error": + current = str(meta.get("ocr_status", "") or "").strip().lower() + if current == "error": meta["ocr_status"] = "pending" meta["ocr_job_id"] = "" meta["ocr_started_at"] = "" meta["ocr_finished_at"] = "" meta["retry_count"] = 0 write_json(paths["ocr"] / key / "meta.json", meta) + elif current == "nopdf": + meta["ocr_status"] = "pending" + meta["error"] = "" + meta["retry_count"] = 0 + write_json(paths["ocr"] / key / "meta.json", meta) status, _error = validate_ocr_meta(paths, meta) if status == "done_incomplete": meta["ocr_status"] = "pending" diff --git a/paperforge/worker/status.py b/paperforge/worker/status.py index b27c031..fde05d0 100644 --- a/paperforge/worker/status.py +++ b/paperforge/worker/status.py @@ -639,36 +639,36 @@ def add_check(category: str, status: str, message: str, fix: str = "") -> None: if total_issues == 0: add_check("字段注册表", "pass", "所有 formal note frontmatter 与字段注册表一致") - ld_deep_script = paths.get("ld_deep_script") + pf_deep_script = paths.get("pf_deep_script") skill_dir = None - if ld_deep_script: - skill_dir = ld_deep_script.parent.parent + if pf_deep_script: + skill_dir = pf_deep_script.parent.parent if skill_dir and skill_dir.exists(): # Try actual importability check - ld_deep_import_ok = False + pf_deep_import_ok = False import_error = "" - if ld_deep_script and ld_deep_script.exists(): + if pf_deep_script and pf_deep_script.exists(): try: import importlib.util - spec = importlib.util.spec_from_file_location("ld_deep", ld_deep_script) + spec = importlib.util.spec_from_file_location("pf_deep", pf_deep_script) if spec and spec.loader: mod = importlib.util.module_from_spec(spec) spec.loader.exec_module(mod) - ld_deep_import_ok = True + pf_deep_import_ok = True except Exception as e: import_error = str(e) - if ld_deep_import_ok: - add_check("Agent 脚本", "pass", "paperforge and ld_deep importable") + if pf_deep_import_ok: + add_check("Agent 脚本", "pass", "paperforge and pf_deep importable") else: add_check( "Agent 脚本", "warn", - f"literature-qa skill 目录存在但 import 失败: {import_error}", + f"paperforge skill 目录存在但 import 失败: {import_error}", "确认 agent_config_dir 配置正确并已运行 pip install -e .", ) else: - add_check("Agent 脚本", "warn", "literature-qa skill 目录未找到", "确认 agent_config_dir 配置正确") + add_check("Agent 脚本", "warn", "paperforge skill 目录未找到", "确认 agent_config_dir 配置正确") # --- Index Health section (Phase 25: derived from canonical index) --- try: diff --git a/paperforge/worker/sync.py b/paperforge/worker/sync.py index 0bae498..e996a05 100644 --- a/paperforge/worker/sync.py +++ b/paperforge/worker/sync.py @@ -1019,11 +1019,12 @@ def 
frontmatter_note(entry: dict, existing_text: str = "") -> str: lines = [ "---", f"title: {yaml_quote(entry.get('title', ''))}", - f"aliases: [{yaml_quote(entry.get('title', ''))}]", + f"aliases: [{yaml_quote(entry.get('title', ''))}, {yaml_quote(entry.get('citation_key', ''))}]", f"year: {entry.get('year', '')}", f"journal: {yaml_quote(entry.get('journal', ''))}", f"first_author: {yaml_quote(first_author)}", f"zotero_key: {yaml_quote(entry.get('zotero_key', ''))}", + f"citation_key: {yaml_quote(entry.get('citation_key', ''))}", f"domain: {yaml_quote(entry.get('domain', ''))}", f"doi: {yaml_quote(entry.get('doi', ''))}", f"pmid: {yaml_quote(entry.get('pmid', ''))}", diff --git a/paperforge/worker/update.py b/paperforge/worker/update.py index 3b4f22b..0ac69a5 100644 --- a/paperforge/worker/update.py +++ b/paperforge/worker/update.py @@ -252,7 +252,7 @@ def _deploy_all_skills(vault: Path) -> None: agent_key = config.get("agent_platform") or "opencode" result = deploy_skills(vault=vault, agent_key=agent_key, overwrite=True) if result["skill_deployed"]: - logger.info("已部署 literature-qa skill") + logger.info("已部署 paperforge skill") if result["agents_md"]: logger.info("已更新 AGENTS.md") for err in result.get("errors", []): diff --git a/paperforge/worker/vector_db.py b/paperforge/worker/vector_db.py new file mode 100644 index 0000000..6a746ff --- /dev/null +++ b/paperforge/worker/vector_db.py @@ -0,0 +1,81 @@ +from __future__ import annotations + +import os + + +def _preflight_check(vault, settings: dict) -> dict: + """Check prerequisites for embed build. Returns {ok: bool, error: str, fix: str}.""" + from pathlib import Path + + from paperforge.worker._utils import pipeline_paths + + # 1. chromadb + try: + import chromadb # noqa: F401 + except ImportError: + return {"ok": False, "error": "chromadb is not installed", "fix": 'Run: pip install "paperforge[vector]"'} + + # 2. Mode-specific deps + mode = settings.get("vector_db_mode", "local") + if mode == "local": + try: + import sentence_transformers # noqa: F401 + except ImportError: + return { + "ok": False, + "error": "sentence-transformers not installed", + "fix": 'Run: pip install "paperforge[vector]" or switch to API mode', + } + elif mode == "api": + try: + import openai # noqa: F401 + except ImportError: + return { + "ok": False, + "error": "openai not installed", + "fix": 'Run: pip install "paperforge[vector]" or switch to local mode', + } + api_key = settings.get("vector_db_api_key") or os.environ.get("OPENAI_API_KEY") or os.environ.get("VECTOR_DB_API_KEY") + if not api_key: + return {"ok": False, "error": "API key not configured", "fix": "Set API Key in plugin settings or OPENAI_API_KEY in .env"} + + # 3. 
OCR done papers + paths = pipeline_paths(vault) + idx_path = paths.get("indexes", Path("")) / "formal-library.json" if paths.get("indexes") else None + if idx_path and idx_path.exists(): + import json + + data = json.loads(idx_path.read_text(encoding="utf-8")) + items = data.get("items", []) if isinstance(data, dict) else data + done = sum(1 for i in (items or []) if i.get("ocr_status") == "done") + if done == 0: + return {"ok": False, "error": "No papers with OCR completed", "fix": "Run paperforge ocr first"} + + return {"ok": True} + + +def get_embed_status(vault) -> dict: + """Check if vector index exists and has content.""" + from pathlib import Path + from paperforge.config import paperforge_paths + paths = paperforge_paths(vault) + vectors_dir = paths.get("vectors", paths.get("paperforge", Path()) / "vectors") + + status = {"exists": False, "chunk_count": 0, "collection_name": ""} + + if not vectors_dir or not vectors_dir.exists(): + return status + + try: + import chromadb + client = chromadb.PersistentClient(path=str(vectors_dir)) + collections = client.list_collections() + if collections: + col = collections[0] + status["exists"] = True + status["collection_name"] = col.name + status["chunk_count"] = col.count() + except Exception: + pass + + return status diff --git a/pyproject.toml b/pyproject.toml index 72fc765..c85ee0a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,11 @@ test = [ "coverage>=7.4.0", "ruff>=0.4.0", ] +vector = [ + "chromadb>=0.5.0", + "sentence-transformers>=3.0.0", + "openai>=1.0.0", +] [project.scripts] paperforge = "paperforge.cli:main" @@ -58,10 +63,7 @@ version = {attr = "paperforge.__version__"} [tool.setuptools.package-data] paperforge = [ "py.typed", - "skills/literature-qa/prompt_deep_subagent.md", - "skills/literature-qa/scripts/*.md", - "skills/literature-qa/chart-reading/*.md", - "skills/literature-qa/chart-reading/*", + "skills/paperforge/**", "command_files/*.md", "plugin/*.css", "plugin/*.js", @@ -70,7 +72,7 @@ paperforge = [ [tool.pytest.ini_options] addopts = "--ignore=tests/sandbox/00_TestVault/ --strict-markers" -testpaths = ["tests/unit", "tests/cli", "tests/e2e", "tests/journey", "tests/chaos", "tests/audit"] +testpaths = ["tests/unit", "tests/cli", "tests/e2e", "tests/journey", "tests/chaos", "tests/audit", "tests/integration"] markers = [ "unit: Unit tests (Level 1) — fast, isolated", "cli: CLI contract tests (Level 2) — subprocess boundary", @@ -78,6 +80,7 @@ markers = [ "journey: User journey tests (Level 5) — full workflows", "chaos: Destructive tests (Level 6) — abnormal scenarios", "audit: Consistency audit tests — validate L1 mocks against L4 real pipeline output", + "integration: Integration tests — multi-component workflows", "slow: Tests that take >30s (skip during development)", "snapshot: Tests that use snapshot comparison", ] diff --git a/scripts/bump.py b/scripts/bump.py index c29d393..cd7dd84 100644 --- a/scripts/bump.py +++ b/scripts/bump.py @@ -101,8 +101,8 @@ def main(): except subprocess.CalledProcessError: sys.exit("VERIFY FAILED: cannot read __init__.py from HEAD") - run(["git", "tag", "-f", f"v{new_ver}"]) - print(f"Committed and tagged v{new_ver}") + run(["git", "tag", "-f", new_ver]) + print(f"Committed and tagged {new_ver}") print("Run: git push && git push --tags") diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/integration/test_memory_workflow.py b/tests/integration/test_memory_workflow.py new file mode 100644 
index 0000000..2118256 --- /dev/null +++ b/tests/integration/test_memory_workflow.py @@ -0,0 +1,69 @@ +from __future__ import annotations + +import json +import os +import sqlite3 +import subprocess +from pathlib import Path + +import pytest + +from paperforge.memory.db import get_memory_db_path + + +@pytest.mark.integration +def test_memory_build_and_status_with_test_vault(test_vault: Path): + """End-to-end: sync -> memory build -> memory status -> paper-status.""" + pf = ["python", "-m", "paperforge", "--vault", str(test_vault)] + env = {**os.environ, "PYTHONIOENCODING": "utf-8"} + + # 1. Sync to ensure formal-library.json exists + result = subprocess.run( + pf + ["sync", "--json"], capture_output=True, text=True, encoding="utf-8", env=env + ) + if result.returncode != 0: + pytest.skip("Sync failed -- test vault may lack export files") + + # 2. Memory build + result = subprocess.run( + pf + ["memory", "build", "--json"], capture_output=True, text=True, encoding="utf-8", env=env + ) + assert result.returncode == 0, f"memory build failed: {result.stderr}" + data = json.loads(result.stdout) + assert data["ok"] is True, f"build result not ok: {data}" + assert data["data"]["papers_indexed"] > 0, "expected at least 1 paper indexed" + + # 3. Memory status + result = subprocess.run( + pf + ["memory", "status", "--json"], capture_output=True, text=True, encoding="utf-8", env=env + ) + assert result.returncode == 0 + data = json.loads(result.stdout) + assert data["data"]["fresh"] is True, f"memory not fresh: {data['data']}" + assert data["data"]["needs_rebuild"] is False + + # 4. Paper-status lookup by zotero_key + papers_json = subprocess.run( + pf + ["memory", "status", "--json"], capture_output=True, text=True, encoding="utf-8", env=env + ) + status_data = json.loads(papers_json.stdout) + paper_count = status_data["data"]["paper_count_db"] + + if paper_count > 0: + # Get first paper's zotero_key from the db + db_path = get_memory_db_path(test_vault) + conn = sqlite3.connect(str(db_path)) + conn.row_factory = sqlite3.Row + row = conn.execute("SELECT zotero_key FROM papers LIMIT 1").fetchone() + conn.close() + + if row: + key = row["zotero_key"] + result = subprocess.run( + pf + ["paper-status", key, "--json"], + capture_output=True, text=True, encoding="utf-8", env=env, + ) + assert result.returncode == 0 + data = json.loads(result.stdout) + assert data["ok"] is True + assert data["data"]["resolved"] is True diff --git a/tests/unit/memory/__init__.py b/tests/unit/memory/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/memory/test_builder.py b/tests/unit/memory/test_builder.py new file mode 100644 index 0000000..44eac9c --- /dev/null +++ b/tests/unit/memory/test_builder.py @@ -0,0 +1,20 @@ +from __future__ import annotations + +from paperforge.memory.builder import compute_hash + + +def test_compute_hash_deterministic(): + items1 = [{"zotero_key": "A"}, {"zotero_key": "B"}] + items2 = [{"zotero_key": "B"}, {"zotero_key": "A"}] + assert compute_hash(items1) == compute_hash(items2) + + +def test_compute_hash_different_for_different_data(): + items1 = [{"zotero_key": "A", "title": "X"}] + items2 = [{"zotero_key": "A", "title": "Y"}] + assert compute_hash(items1) != compute_hash(items2) + + +def test_compute_hash_handles_empty(): + assert compute_hash([]) == compute_hash([]) + assert len(compute_hash([])) == 64 # SHA-256 hex diff --git a/tests/unit/memory/test_context.py b/tests/unit/memory/test_context.py new file mode 100644 index 0000000..2a885bc --- /dev/null +++ 
diff --git a/tests/unit/memory/test_context.py b/tests/unit/memory/test_context.py
new file mode 100644
index 0000000..2a885bc
--- /dev/null
+++ b/tests/unit/memory/test_context.py
@@ -0,0 +1,9 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from paperforge.memory.context import get_agent_context
+
+
+def test_get_agent_context_returns_none_when_no_db():
+    assert get_agent_context(Path("/nonexistent/vault")) is None
diff --git a/tests/unit/memory/test_query.py b/tests/unit/memory/test_query.py
new file mode 100644
index 0000000..47db10b
--- /dev/null
+++ b/tests/unit/memory/test_query.py
@@ -0,0 +1,11 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from paperforge.memory.query import get_memory_status
+
+
+def test_get_memory_status_returns_needs_rebuild_when_no_db():
+    result = get_memory_status(Path("/nonexistent/vault"))
+    assert result["db_exists"] is False
+    assert result["needs_rebuild"] is True
diff --git a/tests/unit/memory/test_refresh.py b/tests/unit/memory/test_refresh.py
new file mode 100644
index 0000000..103440f
--- /dev/null
+++ b/tests/unit/memory/test_refresh.py
@@ -0,0 +1,17 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from paperforge.memory.refresh import refresh_paper
+
+
+def test_refresh_paper_returns_false_when_no_db():
+    assert refresh_paper(Path("/nonexistent/vault"), {"zotero_key": "KEY001"}) is False
+
+
+def test_refresh_paper_returns_false_for_empty_key():
+    assert refresh_paper(Path("/nonexistent/vault"), {}) is False
+
+
+def test_refresh_paper_returns_false_for_missing_key():
+    assert refresh_paper(Path("/nonexistent/vault"), {"title": "No Key"}) is False
diff --git a/tests/unit/memory/test_schema.py b/tests/unit/memory/test_schema.py
new file mode 100644
index 0000000..130a18a
--- /dev/null
+++ b/tests/unit/memory/test_schema.py
@@ -0,0 +1,98 @@
+from __future__ import annotations
+
+import tempfile
+from pathlib import Path
+
+from paperforge.memory.schema import (
+    ALL_TABLES,
+    ensure_schema,
+    drop_all_tables,
+    get_schema_version,
+    CURRENT_SCHEMA_VERSION,
+)
+from paperforge.memory.db import get_connection
+
+
+def test_ensure_schema_creates_all_tables():
+    with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
+        db_path = Path(tmp.name)
+    try:
+        conn = get_connection(db_path)
+        ensure_schema(conn)
+        cursor = conn.execute(
+            "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
+        )
+        tables = {row["name"] for row in cursor.fetchall()}
+        for table in ALL_TABLES:
+            assert table in tables, f"Missing table: {table}"
+        conn.close()
+    finally:
+        db_path.unlink(missing_ok=True)
+
+
+def test_drop_all_tables_clears_all():
+    with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
+        db_path = Path(tmp.name)
+    conn = None
+    try:
+        conn = get_connection(db_path)
+        ensure_schema(conn)
+        drop_all_tables(conn)
+        cursor = conn.execute(
+            "SELECT name FROM sqlite_master WHERE type='table'"
+        )
+        tables = {row["name"] for row in cursor.fetchall()}
+        app_tables = {t for t in tables if t in ALL_TABLES}
+        assert app_tables == set()
+    finally:
+        if conn:
+            conn.close()
+        db_path.unlink(missing_ok=True)
+
+
+def test_get_schema_version_returns_zero_when_no_meta():
+    with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
+        db_path = Path(tmp.name)
+    try:
+        conn = get_connection(db_path)
+        ensure_schema(conn)
+        assert get_schema_version(conn) == 0
+        conn.close()
+    finally:
+        db_path.unlink(missing_ok=True)
+
+
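+# NOTE: ensure_schema() does not seed a meta.schema_version row (the test above
+# reads back 0), so the tests below insert the row manually before reading it back.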
+def test_get_schema_version_returns_stored_value():
+    with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
+        db_path = Path(tmp.name)
+    try:
+        conn = get_connection(db_path)
+        ensure_schema(conn)
+        conn.execute(
+            "INSERT INTO meta (key, value) VALUES ('schema_version', '1')"
+        )
+        conn.commit()
+        assert get_schema_version(conn) == 1
+        conn.close()
+    finally:
+        db_path.unlink(missing_ok=True)
+
+
+def test_schema_version_mismatch_triggers_rebuild_semantics():
+    """When stored version != CURRENT, get_schema_version returns a different int."""
+    with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
+        db_path = Path(tmp.name)
+    try:
+        conn = get_connection(db_path)
+        ensure_schema(conn)
+        conn.execute(
+            "INSERT INTO meta (key, value) VALUES ('schema_version', '99')"
+        )
+        conn.commit()
+        stored = get_schema_version(conn)
+        assert stored != CURRENT_SCHEMA_VERSION
+        conn.close()
+    finally:
+        db_path.unlink(missing_ok=True)
diff --git a/tests/unit/schema/test_field_registry.py b/tests/unit/schema/test_field_registry.py
index a7fa5bc..f724152 100644
--- a/tests/unit/schema/test_field_registry.py
+++ b/tests/unit/schema/test_field_registry.py
@@ -50,8 +50,8 @@ def test_all_expected_fields_present(self) -> None:
         reg = load_field_registry(REGISTRY_PATH)
         fm = get_owner_fields(reg, "frontmatter")
         expected = {
-            "zotero_key", "domain", "title", "year", "doi",
-            "collection_path", "has_pdf", "pdf_path", "supplementary",
+            "zotero_key", "citation_key", "domain", "title", "year", "doi",
+            "collection_path", "collection_tags", "has_pdf", "pdf_path", "supplementary",
             "fulltext_md_path", "recommend_analyze", "analyze", "do_ocr",
             "ocr_status", "deep_reading_status", "path_error",
         }
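For orientation, the expanded frontmatter contract now spans eighteen fields; an illustrative mapping showing the two newly registered ones (all values invented for the example):

    frontmatter = {
        "zotero_key": "ABCD1234",
        "citation_key": "doe2024paper",              # newly registered
        "collection_path": "ML/Transformers",
        "collection_tags": ["ml", "transformers"],   # newly registered
        # ... plus title, year, doi, has_pdf, pdf_path, supplementary,
        # fulltext_md_path, recommend_analyze, analyze, do_ocr, ocr_status,
        # deep_reading_status, path_error
    }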