From 139d5b673e878433637b2ec5c9b5539d5e35e18f Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 17:32:59 +0800 Subject: [PATCH 001/132] feat(config): add memory_db path key for Memory Layer --- paperforge/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/paperforge/config.py b/paperforge/config.py index 8f63456..b0b07de 100644 --- a/paperforge/config.py +++ b/paperforge/config.py @@ -336,6 +336,7 @@ def paperforge_paths( # ── v2.2: canonical locations below paperforge/ ── "config": paperforge / "config" / "domain-collections.json", "index": paperforge / "indexes" / "formal-library.json", + "memory_db": paperforge / "indexes" / "paperforge.db", } From 772954fca4250a7dc379bcc2106273fcc9e2a795 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 17:35:19 +0800 Subject: [PATCH 002/132] feat(memory): add db.py with connection and path resolution --- paperforge/memory/__init__.py | 8 ++++++++ paperforge/memory/db.py | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 paperforge/memory/__init__.py create mode 100644 paperforge/memory/db.py diff --git a/paperforge/memory/__init__.py b/paperforge/memory/__init__.py new file mode 100644 index 0000000..97d74fc --- /dev/null +++ b/paperforge/memory/__init__.py @@ -0,0 +1,8 @@ +from __future__ import annotations + +from paperforge.memory.db import get_connection, get_memory_db_path + +__all__ = [ + "get_connection", + "get_memory_db_path", +] diff --git a/paperforge/memory/db.py b/paperforge/memory/db.py new file mode 100644 index 0000000..19dbf83 --- /dev/null +++ b/paperforge/memory/db.py @@ -0,0 +1,35 @@ +from __future__ import annotations + +import sqlite3 +from pathlib import Path + +from paperforge.config import paperforge_paths + + +def get_memory_db_path(vault: Path) -> Path: + """Return the absolute path to paperforge.db.""" + paths = paperforge_paths(vault) + db_path = paths.get("memory_db") + if not db_path: + raise 
FileNotFoundError("memory_db path not configured") + return db_path + + +def get_connection(db_path: Path, read_only: bool = False) -> sqlite3.Connection: + """Open a SQLite connection to paperforge.db with WAL mode. + + Args: + db_path: Path to paperforge.db. + read_only: If True, open in read-only mode (for queries). + """ + if read_only: + uri = "file:" + db_path.as_posix() + "?mode=ro" + conn = sqlite3.connect(uri, uri=True) + else: + db_path.parent.mkdir(parents=True, exist_ok=True) + conn = sqlite3.connect(str(db_path)) + conn.row_factory = sqlite3.Row + if not read_only: + conn.execute("PRAGMA journal_mode=WAL;") + conn.execute("PRAGMA foreign_keys=ON;") + return conn From fb17c580206971e03d4828b9c7e3aad51b6b6a11 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 17:40:10 +0800 Subject: [PATCH 003/132] feat(memory): add schema module with table definitions and tests --- paperforge/memory/schema.py | 115 +++++++++++++++++++++++++++++++ tests/unit/memory/__init__.py | 0 tests/unit/memory/test_schema.py | 96 ++++++++++++++++++++++++++ 3 files changed, 211 insertions(+) create mode 100644 paperforge/memory/schema.py create mode 100644 tests/unit/memory/__init__.py create mode 100644 tests/unit/memory/test_schema.py diff --git a/paperforge/memory/schema.py b/paperforge/memory/schema.py new file mode 100644 index 0000000..8f41462 --- /dev/null +++ b/paperforge/memory/schema.py @@ -0,0 +1,115 @@ +from __future__ import annotations + +import sqlite3 + +CURRENT_SCHEMA_VERSION = 1 + +CREATE_META = """ +CREATE TABLE IF NOT EXISTS meta ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL +); +""" + +CREATE_PAPERS = """ +CREATE TABLE IF NOT EXISTS papers ( + zotero_key TEXT PRIMARY KEY, + citation_key TEXT NOT NULL DEFAULT '', + title TEXT NOT NULL, + year TEXT, + doi TEXT, + pmid TEXT, + journal TEXT, + first_author TEXT, + authors_json TEXT, + abstract TEXT, + domain TEXT, + collection_path TEXT, + collections_json TEXT, + has_pdf INTEGER NOT NULL 
DEFAULT 0, + do_ocr INTEGER, + analyze INTEGER, + ocr_status TEXT, + deep_reading_status TEXT, + ocr_job_id TEXT, + impact_factor REAL, + lifecycle TEXT, + maturity_level INTEGER, + maturity_name TEXT, + next_step TEXT, + pdf_path TEXT, + note_path TEXT, + main_note_path TEXT, + paper_root TEXT, + fulltext_path TEXT, + ocr_md_path TEXT, + ocr_json_path TEXT, + ai_path TEXT, + deep_reading_md_path TEXT, + updated_at TEXT +); +""" + +CREATE_ASSETS = """ +CREATE TABLE IF NOT EXISTS paper_assets ( + paper_id TEXT NOT NULL, + asset_type TEXT NOT NULL, + path TEXT NOT NULL, + exists_on_disk INTEGER NOT NULL DEFAULT 0, + PRIMARY KEY (paper_id, asset_type), + FOREIGN KEY (paper_id) REFERENCES papers(zotero_key) +); +""" + +CREATE_ALIASES = """ +CREATE TABLE IF NOT EXISTS paper_aliases ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + paper_id TEXT NOT NULL, + alias TEXT NOT NULL, + alias_norm TEXT NOT NULL, + alias_type TEXT NOT NULL, + FOREIGN KEY (paper_id) REFERENCES papers(zotero_key) +); +""" + +INDEX_SQL = [ + "CREATE INDEX IF NOT EXISTS idx_papers_doi ON papers(doi);", + "CREATE INDEX IF NOT EXISTS idx_papers_citation_key ON papers(citation_key);", + "CREATE INDEX IF NOT EXISTS idx_papers_domain ON papers(domain);", + "CREATE INDEX IF NOT EXISTS idx_papers_year ON papers(year);", + "CREATE INDEX IF NOT EXISTS idx_papers_ocr_status ON papers(ocr_status);", + "CREATE INDEX IF NOT EXISTS idx_papers_deep_status ON papers(deep_reading_status);", + "CREATE INDEX IF NOT EXISTS idx_papers_lifecycle ON papers(lifecycle);", + "CREATE INDEX IF NOT EXISTS idx_papers_next_step ON papers(next_step);", +] + +ALL_TABLES = ["papers", "paper_assets", "paper_aliases", "meta"] + + +def ensure_schema(conn: sqlite3.Connection) -> None: + """Create tables and indexes if they don't exist.""" + conn.execute(CREATE_META) + conn.execute(CREATE_PAPERS) + conn.execute(CREATE_ASSETS) + conn.execute(CREATE_ALIASES) + for idx_sql in INDEX_SQL: + conn.execute(idx_sql) + conn.commit() + + +def 
drop_all_tables(conn: sqlite3.Connection) -> None: + """Drop all Memory Layer tables (for rebuild).""" + for table in ALL_TABLES: + conn.execute(f"DROP TABLE IF EXISTS {table};") + conn.commit() + + +def get_schema_version(conn: sqlite3.Connection) -> int: + """Read the stored schema version from meta table, or 0 if not found.""" + try: + row = conn.execute( + "SELECT value FROM meta WHERE key = 'schema_version'" + ).fetchone() + return int(row["value"]) if row else 0 + except sqlite3.OperationalError: + return 0 diff --git a/tests/unit/memory/__init__.py b/tests/unit/memory/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/memory/test_schema.py b/tests/unit/memory/test_schema.py new file mode 100644 index 0000000..130a18a --- /dev/null +++ b/tests/unit/memory/test_schema.py @@ -0,0 +1,96 @@ +from __future__ import annotations + +import tempfile +from pathlib import Path + +from paperforge.memory.schema import ( + ALL_TABLES, + ensure_schema, + drop_all_tables, + get_schema_version, + CURRENT_SCHEMA_VERSION, +) +from paperforge.memory.db import get_connection + + +def test_ensure_schema_creates_all_tables(): + with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp: + db_path = Path(tmp.name) + try: + conn = get_connection(db_path) + ensure_schema(conn) + cursor = conn.execute( + "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name" + ) + tables = {row["name"] for row in cursor.fetchall()} + for table in ALL_TABLES: + assert table in tables, f"Missing table: {table}" + conn.close() + finally: + db_path.unlink(missing_ok=True) + + +def test_drop_all_tables_clears_all(): + with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp: + db_path = Path(tmp.name) + conn = None + try: + conn = get_connection(db_path) + ensure_schema(conn) + drop_all_tables(conn) + cursor = conn.execute( + "SELECT name FROM sqlite_master WHERE type='table'" + ) + tables = {row["name"] for row in cursor.fetchall()} + app_tables = 
{t for t in tables if t in ALL_TABLES} + assert app_tables == set() + finally: + if conn: + conn.close() + db_path.unlink(missing_ok=True) + + +def test_get_schema_version_returns_zero_when_no_meta(): + with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp: + db_path = Path(tmp.name) + try: + conn = get_connection(db_path) + ensure_schema(conn) + assert get_schema_version(conn) == 0 + conn.close() + finally: + db_path.unlink(missing_ok=True) + + +def test_get_schema_version_returns_stored_value(): + with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp: + db_path = Path(tmp.name) + try: + conn = get_connection(db_path) + ensure_schema(conn) + conn.execute( + "INSERT INTO meta (key, value) VALUES ('schema_version', '1')" + ) + conn.commit() + assert get_schema_version(conn) == 1 + conn.close() + finally: + db_path.unlink(missing_ok=True) + + +def test_schema_version_mismatch_triggers_rebuild_semantics(): + """When stored version != CURRENT, get_schema_version returns a different int.""" + with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp: + db_path = Path(tmp.name) + try: + conn = get_connection(db_path) + ensure_schema(conn) + conn.execute( + "INSERT INTO meta (key, value) VALUES ('schema_version', '99')" + ) + conn.commit() + stored = get_schema_version(conn) + assert stored != CURRENT_SCHEMA_VERSION + conn.close() + finally: + db_path.unlink(missing_ok=True) From f6bc234aa705fc0582915fe9111786d1f1980c45 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 17:44:03 +0800 Subject: [PATCH 004/132] feat(memory): add builder module that populates SQLite from formal-library.json --- paperforge/memory/builder.py | 217 ++++++++++++++++++++++++++++++ tests/unit/memory/test_builder.py | 20 +++ 2 files changed, 237 insertions(+) create mode 100644 paperforge/memory/builder.py create mode 100644 tests/unit/memory/test_builder.py diff --git a/paperforge/memory/builder.py b/paperforge/memory/builder.py new file 
mode 100644 index 0000000..eb7a62d --- /dev/null +++ b/paperforge/memory/builder.py @@ -0,0 +1,217 @@ +from __future__ import annotations + +import hashlib +import json +import logging +from datetime import datetime, timezone +from pathlib import Path + +from paperforge import __version__ as PF_VERSION +from paperforge.memory.db import get_connection, get_memory_db_path +from paperforge.memory.schema import ( + CURRENT_SCHEMA_VERSION, + ensure_schema, + drop_all_tables, + get_schema_version, +) +from paperforge.worker.asset_index import read_index +from paperforge.worker.asset_state import ( + compute_lifecycle, + compute_maturity, + compute_next_step, +) + +logger = logging.getLogger(__name__) + +PAPER_COLUMNS = [ + "zotero_key", "citation_key", "title", "year", "doi", "pmid", + "journal", "first_author", "authors_json", "abstract", "domain", + "collection_path", "collections_json", + "has_pdf", "do_ocr", "analyze", "ocr_status", "deep_reading_status", + "ocr_job_id", "impact_factor", + "lifecycle", "maturity_level", "maturity_name", "next_step", + "pdf_path", "note_path", "main_note_path", "paper_root", + "fulltext_path", "ocr_md_path", "ocr_json_path", "ai_path", + "deep_reading_md_path", "updated_at", +] + +ASSET_FIELDS = [ + ("pdf", "pdf_path"), + ("formal_note", "note_path"), + ("main_note", "main_note_path"), + ("ocr_fulltext", "fulltext_path"), + ("ocr_meta", "ocr_json_path"), + ("deep_reading", "main_note_path"), + ("ai_dir", "ai_path"), +] + +ALIAS_TYPES = ["zotero_key", "citation_key", "title", "doi"] + + +def compute_hash(items: list[dict]) -> str: + sorted_items = sorted(items, key=lambda e: e["zotero_key"]) + raw = json.dumps(sorted_items, sort_keys=True, ensure_ascii=False) + return hashlib.sha256(raw.encode("utf-8")).hexdigest() + + +def _resolve_vault_path(vault: Path, rel_path: str) -> Path: + if not rel_path: + return Path() + p = vault / rel_path + return p.resolve() if p.exists() else p + + +def build_from_index(vault: Path) -> dict: + """Read 
formal-library.json and build/rebuild paperforge.db. + + Returns a dict with counts for reporting. + """ + envelope = read_index(vault) + if envelope is None: + raise FileNotFoundError( + "Canonical index not found. Run paperforge sync --rebuild-index." + ) + # Legacy format: bare list of entries (pre-envelope) + if isinstance(envelope, list): + items = envelope + generated_at = "" + else: + items = envelope.get("items", []) + generated_at = envelope.get("generated_at", "") + if isinstance(items, list) and items and isinstance(items[0], dict): + canonical_hash = compute_hash(items) + else: + canonical_hash = "" + + db_path = get_memory_db_path(vault) + conn = get_connection(db_path, read_only=False) + try: + stored_version = get_schema_version(conn) + if stored_version != CURRENT_SCHEMA_VERSION: + drop_all_tables(conn) + ensure_schema(conn) + + conn.execute("DELETE FROM paper_aliases;") + conn.execute("DELETE FROM paper_assets;") + conn.execute("DELETE FROM papers;") + + now_utc = datetime.now(timezone.utc).isoformat() + papers_count = 0 + assets_count = 0 + aliases_count = 0 + + for entry in items: + zotero_key = entry.get("zotero_key", "") + if not zotero_key: + continue + + lifecycle = str(compute_lifecycle(entry)) + maturity = compute_maturity(entry) + next_step = str(compute_next_step(entry)) + + paper_values = {} + for col in PAPER_COLUMNS: + if col == "authors_json": + paper_values[col] = json.dumps( + entry.get("authors", []), ensure_ascii=False + ) + elif col == "collections_json": + paper_values[col] = json.dumps( + entry.get("collections", []), ensure_ascii=False + ) + elif col == "lifecycle": + paper_values[col] = lifecycle + elif col == "maturity_level": + paper_values[col] = maturity.get("level", 1) + elif col == "maturity_name": + paper_values[col] = maturity.get("level_name", "") + elif col == "next_step": + paper_values[col] = next_step + elif col == "updated_at": + paper_values[col] = generated_at + elif col in ("do_ocr", "analyze"): + val = 
entry.get(col) + paper_values[col] = 1 if val else 0 + elif col == "has_pdf": + paper_values[col] = 1 if entry.get("has_pdf") else 0 + else: + paper_values[col] = entry.get(col, "") + + placeholders = ", ".join([f":{c}" for c in PAPER_COLUMNS]) + cols = ", ".join(PAPER_COLUMNS) + conn.execute( + f"INSERT OR REPLACE INTO papers ({cols}) VALUES ({placeholders})", + paper_values, + ) + papers_count += 1 + + for asset_type, entry_field in ASSET_FIELDS: + path_val = entry.get(entry_field, "") + if not path_val: + continue + rel_path = str(path_val).replace("\\", "/") + abs_path = _resolve_vault_path(vault, rel_path) + exists = 1 if abs_path.exists() else 0 + + if asset_type == "deep_reading": + if abs_path.exists(): + try: + content = abs_path.read_text(encoding="utf-8") + exists = 1 if "## 🔍 精读" in content else 0 + except Exception: + exists = 0 + + conn.execute( + """INSERT OR REPLACE INTO paper_assets + (paper_id, asset_type, path, exists_on_disk) + VALUES (?, ?, ?, ?)""", + (zotero_key, asset_type, rel_path, exists), + ) + assets_count += 1 + + for alias_type in ALIAS_TYPES: + raw_val = entry.get(alias_type, "") + if not raw_val: + continue + raw_str = str(raw_val) + conn.execute( + """INSERT OR REPLACE INTO paper_aliases + (paper_id, alias, alias_norm, alias_type) + VALUES (?, ?, ?, ?)""", + ( + zotero_key, + raw_str, + raw_str.lower().strip(), + alias_type, + ), + ) + aliases_count += 1 + + meta_upserts = [ + ("schema_version", str(CURRENT_SCHEMA_VERSION)), + ("paperforge_version", PF_VERSION), + ("created_at", now_utc), + ("last_full_build_at", now_utc), + ("canonical_index_hash", canonical_hash), + ("canonical_index_generated_at", generated_at), + ] + for key, value in meta_upserts: + conn.execute( + """INSERT OR REPLACE INTO meta (key, value) VALUES (?, ?)""", + (key, value), + ) + + conn.commit() + + return { + "db_path": str(db_path), + "papers_indexed": papers_count, + "assets_indexed": assets_count, + "aliases_indexed": aliases_count, + "schema_version": 
str(CURRENT_SCHEMA_VERSION), + } + except Exception: + conn.rollback() + raise + finally: + conn.close() diff --git a/tests/unit/memory/test_builder.py b/tests/unit/memory/test_builder.py new file mode 100644 index 0000000..44eac9c --- /dev/null +++ b/tests/unit/memory/test_builder.py @@ -0,0 +1,20 @@ +from __future__ import annotations + +from paperforge.memory.builder import compute_hash + + +def test_compute_hash_deterministic(): + items1 = [{"zotero_key": "A"}, {"zotero_key": "B"}] + items2 = [{"zotero_key": "B"}, {"zotero_key": "A"}] + assert compute_hash(items1) == compute_hash(items2) + + +def test_compute_hash_different_for_different_data(): + items1 = [{"zotero_key": "A", "title": "X"}] + items2 = [{"zotero_key": "A", "title": "Y"}] + assert compute_hash(items1) != compute_hash(items2) + + +def test_compute_hash_handles_empty(): + assert compute_hash([]) == compute_hash([]) + assert len(compute_hash([])) == 64 # SHA-256 hex From afcbd3ce27590bc539a8eed143b3a5eb86c77619 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 17:47:30 +0800 Subject: [PATCH 005/132] feat(memory): add query module for paper lookup and status check --- paperforge/memory/query.py | 188 ++++++++++++++++++++++++++++++++ tests/unit/memory/test_query.py | 11 ++ 2 files changed, 199 insertions(+) create mode 100644 paperforge/memory/query.py create mode 100644 tests/unit/memory/test_query.py diff --git a/paperforge/memory/query.py b/paperforge/memory/query.py new file mode 100644 index 0000000..5e0df91 --- /dev/null +++ b/paperforge/memory/query.py @@ -0,0 +1,188 @@ +from __future__ import annotations + +import json +import logging +from pathlib import Path + +from paperforge.memory.builder import compute_hash +from paperforge.memory.db import get_connection, get_memory_db_path +from paperforge.memory.schema import CURRENT_SCHEMA_VERSION, get_schema_version +from paperforge.worker.asset_index import read_index +from paperforge.worker.asset_state import compute_health 
+ +logger = logging.getLogger(__name__) + + +def get_memory_status(vault: Path) -> dict: + """Check paperforge.db health and staleness. + + Returns a dict with: db_exists, schema_ok, fresh, count_match, + paper_count_db, paper_count_index, needs_rebuild. + """ + db_path = get_memory_db_path(vault) + result = { + "db_exists": db_path.exists(), + "schema_ok": False, + "fresh": False, + "hash_match": False, + "count_match": False, + "paper_count_db": 0, + "paper_count_index": 0, + "needs_rebuild": True, + } + if not db_path.exists(): + return result + + conn = get_connection(db_path, read_only=True) + try: + stored_version = get_schema_version(conn) + result["schema_ok"] = stored_version == CURRENT_SCHEMA_VERSION + row = conn.execute("SELECT COUNT(*) as cnt FROM papers").fetchone() + result["paper_count_db"] = row["cnt"] if row else 0 + stored_hash_row = conn.execute( + "SELECT value FROM meta WHERE key = 'canonical_index_hash'" + ).fetchone() + stored_hash = stored_hash_row["value"] if stored_hash_row else "" + except Exception: + return result + finally: + conn.close() + + envelope = read_index(vault) + if envelope is not None: + # Handle legacy format (bare list) + if isinstance(envelope, list): + items = envelope + paper_count = len(items) + index_hash = compute_hash(items) + else: + items = envelope.get("items", []) + paper_count = envelope.get("paper_count", 0) + index_hash = compute_hash(items) + result["paper_count_index"] = paper_count + + # Compare stored hash with computed hash + result["hash_match"] = stored_hash == index_hash + + result["count_match"] = ( + result["paper_count_db"] == result["paper_count_index"] + ) + + result["fresh"] = ( + result["schema_ok"] + and result["count_match"] + and result.get("hash_match", False) + ) + result["needs_rebuild"] = not result["fresh"] + return result + + +def _entry_from_row(row) -> dict: + """Reconstruct an entry dict from a papers row (sqlite3.Row).""" + entry = {k: row[k] for k in row.keys()} + for key in 
("has_pdf", "do_ocr", "analyze"): + if key in entry and entry[key] is not None: + entry[key] = bool(entry[key]) + for key in ("authors_json", "collections_json"): + if key in entry and entry[key]: + try: + entry[key[:-5]] = json.loads(entry[key]) + del entry[key] + except json.JSONDecodeError: + logger.warning( + "Corrupted JSON in column %s for paper %s", + key, entry.get("zotero_key", "?"), + ) + return entry + + +def lookup_paper(conn, query: str) -> list[dict]: + """Multi-strategy lookup. Returns list of matching paper dicts.""" + q = query.strip() + + for lookup_col in ("zotero_key", "citation_key", "doi"): + row = conn.execute( + f"SELECT * FROM papers WHERE LOWER({lookup_col}) = LOWER(?)", + (q,), + ).fetchone() + if row: + return [_entry_from_row(row)] + + rows = conn.execute( + """SELECT * FROM papers + WHERE LOWER(title) LIKE '%' || LOWER(?) || '%' + LIMIT 20""", + (q,), + ).fetchall() + if rows: + return [_entry_from_row(r) for r in rows] + + rows = conn.execute( + """SELECT p.* FROM papers p + JOIN paper_aliases a ON a.paper_id = p.zotero_key + WHERE a.alias_norm LIKE '%' || LOWER(?) || '%' + LIMIT 20""", + (q,), + ).fetchall() + return [_entry_from_row(r) for r in rows] + + +def get_paper_assets(conn, zotero_key: str) -> list[dict]: + rows = conn.execute( + "SELECT asset_type, path, exists_on_disk FROM paper_assets WHERE paper_id = ?", + (zotero_key,), + ).fetchall() + return [dict(r) for r in rows] + + +def get_paper_status(vault: Path, query: str) -> dict | None: + """Full paper status lookup. Returns dict or None if not found. + + If multiple candidates found, returns a candidate list without full status. 
+ """ + db_path = get_memory_db_path(vault) + if not db_path.exists(): + return None + + conn = get_connection(db_path, read_only=True) + try: + entries = lookup_paper(conn, query) + if not entries: + return None + + # Multiple candidates -> return candidate list only (no full status) + if len(entries) > 1: + return { + "resolved": False, + "candidates": [ + { + "zotero_key": e.get("zotero_key"), + "title": e.get("title"), + "year": e.get("year"), + "citation_key": e.get("citation_key"), + "lifecycle": e.get("lifecycle"), + } + for e in entries + ], + } + + entry = entries[0] + assets = get_paper_assets(conn, entry["zotero_key"]) + entry["health"] = compute_health(entry) + entry["assets"] = assets + entry["resolved"] = True + + next_step = entry.get("next_step", "") + zk = entry.get("zotero_key", "") + if next_step == "/pf-deep": + entry["recommended_action"] = f"/pf-deep {zk}" + elif next_step == "ocr": + entry["recommended_action"] = f"paperforge ocr --key {zk}" + elif next_step == "sync": + entry["recommended_action"] = "paperforge sync" + else: + entry["recommended_action"] = None + + return entry + finally: + conn.close() diff --git a/tests/unit/memory/test_query.py b/tests/unit/memory/test_query.py new file mode 100644 index 0000000..47db10b --- /dev/null +++ b/tests/unit/memory/test_query.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from pathlib import Path + +from paperforge.memory.query import get_memory_status + + +def test_get_memory_status_returns_needs_rebuild_when_no_db(): + result = get_memory_status(Path("/nonexistent/vault")) + assert result["db_exists"] is False + assert result["needs_rebuild"] is True From 3c906e89421a75d1ab98798f73a8cdecc6c03959 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 17:52:30 +0800 Subject: [PATCH 006/132] feat(cli): add memory build/status and paper-status commands --- paperforge/cli.py | 22 +++++++ paperforge/commands/__init__.py | 2 + paperforge/commands/memory.py | 91 
+++++++++++++++++++++++++++++ paperforge/commands/paper_status.py | 70 ++++++++++++++++++++++ 4 files changed, 185 insertions(+) create mode 100644 paperforge/commands/memory.py create mode 100644 paperforge/commands/paper_status.py diff --git a/paperforge/cli.py b/paperforge/cli.py index d46b5e9..b56449c 100644 --- a/paperforge/cli.py +++ b/paperforge/cli.py @@ -258,6 +258,18 @@ def build_parser() -> argparse.ArgumentParser: p_dash = sub.add_parser("dashboard", help="Aggregated stats and permissions for the plugin dashboard") p_dash.add_argument("--json", action="store_true", help="Output as PFResult JSON") + # Memory Layer commands + p_memory = sub.add_parser("memory", help="Manage the Memory Layer") + p_memory_sp = p_memory.add_subparsers(dest="memory_subcommand", required=True) + p_memory_build = p_memory_sp.add_parser("build", help="Build the memory database from canonical index") + p_memory_build.add_argument("--json", action="store_true", help="Output as JSON") + p_memory_status = p_memory_sp.add_parser("status", help="Check memory database status") + p_memory_status.add_argument("--json", action="store_true", help="Output as JSON") + + p_paper_status = sub.add_parser("paper-status", help="Look up a paper's status") + p_paper_status.add_argument("query", help="Paper identifier (zotero_key, DOI, title, alias)") + p_paper_status.add_argument("--json", action="store_true", help="Output as JSON") + # base-refresh p_base = sub.add_parser("base-refresh", help="Refresh Obsidian Base view files") p_base.add_argument( @@ -470,6 +482,16 @@ def main(argv: list[str] | None = None) -> int: return dashboard.run(args) + if args.command == "memory": + from paperforge.commands.memory import run + + return run(args) + + if args.command == "paper-status": + from paperforge.commands.paper_status import run + + return run(args) + if args.command == "base-refresh": force = getattr(args, "force", False) paths = args.paths diff --git a/paperforge/commands/__init__.py 
b/paperforge/commands/__init__.py index 63dc3ad..9306159 100644 --- a/paperforge/commands/__init__.py +++ b/paperforge/commands/__init__.py @@ -10,6 +10,8 @@ "context": "paperforge.commands.context", "dashboard": "paperforge.commands.dashboard", "finalize": "paperforge.commands.finalize", + "memory": "paperforge.commands.memory", + "paper-status": "paperforge.commands.paper_status", } diff --git a/paperforge/commands/memory.py b/paperforge/commands/memory.py new file mode 100644 index 0000000..a0e8d65 --- /dev/null +++ b/paperforge/commands/memory.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +import argparse +import sys + +from paperforge import __version__ as PF_VERSION +from paperforge.core.errors import ErrorCode +from paperforge.core.result import PFError, PFResult +from paperforge.memory.builder import build_from_index +from paperforge.memory.query import get_memory_status + + +def run(args: argparse.Namespace) -> int: + vault = args.vault_path + sub_cmd = args.memory_subcommand + + if sub_cmd == "build": + try: + counts = build_from_index(vault) + result = PFResult( + ok=True, + command="memory build", + version=PF_VERSION, + data=counts, + ) + except FileNotFoundError: + result = PFResult( + ok=False, + command="memory build", + version=PF_VERSION, + error=PFError( + code=ErrorCode.PATH_NOT_FOUND, + message="Canonical index not found. 
Run paperforge sync --rebuild-index.", + ), + next_actions=[ + { + "command": "paperforge sync --rebuild-index", + "reason": "Generate formal-library.json first", + } + ], + ) + except Exception as exc: + result = PFResult( + ok=False, + command="memory build", + version=PF_VERSION, + error=PFError( + code=ErrorCode.INTERNAL_ERROR, + message=str(exc), + ), + ) + if args.json: + print(result.to_json()) + else: + if result.ok: + print(f"Memory built: {result.data}") + else: + print(f"Error: {result.error.message}", file=sys.stderr) + return 0 if result.ok else 1 + + if sub_cmd == "status": + try: + status = get_memory_status(vault) + result = PFResult( + ok=True, + command="memory status", + version=PF_VERSION, + data=status, + ) + except Exception as exc: + result = PFResult( + ok=False, + command="memory status", + version=PF_VERSION, + error=PFError( + code=ErrorCode.INTERNAL_ERROR, + message=str(exc), + ), + ) + if args.json: + print(result.to_json()) + else: + if result.ok: + for k, v in status.items(): + print(f" {k}: {v}") + else: + print(f"Error: {result.error.message}", file=sys.stderr) + return 0 if result.ok else 1 + + print(f"Unknown memory subcommand: {sub_cmd}", file=sys.stderr) + return 1 diff --git a/paperforge/commands/paper_status.py b/paperforge/commands/paper_status.py new file mode 100644 index 0000000..34b38aa --- /dev/null +++ b/paperforge/commands/paper_status.py @@ -0,0 +1,70 @@ +from __future__ import annotations + +import argparse +import sys + +from paperforge import __version__ as PF_VERSION +from paperforge.core.errors import ErrorCode +from paperforge.core.result import PFError, PFResult +from paperforge.memory.query import get_paper_status + + +def run(args: argparse.Namespace) -> int: + vault = args.vault_path + query = args.query + + try: + status = get_paper_status(vault, query) + if status is None: + result = PFResult( + ok=False, + command="paper-status", + version=PF_VERSION, + error=PFError( + code=ErrorCode.PATH_NOT_FOUND, + 
message=f"No paper found for: {query}", + ), + next_actions=[ + { + "command": "paperforge search", + "reason": "Search for papers by keyword", + } + ], + ) + else: + result = PFResult( + ok=True, + command="paper-status", + version=PF_VERSION, + data=status, + ) + except Exception as exc: + result = PFResult( + ok=False, + command="paper-status", + version=PF_VERSION, + error=PFError( + code=ErrorCode.INTERNAL_ERROR, + message=str(exc), + ), + ) + + if args.json: + print(result.to_json()) + else: + if result.ok: + data = result.data + if data.get("resolved"): + print(f"Zotero Key: {data.get('zotero_key', '')}") + print(f"Title: {data.get('title', '')}") + print(f"Year: {data.get('year', '')}") + print(f"Lifecycle: {data.get('lifecycle', '')}") + print(f"Next Step: {data.get('next_step', '')}") + if data.get("candidates"): + print(f"\nMultiple candidates: {len(data['candidates'])}") + for c in data["candidates"]: + print(f" - {c['zotero_key']}: {c['title']} ({c['year']})") + else: + print(f"Error: {result.error.message}", file=sys.stderr) + + return 0 if result.ok else 1 From 65cbf867e631bfa679b83eba4c8b91018f2227bf Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 17:59:37 +0800 Subject: [PATCH 007/132] test(memory): add integration test for memory build/status workflow --- pyproject.toml | 3 +- tests/integration/__init__.py | 0 tests/integration/test_memory_workflow.py | 69 +++++++++++++++++++++++ 3 files changed, 71 insertions(+), 1 deletion(-) create mode 100644 tests/integration/__init__.py create mode 100644 tests/integration/test_memory_workflow.py diff --git a/pyproject.toml b/pyproject.toml index 72fc765..9d84b07 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,7 +70,7 @@ paperforge = [ [tool.pytest.ini_options] addopts = "--ignore=tests/sandbox/00_TestVault/ --strict-markers" -testpaths = ["tests/unit", "tests/cli", "tests/e2e", "tests/journey", "tests/chaos", "tests/audit"] +testpaths = ["tests/unit", "tests/cli", 
"tests/e2e", "tests/journey", "tests/chaos", "tests/audit", "tests/integration"] markers = [ "unit: Unit tests (Level 1) — fast, isolated", "cli: CLI contract tests (Level 2) — subprocess boundary", @@ -78,6 +78,7 @@ markers = [ "journey: User journey tests (Level 5) — full workflows", "chaos: Destructive tests (Level 6) — abnormal scenarios", "audit: Consistency audit tests — validate L1 mocks against L4 real pipeline output", + "integration: Integration tests — multi-component workflows", "slow: Tests that take >30s (skip during development)", "snapshot: Tests that use snapshot comparison", ] diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/integration/test_memory_workflow.py b/tests/integration/test_memory_workflow.py new file mode 100644 index 0000000..2118256 --- /dev/null +++ b/tests/integration/test_memory_workflow.py @@ -0,0 +1,69 @@ +from __future__ import annotations + +import json +import os +import sqlite3 +import subprocess +from pathlib import Path + +import pytest + +from paperforge.memory.db import get_memory_db_path + + +@pytest.mark.integration +def test_memory_build_and_status_with_test_vault(test_vault: Path): + """End-to-end: sync -> memory build -> memory status -> paper-status.""" + pf = ["python", "-m", "paperforge", "--vault", str(test_vault)] + env = {**os.environ, "PYTHONIOENCODING": "utf-8"} + + # 1. Sync to ensure formal-library.json exists + result = subprocess.run( + pf + ["sync", "--json"], capture_output=True, text=True, encoding="utf-8", env=env + ) + if result.returncode != 0: + pytest.skip("Sync failed -- test vault may lack export files") + + # 2. 
Memory build + result = subprocess.run( + pf + ["memory", "build", "--json"], capture_output=True, text=True, encoding="utf-8", env=env + ) + assert result.returncode == 0, f"memory build failed: {result.stderr}" + data = json.loads(result.stdout) + assert data["ok"] is True, f"build result not ok: {data}" + assert data["data"]["papers_indexed"] > 0, "expected at least 1 paper indexed" + + # 3. Memory status + result = subprocess.run( + pf + ["memory", "status", "--json"], capture_output=True, text=True, encoding="utf-8", env=env + ) + assert result.returncode == 0 + data = json.loads(result.stdout) + assert data["data"]["fresh"] is True, f"memory not fresh: {data['data']}" + assert data["data"]["needs_rebuild"] is False + + # 4. Paper-status lookup by zotero_key + papers_json = subprocess.run( + pf + ["memory", "status", "--json"], capture_output=True, text=True, encoding="utf-8", env=env + ) + status_data = json.loads(papers_json.stdout) + paper_count = status_data["data"]["paper_count_db"] + + if paper_count > 0: + # Get first paper's zotero_key from the db + db_path = get_memory_db_path(test_vault) + conn = sqlite3.connect(str(db_path)) + conn.row_factory = sqlite3.Row + row = conn.execute("SELECT zotero_key FROM papers LIMIT 1").fetchone() + conn.close() + + if row: + key = row["zotero_key"] + result = subprocess.run( + pf + ["paper-status", key, "--json"], + capture_output=True, text=True, encoding="utf-8", env=env, + ) + assert result.returncode == 0 + data = json.loads(result.stdout) + assert data["ok"] is True + assert data["data"]["resolved"] is True From 95ca8fceb8b80d4757c6580ef3073ebe93b7570a Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 18:00:52 +0800 Subject: [PATCH 008/132] style(memory): apply ruff fixes (ternary, nested-if, dict-keys) --- paperforge/memory/builder.py | 20 ++++++++------------ paperforge/memory/query.py | 2 +- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/paperforge/memory/builder.py 
b/paperforge/memory/builder.py index eb7a62d..84896c6 100644 --- a/paperforge/memory/builder.py +++ b/paperforge/memory/builder.py @@ -10,8 +10,8 @@ from paperforge.memory.db import get_connection, get_memory_db_path from paperforge.memory.schema import ( CURRENT_SCHEMA_VERSION, - ensure_schema, drop_all_tables, + ensure_schema, get_schema_version, ) from paperforge.worker.asset_index import read_index @@ -78,10 +78,7 @@ def build_from_index(vault: Path) -> dict: else: items = envelope.get("items", []) generated_at = envelope.get("generated_at", "") - if isinstance(items, list) and items and isinstance(items[0], dict): - canonical_hash = compute_hash(items) - else: - canonical_hash = "" + canonical_hash = compute_hash(items) if isinstance(items, list) and items and isinstance(items[0], dict) else "" db_path = get_memory_db_path(vault) conn = get_connection(db_path, read_only=False) @@ -153,13 +150,12 @@ def build_from_index(vault: Path) -> dict: abs_path = _resolve_vault_path(vault, rel_path) exists = 1 if abs_path.exists() else 0 - if asset_type == "deep_reading": - if abs_path.exists(): - try: - content = abs_path.read_text(encoding="utf-8") - exists = 1 if "## 🔍 精读" in content else 0 - except Exception: - exists = 0 + if asset_type == "deep_reading" and abs_path.exists(): + try: + content = abs_path.read_text(encoding="utf-8") + exists = 1 if "## 🔍 精读" in content else 0 + except Exception: + exists = 0 conn.execute( """INSERT OR REPLACE INTO paper_assets diff --git a/paperforge/memory/query.py b/paperforge/memory/query.py index 5e0df91..365360e 100644 --- a/paperforge/memory/query.py +++ b/paperforge/memory/query.py @@ -79,7 +79,7 @@ def get_memory_status(vault: Path) -> dict: def _entry_from_row(row) -> dict: """Reconstruct an entry dict from a papers row (sqlite3.Row).""" - entry = {k: row[k] for k in row.keys()} + entry = {k: row[k] for k in row} for key in ("has_pdf", "do_ocr", "analyze"): if key in entry and entry[key] is not None: entry[key] = 
bool(entry[key]) From 339ffe2b56f99418e440c1d3c951a8da9e39702d Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 18:01:21 +0800 Subject: [PATCH 009/132] feat(memory): add schema exports to __init__.py --- paperforge/memory/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paperforge/memory/__init__.py b/paperforge/memory/__init__.py index 97d74fc..5585cd6 100644 --- a/paperforge/memory/__init__.py +++ b/paperforge/memory/__init__.py @@ -1,8 +1,11 @@ from __future__ import annotations from paperforge.memory.db import get_connection, get_memory_db_path +from paperforge.memory.schema import ensure_schema, drop_all_tables __all__ = [ "get_connection", "get_memory_db_path", + "ensure_schema", + "drop_all_tables", ] From 02c1e5b8a0afc8c15d3e607474e84569866d2de2 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 18:11:37 +0800 Subject: [PATCH 010/132] bump: 1.5.5 -> 1.5.6rc1 (Memory Layer RC1) --- .github/workflows/ci-chaos.yml | 37 + .github/workflows/ci.yml | 114 ++ .github/workflows/release.yml | 42 + .../2026-05-12-memory-layer-REVIEW-v3.md | 101 ++ .../plans/2026-05-12-memory-layer-REVIEW.md | 377 +++++ .../plans/2026-05-12-memory-layer.md | 1235 +++++++++++++++++ .../specs/2026-05-12-memory-layer-design.md | 279 ++++ manifest.json | 2 +- paperforge/__init__.py | 2 +- paperforge/plugin/manifest.json | 2 +- 10 files changed, 2188 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/ci-chaos.yml create mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/release.yml create mode 100644 docs/superpowers/plans/2026-05-12-memory-layer-REVIEW-v3.md create mode 100644 docs/superpowers/plans/2026-05-12-memory-layer-REVIEW.md create mode 100644 docs/superpowers/plans/2026-05-12-memory-layer.md create mode 100644 docs/superpowers/specs/2026-05-12-memory-layer-design.md diff --git a/.github/workflows/ci-chaos.yml b/.github/workflows/ci-chaos.yml new file mode 100644 index 
0000000..5eb83a0 --- /dev/null +++ b/.github/workflows/ci-chaos.yml @@ -0,0 +1,37 @@ +name: Chaos Tests (L6) + +on: + schedule: + # Weekly: Sunday 06:00 UTC + - cron: "0 6 * * 0" + workflow_dispatch: + # Manual trigger from GitHub UI + +jobs: + chaos-tests: + name: Chaos / Destructive Tests + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install package with test dependencies + run: | + pip install -e ".[test]" + pip install pytest pytest-timeout responses PyYAML + + - name: Run chaos tests + run: | + python -m pytest tests/chaos/ -m chaos -v --tb=long --timeout=120 \ + --junit-xml=chaos-results.xml + + - name: Upload test results + if: always() + uses: actions/upload-artifact@v4 + with: + name: chaos-test-results + path: chaos-results.xml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..0dccc57 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,114 @@ +# CI — Simplified Pipeline +# Runs on push/PR to master. All tests run (no -x early exit). + +name: CI + +on: + push: + branches: [main, master] + paths-ignore: + - "**.md" + - "docs/**" + pull_request: + branches: [main, master] + +env: + PYTHONIOENCODING: utf-8 + +jobs: + # --------------------------------------------------------------------------- + # L0 — Version consistency check + # --------------------------------------------------------------------------- + version-check: + name: L0 — Version Sync + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + - name: Install package + run: pip install -e . 
+ - name: Check version consistency + run: python scripts/check_version_sync.py + + # --------------------------------------------------------------------------- + # L1 — Unit tests (3 OS x 1 Python) + # --------------------------------------------------------------------------- + unit-tests: + name: L1 — Unit Tests (${{ matrix.os }}, py3.11) + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + - name: Install package with test deps + run: pip install -e ".[test]" + - name: Run unit tests + shell: bash + run: | + python -m pytest tests/ \ + --ignore=tests/sandbox \ + --ignore=tests/cli \ + --ignore=tests/e2e \ + --ignore=tests/journey \ + --ignore=tests/chaos \ + --ignore=tests/audit \ + -v --tb=short --timeout=60 + + # --------------------------------------------------------------------------- + # L3 — Plugin tests (Vitest) + # --------------------------------------------------------------------------- + plugin-tests: + name: L3 — Plugin Tests + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: + node-version: 20 + cache: "npm" + cache-dependency-path: paperforge/plugin/package-lock.json + - run: npm ci + working-directory: paperforge/plugin + - run: npx vitest run --reporter=verbose + working-directory: paperforge/plugin + + # --------------------------------------------------------------------------- + # L4 — E2E + Audit + # --------------------------------------------------------------------------- + e2e-tests: + name: L4 — E2E + Audit + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + - name: Install package with test deps + run: pip install -e ".[test]" + - name: Run E2E tests + run: python -m pytest tests/e2e/ -m e2e -v --tb=short --timeout=120 + - name: 
Run audit tests + run: python -m pytest tests/audit/ -m audit -v --tb=short --timeout=120 + + # --------------------------------------------------------------------------- + # Merge gate + # --------------------------------------------------------------------------- + alls-green: + name: All Checks Passed + if: always() + needs: + - unit-tests + - plugin-tests + runs-on: ubuntu-latest + steps: + - uses: re-actors/alls-green@v1.2.2 + with: + allowed-skips: version-check + jobs: ${{ toJSON(needs) }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..c8aab79 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,42 @@ +# Auto-release Obsidian plugin on tag push +# Triggered by tags like v1.4.18. Runs plugin tests, then creates a GitHub Release +# with the 4 required Obsidian plugin files. + +name: Release + +on: + push: + tags: + - "v*" + +jobs: + release: + name: Release Plugin + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-node@v4 + with: + node-version: 20 + cache: "npm" + cache-dependency-path: paperforge/plugin/package-lock.json + + - name: Run plugin tests + working-directory: paperforge/plugin + run: | + npm ci + npx vitest run --reporter=verbose + + - name: Create Release + uses: softprops/action-gh-release@v2 + with: + name: ${{ github.ref_name }} + generate_release_notes: true + files: | + paperforge/plugin/main.js + paperforge/plugin/styles.css + paperforge/plugin/manifest.json + paperforge/plugin/versions.json diff --git a/docs/superpowers/plans/2026-05-12-memory-layer-REVIEW-v3.md b/docs/superpowers/plans/2026-05-12-memory-layer-REVIEW-v3.md new file mode 100644 index 0000000..5b2cd7b --- /dev/null +++ b/docs/superpowers/plans/2026-05-12-memory-layer-REVIEW-v3.md @@ -0,0 +1,101 @@ +--- +phase: memory-layer-plan-v3-quick-check +reviewed: 2026-05-12T09:25:08Z +depth: standard +files_reviewed: 1 +files_reviewed_list: + 
- docs/superpowers/plans/2026-05-12-memory-layer.md +findings: + critical: 0 + warning: 1 + info: 0 + total: 1 +status: issues_found +--- + +# Phase: Memory Layer Plan v3 Quick Check + +**Reviewed:** 2026-05-12T09:25:08Z +**Depth:** standard (plan-only, cross-referenced against codebase for `--key` validation) +**Files Reviewed:** 1 +**Status:** ISSUES_FOUND (1 WARNING remaining) + +--- + +## Summary + +Quick final check of the implementation plan after v3 review fixes. All 5 named issues from the prior review are **confirmed fixed**. The plan incorporates all 14 fixes from the original v1 deep review (5 CR + 5 WR + 4 IN). One new WARNING-level issue identified in the `_entry_from_row` function. + +--- + +## Named Issue Verification + +| Issue | Status | Evidence | +|-------|--------|----------| +| **N-BLKR-01**: hash query inside try block | **FIXED** | Lines 713-716 — `stored_hash_row = conn.execute(...)` is inside the `try:` block at line 708 | +| **N-BLKR-02**: NameError on `status` in memory.py | **FIXED** | Line 985 — `if result.ok:` guards access to `status` on line 986. `result` is always assigned in both try/except branches | +| **N-WRN-01**: paper_status empty fields for unresolved | **FIXED** | Line 1055 — `if data.get("resolved"):` guards detailed field printing | +| **N-INFO-01**: private `_compute_hash` renamed to `compute_hash` | **FIXED** | Line 451 — `def compute_hash(...)` (public). Line 683 — `from paperforge.memory.builder import compute_hash` | +| **N-INFO-02**: JSON decode logged with `logging.warning` | **FIXED** | Lines 762-764 — `logging.warning("Corrupted JSON in column %s for paper %s", key, ...)` | + +All 5 named issues from the prior review are resolved in the plan. 
+ +--- + +## Original v1 Review Issue Verification (bonus) + +Cross-checked all 14 issues from `2026-05-12-memory-layer-REVIEW.md`: + +| Issue | Status | +|-------|--------| +| CR-01: `make_result` import | **FIXED** — line 910 imports only `PFError, PFResult` | +| CR-02: hash not checked | **FIXED** — lines 713-746 compare stored hash vs computed | +| CR-03: legacy format crash in builder | **FIXED** — lines 475-480 handle `isinstance(envelope, list)` | +| CR-04: legacy format crash in query | **FIXED** — lines 725-733 handle bare list | +| CR-05: Windows-path URI bug | **FIXED** — line 122 uses `db_path.as_posix()` | +| WR-01: `--force` flag | **FIXED** — removed from CLI parser (lines 1080-1082) | +| WR-02: ambiguous query returns full status | **FIXED** — lines 823-838 return candidates only when >1 | +| WR-03: recommended_action missing | **FIXED** — lines 846-855 compute concrete action strings | +| WR-04: zero test coverage | **REMAINS** — plan still has only 4 schema + 3 hash tests | +| WR-05: CLI dispatch pattern | **FIXED** — lines 1089-1099 use simple dispatch | +| IN-01: unused compute_health import | **FIXED** — removed from builder imports (lines 417-422) | +| IN-02: _COMMAND_REGISTRY not consumed | **REMAINS** — still present but rate-limited to INFO | +| IN-03: compute_hash .get vs direct | **FIXED** — line 452 uses `e["zotero_key"]` (direct access) | +| IN-04: fragile rstrip("_json") | **FIXED** — line 760 uses `key[:-5]` instead of `rstrip` | + +--- + +## Warnings + +### WR-V3-01: Data silently lost when JSON decode fails in `_entry_from_row` + +**File:** `docs/superpowers/plans/2026-05-12-memory-layer.md:759-760` +**Issue:** When `json.loads()` raises `JSONDecodeError`, `entry.pop(key)` has already executed — the original `_json` column value is removed from the result dict and never restored. The field disappears silently from query output. 
+ +```python +# Current (plan line 759-760) +try: + entry[key[:-5]] = json.loads(entry.pop(key)) # pop() happens BEFORE json.loads() +except json.JSONDecodeError: + logging.warning(...) # original value already lost +``` + +**Fix:** +```python +# Pop first, then try to decode, restore on failure +raw = entry.pop(key) +try: + entry[key[:-5]] = json.loads(raw) +except json.JSONDecodeError: + entry[key] = raw # keep original JSON string visible + logging.warning( + "Corrupted JSON in column %s for paper %s", + key, entry.get("zotero_key", "?"), + ) +``` + +--- + +_Reviewed: 2026-05-12T09:25:08Z_ +_Reviewer: VT-OS/OPENCODE (gsd-code-reviewer)_ +_Depth: standard_ diff --git a/docs/superpowers/plans/2026-05-12-memory-layer-REVIEW.md b/docs/superpowers/plans/2026-05-12-memory-layer-REVIEW.md new file mode 100644 index 0000000..39c6269 --- /dev/null +++ b/docs/superpowers/plans/2026-05-12-memory-layer-REVIEW.md @@ -0,0 +1,377 @@ +--- +phase: memory-layer-plan-review +reviewed: 2026-05-12T18:30:00Z +depth: deep +files_reviewed: 9 +files_reviewed_list: + - docs/superpowers/plans/2026-05-12-memory-layer.md + - docs/superpowers/specs/2026-05-12-memory-layer-design.md + - paperforge/config.py + - paperforge/cli.py + - paperforge/commands/__init__.py + - paperforge/core/result.py + - paperforge/core/errors.py + - paperforge/worker/asset_state.py + - paperforge/worker/asset_index.py +findings: + critical: 5 + warning: 5 + info: 4 + total: 14 +status: issues_found +--- + +# Phase: Memory Layer Plan Review + +**Reviewed:** 2026-05-12T18:30:00Z +**Depth:** deep (cross-file analysis with import graph tracing) +**Files Reviewed:** 9 +**Status:** ISSUES_FOUND + +## Verdict: ISSUES_FOUND + +5 BLOCKER, 5 WARNING, 4 INFO issues detected. Plan must not be executed until BLOCKER items are resolved. 
+ +--- + +## Summary + +The plan maps spec requirements to tasks with reasonable granularity, and the overall architecture (SQLite under `paperforge/memory/`, derived from `formal-library.json`, PFResult-enveloped CLI) is sound. However, the cross-file trace against the actual codebase reveals **five BLOCKER defects** — a non-existent import, a missing spec-critical hash check, two crash-on-legacy-format scenarios, and a Windows-path URI bug. Five WARNING-level issues include an unimplemented `--force` flag, a behavioral divergence from spec for ambiguous queries, a missing `recommended_action` field, near-zero test coverage for business logic, and an inconsistent CLI dispatch pattern. + +--- + +## Critical Issues + +### CR-01: Import of non-existent `make_result` in `memory.py` + +**File:** Plan Task 6, Step 1 (`paperforge/commands/memory.py` line 4) +**Issue:** The plan code imports `make_result` from `paperforge.core.result`: +```python +from paperforge.core.result import PFError, PFResult, make_result +``` +`make_result` is **not defined anywhere** in the codebase. Verified by grep of the entire `paperforge/` tree — zero matches. `core/result.py` (lines 1-79) exports only `PFError` and `PFResult`. This would cause `ImportError` at runtime on every invocation of `paperforge memory`. + +**Fix:** +```python +# Remove make_result from the import line — it's never used in the function body either. 
+from paperforge.core.result import PFError, PFResult +``` + +--- + +### CR-02: `get_memory_status` does not check `canonical_index_hash` + +**File:** Plan Task 5, Step 1 (`paperforge/memory/query.py`, `get_memory_status()`) +**Issue:** The spec (Design Spec lines 221-226) explicitly requires `memory status` to verify `canonical_index_hash` against the SHA-256 of the current `formal-library.json`: +> - `canonical_index_hash` matches computed hash of current `formal-library.json` → `fresh: bool` + +The plan's implementation (lines 678-717) computes `fresh` as only: +```python +result["fresh"] = result["schema_ok"] and result["count_match"] +``` +The `canonical_index_hash` stored in `meta` during build is never read back and never compared. The status command will report `fresh: true` even when the canonical index has changed since the last build — giving a falsely green "fresh" signal that causes stale paper-status results. + +**Fix:** In `get_memory_status()` after the read-only connection is opened, add: +```python +# Read stored hash from meta +stored_hash_row = conn.execute( + "SELECT value FROM meta WHERE key = 'canonical_index_hash'" +).fetchone() +stored_hash = stored_hash_row["value"] if stored_hash_row else "" + +# Recompute hash from current index +envelope = read_index(vault) +items = envelope.get("items", []) if isinstance(envelope, dict) else [] +from paperforge.memory.builder import _compute_hash +current_hash = _compute_hash(items) if items else "" + +result["hash_match"] = stored_hash == current_hash +result["fresh"] = result["schema_ok"] and result["count_match"] and result["hash_match"] +``` + +--- + +### CR-03: `build_from_index` crashes on legacy-format (bare list) index + +**File:** Plan Task 4, Step 1 (`paperforge/memory/builder.py`, line 467-471) +**Issue:** `read_index(vault)` in `asset_index.py` (line 160-176) can return a **bare list** (legacy pre-v1.6 format). 
The `build_from_index` function only checks for `None`: +```python +envelope = read_index(vault) +if envelope is None: + raise FileNotFoundError(...) +items = envelope.get("items", []) # <-- CRASH: list has no .get() +``` +If the vault has a legacy-format `formal-library.json` (not yet migrated by a sync run), `envelope` is a `list`, and `envelope.get(...)` raises `AttributeError`. The existing codebase has `is_legacy_format()` and `migrate_legacy_index()` in `asset_index.py` (lines 178-212) specifically for this case. + +**Fix:** Add legacy format detection after the `None` check: +```python +envelope = read_index(vault) +if envelope is None: + raise FileNotFoundError( + "Canonical index not found. Run paperforge sync --rebuild-index." + ) +from paperforge.worker.asset_index import is_legacy_format +if is_legacy_format(envelope): + raise FileNotFoundError( + "Canonical index is in legacy (bare-list) format. " + "Run paperforge sync --rebuild-index to migrate." + ) +items = envelope.get("items", []) +generated_at = envelope.get("generated_at", "") +``` + +--- + +### CR-04: `get_memory_status` crashes on legacy-format index + +**File:** Plan Task 5, Step 1 (`paperforge/memory/query.py`, line 708-713) +**Issue:** Same legacy-format crash as CR-03, but in the read path: +```python +envelope = read_index(vault) +if envelope: + result["paper_count_index"] = envelope.get("paper_count", 0) # CRASH on list +``` +A bare-list envelope causes `AttributeError`. + +**Fix:** Add the same `is_legacy_format` guard: +```python +envelope = read_index(vault) +if envelope and isinstance(envelope, dict): + result["paper_count_index"] = envelope.get("paper_count", 0) + ... 
+``` + +--- + +### CR-05: Windows-path URI incompatibility in `get_connection` read-only mode + +**File:** Plan Task 2, Step 2 (`paperforge/memory/db.py`, line 122-123) +**Issue:** +```python +uri = f"file:{db_path}?mode=ro" if read_only else str(db_path) +conn = sqlite3.connect(uri, uri=read_only) +``` +On Windows, `db_path` contains backslashes (e.g., `D:\Vault\System\PaperForge\indexes\paperforge.db`). The constructed URI `file:D:\Vault\...?mode=ro` is NOT a valid [RFC 8089 file URI](https://datatracker.ietf.org/doc/html/rfc8089). SQLite's URI parser requires either `file:///D:/...` (authority path) or `file:D:/...` (local path with forward slashes). With backslashes, `sqlite3.connect(..., uri=True)` may fail with `sqlite3.OperationalError: unable to open database file` or silently misinterpret the path. + +**Fix:** Normalize the path to use forward slashes before constructing the URI: +```python +def get_connection(db_path: Path, read_only: bool = False) -> sqlite3.Connection: + if read_only: + # Windows-safe: convert to forward slashes for SQLite URI parser + posix_path = str(db_path.resolve()).replace("\\", "/") + uri = f"file:{posix_path}?mode=ro" + else: + uri = str(db_path) + conn = sqlite3.connect(uri, uri=read_only) + conn.row_factory = sqlite3.Row + if not read_only: + conn.execute("PRAGMA journal_mode=WAL;") + conn.execute("PRAGMA foreign_keys=ON;") + return conn +``` + +--- + +## Warnings + +### WR-01: `--force` flag on `memory build` defined but never implemented + +**File:** Plan Task 6, Step 3 (cli.py parser) + Task 4 builder +**Issue:** The CLI parser adds `--force` to `memory build`: +```python +p_memory_build.add_argument("--force", action="store_true", help="Force rebuild") +``` +Neither `memory.run()` nor `build_from_index()` checks `args.force`. The builder always deletes all paper data and rebuilds (lines 486-488), making `--force` redundant for the current logic. 
However, a future optimization that caches unchanged entries would make `--force` meaningful. Either implement the flag or remove it — dead CLI interfaces degrade user experience and create maintenance debt. + +**Fix:** Either (a) remove the `--force` argument entirely from the parser, or (b) wire it through: +```python +# In builder.py: add force parameter +def build_from_index(vault: Path, force: bool = False) -> dict: + ... + if force: + drop_all_tables(conn) + ... +# In memory.py: +counts = build_from_index(vault, force=getattr(args, "force", False)) +``` + +--- + +### WR-02: Ambiguous query (>1 results) returns full status instead of candidate list only + +**File:** Plan Task 5 (`paperforge/memory/query.py`, `get_paper_status()`, lines 775-793) +**Issue:** The spec (Design Spec line 243) states: +> **>1 results:** Candidate list only (no full status details) + +The plan returns full status for the first match PLUS the candidate list: +```python +entry = entries[0] +assets = get_paper_assets(conn, entry["zotero_key"]) +entry["health"] = compute_health(entry) # Full status details computed +entry["candidates"] = entries if len(entries) > 1 else None +entry["assets"] = assets +return entry +``` +And the CLI output (paper_status.py lines 986-991) always prints title/year/lifecycle/next_step — even when multiple candidates exist. This violates the spec's "candidate list only" requirement for ambiguous queries. 
+ +**Fix:** When `len(entries) > 1`, return candidate summary only: +```python +if len(entries) > 1: + return { + "candidates": [ + { + "zotero_key": e.get("zotero_key", ""), + "title": e.get("title", ""), + "year": e.get("year", ""), + "doi": e.get("doi", ""), + "domain": e.get("domain", ""), + } + for e in entries + ], + "candidate_count": len(entries), + } +``` + +--- + +### WR-03: `recommended_action` field missing from paper-status output + +**File:** Plan Task 5 (`paperforge/memory/query.py`, `get_paper_status()`) + Task 6 (`paper_status.py`) +**Issue:** The spec (Design Spec lines 252-253) requires: +> `recommended_action`: e.g., `"/pf-deep ABCDEFG"` or `"paperforge sync"` or `"paperforge ocr"` + +The plan only returns `entry["next_step"]` (e.g., `"/pf-deep"`) in the output but never computes a concrete `recommended_action` like `"/pf-deep ABCDEFG"`. The spec implies this should be a ready-to-use command string with the paper key substituted in. + +**Fix:** In `get_paper_status()`, after computing health, add: +```python +step = entry.get("next_step", "") +zkey = entry.get("zotero_key", "") +action_map = { + "/pf-deep": f"/pf-deep {zkey}", + "ocr": f"paperforge ocr --key {zkey}", + "sync": "paperforge sync", + "repair": "paperforge repair", + "ready": "Ready — no action needed", +} +entry["recommended_action"] = action_map.get(step, step) +``` + +--- + +### WR-04: Core business logic functions have zero test coverage + +**File:** Plan Tasks 4-5 (test files) +**Issue:** The plan specifies 8 tests total: +- 4 schema tests (table creation/deletion/schema version) — good +- 3 builder tests — but ALL three test only `_compute_hash`, a 10-line helper. `build_from_index()` (~150 lines) has **zero tests**. +- 1 query test — only tests `get_memory_status()` with a nonexistent vault path. `lookup_paper()`, `get_paper_assets()`, `get_paper_status()`, and `_entry_from_row()` have **zero tests**. 
+ +Untested edge cases include: empty items list, schema version mismatch trigger, corrupt JSON in authors/collections, exact zotero_key lookup, DOI lookup, title substring search, no-results path, asset reconstruction with None values. + +**Fix:** Add at minimum: +- `test_build_from_index_empty_items()` — ensure handles empty index gracefully +- `test_build_from_index_schema_mismatch()` — verify drop+rebuild on version change +- `test_build_from_index_populates_correctly()` — build from a mock envelope, verify paper count/asset count +- `test_lookup_paper_by_key()` — exact zotero_key match +- `test_lookup_paper_by_doi()` — DOI lookup +- `test_lookup_paper_by_title_substring()` — LIKE match +- `test_lookup_paper_no_results()` — returns empty list +- `test_get_paper_status_returns_none_for_missing()` — paper not found +- `test_entry_from_row_handles_null_fields()` — None values don't crash + +--- + +### WR-05: CLI dispatch pattern inconsistent with existing codebase + +**File:** Plan Task 6, Step 3 (cli.py dispatch blocks) +**Issue:** The plan adds verbose-index carving logic in the dispatch blocks: +```python +if args.command == "memory": + argv = sys.argv.copy() + try: + idx = argv.index("memory") + args.verbose = "--verbose" in argv[idx:] or "-v" in argv[idx:] + except ValueError: + pass + from paperforge.commands import memory + return memory.run(args) +``` +No other command dispatch in `cli.py` (lines 407-533) uses this pattern. All 15 existing command dispatches simply import and call `run(args)`. The `--verbose` flag is already a top-level argument parsed by argparse (cli.py lines 132-136), and `configure_logging(verbose=...)` is called at line 402 BEFORE any dispatch. This carving code is redundant and adds 14 lines of unnecessary complexity per command. 
+ +**Fix:** Follow the existing pattern — just import and dispatch: +```python +if args.command == "memory": + from paperforge.commands import memory + return memory.run(args) + +if args.command == "paper-status": + from paperforge.commands import paper_status + return paper_status.run(args) +``` + +--- + +## Info + +### IN-01: Unused `compute_health` import in `builder.py` + +**File:** Plan Task 4, Step 1 (`paperforge/memory/builder.py`, line 414) +**Issue:** `compute_health` is imported but never called in `build_from_index()`. Per the spec (line 141), health dimensions are computed at query time only, so this import is conceptually correct to exclude. The dead import is harmless but clutters the import block. + +**Fix:** Remove `compute_health` from the builder import: +```python +from paperforge.worker.asset_state import ( + compute_lifecycle, + compute_maturity, + compute_next_step, +) +``` + +--- + +### IN-02: `_COMMAND_REGISTRY` entries not consumed by `cli.py` dispatch + +**File:** Plan Task 6, Step 4 (`paperforge/commands/__init__.py`) +**Issue:** The plan adds `"memory"` and `"paper-status"` to `_COMMAND_REGISTRY`, which powers `get_command_module()` for dynamic dispatch. However, `cli.py` uses hard-coded `if/elif` chains (not `get_command_module()`), so these registry entries are unused by the primary dispatch path. The entries are only consumed if some other code path calls `get_command_module("memory")`. + +**Fix:** Not critical for Phase 1, but either (a) use `get_command_module()` in cli.py dispatch to reduce duplication, or (b) document that the registry exists for future dynamic-dispatch migration. + +--- + +### IN-03: `_compute_hash` uses `.get()` instead of direct key access per spec + +**File:** Plan Task 4, Step 1 (`paperforge/memory/builder.py`, line 448-449) +**Issue:** The spec (line 202) explicitly says: +> `sorted(items, key=lambda e: e["zotero_key"])` + +The plan uses `e.get("zotero_key", "")` — a safe-access variant. 
This is arguably more robust (it won't crash on malformed entries), but the spec's direct-access was an intentional design choice to fail-loud on corrupt data rather than silently producing a different hash. Decide which contract you want. + +**Fix:** Either align with spec (remove `.get()` for loud failure) or update the spec to accept safe access. + +--- + +### IN-04: `_entry_from_row` uses fragile `.rstrip("_json")` + +**File:** Plan Task 5, Step 1 (`paperforge/memory/query.py`, line 729) +**Issue:** +```python +entry[key.rstrip("_json")] = json.loads(entry.pop(key)) +``` +`rstrip("_json")` removes any trailing characters in the set `{'_', 'j', 's', 'o', 'n'}`, not the literal substring `"_json"`. For `"authors_json"` this produces `"authors"` (correct), and for `"collections_json"` it produces `"collections"` (correct). But if future columns with names like `"version_json"` or `"annotation_json"` were added, this would produce `"versi"` or `"annotati"` — silently wrong. The fix is trivial and prevents future bugs. + +**Fix:** +```python +for key in ("authors_json", "collections_json"): + if key in entry and entry[key]: + try: + clean_key = key[:-5] # strip "_json" suffix (exactly 5 chars) + entry[clean_key] = json.loads(entry.pop(key)) + except json.JSONDecodeError: + pass +``` + +--- + +_Reviewed: 2026-05-12T18:30:00Z_ +_Reviewer: VT-OS/OPENCODE (gsd-code-reviewer)_ +_Depth: deep_ diff --git a/docs/superpowers/plans/2026-05-12-memory-layer.md b/docs/superpowers/plans/2026-05-12-memory-layer.md new file mode 100644 index 0000000..aa03cf5 --- /dev/null +++ b/docs/superpowers/plans/2026-05-12-memory-layer.md @@ -0,0 +1,1235 @@ +# Memory Layer Phase 1 — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) +> or superpowers:executing-plans to implement this plan task-by-task. +> Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** Add a SQLite-backed Memory Layer with `memory build`, `memory status`, and `paper-status` commands. + +**Architecture:** New `paperforge/memory/` package with connection, schema, builder, and query modules. +Commands follow the existing CLI pattern (parser registration + `commands/` module dispatch + PFResult envelope). + +**Tech Stack:** Python stdlib `sqlite3`, `hashlib`, existing `paperforge.core.result`, `paperforge.worker.asset_index`, `paperforge.worker.asset_state`. + +**Spec:** `docs/superpowers/specs/2026-05-12-memory-layer-design.md` + +--- + +## File Structure Map + +``` +Create: + paperforge/memory/__init__.py — package init, re-export key types + paperforge/memory/db.py — get_connection(), get_memory_db_path() + paperforge/memory/schema.py — CURRENT_SCHEMA_VERSION, CREATE TABLE SQL, drop/create tables + paperforge/memory/builder.py — build_from_index() — reads formal-library.json, populates SQLite + paperforge/memory/query.py — lookup_paper(), get_paper_status(), get_memory_status() + paperforge/commands/memory.py — CLI run() for "memory build" and "memory status" + paperforge/commands/paper_status.py — CLI run() for "paper-status" + + tests/unit/memory/__init__.py + tests/unit/memory/test_schema.py + tests/unit/memory/test_builder.py + tests/unit/memory/test_query.py + +Modify: + paperforge/config.py:330-339 — add "memory_db" path key + paperforge/cli.py:258-259 — register "memory" and "paper-status" subcommands + paperforge/commands/__init__.py:4-13 — add to _COMMAND_REGISTRY +``` + +--- + +### Task 1: Register `memory_db` path in config + +**Files:** +- Modify: `paperforge/config.py:330-339` + +- [ ] **Step 1: Add `memory_db` key to `paperforge_paths()` return dict** + +```python +# At paperforge/config.py, after line 338 ("index": ...): +"memory_db": paperforge / "indexes" / "paperforge.db", +``` + +- [ ] **Step 2: Verify** + +```bash +python -c "from paperforge.config import paperforge_paths; p=paperforge_paths(); 
print(p.get('memory_db'), p.get('index'))"
+```
+
+Expected: both paths point under `.../PaperForge/indexes/`.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add paperforge/config.py
+git commit -m "feat(config): add memory_db path key for Memory Layer"
+```
+
+---
+
+### Task 2: `paperforge/memory/__init__.py` and `db.py`
+
+**Files:**
+- Create: `paperforge/memory/__init__.py`
+- Create: `paperforge/memory/db.py`
+- Test: `tests/unit/memory/test_schema.py` (write later)
+
+- [ ] **Step 1: Write `__init__.py`**
+
+Note: re-export only the `db` helpers here. `ensure_schema` / `drop_all_tables` live in
+`schema.py`, which is not created until Task 3 — importing them now would make the
+Step 3 import check (and the Task 2 commit) fail with ImportError. Add those
+re-exports when Task 3 creates `schema.py`.
+
+```python
+from __future__ import annotations
+
+from paperforge.memory.db import get_connection, get_memory_db_path
+
+__all__ = [
+    "get_connection",
+    "get_memory_db_path",
+]
+```
+
+- [ ] **Step 2: Write `db.py`**
+
+```python
+from __future__ import annotations
+
+import sqlite3
+from pathlib import Path
+
+from paperforge.config import paperforge_paths
+
+
+def get_memory_db_path(vault: Path) -> Path:
+    """Return the absolute path to paperforge.db."""
+    paths = paperforge_paths(vault)
+    db_path = paths.get("memory_db")
+    if not db_path:
+        raise FileNotFoundError("memory_db path not configured")
+    return db_path
+
+
+def get_connection(db_path: Path, read_only: bool = False) -> sqlite3.Connection:
+    """Open a SQLite connection to paperforge.db with WAL mode.
+
+    Args:
+        db_path: Path to paperforge.db.
+        read_only: If True, open in read-only mode (for queries).
+ """ + if read_only: + uri = "file:" + db_path.as_posix() + "?mode=ro" + conn = sqlite3.connect(uri, uri=True) + else: + db_path.parent.mkdir(parents=True, exist_ok=True) + conn = sqlite3.connect(str(db_path)) + conn.row_factory = sqlite3.Row + if not read_only: + conn.execute("PRAGMA journal_mode=WAL;") + conn.execute("PRAGMA foreign_keys=ON;") + return conn +``` + +- [ ] **Step 3: Run a manual import check** + +```bash +python -c "from paperforge.memory import get_connection, get_memory_db_path; print('OK')" +``` + +- [ ] **Step 4: Commit** + +```bash +git add paperforge/memory/__init__.py paperforge/memory/db.py +git commit -m "feat(memory): add db.py with connection and path resolution" +``` + +--- + +### Task 3: `paperforge/memory/schema.py` + +**Files:** +- Create: `paperforge/memory/schema.py` +- Create: `tests/unit/memory/__init__.py` +- Create: `tests/unit/memory/test_schema.py` + +- [ ] **Step 1: Write `schema.py` with SQL definitions** + +```python +from __future__ import annotations + +import sqlite3 + +CURRENT_SCHEMA_VERSION = 1 + +CREATE_META = """ +CREATE TABLE IF NOT EXISTS meta ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL +); +""" + +CREATE_PAPERS = """ +CREATE TABLE IF NOT EXISTS papers ( + zotero_key TEXT PRIMARY KEY, + citation_key TEXT NOT NULL DEFAULT '', + title TEXT NOT NULL, + year TEXT, + doi TEXT, + pmid TEXT, + journal TEXT, + first_author TEXT, + authors_json TEXT, + abstract TEXT, + domain TEXT, + collection_path TEXT, + collections_json TEXT, + has_pdf INTEGER NOT NULL DEFAULT 0, + do_ocr INTEGER, + analyze INTEGER, + ocr_status TEXT, + deep_reading_status TEXT, + ocr_job_id TEXT, + impact_factor REAL, + lifecycle TEXT, + maturity_level INTEGER, + maturity_name TEXT, + next_step TEXT, + pdf_path TEXT, + note_path TEXT, + main_note_path TEXT, + paper_root TEXT, + fulltext_path TEXT, + ocr_md_path TEXT, + ocr_json_path TEXT, + ai_path TEXT, + deep_reading_md_path TEXT, + updated_at TEXT +); +""" + +CREATE_ASSETS = """ +CREATE TABLE 
IF NOT EXISTS paper_assets ( + paper_id TEXT NOT NULL, + asset_type TEXT NOT NULL, + path TEXT NOT NULL, + exists_on_disk INTEGER NOT NULL DEFAULT 0, + PRIMARY KEY (paper_id, asset_type), + FOREIGN KEY (paper_id) REFERENCES papers(zotero_key) +); +""" + +CREATE_ALIASES = """ +CREATE TABLE IF NOT EXISTS paper_aliases ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + paper_id TEXT NOT NULL, + alias TEXT NOT NULL, + alias_norm TEXT NOT NULL, + alias_type TEXT NOT NULL, + FOREIGN KEY (paper_id) REFERENCES papers(zotero_key) +); +""" + +INDEX_SQL = [ + "CREATE INDEX IF NOT EXISTS idx_papers_doi ON papers(doi);", + "CREATE INDEX IF NOT EXISTS idx_papers_citation_key ON papers(citation_key);", + "CREATE INDEX IF NOT EXISTS idx_papers_domain ON papers(domain);", + "CREATE INDEX IF NOT EXISTS idx_papers_year ON papers(year);", + "CREATE INDEX IF NOT EXISTS idx_papers_ocr_status ON papers(ocr_status);", + "CREATE INDEX IF NOT EXISTS idx_papers_deep_status ON papers(deep_reading_status);", + "CREATE INDEX IF NOT EXISTS idx_papers_lifecycle ON papers(lifecycle);", + "CREATE INDEX IF NOT EXISTS idx_papers_next_step ON papers(next_step);", +] + +ALL_TABLES = ["papers", "paper_assets", "paper_aliases", "meta"] + + +def ensure_schema(conn: sqlite3.Connection) -> None: + """Create tables and indexes if they don't exist.""" + conn.execute(CREATE_META) + conn.execute(CREATE_PAPERS) + conn.execute(CREATE_ASSETS) + conn.execute(CREATE_ALIASES) + for idx_sql in INDEX_SQL: + conn.execute(idx_sql) + conn.commit() + + +def drop_all_tables(conn: sqlite3.Connection) -> None: + """Drop all Memory Layer tables (for rebuild).""" + for table in ALL_TABLES: + conn.execute(f"DROP TABLE IF EXISTS {table};") + conn.commit() + + +def get_schema_version(conn: sqlite3.Connection) -> int: + """Read the stored schema version from meta table, or 0 if not found.""" + try: + row = conn.execute( + "SELECT value FROM meta WHERE key = 'schema_version'" + ).fetchone() + return int(row["value"]) if row else 0 + except 
sqlite3.OperationalError: + return 0 +``` + +- [ ] **Step 2: Write the failing test `tests/unit/memory/test_schema.py`** + +```python +from __future__ import annotations + +import sqlite3 +import tempfile +from pathlib import Path + +from paperforge.memory.schema import ( + ALL_TABLES, + ensure_schema, + drop_all_tables, + get_schema_version, + CURRENT_SCHEMA_VERSION, +) +from paperforge.memory.db import get_connection + + +def test_ensure_schema_creates_all_tables(): + with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp: + db_path = Path(tmp.name) + try: + conn = get_connection(db_path) + ensure_schema(conn) + cursor = conn.execute( + "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name" + ) + tables = {row["name"] for row in cursor.fetchall()} + for table in ALL_TABLES: + assert table in tables, f"Missing table: {table}" + conn.close() + finally: + db_path.unlink(missing_ok=True) + + +def test_drop_all_tables_clears_all(): + with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp: + db_path = Path(tmp.name) + try: + conn = get_connection(db_path) + ensure_schema(conn) + drop_all_tables(conn) + cursor = conn.execute( + "SELECT name FROM sqlite_master WHERE type='table'" + ) + tables = {row["name"] for row in cursor.fetchall()} + assert tables == set() + conn.close() + finally: + db_path.unlink(missing_ok=True) + + +def test_get_schema_version_returns_zero_when_no_meta(): + with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp: + db_path = Path(tmp.name) + try: + conn = get_connection(db_path) + ensure_schema(conn) + assert get_schema_version(conn) == 0 + conn.close() + finally: + db_path.unlink(missing_ok=True) + + +def test_get_schema_version_returns_stored_value(): + with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp: + db_path = Path(tmp.name) + try: + conn = get_connection(db_path) + ensure_schema(conn) + conn.execute( + "INSERT INTO meta (key, value) VALUES ('schema_version', '1')" + ) + 
conn.commit() + assert get_schema_version(conn) == 1 + conn.close() + finally: + db_path.unlink(missing_ok=True) + + +def test_schema_version_mismatch_triggers_rebuild_semantics(): + """When stored version != CURRENT, get_schema_version returns a different int.""" + with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp: + db_path = Path(tmp.name) + try: + conn = get_connection(db_path) + ensure_schema(conn) + conn.execute( + "INSERT INTO meta (key, value) VALUES ('schema_version', '99')" + ) + conn.commit() + stored = get_schema_version(conn) + assert stored != CURRENT_SCHEMA_VERSION + conn.close() + finally: + db_path.unlink(missing_ok=True) +``` + +- [ ] **Step 3: Run tests and verify they pass** + +```bash +python -m pytest tests/unit/memory/test_schema.py -v +``` + +- [ ] **Step 4: Commit** + +```bash +git add paperforge/memory/schema.py tests/unit/memory/ +git commit -m "feat(memory): add schema module with table definitions and tests" +``` + +--- + +### Task 4: `paperforge/memory/builder.py` + +**Files:** +- Create: `paperforge/memory/builder.py` +- Create: `tests/unit/memory/test_builder.py` +- Modify: (none) + +- [ ] **Step 1: Write `builder.py`** + +```python +from __future__ import annotations + +import hashlib +import json +import logging +from datetime import datetime, timezone +from pathlib import Path + +from paperforge import __version__ as PF_VERSION +from paperforge.memory.db import get_connection, get_memory_db_path +from paperforge.memory.schema import ( + CURRENT_SCHEMA_VERSION, + ensure_schema, + drop_all_tables, + get_schema_version, +) +from paperforge.worker.asset_index import read_index +from paperforge.worker.asset_state import ( + compute_lifecycle, + compute_maturity, + compute_next_step, +) + +logger = logging.getLogger(__name__) + +PAPER_COLUMNS = [ + "zotero_key", "citation_key", "title", "year", "doi", "pmid", + "journal", "first_author", "authors_json", "abstract", "domain", + "collection_path", "collections_json", + 
"has_pdf", "do_ocr", "analyze", "ocr_status", "deep_reading_status", + "ocr_job_id", "impact_factor", + "lifecycle", "maturity_level", "maturity_name", "next_step", + "pdf_path", "note_path", "main_note_path", "paper_root", + "fulltext_path", "ocr_md_path", "ocr_json_path", "ai_path", + "deep_reading_md_path", "updated_at", +] + +ASSET_FIELDS = [ + ("pdf", "pdf_path"), + ("formal_note", "note_path"), + ("main_note", "main_note_path"), + ("ocr_fulltext", "fulltext_path"), + ("ocr_meta", "ocr_json_path"), + ("deep_reading", "main_note_path"), + ("ai_dir", "ai_path"), +] + +ALIAS_TYPES = ["zotero_key", "citation_key", "title", "doi"] + + +def compute_hash(items: list[dict]) -> str: + sorted_items = sorted(items, key=lambda e: e["zotero_key"]) + raw = json.dumps(sorted_items, sort_keys=True, ensure_ascii=False) + return hashlib.sha256(raw.encode("utf-8")).hexdigest() + + +def _resolve_vault_path(vault: Path, rel_path: str) -> Path: + if not rel_path: + return Path() + p = vault / rel_path + return p.resolve() if p.exists() else p + + +def build_from_index(vault: Path) -> dict: + """Read formal-library.json and build/rebuild paperforge.db. + + Returns a dict with counts for reporting. + """ + envelope = read_index(vault) + if envelope is None: + raise FileNotFoundError( + "Canonical index not found. Run paperforge sync --rebuild-index." 
+ ) + # Legacy format: bare list of entries (pre-envelope) + if isinstance(envelope, list): + items = envelope + generated_at = "" + else: + items = envelope.get("items", []) + generated_at = envelope.get("generated_at", "") + if isinstance(items, list) and items and isinstance(items[0], dict): + canonical_hash = compute_hash(items) + else: + canonical_hash = "" + + db_path = get_memory_db_path(vault) + conn = get_connection(db_path, read_only=False) + try: + stored_version = get_schema_version(conn) + if stored_version != CURRENT_SCHEMA_VERSION: + drop_all_tables(conn) + ensure_schema(conn) + + conn.execute("DELETE FROM paper_aliases;") + conn.execute("DELETE FROM paper_assets;") + conn.execute("DELETE FROM papers;") + + now_utc = datetime.now(timezone.utc).isoformat() + papers_count = 0 + assets_count = 0 + aliases_count = 0 + + for entry in items: + zotero_key = entry.get("zotero_key", "") + if not zotero_key: + continue + + lifecycle = str(compute_lifecycle(entry)) + maturity = compute_maturity(entry) + next_step = str(compute_next_step(entry)) + + paper_values = {} + for col in PAPER_COLUMNS: + if col == "authors_json": + paper_values[col] = json.dumps( + entry.get("authors", []), ensure_ascii=False + ) + elif col == "collections_json": + paper_values[col] = json.dumps( + entry.get("collections", []), ensure_ascii=False + ) + elif col == "lifecycle": + paper_values[col] = lifecycle + elif col == "maturity_level": + paper_values[col] = maturity.get("level", 1) + elif col == "maturity_name": + paper_values[col] = maturity.get("level_name", "") + elif col == "next_step": + paper_values[col] = next_step + elif col == "updated_at": + paper_values[col] = generated_at + elif col in ("do_ocr", "analyze"): + val = entry.get(col) + paper_values[col] = 1 if val else 0 + elif col == "has_pdf": + paper_values[col] = 1 if entry.get("has_pdf") else 0 + else: + paper_values[col] = entry.get(col, "") + + placeholders = ", ".join([f":{c}" for c in PAPER_COLUMNS]) + cols = ", 
".join(PAPER_COLUMNS) + conn.execute( + f"INSERT OR REPLACE INTO papers ({cols}) VALUES ({placeholders})", + paper_values, + ) + papers_count += 1 + + for asset_type, entry_field in ASSET_FIELDS: + path_val = entry.get(entry_field, "") + if not path_val: + continue + rel_path = str(path_val).replace("\\", "/") + abs_path = _resolve_vault_path(vault, rel_path) + exists = 1 if abs_path.exists() else 0 + + if asset_type == "deep_reading": + if abs_path.exists(): + try: + content = abs_path.read_text(encoding="utf-8") + exists = 1 if "## 🔍 精读" in content else 0 + except Exception: + exists = 0 + + conn.execute( + """INSERT OR REPLACE INTO paper_assets + (paper_id, asset_type, path, exists_on_disk) + VALUES (?, ?, ?, ?)""", + (zotero_key, asset_type, rel_path, exists), + ) + assets_count += 1 + + for alias_type in ALIAS_TYPES: + raw_val = entry.get(alias_type, "") + if not raw_val: + continue + raw_str = str(raw_val) + conn.execute( + """INSERT OR REPLACE INTO paper_aliases + (paper_id, alias, alias_norm, alias_type) + VALUES (?, ?, ?, ?)""", + ( + zotero_key, + raw_str, + raw_str.lower().strip(), + alias_type, + ), + ) + aliases_count += 1 + + meta_upserts = [ + ("schema_version", str(CURRENT_SCHEMA_VERSION)), + ("paperforge_version", PF_VERSION), + ("created_at", now_utc), + ("last_full_build_at", now_utc), + ("canonical_index_hash", canonical_hash), + ("canonical_index_generated_at", generated_at), + ] + for key, value in meta_upserts: + conn.execute( + """INSERT OR REPLACE INTO meta (key, value) VALUES (?, ?)""", + (key, value), + ) + + conn.commit() + + return { + "db_path": str(db_path), + "papers_indexed": papers_count, + "assets_indexed": assets_count, + "aliases_indexed": aliases_count, + "schema_version": str(CURRENT_SCHEMA_VERSION), + } + except Exception: + conn.rollback() + raise + finally: + conn.close() +``` + +- [ ] **Step 2: Write the test `tests/unit/memory/test_builder.py`** + +Note: This test needs an actual `formal-library.json` fixture. 
Use the existing test vault. + +```python +from __future__ import annotations + +import tempfile +from pathlib import Path +from unittest.mock import patch, MagicMock + +from paperforge.memory.builder import build_from_index, compute_hash + + +def test_compute_hash_deterministic(): + items1 = [{"zotero_key": "A"}, {"zotero_key": "B"}] + items2 = [{"zotero_key": "B"}, {"zotero_key": "A"}] + assert compute_hash(items1) == compute_hash(items2) + + +def test_compute_hash_different_for_different_data(): + items1 = [{"zotero_key": "A", "title": "X"}] + items2 = [{"zotero_key": "A", "title": "Y"}] + assert compute_hash(items1) != compute_hash(items2) + + +def test_compute_hash_handles_empty(): + assert compute_hash([]) == compute_hash([]) + assert len(compute_hash([])) == 64 # SHA-256 hex +``` + +- [ ] **Step 3: Run tests** + +```bash +python -m pytest tests/unit/memory/test_builder.py -v +``` + +- [ ] **Step 4: Commit** + +```bash +git add paperforge/memory/builder.py tests/unit/memory/test_builder.py +git commit -m "feat(memory): add builder module that populates SQLite from formal-library.json" +``` + +--- + +### Task 5: `paperforge/memory/query.py` + +**Files:** +- Create: `paperforge/memory/query.py` +- Create: `tests/unit/memory/test_query.py` + +- [ ] **Step 1: Write `query.py`** + +```python +from __future__ import annotations + +import json +import logging +from pathlib import Path + +from paperforge.memory.db import get_connection, get_memory_db_path +from paperforge.memory.schema import get_schema_version, CURRENT_SCHEMA_VERSION +from paperforge.memory.builder import compute_hash +from paperforge.worker.asset_state import compute_health +from paperforge.worker.asset_index import read_index + + +def get_memory_status(vault: Path) -> dict: + """Check paperforge.db health and staleness. + + Returns a dict with: db_exists, schema_ok, fresh, count_match, + paper_count_db, paper_count_index, needs_rebuild. 
+ """ + db_path = get_memory_db_path(vault) + result = { + "db_exists": db_path.exists(), + "schema_ok": False, + "fresh": False, + "count_match": False, + "paper_count_db": 0, + "paper_count_index": 0, + "needs_rebuild": True, + } + if not db_path.exists(): + return result + + conn = get_connection(db_path, read_only=True) + try: + stored_version = get_schema_version(conn) + result["schema_ok"] = stored_version == CURRENT_SCHEMA_VERSION + row = conn.execute("SELECT COUNT(*) as cnt FROM papers").fetchone() + result["paper_count_db"] = row["cnt"] if row else 0 + stored_hash_row = conn.execute( + "SELECT value FROM meta WHERE key = 'canonical_index_hash'" + ).fetchone() + stored_hash = stored_hash_row["value"] if stored_hash_row else "" + except Exception: + return result + finally: + conn.close() + + envelope = read_index(vault) + if envelope is not None: + # Handle legacy format (bare list) + if isinstance(envelope, list): + items = envelope + paper_count = len(items) + index_hash = compute_hash(items) + else: + items = envelope.get("items", []) + paper_count = envelope.get("paper_count", 0) + index_hash = compute_hash(items) + result["paper_count_index"] = paper_count + + # Compare stored hash with computed hash + result["hash_match"] = stored_hash == index_hash + + result["count_match"] = ( + result["paper_count_db"] == result["paper_count_index"] + ) + + result["fresh"] = ( + result["schema_ok"] + and result["count_match"] + and result.get("hash_match", False) + ) + result["needs_rebuild"] = not result["fresh"] + return result + + +def _entry_from_row(row) -> dict: + """Reconstruct an entry dict from a papers row (sqlite3.Row).""" + entry = {k: row[k] for k in row.keys()} + for key in ("has_pdf", "do_ocr", "analyze"): + if key in entry and entry[key] is not None: + entry[key] = bool(entry[key]) + for key in ("authors_json", "collections_json"): + if key in entry and entry[key]: + try: + entry[key[:-5]] = json.loads(entry[key]) + del entry[key] + except 
json.JSONDecodeError: + logging.warning( + "Corrupted JSON in column %s for paper %s", + key, entry.get("zotero_key", "?"), + ) + return entry + + +def lookup_paper(conn, query: str) -> list[dict]: + """Multi-strategy lookup. Returns list of matching paper dicts.""" + q = query.strip() + results = [] + + for lookup_col in ("zotero_key", "citation_key", "doi"): + row = conn.execute( + f"SELECT * FROM papers WHERE LOWER({lookup_col}) = LOWER(?)", + (q,), + ).fetchone() + if row: + return [_entry_from_row(row)] + + rows = conn.execute( + """SELECT * FROM papers + WHERE LOWER(title) LIKE '%' || LOWER(?) || '%' + LIMIT 20""", + (q,), + ).fetchall() + if rows: + return [_entry_from_row(r) for r in rows] + + rows = conn.execute( + """SELECT p.* FROM papers p + JOIN paper_aliases a ON a.paper_id = p.zotero_key + WHERE a.alias_norm LIKE '%' || LOWER(?) || '%' + LIMIT 20""", + (q,), + ).fetchall() + return [_entry_from_row(r) for r in rows] + + +def get_paper_assets(conn, zotero_key: str) -> list[dict]: + rows = conn.execute( + "SELECT asset_type, path, exists_on_disk FROM paper_assets WHERE paper_id = ?", + (zotero_key,), + ).fetchall() + return [dict(r) for r in rows] + + +def get_paper_status(vault: Path, query: str) -> dict | None: + """Full paper status lookup. Returns dict or None if not found. + + If multiple candidates found, returns a candidate list without full status. 
+ """ + db_path = get_memory_db_path(vault) + if not db_path.exists(): + return None + + conn = get_connection(db_path, read_only=True) + try: + entries = lookup_paper(conn, query) + if not entries: + return None + + # Multiple candidates → return candidate list only (no full status) + if len(entries) > 1: + return { + "resolved": False, + "candidates": [ + { + "zotero_key": e.get("zotero_key"), + "title": e.get("title"), + "year": e.get("year"), + "citation_key": e.get("citation_key"), + "lifecycle": e.get("lifecycle"), + } + for e in entries + ], + } + + entry = entries[0] + assets = get_paper_assets(conn, entry["zotero_key"]) + entry["health"] = compute_health(entry) + entry["assets"] = assets + entry["resolved"] = True + + next_step = entry.get("next_step", "") + zk = entry.get("zotero_key", "") + if next_step == "/pf-deep": + entry["recommended_action"] = f"/pf-deep {zk}" + elif next_step == "ocr": + entry["recommended_action"] = f"paperforge ocr --key {zk}" + elif next_step == "sync": + entry["recommended_action"] = "paperforge sync" + else: + entry["recommended_action"] = None + + return entry + finally: + conn.close() +``` + +- [ ] **Step 2: Write `tests/unit/memory/test_query.py`** + +```python +from __future__ import annotations + +from paperforge.memory.query import get_memory_status + + +def test_get_memory_status_returns_needs_rebuild_when_no_db(): + from pathlib import Path + result = get_memory_status(Path("/nonexistent/vault")) + assert result["db_exists"] is False + assert result["needs_rebuild"] is True +``` + +- [ ] **Step 3: Run tests** + +```bash +python -m pytest tests/unit/memory/test_query.py -v +``` + +- [ ] **Step 4: Commit** + +```bash +git add paperforge/memory/query.py tests/unit/memory/test_query.py +git commit -m "feat(memory): add query module for paper lookup and status check" +``` + +--- + +### Task 6: CLI commands — `memory.py` and `paper_status.py` + +**Files:** +- Create: `paperforge/commands/memory.py` +- Create: 
`paperforge/commands/paper_status.py` +- Modify: `paperforge/cli.py:258-259` (register parsers) +- Modify: `paperforge/commands/__init__.py:4-13` (register in command dispatch) + +- [ ] **Step 1: Write `paperforge/commands/memory.py`** + +```python +from __future__ import annotations + +import argparse +import sys + +from paperforge.core.errors import ErrorCode +from paperforge.core.result import PFError, PFResult +from paperforge.memory.builder import build_from_index +from paperforge.memory.query import get_memory_status +from paperforge import __version__ as PF_VERSION + + +def run(args: argparse.Namespace) -> int: + vault = args.vault_path + sub_cmd = args.memory_subcommand + + if sub_cmd == "build": + try: + counts = build_from_index(vault) + result = PFResult( + ok=True, + command="memory build", + version=PF_VERSION, + data=counts, + ) + except FileNotFoundError: + result = PFResult( + ok=False, + command="memory build", + version=PF_VERSION, + error=PFError( + code=ErrorCode.PATH_NOT_FOUND, + message="Canonical index not found. 
Run paperforge sync --rebuild-index.", + ), + next_actions=[ + { + "command": "paperforge sync --rebuild-index", + "reason": "Generate formal-library.json first", + } + ], + ) + except Exception as exc: + result = PFResult( + ok=False, + command="memory build", + version=PF_VERSION, + error=PFError( + code=ErrorCode.INTERNAL_ERROR, + message=str(exc), + ), + ) + if args.json: + print(result.to_json()) + else: + if result.ok: + print(f"Memory built: {result.data}") + else: + print(f"Error: {result.error.message}", file=sys.stderr) + return 0 if result.ok else 1 + + if sub_cmd == "status": + try: + status = get_memory_status(vault) + result = PFResult( + ok=True, + command="memory status", + version=PF_VERSION, + data=status, + ) + except Exception as exc: + result = PFResult( + ok=False, + command="memory status", + version=PF_VERSION, + error=PFError( + code=ErrorCode.INTERNAL_ERROR, + message=str(exc), + ), + ) + if args.json: + print(result.to_json()) + else: + if result.ok: + for k, v in status.items(): + print(f" {k}: {v}") + else: + print(f"Error: {result.error.message}", file=sys.stderr) + return 0 if result.ok else 1 + + print(f"Unknown memory subcommand: {sub_cmd}", file=sys.stderr) + return 1 +``` + +- [ ] **Step 2: Write `paperforge/commands/paper_status.py`** + +```python +from __future__ import annotations + +import argparse +import sys + +from paperforge.core.errors import ErrorCode +from paperforge.core.result import PFError, PFResult +from paperforge.memory.query import get_paper_status +from paperforge import __version__ as PF_VERSION + + +def run(args: argparse.Namespace) -> int: + vault = args.vault_path + query = args.query + + try: + status = get_paper_status(vault, query) + if status is None: + result = PFResult( + ok=False, + command="paper-status", + version=PF_VERSION, + error=PFError( + code=ErrorCode.PATH_NOT_FOUND, + message=f"No paper found for: {query}", + ), + next_actions=[ + { + "command": "paperforge search", + "reason": "Search for 
papers by keyword", + } + ], + ) + else: + result = PFResult( + ok=True, + command="paper-status", + version=PF_VERSION, + data=status, + ) + except Exception as exc: + result = PFResult( + ok=False, + command="paper-status", + version=PF_VERSION, + error=PFError( + code=ErrorCode.INTERNAL_ERROR, + message=str(exc), + ), + ) + + if args.json: + print(result.to_json()) + else: + if result.ok: + data = result.data + if data.get("resolved"): + print(f"Zotero Key: {data.get('zotero_key', '')}") + print(f"Title: {data.get('title', '')}") + print(f"Year: {data.get('year', '')}") + print(f"Lifecycle: {data.get('lifecycle', '')}") + print(f"Next Step: {data.get('next_step', '')}") + if data.get("candidates"): + print(f"\nMultiple candidates: {len(data['candidates'])}") + for c in data["candidates"]: + print(f" - {c['zotero_key']}: {c['title']} ({c['year']})") + else: + print(f"Error: {result.error.message}", file=sys.stderr) + + return 0 if result.ok else 1 +``` + +- [ ] **Step 3: Register in `cli.py`** + +In `paperforge/cli.py`, at `build_parser()` after line 259 (`p_dash`), add: + +```python + # Memory Layer commands + p_memory = sub.add_parser("memory", help="Manage the Memory Layer") + p_memory_sp = p_memory.add_subparsers(dest="memory_subcommand", required=True) + p_memory_build = p_memory_sp.add_parser("build", help="Build the memory database from canonical index") + p_memory_build.add_argument("--json", action="store_true", help="Output as JSON") + p_memory_status = p_memory_sp.add_parser("status", help="Check memory database status") + p_memory_status.add_argument("--json", action="store_true", help="Output as JSON") + + p_paper_status = sub.add_parser("paper-status", help="Look up a paper's status") + p_paper_status.add_argument("query", help="Paper identifier (zotero_key, DOI, title, alias)") + p_paper_status.add_argument("--json", action="store_true", help="Output as JSON") +``` + +In `main()`, after `if args.command == "dashboard": ...` (around line 468, find 
the command dispatch section), add: + +```python + if args.command == "memory": + from paperforge.commands.memory import run + return run(args) + + if args.command == "paper-status": + from paperforge.commands.paper_status import run + return run(args) +``` + +(Follow existing dispatch pattern — see how "dashboard" dispatches.) + +- [ ] **Step 4: Register in `commands/__init__.py`** + +In `paperforge/commands/__init__.py`, add to `_COMMAND_REGISTRY`: + +```python + "memory": "paperforge.commands.memory", + "paper-status": "paperforge.commands.paper_status", +``` + +- [ ] **Step 5: Verify CLI registration** + +```bash +paperforge --help +``` +Expected: `memory` and `paper-status` appear in subcommand list. + +```bash +paperforge memory --help +``` +Expected: shows `build` and `status` subcommands. + +```bash +paperforge memory status --help +``` + +- [ ] **Step 6: Commit** + +```bash +git add paperforge/commands/memory.py paperforge/commands/paper_status.py paperforge/cli.py paperforge/commands/__init__.py +git commit -m "feat(cli): add memory build/status and paper-status commands" +``` + +--- + +### Task 7: Integration test + +**Files:** +- Create: `tests/integration/test_memory_workflow.py` + +- [ ] **Step 1: Write integration test** + +```python +from __future__ import annotations + +import pytest +from pathlib import Path + + +@pytest.mark.integration +def test_memory_build_and_status_with_test_vault(test_vault: Path): + """End-to-end: sync → memory build → memory status → paper-status.""" + import subprocess + import json + + pf = ["python", "-m", "paperforge", "--vault", str(test_vault)] + + # 1. Sync to ensure formal-library.json exists + result = subprocess.run(pf + ["sync", "--json"], capture_output=True, text=True) + # If sync fails, skip (test vault may not have exports) + if result.returncode != 0: + pytest.skip("Sync failed — test vault may lack export files") + + # 2. 
Memory build
+    result = subprocess.run(pf + ["memory", "build", "--json"], capture_output=True, text=True)
+    assert result.returncode == 0
+    data = json.loads(result.stdout)
+    assert data["ok"] is True
+    assert data["data"]["papers_indexed"] > 0
+
+    # 3. Memory status
+    result = subprocess.run(pf + ["memory", "status", "--json"], capture_output=True, text=True)
+    assert result.returncode == 0
+    data = json.loads(result.stdout)
+    assert data["data"]["fresh"] is True
+    assert data["data"]["needs_rebuild"] is False
+```
+
+- [ ] **Step 2: Run integration test** (requires test vault)
+
+```bash
+python -m pytest tests/integration/test_memory_workflow.py -v -m integration
+```
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add tests/integration/test_memory_workflow.py
+git commit -m "test(memory): add integration test for memory build/status workflow"
+```
+
+---
+
+### Task 8: Final verification — run full test suite
+
+- [ ] **Step 1: Run all tests**
+
+```bash
+python -m pytest tests/unit/ tests/integration/ -q --tb=short
+```
+
+Expected: All tests pass, no regressions.
+
+- [ ] **Step 2: Run ruff lint**
+
+```bash
+ruff check paperforge/memory/ paperforge/commands/memory.py paperforge/commands/paper_status.py --fix && ruff format paperforge/memory/ paperforge/commands/memory.py paperforge/commands/paper_status.py
+```
+
+- [ ] **Step 3: Manual smoke test with real vault**
+
+```bash
+paperforge memory build --json
+paperforge memory status --json
+paperforge paper-status "aaronStimulationGrowthFactor2004" --json
+```
+
+Expected: Real data flows through, paper status shows lifecycle, next_step, assets.
+
+---
+
+## Summary
+
+| Task | Files Created | Files Modified | Tests |
+|------|--------------|----------------|-------|
+| 1. Config path | — | `config.py` | manual |
+| 2. db.py | `memory/__init__.py`, `memory/db.py` | — | manual |
+| 3. schema.py | `memory/schema.py` | — | `test_schema.py` (5 tests) |
+| 4. 
builder.py | `memory/builder.py` | — | `test_builder.py` (3 tests) | +| 5. query.py | `memory/query.py` | — | `test_query.py` (1 test) | +| 6. CLI | `commands/memory.py`, `commands/paper_status.py` | `cli.py`, `commands/__init__.py` | — | +| 7. Integration | `tests/integration/test_memory_workflow.py` | — | 1 test | +| 8. Verification | — | — | full suite + lint | diff --git a/docs/superpowers/specs/2026-05-12-memory-layer-design.md b/docs/superpowers/specs/2026-05-12-memory-layer-design.md new file mode 100644 index 0000000..afb80fc --- /dev/null +++ b/docs/superpowers/specs/2026-05-12-memory-layer-design.md @@ -0,0 +1,279 @@ +# Memory Layer — Design Spec + +> **Status:** Approved | **Date:** 2026-05-12 +> **Review:** Passed (v2 — 5 BLOCKER, 3 MAJOR, 6 MINOR resolved) + +## Goal + +Add a SQLite-backed Memory Layer to PaperForge as a derived, rebuildable global index that serves +dashboard, resolver, agent-context, and search commands. + +## Architecture + +``` +Zotero/BetterBibTeX → exports/*.json + ↓ +formal-library.json (Canonical Index — source of truth, already exists) + ↓ +paperforge.db (Memory Layer — derived, rebuildable SQLite index) + ↓ +paper-status / dashboard / agent-context / search / retrieve +``` + +**Core principle:** `paperforge.db` is a derived index, not the source of truth. +It can be safely deleted and rebuilt from `formal-library.json` at any time. + +## Phase 1 Scope + +**Tables:** `meta`, `papers`, `paper_assets`, `paper_aliases` + +**Commands:** +- `paperforge memory build --json` +- `paperforge memory status --json` +- `paperforge paper-status --json` + +**NOT in Phase 1:** FTS5, chunk retrieval, embedding, `paperforge.db → Markdown` writes, +agent-context, dashboard integration. + +## SQLite Location + +``` +/PaperForge/indexes/paperforge.db +``` +(same directory as `formal-library.json`) + +Register a new path key `"memory_db"` in `config.py:paperforge_paths()` pointing to +`paperforge / "indexes" / "paperforge.db"`. 
Do not reuse the existing `"index"` key. + +## Schema + +### Connection settings + +- `PRAGMA journal_mode=WAL;` — allow concurrent reads during rebuild +- `PRAGMA foreign_keys=ON;` + +### meta + +```sql +CREATE TABLE IF NOT EXISTS meta ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL +); +``` + +Stores: `schema_version` (integer), `paperforge_version`, `created_at`, `last_full_build_at`, +`canonical_index_hash`, `canonical_index_generated_at`. + +### Schema versioning strategy + +On `paperforge memory build`, if the stored `schema_version` in `meta` does not match the +current version, DROP all tables and rebuild from scratch. `paperforge.db` is a derived index +— full rebuild is always safe. This mirrors `formal-library.json`'s schema-version-check +pattern in `asset_index.py:475-480`. + +Initial schema version: `1`. + +### papers + +One row per paper. Columns directly map to `_build_entry()` entry dict fields. +`asset_state.py` pure functions (`compute_lifecycle`, `compute_health`, `compute_maturity`, +`compute_next_step`) are called at **build time** on each entry dict to populate derived columns. 
+ +```sql +CREATE TABLE IF NOT EXISTS papers ( + zotero_key TEXT PRIMARY KEY, + citation_key TEXT NOT NULL DEFAULT '', + title TEXT NOT NULL, + year TEXT, + doi TEXT, + pmid TEXT, + journal TEXT, + first_author TEXT, + authors_json TEXT, -- json.dumps(entry["authors"], ensure_ascii=False) + abstract TEXT, + domain TEXT, + collection_path TEXT, + collections_json TEXT, -- json.dumps(entry["collections"], ensure_ascii=False) + has_pdf INTEGER NOT NULL DEFAULT 0, + do_ocr INTEGER, + analyze INTEGER, + ocr_status TEXT, + deep_reading_status TEXT, + ocr_job_id TEXT, + impact_factor REAL, + lifecycle TEXT, -- compute_lifecycle(entry) → "indexed"|"pdf_ready"|"fulltext_ready"|"deep_read_done" + maturity_level INTEGER, -- compute_maturity(entry)["level"] → 1-4 + maturity_name TEXT, -- compute_maturity(entry)["level_name"] + next_step TEXT, -- compute_next_step(entry) → "sync"|"ocr"|"/pf-deep"|"ready" + pdf_path TEXT, + note_path TEXT, + main_note_path TEXT, + paper_root TEXT, + fulltext_path TEXT, + ocr_md_path TEXT, + ocr_json_path TEXT, + ai_path TEXT, + deep_reading_md_path TEXT, + updated_at TEXT -- envelope["generated_at"] from formal-library.json +); +``` + +Indexes: +```sql +CREATE INDEX IF NOT EXISTS idx_papers_zotero_key ON papers(zotero_key); +CREATE INDEX IF NOT EXISTS idx_papers_citation_key ON papers(citation_key); +CREATE INDEX IF NOT EXISTS idx_papers_doi ON papers(doi); +CREATE INDEX IF NOT EXISTS idx_papers_domain ON papers(domain); +CREATE INDEX IF NOT EXISTS idx_papers_year ON papers(year); +CREATE INDEX IF NOT EXISTS idx_papers_ocr_status ON papers(ocr_status); +CREATE INDEX IF NOT EXISTS idx_papers_deep_status ON papers(deep_reading_status); +CREATE INDEX IF NOT EXISTS idx_papers_lifecycle ON papers(lifecycle); +CREATE INDEX IF NOT EXISTS idx_papers_next_step ON papers(next_step); +``` + +**Important notes about column→entry mapping:** + +- `maturity_level` = `compute_maturity(entry)["level"]` (scalar 1-4, not the full dict) +- `updated_at` = the 
envelope's `generated_at` timestamp from `formal-library.json` (shared across all papers in a build) +- `lifecycle` values: `"indexed"`, `"pdf_ready"`, `"fulltext_ready"`, `"deep_read_done"` — these are NOT all members of the `Lifecycle` enum in `core/state.py` (which has `OCR_READY`, `ANALYZE_READY`, `ERROR_STATE` that are never produced). Use plain string comparison, not enum membership. +- `ai_context_ready` is a pre-seeded zero in `summarize_index()` (`asset_index.py:644`) but is never produced by `compute_lifecycle()`. Keep the zero bucket for Phase 3 compatibility but document it as reserved. + +**Health dimensions** (`pdf_health`, `ocr_health`, `note_health`, `asset_health`) are NOT stored in the papers table. They are computed at query time via `asset_state.compute_health(entry_dict)`. The `paper-status` command reconstructs the entry dict from SQLite columns, then calls `compute_health()` in-process. + +### paper_assets + +```sql +CREATE TABLE IF NOT EXISTS paper_assets ( + paper_id TEXT NOT NULL, + asset_type TEXT NOT NULL, + path TEXT NOT NULL, + exists_on_disk INTEGER NOT NULL DEFAULT 0, + PRIMARY KEY (paper_id, asset_type), + FOREIGN KEY (paper_id) REFERENCES papers(zotero_key) +); +``` + +Asset types and their source fields: + +| asset_type | source in entry dict | notes | +| -------------- | -------------------------- | --------------------------------------------------------- | +| `pdf` | `pdf_path` | wiki-link; check existence via filesystem | +| `formal_note` | `note_path` | relative vault path | +| `main_note` | `main_note_path` | workspace `{key}.md` | +| `ocr_fulltext` | `fulltext_path` | copied from `ocr/{key}/fulltext.md` | +| `ocr_meta` | derived from `ocr_json_path` | `ocr/{key}/meta.json` | +| `deep_reading` | `main_note_path` | checks for `## 🔍 精读` section within main note (NOT a separate file; `deep_reading_path` is deprecated and always empty) | +| `ai_dir` | `ai_path` | workspace `ai/` directory | + +### paper_aliases + +```sql +CREATE 
TABLE IF NOT EXISTS paper_aliases ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + paper_id TEXT NOT NULL, + alias TEXT NOT NULL, + alias_norm TEXT NOT NULL, + alias_type TEXT NOT NULL, + FOREIGN KEY (paper_id) REFERENCES papers(zotero_key) +); +``` + +Alias types (Phase 1): + +| alias_type | source | normalized to lowercase | +| ------------- | ----------------- | --------------------------- | +| `zotero_key` | `entry["zotero_key"]` | as-is (uppercase) | +| `citation_key` | `entry["citation_key"]` | as-is (case-sensitive) | +| `title` | `entry["title"]` | `.lower().strip()` | +| `doi` | `entry["doi"]` | `.lower().strip()` | + +## Commands + +### `paperforge memory build --json` + +1. Resolve vault path +2. Read `formal-library.json` (canonical index envelope) via `read_index(vault)` +3. If index is `None` or missing → return `PFResult(ok=False, error=PFError(code=PATH_NOT_FOUND, message="Canonical index not found. Run paperforge sync --rebuild-index."))` +4. Extract `items` list and envelope metadata +5. Create/open `paperforge.db` (WAL mode) +6. If stored `schema_version` != current → DROP all tables +7. Create tables if not exist +8. Upsert `meta` rows: `schema_version`, `paperforge_version`, `created_at`, `last_full_build_at`, `canonical_index_generated_at` +9. Compute `canonical_index_hash` = SHA-256 of `json.dumps(sorted(items, key=lambda e: e["zotero_key"]), sort_keys=True, ensure_ascii=False)`; store in `meta` +10. For each entry in `items`: + - Insert/upsert into `papers` + - Insert/upsert into `paper_assets` (check `exists_on_disk` via `Path.exists()`) + - Insert/upsert into `paper_aliases` +11. 
Return `PFResult(ok=True, data={...})` with `papers_indexed`, `assets_indexed`, `aliases_indexed` counts + +**PFResult.next_actions format** (must match `core/result.py:26` — `list[dict]`): +```json +{ + "next_actions": [ + {"command": "paperforge paper-status --json", "reason": "Look up a specific paper"} + ] +} +``` + +### `paperforge memory status --json` + +Check: +- `paperforge.db` exists → `db_exists: bool` +- `schema_version` matches current → `schema_ok: bool` +- `canonical_index_hash` matches computed hash of current `formal-library.json` → `fresh: bool` +- Paper count matches `envelope["paper_count"]` → `count_match: bool` +- Any check fails → `needs_rebuild: true` + +Return `PFResult(ok=True, data={...})`. + +### `paperforge paper-status <query> --json` + +**Resolution is short-circuit:** stop at the first step that returns ≥1 result. + +Resolution order: +1. Exact match on `zotero_key` (case-insensitive) +2. Exact match on `citation_key` (case-insensitive) +3. Exact match on `doi` (case-insensitive) +4. LIKE match on `title_norm` or normalized alias (`%<query>%`) +5. Fallback: search `paper_aliases.alias_norm` + +Behavior by result count: +- **0 results:** `PFResult(ok=False, error=NOT_FOUND, next_actions=[{"command": "paperforge search", ...}])` +- **1 result:** Full status with paper metadata, assets, lifecycle, next_step, recommended action +- **>1 results:** Candidate list only (no full status details) + +Full status response includes: +- Paper metadata (title, year, authors, doi, journal, domain, abstract) +- Asset status (exists_on_disk for each asset type) +- Lifecycle state +- Health dimensions (computed at query time via `compute_health()`) +- Maturity level and name +- `next_step` with recommended action +- `recommended_action`: e.g., `"/pf-deep ABCDEFG"` or `"paperforge sync"` or `"paperforge ocr"` + +## Integration Points + +### With sync +After `paperforge sync` completes, optionally refresh memory for changed keys. +Not automatic in Phase 1. 
+ +### With dashboard +Dashboard should prefer `paperforge.db` for stats, fallback to file scanning. +This integration is deferred to Phase 2. + +### With agent +Agent skill bootstrap runs `paperforge agent-context --compact --json` first. +This command is deferred to Phase 3. + +## Constraints + +1. `paperforge.db` is a derived index — deletable, rebuildable +2. No SQLite → Markdown writes in Phase 1 +3. Reuse `asset_state.py` pure functions (compute_lifecycle, compute_health, compute_maturity, compute_next_step) +4. Health dimensions are computed at query time via `compute_health()`, not stored in SQLite +5. All `--json` output uses PFResult envelope (respecting `next_actions: list[dict]` contract) +6. SQLite connection uses WAL mode for concurrent reads +7. No external database services — only Python stdlib `sqlite3` +8. No PDF/image binary storage +9. No embedding or vector DB +10. Schema version mismatch → full drop-and-rebuild (derived index, always safe) diff --git a/manifest.json b/manifest.json index b535c32..af16719 100644 --- a/manifest.json +++ b/manifest.json @@ -1,7 +1,7 @@ { "id": "paperforge", "name": "PaperForge", - "version": "1.5.5", + "version": "1.5.6rc1", "minAppVersion": "1.9.0", "description": "PaperForge — Zotero literature pipeline. Sync PDFs, run OCR, and read with AI-assisted deep reading.", "author": "Lin Zhaoxuan", diff --git a/paperforge/__init__.py b/paperforge/__init__.py index 8f670b8..85369c3 100644 --- a/paperforge/__init__.py +++ b/paperforge/__init__.py @@ -1,3 +1,3 @@ """paperforge — PaperForge package.""" -__version__ = "1.5.5" +__version__ = "1.5.6rc1" diff --git a/paperforge/plugin/manifest.json b/paperforge/plugin/manifest.json index b535c32..af16719 100644 --- a/paperforge/plugin/manifest.json +++ b/paperforge/plugin/manifest.json @@ -1,7 +1,7 @@ { "id": "paperforge", "name": "PaperForge", - "version": "1.5.5", + "version": "1.5.6rc1", "minAppVersion": "1.9.0", "description": "PaperForge — Zotero literature pipeline. 
Sync PDFs, run OCR, and read with AI-assisted deep reading.", "author": "Lin Zhaoxuan", From 6c6b8ce7a1c2ea38aae62e9a83563a06c524648a Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 18:07:44 +0800 Subject: [PATCH 011/132] feat(frontmatter): add citation_key field and alias from BBT citationKey --- .../orthopedic_article.yaml | 1 + paperforge/adapters/bbt.py | 20 +++++++++++++++++++ paperforge/schema/field_registry.yaml | 7 +++++++ paperforge/worker/asset_index.py | 3 ++- paperforge/worker/sync.py | 3 ++- tests/unit/schema/test_field_registry.py | 4 ++-- 6 files changed, 34 insertions(+), 4 deletions(-) diff --git a/fixtures/snapshots/formal_note_frontmatter/orthopedic_article.yaml b/fixtures/snapshots/formal_note_frontmatter/orthopedic_article.yaml index 503aa72..f44f7d1 100644 --- a/fixtures/snapshots/formal_note_frontmatter/orthopedic_article.yaml +++ b/fixtures/snapshots/formal_note_frontmatter/orthopedic_article.yaml @@ -1,4 +1,5 @@ zotero_key: FIXT0001 +citation_key: FIXT0001 domain: orthopedic title: "Biomechanical Comparison of Suture Anchor Fixations in Rotator Cuff Repair" year: "2024" diff --git a/paperforge/adapters/bbt.py b/paperforge/adapters/bbt.py index 7aac2d9..155d3c8 100644 --- a/paperforge/adapters/bbt.py +++ b/paperforge/adapters/bbt.py @@ -154,6 +154,25 @@ def resolve_item_collection_paths(item: dict, collection_lookup: dict) -> list[s return sorted({path for path in paths if path}, key=lambda value: (-value.count("/"), value)) +def extract_citation_key(item: dict) -> str: + """Extract the Better BibTeX citation key from a BBT JSON item. + + BBT stores the generated citation key as a top-level ``citationKey`` field, + e.g. ``aaronStimulationGrowthFactor2004``. Falls back to the Extra field. 
+ """ + ck = item.get("citationKey", "") + if ck: + return ck + extra = item.get("extra", "") + if not extra: + return "" + for line in extra.splitlines(): + stripped = line.strip() + if stripped.lower().startswith("citation key:"): + return stripped.split(":", 1)[1].strip() + return "" + + def load_export_rows(path: Path) -> list[dict]: data = read_json(path) if isinstance(data, list): @@ -199,6 +218,7 @@ def load_export_rows(path: Path) -> list[dict]: "creators": item.get("creators", []), "abstract": item.get("abstractNote", ""), "journal": item.get("publicationTitle", ""), + "citation_key": extract_citation_key(item), "extra": item.get("extra", ""), "year": extract_year(item.get("date", "")), "date": item.get("date", ""), diff --git a/paperforge/schema/field_registry.yaml b/paperforge/schema/field_registry.yaml index cb664dc..119dc3b 100644 --- a/paperforge/schema/field_registry.yaml +++ b/paperforge/schema/field_registry.yaml @@ -6,6 +6,13 @@ frontmatter: description: "Zotero citation key" owner: sync introduced_in: "1.0" + citation_key: + type: str + required: false + public: true + description: "Better BibTeX citation key (e.g. 
aaronStimulationGrowthFactor2004)" + owner: sync + introduced_in: "1.5" domain: type: str required: true diff --git a/paperforge/worker/asset_index.py b/paperforge/worker/asset_index.py index 437284a..ed9e751 100644 --- a/paperforge/worker/asset_index.py +++ b/paperforge/worker/asset_index.py @@ -289,7 +289,7 @@ def _build_entry(item: dict, vault: Path, paths: dict, domain: str, zotero_dir: try: text = main_note_path.read_text(encoding="utf-8") if "aliases:" not in text[: text.find("\n---", 4)]: - alias_line = f"aliases: [{yaml_quote(item.get('title', ''))}]\n" + alias_line = f"aliases: [{yaml_quote(item.get('title', ''))}, {yaml_quote(item.get('citation_key') or item.get('key', ''))}]\n" text = re.sub( r'(^title:.*\n)', r'\1' + alias_line, @@ -352,6 +352,7 @@ def _read_fm_str(fp: Path, key: str) -> str: entry = { "zotero_key": key, + "citation_key": item.get("citation_key", ""), "domain": domain, "title": item["title"], "authors": authors, diff --git a/paperforge/worker/sync.py b/paperforge/worker/sync.py index 0bae498..e996a05 100644 --- a/paperforge/worker/sync.py +++ b/paperforge/worker/sync.py @@ -1019,11 +1019,12 @@ def frontmatter_note(entry: dict, existing_text: str = "") -> str: lines = [ "---", f"title: {yaml_quote(entry.get('title', ''))}", - f"aliases: [{yaml_quote(entry.get('title', ''))}]", + f"aliases: [{yaml_quote(entry.get('title', ''))}, {yaml_quote(entry.get('citation_key', ''))}]", f"year: {entry.get('year', '')}", f"journal: {yaml_quote(entry.get('journal', ''))}", f"first_author: {yaml_quote(first_author)}", f"zotero_key: {yaml_quote(entry.get('zotero_key', ''))}", + f"citation_key: {yaml_quote(entry.get('citation_key', ''))}", f"domain: {yaml_quote(entry.get('domain', ''))}", f"doi: {yaml_quote(entry.get('doi', ''))}", f"pmid: {yaml_quote(entry.get('pmid', ''))}", diff --git a/tests/unit/schema/test_field_registry.py b/tests/unit/schema/test_field_registry.py index a7fa5bc..f724152 100644 --- a/tests/unit/schema/test_field_registry.py +++ 
b/tests/unit/schema/test_field_registry.py @@ -50,8 +50,8 @@ def test_all_expected_fields_present(self) -> None: reg = load_field_registry(REGISTRY_PATH) fm = get_owner_fields(reg, "frontmatter") expected = { - "zotero_key", "domain", "title", "year", "doi", - "collection_path", "has_pdf", "pdf_path", "supplementary", + "zotero_key", "citation_key", "domain", "title", "year", "doi", + "collection_path", "collection_tags", "has_pdf", "pdf_path", "supplementary", "fulltext_md_path", "recommend_analyze", "analyze", "do_ocr", "ocr_status", "deep_reading_status", "path_error", } From 5673f682a2ee64554ca77fd82eff102627fea176 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 18:33:33 +0800 Subject: [PATCH 012/132] fix(memory): restore row.keys() iteration (sqlite3.Row iterates values, not keys) --- paperforge/memory/query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paperforge/memory/query.py b/paperforge/memory/query.py index 365360e..5e0df91 100644 --- a/paperforge/memory/query.py +++ b/paperforge/memory/query.py @@ -79,7 +79,7 @@ def get_memory_status(vault: Path) -> dict: def _entry_from_row(row) -> dict: """Reconstruct an entry dict from a papers row (sqlite3.Row).""" - entry = {k: row[k] for k in row} + entry = {k: row[k] for k in row.keys()} for key in ("has_pdf", "do_ocr", "analyze"): if key in entry and entry[key] is not None: entry[key] = bool(entry[key]) From 1acbeaae23ad206fcaa703104445e8a86c29dafa Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 18:58:25 +0800 Subject: [PATCH 013/132] feat(memory): add FTS5 full-text search with paperforge search command --- paperforge/cli.py | 17 ++++++++ paperforge/commands/search.py | 81 +++++++++++++++++++++++++++++++++++ paperforge/memory/builder.py | 16 +++++++ paperforge/memory/fts.py | 55 ++++++++++++++++++++++++ paperforge/memory/schema.py | 45 ++++++++++++++++++- 5 files changed, 213 insertions(+), 1 deletion(-) create mode 100644 
paperforge/commands/search.py create mode 100644 paperforge/memory/fts.py diff --git a/paperforge/cli.py b/paperforge/cli.py index b56449c..4e9ee03 100644 --- a/paperforge/cli.py +++ b/paperforge/cli.py @@ -270,6 +270,18 @@ def build_parser() -> argparse.ArgumentParser: p_paper_status.add_argument("query", help="Paper identifier (zotero_key, DOI, title, alias)") p_paper_status.add_argument("--json", action="store_true", help="Output as JSON") + p_search = sub.add_parser("search", help="Full-text search across the library") + p_search.add_argument("query", help="Search query (supports FTS5 syntax)") + p_search.add_argument("--json", action="store_true", help="Output as JSON") + p_search.add_argument("--limit", type=int, default=20, help="Max results") + p_search.add_argument("--domain", help="Filter by domain") + p_search.add_argument("--year-from", type=int, help="Filter by year (inclusive)") + p_search.add_argument("--year-to", type=int, help="Filter by year (inclusive)") + p_search.add_argument("--ocr", choices=["done","pending","failed","processing"], help="Filter by OCR status") + p_search.add_argument("--deep", choices=["done","pending"], help="Filter by deep reading status") + p_search.add_argument("--lifecycle", choices=["indexed","pdf_ready","fulltext_ready","deep_read_done"], help="Filter by lifecycle") + p_search.add_argument("--next-step", choices=["sync","ocr","/pf-deep","ready"], help="Filter by next step") + # base-refresh p_base = sub.add_parser("base-refresh", help="Refresh Obsidian Base view files") p_base.add_argument( @@ -492,6 +504,11 @@ def main(argv: list[str] | None = None) -> int: return run(args) + if args.command == "search": + from paperforge.commands.search import run + + return run(args) + if args.command == "base-refresh": force = getattr(args, "force", False) paths = args.paths diff --git a/paperforge/commands/search.py b/paperforge/commands/search.py new file mode 100644 index 0000000..0de275a --- /dev/null +++ 
b/paperforge/commands/search.py @@ -0,0 +1,81 @@ +from __future__ import annotations + +import argparse +import sys + +from paperforge.core.errors import ErrorCode +from paperforge.core.result import PFError, PFResult +from paperforge.memory.db import get_connection, get_memory_db_path +from paperforge.memory.fts import search_papers +from paperforge import __version__ as PF_VERSION + + +def run(args: argparse.Namespace) -> int: + vault = args.vault_path + query = args.query + + db_path = get_memory_db_path(vault) + if not db_path.exists(): + result = PFResult( + ok=False, + command="search", + version=PF_VERSION, + error=PFError( + code=ErrorCode.PATH_NOT_FOUND, + message="Memory database not found. Run paperforge memory build.", + ), + ) + if args.json: + print(result.to_json()) + else: + print(f"Error: {result.error.message}", file=sys.stderr) + return 1 + + conn = get_connection(db_path, read_only=True) + try: + results = search_papers( + conn, query, + limit=args.limit, + domain=args.domain or "", + year_from=args.year_from or 0, + year_to=args.year_to or 0, + ocr_status=args.ocr or "", + deep_status=args.deep or "", + lifecycle=args.lifecycle or "", + next_step=args.next_step or "", + ) + data = { + "query": query, + "matches": results, + "count": len(results), + "filters_applied": { + "domain": args.domain, + "year_from": args.year_from, + "year_to": args.year_to, + "ocr": args.ocr, + "deep": args.deep, + "lifecycle": args.lifecycle, + "next_step": args.next_step, + }, + } + result = PFResult(ok=True, command="search", version=PF_VERSION, data=data) + except Exception as exc: + result = PFResult( + ok=False, command="search", version=PF_VERSION, + error=PFError(code=ErrorCode.INTERNAL_ERROR, message=str(exc)), + ) + finally: + conn.close() + + if args.json: + print(result.to_json()) + else: + if result.ok: + matches = result.data["matches"] + print(f"Found {len(matches)} results for: {query}") + for m in matches: + rank_val = m.get("rank", "") + print(f" 
[{m['lifecycle']:16}] {m['zotero_key']} | {m['year']} | {m['first_author']} | {m['title'][:60]}") + else: + print(f"Error: {result.error.message}", file=sys.stderr) + return 0 if result.ok else 1 diff --git a/paperforge/memory/builder.py b/paperforge/memory/builder.py index 84896c6..1c79596 100644 --- a/paperforge/memory/builder.py +++ b/paperforge/memory/builder.py @@ -3,6 +3,7 @@ import hashlib import json import logging +import sqlite3 from datetime import datetime, timezone from pathlib import Path @@ -10,6 +11,7 @@ from paperforge.memory.db import get_connection, get_memory_db_path from paperforge.memory.schema import ( CURRENT_SCHEMA_VERSION, + clear_fts, drop_all_tables, ensure_schema, get_schema_version, @@ -92,6 +94,8 @@ def build_from_index(vault: Path) -> dict: conn.execute("DELETE FROM paper_assets;") conn.execute("DELETE FROM papers;") + clear_fts(conn) + now_utc = datetime.now(timezone.utc).isoformat() papers_count = 0 assets_count = 0 @@ -142,6 +146,18 @@ def build_from_index(vault: Path) -> dict: ) papers_count += 1 + try: + conn.execute( + """INSERT INTO paper_fts(rowid, zotero_key, citation_key, title, first_author, authors, abstract, journal, domain, collection_path, collection_tags) + VALUES ((SELECT rowid FROM papers WHERE zotero_key = ?), ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", + (zotero_key, zotero_key, entry.get("citation_key", ""), entry.get("title", ""), + entry.get("first_author", ""), paper_values.get("authors_json", ""), + entry.get("abstract", ""), entry.get("journal", ""), entry.get("domain", ""), + entry.get("collection_path", ""), paper_values.get("collections_json", "")), + ) + except sqlite3.IntegrityError: + pass # duplicate rowid if FTS trigger already fired + for asset_type, entry_field in ASSET_FIELDS: path_val = entry.get(entry_field, "") if not path_val: diff --git a/paperforge/memory/fts.py b/paperforge/memory/fts.py new file mode 100644 index 0000000..11d2c6d --- /dev/null +++ b/paperforge/memory/fts.py @@ -0,0 +1,55 @@ +from 
__future__ import annotations + +import sqlite3 + + +def search_papers(conn: sqlite3.Connection, query: str, limit: int = 20, + domain: str = "", year_from: int = 0, year_to: int = 0, + ocr_status: str = "", deep_status: str = "", + lifecycle: str = "", next_step: str = "") -> list[dict]: + """Full-text search across papers with optional filters. + + Uses FTS5 for relevance-ranked results with optional column filters. + """ + conditions = ["paper_fts MATCH ?"] + params: list = [query] + + if domain: + conditions.append("p.domain = ?") + params.append(domain) + if year_from: + conditions.append("CAST(p.year AS INTEGER) >= ?") + params.append(year_from) + if year_to: + conditions.append("CAST(p.year AS INTEGER) <= ?") + params.append(year_to) + if ocr_status: + conditions.append("p.ocr_status = ?") + params.append(ocr_status) + if deep_status: + conditions.append("p.deep_reading_status = ?") + params.append(deep_status) + if lifecycle: + conditions.append("p.lifecycle = ?") + params.append(lifecycle) + if next_step: + conditions.append("p.next_step = ?") + params.append(next_step) + + where = " AND ".join(conditions) + sql = f""" + SELECT p.zotero_key, p.citation_key, p.title, p.year, p.doi, + p.first_author, p.journal, p.domain, p.lifecycle, + p.ocr_status, p.deep_reading_status, p.next_step, + p.abstract, + rank + FROM paper_fts f + JOIN papers p ON p.zotero_key = f.zotero_key + WHERE {where} + ORDER BY rank + LIMIT ? 
+ """ + params.append(limit) + conn.row_factory = sqlite3.Row + rows = conn.execute(sql, params).fetchall() + return [dict(r) for r in rows] diff --git a/paperforge/memory/schema.py b/paperforge/memory/schema.py index 8f41462..07f657b 100644 --- a/paperforge/memory/schema.py +++ b/paperforge/memory/schema.py @@ -83,7 +83,41 @@ "CREATE INDEX IF NOT EXISTS idx_papers_next_step ON papers(next_step);", ] -ALL_TABLES = ["papers", "paper_assets", "paper_aliases", "meta"] +CREATE_PAPER_FTS = """ +CREATE VIRTUAL TABLE IF NOT EXISTS paper_fts USING fts5( + zotero_key, + citation_key, + title, + first_author, + authors, + abstract, + journal, + domain, + collection_path, + collection_tags, + content='papers', + content_rowid='rowid' +); +""" + +FTS_TRIGGERS = [ + """CREATE TRIGGER IF NOT EXISTS papers_ai AFTER INSERT ON papers BEGIN + INSERT INTO paper_fts(rowid, zotero_key, citation_key, title, first_author, authors, abstract, journal, domain, collection_path, collection_tags) + VALUES (new.rowid, new.zotero_key, new.citation_key, new.title, new.first_author, new.authors_json, new.abstract, new.journal, new.domain, new.collection_path, new.collections_json); + END;""", + """CREATE TRIGGER IF NOT EXISTS papers_ad AFTER DELETE ON papers BEGIN + INSERT INTO paper_fts(paper_fts, rowid, zotero_key, citation_key, title, first_author, authors, abstract, journal, domain, collection_path, collection_tags) + VALUES ('delete', old.rowid, old.zotero_key, old.citation_key, old.title, old.first_author, old.authors_json, old.abstract, old.journal, old.domain, old.collection_path, old.collections_json); + END;""", + """CREATE TRIGGER IF NOT EXISTS papers_au AFTER UPDATE ON papers BEGIN + INSERT INTO paper_fts(paper_fts, rowid, zotero_key, citation_key, title, first_author, authors, abstract, journal, domain, collection_path, collection_tags) + VALUES ('delete', old.rowid, old.zotero_key, old.citation_key, old.title, old.first_author, old.authors_json, old.abstract, old.journal, old.domain, 
old.collection_path, old.collections_json); + INSERT INTO paper_fts(rowid, zotero_key, citation_key, title, first_author, authors, abstract, journal, domain, collection_path, collection_tags) + VALUES (new.rowid, new.zotero_key, new.citation_key, new.title, new.first_author, new.authors_json, new.abstract, new.journal, new.domain, new.collection_path, new.collections_json); + END;""", +] + +ALL_TABLES = ["paper_fts", "papers", "paper_assets", "paper_aliases", "meta"] def ensure_schema(conn: sqlite3.Connection) -> None: @@ -92,8 +126,11 @@ def ensure_schema(conn: sqlite3.Connection) -> None: conn.execute(CREATE_PAPERS) conn.execute(CREATE_ASSETS) conn.execute(CREATE_ALIASES) + conn.execute(CREATE_PAPER_FTS) for idx_sql in INDEX_SQL: conn.execute(idx_sql) + for trigger_sql in FTS_TRIGGERS: + conn.execute(trigger_sql) conn.commit() @@ -104,6 +141,12 @@ def drop_all_tables(conn: sqlite3.Connection) -> None: conn.commit() +def clear_fts(conn: sqlite3.Connection) -> None: + """Delete all FTS index entries (before rebuild).""" + conn.execute("DELETE FROM paper_fts;") + conn.commit() + + def get_schema_version(conn: sqlite3.Connection) -> int: """Read the stored schema version from meta table, or 0 if not found.""" try: From 4bb750b2dbbeefcdbd4ec8d149ad5ff696d2b0df Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 19:01:41 +0800 Subject: [PATCH 014/132] fix(memory): match FTS column names to papers table (authors_json, collections_json) --- paperforge/memory/builder.py | 2 +- paperforge/memory/fts.py | 5 +++-- paperforge/memory/schema.py | 12 ++++++------ 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/paperforge/memory/builder.py b/paperforge/memory/builder.py index 1c79596..2ac90a8 100644 --- a/paperforge/memory/builder.py +++ b/paperforge/memory/builder.py @@ -148,7 +148,7 @@ def build_from_index(vault: Path) -> dict: try: conn.execute( - """INSERT INTO paper_fts(rowid, zotero_key, citation_key, title, first_author, authors, 
abstract, journal, domain, collection_path, collection_tags) + """INSERT INTO paper_fts(rowid, zotero_key, citation_key, title, first_author, authors_json, abstract, journal, domain, collection_path, collections_json) VALUES ((SELECT rowid FROM papers WHERE zotero_key = ?), ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", (zotero_key, zotero_key, entry.get("citation_key", ""), entry.get("title", ""), entry.get("first_author", ""), paper_values.get("authors_json", ""), diff --git a/paperforge/memory/fts.py b/paperforge/memory/fts.py index 11d2c6d..7fa322e 100644 --- a/paperforge/memory/fts.py +++ b/paperforge/memory/fts.py @@ -37,14 +37,15 @@ def search_papers(conn: sqlite3.Connection, query: str, limit: int = 20, params.append(next_step) where = " AND ".join(conditions) + # Content-sync FTS: query the FTS table directly, columns come from papers sql = f""" SELECT p.zotero_key, p.citation_key, p.title, p.year, p.doi, p.first_author, p.journal, p.domain, p.lifecycle, p.ocr_status, p.deep_reading_status, p.next_step, - p.abstract, + substr(p.abstract, 1, 300) as abstract, rank FROM paper_fts f - JOIN papers p ON p.zotero_key = f.zotero_key + JOIN papers p ON p.rowid = f.rowid WHERE {where} ORDER BY rank LIMIT ? 
diff --git a/paperforge/memory/schema.py b/paperforge/memory/schema.py index 07f657b..8094549 100644 --- a/paperforge/memory/schema.py +++ b/paperforge/memory/schema.py @@ -89,12 +89,12 @@ citation_key, title, first_author, - authors, + authors_json, abstract, journal, domain, collection_path, - collection_tags, + collections_json, content='papers', content_rowid='rowid' ); @@ -102,17 +102,17 @@ FTS_TRIGGERS = [ """CREATE TRIGGER IF NOT EXISTS papers_ai AFTER INSERT ON papers BEGIN - INSERT INTO paper_fts(rowid, zotero_key, citation_key, title, first_author, authors, abstract, journal, domain, collection_path, collection_tags) + INSERT INTO paper_fts(rowid, zotero_key, citation_key, title, first_author, authors_json, abstract, journal, domain, collection_path, collections_json) VALUES (new.rowid, new.zotero_key, new.citation_key, new.title, new.first_author, new.authors_json, new.abstract, new.journal, new.domain, new.collection_path, new.collections_json); END;""", """CREATE TRIGGER IF NOT EXISTS papers_ad AFTER DELETE ON papers BEGIN - INSERT INTO paper_fts(paper_fts, rowid, zotero_key, citation_key, title, first_author, authors, abstract, journal, domain, collection_path, collection_tags) + INSERT INTO paper_fts(paper_fts, rowid, zotero_key, citation_key, title, first_author, authors_json, abstract, journal, domain, collection_path, collections_json) VALUES ('delete', old.rowid, old.zotero_key, old.citation_key, old.title, old.first_author, old.authors_json, old.abstract, old.journal, old.domain, old.collection_path, old.collections_json); END;""", """CREATE TRIGGER IF NOT EXISTS papers_au AFTER UPDATE ON papers BEGIN - INSERT INTO paper_fts(paper_fts, rowid, zotero_key, citation_key, title, first_author, authors, abstract, journal, domain, collection_path, collection_tags) + INSERT INTO paper_fts(paper_fts, rowid, zotero_key, citation_key, title, first_author, authors_json, abstract, journal, domain, collection_path, collections_json) VALUES ('delete', 
old.rowid, old.zotero_key, old.citation_key, old.title, old.first_author, old.authors_json, old.abstract, old.journal, old.domain, old.collection_path, old.collections_json); - INSERT INTO paper_fts(rowid, zotero_key, citation_key, title, first_author, authors, abstract, journal, domain, collection_path, collection_tags) + INSERT INTO paper_fts(rowid, zotero_key, citation_key, title, first_author, authors_json, abstract, journal, domain, collection_path, collections_json) VALUES (new.rowid, new.zotero_key, new.citation_key, new.title, new.first_author, new.authors_json, new.abstract, new.journal, new.domain, new.collection_path, new.collections_json); END;""", ] From eedf02c4622badb608d18322b7f517abbfefe139 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 19:44:38 +0800 Subject: [PATCH 015/132] docs(spec): add Memory Layer Phase 2-5 design (agent-context, dashboard, refresh, retrieve) --- ...2026-05-12-memory-layer-phase2-5-design.md | 360 ++++++++++++++++++ 1 file changed, 360 insertions(+) create mode 100644 docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md diff --git a/docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md b/docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md new file mode 100644 index 0000000..6587af3 --- /dev/null +++ b/docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md @@ -0,0 +1,360 @@ +# Memory Layer Phase 2-5 — Complete Spec + +> **Date:** 2026-05-12 | **Depends on:** Phase 1 (metadata DB) + FTS5 search + +## Overview + +Four remaining features to complete the Memory Layer, in priority order: + +| # | Feature | Purpose | +|---|---------|---------| +| 1 | **agent-context** | Agent 启动路由器:library 概览 + commands 清单 + collection 地图 + rules | +| 2 | **Dashboard SQLite** | 仪表盘从文件扫描切换到读 paperforge.db | +| 3 | **Incremental refresh** | sync/ocr/deep-finalize 后单篇刷新 memory,不重建全库 | +| 4 | **Chunk retrieve** | OCR 全文 + figure caption 片段检索,返回带引用的证据 paragraph | + +--- + +## 
Feature 1: agent-context
+
+### Design
+
+纯只读路由命令。Agent 拿到后知道:库里有什么、能调用什么、从哪开始。
+
+### Output structure
+
+```json
+{
+  "ok": true,
+  "command": "agent-context",
+  "version": "1.6.0",
+  "data": {
+    "paperforge": {
+      "version": "1.6.0",
+      "vault": "/path/to/vault",
+      "memory_db": "ready"
+    },
+    "library": {
+      "paper_count": 283,
+      "domain_counts": {"骨科": 120, "运动医学": 80, "其他": 83},
+      "lifecycle_counts": {"indexed": 2, "pdf_ready": 260, "fulltext_ready": 18, "deep_read_done": 3},
+      "ocr_counts": {"done": 21, "pending": 262},
+      "deep_reading_counts": {"done": 3, "pending": 280}
+    },
+    "collections": [
+      {"name": "骨科", "count": 120, "sub": ["骨折", "软骨", "韧带"]},
+      {"name": "运动医学", "count": 80}
+    ],
+    "commands": {
+      "paper-status": {
+        "usage": "paperforge paper-status <key> --json",
+        "purpose": "Look up one paper's full status and recommended next action"
+      },
+      "search": {
+        "usage": "paperforge search <query> --json [--collection NAME] [--domain NAME] [--ocr done|pending] [--year-from N] [--year-to N] [--limit N]",
+        "purpose": "Full-text search with optional collection/domain/lifecycle filters"
+      },
+      "retrieve": {
+        "usage": "paperforge retrieve <query> --json [--limit N]",
+        "purpose": "Search OCR fulltext chunks for evidence paragraphs (coming soon)"
+      },
+      "deep": {
+        "usage": "/pf-deep <key>",
+        "purpose": "Full three-pass deep reading with chart analysis"
+      },
+      "ocr": {
+        "usage": "/pf-ocr",
+        "purpose": "Run OCR on papers marked do_ocr:true"
+      },
+      "sync": {
+        "usage": "/pf-sync",
+        "purpose": "Sync Zotero and regenerate formal notes + index"
+      }
+    },
+    "rules": [
+      "Use paperforge.db via CLI commands before reading individual files.",
+      "Do not infer paper state from stale frontmatter when memory status is fresh.",
+      "Read source files only after resolving candidates via paper-status or search.",
+      "To locate a paper: start with collection scope if known, then expand to full library search."
+ ] + } +} +``` + +### Implementation + +**File:** `paperforge/memory/context.py` + +```python +def get_agent_context(vault: Path) -> dict: + """Build agent bootstrap context from paperforge.db.""" + conn = get_connection(get_memory_db_path(vault), read_only=True) + try: + # Library overview + total = conn.execute("SELECT COUNT(*) FROM papers").fetchone()[0] + domains = {r["domain"]: r["cnt"] for r in conn.execute( + "SELECT domain, COUNT(*) as cnt FROM papers GROUP BY domain ORDER BY cnt DESC" + ).fetchall()} + lifecycles = ... # same GROUP BY pattern + ocr = ... + deep = ... + + # Collection tree + collections = _build_collection_tree(conn) + + return {...} # full structure above + finally: + conn.close() + +def _build_collection_tree(conn) -> list[dict]: + """Build nested collection hierarchy from papers.collection_path.""" + rows = conn.execute( + "SELECT collection_path, COUNT(*) as cnt FROM papers " + "WHERE collection_path != '' GROUP BY collection_path ORDER BY cnt DESC" + ).fetchall() + # Parse pipe-separated paths into tree + # "骨科 | 骨折" -> nested under 骨科 +``` + +**File:** `paperforge/commands/agent_context.py` — CLI wrapper with `--json` flag. + +**CLI:** `paperforge agent-context --json` + +### Constraints +- Pure read-only on paperforge.db +- If DB missing: return error with message "Run paperforge memory build" +- All SQL queries wrapped in try/except with graceful error handling +- Output wrapped in PFResult dataclass (matches all other CLI commands) + +**Schema version:** `CURRENT_SCHEMA_VERSION` bumped to `2` when `paper_chunks` and `paper_chunk_fts` tables are added (Feature 4). On version mismatch, `memory build` performs full drop-and-rebuild as per existing strategy. + +--- + +## Feature 2: Dashboard SQLite Integration + +### Design + +`dashboard.py` currently scans all `.md` files with regex frontmatter parsing. Replace with SQLite queries. Keep fallback to file scanning if DB is missing or stale. 
+ +### Change + +**File:** `paperforge/commands/dashboard.py` + +The `_gather_dashboard_data()` function currently at lines 54-163 will be refactored: + +```python +def _gather_dashboard_data(vault: Path) -> dict: + db_path = get_memory_db_path(vault) + if db_path.exists(): + try: + return _dashboard_from_db(vault, db_path) + except Exception: + pass # fall through to file scanning + return _dashboard_from_files(vault) # existing logic, renamed +``` + +New function `_dashboard_from_db()`: +```python +def _dashboard_from_db(vault, db_path) -> dict: + conn = get_connection(db_path, read_only=True) + try: + total = conn.execute("SELECT COUNT(*) FROM papers").fetchone()[0] + + # PDF health + pdf_rows = conn.execute( + "SELECT lifecycle FROM papers" + ).fetchall() + pdf_healthy = sum(1 for r in pdf_rows if r["lifecycle"] != "indexed") + pdf_missing = total - pdf_healthy + + # OCR health + ocr_done = conn.execute( + "SELECT COUNT(*) FROM papers WHERE ocr_status='done'" + ).fetchone()[0] + ocr_pending = conn.execute( + "SELECT COUNT(*) FROM papers WHERE ocr_status NOT IN ('done','failed')" + ).fetchone()[0] + ocr_failed = conn.execute( + "SELECT COUNT(*) FROM papers WHERE ocr_status='failed'" + ).fetchone()[0] + + # Domain counts + domain_counts = {r["domain"]: r["cnt"] for r in conn.execute( + "SELECT domain, COUNT(*) as cnt FROM papers GROUP BY domain" + ).fetchall()} + + # Permissions (unchanged — still checks file existence) + permissions = _check_permissions(vault) + + return { + "stats": { + "papers": total, + "pdf_health": {"healthy": pdf_healthy, "missing": pdf_missing, "broken": 0}, + "ocr_health": {"pending": ocr_pending, "done": ocr_done, "failed": ocr_failed}, + "domain_counts": domain_counts, + "_source": "paperforge.db" + }, + "permissions": permissions + } + finally: + conn.close() +``` + +### Constraints +- Keep existing `_dashboard_from_files()` as fallback, rename from current `_gather_dashboard_data()` +- Dashboard output format must NOT change (plugin 
depends on it) +- Add `_source` field so plugin can display data freshness +- If DB is stale (`memory status` shows needs_rebuild), fall back to file scanning + +--- + +## Feature 3: Incremental Refresh + +### Design + +After `sync`, `ocr`, or `deep-finalize` modifies one paper, refresh only that paper's entries in SQLite instead of full `memory build`. + +### Implementation + +**File:** `paperforge/memory/refresh.py` + +```python +def refresh_paper(vault: Path, zotero_key: str) -> bool: + """Incrementally refresh one paper in paperforge.db from formal-library.json.""" + envelope = read_index(vault) + if not envelope: + return False + items = envelope if isinstance(envelope, list) else envelope.get("items", []) + + # Find the matching entry + entry = None + for e in items: + if e.get("zotero_key") == zotero_key: + entry = e + break + if not entry: + return False + + db_path = get_memory_db_path(vault) + conn = get_connection(db_path, read_only=False) + try: + # Upsert paper row (same logic as builder) + _upsert_paper(conn, entry, envelope.get("generated_at", "")) + # Replace assets for this key + conn.execute("DELETE FROM paper_assets WHERE paper_id=?", (zotero_key,)) + _insert_assets(conn, entry, vault) + # Replace aliases for this key + conn.execute("DELETE FROM paper_aliases WHERE paper_id=?", (zotero_key,)) + _insert_aliases(conn, entry) + conn.commit() + return True + except Exception: + conn.rollback() + raise + finally: + conn.close() +``` + +### Integration points + +Trigger `refresh_paper(vault, key)` after: +- `paperforge sync` — for each updated paper +- `paperforge ocr` — after OCR completes for a paper +- `paperforge deep-finalize ` — after marking deep reading done +- `paperforge repair --fix` — after repairing state + +### Constraints +- Reuse `_build_entry()` logic from builder.py (extract shared helpers) +- Only refresh if paperforge.db exists (no auto-build) +- If formal-library.json is stale (entry not found), skip silently +- Transactional: 
all-or-nothing per paper
+
+---
+
+## Feature 4: Chunk Retrieval
+
+### Design
+
+Split OCR fulltext into paragraph-level chunks, store in `paper_chunks` table, index with FTS5. Figure captions from `figure-map.json` included as a chunk source type.
+
+### Schema
+
+```sql
+CREATE TABLE IF NOT EXISTS paper_chunks (
+    chunk_id TEXT PRIMARY KEY,
+    paper_id TEXT NOT NULL,
+    source_type TEXT NOT NULL,  -- 'ocr_fulltext' | 'figure_caption' | 'abstract' | 'formal_note'
+    section_title TEXT,         -- e.g., "Methods", "Results", "Figure 3"
+    page_number INTEGER,
+    chunk_index INTEGER,
+    chunk_text TEXT NOT NULL,
+    token_estimate INTEGER,
+    content_hash TEXT,
+    FOREIGN KEY (paper_id) REFERENCES papers(zotero_key)
+);
+
+CREATE VIRTUAL TABLE IF NOT EXISTS paper_chunk_fts USING fts5(
+    chunk_id UNINDEXED,
+    paper_id UNINDEXED,
+    source_type,
+    section_title,
+    chunk_text,
+    content='paper_chunks',
+    content_rowid='rowid'
+);
+```
+
+### Chunking strategy
+
+- **OCR fulltext**: Split by page-break markers in the OCR output, then by double-newline paragraphs within each page. Max 500 tokens per chunk.
+- **Figure captions**: Read `figure-map.json` from `ocr/<zotero_key>/`, one chunk per figure entry.
+- **Abstract**: One chunk per paper (source_type='abstract').
+- **Formal note**: Optional — split `## 🔍 精读` sections into chunks.
+
+### Command
+
+```
+paperforge retrieve <query> --json [--limit N] [--source ocr_fulltext|figure_caption|all]
+```
+
+Output:
+```json
+{
+  "ok": true,
+  "command": "retrieve",
+  "data": {
+    "query": "PEMF dose response chondrocyte",
+    "chunks": [
+      {
+        "zotero_key": "ABC123",
+        "title": "...",
+        "source_type": "ocr_fulltext",
+        "section_title": "Results",
+        "page_number": 6,
+        "chunk_text": "At 24h post-stimulation, chondrocyte proliferation increased...",
+        "rank": -2.5
+      }
+    ]
+  }
+}
+```
+
+### Constraints
+- Chunks populated during `memory build` (full) or `memory refresh --key X` (incremental)
+- Only for papers with `ocr_status == "done"`
+- Figure-map.json must exist for figure caption chunks
+- Max 3 paragraphs per chunk; overlap = 0
+- `paper_chunks` and `paper_chunk_fts` added to `ALL_TABLES` and `ensure_schema()`
+- FTS content sync triggers added for `paper_chunks` ↔ `paper_chunk_fts`
+- `CURRENT_SCHEMA_VERSION` bumped to `2`
+
+---
+
+## Implementation Order
+
+1. **agent-context** — highest value for agent workflow
+2. **Dashboard integration** — unify data sources
+3. **Incremental refresh** — performance improvement
+4. **Chunk retrieval** — most complex, depends on OCR pipeline
+
+Each feature gets its own plan → execute cycle within this spec.
From 5f7f2cd317813f2ada9e85c68aaace4cff4ccec5 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 19:54:19 +0800 Subject: [PATCH 016/132] docs(plan): add agent-context implementation plan --- .../plans/2026-05-12-agent-context.md | 329 ++++++++++++++++++ 1 file changed, 329 insertions(+) create mode 100644 docs/superpowers/plans/2026-05-12-agent-context.md diff --git a/docs/superpowers/plans/2026-05-12-agent-context.md b/docs/superpowers/plans/2026-05-12-agent-context.md new file mode 100644 index 0000000..17f5850 --- /dev/null +++ b/docs/superpowers/plans/2026-05-12-agent-context.md @@ -0,0 +1,329 @@ +# agent-context — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) +> or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax. + +**Goal:** Add `paperforge agent-context --json` command that gives agents a library overview, command catalog, collection map, and behavior rules in one call. + +**Architecture:** New `paperforge/memory/context.py` queries paperforge.db for aggregated stats. CLI wrapper in `paperforge/commands/agent_context.py`. Pure read-only, no file scanning. + +**Tech Stack:** Python stdlib `sqlite3`, existing `paperforge.memory.db`, `paperforge.core.result.PFResult`. + +**Spec:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md` + +**Prerequisites:** Memory Layer Phase 1 + FTS5 already implemented on `feature/memory` branch. 
+ +--- + +## File Structure + +``` +Create: + paperforge/memory/context.py — get_agent_context(vault) -> dict + paperforge/commands/agent_context.py — CLI run(args) -> int + tests/unit/memory/test_context.py — unit tests + +Modify: + paperforge/cli.py — add "agent-context" subparser + dispatch + paperforge/commands/__init__.py — add to _COMMAND_REGISTRY +``` + +--- + +### Task 1: `paperforge/memory/context.py` + +**Files:** +- Create: `paperforge/memory/context.py` +- Create: `tests/unit/memory/test_context.py` + +- [ ] **Step 1: Write `paperforge/memory/context.py`** + +```python +from __future__ import annotations + +from pathlib import Path + +from paperforge.memory.db import get_connection, get_memory_db_path + + +def _build_collection_tree(conn) -> list[dict]: + """Build collection hierarchy from papers.collection_path. + + Each collection_path is pipe-separated, e.g. "骨科 | 骨折". + Returns flat list of top-level collections with sub-collections. + """ + rows = conn.execute( + "SELECT collection_path, COUNT(*) as cnt FROM papers " + "WHERE collection_path != '' " + "GROUP BY collection_path ORDER BY cnt DESC" + ).fetchall() + top: dict[str, dict] = {} + for row in rows: + parts = [p.strip() for p in row["collection_path"].split("|") if p.strip()] + if not parts: + continue + root = parts[0] + if root not in top: + top[root] = {"name": root, "count": 0, "sub": []} + top[root]["count"] += row["cnt"] + if len(parts) > 1: + sub_name = parts[-1] + if sub_name not in top[root]["sub"]: + top[root]["sub"].append(sub_name) + for c in top.values(): + c["sub"] = sorted(c["sub"]) + return sorted(top.values(), key=lambda x: -x["count"]) + + +def get_agent_context(vault: Path) -> dict | None: + """Build agent context from paperforge.db — library stats + collection tree. + + Returns None if DB is missing or query fails. 
+ """ + db_path = get_memory_db_path(vault) + if not db_path.exists(): + return None + + conn = get_connection(db_path, read_only=True) + try: + total = conn.execute("SELECT COUNT(*) FROM papers").fetchone()[0] + + domains = { + r["domain"]: r["cnt"] + for r in conn.execute( + "SELECT domain, COUNT(*) as cnt FROM papers GROUP BY domain ORDER BY cnt DESC" + ).fetchall() + } + + lifecycle_counts = { + r["lifecycle"]: r["cnt"] + for r in conn.execute( + "SELECT lifecycle, COUNT(*) as cnt FROM papers GROUP BY lifecycle" + ).fetchall() + } + + ocr_counts = { + r["ocr_status"]: r["cnt"] + for r in conn.execute( + "SELECT ocr_status, COUNT(*) as cnt FROM papers GROUP BY ocr_status" + ).fetchall() + } + + deep_counts = { + r["deep_reading_status"]: r["cnt"] + for r in conn.execute( + "SELECT deep_reading_status, COUNT(*) as cnt FROM papers GROUP BY deep_reading_status" + ).fetchall() + } + + collections = _build_collection_tree(conn) + + return { + "library": { + "paper_count": total, + "domain_counts": domains, + "lifecycle_counts": lifecycle_counts, + "ocr_counts": ocr_counts, + "deep_reading_counts": deep_counts, + }, + "collections": collections, + } + except Exception: + return None + finally: + conn.close() +``` + +- [ ] **Step 2: Write `tests/unit/memory/test_context.py`** + +```python +from __future__ import annotations + +from pathlib import Path + +from paperforge.memory.context import get_agent_context + + +def test_get_agent_context_returns_none_when_no_db(): + assert get_agent_context(Path("/nonexistent/vault")) is None +``` + +- [ ] **Step 3: Run tests** + +```bash +python -m pytest tests/unit/memory/test_context.py -v +``` + +- [ ] **Step 4: Commit** + +```bash +git add paperforge/memory/context.py tests/unit/memory/test_context.py +git commit -m "feat(memory): add agent context query module" +``` + +--- + +### Task 2: `paperforge/commands/agent_context.py` + +**Files:** +- Create: `paperforge/commands/agent_context.py` +- Modify: `paperforge/cli.py` (add 
parser + dispatch) +- Modify: `paperforge/commands/__init__.py` (register) + +- [ ] **Step 1: Write `paperforge/commands/agent_context.py`** + +```python +from __future__ import annotations + +import argparse +import sys + +from paperforge.core.errors import ErrorCode +from paperforge.core.result import PFError, PFResult +from paperforge.memory.context import get_agent_context +from paperforge import __version__ as PF_VERSION + +COMMANDS = { + "paper-status": { + "usage": "paperforge paper-status --json", + "purpose": "Look up one paper's full status and recommended next action", + }, + "search": { + "usage": "paperforge search --json [--collection NAME] [--domain NAME] [--ocr done|pending] [--year-from N] [--limit N]", + "purpose": "Full-text search with optional collection/domain/lifecycle filters", + }, + "retrieve": { + "usage": "paperforge retrieve --json [--limit N]", + "purpose": "Search OCR fulltext chunks for evidence paragraphs (coming soon)", + }, + "deep": { + "usage": "/pf-deep ", + "purpose": "Full three-pass deep reading with chart analysis", + }, + "ocr": { + "usage": "/pf-ocr", + "purpose": "Run OCR on papers marked do_ocr:true", + }, + "sync": { + "usage": "/pf-sync", + "purpose": "Sync Zotero and regenerate formal notes + index", + }, +} + +RULES = [ + "Use paperforge.db via CLI commands before reading individual files.", + "Do not infer paper state from stale frontmatter when memory status is fresh.", + "Read source files only after resolving candidates via paper-status or search.", + "To locate a paper: start with collection scope if known, then expand to full library search.", +] + + +def run(args: argparse.Namespace) -> int: + vault = args.vault_path + + library = get_agent_context(vault) + if library is None: + result = PFResult( + ok=False, + command="agent-context", + version=PF_VERSION, + error=PFError( + code=ErrorCode.PATH_NOT_FOUND, + message="Memory database not found or query failed. 
Run paperforge memory build.", + ), + ) + if args.json: + print(result.to_json()) + else: + print(f"Error: {result.error.message}", file=sys.stderr) + return 1 + + data = { + "paperforge": { + "version": PF_VERSION, + "vault": str(vault), + "memory_db": "ready", + }, + "library": library["library"], + "collections": library["collections"], + "commands": COMMANDS, + "rules": RULES, + } + + result = PFResult( + ok=True, + command="agent-context", + version=PF_VERSION, + data=data, + ) + + if args.json: + print(result.to_json()) + else: + lib = data["library"] + print(f"Papers: {lib['paper_count']} total") + print(f"Domains: {lib['domain_counts']}") + print(f"Lifecycle: {lib['lifecycle_counts']}") + for c in data.get("collections", []): + subs = f" ({len(c['sub'])} sub)" if c["sub"] else "" + print(f" [{c['count']:3}] {c['name']}{subs}") + + return 0 if result.ok else 1 +``` + +- [ ] **Step 2: Register CLI parser in `paperforge/cli.py`** + +In `build_parser()`, after the search parser, add: + +```python + p_ac = sub.add_parser("agent-context", help="Generate agent bootstrap context") + p_ac.add_argument("--json", action="store_true", help="Output as JSON") +``` + +In `main()` dispatch, after the search dispatch, add: + +```python + if args.command == "agent-context": + from paperforge.commands.agent_context import run + return run(args) +``` + +- [ ] **Step 3: Update `paperforge/commands/__init__.py`** + +Add to `_COMMAND_REGISTRY`: +```python + "agent-context": "paperforge.commands.agent_context", +``` + +- [ ] **Step 4: Verify** + +```bash +python -m paperforge agent-context --help +python -m pytest tests/unit/ -q --no-header +``` + +- [ ] **Step 5: Commit** + +```bash +git add paperforge/commands/agent_context.py paperforge/cli.py paperforge/commands/__init__.py +git commit -m "feat(cli): add agent-context command for agent bootstrap" +``` + +--- + +### Task 3: Integration test + install + +- [ ] **Step 1: Reinstall + test on test vault** + +```bash +pip install 
--force-reinstall --no-deps . # from feature/memory +python -m paperforge --vault "D:\L\Med\test1" agent-context --json +``` + +Expected: full PFResult with library overview and collection tree. + +- [ ] **Step 2: Verify all existing tests still pass** + +```bash +python -m pytest tests/unit/ -q --no-header +``` From 2b91e09996cc7be61e3dced06abb4b5c8928be31 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 19:55:58 +0800 Subject: [PATCH 017/132] feat(memory): add agent context query module --- paperforge/memory/context.py | 93 +++++++++++++++++++++++++++++++ tests/unit/memory/test_context.py | 9 +++ 2 files changed, 102 insertions(+) create mode 100644 paperforge/memory/context.py create mode 100644 tests/unit/memory/test_context.py diff --git a/paperforge/memory/context.py b/paperforge/memory/context.py new file mode 100644 index 0000000..01f6f7b --- /dev/null +++ b/paperforge/memory/context.py @@ -0,0 +1,93 @@ +from __future__ import annotations + +from pathlib import Path + +from paperforge.memory.db import get_connection, get_memory_db_path + + +def _build_collection_tree(conn) -> list[dict]: + """Build collection hierarchy from papers.collection_path. + + Each collection_path is pipe-separated, e.g. "骨科 | 骨折". + Returns flat list of top-level collections with sub-collections. 
+ """ + rows = conn.execute( + "SELECT collection_path, COUNT(*) as cnt FROM papers " + "WHERE collection_path != '' " + "GROUP BY collection_path ORDER BY cnt DESC" + ).fetchall() + top: dict[str, dict] = {} + for row in rows: + parts = [p.strip() for p in row["collection_path"].split("|") if p.strip()] + if not parts: + continue + root = parts[0] + if root not in top: + top[root] = {"name": root, "count": 0, "sub": []} + top[root]["count"] += row["cnt"] + if len(parts) > 1: + sub_name = parts[-1] + if sub_name not in top[root]["sub"]: + top[root]["sub"].append(sub_name) + for c in top.values(): + c["sub"] = sorted(c["sub"]) + return sorted(top.values(), key=lambda x: -x["count"]) + + +def get_agent_context(vault: Path) -> dict | None: + """Build agent context from paperforge.db — library stats + collection tree. + + Returns None if DB is missing or query fails. + """ + db_path = get_memory_db_path(vault) + if not db_path.exists(): + return None + + conn = get_connection(db_path, read_only=True) + try: + total = conn.execute("SELECT COUNT(*) FROM papers").fetchone()[0] + + domains = { + r["domain"]: r["cnt"] + for r in conn.execute( + "SELECT domain, COUNT(*) as cnt FROM papers GROUP BY domain ORDER BY cnt DESC" + ).fetchall() + } + + lifecycle_counts = { + r["lifecycle"]: r["cnt"] + for r in conn.execute( + "SELECT lifecycle, COUNT(*) as cnt FROM papers GROUP BY lifecycle" + ).fetchall() + } + + ocr_counts = { + r["ocr_status"]: r["cnt"] + for r in conn.execute( + "SELECT ocr_status, COUNT(*) as cnt FROM papers GROUP BY ocr_status" + ).fetchall() + } + + deep_counts = { + r["deep_reading_status"]: r["cnt"] + for r in conn.execute( + "SELECT deep_reading_status, COUNT(*) as cnt FROM papers GROUP BY deep_reading_status" + ).fetchall() + } + + collections = _build_collection_tree(conn) + + return { + "library": { + "paper_count": total, + "domain_counts": domains, + "lifecycle_counts": lifecycle_counts, + "ocr_counts": ocr_counts, + "deep_reading_counts": 
deep_counts, + }, + "collections": collections, + } + except Exception: + return None + finally: + conn.close() diff --git a/tests/unit/memory/test_context.py b/tests/unit/memory/test_context.py new file mode 100644 index 0000000..2a885bc --- /dev/null +++ b/tests/unit/memory/test_context.py @@ -0,0 +1,9 @@ +from __future__ import annotations + +from pathlib import Path + +from paperforge.memory.context import get_agent_context + + +def test_get_agent_context_returns_none_when_no_db(): + assert get_agent_context(Path("/nonexistent/vault")) is None From 551abd0e150a46aa0d71350ed8cd573dedde4819 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 19:59:12 +0800 Subject: [PATCH 018/132] feat(cli): add agent-context command for agent bootstrap --- paperforge/cli.py | 9 +++ paperforge/commands/__init__.py | 1 + paperforge/commands/agent_context.py | 96 ++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+) create mode 100644 paperforge/commands/agent_context.py diff --git a/paperforge/cli.py b/paperforge/cli.py index 4e9ee03..f40c5c7 100644 --- a/paperforge/cli.py +++ b/paperforge/cli.py @@ -282,6 +282,10 @@ def build_parser() -> argparse.ArgumentParser: p_search.add_argument("--lifecycle", choices=["indexed","pdf_ready","fulltext_ready","deep_read_done"], help="Filter by lifecycle") p_search.add_argument("--next-step", choices=["sync","ocr","/pf-deep","ready"], help="Filter by next step") + # agent-context + p_ac = sub.add_parser("agent-context", help="Generate agent bootstrap context") + p_ac.add_argument("--json", action="store_true", help="Output as JSON") + # base-refresh p_base = sub.add_parser("base-refresh", help="Refresh Obsidian Base view files") p_base.add_argument( @@ -509,6 +513,11 @@ def main(argv: list[str] | None = None) -> int: return run(args) + if args.command == "agent-context": + from paperforge.commands.agent_context import run + + return run(args) + if args.command == "base-refresh": force = getattr(args, "force", 
False) paths = args.paths diff --git a/paperforge/commands/__init__.py b/paperforge/commands/__init__.py index 9306159..cdaac04 100644 --- a/paperforge/commands/__init__.py +++ b/paperforge/commands/__init__.py @@ -12,6 +12,7 @@ "finalize": "paperforge.commands.finalize", "memory": "paperforge.commands.memory", "paper-status": "paperforge.commands.paper_status", + "agent-context": "paperforge.commands.agent_context", } diff --git a/paperforge/commands/agent_context.py b/paperforge/commands/agent_context.py new file mode 100644 index 0000000..4ad294b --- /dev/null +++ b/paperforge/commands/agent_context.py @@ -0,0 +1,96 @@ +from __future__ import annotations + +import argparse +import sys + +from paperforge.core.errors import ErrorCode +from paperforge.core.result import PFError, PFResult +from paperforge.memory.context import get_agent_context +from paperforge import __version__ as PF_VERSION + +COMMANDS = { + "paper-status": { + "usage": "paperforge paper-status --json", + "purpose": "Look up one paper's full status and recommended next action", + }, + "search": { + "usage": "paperforge search --json [--collection NAME] [--domain NAME] [--ocr done|pending] [--year-from N] [--year-to N] [--limit N]", + "purpose": "Full-text search with optional collection/domain/lifecycle filters", + }, + "retrieve": { + "usage": "paperforge retrieve --json [--limit N]", + "purpose": "Search OCR fulltext chunks for evidence paragraphs (coming soon)", + }, + "deep": { + "usage": "/pf-deep ", + "purpose": "Full three-pass deep reading with chart analysis", + }, + "ocr": { + "usage": "/pf-ocr", + "purpose": "Run OCR on papers marked do_ocr:true", + }, + "sync": { + "usage": "/pf-sync", + "purpose": "Sync Zotero and regenerate formal notes + index", + }, +} + +RULES = [ + "Use paperforge.db via CLI commands before reading individual files.", + "Do not infer paper state from stale frontmatter when memory status is fresh.", + "Read source files only after resolving candidates via 
paper-status or search.", + "To locate a paper: start with collection scope if known, then expand to full library search.", +] + + +def run(args: argparse.Namespace) -> int: + vault = args.vault_path + + context = get_agent_context(vault) + if context is None: + result = PFResult( + ok=False, + command="agent-context", + version=PF_VERSION, + error=PFError( + code=ErrorCode.PATH_NOT_FOUND, + message="Memory database not found or query failed. Run paperforge memory build.", + ), + ) + if args.json: + print(result.to_json()) + else: + print(f"Error: {result.error.message}", file=sys.stderr) + return 1 + + data = { + "paperforge": { + "version": PF_VERSION, + "vault": str(vault), + "memory_db": "ready", + }, + "library": context["library"], + "collections": context["collections"], + "commands": COMMANDS, + "rules": RULES, + } + + result = PFResult( + ok=True, + command="agent-context", + version=PF_VERSION, + data=data, + ) + + if args.json: + print(result.to_json()) + else: + lib = data["library"] + print(f"Papers: {lib['paper_count']} total") + print(f"Domains: {lib['domain_counts']}") + print(f"Lifecycle: {lib['lifecycle_counts']}") + for c in data.get("collections", []): + subs = f" ({len(c['sub'])} sub)" if c["sub"] else "" + print(f" [{c['count']:3}] {c['name']}{subs}") + + return 0 if result.ok else 1 From 3ee0320fb943b69ba2a30ebfcba103aa566b60b5 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 20:00:16 +0800 Subject: [PATCH 019/132] =?UTF-8?q?test(memory):=20verify=20agent-context?= =?UTF-8?q?=20on=20test=20vault=20=E2=80=94=20150=20papers,=2026=20collect?= =?UTF-8?q?ions,=204=20rules?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../phases/agent-context-review/REVIEW.md | 247 ++++++++++++ ...2026-05-12-memory-layer-phase2-5-REVIEW.md | 381 ++++++++++++++++++ 2 files changed, 628 insertions(+) create mode 100644 .planning/phases/agent-context-review/REVIEW.md create mode 100644 
docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-REVIEW.md diff --git a/.planning/phases/agent-context-review/REVIEW.md b/.planning/phases/agent-context-review/REVIEW.md new file mode 100644 index 0000000..2ac8ac1 --- /dev/null +++ b/.planning/phases/agent-context-review/REVIEW.md @@ -0,0 +1,247 @@ +--- +phase: agent-context-plan-review +reviewed: 2026-05-12T00:00:00Z +depth: deep +files_reviewed: 6 +files_reviewed_list: + - docs/superpowers/plans/2026-05-12-agent-context.md + - docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md + - paperforge/cli.py + - paperforge/commands/__init__.py + - paperforge/core/result.py + - paperforge/memory/schema.py +findings: + critical: 1 + warning: 3 + info: 2 + total: 6 +status: issues_found +--- + +# Phase: agent-context Plan Review + +**Reviewed:** 2026-05-12 +**Depth:** deep (cross-file analysis — spec vs plan vs existing CLI conventions vs schema) +**Files Reviewed:** 6 +**Status:** issues_found + +## Summary + +Reviewed the `agent-context` implementation plan against the Phase 2-5 design spec (Feature 1), existing `cli.py` conventions, the `PFResult` contract, and the `papers` table schema. The plan follows established CLI dispatch patterns and all SQL column names correctly match the schema. However, one **BLOCKER** output-structure violation was found (collections at wrong JSON path), along with several warnings about error handling and spec fidelity. + +--- + +## Critical Issues + +### CR-01: `collections` output at wrong JSON path — spec/plan contract violation + +**File:** `docs/superpowers/plans/2026-05-12-agent-context.md:121-128` and `docs/superpowers/plans/2026-05-12-agent-context.md:239-248` + +**Issue:** The spec defines `collections` as a top-level key under `data`, sibling to `library`: + +```json +// Spec (lines 44-47 of design spec): +"data": { + "library": { ... }, + "collections": [ ... ], // ← top-level under data + "commands": { ... }, + "rules": [ ... 
] +} +``` + +But the plan nests `collections` *inside* `library`: + +```python +# Plan: get_agent_context() returns (line 121-128): +return { + "paper_count": total, + "domain_counts": domains, + "lifecycle_counts": lifecycle_counts, + "ocr_counts": ocr_counts, + "deep_reading_counts": deep_counts, + "collections": collections, # ← inside library dict +} + +# Plan: CLI wrapper constructs (line 239-248): +data = { + "library": library, # ← library includes collections + ... +} +# No separate "collections" key at data level! +``` + +Result: `data.library.collections` instead of spec's `data.collections`. Any downstream agent or plugin that follows the spec contract and accesses `data.collections` will get nothing / `undefined`. + +**Fix:** Either: + +**Option A (move to spec location):** Remove `collections` from `get_agent_context()` return value, and set it at the `data` level in the CLI wrapper: + +```python +# In get_agent_context(), remove "collections": +return { + "paper_count": total, + "domain_counts": domains, + "lifecycle_counts": lifecycle_counts, + "ocr_counts": ocr_counts, + "deep_reading_counts": deep_counts, +} +# In CLI run(), add collections at data level: +data = { + "paperforge": {...}, + "library": library, + "collections": _build_collection_tree_from_conn(vault), # separate call + "commands": COMMANDS, + "rules": RULES, +} +``` + +**Option B (update spec):** If nesting is intentional, update the spec JSON example to show `library.collections` instead of `data.collections`. The non-json output code in the plan (line 264) already reads `lib.get("collections", [])` which matches the nested location. + +--- + +## Warnings + +### WR-01: Blanket `except Exception` silently swallows all query errors + +**File:** `docs/superpowers/plans/2026-05-12-agent-context.md:129-130` + +**Issue:** The `get_agent_context()` function has: + +```python +try: + ... 
+ return {...} +except Exception: + return None +``` + +This catches *everything* — corrupt DB, permission errors, schema mismatch, disk I/O errors — and returns `None`. The caller then reports: + +> "Memory database not found. Run paperforge memory build." + +This message is **wrong** for non-missing-DB failures. A corrupt database or a permission error is not fixed by rebuilding the database. The real exception is lost entirely, making debugging impossible. + +**Fix:** At minimum, log the exception before returning `None`. Better: distinguish between "DB missing" and "DB query failed": + +```python +import logging +logger = logging.getLogger(__name__) + +def get_agent_context(vault: Path) -> dict: + db_path = get_memory_db_path(vault) + if not db_path.exists(): + return None + + conn = get_connection(db_path, read_only=True) + try: + ... + return {...} + except Exception as exc: + logger.exception("Failed to query agent context from %s", db_path) + return None + finally: + conn.close() +``` + +Or propagate the exception upward and let the CLI layer construct a more accurate `PFError` with `ErrorCode.INTERNAL_ERROR` and the actual error message (matching how `search.py` handles exceptions at line 62-66). + +### WR-02: Docstring is misleading about return conditions + +**File:** `docs/superpowers/plans/2026-05-12-agent-context.md:78-81` + +**Issue:** + +```python +def get_agent_context(vault: Path) -> dict: + """Build agent bootstrap context from paperforge.db. + + Returns None if DB is missing. + """ +``` + +The docstring says "Returns None if DB is missing" but the function returns `None` on *any* exception (DB missing, corrupt, permission denied, etc.). Inaccurate docstrings mislead future maintainers. + +**Fix:** Update to: + +```python +"""Build agent bootstrap context from paperforge.db. + +Returns None if the DB file does not exist or a query fails. 
+""" +``` + +### WR-03: Search command usage string missing `--year-to` flag + +**File:** `docs/superpowers/plans/2026-05-12-agent-context.md:189-192` + +**Issue:** The plan's `COMMANDS` dict lists: + +```python +"search": { + "usage": "paperforge search --json [--collection NAME] [--domain NAME] [--ocr done|pending] [--year-from N] [--limit N]", + ... +} +``` + +But the spec (line 54) and the actual CLI parser (`cli.py:279`) both include `[--year-to N]`. The plan omits it. The spec usage string is: + +``` +paperforge search --json [--collection NAME] [--domain NAME] [--ocr done|pending] [--year-from N] [--year-to N] [--limit N] +``` + +An agent that reads the plan's command catalog may not know `--year-to` is available. + +**Fix:** Add `[--year-to N]` to the search usage string in `COMMANDS`: + +```python +"search": { + "usage": "paperforge search --json [--collection NAME] [--domain NAME] [--ocr done|pending] [--year-from N] [--year-to N] [--limit N]", + "purpose": "Full-text search with optional collection/domain/lifecycle filters", +}, +``` + +--- + +## Info + +### IN-01: Minimal test coverage — no integration-level test + +**File:** `docs/superpowers/plans/2026-05-12-agent-context.md:137-147` + +**Issue:** The single test only covers the `None` return when DB is absent: + +```python +def test_get_agent_context_returns_none_when_no_db(): + assert get_agent_context(Path("/nonexistent/vault")) is None +``` + +There are no tests for: +- Successful query of a populated DB +- Empty DB (0 papers) +- Collection tree with multi-level pipe-separated paths +- Collection tree with empty/whitespace-only paths +- Domain/lifecycle/OCR/deep-reading counts + +**Fix:** Consider adding a fixture-based test using an in-memory SQLite DB with sample data. 
+ +### IN-02: Redundant `_COMMAND_REGISTRY` entry + +**File:** `docs/superpowers/plans/2026-05-12-agent-context.md:290-293` and `paperforge/cli.py:431-571` + +**Issue:** The plan adds `agent-context` to `_COMMAND_REGISTRY` *and* adds direct `if args.command == "agent-context"` dispatch in `cli.py`. The existing `cli.py` main() function already uses direct dispatch for most commands (`paper-status`, `search`, `context`, `dashboard`, etc.) and only uses `_COMMAND_REGISTRY` for `memory` subcommand dispatch. Adding to both is harmless but inconsistent — either use the registry or use direct dispatch, not both. + +Since the plan already adds to `_COMMAND_REGISTRY`, you could use it for dispatch instead: + +```python +if args.command == "agent-context": + mod = get_command_module("agent-context") + return mod.run(args) +``` + +Or keep the direct dispatch and skip `_COMMAND_REGISTRY` (matching `search`, `paper-status`, `context` patterns). Either is fine — just pick one. + +--- + +_Reviewed: 2026-05-12_ +_Reviewer: the agent (gsd-code-reviewer)_ +_Depth: deep_ diff --git a/docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-REVIEW.md b/docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-REVIEW.md new file mode 100644 index 0000000..93cf0c4 --- /dev/null +++ b/docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-REVIEW.md @@ -0,0 +1,381 @@ +--- +phase: memory-layer-2-5-spec-review +reviewed: 2026-05-12T00:00:00Z +depth: deep +files_reviewed: 7 +files_reviewed_list: + - docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md + - paperforge/memory/query.py + - paperforge/memory/builder.py + - paperforge/memory/schema.py + - paperforge/commands/dashboard.py + - paperforge/memory/fts.py + - paperforge/cli.py +findings: + critical: 5 + warning: 8 + info: 7 + total: 20 +status: issues_found +--- + +# Spec Review: Memory Layer Phase 2-5 Design + +**Reviewed:** 2026-05-12 +**Depth:** deep (cross-file analysis with import graph and call-chain tracing) +**Files 
Reviewed:** 7 (1 spec + 6 existing source files) +**Status:** issues_found + +## Summary + +Cross-referenced the Phase 2-5 design spec against the existing Memory Layer codebase (`memory/query.py`, `memory/builder.py`, `memory/schema.py`, `memory/fts.py`), plus the CLI dispatcher (`cli.py`) and dashboard command (`commands/dashboard.py`). + +The design correctly reuses existing infrastructure (`compute_hash`, `PAPER_COLUMNS`, `ASSET_FIELDS`, `read_index`) and follows the layered architecture (memory lib → commands module → CLI dispatch). However, five blocker-level issues were found involving **contract violations**, **missing schema migrations**, and **incomplete PFResult compliance** that must be resolved before implementation begins. + +--- + +## Critical Issues + +### CR-01: Dashboard return format contract violation + +**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:188-200` + +**Issue:** The spec states "Dashboard output format must NOT change (plugin depends on it)" (line 204), yet `_dashboard_from_db()` adds a new top-level key `_source` to the return dict. The existing `_gather_dashboard_data()` returns `{"stats": {...}, "permissions": {...}}` with exactly two top-level keys. Adding `_source` is a format change that will break any plugin consumer that iterates over top-level keys or destructures the response. + +**Fix:** Either: +1. Nest `_source` inside `stats` (e.g., `stats._source`), preserving the two-key top-level structure, OR +2. Explicitly acknowledge the format change and version the dashboard response schema, coordinating with the plugin team. 
+ +```python +# Option 1 — nest inside stats: +return { + "stats": { + "papers": total, + "pdf_health": {...}, + "ocr_health": {...}, + "domain_counts": domain_counts, + "_source": "paperforge.db" # nested, not top-level + }, + "permissions": permissions, +} +``` + +--- + +### CR-02: Schema version not bumped for new tables + +**File:** `paperforge/memory/schema.py:5` (CURRENT_SCHEMA_VERSION = 1) +**Cross-ref:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:282-304` + +**Issue:** The spec introduces two new tables (`paper_chunks`, `chunk_fts`) but does not mention incrementing `CURRENT_SCHEMA_VERSION` (currently `1`). The existing `get_memory_status()` in `query.py:38` compares stored schema version against `CURRENT_SCHEMA_VERSION` to detect staleness. If the version stays at 1, existing databases won't be detected as stale, and `ensure_schema()` won't know to create the new tables on upgrade. + +Additionally, `build_from_index()` in `builder.py:88-90` drops and recreates all tables when the stored version differs from `CURRENT_SCHEMA_VERSION`: +```python +if stored_version != CURRENT_SCHEMA_VERSION: + drop_all_tables(conn) +ensure_schema(conn) +``` +Without a version bump, upgrading users will never get the new tables. + +**Fix:** Bump `CURRENT_SCHEMA_VERSION` to `2` in `schema.py:5`. The spec should explicitly state this. + +--- + +### CR-03: FTS virtual table naming violates existing convention + +**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:295-303` +**Cross-ref:** `paperforge/memory/schema.py:86-101` + +**Issue:** The existing FTS virtual table is named `paper_fts` (schema.py line 86). The spec names the new content-sync table `chunk_fts`. The established naming convention is `{entity}_fts` where `{entity}` is the base table name. 
Since the entity table is `paper_chunks` (not `chunks`), the FTS table should be `paper_chunk_fts` for consistency and to avoid collision with any future `chunks` table from another subsystem. + +**Fix:** Rename `chunk_fts` to `paper_chunk_fts` throughout the spec. + +```sql +CREATE VIRTUAL TABLE IF NOT EXISTS paper_chunk_fts USING fts5( + chunk_id UNINDEXED, + paper_id UNINDEXED, + source_type, + section_title, + chunk_text, + content='paper_chunks', + content_rowid='rowid' +); +``` + +--- + +### CR-04: `agent-context` output format not wrapped in PFResult + +**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:26-81` +**Cross-ref:** `paperforge/core/result.py:18-27` (PFResult dataclass) + +**Issue:** Every CLI command in the existing codebase returns output via `PFResult.to_json()` (see `paper_status.py:35-40`, `search.py:61`, `dashboard.py:38-40`). The PFResult contract includes fields `ok`, `command`, `version`, `data`, `error`, `warnings`, `next_actions`. The spec's `agent-context` output shows a raw dict structure mimicking PFResult but it is ambiguous whether the implementation will actually use the `PFResult` dataclass. + +If this command bypasses PFResult, it breaks the contract that all `--json` outputs conform to the same envelope format, making it impossible for downstream consumers (plugin, agents) to parse responses uniformly. 
+ +**Fix:** The spec should state explicitly: +```python +result = PFResult( + ok=True, + command="agent-context", + version=PF_VERSION, + data={...}, # the full context dict +) +print(result.to_json()) +``` + +--- + +### CR-05: `get_agent_context()` has no error handling for SQL failures + +**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:88-107` +**Cross-ref:** `paperforge/memory/schema.py:150-158` (get_schema_version catches OperationalError) + +**Issue:** The spec's `get_agent_context()` opens a connection and executes queries but wraps only the connection lifecycle in try/finally (for close). It does not wrap the individual SQL queries in try/except. If the DB exists but has a corrupted schema (wrong column count, missing table), the query will raise `sqlite3.OperationalError` which propagates unhandled up to the CLI command, producing a raw traceback instead of a clean PFResult error. + +Compare with `get_memory_status()` in `query.py:46-49` which wraps all DB reads in `try/except Exception` and returns a safe fallback dict. + +**Fix:** +```python +def get_agent_context(vault: Path) -> dict: + conn = get_connection(get_memory_db_path(vault), read_only=True) + try: + total = conn.execute("SELECT COUNT(*) FROM papers").fetchone()[0] + # ... + except sqlite3.Error as exc: + return {"ok": False, "error": f"DB read failed: {exc}"} + finally: + conn.close() +``` + +--- + +## Warnings + +### WR-01: `agent-context` re-derives freshness instead of delegating + +**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:88-107` +**Cross-ref:** `paperforge/memory/query.py:16-77` (get_memory_status) + +**Issue:** The spec's `get_agent_context()` manually queries `SELECT COUNT(*)` and `GROUP BY domain` but does not use the existing `get_memory_status()` function which already computes `fresh`, `needs_rebuild`, `hash_match`, and `count_match`. 
The `"memory_db": "ready"` field is hardcoded — it doesn't reflect whether the DB is actually fresh. Calling `get_memory_status()` would provide a canonical freshness signal that can gate whether the agent can trust the DB. + +**Fix:** Add a call to `get_memory_status(vault)` at the top of `get_agent_context()` and use `result["fresh"]` to set the `memory_db` field to `"ready"` or `"stale"`. + +--- + +### WR-02: `pdf_health` via `lifecycle` is lossy — misses `path_error` states + +**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:163-167` +**Cross-ref:** `paperforge/commands/dashboard.py:84-107` (path_error regex detection) +**Cross-ref:** `paperforge/memory/builder.py:28-38` (PAPER_COLUMNS — no path_error column) + +**Issue:** The spec computes `pdf_healthy` as `r["lifecycle"] != "indexed"`. A paper with `lifecycle == "pdf_ready"` has a PDF, but that PDF could be broken (permission denied, file missing). The existing file-scanning code in `dashboard.py:99-107` uses a `path_error` regex to detect these cases and counts them separately as `broken`. The DB schema (`PAPER_COLUMNS`) has no `path_error` column, so the DB-based dashboard cannot distinguish between "healthy PDF" and "broken PDF." The hardcoded `"broken": 0` is misleading. + +**Fix:** Either: +1. Add a `path_error` column to the `papers` table and populate it during `build_from_index()`, OR +2. Document this as a known limitation and note that `broken` counts require file-system scanning. + +--- + +### WR-03: `refresh_paper()` linear O(n) scan through formal-library.json + +**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:228-234` + +**Issue:** The spec searches for the target entry by iterating through all items: +```python +for e in items: + if e.get("zotero_key") == zotero_key: + entry = e; break +``` +For 283 papers this is negligible, but for larger libraries (10K+ entries), this becomes a performance concern. 
The spec should at minimum acknowledge this limitation and note that an index lookup or dictionary-based approach should be considered for scale. + +**Fix:** Build a lookup dict keyed by `zotero_key`: +```python +index_map = {e.get("zotero_key"): e for e in items if e.get("zotero_key")} +entry = index_map.get(zotero_key) +``` + +--- + +### WR-04: `refresh_paper()` silent skip on stale index is indistinguishable from success + +**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:268` + +**Issue:** The spec says "If formal-library.json is stale (entry not found), skip silently" and `refresh_paper()` returns `False`. However, the integration points (sync, ocr, deep-finalize, repair) call `refresh_paper()` after modifying state. If the index hasn't been regenerated yet, the refresh silently fails and the DB is now out of sync with the ground truth. The caller has no way to distinguish "refresh succeeded" from "entry not in index yet — DB unchanged." + +This is most acute after `paperforge ocr` where OCR status changes but sync hasn't re-run — the DB will show stale OCR status. + +**Fix:** Return a richer result: +```python +return {"action": "refreshed", "key": zotero_key} +# vs +return {"action": "skipped", "key": zotero_key, "reason": "not_in_index"} +``` +Or raise a distinguishable exception that callers can catch and handle (e.g., trigger a full rebuild). + +--- + +### WR-05: `retrieve` chunk output doesn't specify JOIN to get `title` + +**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:319-339` +**Cross-ref:** `paperforge/memory/fts.py:41-51` (search_papers JOIN pattern) + +**Issue:** The `retrieve` output (lines 327-338) shows `zotero_key` and `title` fields per chunk, but `paper_chunks` stores only `paper_id` (not `zotero_key` or `title`). The existing `search_papers()` in `fts.py:41-51` demonstrates the correct pattern: JOIN `paper_fts f` → `papers p ON p.rowid = f.rowid` to get metadata. 
The spec's `retrieve` query is unspecified — it must JOIN `chunk_fts` → `paper_chunks` → `papers` to produce the output format shown. + +**Fix:** Specify the query: +```sql +SELECT c.chunk_id, c.paper_id, c.source_type, c.section_title, + c.page_number, c.chunk_text, p.title, p.zotero_key, rank +FROM paper_chunk_fts f +JOIN paper_chunks c ON c.rowid = f.rowid +JOIN papers p ON p.zotero_key = c.paper_id +WHERE paper_chunk_fts MATCH ? +ORDER BY rank LIMIT ? +``` + +--- + +### WR-06: `agent-context` advertises `--collection` flag that doesn't exist + +**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:54` +**Cross-ref:** `paperforge/cli.py:273-283` (search subparser — no --collection flag) + +**Issue:** The `agent-context` output lists: +``` +"search": { + "usage": "paperforge search --json [--collection NAME] [--domain NAME] ..." +} +``` +But the existing `search` subparser (cli.py lines 273-283) defines `--domain`, `--year-from`, `--year-to`, `--ocr`, `--deep`, `--lifecycle`, `--next-step` — **no `--collection` filter**. If an agent reads the `agent-context` output and tries `--collection`, the command will fail with an unrecognized argument error. + +**Fix:** Either add `--collection` to the search subparser (requires adding a `collection_path` filter to `search_papers()` in fts.py), or remove it from the agent-context output until it's implemented. + +--- + +### WR-07: FTS triggers for `paper_chunks` / `paper_chunk_fts` not specified + +**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:295-303` +**Cross-ref:** `paperforge/memory/schema.py:103-118` (FTS_TRIGGERS for papers) + +**Issue:** The existing `paper_fts` table uses `content='papers'` (a content-sync external content FTS5 table) and relies on INSERT/UPDATE/DELETE triggers on the `papers` table to keep the FTS index in sync (schema.py lines 103-118). 
The spec's `chunk_fts` also uses `content='paper_chunks'` with `content_rowid='rowid'` — the same content-sync pattern. But the spec does not mention the required triggers on the `paper_chunks` table. Without them, inserts/deletes into `paper_chunks` won't update the FTS index. + +**Fix:** Add trigger definitions to the spec: +```sql +CREATE TRIGGER IF NOT EXISTS paper_chunks_ai AFTER INSERT ON paper_chunks BEGIN + INSERT INTO paper_chunk_fts(rowid, chunk_id, paper_id, source_type, section_title, chunk_text) + VALUES (new.rowid, new.chunk_id, new.paper_id, new.source_type, new.section_title, new.chunk_text); +END; +CREATE TRIGGER IF NOT EXISTS paper_chunks_ad AFTER DELETE ON paper_chunks BEGIN + INSERT INTO paper_chunk_fts(paper_chunk_fts, rowid, chunk_id, paper_id, source_type, section_title, chunk_text) + VALUES ('delete', old.rowid, old.chunk_id, old.paper_id, old.source_type, old.section_title, old.chunk_text); +END; +CREATE TRIGGER IF NOT EXISTS paper_chunks_au AFTER UPDATE ON paper_chunks BEGIN + INSERT INTO paper_chunk_fts(paper_chunk_fts, rowid, chunk_id, paper_id, source_type, section_title, chunk_text) + VALUES ('delete', old.rowid, old.chunk_id, old.paper_id, old.source_type, old.section_title, old.chunk_text); + INSERT INTO paper_chunk_fts(rowid, chunk_id, paper_id, source_type, section_title, chunk_text) + VALUES (new.rowid, new.chunk_id, new.paper_id, new.source_type, new.section_title, new.chunk_text); +END; +``` + +--- + +### WR-08: DB dashboard hardcodes `broken: 0` — data regression from file scanner + +**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:191` +**Cross-ref:** `paperforge/commands/dashboard.py:78,98-105` (pdf_broken tracking) +**Cross-ref:** `paperforge/memory/builder.py:28-38` (PAPER_COLUMNS — no path_error) + +**Issue:** The existing file-scanning code tracks three PDF states: `healthy`, `broken`, and `missing`. 
The DB-based approach hardcodes `"broken": 0` because the `papers` table has no column for path_error. This means: +- A PDF file deleted after sync will show as `healthy` (lifecycle unchanged in DB) but is actually broken. +- The user sees 0 broken PDFs in the dashboard when they may have several. + +The fallback to file scanning when DB is stale partially mitigates this, but a fresh DB can also have stale path information for any paper whose PDF was moved/deleted after the last `memory build`. + +**Fix:** Either add a `broken_pdf_count` computation that cross-checks `pdf_path` existence on disk (lightweight stat call), or document that the DB dashboard shows "index-time PDF health" and the file scanner shows "current PDF health." + +--- + +## Info + +### IN-01: Command naming inconsistency — `agent-context` vs existing `paper-status` + +**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:121` +**Cross-ref:** `paperforge/cli.py:269-271` (paper-status subparser) + +**Issue:** Existing commands use descriptive noun phrases: `paper-status`, `deep-reading`, `base-refresh`. The new command `agent-context` follows a different pattern. While the purposes differ (paper-level vs. system-level), the inconsistency is worth noting for CLI discoverability. + +**Suggestion:** Consider `context` (shorter) or `memory-context` (follows `memory build`/`memory status` pattern). No change required — just noting. + +--- + +### IN-02: `ALL_TABLES` and `drop_all_tables()` not updated in spec + +**File:** `paperforge/memory/schema.py:120,137-141` +**Cross-ref:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:282-303` + +**Issue:** The `ALL_TABLES` list in `schema.py:120` controls which tables `drop_all_tables()` removes on rebuild. The spec introduces `paper_chunks` and `chunk_fts` but doesn't mention updating this list. 
If `drop_all_tables()` is called during a full rebuild (e.g., schema version mismatch), the old tables won't be dropped, potentially leaving orphaned data. + +**Suggestion:** The spec should note that `ALL_TABLES` must be updated to include the new tables. + +--- + +### IN-03: `ensure_schema()` not mentioned in spec + +**File:** `paperforge/memory/schema.py:123-134` +**Cross-ref:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:282-303` + +**Issue:** The spec defines `CREATE TABLE` statements for `paper_chunks` and `chunk_fts` but doesn't mention that `ensure_schema()` must be updated to execute these statements. Both `build_from_index()` and `refresh_paper()` rely on `ensure_schema()` to guarantee tables exist. + +**Suggestion:** Add a note: "Update `ensure_schema()` in `schema.py` to execute `CREATE TABLE IF NOT EXISTS paper_chunks` and `CREATE VIRTUAL TABLE IF NOT EXISTS paper_chunk_fts`." + +--- + +### IN-04: `retrieve` command name vs `search` — discoverability concern + +**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:57-60,315-317` +**Cross-ref:** `paperforge/cli.py:273` (search subparser) + +**Issue:** The spec introduces `paperforge retrieve` for OCR fulltext searching alongside existing `paperforge search` for metadata searching. The names don't make the distinction self-evident. New users won't know whether to `search` or `retrieve`. + +**Suggestion:** Consider `paperforge fulltext` or `paperforge search-content` to make the purpose clearer. Alternatively, add a `--fulltext` flag to the existing `search` command that switches to `chunk_fts` when specified. No blocker — naming preference. 
+ +--- + +### IN-05: `agent-context` requires `--json` flag but always outputs JSON + +**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:121,127` +**Cross-ref:** `paperforge/cli.py` (all commands gate JSON output on --json flag) + +**Issue:** The spec says "Always outputs `--json` format; no human-readable mode needed" (line 127), yet the CLI spec shows `paperforge agent-context --json` (line 121). If the command always outputs JSON, the `--json` flag is either redundant (confusing) or incorrectly documented (the command should work without `--json` for human-readable output, like `paper-status` does in `paper_status.py:52-68`). + +**Suggestion:** Either: +1. Make `--json` required/default and remove it from the usage (always JSON), or +2. Add a human-readable mode like `paper-status` and keep `--json` as optional. + +--- + +### IN-06: Field name `paper_status` (underscore) vs `paper-status` (hyphen) in `agent-context` output + +**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:49` +**Cross-ref:** `paperforge/cli.py:269` (paper-status subparser name) + +**Issue:** The `agent-context` output uses `"paper-status"` as the command key (correct, matches CLI name). However, the next_actions pattern in existing code uses the command name as-is. Minor — no bug, just noting for consistency review. + +--- + +### IN-07: Chunking strategy — `max 500 tokens` underspecified + +**File:** `docs/superpowers/specs/2026-05-12-memory-layer-phase2-5-design.md:306-311` + +**Issue:** The spec says "Max 500 tokens per chunk" and "max 3 paragraphs per chunk" but doesn't specify: +- What constitutes a "token" (word-based? `tiktoken`? character count / 4?) +- Whether the token limit or paragraph limit takes precedence +- What happens when a single paragraph exceeds 500 tokens (split mid-paragraph? truncate? keep as oversized chunk?) + +**Suggestion:** Clarify tokenization method and tie-breaking rules. 
For example: "Use `len(text.split())` as a word-count proxy for tokens. If a single paragraph exceeds 500 words, split at sentence boundaries." + +--- + +_Reviewed: 2026-05-12_ +_Reviewer: VT-OS/OPENCODE Terminal (gsd-code-reviewer)_ +_Depth: deep_ From d4df673088e1f4f07106057308d3c37c674db105 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 20:08:46 +0800 Subject: [PATCH 020/132] feat(memory): dashboard reads from SQLite + incremental refresh_paper() --- paperforge/commands/dashboard.py | 124 ++++++++++++++++++++-------- paperforge/memory/refresh.py | 133 ++++++++++++++++++++++++++++++ tests/unit/memory/test_refresh.py | 9 ++ 3 files changed, 233 insertions(+), 33 deletions(-) create mode 100644 paperforge/memory/refresh.py create mode 100644 tests/unit/memory/test_refresh.py diff --git a/paperforge/commands/dashboard.py b/paperforge/commands/dashboard.py index 68b0e79..ac93890 100644 --- a/paperforge/commands/dashboard.py +++ b/paperforge/commands/dashboard.py @@ -51,12 +51,91 @@ def run(args) -> int: return 1 -def _gather_dashboard_data(vault: Path) -> dict: - """Gather stats and permissions for dashboard display.""" +def _dashboard_from_db(vault: Path) -> dict | None: + """Build dashboard stats from paperforge.db. 
Returns None if DB missing.""" + from paperforge.memory.db import get_connection, get_memory_db_path + + db_path = get_memory_db_path(vault) + if not db_path.exists(): + return None + conn = get_connection(db_path, read_only=True) + try: + total = conn.execute("SELECT COUNT(*) FROM papers").fetchone()[0] + + pdf_healthy = conn.execute( + "SELECT COUNT(*) FROM papers WHERE lifecycle != 'indexed'" + ).fetchone()[0] + pdf_missing = conn.execute( + "SELECT COUNT(*) FROM papers WHERE lifecycle = 'indexed'" + ).fetchone()[0] + + ocr_done = conn.execute( + "SELECT COUNT(*) FROM papers WHERE ocr_status='done'" + ).fetchone()[0] + ocr_failed = conn.execute( + "SELECT COUNT(*) FROM papers WHERE ocr_status='failed'" + ).fetchone()[0] + ocr_pending = total - ocr_done - ocr_failed + + domain_counts = { + r["domain"]: r["cnt"] + for r in conn.execute( + "SELECT domain, COUNT(*) as cnt FROM papers GROUP BY domain" + ).fetchall() + } + + return { + "stats": { + "papers": total, + "pdf_health": {"healthy": pdf_healthy, "missing": pdf_missing, "broken": 0}, + "ocr_health": {"pending": ocr_pending, "done": ocr_done, "failed": ocr_failed}, + "domain_counts": domain_counts, + "_source": "paperforge.db", + }, + } + except Exception: + return None + finally: + conn.close() + + +def _check_permissions(vault: Path) -> dict: + """Check sync/OCR/context permissions (lightweight filesystem check).""" + cfg = load_vault_config(vault) + paths = paperforge_paths(vault, cfg) + + export_files = sorted(paths["exports"].glob("*.json")) if paths["exports"].exists() else [] + can_sync = len(export_files) > 0 + + paddle_token = ( + os.environ.get("PADDLEOCR_API_TOKEN") or os.environ.get("PADDLEOCR_API_KEY") or os.environ.get("OCR_TOKEN") + ) + can_ocr = bool(paddle_token) + + can_copy_context = False + pf_dir = paths.get("paperforge", vault / cfg["system_dir"] / "PaperForge") + if pf_dir.exists(): + try: + pf_dir.parent.mkdir(parents=True, exist_ok=True) + test_file = pf_dir / ".write_test" + 
test_file.touch() + test_file.unlink() + can_copy_context = True + except (OSError, PermissionError): + pass + + return { + "can_sync": can_sync, + "can_ocr": can_ocr, + "can_copy_context": can_copy_context, + } + + +def _dashboard_from_files(vault: Path) -> dict: + """Gather stats and permissions by scanning literature files.""" cfg = load_vault_config(vault) paths = paperforge_paths(vault, cfg) - # ── Papers / formal note count ── _skip_names = {"fulltext.md", "deep-reading.md", "discussion.md"} record_count = 0 if paths["literature"].exists(): @@ -64,7 +143,6 @@ def _gather_dashboard_data(vault: Path) -> dict: if p.name not in _skip_names: record_count += 1 - # ── Domain counts (first-level subdirs under literature) ── domain_counts: dict[str, int] = {} if paths["literature"].exists(): for domain_dir in sorted(paths["literature"].iterdir()): @@ -73,7 +151,6 @@ def _gather_dashboard_data(vault: Path) -> dict: if count > 0: domain_counts[domain_dir.name] = count - # ── PDF health & OCR health from frontmatter ── pdf_healthy = 0 pdf_broken = 0 pdf_missing = 0 @@ -95,7 +172,6 @@ def _gather_dashboard_data(vault: Path) -> dict: except Exception: continue - # PDF health path_error_m = _path_error_pat.search(text) if path_error_m: error_type = path_error_m.group(1) @@ -106,7 +182,6 @@ def _gather_dashboard_data(vault: Path) -> dict: elif _pdf_path_pat.search(text): pdf_healthy += 1 - # OCR health ocr_status_m = _ocr_status_pat.search(text) if ocr_status_m: status = ocr_status_m.group(1).strip().lower().strip('"') @@ -119,27 +194,6 @@ def _gather_dashboard_data(vault: Path) -> dict: elif _do_ocr_pat.search(text): ocr_pending += 1 - # ── Permissions ── - export_files = sorted(paths["exports"].glob("*.json")) if paths["exports"].exists() else [] - can_sync = len(export_files) > 0 - - paddle_token = ( - os.environ.get("PADDLEOCR_API_TOKEN") or os.environ.get("PADDLEOCR_API_KEY") or os.environ.get("OCR_TOKEN") - ) - can_ocr = bool(paddle_token) - - can_copy_context = False 
- pf_dir = paths.get("paperforge", vault / cfg["system_dir"] / "PaperForge") - if pf_dir.exists(): - try: - pf_dir.parent.mkdir(parents=True, exist_ok=True) - test_file = pf_dir / ".write_test" - test_file.touch() - test_file.unlink() - can_copy_context = True - except (OSError, PermissionError): - pass - return { "stats": { "papers": record_count, @@ -155,9 +209,13 @@ def _gather_dashboard_data(vault: Path) -> dict: }, "domain_counts": domain_counts, }, - "permissions": { - "can_sync": can_sync, - "can_ocr": can_ocr, - "can_copy_context": can_copy_context, - }, + "permissions": _check_permissions(vault), } + + +def _gather_dashboard_data(vault: Path) -> dict: + db_result = _dashboard_from_db(vault) + if db_result is not None: + db_result["permissions"] = _check_permissions(vault) + return db_result + return _dashboard_from_files(vault) diff --git a/paperforge/memory/refresh.py b/paperforge/memory/refresh.py new file mode 100644 index 0000000..ebd033a --- /dev/null +++ b/paperforge/memory/refresh.py @@ -0,0 +1,133 @@ +from __future__ import annotations + +import json +from pathlib import Path + +from paperforge.memory.builder import ( + PAPER_COLUMNS, + ASSET_FIELDS, + ALIAS_TYPES, + compute_hash, + _resolve_vault_path, +) +from paperforge.memory.db import get_connection, get_memory_db_path +from paperforge.memory.schema import ensure_schema +from paperforge.worker.asset_index import read_index +from paperforge.worker.asset_state import ( + compute_lifecycle, + compute_maturity, + compute_next_step, +) + + +def refresh_paper(vault: Path, zotero_key: str) -> bool: + """Incrementally refresh one paper in paperforge.db from formal-library.json.""" + envelope = read_index(vault) + if not envelope: + return False + items = envelope if isinstance(envelope, list) else envelope.get("items", []) + + entry = None + for e in items: + if e.get("zotero_key") == zotero_key: + entry = e + break + if not entry: + return False + + generated_at = envelope.get("generated_at", "") if 
not isinstance(envelope, list) else "" + + db_path = get_memory_db_path(vault) + if not db_path.exists(): + return False + + conn = get_connection(db_path, read_only=False) + try: + ensure_schema(conn) + + lifecycle = str(compute_lifecycle(entry)) + maturity = compute_maturity(entry) + next_step = str(compute_next_step(entry)) + + paper_values = {} + for col in PAPER_COLUMNS: + if col == "authors_json": + paper_values[col] = json.dumps(entry.get("authors", []), ensure_ascii=False) + elif col == "collections_json": + paper_values[col] = json.dumps(entry.get("collections", []), ensure_ascii=False) + elif col == "lifecycle": + paper_values[col] = lifecycle + elif col == "maturity_level": + paper_values[col] = maturity.get("level", 1) + elif col == "maturity_name": + paper_values[col] = maturity.get("level_name", "") + elif col == "next_step": + paper_values[col] = next_step + elif col == "updated_at": + paper_values[col] = generated_at + elif col in ("do_ocr", "analyze"): + val = entry.get(col) + paper_values[col] = 1 if val else 0 + elif col == "has_pdf": + paper_values[col] = 1 if entry.get("has_pdf") else 0 + else: + paper_values[col] = entry.get(col, "") + + placeholders = ", ".join([f":{c}" for c in PAPER_COLUMNS]) + cols = ", ".join(PAPER_COLUMNS) + conn.execute( + f"INSERT OR REPLACE INTO papers ({cols}) VALUES ({placeholders})", + paper_values, + ) + + conn.execute("DELETE FROM paper_assets WHERE paper_id = ?", (zotero_key,)) + for asset_type, entry_field in ASSET_FIELDS: + path_val = entry.get(entry_field, "") + if not path_val: + continue + rel_path = str(path_val).replace("\\", "/") + abs_path = _resolve_vault_path(vault, rel_path) + exists = 1 if abs_path.exists() else 0 + if asset_type == "deep_reading" and abs_path.exists(): + try: + content = abs_path.read_text(encoding="utf-8") + exists = 1 if "## \U0001f52d \u7cbe\u8bfb" in content else 0 + except Exception: + exists = 0 + conn.execute( + "INSERT OR REPLACE INTO paper_assets (paper_id, asset_type, 
path, exists_on_disk) VALUES (?, ?, ?, ?)", + (zotero_key, asset_type, rel_path, exists), + ) + + conn.execute("DELETE FROM paper_aliases WHERE paper_id = ?", (zotero_key,)) + for alias_type in ALIAS_TYPES: + raw_val = entry.get(alias_type, "") + if not raw_val: + continue + raw_str = str(raw_val) + conn.execute( + "INSERT OR REPLACE INTO paper_aliases (paper_id, alias, alias_norm, alias_type) VALUES (?, ?, ?, ?)", + (zotero_key, raw_str, raw_str.lower().strip(), alias_type), + ) + + # Re-index FTS + try: + conn.execute("DELETE FROM paper_fts WHERE zotero_key = ?", (zotero_key,)) + conn.execute( + "INSERT INTO paper_fts(rowid, zotero_key, citation_key, title, first_author, authors_json, abstract, journal, domain, collection_path, collections_json) " + "VALUES ((SELECT rowid FROM papers WHERE zotero_key = ?), ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + (zotero_key, zotero_key, entry.get("citation_key", ""), entry.get("title", ""), + entry.get("first_author", ""), paper_values["authors_json"], + entry.get("abstract", ""), entry.get("journal", ""), entry.get("domain", ""), + entry.get("collection_path", ""), paper_values["collections_json"]), + ) + except Exception: + pass # FTS may not be available + + conn.commit() + return True + except Exception: + conn.rollback() + raise + finally: + conn.close() diff --git a/tests/unit/memory/test_refresh.py b/tests/unit/memory/test_refresh.py new file mode 100644 index 0000000..4f1bd2e --- /dev/null +++ b/tests/unit/memory/test_refresh.py @@ -0,0 +1,9 @@ +from __future__ import annotations + +from pathlib import Path + +from paperforge.memory.refresh import refresh_paper + + +def test_refresh_paper_returns_false_when_no_db(): + assert refresh_paper(Path("/nonexistent/vault"), "KEY001") is False From 4ffcd4c1c0066bfacf72d619e0232714f7760c57 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 20:48:09 +0800 Subject: [PATCH 021/132] =?UTF-8?q?docs(spec):=20add=20Phase=206=20?= 
=?UTF-8?q?=E2=80=94=20paper=5Fevents,=20reading/working=20logs,=20vector?= =?UTF-8?q?=20retrieval?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2026-05-12-memory-layer-phase6-logging.md | 142 ++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 docs/superpowers/specs/2026-05-12-memory-layer-phase6-logging.md diff --git a/docs/superpowers/specs/2026-05-12-memory-layer-phase6-logging.md b/docs/superpowers/specs/2026-05-12-memory-layer-phase6-logging.md new file mode 100644 index 0000000..0421f31 --- /dev/null +++ b/docs/superpowers/specs/2026-05-12-memory-layer-phase6-logging.md @@ -0,0 +1,142 @@ +# Memory Layer Phase 6+ — Reading Events, Logs, Vector Retrieval + +> **Date:** 2026-05-12 | **Depends on:** Memory Layer Phase 1-5 + +## Feature 1: paper_events — Reading Log Backend + +### Schema + +```sql +CREATE TABLE IF NOT EXISTS paper_events ( + event_id INTEGER PRIMARY KEY AUTOINCREMENT, + paper_id TEXT NOT NULL, + event_type TEXT NOT NULL, -- 'reading_note', 'ocr_done', 'sync_updated', 'deep_done' + created_at TEXT NOT NULL DEFAULT (datetime('now')), + payload_json TEXT, -- flexible per event_type + FOREIGN KEY (paper_id) REFERENCES papers(zotero_key) +); +``` + +### reading_note payload + +```json +{ + "excerpt": "the fundamental disjunction between materials science and biology", + "section": "Section 7-8", + "page": "P29", + "usage": "F 段核心论点", + "note": "与 DDGMQ7RW 独立诊断同一问题" +} +``` + +### Integration + +Agent 在 `/pf-deep` 精读完一个段落后自动调用: +``` +paper_events INSERT (paper_id, 'reading_note', payload_json) +``` + +或通过 CLI: +```bash +paperforge reading-log --write LQZ2FWIW \ + --section "Discussion P12" \ + --excerpt "magnetoelectric 被定位为压电的增强/补偿" \ + --usage "F 段 Liang 定位" +``` + +--- + +## Feature 2: reading-log / working-log — Export & Slash Commands + +### reading-log export + +```bash +paperforge reading-log --output Project//reading-log.md [--since DATE] +``` + +按 `created_at DESC` 导出所有 
`reading_note` events,格式: + +```markdown +## 2026-05-12 + +### LQZ2FWIW — Alvarez-Lorenzo et al. 2023 +- **Discussion P12**:"magnetoelectric 被定位为压电的增强/补偿" + → 用途: F 段 Liang 定位的文献支撑 +``` + +### Slash command: `/pf-log-reading` + +嵌入式 prompt(在 agent skill 或 slash command 定义中): + +``` +读完当前段落或章节后,记录以下信息到 paper_events: +- 来源: zotero_key + section + page +- 信息内容: 原文关键句(逐字引用) +- 用途: 这个信息支持当前写作的哪个论点 +- 备注: 任何交叉验证/矛盾/注意事项 + +执行: paperforge reading-log --write KEY --section "..." --excerpt "..." --usage "..." +``` + +### Slash command: `/pf-log-session` + +``` +会话结束前回顾本次所有决策节点,按以下格式追加到 Project//working-log.md: + +## <日期> — <小节名> + +### 核心决策 +- 做了什么、为什么 + +### 弯路与修正 +- 错误方向 → 用户纠正 → 最终方案 + +### 可复用方法论 +- 本段的 pattern 是什么 + +### 待办 +- [ ] ... + +格式参考: Project/综述写作/working-log.md +``` + +--- + +## Feature 3: Vector Retrieval (Deferred) + +| 特性 | 方案 | +|------|------| +| 模型 | 本地 `all-MiniLM-L6-v2`(80MB,CPU 可跑) | +| API 备选 | OpenAI `text-embedding-3-small` | +| 向量库 | ChromaDB | +| 构建 | `paperforge embed build` | +| 增量 | `refresh_paper()` 自动 re-embed | +| 检索 | `paperforge retrieve --json` | + +### Command output + +```json +{ + "chunks": [ + { + "zotero_key": "ABC123", + "title": "...", + "page": 6, + "section_title": "Results", + "chunk_text": "At 24h post-stimulation, chondrocyte proliferation...", + "score": 0.92 + } + ] +} +``` + +Agent 流程不变:retrieve → 候选段落(带论文身份) → paper-status → 读 fulltext 验证。 + +--- + +## Implementation Order + +1. paper_events table + reading-log write/export +2. `/pf-log-reading` + `/pf-log-session` slash commands +3. Working-log template (embedded in slash command prompt) +4. 
Vector retrieval (deferred, start when library > 500) From 99ab29bc75847a225923262be9737d5ec7c29dad Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 20:53:33 +0800 Subject: [PATCH 022/132] feat(memory): add paper_events table + reading-log write/export command --- paperforge/cli.py | 16 ++++++ paperforge/commands/reading_log.py | 64 ++++++++++++++++++++++++ paperforge/memory/events.py | 78 ++++++++++++++++++++++++++++++ paperforge/memory/schema.py | 22 ++++++++- 4 files changed, 179 insertions(+), 1 deletion(-) create mode 100644 paperforge/commands/reading_log.py create mode 100644 paperforge/memory/events.py diff --git a/paperforge/cli.py b/paperforge/cli.py index f40c5c7..fa3f242 100644 --- a/paperforge/cli.py +++ b/paperforge/cli.py @@ -270,6 +270,17 @@ def build_parser() -> argparse.ArgumentParser: p_paper_status.add_argument("query", help="Paper identifier (zotero_key, DOI, title, alias)") p_paper_status.add_argument("--json", action="store_true", help="Output as JSON") + p_rl = sub.add_parser("reading-log", help="Record or export reading notes") + p_rl.add_argument("--write", dest="paper_id", help="Write note for this zotero_key") + p_rl.add_argument("--section", help="Section (e.g. 
Discussion P12)") + p_rl.add_argument("--excerpt", help="Quoted excerpt") + p_rl.add_argument("--usage", help="How this supports the current writing") + p_rl.add_argument("--note", help="Optional cross-validation note") + p_rl.add_argument("--since", help="Export notes since date (YYYY-MM-DD)") + p_rl.add_argument("--limit", type=int, default=50, help="Max notes to export") + p_rl.add_argument("--output", help="Write markdown to file") + p_rl.add_argument("--json", action="store_true", help="Output as JSON") + p_search = sub.add_parser("search", help="Full-text search across the library") p_search.add_argument("query", help="Search query (supports FTS5 syntax)") p_search.add_argument("--json", action="store_true", help="Output as JSON") @@ -508,6 +519,11 @@ def main(argv: list[str] | None = None) -> int: return run(args) + if args.command == "reading-log": + from paperforge.commands.reading_log import run + + return run(args) + if args.command == "search": from paperforge.commands.search import run diff --git a/paperforge/commands/reading_log.py b/paperforge/commands/reading_log.py new file mode 100644 index 0000000..537cf5a --- /dev/null +++ b/paperforge/commands/reading_log.py @@ -0,0 +1,64 @@ +from __future__ import annotations + +import argparse +from pathlib import Path + +from paperforge import __version__ as PF_VERSION +from paperforge.core.errors import ErrorCode +from paperforge.core.result import PFError, PFResult +from paperforge.memory.events import export_reading_log, write_reading_note + + +def run(args: argparse.Namespace) -> int: + vault = args.vault_path + + if args.paper_id and args.excerpt: + ok = write_reading_note( + vault, args.paper_id, args.section or "", + args.excerpt, args.usage or "", args.note or "", + ) + result = PFResult( + ok=ok, + command="reading-log", + version=PF_VERSION, + data={"written": ok}, + error=PFError(code=ErrorCode.INTERNAL_ERROR, message="Failed to write") if not ok else None, + ) + if args.json: + 
print(result.to_json()) + else: + print("Written." if ok else "Failed.") + return 0 if ok else 1 + + notes = export_reading_log(vault, since=args.since or "", limit=args.limit or 50) + result = PFResult( + ok=True, + command="reading-log", + version=PF_VERSION, + data={"notes": notes, "count": len(notes)}, + ) + + if args.json: + print(result.to_json()) + elif args.output: + lines = [] + last_date = None + for n in notes: + date_str = n["created_at"][:10] + if date_str != last_date: + last_date = date_str + lines.append(f"\n## {date_str}") + author = (n["first_author"] or "").split()[-1] if n["first_author"] else "" + lines.append(f"\n### {n['citation_key']} \u2014 {author} et al. {n['year']}") + lines.append(f"- **{n['section']}**\uff1a\"{n['excerpt']}\"") + if n["usage"]: + lines.append(f" \u2192 \u7528\u9014: {n['usage']}") + if n["note"]: + lines.append(f" \u2192 \u5907\u6ce8: {n['note']}") + output_path = Path(args.output) + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text("\n".join(lines).strip() + "\n", encoding="utf-8") + print(f"Exported {len(notes)} notes to {args.output}") + else: + print(f"{len(notes)} reading notes.") + return 0 diff --git a/paperforge/memory/events.py b/paperforge/memory/events.py new file mode 100644 index 0000000..444e65b --- /dev/null +++ b/paperforge/memory/events.py @@ -0,0 +1,78 @@ +from __future__ import annotations + +import json +from pathlib import Path + +from paperforge.memory.db import get_connection, get_memory_db_path + + +def write_reading_note(vault: Path, paper_id: str, section: str, + excerpt: str, usage: str = "", note: str = "") -> bool: + """Record a reading note in paper_events.""" + db_path = get_memory_db_path(vault) + if not db_path.exists(): + return False + + payload = { + "section": section, + "excerpt": excerpt, + "usage": usage, + "note": note, + } + conn = get_connection(db_path, read_only=False) + try: + conn.execute( + """INSERT INTO paper_events (paper_id, event_type, 
payload_json) + VALUES (?, 'reading_note', ?)""", + (paper_id, json.dumps(payload, ensure_ascii=False)), + ) + conn.commit() + return True + except Exception: + conn.rollback() + return False + finally: + conn.close() + + +def export_reading_log(vault: Path, since: str = "", limit: int = 50) -> list[dict]: + """Export reading notes as a list of dicts, ordered by created_at DESC.""" + db_path = get_memory_db_path(vault) + if not db_path.exists(): + return [] + + conn = get_connection(db_path, read_only=True) + try: + query = """ + SELECT e.created_at, e.paper_id, e.payload_json, + p.citation_key, p.title, p.year, p.first_author + FROM paper_events e + JOIN papers p ON p.zotero_key = e.paper_id + WHERE e.event_type = 'reading_note' + """ + params = [] + if since: + query += " AND e.created_at >= ?" + params.append(since) + query += " ORDER BY e.created_at DESC LIMIT ?" + params.append(limit) + + rows = conn.execute(query, params).fetchall() + results = [] + for row in rows: + payload = json.loads(row["payload_json"]) + results.append({ + "created_at": row["created_at"], + "paper_id": row["paper_id"], + "citation_key": row["citation_key"], + "title": row["title"], + "year": row["year"], + "first_author": row["first_author"], + "section": payload.get("section", ""), + "excerpt": payload.get("excerpt", ""), + "usage": payload.get("usage", ""), + "note": payload.get("note", ""), + }) + return results + finally: + conn.close() diff --git a/paperforge/memory/schema.py b/paperforge/memory/schema.py index 8094549..dc3b7c0 100644 --- a/paperforge/memory/schema.py +++ b/paperforge/memory/schema.py @@ -117,7 +117,24 @@ END;""", ] -ALL_TABLES = ["paper_fts", "papers", "paper_assets", "paper_aliases", "meta"] +CREATE_EVENTS = """ +CREATE TABLE IF NOT EXISTS paper_events ( + event_id INTEGER PRIMARY KEY AUTOINCREMENT, + paper_id TEXT NOT NULL, + event_type TEXT NOT NULL, + created_at TEXT NOT NULL DEFAULT (datetime('now')), + payload_json TEXT, + FOREIGN KEY (paper_id) REFERENCES 
papers(zotero_key) +); +""" + +EVENT_INDEX_SQL = [ + "CREATE INDEX IF NOT EXISTS idx_events_paper ON paper_events(paper_id);", + "CREATE INDEX IF NOT EXISTS idx_events_type ON paper_events(event_type);", + "CREATE INDEX IF NOT EXISTS idx_events_time ON paper_events(created_at);", +] + +ALL_TABLES = ["paper_fts", "papers", "paper_assets", "paper_aliases", "meta", "paper_events"] def ensure_schema(conn: sqlite3.Connection) -> None: @@ -127,8 +144,11 @@ def ensure_schema(conn: sqlite3.Connection) -> None: conn.execute(CREATE_ASSETS) conn.execute(CREATE_ALIASES) conn.execute(CREATE_PAPER_FTS) + conn.execute(CREATE_EVENTS) for idx_sql in INDEX_SQL: conn.execute(idx_sql) + for idx_sql in EVENT_INDEX_SQL: + conn.execute(idx_sql) for trigger_sql in FTS_TRIGGERS: conn.execute(trigger_sql) conn.commit() From babfe06bc3f7ae73400aa1d6a23c2c7d3017cbd8 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 20:55:32 +0800 Subject: [PATCH 023/132] feat(skills): add /pf-log-reading and /pf-log-session slash commands --- paperforge/command_files/pf-log-reading.md | 33 +++++++++++++ paperforge/command_files/pf-log-session.md | 54 ++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 paperforge/command_files/pf-log-reading.md create mode 100644 paperforge/command_files/pf-log-session.md diff --git a/paperforge/command_files/pf-log-reading.md b/paperforge/command_files/pf-log-reading.md new file mode 100644 index 0000000..f93745a --- /dev/null +++ b/paperforge/command_files/pf-log-reading.md @@ -0,0 +1,33 @@ +# /pf-log-reading — Record a reading note + +> 读完当前段落或章节后自动记录到 paperforge.db 的 paper_events 表。 + +## Agent Workflow + +1. 确定 zotero_key (从上下文或 formal note 中获取) +2. 提取以下信息: + - **section**: 文献中的位置 (e.g. "Discussion P12", "Results Fig.3") + - **excerpt**: 逐字引用的原文关键句 + - **usage**: 这个信息支持当前写作的哪个论点 + - **note**: 任何交叉验证/矛盾/注意事项 (optional) + +3. 
执行:
+```bash
+paperforge reading-log --write KEY \
+  --section "Discussion P12" \
+  --excerpt "the fundamental disjunction between materials science and biology" \
+  --usage "F 段 gap 论点" \
+  --note "与 DDGMQ7RW 独立诊断同一问题"
+```
+
+## Prompt Injection
+
+After reading a section or paragraph from a paper:
+
+**Record a reading note.** Determine the zotero_key of the paper you just read. Extract the section name (e.g. "Discussion P12", "Results Fig.3"), a verbatim excerpt of the key sentence, how this supports the current writing task, and any cross-validation notes. Then run:
+
+```
+paperforge --vault {vault_path} reading-log --write KEY --section "..." --excerpt "..." --usage "..." --note "..."
+```
+
+If the user's vault path is unknown, ask before running.
diff --git a/paperforge/command_files/pf-log-session.md b/paperforge/command_files/pf-log-session.md
new file mode 100644
index 0000000..335b8b0
--- /dev/null
+++ b/paperforge/command_files/pf-log-session.md
@@ -0,0 +1,54 @@
+# /pf-log-session — Summarize session decisions to working-log
+
+> 会话结束时回顾本次所有决策节点,追加到 working-log.md。
+
+## Agent Workflow
+
+1. 回顾本次会话中所有关键节点:
+   - 用户纠正了什么
+   - 方案怎么变的
+   - 有什么弯路和教训
+   - 可复用的方法论
+
+2. 按以下格式生成 markdown,追加到 working-log.md:
+
+```markdown
+## <日期> — <小节名>
+
+### 核心决策
+- 做了什么、为什么
+
+### 弯路与修正
+- 错误方向 → 用户纠正 → 最终方案
+
+### 可复用方法论
+- 本段的 pattern,后续段落能怎么用
+
+### 待办
+- [ ] ...
+```
+
+3. 询问用户确认,然后写入到 `Project/<项目名>/working-log.md`
+
+## Prompt Injection
+
+At the end of this session, before saying goodbye:
+
+**Write the working-log entry.** Review all decision points, corrections, dead ends, and methodological insights from this session. Ask the user: "Should I write the working-log entry now?" If yes, generate the entry in the format below and append it to the appropriate working-log.md in the user's project directory. Ask the user to confirm the project path if unsure.
+ +Format: +``` +## YYYY-MM-DD — Section Name + +### Core Decisions +- What happened and why + +### Dead Ends & Corrections +- Wrong direction -> User correction -> Final approach + +### Reusable Methodology +- Patterns that apply to later sections + +### TODO +- [ ] ... +``` From 11be2b8df2e171ff945baa82d199380a5e463507 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 20:58:49 +0800 Subject: [PATCH 024/132] feat(skills): add literature-logging skill (reading-log + working-log) --- paperforge/skills/literature-logging/SKILL.md | 120 +++++++++++++ .../scripts/pf_bootstrap.py | 164 ++++++++++++++++++ 2 files changed, 284 insertions(+) create mode 100644 paperforge/skills/literature-logging/SKILL.md create mode 100644 paperforge/skills/literature-logging/scripts/pf_bootstrap.py diff --git a/paperforge/skills/literature-logging/SKILL.md b/paperforge/skills/literature-logging/SKILL.md new file mode 100644 index 0000000..fb8a8f8 --- /dev/null +++ b/paperforge/skills/literature-logging/SKILL.md @@ -0,0 +1,120 @@ +--- +name: literature-logging +description: > + Literature reading and working log management. Triggered by: + /pf-log-reading /pf-log-session, + "记录阅读", "记录一下", "写日志", "读完了", + "总结会话", "写工作总结", "写working log", + "记一下工作过程", "记录决策". +--- + +# Literature Logging + +--- + +## 1. Bootstrap — 必须先执行 + +跑这个脚本: + +``` +python $SKILL_DIR/scripts/pf_bootstrap.py +``` + +返回 JSON。记住以下变量: + +| 变量 | 来自 JSON 的 | 用于 | +| ----------- | -------------------- | --------------------------------------------- | +| `$VAULT` | `vault_root` | 所有 `--vault` 参数 | +| `$PYTHON` | `python_candidate` | 所有 cli 调用 | + +如果 `ok: false` → 报告 `error` 给用户,**停止**。 + +--- + +## 2. State Check — 检查当前日志状态 + +``` +$PYTHON -m paperforge --vault $VAULT reading-log --json +``` + +展示:已有多少条 reading notes。如果 0 条,告知用户:"还没有阅读记录,读完文献后使用 /pf-log-reading 记录。" + +--- + +## 3. Routing + +### /pf-log-reading — 记录单条阅读笔记 + +**调用条件**: 用户在阅读文献过程中,或读完一个段落/章节后 + +**Agent 行为**: +1. 
确认 zotero_key(从上下文或 formal note 中获取)
+2. 提取以下信息:
+   - **section**: 文献中的位置 (e.g. "Discussion P12", "Results Fig.3")
+   - **excerpt**: 逐字引用的原文关键句
+   - **usage**: 这个信息支持当前写作的哪个论点
+   - **note**: 任何交叉验证/矛盾/注意事项 (optional)
+3. 询问用户确认,然后执行:
+```bash
+$PYTHON -m paperforge --vault $VAULT reading-log --write KEY \
+  --section "SECTION" --excerpt "EXCERPT" \
+  --usage "USAGE" --note "NOTE"
+```
+4. 确认写入成功
+
+### /pf-log-session — 会话总结写入 working-log
+
+**调用条件**: 写作/研究会话结束前,用户说 "写日志" 或 "/pf-log-session"
+
+**Agent 行为**:
+1. 回顾本次会话中所有关键节点:
+   - 用户纠正了什么
+   - 方案怎么变的
+   - 有什么弯路和教训
+   - 可复用的方法论
+2. 按以下格式生成 markdown:
+
+```
+## <日期> — <小节名>
+
+### 核心决策
+- 做了什么、为什么
+
+### 弯路与修正
+- 错误方向 → 用户纠正 → 最终方案
+
+### 可复用方法论
+- 本段的 pattern
+
+### 待办
+- [ ] ...
+```
+
+3. 展示给用户确认
+4. 询问目标 project 目录中的 working-log.md 路径
+5. 如果文件不存在:新建并写入
+6. 如果文件存在:先读旧内容,在文件末尾追加 `\n---\n` 分隔线,再追加新内容
+7. 确认写入成功
+
+### Auto — 静默记录
+
+用户没有显式说 "记录" 但 agent 读了一篇论文的某段时,agent 可以**主动问**:
+
+```
+我读了 LQZ2FWIW Discussion P12 关于 magnetoelectric 分类的内容。
+要记录到 reading-log 吗?(/pf-log-reading)
+```
+
+不要擅自记录——必须征得用户同意。
+
+---
+
+## 4. Export — 导出 reading-log
+
+用户说 "导出阅读日志" 或 "/pf-log-export":
+
+```bash
+$PYTHON -m paperforge --vault $VAULT reading-log --output <输出路径> [--since DATE]
+```
+
+导出为 markdown 文件。如果用户没指定路径,询问。
diff --git a/paperforge/skills/literature-logging/scripts/pf_bootstrap.py b/paperforge/skills/literature-logging/scripts/pf_bootstrap.py
new file mode 100644
index 0000000..87bd211
--- /dev/null
+++ b/paperforge/skills/literature-logging/scripts/pf_bootstrap.py
@@ -0,0 +1,164 @@
+"""PaperForge bootstrap — single entry point for agent to discover vault state.
+
+No dependencies. Runs on ANY Python. Just reads paperforge.json + filesystem.
+ +Usage: + python pf_bootstrap.py # auto-discover vault from CWD + python pf_bootstrap.py --vault + +Output (JSON to stdout): + { + "ok": true, + "vault_root": "D:\\...", + "paths": { + "literature_dir": "D:\\...\\Resources\\Literature", + "index_path": "D:\\...\\System\\PaperForge\\indexes\\formal-library.json", + "ocr_dir": "D:\\...\\System\\PaperForge\\ocr", + "exports_dir": "D:\\...\\System\\PaperForge\\exports" + }, + "domains": ["domain1", "domain2"], + "index_summary": {"domain1": 120, "domain2": 80}, + "python_candidate": "D:\\...\\python.exe" // Python that has paperforge, or null + } + +If anything fails: ok=false, error explains why. +""" + +from __future__ import annotations + +import json +import subprocess +import sys +from pathlib import Path + + +def _find_paperforge_json(start: Path) -> Path | None: + current = start.resolve() + for _ in range(10): + candidate = current / "paperforge.json" + if candidate.exists(): + return candidate + parent = current.parent + if parent == current: + break + current = parent + return None + + +def _read_pf_config(pf_json: Path) -> dict: + with open(pf_json, encoding="utf-8") as f: + return json.load(f) + + +def _find_python_with_paperforge(vault: Path, pf_cfg: dict) -> str | None: + """Find a Python executable that has paperforge installed.""" + candidates = [] + + # 1. Explicit python_path in config + if pf_cfg.get("python_path"): + candidates.append(Path(pf_cfg["python_path"])) + + # 2. 
Common venv locations inside vault + venv_names = [".venv", ".paperforge-test-venv", "venv"] + exe_paths = ["Scripts/python.exe", "bin/python3"] + for vn in venv_names: + for ep in exe_paths: + p = vault / vn / ep + if p.exists(): + candidates.append(p) + + for candidate in candidates: + try: + result = subprocess.run( + [str(candidate), "-m", "paperforge", "--version"], + capture_output=True, text=True, timeout=10, + encoding="utf-8", errors="replace", + ) + if result.returncode == 0 and "paperforge" in result.stdout.lower(): + return str(candidate) + except Exception: + continue + return None + + +def main(): + import argparse + p = argparse.ArgumentParser(description="PaperForge bootstrap") + p.add_argument("--vault", default=None, help="Vault root path (auto-detect if omitted)") + args = p.parse_args() + + result: dict = {"ok": False} + + # --- 1. Find vault --- + if args.vault: + vault = Path(args.vault).resolve() + pf_json = vault / "paperforge.json" + if not pf_json.exists(): + result["error"] = f"paperforge.json not found at {vault}" + json.dump(result, sys.stdout, ensure_ascii=False) + sys.exit(0) + else: + pf_json = _find_paperforge_json(Path.cwd()) + if pf_json is None: + result["error"] = "paperforge.json not found from CWD upward. Set --vault." + json.dump(result, sys.stdout, ensure_ascii=False) + sys.exit(0) + vault = pf_json.parent + + result["vault_root"] = str(vault) + + # --- 2. Read config --- + try: + cfg = _read_pf_config(pf_json) + except Exception as e: + result["error"] = f"Cannot read paperforge.json: {e}" + json.dump(result, sys.stdout, ensure_ascii=False) + sys.exit(0) + + system_dir = cfg.get("system_dir", "System") + resources_dir = cfg.get("resources_dir", "Resources") + literature_dir = cfg.get("literature_dir", "Literature") + + # --- 3. 
Build paths from config --- + pf_root = vault / system_dir / "PaperForge" + + paths = { + "literature_dir": str(vault / resources_dir / literature_dir), + "index_path": str(pf_root / "indexes" / "formal-library.json"), + "ocr_dir": str(pf_root / "ocr"), + "exports_dir": str(pf_root / "exports"), + } + result["paths"] = paths + + # --- 4. List domains --- + lit_dir = Path(paths["literature_dir"]) + domains = sorted( + [d.name for d in lit_dir.iterdir() if d.is_dir()] + ) if lit_dir.exists() else [] + result["domains"] = domains + + # --- 5. Index summary --- + index_path = Path(paths["index_path"]) + index_summary: dict[str, int] = {} + if index_path.exists(): + try: + data = json.loads(index_path.read_text(encoding="utf-8")) + items = data.get("items", []) + if isinstance(items, dict): + items = items.values() + for item in items: + d = item.get("domain", "unknown") + index_summary[d] = index_summary.get(d, 0) + 1 + except Exception: + pass + result["index_summary"] = index_summary + + # --- 6. 
Find Python that has paperforge (best effort) --- + result["python_candidate"] = _find_python_with_paperforge(vault, cfg) + + result["ok"] = True + json.dump(result, sys.stdout, ensure_ascii=False, indent=2) + + +if __name__ == "__main__": + main() From fffc0c3d9f4e95ec40cf9834a59698567f41bbfe Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 21:11:15 +0800 Subject: [PATCH 025/132] =?UTF-8?q?docs(spec):=20plugin=20settings=20redes?= =?UTF-8?q?ign=20=E2=80=94=20tabbed=20UI,=20skill=20manager,=20feature=20t?= =?UTF-8?q?oggles,=20vector=20DB=20panel?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2026-05-12-plugin-settings-redesign.md | 190 ++++++++++++++++++ 1 file changed, 190 insertions(+) create mode 100644 docs/superpowers/specs/2026-05-12-plugin-settings-redesign.md diff --git a/docs/superpowers/specs/2026-05-12-plugin-settings-redesign.md b/docs/superpowers/specs/2026-05-12-plugin-settings-redesign.md new file mode 100644 index 0000000..e22f470 --- /dev/null +++ b/docs/superpowers/specs/2026-05-12-plugin-settings-redesign.md @@ -0,0 +1,190 @@ +# Plugin Settings Redesign — Tabbed Settings + Feature Toggles + +> **Date:** 2026-05-12 | **Research ref:** Claudian + obsidian-skills-manager + +## Architecture + +``` +Settings → PaperForge + ┌─────────────────┬───────────────────────────────┐ + │ [安装] │ [功能] │ + ├─────────────────┼───────────────────────────────┤ + │ Python 路径 │ Skills │ + │ PaddleOCR Key │ ├─ 系统技能 (per-agent dir) │ + │ Zotero 数据目录 │ │ ├─ 开关: toggle frontmatter│ + │ Agent 平台 │ │ ├─ 更新: GitHub semver │ + │ Agent Config路径 │ │ └─ 冻结: 锁版本 │ + │ ... 
│ └─ 用户技能 (自定义目录) │ + │ │ └─ 开关: toggle frontmatter│ + │ │ │ + │ │ Memory Layer │ + │ │ ├─ FTS5 搜索 │ + │ │ ├─ agent-context │ + │ │ └─ reading-log │ + │ │ │ + │ │ 向量数据库 │ + │ │ ├─ 开关: 启用/禁用 │ + │ │ ├─ 模式: 本地 • API │ + │ │ ├─ 本地: [安装模型] + 模型名 │ + │ │ └─ API: API Key │ + └─────────────────┴───────────────────────────────┘ +``` + +## Tab Implementation + +Follow Claudian pattern: custom tab bar with class-toggle, all content divs exist in DOM simultaneously. + +```typescript +// PaperforgeSettingsTab.ts +type SettingsTabId = 'setup' | 'features'; + +// -- render() -- +// 1. Tab bar +const tabBar = containerEl.createDiv({ cls: 'paperforge-settings-tabs' }); +const tabButtons = new Map(); +const tabContents = new Map(); + +// 2. For each tab: create button + content div +for (const [id, label] of [['setup','安装'], ['features','功能']]) { + const btn = tabBar.createEl('button', { cls: 'paperforge-settings-tab', text: label }); + btn.addEventListener('click', () => switchTab(id)); // toggles --active class + tabButtons.set(id, btn); + + const content = containerEl.createDiv({ cls: 'paperforge-settings-tab-content' }); + tabContents.set(id, content); +} + +// 3. Render each tab +renderSetupTab(tabContents.get('setup')); +renderFeaturesTab(tabContents.get('features')); + +// 4. Activate default tab +switchTab(this.activeTab); +``` + +CSS: `.paperforge-settings-tab-content { display: none; }` `.paperforge-settings-tab-content--active { display: block; }` + +## Section 1: Skills Management + +### System Skill Detection + +Scan vault-local agent skill directories (from `AGENT_SKILL_DIRS` mapping): + +``` +{vault}/.opencode/skills/literature-qa/SKILL.md +{vault}/.opencode/skills/literature-logging/SKILL.md (new) +{vault}/.claude/skills/literature-qa/SKILL.md +{vault}/.codex/skills/literature-qa/SKILL.md +... 
+``` + +Each skill identified by `SKILL.md` frontmatter: +```yaml +name: literature-qa +description: 学术文献库操作 +version: 1.5.5 +source: PaperForge/paperforge +``` + +### UI per skill row + +``` +┌─────────────────────────────────────────────────────┐ +│ [✓] literature-qa v1.5.5 [更新] [冻结] │ +│ 学术文献库操作:精读、问答、检索 │ +├─────────────────────────────────────────────────────┤ +│ [✓] literature-logging v1.0.0 [更新] [冻结] │ +│ 阅读日志与工作日志管理 │ +└─────────────────────────────────────────────────────┘ +``` + +- **开关** (`[✓]`): 写入 `SKILL.md` frontmatter `disable-model-invocation: true/false`(obsidian-skills-manager 同款方式) +- **更新**: GitHub API `GET /repos/LLLin000/PaperForge/releases?per_page=25` → semver 比对 → 有新版显示 `[更新]` 按钮 → 重新下载 skill 文件 +- **冻结**: 写入 plugin `data.json` → `frozen_skills: { "literature-qa": true }` → 冻结后不显示更新提示 + +### User Skill Detection + +``` +{vault}/.claude/skills/ (可配置路径) +``` + +User skills identified by `SKILL.md` frontmatter field `source: user` (or no `source` field). Features: +- **开关**: same `disable-model-invocation` toggle +- **无更新/冻结功能** + +### Source attribution + +Frontmatter discriminator for system vs user: +```yaml +# System skill +source: paperforge # → managed by plugin, has update button + +# User skill +source: user # → toggle only, no update +# (or no source field) # → treated as user +``` + +## Section 2: Feature Toggles + +Memory Layer features as simple Obsidian toggles in plugin `data.json`: + +| Key | Default | Effect | +|-----|---------|--------| +| `features.fts_search` | `true` | `paperforge memory build` 是否创建 FTS 索引 | +| `features.agent_context` | `true` | 是否允许 `agent-context` 命令 | +| `features.reading_log` | `true` | 是否启用 paper_events 表 | +| `features.vector_db` | `false` | 是否启用向量检索模块 | + +When a feature is disabled, the corresponding CLI command returns a clear error message. 
+ +## Section 3: Vector Database + +``` +┌─────────────────────────────────────────┐ +│ 向量数据库 [启用] │ +│ │ +│ 模式: ● 本地 ○ API │ +│ │ +│ 本地模型: all-MiniLM-L6-v2 │ +│ 模型大小: 80 MB │ +│ 状态: ● 已安装 / ○ 未安装 │ +│ [安装模型] │ +│ [重新安装] │ +│ │ +│ API Key: ┌─────────────────────────────┐│ +│ │ sk-... ││ +│ └─────────────────────────────┘│ +│ 模型: text-embedding-3-small │ +└─────────────────────────────────────────┘ +``` + +Implementation notes: +- Model installation: `pip install sentence-transformers` + trigger model download +- Model path: stored in `data.json` under `features.vector_db.model_path` +- API: uses existing `.env` PaddleOCR Key pattern, add `VECTOR_API_KEY` +- `pip install` is async — show progress bar + +## Data Storage + +All toggles in plugin `data.json`: +```json +{ + "features": { + "fts_search": true, + "agent_context": true, + "reading_log": true, + "vector_db": false + }, + "vector_db_mode": "local", + "vector_db_model": "all-MiniLM-L6-v2", + "vector_db_api_key": "", + "frozen_skills": { + "literature-qa": false + } +} +``` + +Skill disable state in `SKILL.md` frontmatter (standard Agent Skills spec): +```yaml +disable-model-invocation: true +``` From 2551eec014b7a673af72f6d696f95eae07faec71 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 21:19:21 +0800 Subject: [PATCH 026/132] fix(spec): add DEFAULT_SETTINGS whitelist note, source: paperforge requirement, CLI toggle read --- .../specs/2026-05-12-plugin-settings-redesign.md | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/docs/superpowers/specs/2026-05-12-plugin-settings-redesign.md b/docs/superpowers/specs/2026-05-12-plugin-settings-redesign.md index e22f470..1761f3a 100644 --- a/docs/superpowers/specs/2026-05-12-plugin-settings-redesign.md +++ b/docs/superpowers/specs/2026-05-12-plugin-settings-redesign.md @@ -124,6 +124,8 @@ source: user # → toggle only, no update # (or no source field) # → treated as user ``` +**Implementation note:** Both existing 
SKILL.md files (`literature-qa`, `literature-logging`) must add `source: paperforge` to their frontmatter. + ## Section 2: Feature Toggles Memory Layer features as simple Obsidian toggles in plugin `data.json`: @@ -137,6 +139,8 @@ Memory Layer features as simple Obsidian toggles in plugin `data.json`: When a feature is disabled, the corresponding CLI command returns a clear error message. +**Implementation note:** CLI commands read `data.json` to check feature toggles. If `data.json` is missing (user runs CLI outside Obsidian), features default to `true` (opt-out, not opt-in). + ## Section 3: Vector Database ``` @@ -166,7 +170,7 @@ Implementation notes: ## Data Storage -All toggles in plugin `data.json`: +Plugin `data.json`: ```json { "features": { @@ -178,12 +182,13 @@ All toggles in plugin `data.json`: "vector_db_mode": "local", "vector_db_model": "all-MiniLM-L6-v2", "vector_db_api_key": "", - "frozen_skills": { - "literature-qa": false - } + "frozen_skills": {} } ``` +**Critical:** All new keys MUST be added to `DEFAULT_SETTINGS` in `main.js` (currently a whitelist of 8 keys in `saveSettings()`). Without this, toggles appear to work but vanish on vault reopen. 
+
+
 Skill disable state in `SKILL.md` frontmatter (standard Agent Skills spec):
 ```yaml
 disable-model-invocation: true
From 9016a8734e839a62b84353fe8d64c447d4b10b0d Mon Sep 17 00:00:00 2001
From: Research Assistant
Date: Tue, 12 May 2026 21:43:34 +0800
Subject: [PATCH 027/132] =?UTF-8?q?docs(spec):=20add=20Phase=207=20vector?=
 =?UTF-8?q?=20retrieval=20=E2=80=94=20ChromaDB=20+=20local/API=20embedding?=
 =?UTF-8?q?=20+=20chunking?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 .../specs/2026-05-12-vector-retrieval.md      | 197 ++++++++++++++++++
 1 file changed, 197 insertions(+)
 create mode 100644 docs/superpowers/specs/2026-05-12-vector-retrieval.md

diff --git a/docs/superpowers/specs/2026-05-12-vector-retrieval.md b/docs/superpowers/specs/2026-05-12-vector-retrieval.md
new file mode 100644
index 0000000..147ffc5
--- /dev/null
+++ b/docs/superpowers/specs/2026-05-12-vector-retrieval.md
@@ -0,0 +1,197 @@
+# Phase 7 — Vector Retrieval
+
+> **Date:** 2026-05-12 | **Depends on:** Memory Layer Phase 1-6
+
+## Overview
+
+Add semantic vector retrieval for OCR fulltext, built on ChromaDB with local embedding models.
+Optional module, disabled by default. Activated by user via plugin settings toggle.
+
+## Architecture
+
+```
+fulltext.md
+ ↓ 剔除 ![[*]] 图片链接行
+ ↓ 替换行内图片链接为 [Figure N]
+ ↓ 按 分页
+ ↓ 每页按双换行分自然段
+ ↓ 2-3 段一组 → 300-400 token/chunk, 1 段重叠
+ ↓ section 检测 (规则匹配 IMRaD + Figure/Table)
+ ↓ embed with bge-small-en-v1.5 (384d)
+ ↓
+ChromaDB @ indexes/vectors/
+ ↓ paperforge retrieve "PEMF dose response" --json
+ ↓ top-5 chunks + 前后各 1 chunk (补上下文)
+ ↓
+{ chunks: [{ paper_id, title, section, page, text, score }] }
+```
+
+## Dependencies
+
+```
+pip install chromadb sentence-transformers
+```
+
+Local model auto-downloads on first use (~130 MB for `bge-small-en-v1.5`).
+API mode uses `openai` package (already in deps). 
+## Section Detection (Rule-based)
+
+Scan each paragraph for known section keywords:
+
+```
+Case-insensitive match, must appear as standalone short line (< 80 chars):
+
+Introduction | Methods | Materials | Results | Discussion
+Conclusion | Abstract | Background | References | Supplementary
+Figure \d+ | Fig\.? \d+ | Table \d+
+```
+
+Rules (priority order):
+1. Exact keyword match → section = matched text
+2. ALL CAPS short line → probable section title
+3. Short line, no period, surrounded by blank lines → probable section title
+4. Fallback: inherit from previous chunk in same page
+5. Default: "Text" (unclassified)
+
+## Local Model Options
+
+| Model ID | Dim | Size | Chinese | Speed |
+| -------------------------- | ---- | ----- | ------- | ----- |
+| `BAAI/bge-small-en-v1.5` | 384 | 130MB | — | Fast |
+| `sentence-transformers/all-MiniLM-L6-v2` | 384 | 80MB | — | Fast |
+| `BAAI/bge-base-en-v1.5` | 768 | 440MB | — | Medium |
+| `sentence-transformers/all-mpnet-base-v2` | 768 | 420MB | — | Medium |
+
+Model selection stored in `data.json` → `vector_db_model`.
+
+## API Mode
+
+```python
+# When vector_db_mode == "api":
+from openai import OpenAI
+client = OpenAI(api_key=api_key)
+embedding = client.embeddings.create(
+    model="text-embedding-3-small",
+    input=text
+)
+```
+
+API key from `data.json` → `vector_db_api_key` or fallback to `.env` `OPENAI_API_KEY`.
+Max 8191 tokens per call — chunking ensures we stay under limit.
+
+## ChromaDB Storage
+
+```
+/PaperForge/indexes/vectors/
+ ├── chroma.sqlite3
+ └── / (Chroma internal)
+```
+
+Collection name: `paperforge_fulltext`.
+Metadata stored per chunk: `paper_id, citation_key, title, year, section, page, chunk_index, token_estimate`.
+
+## Commands
+
+### `paperforge embed build [--force]`
+
+1. Check `data.json` for `features.vector_db == true`
+2. Read `formal-library.json` for all papers with `ocr_status == "done"`
+3. For each paper: read `fulltext.md`, chunk, embed, insert into ChromaDB
+4. 
If `--force`: delete existing collection, rebuild from scratch + +Returns PFResult: +```json +{ + "ok": true, + "data": { + "papers_embedded": 21, + "chunks_embedded": 420, + "model": "BAAI/bge-small-en-v1.5", + "mode": "local" + } +} +``` + +### `paperforge retrieve --json [--limit N] [--expand true]` + +1. Embed query with same model +2. Query ChromaDB, get top-N chunks +3. If `--expand true` (default): fetch adjacent chunks (±1) for context +4. Join with papers table for metadata + +Returns: +```json +{ + "ok": true, + "data": { + "query": "PEMF dose response chondrocyte", + "chunks": [ + { + "paper_id": "ABC123", + "citation_key": "aaronStimulation2004", + "title": "Stimulation of growth factor synthesis...", + "year": 2004, + "section": "Results", + "page": 6, + "chunk_text": "At 24h post-stimulation, chondrocyte proliferation increased...\n\n...", + "adjacent_before": "... (previous chunk, if expanded)", + "adjacent_after": "... (next chunk, if expanded)", + "score": 0.92 + } + ], + "count": 5, + "model": "BAAI/bge-small-en-v1.5" + } +} +``` + +### `paperforge embed status --json` + +Returns: db exists, collection exists, chunk count, model name, last build time. + +## Integration with Memory Layer + +### Memory build + +`paperforge memory build` does NOT trigger embed build. Vector DB is separate, user-controlled. + +### Incremental refresh + +`refresh_paper()` extended: +```python +def refresh_paper(vault, zotero_key): + # existing SQLite refresh... + + # If vector DB enabled: + if vector_db_enabled(vault): + # Delete old chunks for this paper + collection.delete(where={"paper_id": zotero_key}) + # Re-embed this paper + _embed_paper(vault, zotero_key) +``` + +Triggered after OCR completes (fulltext changes) or deep-finalize. 
+ +## Files + +``` +Create: + paperforge/memory/vector_db.py — ChromaDB init, embed, query, delete + paperforge/memory/chunker.py — fulltext → chunks (rule-based) + paperforge/commands/embed.py — CLI: embed build/status + paperforge/commands/retrieve.py — CLI: retrieve + +Modify: + paperforge/memory/refresh.py — add vector refresh hook + paperforge/cli.py — register embed + retrieve +``` + +## Constraints + +1. Optional — disabled until user enables in settings +2. Requires `pip install chromadb sentence-transformers` (user installs or plugin offers button) +3. Windows compatible (ChromaDB embedded mode works on Windows) +4. `paperforge.db` remains source of truth; ChromaDB is deletable and rebuildable +5. No GPU required; CPU embedding for 150 papers takes ~30 seconds +6. API mode: respects rate limits, batches chunks to minimize API calls From 0d83278f1bd57b3f4006365fd0cacce9fca5a423 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 22:28:00 +0800 Subject: [PATCH 028/132] feat(plugin): tabbed settings UI (Installation + Features) with skill manager and feature toggles --- paperforge/plugin/main.js | 214 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 214 insertions(+) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 4695a57..a91f0e9 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -489,6 +489,10 @@ Object.assign(LANG.en, { ocr_privacy_title: 'OCR Privacy Notice', ocr_privacy_warning: 'OCR will upload PDFs to the PaddleOCR API. 
Do not upload sensitive or confidential documents.', ocr_understand: 'I understand, continue', + + /* ── Tabbed Settings ── */ + tab_setup: 'Installation', + tab_features: 'Features', }); /* ── LANG.zh: v1.12 runtime health, OCR queue, pf-deep, dashboard translations ── */ @@ -524,6 +528,10 @@ Object.assign(LANG.zh, { install_validating: '正在校验安装环境…', install_bootstrapping: '未检测到 PaperForge Python 包,正在自动安装…', wizard_safety: '安全说明:如果你选择的目录里已经有文件,安装向导会保留已有内容,只补充缺失的 PaperForge 文件和目录。', + + /* ── Tabbed Settings ── */ + tab_setup: '安装', + tab_features: '功能', }); function langFromApp(app) { @@ -553,6 +561,17 @@ const DEFAULT_SETTINGS = { paddleocr_api_key: '', zotero_data_dir: '', python_path: '', + // Feature toggles + features: { + fts_search: true, + agent_context: true, + reading_log: true, + vector_db: false, + }, + vector_db_mode: 'local', + vector_db_model: 'BAAI/bge-small-en-v1.5', + vector_db_api_key: '', + frozen_skills: {}, }; // ACTIONS, resolvePythonExecutable extracted to src/ modules (Plan 53-001) @@ -2196,6 +2215,7 @@ class PaperForgeSettingTab extends PluginSettingTab { this.plugin = plugin; this._saveTimeout = null; this._pfConfig = null; // cached paperforge.json config + this.activeTab = 'setup'; } /** Reload path config from paperforge.json */ @@ -2208,6 +2228,55 @@ class PaperForgeSettingTab extends PluginSettingTab { containerEl.empty(); this._refreshPfConfig(); + // Inject tab CSS once + if (!document.getElementById('paperforge-tab-styles')) { + const style = document.createElement('style'); + style.id = 'paperforge-tab-styles'; + style.textContent = ` + .paperforge-settings-tabs { display: flex; gap: 4px; margin-bottom: 16px; border-bottom: 1px solid var(--background-modifier-border); } + .paperforge-settings-tab { padding: 6px 16px; border: none; background: none; cursor: pointer; border-bottom: 2px solid transparent; font-size: 14px; color: var(--text-muted); } + .paperforge-settings-tab--active { color: var(--text-accent); border-bottom-color: 
var(--text-accent); } + .paperforge-tab-content { display: none; } + .paperforge-tab-content--active { display: block; } + `; + document.head.appendChild(style); + } + + // --- Tab bar --- + const tabBar = containerEl.createDiv({ cls: 'paperforge-settings-tabs' }); + const tabs = [ + { id: 'setup', label: t('tab_setup') || 'Installation' }, + { id: 'features', label: t('tab_features') || 'Features' }, + ]; + const tabContents = {}; + + tabs.forEach(tab => { + const btn = tabBar.createEl('button', { + cls: 'paperforge-settings-tab' + (tab.id === this.activeTab ? ' paperforge-settings-tab--active' : ''), + text: tab.label, + }); + btn.addEventListener('click', () => { + this.activeTab = tab.id; + this.display(); // re-render with new active tab + }); + }); + + // --- Tab content containers --- + tabs.forEach(tab => { + tabContents[tab.id] = containerEl.createDiv({ + cls: 'paperforge-tab-content' + (tab.id === this.activeTab ? ' paperforge-tab-content--active' : ''), + }); + }); + + // --- Render active tab --- + if (this.activeTab === 'setup') { + this._renderSetupTab(tabContents.setup); + } else { + this._renderFeaturesTab(tabContents.features); + } + } + + _renderSetupTab(containerEl) { const vaultPath = this.app.vault.adapter.basePath; if (!this.plugin.settings.vault_path) { this.plugin.settings.vault_path = vaultPath; @@ -2413,6 +2482,146 @@ class PaperForgeSettingTab extends PluginSettingTab { } } + _renderFeaturesTab(containerEl) { + // --- Section: Skills --- + containerEl.createEl('h3', { text: 'Skills' }); + containerEl.createEl('p', { text: 'Agent skills extend PaperForge with specialized capabilities.', cls: 'setting-item-description' }); + + // Scan vault-local agent skill dirs + const agentDirs = [ + '.opencode/skills', '.claude/skills', '.codex/skills', + '.cursor/skills', '.windsurf/skills', '.github/skills', + ]; + const vaultPath = this.app.vault.adapter.basePath; + const fs = require('fs'); + const path = require('path'); + let foundSkills = false; 
+ + agentDirs.forEach(dir => { + const skillDir = path.join(vaultPath, dir); + if (!fs.existsSync(skillDir)) return; + fs.readdirSync(skillDir, { withFileTypes: true }).forEach(entry => { + if (!entry.isDirectory()) return; + const skillFile = path.join(skillDir, entry.name, 'SKILL.md'); + if (!fs.existsSync(skillFile)) return; + const content = fs.readFileSync(skillFile, 'utf-8'); + const nameMatch = content.match(/^name:\s*(.+)$/m); + const descMatch = content.match(/^description:\s*(.+)$/m); + const sourceMatch = content.match(/^source:\s*(.+)$/m); + const disableMatch = content.match(/^disable-model-invocation:\s*(.+)$/m); + const versionMatch = content.match(/^version:\s*(.+)$/m); + + const name = nameMatch ? nameMatch[1].trim() : entry.name; + const desc = descMatch ? descMatch[1].trim() : ''; + const source = sourceMatch ? sourceMatch[1].trim() : 'user'; + const disabled = disableMatch && disableMatch[1].trim() === 'true'; + const version = versionMatch ? versionMatch[1].trim() : ''; + const isSystem = source === 'paperforge'; + foundSkills = true; + + const setting = new Setting(containerEl) + .setName(name + (isSystem ? ' (system)' : ' (user)') + (version ? ' v' + version : '')) + .setDesc(desc || 'No description'); + + setting.addToggle(toggle => { + toggle.setValue(!disabled) + .onChange(async (value) => { + const newContent = disableMatch + ? content.replace(/^disable-model-invocation:\s*.+$/m, `disable-model-invocation: ${!value}`) + : content.replace(/^(---\r?\n)/, `$1disable-model-invocation: ${!value}\n`); + fs.writeFileSync(skillFile, newContent, 'utf-8'); + }); + }); + }); + }); + + if (!foundSkills) { + containerEl.createEl('p', { text: 'No skills found. 
Run setup to deploy skills.', cls: 'setting-item-description' }); + } + + // --- Section: Memory Layer Features --- + containerEl.createEl('h3', { text: 'Memory Layer' }); + const featureSettings = [ + { key: 'fts_search', name: 'Full-text Search', desc: 'Enables paperforge search command (FTS5)' }, + { key: 'agent_context', name: 'Agent Context', desc: 'Enables paperforge agent-context command' }, + { key: 'reading_log', name: 'Reading Log', desc: 'Enables paper_events table for reading notes' }, + ]; + featureSettings.forEach(fs => { + new Setting(containerEl) + .setName(fs.name) + .setDesc(fs.desc) + .addToggle(toggle => { + toggle.setValue(this.plugin.settings.features[fs.key]) + .onChange(value => { + this.plugin.settings.features[fs.key] = value; + this.plugin.saveSettings(); + }); + }); + }); + + // --- Section: Vector Database --- + containerEl.createEl('h3', { text: 'Vector Database' }); + new Setting(containerEl) + .setName('Enable Vector Retrieval') + .setDesc('Semantic search across OCR fulltext using embeddings. Requires pip install chromadb sentence-transformers.') + .addToggle(toggle => { + toggle.setValue(this.plugin.settings.features.vector_db) + .onChange(value => { + this.plugin.settings.features.vector_db = value; + this.plugin.saveSettings(); + }); + }); + + if (this.plugin.settings.features.vector_db) { + // Mode selection + new Setting(containerEl) + .setName('Embedding Mode') + .setDesc('Local: free, offline, CPU-based. 
API: higher quality, requires API key.') + .addDropdown(dropdown => { + dropdown.addOption('local', 'Local (bge-small-en-v1.5)'); + dropdown.addOption('api', 'API (text-embedding-3-small)'); + dropdown.setValue(this.plugin.settings.vector_db_mode) + .onChange(value => { + this.plugin.settings.vector_db_mode = value; + this.plugin.saveSettings(); + }); + }); + + // Model selection (local mode only) + if (this.plugin.settings.vector_db_mode === 'local') { + new Setting(containerEl) + .setName('Local Model') + .setDesc('Embedding model for vector search. First use downloads ~130MB.') + .addDropdown(dropdown => { + dropdown.addOption('BAAI/bge-small-en-v1.5', 'bge-small-en-v1.5 (384d, 130MB, fast)'); + dropdown.addOption('sentence-transformers/all-MiniLM-L6-v2', 'all-MiniLM-L6-v2 (384d, 80MB, fastest)'); + dropdown.addOption('BAAI/bge-base-en-v1.5', 'bge-base-en-v1.5 (768d, 440MB)'); + dropdown.addOption('sentence-transformers/all-mpnet-base-v2', 'all-mpnet-base-v2 (768d, 420MB)'); + dropdown.setValue(this.plugin.settings.vector_db_model) + .onChange(value => { + this.plugin.settings.vector_db_model = value; + this.plugin.saveSettings(); + }); + }); + } + + // API key (API mode only) + if (this.plugin.settings.vector_db_mode === 'api') { + new Setting(containerEl) + .setName('API Key') + .setDesc('OpenAI API key for text-embedding-3-small') + .addText(text => { + text.setPlaceholder('sk-...') + .setValue(this.plugin.settings.vector_db_api_key) + .onChange(value => { + this.plugin.settings.vector_db_api_key = value; + this.plugin.saveSettings(); + }); + }); + } + } + } + _getPythonDesc(pyPath, source) { if (source === 'stale') { return `[!!] 
${pyPath} (stale — path no longer exists, update or clear the override below)`; @@ -3512,6 +3721,11 @@ module.exports = class PaperForgePlugin extends Plugin { async loadSettings() { this.settings = Object.assign({}, DEFAULT_SETTINGS, await this.loadData()); + // Deep-merge nested objects (features, frozen_skills) to avoid overwrite + if (this.settings.features && DEFAULT_SETTINGS.features) { + this.settings.features = Object.assign({}, DEFAULT_SETTINGS.features, this.settings.features || {}); + } + if (!this.settings.frozen_skills) { this.settings.frozen_skills = {}; } // Path fields come from paperforge.json, not from DEFAULT_SETTINGS or plugin data.json const pfConfig = this.readPaperforgeJson(); this.settings.system_dir = pfConfig.system_dir; From 3532b34710f4c3ddff03f2606d5e43bf67fdd914 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 22:41:49 +0800 Subject: [PATCH 029/132] fix(plugin): single memory toggle + platform skill selector + vector UX improvements --- paperforge/plugin/main.js | 342 ++++++++++++++++++++++++++++---------- 1 file changed, 251 insertions(+), 91 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index a91f0e9..41d85fe 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -563,11 +563,10 @@ const DEFAULT_SETTINGS = { python_path: '', // Feature toggles features: { - fts_search: true, - agent_context: true, - reading_log: true, + memory_layer: true, vector_db: false, }, + selected_skill_platform: 'opencode', vector_db_mode: 'local', vector_db_model: 'BAAI/bge-small-en-v1.5', vector_db_api_key: '', @@ -2483,23 +2482,102 @@ class PaperForgeSettingTab extends PluginSettingTab { } _renderFeaturesTab(containerEl) { + // --- Section: Memory Layer --- + containerEl.createEl('h3', { text: 'Memory Layer' }); + + new Setting(containerEl) + .setName('Enable Memory Layer') + .setDesc('SQLite index for fast paper lookup, search, and agent context.') + .addToggle(toggle => { + 
toggle.setValue(this.plugin.settings.features.memory_layer) + .onChange(value => { + this.plugin.settings.features.memory_layer = value; + // Also toggle sub-features together + this.plugin.settings.features.fts_search = value; + this.plugin.settings.features.agent_context = value; + this.plugin.settings.features.reading_log = value; + this.plugin.saveSettings(); + // Refresh display to show status + this.display(); + }); + }); + + // Show memory status when enabled + if (this.plugin.settings.features.memory_layer) { + const statusEl = containerEl.createEl('div', { cls: 'paperforge-memory-status' }); + statusEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; + try { + const { execSync } = require('child_process'); + const vp = this.app.vault.adapter.basePath; + const fs = require('fs'); + const pythonPath = this.plugin.settings.python_path; + if (pythonPath && fs.existsSync(pythonPath)) { + const result = execSync(`"${pythonPath}" -m paperforge --vault "${vp}" memory status --json`, { encoding: 'utf-8', timeout: 10000 }); + const data = JSON.parse(result); + if (data.ok) { + const s = data.data; + const freshness = s.fresh ? 'fresh' : 'stale'; + statusEl.createEl('span', { text: `Papers: ${s.paper_count_db} | Schema: ${s.schema_ok ? 'OK' : 'MISMATCH'} | Status: ${freshness}` }); + if (s.needs_rebuild) { + statusEl.createEl('br'); + statusEl.createEl('span', { text: 'Needs rebuild: run paperforge memory build', cls: 'paperforge-status-warn' }); + } + } else { + statusEl.createEl('span', { text: 'DB not found. Run paperforge memory build.', cls: 'paperforge-status-warn' }); + } + } + } catch(e) { + statusEl.createEl('span', { text: 'Could not check memory status. 
Ensure Python is configured.', cls: 'paperforge-status-warn' }); + } + } + // --- Section: Skills --- containerEl.createEl('h3', { text: 'Skills' }); - containerEl.createEl('p', { text: 'Agent skills extend PaperForge with specialized capabilities.', cls: 'setting-item-description' }); - // Scan vault-local agent skill dirs - const agentDirs = [ - '.opencode/skills', '.claude/skills', '.codex/skills', - '.cursor/skills', '.windsurf/skills', '.github/skills', - ]; + // Agent platform selector + const agentPlatforms = { + 'opencode': 'OpenCode', + 'claude': 'Claude Code', + 'codex': 'Codex', + 'cursor': 'Cursor', + 'windsurf': 'Windsurf', + 'github_copilot': 'GitHub Copilot', + }; + const agentDirs = { + 'opencode': '.opencode/skills', + 'claude': '.claude/skills', + 'codex': '.codex/skills', + 'cursor': '.cursor/skills', + 'windsurf': '.windsurf/skills', + 'github_copilot': '.github/skills', + }; + const vaultPath = this.app.vault.adapter.basePath; const fs = require('fs'); const path = require('path'); - let foundSkills = false; - agentDirs.forEach(dir => { - const skillDir = path.join(vaultPath, dir); - if (!fs.existsSync(skillDir)) return; + let selectedPlatform = this.plugin.settings.selected_skill_platform || 'opencode'; + + new Setting(containerEl) + .setName('Agent Platform') + .setDesc('Select which agent platform to manage skills for.') + .addDropdown(dropdown => { + Object.entries(agentPlatforms).forEach(([key, label]) => dropdown.addOption(key, label)); + dropdown.setValue(selectedPlatform) + .onChange(value => { + this.plugin.settings.selected_skill_platform = value; + this.plugin.saveSettings(); + // Re-render to show correct platform's skills + this.display(); + }); + }); + + // Show skills for selected platform + const skillDir = path.join(vaultPath, agentDirs[selectedPlatform]); + let systemSkills = []; + let userSkills = []; + + if (fs.existsSync(skillDir)) { fs.readdirSync(skillDir, { withFileTypes: true }).forEach(entry => { if 
(!entry.isDirectory()) return; const skillFile = path.join(skillDir, entry.name, 'SKILL.md'); @@ -2511,113 +2589,195 @@ class PaperForgeSettingTab extends PluginSettingTab { const disableMatch = content.match(/^disable-model-invocation:\s*(.+)$/m); const versionMatch = content.match(/^version:\s*(.+)$/m); - const name = nameMatch ? nameMatch[1].trim() : entry.name; - const desc = descMatch ? descMatch[1].trim() : ''; - const source = sourceMatch ? sourceMatch[1].trim() : 'user'; - const disabled = disableMatch && disableMatch[1].trim() === 'true'; - const version = versionMatch ? versionMatch[1].trim() : ''; - const isSystem = source === 'paperforge'; - foundSkills = true; - - const setting = new Setting(containerEl) - .setName(name + (isSystem ? ' (system)' : ' (user)') + (version ? ' v' + version : '')) - .setDesc(desc || 'No description'); - - setting.addToggle(toggle => { - toggle.setValue(!disabled) - .onChange(async (value) => { - const newContent = disableMatch - ? content.replace(/^disable-model-invocation:\s*.+$/m, `disable-model-invocation: ${!value}`) - : content.replace(/^(---\r?\n)/, `$1disable-model-invocation: ${!value}\n`); - fs.writeFileSync(skillFile, newContent, 'utf-8'); - }); - }); + const skill = { + name: nameMatch ? nameMatch[1].trim() : entry.name, + desc: descMatch ? descMatch[1].trim() : '', + source: sourceMatch ? sourceMatch[1].trim() : 'user', + disabled: disableMatch && disableMatch[1].trim() === 'true', + version: versionMatch ? versionMatch[1].trim() : '', + path: skillFile, + content: content, + dirName: entry.name, + }; + + if (skill.source === 'paperforge') { + systemSkills.push(skill); + } else { + userSkills.push(skill); + } }); - }); + } + + // Helper to render a skill row + const renderSkillRow = (skill, isSystem) => { + const nameText = skill.name + (skill.version ? ' v' + skill.version : ''); + const sourceLabel = isSystem ? ' [system]' : ' [user]'; + const statusText = skill.disabled ? 
' (disabled)' : ' (enabled)'; + + const setting = new Setting(containerEl) + .setName(nameText + sourceLabel) + .setDesc((skill.desc || 'No description') + statusText); - if (!foundSkills) { - containerEl.createEl('p', { text: 'No skills found. Run setup to deploy skills.', cls: 'setting-item-description' }); + const disableMatch = skill.content.match(/^disable-model-invocation:\s*(.+)$/m); + + setting.addToggle(toggle => { + toggle.setValue(!skill.disabled) + .onChange(value => { + const newDisabled = !value; + const newContent = disableMatch + ? skill.content.replace(/^disable-model-invocation:\s*.+$/m, `disable-model-invocation: ${newDisabled}`) + : skill.content.replace(/^(---\r?\n)/, `$1disable-model-invocation: ${newDisabled}\n`); + fs.writeFileSync(skill.path, newContent, 'utf-8'); + skill.disabled = newDisabled; + // Update status text in desc + setting.setDesc((skill.desc || 'No description') + (skill.disabled ? ' (disabled)' : ' (enabled)')); + }); + }); + }; + + // System skills + if (systemSkills.length > 0) { + containerEl.createEl('h4', { text: 'System Skills', cls: 'paperforge-skills-subheader' }); + systemSkills.forEach(s => renderSkillRow(s, true)); } - // --- Section: Memory Layer Features --- - containerEl.createEl('h3', { text: 'Memory Layer' }); - const featureSettings = [ - { key: 'fts_search', name: 'Full-text Search', desc: 'Enables paperforge search command (FTS5)' }, - { key: 'agent_context', name: 'Agent Context', desc: 'Enables paperforge agent-context command' }, - { key: 'reading_log', name: 'Reading Log', desc: 'Enables paper_events table for reading notes' }, - ]; - featureSettings.forEach(fs => { - new Setting(containerEl) - .setName(fs.name) - .setDesc(fs.desc) - .addToggle(toggle => { - toggle.setValue(this.plugin.settings.features[fs.key]) - .onChange(value => { - this.plugin.settings.features[fs.key] = value; - this.plugin.saveSettings(); - }); - }); - }); + // User skills + if (userSkills.length > 0) { + 
containerEl.createEl('h4', { text: 'User Skills', cls: 'paperforge-skills-subheader' }); + userSkills.forEach(s => renderSkillRow(s, false)); + } + + if (systemSkills.length === 0 && userSkills.length === 0) { + containerEl.createEl('p', { + text: `No skills found in ${agentDirs[selectedPlatform]}. Run setup to deploy skills.`, + cls: 'setting-item-description' + }); + } // --- Section: Vector Database --- containerEl.createEl('h3', { text: 'Vector Database' }); + new Setting(containerEl) .setName('Enable Vector Retrieval') - .setDesc('Semantic search across OCR fulltext using embeddings. Requires pip install chromadb sentence-transformers.') + .setDesc('Semantic search across OCR fulltext. Requires: pip install chromadb sentence-transformers (~500MB).') .addToggle(toggle => { toggle.setValue(this.plugin.settings.features.vector_db) .onChange(value => { this.plugin.settings.features.vector_db = value; this.plugin.saveSettings(); + this.display(); }); }); if (this.plugin.settings.features.vector_db) { - // Mode selection - new Setting(containerEl) - .setName('Embedding Mode') - .setDesc('Local: free, offline, CPU-based. 
API: higher quality, requires API key.') - .addDropdown(dropdown => { - dropdown.addOption('local', 'Local (bge-small-en-v1.5)'); - dropdown.addOption('api', 'API (text-embedding-3-small)'); - dropdown.setValue(this.plugin.settings.vector_db_mode) - .onChange(value => { - this.plugin.settings.vector_db_mode = value; - this.plugin.saveSettings(); - }); + // Check if dependencies installed + let depsOk = false; + try { + const { execSync } = require('child_process'); + const pythonPath = this.plugin.settings.python_path; + if (pythonPath && fs.existsSync(pythonPath)) { + const result = execSync(`"${pythonPath}" -c "import chromadb; import sentence_transformers; print('ok')"`, { encoding: 'utf-8', timeout: 15000 }); + depsOk = result.trim() === 'ok'; + } + } catch(e) { depsOk = false; } + + if (!depsOk) { + const depWarning = containerEl.createEl('div', { + cls: 'paperforge-vector-warning', + text: 'Dependencies not installed. Required: chromadb, sentence-transformers.' }); + depWarning.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-modifier-error); border-radius:4px; color:var(--text-error);'; - // Model selection (local mode only) - if (this.plugin.settings.vector_db_mode === 'local') { new Setting(containerEl) - .setName('Local Model') - .setDesc('Embedding model for vector search. 
First use downloads ~130MB.') - .addDropdown(dropdown => { - dropdown.addOption('BAAI/bge-small-en-v1.5', 'bge-small-en-v1.5 (384d, 130MB, fast)'); - dropdown.addOption('sentence-transformers/all-MiniLM-L6-v2', 'all-MiniLM-L6-v2 (384d, 80MB, fastest)'); - dropdown.addOption('BAAI/bge-base-en-v1.5', 'bge-base-en-v1.5 (768d, 440MB)'); - dropdown.addOption('sentence-transformers/all-mpnet-base-v2', 'all-mpnet-base-v2 (768d, 420MB)'); - dropdown.setValue(this.plugin.settings.vector_db_model) - .onChange(value => { - this.plugin.settings.vector_db_model = value; - this.plugin.saveSettings(); + .setName('Install Dependencies') + .setDesc('Installs chromadb and sentence-transformers (~500MB disk)') + .addButton(button => { + button.setButtonText('Install') + .setCta() + .onClick(async () => { + button.setButtonText('Installing...'); + button.setDisabled(true); + try { + const { execSync } = require('child_process'); + const pythonPath = this.plugin.settings.python_path; + execSync(`"${pythonPath}" -m pip install chromadb sentence-transformers`, { + encoding: 'utf-8', + timeout: 300000, + stdio: 'pipe' + }); + new Notice('Dependencies installed. 
You can now run paperforge embed build.'); + this.display(); + } catch(e) { + new Notice('Install failed: ' + e.message); + button.setButtonText('Install'); + button.setDisabled(false); + } }); }); - } + } else { + // Show status + let embedStatus = null; + try { + const { execSync } = require('child_process'); + const vp = vaultPath; + const pythonPath = this.plugin.settings.python_path; + if (pythonPath && fs.existsSync(pythonPath)) { + const result = execSync(`"${pythonPath}" -m paperforge --vault "${vp}" embed status --json`, { encoding: 'utf-8', timeout: 10000 }); + embedStatus = JSON.parse(result); + } + } catch(e) {} + + if (embedStatus && embedStatus.ok) { + const statusEl = containerEl.createEl('div', { cls: 'paperforge-vector-status' }); + statusEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; + statusEl.createEl('span', { + text: `Chunks: ${embedStatus.data.chunk_count} | Model: ${embedStatus.data.model} | Mode: ${embedStatus.data.mode}` + }); + } - // API key (API mode only) - if (this.plugin.settings.vector_db_mode === 'api') { + // Mode selection new Setting(containerEl) - .setName('API Key') - .setDesc('OpenAI API key for text-embedding-3-small') - .addText(text => { - text.setPlaceholder('sk-...') - .setValue(this.plugin.settings.vector_db_api_key) + .setName('Embedding Mode') + .addDropdown(dropdown => { + dropdown.addOption('local', 'Local (free, CPU)'); + dropdown.addOption('api', 'API (OpenAI, paid)'); + dropdown.setValue(this.plugin.settings.vector_db_mode) .onChange(value => { - this.plugin.settings.vector_db_api_key = value; + this.plugin.settings.vector_db_mode = value; this.plugin.saveSettings(); + this.display(); }); }); + + // Model selection + if (this.plugin.settings.vector_db_mode === 'local') { + new Setting(containerEl) + .setName('Model') + .addDropdown(dropdown => { + dropdown.addOption('BAAI/bge-small-en-v1.5', 'bge-small (384d, 130MB)'); + 
dropdown.addOption('sentence-transformers/all-MiniLM-L6-v2', 'MiniLM (384d, 80MB)'); + dropdown.addOption('BAAI/bge-base-en-v1.5', 'bge-base (768d, 440MB)'); + dropdown.setValue(this.plugin.settings.vector_db_model) + .onChange(value => { + this.plugin.settings.vector_db_model = value; + this.plugin.saveSettings(); + }); + }); + } + + // API key + if (this.plugin.settings.vector_db_mode === 'api') { + new Setting(containerEl) + .setName('OpenAI API Key') + .addText(text => { + text.setPlaceholder('sk-...') + .setValue(this.plugin.settings.vector_db_api_key) + .onChange(value => { + this.plugin.settings.vector_db_api_key = value; + this.plugin.saveSettings(); + }); + }); + } } } } From 053b303c2a054f2b067670d4c1095f1303e53a8d Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 22:44:37 +0800 Subject: [PATCH 030/132] fix(plugin): remove dead cascading keys, fix skill toggle content sync, remove unused disableMatch --- paperforge/plugin/main.js | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 41d85fe..9e7a94c 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2492,12 +2492,7 @@ class PaperForgeSettingTab extends PluginSettingTab { toggle.setValue(this.plugin.settings.features.memory_layer) .onChange(value => { this.plugin.settings.features.memory_layer = value; - // Also toggle sub-features together - this.plugin.settings.features.fts_search = value; - this.plugin.settings.features.agent_context = value; - this.plugin.settings.features.reading_log = value; this.plugin.saveSettings(); - // Refresh display to show status this.display(); }); }); @@ -2618,21 +2613,21 @@ class PaperForgeSettingTab extends PluginSettingTab { .setName(nameText + sourceLabel) .setDesc((skill.desc || 'No description') + statusText); - const disableMatch = skill.content.match(/^disable-model-invocation:\s*(.+)$/m); - setting.addToggle(toggle => { 
toggle.setValue(!skill.disabled) .onChange(value => { const newDisabled = !value; + const disableMatch = skill.content.match(/^disable-model-invocation:\s*(.+)$/m); const newContent = disableMatch ? skill.content.replace(/^disable-model-invocation:\s*.+$/m, `disable-model-invocation: ${newDisabled}`) : skill.content.replace(/^(---\r?\n)/, `$1disable-model-invocation: ${newDisabled}\n`); fs.writeFileSync(skill.path, newContent, 'utf-8'); skill.disabled = newDisabled; - // Update status text in desc + skill.content = newContent; // keep in-memory copy in sync setting.setDesc((skill.desc || 'No description') + (skill.disabled ? ' (disabled)' : ' (enabled)')); }); }); + }); }; // System skills From 099fbed90aede0a5c3fcd95298b835de19ae98aa Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 22:52:49 +0800 Subject: [PATCH 031/132] fix(plugin): remove extra closing brace causing SyntaxError --- paperforge/plugin/main.js | 1 - 1 file changed, 1 deletion(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 9e7a94c..f44fd4b 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2627,7 +2627,6 @@ class PaperForgeSettingTab extends PluginSettingTab { setting.setDesc((skill.desc || 'No description') + (skill.disabled ? 
' (disabled)' : ' (enabled)')); }); }); - }); }; // System skills From 442d22a1653edaf32add63e5d656d3c9dfabe159 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 22:59:14 +0800 Subject: [PATCH 032/132] fix(plugin): use setText() instead of createEl text option for status/warning divs --- paperforge/plugin/main.js | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index f44fd4b..8159de2 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2512,17 +2512,17 @@ class PaperForgeSettingTab extends PluginSettingTab { if (data.ok) { const s = data.data; const freshness = s.fresh ? 'fresh' : 'stale'; - statusEl.createEl('span', { text: `Papers: ${s.paper_count_db} | Schema: ${s.schema_ok ? 'OK' : 'MISMATCH'} | Status: ${freshness}` }); + statusEl.setText(`Papers: ${s.paper_count_db} | Schema: ${s.schema_ok ? 'OK' : 'MISMATCH'} | Status: ${freshness}`); if (s.needs_rebuild) { statusEl.createEl('br'); statusEl.createEl('span', { text: 'Needs rebuild: run paperforge memory build', cls: 'paperforge-status-warn' }); } } else { - statusEl.createEl('span', { text: 'DB not found. Run paperforge memory build.', cls: 'paperforge-status-warn' }); + statusEl.setText('DB not found. Run paperforge memory build.'); } } } catch(e) { - statusEl.createEl('span', { text: 'Could not check memory status. Ensure Python is configured.', cls: 'paperforge-status-warn' }); + statusEl.setText('Could not check memory status. Ensure Python is configured.'); } } @@ -2675,12 +2675,10 @@ class PaperForgeSettingTab extends PluginSettingTab { } } catch(e) { depsOk = false; } - if (!depsOk) { - const depWarning = containerEl.createEl('div', { - cls: 'paperforge-vector-warning', - text: 'Dependencies not installed. Required: chromadb, sentence-transformers.' 
- }); - depWarning.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-modifier-error); border-radius:4px; color:var(--text-error);'; + if (!depsOk) { + const depWarning = containerEl.createEl('div', { cls: 'paperforge-vector-warning' }); + depWarning.setText('Dependencies not installed. Required: chromadb, sentence-transformers.'); + depWarning.style.cssText = 'padding:8px 12px; margin:8px 0; background:#4a1515; border-radius:4px; color:#ff6b6b;'; new Setting(containerEl) .setName('Install Dependencies') From cc4fd00f8f5e766f93cba11ef8f225d6e98dc6bb Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 23:04:10 +0800 Subject: [PATCH 033/132] fix(plugin): don't render empty memory status div when Python not configured --- paperforge/plugin/main.js | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 8159de2..440f7a9 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2499,8 +2499,6 @@ class PaperForgeSettingTab extends PluginSettingTab { // Show memory status when enabled if (this.plugin.settings.features.memory_layer) { - const statusEl = containerEl.createEl('div', { cls: 'paperforge-memory-status' }); - statusEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; try { const { execSync } = require('child_process'); const vp = this.app.vault.adapter.basePath; @@ -2509,20 +2507,18 @@ class PaperForgeSettingTab extends PluginSettingTab { if (pythonPath && fs.existsSync(pythonPath)) { const result = execSync(`"${pythonPath}" -m paperforge --vault "${vp}" memory status --json`, { encoding: 'utf-8', timeout: 10000 }); const data = JSON.parse(result); + const statusEl = containerEl.createEl('div', { cls: 'paperforge-memory-status' }); + statusEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; if 
(data.ok) { const s = data.data; - const freshness = s.fresh ? 'fresh' : 'stale'; - statusEl.setText(`Papers: ${s.paper_count_db} | Schema: ${s.schema_ok ? 'OK' : 'MISMATCH'} | Status: ${freshness}`); - if (s.needs_rebuild) { - statusEl.createEl('br'); - statusEl.createEl('span', { text: 'Needs rebuild: run paperforge memory build', cls: 'paperforge-status-warn' }); - } + const freshness = s.fresh ? 'fresh' : ''; + statusEl.setText(`Papers: ${s.paper_count_db} | ${freshness}${s.needs_rebuild ? ' — Needs rebuild: run paperforge memory build' : ''}`); } else { statusEl.setText('DB not found. Run paperforge memory build.'); } } } catch(e) { - statusEl.setText('Could not check memory status. Ensure Python is configured.'); + // Python not configured — no status shown } } From a3c32a03cc3f1d3eaae7d0e05a1be0e5a10559d4 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 23:07:03 +0800 Subject: [PATCH 034/132] fix(plugin): async pip install with persistent notice + UTF-8 encoding fix --- paperforge/plugin/main.js | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 440f7a9..036b112 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2685,19 +2685,28 @@ class PaperForgeSettingTab extends PluginSettingTab { .onClick(async () => { button.setButtonText('Installing...'); button.setDisabled(true); + const notice = new Notice('Installing chromadb + sentence-transformers...', 0); try { - const { execSync } = require('child_process'); + const { exec } = require('child_process'); const pythonPath = this.plugin.settings.python_path; - execSync(`"${pythonPath}" -m pip install chromadb sentence-transformers`, { - encoding: 'utf-8', - timeout: 300000, - stdio: 'pipe' + const env = Object.assign({}, process.env, { PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1' }); + await new Promise((resolve, reject) => { + exec(`"${pythonPath}" -m pip install 
chromadb sentence-transformers`, { + encoding: 'utf-8', + timeout: 300000, + env: env, + }, (error, stdout, stderr) => { + if (error) reject(error); + else resolve(stdout); + }); }); - new Notice('Dependencies installed. You can now run paperforge embed build.'); + notice.hide(); + new Notice('Dependencies installed. Run paperforge embed build to index.'); this.display(); } catch(e) { - new Notice('Install failed: ' + e.message); - button.setButtonText('Install'); + notice.hide(); + new Notice('Install failed: ' + (e.stderr || e.message || e)); + button.setButtonText('Retry'); button.setDisabled(false); } }); From 2cae6c1b54a76643a4aefedb397ce30a9b8f3705 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 23:11:21 +0800 Subject: [PATCH 035/132] fix(plugin): guard python_path before pip install, show hint when Python not configured --- paperforge/plugin/main.js | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 036b112..5f92f76 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2518,7 +2518,10 @@ class PaperForgeSettingTab extends PluginSettingTab { } } } catch(e) { - // Python not configured — no status shown + // Python not configured — show hint + const statusEl = containerEl.createEl('div', { cls: 'paperforge-memory-status' }); + statusEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; + statusEl.setText('Configure Python in Installation tab to enable status check.'); } } @@ -2683,6 +2686,11 @@ class PaperForgeSettingTab extends PluginSettingTab { button.setButtonText('Install') .setCta() .onClick(async () => { + const pythonPath = this.plugin.settings.python_path; + if (!pythonPath) { + new Notice('Configure Python path in Installation tab first.'); + return; + } button.setButtonText('Installing...'); button.setDisabled(true); const notice = new Notice('Installing chromadb 
+ sentence-transformers...', 0); From 10d9ef51003d15c1d18504251e7f652fc638a3dd Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 23:14:29 +0800 Subject: [PATCH 036/132] fix(plugin): use resolvePythonExecutable() everywhere instead of settings.python_path --- paperforge/plugin/main.js | 48 ++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 5f92f76..ea35b51 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2497,31 +2497,27 @@ class PaperForgeSettingTab extends PluginSettingTab { }); }); - // Show memory status when enabled if (this.plugin.settings.features.memory_layer) { - try { - const { execSync } = require('child_process'); - const vp = this.app.vault.adapter.basePath; - const fs = require('fs'); - const pythonPath = this.plugin.settings.python_path; - if (pythonPath && fs.existsSync(pythonPath)) { + const vp = this.app.vault.adapter.basePath; + const pyResult = resolvePythonExecutable(vp, this.plugin.settings); + const pythonPath = pyResult.path; + if (pythonPath) { + try { + const { execSync } = require('child_process'); const result = execSync(`"${pythonPath}" -m paperforge --vault "${vp}" memory status --json`, { encoding: 'utf-8', timeout: 10000 }); const data = JSON.parse(result); const statusEl = containerEl.createEl('div', { cls: 'paperforge-memory-status' }); statusEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; if (data.ok) { const s = data.data; - const freshness = s.fresh ? 'fresh' : ''; - statusEl.setText(`Papers: ${s.paper_count_db} | ${freshness}${s.needs_rebuild ? ' — Needs rebuild: run paperforge memory build' : ''}`); + const freshness = s.fresh ? 'fresh' : 'stale'; + statusEl.setText(`Papers: ${s.paper_count_db} | ${freshness}${s.needs_rebuild ? ' — needs rebuild' : ''}`); } else { statusEl.setText('DB not found. 
Run paperforge memory build.'); } + } catch(e) { + // silent — execSync failed } - } catch(e) { - // Python not configured — show hint - const statusEl = containerEl.createEl('div', { cls: 'paperforge-memory-status' }); - statusEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; - statusEl.setText('Configure Python in Installation tab to enable status check.'); } } @@ -2665,10 +2661,10 @@ class PaperForgeSettingTab extends PluginSettingTab { if (this.plugin.settings.features.vector_db) { // Check if dependencies installed let depsOk = false; - try { - const { execSync } = require('child_process'); - const pythonPath = this.plugin.settings.python_path; - if (pythonPath && fs.existsSync(pythonPath)) { + const vp = this.app.vault.adapter.basePath; + const pyResult = resolvePythonExecutable(vp, this.plugin.settings); + const pythonPath = pyResult.path; + if (pythonPath) { const result = execSync(`"${pythonPath}" -c "import chromadb; import sentence_transformers; print('ok')"`, { encoding: 'utf-8', timeout: 15000 }); depsOk = result.trim() === 'ok'; } @@ -2686,9 +2682,11 @@ class PaperForgeSettingTab extends PluginSettingTab { button.setButtonText('Install') .setCta() .onClick(async () => { - const pythonPath = this.plugin.settings.python_path; + const vp = this.app.vault.adapter.basePath; + const pyResult = resolvePythonExecutable(vp, this.plugin.settings); + const pythonPath = pyResult.path; if (!pythonPath) { - new Notice('Configure Python path in Installation tab first.'); + new Notice('No Python found. 
Configure in Installation tab or install Python first.'); return; } button.setButtonText('Installing...'); @@ -2696,7 +2694,6 @@ class PaperForgeSettingTab extends PluginSettingTab { const notice = new Notice('Installing chromadb + sentence-transformers...', 0); try { const { exec } = require('child_process'); - const pythonPath = this.plugin.settings.python_path; const env = Object.assign({}, process.env, { PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1' }); await new Promise((resolve, reject) => { exec(`"${pythonPath}" -m pip install chromadb sentence-transformers`, { @@ -2722,11 +2719,10 @@ class PaperForgeSettingTab extends PluginSettingTab { } else { // Show status let embedStatus = null; - try { - const { execSync } = require('child_process'); - const vp = vaultPath; - const pythonPath = this.plugin.settings.python_path; - if (pythonPath && fs.existsSync(pythonPath)) { + const vp = this.app.vault.adapter.basePath; + const pyResult = resolvePythonExecutable(vp, this.plugin.settings); + const pythonPath = pyResult.path; + if (pythonPath) { const result = execSync(`"${pythonPath}" -m paperforge --vault "${vp}" embed status --json`, { encoding: 'utf-8', timeout: 10000 }); embedStatus = JSON.parse(result); } From d90be02a82d7ea60f28fb68283b63bee80093b86 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 23:16:17 +0800 Subject: [PATCH 037/132] fix(plugin): fix try/catch mismatches in vector deps + embed status blocks --- paperforge/plugin/main.js | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index ea35b51..68615a0 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2665,10 +2665,12 @@ class PaperForgeSettingTab extends PluginSettingTab { const pyResult = resolvePythonExecutable(vp, this.plugin.settings); const pythonPath = pyResult.path; if (pythonPath) { + try { + const { execSync } = require('child_process'); const result = 
execSync(`"${pythonPath}" -c "import chromadb; import sentence_transformers; print('ok')"`, { encoding: 'utf-8', timeout: 15000 }); depsOk = result.trim() === 'ok'; - } - } catch(e) { depsOk = false; } + } catch(e) { depsOk = false; } + } if (!depsOk) { const depWarning = containerEl.createEl('div', { cls: 'paperforge-vector-warning' }); @@ -2723,10 +2725,12 @@ class PaperForgeSettingTab extends PluginSettingTab { const pyResult = resolvePythonExecutable(vp, this.plugin.settings); const pythonPath = pyResult.path; if (pythonPath) { + try { + const { execSync } = require('child_process'); const result = execSync(`"${pythonPath}" -m paperforge --vault "${vp}" embed status --json`, { encoding: 'utf-8', timeout: 10000 }); embedStatus = JSON.parse(result); - } - } catch(e) {} + } catch(e) {} + } if (embedStatus && embedStatus.ok) { const statusEl = containerEl.createEl('div', { cls: 'paperforge-vector-status' }); From f9ac0cb243e352c03b988b39f18f1def275524d1 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 23:32:03 +0800 Subject: [PATCH 038/132] fix(plugin): repair brace mismatches from refactoring --- paperforge/plugin/main.js | 334 +++++++++++++++++++++++--------------- 1 file changed, 202 insertions(+), 132 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 68615a0..c4da5c0 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2481,6 +2481,91 @@ class PaperForgeSettingTab extends PluginSettingTab { } } + _execMemoryStatus(pythonPath, vp, statusEl) { + const { exec } = require('child_process'); + exec(`"${pythonPath}" -m paperforge --vault "${vp}" memory status --json`, { encoding: 'utf-8', timeout: 15000 }, (err, stdout) => { + if (err) { statusEl.setText('Status unavailable'); return; } + try { + const data = JSON.parse(stdout); + if (data.ok) { + const s = data.data; + const freshness = s.fresh ? 
'fresh' : 'stale'; + statusEl.setText(`Papers: ${s.paper_count_db} | ${freshness}${s.needs_rebuild ? ' - needs rebuild' : ''}`); + } else { + statusEl.setText('DB not found. Run paperforge memory build.'); + } + } catch(e) { statusEl.setText('Could not parse status.'); } + }); + } + + _execVectorDeps(pythonPath, callback) { + const { exec } = require('child_process'); + exec(`"${pythonPath}" -c "import chromadb; import sentence_transformers; print('ok')"`, { encoding: 'utf-8', timeout: 15000 }, (err, stdout) => { + callback(err ? false : (stdout.trim() === 'ok')); + }); + } + + _execEmbedStatus(pythonPath, vp, statusEl) { + const { exec } = require('child_process'); + exec(`"${pythonPath}" -m paperforge --vault "${vp}" embed status --json`, { encoding: 'utf-8', timeout: 15000 }, (err, stdout) => { + if (err) { statusEl.setText('Status unavailable'); return; } + try { + const data = JSON.parse(stdout); + if (data.ok) { + statusEl.setText(`Chunks: ${data.data.chunk_count} | ${data.data.model} | ${data.data.mode}`); + } + } catch(e) { statusEl.setText('Could not parse status.'); } + }); + } + + _resolvePythonAsync(callback) { + const { exec } = require('child_process'); + const vp = this.app.vault.adapter.basePath; + const settings = this.plugin.settings; + + // Fast path: manual or venv candidates (sync fs check only, no exec) + if (settings && settings.python_path && settings.python_path.trim()) { + const manualPath = settings.python_path.trim(); + if (fs.existsSync(manualPath)) { + callback({ path: manualPath, source: 'manual', extraArgs: [] }); + return; + } + } + const venvCandidates = [ + path.join(vp, '.paperforge-test-venv', 'Scripts', 'python.exe'), + path.join(vp, '.venv', 'Scripts', 'python.exe'), + path.join(vp, 'venv', 'Scripts', 'python.exe'), + ]; + for (const candidate of venvCandidates) { + try { + if (fs.existsSync(candidate)) { + callback({ path: candidate, source: 'auto-detected', extraArgs: [] }); + return; + } + } catch {} + } + // Slow path: 
test system candidates with async exec + const systemCandidates = [ + { path: 'python', extraArgs: [] }, + { path: 'python3', extraArgs: [] }, + ]; + const tryNext = (idx) => { + if (idx >= systemCandidates.length) { + callback({ path: 'python', source: 'auto-detected', extraArgs: [] }); + return; + } + const c = systemCandidates[idx]; + exec(`"${c.path}" --version`, { encoding: 'utf-8', timeout: 5000 }, (err, stdout) => { + if (!err && stdout && stdout.toLowerCase().includes('python')) { + callback({ path: c.path, source: 'auto-detected', extraArgs: c.extraArgs }); + } else { + tryNext(idx + 1); + } + }); + }; + tryNext(0); + } + _renderFeaturesTab(containerEl) { // --- Section: Memory Layer --- containerEl.createEl('h3', { text: 'Memory Layer' }); @@ -2497,28 +2582,20 @@ class PaperForgeSettingTab extends PluginSettingTab { }); }); + // Show memory status when enabled — async to not block if (this.plugin.settings.features.memory_layer) { + const statusEl = containerEl.createEl('div', { cls: 'paperforge-memory-status' }); + statusEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; + statusEl.setText('Checking...'); const vp = this.app.vault.adapter.basePath; - const pyResult = resolvePythonExecutable(vp, this.plugin.settings); - const pythonPath = pyResult.path; - if (pythonPath) { - try { - const { execSync } = require('child_process'); - const result = execSync(`"${pythonPath}" -m paperforge --vault "${vp}" memory status --json`, { encoding: 'utf-8', timeout: 10000 }); - const data = JSON.parse(result); - const statusEl = containerEl.createEl('div', { cls: 'paperforge-memory-status' }); - statusEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; - if (data.ok) { - const s = data.data; - const freshness = s.fresh ? 'fresh' : 'stale'; - statusEl.setText(`Papers: ${s.paper_count_db} | ${freshness}${s.needs_rebuild ? 
' — needs rebuild' : ''}`); - } else { - statusEl.setText('DB not found. Run paperforge memory build.'); - } - } catch(e) { - // silent — execSync failed + this._resolvePythonAsync(pyResult => { + const pythonPath = pyResult.path; + if (pythonPath) { + this._execMemoryStatus(pythonPath, vp, statusEl); + } else { + statusEl.setText('No Python found. Check Installation tab.'); } - } + }); } // --- Section: Skills --- @@ -2659,131 +2736,124 @@ class PaperForgeSettingTab extends PluginSettingTab { }); if (this.plugin.settings.features.vector_db) { - // Check if dependencies installed - let depsOk = false; + // Check if dependencies installed — async + const depsEl = containerEl.createEl('div'); + depsEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; + depsEl.setText('Checking dependencies...'); const vp = this.app.vault.adapter.basePath; - const pyResult = resolvePythonExecutable(vp, this.plugin.settings); - const pythonPath = pyResult.path; - if (pythonPath) { - try { - const { execSync } = require('child_process'); - const result = execSync(`"${pythonPath}" -c "import chromadb; import sentence_transformers; print('ok')"`, { encoding: 'utf-8', timeout: 15000 }); - depsOk = result.trim() === 'ok'; - } catch(e) { depsOk = false; } - } - - if (!depsOk) { - const depWarning = containerEl.createEl('div', { cls: 'paperforge-vector-warning' }); - depWarning.setText('Dependencies not installed. 
Required: chromadb, sentence-transformers.'); - depWarning.style.cssText = 'padding:8px 12px; margin:8px 0; background:#4a1515; border-radius:4px; color:#ff6b6b;'; - - new Setting(containerEl) - .setName('Install Dependencies') - .setDesc('Installs chromadb and sentence-transformers (~500MB disk)') - .addButton(button => { - button.setButtonText('Install') - .setCta() - .onClick(async () => { - const vp = this.app.vault.adapter.basePath; - const pyResult = resolvePythonExecutable(vp, this.plugin.settings); - const pythonPath = pyResult.path; - if (!pythonPath) { - new Notice('No Python found. Configure in Installation tab or install Python first.'); - return; - } - button.setButtonText('Installing...'); - button.setDisabled(true); - const notice = new Notice('Installing chromadb + sentence-transformers...', 0); - try { - const { exec } = require('child_process'); - const env = Object.assign({}, process.env, { PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1' }); - await new Promise((resolve, reject) => { - exec(`"${pythonPath}" -m pip install chromadb sentence-transformers`, { - encoding: 'utf-8', - timeout: 300000, - env: env, - }, (error, stdout, stderr) => { - if (error) reject(error); - else resolve(stdout); - }); - }); - notice.hide(); - new Notice('Dependencies installed. 
Run paperforge embed build to index.'); - this.display(); - } catch(e) { - notice.hide(); - new Notice('Install failed: ' + (e.stderr || e.message || e)); - button.setButtonText('Retry'); - button.setDisabled(false); - } - }); - }); - } else { - // Show status - let embedStatus = null; - const vp = this.app.vault.adapter.basePath; - const pyResult = resolvePythonExecutable(vp, this.plugin.settings); + this._resolvePythonAsync(pyResult => { const pythonPath = pyResult.path; if (pythonPath) { - try { - const { execSync } = require('child_process'); - const result = execSync(`"${pythonPath}" -m paperforge --vault "${vp}" embed status --json`, { encoding: 'utf-8', timeout: 10000 }); - embedStatus = JSON.parse(result); - } catch(e) {} - } - - if (embedStatus && embedStatus.ok) { - const statusEl = containerEl.createEl('div', { cls: 'paperforge-vector-status' }); - statusEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; - statusEl.createEl('span', { - text: `Chunks: ${embedStatus.data.chunk_count} | Model: ${embedStatus.data.model} | Mode: ${embedStatus.data.mode}` + this._execVectorDeps(pythonPath, (ok) => { + if (ok) { + depsEl.remove(); + this._renderVectorConfig(containerEl); + } else { + depsEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:#4a1515; border-radius:4px; color:#ff6b6b;'; + depsEl.setText('Dependencies not installed. Required: chromadb, sentence-transformers.'); + this._renderVectorInstall(containerEl); + } }); + } else { + depsEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:#4a1515; border-radius:4px; color:#ff6b6b;'; + depsEl.setText('No Python found. 
Check Installation tab.'); } + }); + } + } - // Mode selection - new Setting(containerEl) - .setName('Embedding Mode') - .addDropdown(dropdown => { - dropdown.addOption('local', 'Local (free, CPU)'); - dropdown.addOption('api', 'API (OpenAI, paid)'); - dropdown.setValue(this.plugin.settings.vector_db_mode) - .onChange(value => { - this.plugin.settings.vector_db_mode = value; - this.plugin.saveSettings(); - this.display(); + _renderVectorInstall(containerEl) { + new Setting(containerEl) + .setName('Install Dependencies') + .setDesc('Installs chromadb and sentence-transformers (~500MB disk)') + .addButton(button => { + button.setButtonText('Install') + .setCta() + .onClick(async () => { + const vp = this.app.vault.adapter.basePath; + const pyResult = resolvePythonExecutable(vp, this.plugin.settings); + const pythonPath = pyResult.path; + if (!pythonPath) { + new Notice('No Python found. Install tab.'); + return; + } + button.setButtonText('Installing...'); + button.setDisabled(true); + const notice = new Notice('Installing chromadb + sentence-transformers...', 0); + try { + const { exec } = require('child_process'); + const env = Object.assign({}, process.env, { PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1' }); + await new Promise((resolve, reject) => { + exec(`"${pythonPath}" -m pip install chromadb sentence-transformers`, { + encoding: 'utf-8', timeout: 300000, env: env, + }, (error, stdout, stderr) => { + if (error) reject(error); + else resolve(stdout); + }); }); + notice.hide(); + new Notice('Done. 
Run paperforge embed build.'); + this.display(); + } catch(e) { + notice.hide(); + new Notice('Install failed: ' + (e.stderr || e.message || e)); + button.setButtonText('Retry'); + button.setDisabled(false); + } }); + }); + } - // Model selection - if (this.plugin.settings.vector_db_mode === 'local') { - new Setting(containerEl) - .setName('Model') - .addDropdown(dropdown => { - dropdown.addOption('BAAI/bge-small-en-v1.5', 'bge-small (384d, 130MB)'); - dropdown.addOption('sentence-transformers/all-MiniLM-L6-v2', 'MiniLM (384d, 80MB)'); - dropdown.addOption('BAAI/bge-base-en-v1.5', 'bge-base (768d, 440MB)'); - dropdown.setValue(this.plugin.settings.vector_db_model) - .onChange(value => { - this.plugin.settings.vector_db_model = value; - this.plugin.saveSettings(); - }); + _renderVectorConfig(containerEl) { + const statusEl = containerEl.createEl('div'); + statusEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; + statusEl.setText('Loading...'); + const vp = this.app.vault.adapter.basePath; + const pyResult = resolvePythonExecutable(vp, this.plugin.settings); + const pythonPath = pyResult.path; + if (pythonPath) { + this._execEmbedStatus(pythonPath, vp, statusEl); + } + + new Setting(containerEl) + .setName('Embedding Mode') + .addDropdown(dropdown => { + dropdown.addOption('local', 'Local (free, CPU)'); + dropdown.addOption('api', 'API (OpenAI, paid)'); + dropdown.setValue(this.plugin.settings.vector_db_mode) + .onChange(value => { + this.plugin.settings.vector_db_mode = value; + this.plugin.saveSettings(); + this.display(); + }); + }); + + if (this.plugin.settings.vector_db_mode === 'local') { + new Setting(containerEl) + .setName('Model') + .addDropdown(dropdown => { + dropdown.addOption('BAAI/bge-small-en-v1.5', 'bge-small (384d, 130MB)'); + dropdown.addOption('sentence-transformers/all-MiniLM-L6-v2', 'MiniLM (384d, 80MB)'); + dropdown.addOption('BAAI/bge-base-en-v1.5', 'bge-base (768d, 440MB)'); + 
dropdown.setValue(this.plugin.settings.vector_db_model) + .onChange(value => { + this.plugin.settings.vector_db_model = value; + this.plugin.saveSettings(); }); - } + }); + } - // API key - if (this.plugin.settings.vector_db_mode === 'api') { - new Setting(containerEl) - .setName('OpenAI API Key') - .addText(text => { - text.setPlaceholder('sk-...') - .setValue(this.plugin.settings.vector_db_api_key) - .onChange(value => { - this.plugin.settings.vector_db_api_key = value; - this.plugin.saveSettings(); - }); + if (this.plugin.settings.vector_db_mode === 'api') { + new Setting(containerEl) + .setName('OpenAI API Key') + .addText(text => { + text.setPlaceholder('sk-...') + .setValue(this.plugin.settings.vector_db_api_key || '') + .onChange(value => { + this.plugin.settings.vector_db_api_key = value; + this.plugin.saveSettings(); }); - } - } + }); } } From 9b3e844eb8cf8ce21aa44ccef93317928874f1cb Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 23:34:24 +0800 Subject: [PATCH 039/132] chore: add pre-commit JS syntax hook + stage missing vector CLI files --- ...6-05-12-plugin-settings-redesign-REVIEW.md | 205 ++++++++++++++++ paperforge/cli.py | 25 ++ paperforge/commands/__init__.py | 2 + paperforge/commands/embed.py | 85 +++++++ paperforge/commands/retrieve.py | 55 +++++ paperforge/memory/chunker.py | 102 ++++++++ paperforge/memory/vector_db.py | 221 ++++++++++++++++++ 7 files changed, 695 insertions(+) create mode 100644 docs/superpowers/specs/2026-05-12-plugin-settings-redesign-REVIEW.md create mode 100644 paperforge/commands/embed.py create mode 100644 paperforge/commands/retrieve.py create mode 100644 paperforge/memory/chunker.py create mode 100644 paperforge/memory/vector_db.py diff --git a/docs/superpowers/specs/2026-05-12-plugin-settings-redesign-REVIEW.md b/docs/superpowers/specs/2026-05-12-plugin-settings-redesign-REVIEW.md new file mode 100644 index 0000000..efda2f7 --- /dev/null +++ 
b/docs/superpowers/specs/2026-05-12-plugin-settings-redesign-REVIEW.md @@ -0,0 +1,205 @@ +--- +phase: settings-redesign-spec-review +reviewed: 2026-05-12T12:00:00Z +depth: deep +files_reviewed: 5 +files_reviewed_list: + - docs/superpowers/specs/2026-05-12-plugin-settings-redesign.md + - paperforge/plugin/main.js + - paperforge/services/skill_deploy.py + - paperforge/skills/literature-qa/SKILL.md + - paperforge/skills/literature-logging/SKILL.md +findings: + critical: 1 + warning: 3 + info: 3 + total: 7 +status: issues_found +--- + +# Spec Review: Plugin Settings Redesign + +**Reviewed:** 2026-05-12 +**Depth:** deep (cross-file analysis across plugin codebase, skill deploy, SKILL.md frontmatter) +**Files Reviewed:** 5 +**Status:** ISSUES_FOUND — one BLOCKER must be resolved before implementation + +## Summary + +Cross-referenced the proposed 2-tab settings redesign against the current PaperForge plugin codebase (`paperforge/plugin/main.js`), the `skill_deploy.py` AGENT_SKILL_DIRS mapping, and the two existing SKILL.md files. The spec's architecture (Claudian tab pattern, DOM-based tab switching, `disable-model-invocation` toggle) is well-reasoned and compatible. However, one data persistence issue is a BLOCKER, and several gaps/ambiguities need resolution before implementation proceeds. + +--- + +## Critical Issues + +### CR-01: `saveSettings()` will silently discard all new `data.json` keys + +**File:** `paperforge/plugin/main.js:3534-3542` +**Issue:** The spec proposes storing new feature-toggle data in Obsidian's plugin `data.json` under keys like `features`, `vector_db_mode`, `vector_db_model`, `vector_db_api_key`, and `frozen_skills`. 
However, the current `saveSettings()` method explicitly filters out any key not present in `DEFAULT_SETTINGS`: + +```js +async saveSettings() { + // Only persist non-path settings to plugin data.json + const dataToSave = {}; + for (const key of Object.keys(DEFAULT_SETTINGS)) { // ← whitelist filter + if (key in this.settings) { + dataToSave[key] = this.settings[key]; + } + } + await this.saveData(dataToSave); +} +``` + +`DEFAULT_SETTINGS` (line 547-556) currently contains only: +- `vault_path`, `setup_complete`, `auto_update`, `agent_platform`, `language`, `paddleocr_api_key`, `zotero_data_dir`, `python_path` + +Any new key (`features`, `vector_db_mode`, `frozen_skills`, etc.) will be **silently discarded** on every save. Toggling a feature, changing a vector DB mode, or freezing a skill would appear to work until the user re-opens settings — at which point `loadData()` returns the stale (or default) values. + +**Fix:** One of two approaches: + +**Option A — Extend DEFAULT_SETTINGS (simpler, less risky):** +```js +const DEFAULT_SETTINGS = { + vault_path: '', + setup_complete: false, + auto_update: true, + agent_platform: 'opencode', + language: '', + paddleocr_api_key: '', + zotero_data_dir: '', + python_path: '', + // NEW: Feature toggles + features: { + fts_search: true, + agent_context: true, + reading_log: true, + vector_db: false, + }, + vector_db_mode: 'local', + vector_db_model: 'all-MiniLM-L6-v2', + vector_db_api_key: '', + frozen_skills: {}, +}; +``` + +**Option B — Change save logic to whitelist exclusions rather than inclusions:** +```js +// Persist everything except internal/temporary fields +const EXCLUDE_KEYS = new Set(['_python_path_stale', '_saveTimeout', '_pfConfig']); +const dataToSave = {}; +for (const key of Object.keys(this.settings)) { + if (!EXCLUDE_KEYS.has(key) && typeof this.settings[key] !== 'function') { + dataToSave[key] = this.settings[key]; + } +} +``` + +Option A is recommended as it preserves the defensive posture of the existing 
code. + +--- + +## Warnings + +### WR-01: `source` field missing — system skills will be mis-categorized as user skills + +**File:** `paperforge/skills/literature-qa/SKILL.md`, `paperforge/skills/literature-logging/SKILL.md` +**Issue:** The spec uses `source: paperforge` frontmatter to identify system-managed skills (with update/freeze controls) vs user skills (`source: user` or no `source` field → toggle only). Neither existing SKILL.md has a `source` field: + +```yaml +# literature-qa/SKILL.md (current) +name: literature-qa +description: > + 学术文献库操作:精读、问答、检索、批量阅读... + +# literature-logging/SKILL.md (current) +name: literature-logging +description: > + Literature reading and working log management... +``` + +Per the spec's rules: skills without `source` are treated as **user** skills (toggle only, no update button). This means the two PaperForge system skills would show up without update/freeze controls on first install — users would need the implementation to retroactively add `source: paperforge` to detect them correctly. + +**Fix:** Add `source: paperforge` to both SKILL.md files as part of this implementation, and include it in the `deploy_skills()` copytree operation. Also add a `version` field (currently absent from both) since the spec expects it for GitHub semver comparison. + +```yaml +# Proposed addition to both SKILL.md frontmatter blocks +source: paperforge +version: 1.5.5 +``` + +### WR-02: Feature toggle enforcement is missing — no code that reads `features.*` to gate CLI commands + +**File:** Spec section "Section 2: Feature Toggles" +**Issue:** The spec states "When a feature is disabled, the corresponding CLI command returns a clear error message." 
However, the spec only covers the **settings UI** side — there is no corresponding mechanism described for the Python CLI (`cli.py`) or workers to read `features.*` from `data.json` (which lives in the vault's `.obsidian/plugins/paperforge/` directory, inaccessible at runtime unless the plugin passes the values through `paperforge.json` or an env var). + +Either: +- The plugin needs to write toggles into `paperforge.json` so the Python runtime can read them, OR +- The plugin needs to pass toggles as CLI flags/arguments when invoking commands, OR +- The enforcement lives only in the plugin's command palette (never calls CLI for disabled features) + +**Fix:** Add explicit documentation of the enforcement mechanism. Recommended: the plugin writes a `feature_toggles` block to `paperforge.json` during `saveSettings()`, mirroring the existing `vault_config` block pattern (see `savePaperforgeJson()` at line 3455). This way both plugin and Python runtime have a single source of truth. + +### WR-03: `_debouncedSave()` calls `saveSettings()` — both save paths need updating + +**File:** `paperforge/plugin/main.js:2573-2576` +**Issue:** The settings tab has two save pathways: +1. Direct calls: `this.plugin.saveSettings()` (line 2268 in the Python path onChange handler) +2. Debounced calls: `this._debouncedSave()` (lines 2214, 2223) which calls `this.plugin.saveSettings()` after 500ms + +Both flow through the same `saveSettings()` method. If CR-01 is fixed (extending DEFAULT_SETTINGS), this is not an additional bug — but it means **any new Setting added to the features tab must use the same save mechanism**. The spec doesn't mention this constraint. + +**Fix:** Document in implementation notes that all new toggle handlers should call `this._debouncedSave()` (for inputs) or `this.plugin.saveSettings()` (for immediate actions). 
For the skill toggle that writes to SKILL.md frontmatter (not data.json), a separate write path is needed — this is handled correctly by the spec but should be called out. + +--- + +## Info + +### IN-01: JSON key nesting inconsistency between architecture diagram and data storage section + +**File:** `docs/superpowers/specs/2026-05-12-plugin-settings-redesign.md:29-30 vs 170-185` +**Issue:** The architecture diagram nests vector DB config under a `向量数据库` section with `开关`, `模式`, `本地`, `API` as sub-items. The JSON storage section places `features.vector_db` (the master toggle) nested under `features`, but places `vector_db_mode`, `vector_db_model`, and `vector_db_api_key` at the **top level** of data.json — not under `features.vector_db.*`. This is structurally valid but the flat/grouped inconsistency between the spec's labeled options and the flat JSON may cause confusion during implementation. + +```json +// Spec proposes: +{ + "features": { "vector_db": false }, // master toggle nested + "vector_db_mode": "local", // implementation detail at top level + "vector_db_model": "...", // ... + "vector_db_api_key": "" // ... +} +``` + +**Fix:** Consider either fully nesting (`features.vector_db.enabled`, `features.vector_db.mode`, etc.) or fully flattening (`features_vector_db`, `features_vector_db_mode`, etc.). The current mixed approach works but adds mental overhead. The nested approach is cleaner for future feature additions. + +### IN-02: Vector DB panel gaps — model detection and error handling unspecified + +**File:** `docs/superpowers/specs/2026-05-12-plugin-settings-redesign.md:161-164` +**Issue:** The vector DB panel design leaves several implementation details ambiguous: + +1. **Model installation state detection**: The status badge `● 已安装 / ○ 未安装` has no specified detection logic. `sentence-transformers` models download on first use (triggered by `SentenceTransformer('all-MiniLM-L6-v2')`), not by a distinct install step. 
How does the UI know the model is installed? Checking for cached files in `~/.cache/torch/sentence_transformers/`? Running a probe import? + +2. **Installation is async but not cancellable**: "`pip install` is async — show progress bar" — but what happens if the user closes Obsidian or switches vaults mid-install? Is there a cancel mechanism? + +3. **No network error handling**: What does the UI show if `pip install` fails due to network issues, disk space, or permissions? + +**Fix:** Add implementation details for each of these edge cases to the spec, or document them as deferred design decisions. + +### IN-03: Tab state is DOM-preserved between switches but NOT across re-opens + +**File:** Spec section "Tab Implementation" vs `paperforge/plugin/main.js:2206-2208` +**Issue:** The spec correctly follows the Claudian pattern (all tabs exist in DOM, CSS toggles visibility). This preserves form field state when switching between 安装 and 功能 tabs within a single settings session. However, Obsidian calls `display()` on every settings tab open, which runs `containerEl.empty()` (line 2208) and rebuilds the entire UI from scratch. This is standard Obsidian behavior, but means: +- Switching tabs: state preserved (correct) +- Closing and reopening settings: state lost (standard, acceptable) +- Running "Sync Runtime" → calls `this.display()` (line 2553, 2562): **entire settings rebuilt, active tab resets to default** + +The Sync Runtime action at line 2553/2562 explicitly calls `this.display()` which would reset any partially filled forms or the active tab selection back to default. 
This shouldn't block the spec, but should be noted: the sync runtime action should either: +- Preserve `this.activeTab` before calling `this.display()`, or +- Not call `this.display()` at all (re-render only the runtime health section) + +**Fix:** Add a note to the implementation plan: `this.display()` reset is acceptable for a settings reopen, but the sync runtime flow should preserve `this.activeTab` across the re-render. + +--- + +_Reviewed: 2026-05-12T12:00:00Z_ +_Reviewer: VT-OS/OPENCODE (gsd-code-reviewer)_ +_Depth: deep_ diff --git a/paperforge/cli.py b/paperforge/cli.py index fa3f242..9adef5d 100644 --- a/paperforge/cli.py +++ b/paperforge/cli.py @@ -258,6 +258,21 @@ def build_parser() -> argparse.ArgumentParser: p_dash = sub.add_parser("dashboard", help="Aggregated stats and permissions for the plugin dashboard") p_dash.add_argument("--json", action="store_true", help="Output as PFResult JSON") + # Vector DB + p_embed = sub.add_parser("embed", help="Vector embedding operations") + p_embed_sp = p_embed.add_subparsers(dest="embed_subcommand", required=True) + p_embed_build = p_embed_sp.add_parser("build", help="Build vector index from OCR fulltext") + p_embed_build.add_argument("--json", action="store_true") + p_embed_build.add_argument("--force", action="store_true") + p_embed_status = p_embed_sp.add_parser("status", help="Check vector DB status") + p_embed_status.add_argument("--json", action="store_true") + + p_retrieve = sub.add_parser("retrieve", help="Semantic search across OCR fulltext") + p_retrieve.add_argument("query", help="Search query") + p_retrieve.add_argument("--json", action="store_true") + p_retrieve.add_argument("--limit", type=int, default=5) + p_retrieve.add_argument("--expand", action="store_true", default=True) + # Memory Layer commands p_memory = sub.add_parser("memory", help="Manage the Memory Layer") p_memory_sp = p_memory.add_subparsers(dest="memory_subcommand", required=True) @@ -514,6 +529,16 @@ def main(argv: list[str] | None = 
None) -> int: return run(args) + if args.command == "embed": + from paperforge.commands.embed import run + + return run(args) + + if args.command == "retrieve": + from paperforge.commands.retrieve import run + + return run(args) + if args.command == "paper-status": from paperforge.commands.paper_status import run diff --git a/paperforge/commands/__init__.py b/paperforge/commands/__init__.py index cdaac04..c36887b 100644 --- a/paperforge/commands/__init__.py +++ b/paperforge/commands/__init__.py @@ -11,6 +11,8 @@ "dashboard": "paperforge.commands.dashboard", "finalize": "paperforge.commands.finalize", "memory": "paperforge.commands.memory", + "embed": "paperforge.commands.embed", + "retrieve": "paperforge.commands.retrieve", "paper-status": "paperforge.commands.paper_status", "agent-context": "paperforge.commands.agent_context", } diff --git a/paperforge/commands/embed.py b/paperforge/commands/embed.py new file mode 100644 index 0000000..d90c4b2 --- /dev/null +++ b/paperforge/commands/embed.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +from paperforge.core.errors import ErrorCode +from paperforge.core.result import PFError, PFResult +from paperforge.memory.chunker import chunk_fulltext +from paperforge.memory.vector_db import ( + delete_paper_vectors, + embed_paper, + get_embed_status, + get_vector_db_path, +) +from paperforge.worker.asset_index import read_index +from paperforge import __version__ as PF_VERSION + + +def run(args: argparse.Namespace) -> int: + vault = args.vault_path + sub = getattr(args, "embed_subcommand", "build") + + if sub == "status": + status = get_embed_status(vault) + result = PFResult(ok=True, command="embed status", version=PF_VERSION, data=status) + if args.json: + print(result.to_json()) + else: + for k, v in status.items(): + print(f" {k}: {v}") + return 0 + + # Build + envelope = read_index(vault) + if not envelope: + result = PFResult(ok=False, command="embed 
build", version=PF_VERSION, + error=PFError(code=ErrorCode.PATH_NOT_FOUND, + message="Canonical index not found. Run paperforge sync first.")) + print(result.to_json() if args.json else result.error.message, file=sys.stderr if not args.json else sys.stdout) + return 1 + + items = envelope if isinstance(envelope, list) else envelope.get("items", []) + done_papers = [e for e in items if e.get("ocr_status") == "done"] + + if args.force: + db_path = get_vector_db_path(vault) + if db_path.exists(): + import shutil + shutil.rmtree(str(db_path), ignore_errors=True) + + papers_embedded = 0 + chunks_embedded = 0 + for entry in done_papers: + key = entry.get("zotero_key") + fulltext_rel = entry.get("fulltext_path", "") + if not fulltext_rel: + continue + fulltext_path = vault / fulltext_rel + chunks = chunk_fulltext(fulltext_path) + if not chunks: + continue + try: + delete_paper_vectors(vault, key) + n = embed_paper(vault, key, chunks) + chunks_embedded += n + papers_embedded += 1 + except Exception as e: + result = PFResult(ok=False, command="embed build", version=PF_VERSION, + error=PFError(code=ErrorCode.INTERNAL_ERROR, message=str(e))) + print(result.to_json() if args.json else result.error.message, file=sys.stderr if not args.json else sys.stdout) + return 1 + + data = { + "papers_embedded": papers_embedded, + "chunks_embedded": chunks_embedded, + "model": get_embed_status(vault)["model"], + "mode": get_embed_status(vault)["mode"], + } + result = PFResult(ok=True, command="embed build", version=PF_VERSION, data=data) + if args.json: + print(result.to_json()) + else: + print(f"Embedded {papers_embedded} papers ({chunks_embedded} chunks)") + return 0 diff --git a/paperforge/commands/retrieve.py b/paperforge/commands/retrieve.py new file mode 100644 index 0000000..a457db1 --- /dev/null +++ b/paperforge/commands/retrieve.py @@ -0,0 +1,55 @@ +from __future__ import annotations + +import argparse +import sys +import json + +from paperforge.core.errors import ErrorCode +from 
paperforge.core.result import PFError, PFResult +from paperforge.memory.db import get_connection, get_memory_db_path +from paperforge.memory.vector_db import retrieve_chunks +from paperforge import __version__ as PF_VERSION + + +def run(args: argparse.Namespace) -> int: + vault = args.vault_path + query = args.query + limit = args.limit or 5 + + try: + chunks = retrieve_chunks(vault, query, limit=limit, expand=args.expand) + except Exception as e: + result = PFResult(ok=False, command="retrieve", version=PF_VERSION, + error=PFError(code=ErrorCode.INTERNAL_ERROR, message=str(e))) + print(result.to_json() if args.json else result.error.message, file=sys.stderr if not args.json else sys.stdout) + return 1 + + # Enrich with paper metadata from memory DB + if chunks: + db_path = get_memory_db_path(vault) + if db_path.exists(): + conn = get_connection(db_path, read_only=True) + try: + for c in chunks: + row = conn.execute( + "SELECT citation_key, title, year, first_author FROM papers WHERE zotero_key=?", + (c["paper_id"],) + ).fetchone() + if row: + c["citation_key"] = row["citation_key"] + c["title"] = row["title"] + c["year"] = row["year"] + c["first_author"] = row["first_author"] + finally: + conn.close() + + data = {"query": query, "chunks": chunks, "count": len(chunks)} + result = PFResult(ok=True, command="retrieve", version=PF_VERSION, data=data) + + if args.json: + print(result.to_json()) + else: + print(f"{len(chunks)} chunks for: {query}") + for c in chunks: + print(f" [{c.get('section','')}] {c.get('citation_key','')} p{c.get('page_number',0)}: {c['chunk_text'][:80]}...") + return 0 diff --git a/paperforge/memory/chunker.py b/paperforge/memory/chunker.py new file mode 100644 index 0000000..7011fa4 --- /dev/null +++ b/paperforge/memory/chunker.py @@ -0,0 +1,102 @@ +from __future__ import annotations + +import re +from pathlib import Path + +# Section detection keywords (case-insensitive, must appear as short standalone line) +SECTION_PATTERNS = [ + 
re.compile(r'^\s*(introduction|methods|materials|results|discussion|conclusion|abstract|background|references|supplementary|acknowledgments?)\s*$', re.IGNORECASE), + re.compile(r'^\s*(figure\s*\d+|fig\.?\s*\d+|table\s*\d+)\s*$', re.IGNORECASE), +] + +def _detect_section(line: str) -> str: + """Try to identify a section title from a line.""" + stripped = line.strip() + if len(stripped) > 80: + return "" + for pat in SECTION_PATTERNS: + m = pat.match(stripped) + if m: + return m.group(0) + # Heuristic: ALL CAPS short line, no period + if stripped.isupper() and len(stripped) > 2 and '.' not in stripped: + return stripped + # Short line, no period, surrounded by blank lines (checked by caller) + if len(stripped) < 80 and '.' not in stripped[-5:]: + return stripped + return "" + + +def _clean_text(text: str) -> str: + """Remove image links and clean text for embedding.""" + # Remove standalone image links: ![[path]] + text = re.sub(r'^!\[\[.*\]\]\s*$', '', text, flags=re.MULTILINE) + # Replace inline images with placeholder + text = re.sub(r'!\[\[.*?\]\]', '[Figure]', text) + # Collapse multiple blank lines + text = re.sub(r'\n{3,}', '\n\n', text) + return text.strip() + + +def chunk_fulltext(fulltext_path: Path) -> list[dict]: + """Chunk a fulltext.md into embeddable segments. + + Returns list of dicts with: text, section, page_number, chunk_index, token_estimate. + """ + if not fulltext_path.exists(): + return [] + + text = _clean_text(fulltext_path.read_text(encoding="utf-8")) + + # Split by page markers + pages = re.split(r'', text) + # pages[0] = before first marker, pages[1] = page num, pages[2] = content, pages[3] = page num, ... 
+ + current_section = "Text" + parts = [] + + if len(pages) > 1 and not pages[1].strip().isdigit(): + # No page marker found, treat whole text as one page + parts = [(1, text)] + else: + for j in range(1, len(pages), 2): + if j + 1 < len(pages): + try: + page_num = int(pages[j].strip()) + page_content = pages[j + 1] + parts.append((page_num, page_content)) + except ValueError: + continue + + if not parts and text.strip(): + parts = [(1, text)] + + chunks = [] + chunk_index = 0 + for page_num, page_text in parts: + # Split page into paragraphs by double newlines + paragraphs = [p.strip() for p in re.split(r'\n\s*\n', page_text) if p.strip()] + + # Detect section headers + for para in paragraphs: + section = _detect_section(para) + if section: + current_section = section + + # Group 2-3 paragraphs per chunk with 1-paragraph overlap + i = 0 + while i < len(paragraphs): + chunk_paras = paragraphs[i:i+3] + chunk_text = "\n\n".join(chunk_paras) + token_estimate = len(chunk_text.split()) # rough: 1 token ≈ 1 word + chunks.append({ + "text": chunk_text, + "section": current_section, + "page_number": page_num, + "chunk_index": chunk_index, + "token_estimate": token_estimate, + }) + chunk_index += 1 + i += max(1, len(chunk_paras) - 1) # advance but leave 1 overlap + + return chunks diff --git a/paperforge/memory/vector_db.py b/paperforge/memory/vector_db.py new file mode 100644 index 0000000..58221aa --- /dev/null +++ b/paperforge/memory/vector_db.py @@ -0,0 +1,221 @@ +from __future__ import annotations + +import json +import logging +from pathlib import Path + +logger = logging.getLogger(__name__) + +# Lazy imports to avoid requiring chromadb unless actually used +_chroma = None +_ST = None + +def _get_chroma(): + global _chroma + if _chroma is None: + import chromadb + _chroma = chromadb + return _chroma + +def _get_st(): + global _ST + if _ST is None: + from sentence_transformers import SentenceTransformer + _ST = SentenceTransformer + return _ST + + +def 
_read_plugin_settings(vault: Path) -> dict: + """Read plugin data.json for vector_db settings.""" + data_path = vault / ".obsidian" / "plugins" / "paperforge" / "data.json" + if data_path.exists(): + return json.loads(data_path.read_text(encoding="utf-8")) + return {} + + +def get_vector_db_path(vault: Path) -> Path: + """Return the ChromaDB persistence directory.""" + from paperforge.config import paperforge_paths + paths = paperforge_paths(vault) + return (paths.get("memory_db", paths.get("index", vault / "System" / "PaperForge"))).parent / "vectors" + + +def get_collection(vault: Path): + """Get or create the ChromaDB collection for paperforge.""" + chroma = _get_chroma() + db_path = get_vector_db_path(vault) + db_path.mkdir(parents=True, exist_ok=True) + client = chroma.PersistentClient(path=str(db_path)) + # Delete and recreate if schema changed + try: + return client.get_or_create_collection( + name="paperforge_fulltext", + metadata={"hnsw:space": "cosine"}, + ) + except Exception: + client.delete_collection("paperforge_fulltext") + return client.create_collection( + name="paperforge_fulltext", + metadata={"hnsw:space": "cosine"}, + ) + + +def get_embedding_model(vault: Path): + """Load the embedding model based on plugin settings or default.""" + settings = _read_plugin_settings(vault) + mode = settings.get("vector_db_mode", "local") + + if mode == "api": + return None # API mode — embedding done externally + + model_name = settings.get("vector_db_model", "BAAI/bge-small-en-v1.5") + ST = _get_st() + logger.info("Loading embedding model: %s", model_name) + return ST(model_name) + + +def embed_paper(vault: Path, zotero_key: str, chunks: list[dict]) -> int: + """Embed chunks for one paper and insert into ChromaDB. 
Returns count.""" + collection = get_collection(vault) + model = get_embedding_model(vault) + + if model is None: + # API mode + return _embed_paper_api(vault, zotero_key, chunks, collection) + + # Local mode + texts = [c["text"] for c in chunks] + ids = [f"{zotero_key}_{c['chunk_index']}" for c in chunks] + metadatas = [ + { + "paper_id": zotero_key, + "section": c["section"], + "page_number": c["page_number"], + "chunk_index": c["chunk_index"], + "token_estimate": c["token_estimate"], + } + for c in chunks + ] + + embeddings = model.encode(texts, show_progress_bar=False).tolist() + + collection.add( + ids=ids, + embeddings=embeddings, + documents=texts, + metadatas=metadatas, + ) + return len(chunks) + + +def _embed_paper_api(vault, zotero_key, chunks, collection) -> int: + """Embed using OpenAI API.""" + settings = _read_plugin_settings(vault) + api_key = settings.get("vector_db_api_key", "") + if not api_key: + env_file = vault / ".env" + if env_file.exists(): + for line in env_file.read_text(encoding="utf-8").splitlines(): + if line.startswith("OPENAI_API_KEY="): + api_key = line.split("=", 1)[1].strip().strip('"').strip("'") + if not api_key: + raise ValueError("No API key configured for vector DB") + + from openai import OpenAI + client = OpenAI(api_key=api_key) + + texts = [c["text"] for c in chunks] + ids = [f"{zotero_key}_{c['chunk_index']}" for c in chunks] + metadatas = [ + {"paper_id": zotero_key, "section": c["section"], + "page_number": c["page_number"], "chunk_index": c["chunk_index"], + "token_estimate": c["token_estimate"]} + for c in chunks + ] + + response = client.embeddings.create(model="text-embedding-3-small", input=texts) + embeddings = [e.embedding for e in response.data] + + collection.add(ids=ids, embeddings=embeddings, documents=texts, metadatas=metadatas) + return len(chunks) + + +def delete_paper_vectors(vault: Path, zotero_key: str) -> int: + """Delete all chunks for a paper from ChromaDB.""" + collection = get_collection(vault) + 
try: + results = collection.get(where={"paper_id": zotero_key}) + ids = results.get("ids", []) + if ids: + collection.delete(ids=ids) + return len(ids) + except Exception: + return 0 + + +def retrieve_chunks(vault: Path, query: str, limit: int = 5, expand: bool = True) -> list[dict]: + """Search for chunks matching the query. Returns list with adjacent context.""" + collection = get_collection(vault) + model = get_embedding_model(vault) + + if model is None: + # API mode + settings = _read_plugin_settings(vault) + api_key = settings.get("vector_db_api_key", "") + env_file = vault / ".env" + if not api_key and env_file.exists(): + for line in env_file.read_text(encoding="utf-8").splitlines(): + if line.startswith("OPENAI_API_KEY="): + api_key = line.split("=", 1)[1].strip().strip('"').strip("'") + if not api_key: + raise ValueError("No API key configured for vector DB") + from openai import OpenAI + client = OpenAI(api_key=api_key) + response = client.embeddings.create(model="text-embedding-3-small", input=query) + query_embedding = response.data[0].embedding + else: + query_embedding = model.encode(query).tolist() + + results = collection.query( + query_embeddings=[query_embedding], + n_results=limit * 3 if expand else limit, + include=["documents", "metadatas", "distances"], + ) + + chunks = [] + for i, (doc, meta, dist) in enumerate(zip( + results["documents"][0], + results["metadatas"][0], + results["distances"][0], + )): + chunks.append({ + "paper_id": meta["paper_id"], + "section": meta.get("section", "Text"), + "page_number": meta.get("page_number", 1), + "chunk_index": meta.get("chunk_index", 0), + "chunk_text": doc, + "score": round(1.0 - dist, 4), # cosine distance → similarity + }) + + return chunks + + +def get_embed_status(vault: Path) -> dict: + """Get vector DB status.""" + db_path = get_vector_db_path(vault) + exists = db_path.exists() + chunk_count = 0 + if exists: + try: + collection = get_collection(vault) + chunk_count = collection.count() + 
except Exception: + pass + + settings = _read_plugin_settings(vault) + return { + "db_exists": exists, + "chunk_count": chunk_count, + "model": settings.get("vector_db_model", "BAAI/bge-small-en-v1.5"), + "mode": settings.get("vector_db_mode", "local"), + } From eb88b829a7ce9cb76dfb10f9f2e32db3804bd1fa Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 23:45:24 +0800 Subject: [PATCH 040/132] fix(plugin): cache memory/vector status, add manual refresh, skip re-check on tab switch --- paperforge/plugin/main.js | 136 +++++++++++++++++++++++++------------- 1 file changed, 90 insertions(+), 46 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index c4da5c0..c628cae 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2214,6 +2214,9 @@ class PaperForgeSettingTab extends PluginSettingTab { this.plugin = plugin; this._saveTimeout = null; this._pfConfig = null; // cached paperforge.json config + this._memoryStatusText = null; // null = not checked yet, string = cached result + this._vectorDepsOk = null; // null = not checked, bool = cached + this._embedStatusText = null; this.activeTab = 'setup'; } @@ -2481,20 +2484,20 @@ class PaperForgeSettingTab extends PluginSettingTab { } } - _execMemoryStatus(pythonPath, vp, statusEl) { + _execMemoryStatus(pythonPath, vp, callback) { const { exec } = require('child_process'); exec(`"${pythonPath}" -m paperforge --vault "${vp}" memory status --json`, { encoding: 'utf-8', timeout: 15000 }, (err, stdout) => { - if (err) { statusEl.setText('Status unavailable'); return; } + if (err) { callback('Status unavailable'); return; } try { const data = JSON.parse(stdout); if (data.ok) { const s = data.data; const freshness = s.fresh ? 'fresh' : 'stale'; - statusEl.setText(`Papers: ${s.paper_count_db} | ${freshness}${s.needs_rebuild ? ' - needs rebuild' : ''}`); + callback(`Papers: ${s.paper_count_db} | ${freshness}${s.needs_rebuild ? 
' - needs rebuild' : ''}`); } else { - statusEl.setText('DB not found. Run paperforge memory build.'); + callback('DB not found. Run paperforge memory build.'); } - } catch(e) { statusEl.setText('Could not parse status.'); } + } catch(e) { callback('Could not parse status.'); } }); } @@ -2505,19 +2508,35 @@ class PaperForgeSettingTab extends PluginSettingTab { }); } - _execEmbedStatus(pythonPath, vp, statusEl) { + _execEmbedStatus(pythonPath, vp, callback) { const { exec } = require('child_process'); exec(`"${pythonPath}" -m paperforge --vault "${vp}" embed status --json`, { encoding: 'utf-8', timeout: 15000 }, (err, stdout) => { - if (err) { statusEl.setText('Status unavailable'); return; } + if (err) { callback('Status unavailable'); return; } try { const data = JSON.parse(stdout); if (data.ok) { - statusEl.setText(`Chunks: ${data.data.chunk_count} | ${data.data.model} | ${data.data.mode}`); + callback(`Chunks: ${data.data.chunk_count} | ${data.data.model} | ${data.data.mode}`); + } else { + callback('Could not parse status.'); } - } catch(e) { statusEl.setText('Could not parse status.'); } + } catch(e) { callback('Could not parse status.'); } }); } + _renderMemoryStatusText(el, text) { + const spans = el.querySelectorAll('span'); + spans.forEach(s => s.remove()); + const textEl = el.createEl('span', { text: text }); + if (!el.querySelector('.paperforge-refresh-btn')) { + const refreshBtn = el.createEl('button', { cls: 'paperforge-refresh-btn', text: '\u21BB' }); + refreshBtn.style.cssText = 'margin-left:auto; border:none; background:none; cursor:pointer; font-size:16px; padding:0 4px;'; + refreshBtn.onclick = () => { + this._memoryStatusText = null; + this.display(); + }; + } + } + _resolvePythonAsync(callback) { const { exec } = require('child_process'); const vp = this.app.vault.adapter.basePath; @@ -2582,20 +2601,25 @@ class PaperForgeSettingTab extends PluginSettingTab { }); }); - // Show memory status when enabled — async to not block + // Show memory 
status when enabled — render from cache if available if (this.plugin.settings.features.memory_layer) { - const statusEl = containerEl.createEl('div', { cls: 'paperforge-memory-status' }); - statusEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; - statusEl.setText('Checking...'); + const statusRow = containerEl.createEl('div', { cls: 'paperforge-memory-status' }); + statusRow.style.cssText = 'display:flex; align-items:center; padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; + const vp = this.app.vault.adapter.basePath; - this._resolvePythonAsync(pyResult => { - const pythonPath = pyResult.path; - if (pythonPath) { - this._execMemoryStatus(pythonPath, vp, statusEl); - } else { - statusEl.setText('No Python found. Check Installation tab.'); - } - }); + const pyResult = resolvePythonExecutable(vp, this.plugin.settings); + + if (this._memoryStatusText !== null) { + this._renderMemoryStatusText(statusRow, this._memoryStatusText); + } else if (pyResult.path) { + this._renderMemoryStatusText(statusRow, 'Checking...'); + this._execMemoryStatus(pyResult.path, vp, (text) => { + this._memoryStatusText = text; + this._renderMemoryStatusText(statusRow, text); + }); + } else { + this._renderMemoryStatusText(statusRow, 'No Python found.'); + } } // --- Section: Skills --- @@ -2731,34 +2755,46 @@ class PaperForgeSettingTab extends PluginSettingTab { .onChange(value => { this.plugin.settings.features.vector_db = value; this.plugin.saveSettings(); + this._vectorDepsOk = null; + this._embedStatusText = null; this.display(); }); }); if (this.plugin.settings.features.vector_db) { - // Check if dependencies installed — async - const depsEl = containerEl.createEl('div'); - depsEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; - depsEl.setText('Checking dependencies...'); const vp = this.app.vault.adapter.basePath; - 
this._resolvePythonAsync(pyResult => { - const pythonPath = pyResult.path; - if (pythonPath) { - this._execVectorDeps(pythonPath, (ok) => { - if (ok) { - depsEl.remove(); - this._renderVectorConfig(containerEl); - } else { - depsEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:#4a1515; border-radius:4px; color:#ff6b6b;'; - depsEl.setText('Dependencies not installed. Required: chromadb, sentence-transformers.'); - this._renderVectorInstall(containerEl); - } - }); - } else { - depsEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:#4a1515; border-radius:4px; color:#ff6b6b;'; - depsEl.setText('No Python found. Check Installation tab.'); - } - }); + + if (this._vectorDepsOk === true) { + this._renderVectorConfig(containerEl); + } else if (this._vectorDepsOk === false) { + const depsEl = containerEl.createEl('div'); + depsEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:#4a1515; border-radius:4px; color:#ff6b6b;'; + depsEl.setText('Dependencies not installed. Required: chromadb, sentence-transformers.'); + this._renderVectorInstall(containerEl); + } else { + const depsEl = containerEl.createEl('div'); + depsEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; + depsEl.setText('Checking dependencies...'); + this._resolvePythonAsync(pyResult => { + const pythonPath = pyResult.path; + if (pythonPath) { + this._execVectorDeps(pythonPath, (ok) => { + this._vectorDepsOk = ok; + if (ok) { + depsEl.remove(); + this._renderVectorConfig(containerEl); + } else { + depsEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:#4a1515; border-radius:4px; color:#ff6b6b;'; + depsEl.setText('Dependencies not installed. Required: chromadb, sentence-transformers.'); + this._renderVectorInstall(containerEl); + } + }); + } else { + depsEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:#4a1515; border-radius:4px; color:#ff6b6b;'; + depsEl.setText('No Python found. 
Check Installation tab.'); + } + }); + } } } @@ -2807,12 +2843,20 @@ class PaperForgeSettingTab extends PluginSettingTab { _renderVectorConfig(containerEl) { const statusEl = containerEl.createEl('div'); statusEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; - statusEl.setText('Loading...'); const vp = this.app.vault.adapter.basePath; const pyResult = resolvePythonExecutable(vp, this.plugin.settings); const pythonPath = pyResult.path; - if (pythonPath) { - this._execEmbedStatus(pythonPath, vp, statusEl); + + if (this._embedStatusText !== null) { + statusEl.setText(this._embedStatusText); + } else if (pythonPath) { + statusEl.setText('Loading...'); + this._execEmbedStatus(pythonPath, vp, (text) => { + this._embedStatusText = text; + statusEl.setText(text); + }); + } else { + statusEl.setText('No Python found.'); } new Setting(containerEl) From fb3badab4f05bcada5df195ce3d627a98c189cb2 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 23:54:06 +0800 Subject: [PATCH 041/132] =?UTF-8?q?fix(plugin):=20reorder=20features=20tab?= =?UTF-8?q?=20=E2=80=94=20Skills=20first,=20then=20Memory=20Layer=20contai?= =?UTF-8?q?ning=20Vector=20DB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paperforge/plugin/main.js | 79 +++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index c628cae..08f11e4 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2586,42 +2586,6 @@ class PaperForgeSettingTab extends PluginSettingTab { } _renderFeaturesTab(containerEl) { - // --- Section: Memory Layer --- - containerEl.createEl('h3', { text: 'Memory Layer' }); - - new Setting(containerEl) - .setName('Enable Memory Layer') - .setDesc('SQLite index for fast paper lookup, search, and agent context.') - .addToggle(toggle => { - 
toggle.setValue(this.plugin.settings.features.memory_layer) - .onChange(value => { - this.plugin.settings.features.memory_layer = value; - this.plugin.saveSettings(); - this.display(); - }); - }); - - // Show memory status when enabled — render from cache if available - if (this.plugin.settings.features.memory_layer) { - const statusRow = containerEl.createEl('div', { cls: 'paperforge-memory-status' }); - statusRow.style.cssText = 'display:flex; align-items:center; padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; - - const vp = this.app.vault.adapter.basePath; - const pyResult = resolvePythonExecutable(vp, this.plugin.settings); - - if (this._memoryStatusText !== null) { - this._renderMemoryStatusText(statusRow, this._memoryStatusText); - } else if (pyResult.path) { - this._renderMemoryStatusText(statusRow, 'Checking...'); - this._execMemoryStatus(pyResult.path, vp, (text) => { - this._memoryStatusText = text; - this._renderMemoryStatusText(statusRow, text); - }); - } else { - this._renderMemoryStatusText(statusRow, 'No Python found.'); - } - } - // --- Section: Skills --- containerEl.createEl('h3', { text: 'Skills' }); @@ -2658,7 +2622,6 @@ class PaperForgeSettingTab extends PluginSettingTab { .onChange(value => { this.plugin.settings.selected_skill_platform = value; this.plugin.saveSettings(); - // Re-render to show correct platform's skills this.display(); }); }); @@ -2719,7 +2682,7 @@ class PaperForgeSettingTab extends PluginSettingTab { : skill.content.replace(/^(---\r?\n)/, `$1disable-model-invocation: ${newDisabled}\n`); fs.writeFileSync(skill.path, newContent, 'utf-8'); skill.disabled = newDisabled; - skill.content = newContent; // keep in-memory copy in sync + skill.content = newContent; setting.setDesc((skill.desc || 'No description') + (skill.disabled ? 
' (disabled)' : ' (enabled)')); }); }); @@ -2744,8 +2707,44 @@ class PaperForgeSettingTab extends PluginSettingTab { }); } - // --- Section: Vector Database --- - containerEl.createEl('h3', { text: 'Vector Database' }); + // --- Section: Memory Layer --- + containerEl.createEl('h3', { text: 'Memory Layer' }); + + new Setting(containerEl) + .setName('Enable Memory Layer') + .setDesc('SQLite index for fast paper lookup, search, and agent context.') + .addToggle(toggle => { + toggle.setValue(this.plugin.settings.features.memory_layer) + .onChange(value => { + this.plugin.settings.features.memory_layer = value; + this.plugin.saveSettings(); + this.display(); + }); + }); + + // Show memory status when enabled — render from cache if available + if (this.plugin.settings.features.memory_layer) { + const statusRow = containerEl.createEl('div', { cls: 'paperforge-memory-status' }); + statusRow.style.cssText = 'display:flex; align-items:center; padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; + + const vp = this.app.vault.adapter.basePath; + const pyResult = resolvePythonExecutable(vp, this.plugin.settings); + + if (this._memoryStatusText !== null) { + this._renderMemoryStatusText(statusRow, this._memoryStatusText); + } else if (pyResult.path) { + this._renderMemoryStatusText(statusRow, 'Checking...'); + this._execMemoryStatus(pyResult.path, vp, (text) => { + this._memoryStatusText = text; + this._renderMemoryStatusText(statusRow, text); + }); + } else { + this._renderMemoryStatusText(statusRow, 'No Python found.'); + } + } + + // --- Vector Database (within Memory Layer) --- + containerEl.createEl('h4', { text: 'Vector Database' }); new Setting(containerEl) .setName('Enable Vector Retrieval') From 670d3eb5f368c9487ed06837d837ac0b6cd75048 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Tue, 12 May 2026 23:54:36 +0800 Subject: [PATCH 042/132] fix(plugin): rename toggle to Easy Memory Layer with clarifying description --- 
paperforge/plugin/main.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 08f11e4..7b76409 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2711,8 +2711,8 @@ class PaperForgeSettingTab extends PluginSettingTab { containerEl.createEl('h3', { text: 'Memory Layer' }); new Setting(containerEl) - .setName('Enable Memory Layer') - .setDesc('SQLite index for fast paper lookup, search, and agent context.') + .setName('Easy Memory Layer') + .setDesc('轻量 SQLite 文献信息检索(笔记元数据+搜索+状态),不含全文向量库。') .addToggle(toggle => { toggle.setValue(this.plugin.settings.features.memory_layer) .onChange(value => { From b1b1d50129aafb6a8bcb11eb4ec94cbe94c4680a Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 00:46:37 +0800 Subject: [PATCH 043/132] feat(harness): add pf_search.py unified search entry + update bootstrap with memory_layer field --- paperforge/skills/literature-qa/SKILL.md | 29 ++- .../literature-qa/references/paper-search.md | 14 +- .../literature-qa/scripts/pf_bootstrap.py | 24 +++ .../skills/literature-qa/scripts/pf_search.py | 180 ++++++++++++++++++ 4 files changed, 236 insertions(+), 11 deletions(-) create mode 100644 paperforge/skills/literature-qa/scripts/pf_search.py diff --git a/paperforge/skills/literature-qa/SKILL.md b/paperforge/skills/literature-qa/SKILL.md index 166b7ce..c30f9cf 100644 --- a/paperforge/skills/literature-qa/SKILL.md +++ b/paperforge/skills/literature-qa/SKILL.md @@ -97,14 +97,26 @@ Vault: $VAULT 本 Skill 提供两类工具:**确定性命令** 和 **Agent 自查**。必须根据场景选择正确的方式。 +### 搜索入口 — 统一搜索 Harness + +任何搜索需求都用 pf_search.py(自动路由 vector -> FTS5 -> grep): + +``` +python $SKILL_DIR/scripts/pf_search.py --vault $VAULT --query "关键词" +``` + +返回 JSON 结构: +- `engines_used`: 实际使用的引擎列表 (`vector` / `fts5` / `grep`) +- `results`: 论文列表,每篇含 `zotero_key`, `title`, `year`, `source` 等 +- `count`: 结果数 + ### 确定性命令 — 优先使用 -| 场景 | 命令 | 原因 | -| ---------------------- | 
------------------------------------------------------------------------------------------ | ---- | -| 按 key 快速找文件 | `glob("$LIT_DIR/**/.md")` 或用 `Get-ChildItem "$LIT_DIR" -Recurse -Filter ".md"` | 不需要 $PYTHON,最快 | -| 按 key 查完整信息 | `$PYTHON -m paperforge.worker.paper_resolver resolve-key --vault "$VAULT"` | 返回 frontmatter 字段 (analyze, ocr_status 等) | -| 按 DOI 定位论文 | `$PYTHON -m paperforge.worker.paper_resolver resolve-doi "" --vault "$VAULT"` | DOI 无法用文件系统快速匹配 | -| 按字段搜索论文 | `$PYTHON -m paperforge.worker.paper_resolver search --title "..." --author "..." --year ... --domain "..." --vault "$VAULT"` | 结构化搜索,含相关性打分 | +| 场景 | 命令 | +| ---------------------- | ------------------------------------------------------------------------------------------ | +| 按 key 快速找文件 | `glob("$LIT_DIR/**/.md")` 或用 `Get-ChildItem "$LIT_DIR" -Recurse -Filter ".md"` | +| 按 key 查完整信息 | `$PYTHON -m paperforge.worker.paper_resolver resolve-key --vault "$VAULT"` | +| 按 DOI 定位论文 | `$PYTHON -m paperforge.worker.paper_resolver resolve-doi "" --vault "$VAULT"` | | 精读 prepare | `$PYTHON "$SKILL_DIR/scripts/ld_deep.py" prepare --key --vault "$VAULT"` | | 精读 postprocess | `$PYTHON "$SKILL_DIR/scripts/ld_deep.py" postprocess-pass2 --figures --vault "$VAULT"` | | 精读 validate | `$PYTHON "$SKILL_DIR/scripts/ld_deep.py" validate-note --fulltext ` | @@ -116,8 +128,8 @@ Vault: $VAULT | ------------------------ | ----------------------------------------------------------- | | 按关键词模糊搜索全部文献 | 读 `$IDX_PATH` 的 JSON,筛 `title` / `abstract` / `journal` | | 按 collection 筛选 | 读 `$IDX_PATH`,筛 `collection_path` 字段 | -| 读论文全文 | 已找到 `fulltext.md` 路径(glob 或 resolve-key) → 直接 read | -| 读精读笔记 | 已找到 formal note 路径 → read 的 `## 🔍 精读` 区域 | +| 读论文全文 | 已找到 `fulltext.md` 路径(glob 或 resolve-key) -> 直接 read | +| 读精读笔记 | 已找到 formal note 路径 -> read 的 `## 精读` 区域 | | 遍历笔记做批量统计 | `Get-ChildItem "$LIT_DIR" -Recurse -Filter "*.md"` + 读 frontmatter 或 `find "$LIT_DIR" -name "*.md"` | | **禁止的操作** | **根据 vault-knowledge 示例拼接路径、把目录名写死在文件路径里** | 
@@ -154,5 +166,6 @@ literature-qa/ │ └── chart-reading/ └── scripts/ ├── pf_bootstrap.py ← Bootstrap 入口 + ├── pf_search.py ← 统一搜索 Harness └── ld_deep.py ← 精读引擎 ``` diff --git a/paperforge/skills/literature-qa/references/paper-search.md b/paperforge/skills/literature-qa/references/paper-search.md index 57aa634..5b6249a 100644 --- a/paperforge/skills/literature-qa/references/paper-search.md +++ b/paperforge/skills/literature-qa/references/paper-search.md @@ -44,9 +44,17 @@ - **关键词**:标题、作者、年份、期刊、主题词 - **collection 路径**:Zotero 子分类,如 `电刺激软骨修复综述` -### Step 3: 搜索 +### Step 3: 搜索 — 统一 Harness -**优先:Python paper_resolver**(确定性匹配) +任何搜索都用 pf_search.py,自动路由 vector -> FTS5 -> grep: + +``` +python $SKILL_DIR/scripts/pf_search.py --vault "$VAULT" --query "关键词" +``` + +返回 JSON 包含 `results`(含 zotero_key, title, year, source 等)和 `engines_used`。 + +如需结构化高级搜索(特定 domain/author),使用 paper_resolver: ``` $PYTHON -m paperforge.worker.paper_resolver search --title "关键词" --author "Smith" --year 2024 --domain "骨科" --vault "$VAULT" @@ -55,7 +63,7 @@ $PYTHON -m paperforge.worker.paper_resolver search --title "关键词" --author **Fallback:读 formal-library.json** Agent 直接读 `index_path`,在 JSON 中筛选: --`domain` 匹配 +- `domain` 匹配 - `title`/`first_author`/`journal` 包含关键词 ### Step 4: 返回结果 diff --git a/paperforge/skills/literature-qa/scripts/pf_bootstrap.py b/paperforge/skills/literature-qa/scripts/pf_bootstrap.py index 87bd211..ebc41b2 100644 --- a/paperforge/skills/literature-qa/scripts/pf_bootstrap.py +++ b/paperforge/skills/literature-qa/scripts/pf_bootstrap.py @@ -156,6 +156,30 @@ def main(): # --- 6. Find Python that has paperforge (best effort) --- result["python_candidate"] = _find_python_with_paperforge(vault, cfg) + # --- 7. 
Memory layer state --- + memory_layer = {"available": False, "paper_count": 0, "fts_search": False, "vector_search": False} + idx_path = Path(paths["index_path"]) + dc_json = vault / ".obsidian" / "plugins" / "paperforge" / "data.json" + if idx_path.exists(): + try: + with open(idx_path, encoding="utf-8") as f: + data = json.load(f) + items = data.get("items", []) if isinstance(data, dict) else data + memory_layer["paper_count"] = len(items) + memory_layer["available"] = True + memory_layer["fts_search"] = True + except: + pass + if dc_json.exists(): + try: + with open(dc_json, encoding="utf-8") as f: + plugin_data = json.load(f) + vector_enabled = plugin_data.get("features", {}).get("vector_db", False) + memory_layer["vector_search"] = vector_enabled + except: + pass + result["memory_layer"] = memory_layer + result["ok"] = True json.dump(result, sys.stdout, ensure_ascii=False, indent=2) diff --git a/paperforge/skills/literature-qa/scripts/pf_search.py b/paperforge/skills/literature-qa/scripts/pf_search.py new file mode 100644 index 0000000..b40457c --- /dev/null +++ b/paperforge/skills/literature-qa/scripts/pf_search.py @@ -0,0 +1,180 @@ +"""Unified search entry point for agent skills. +Routes: vector search -> FTS5 search -> grep based on what's available. +Always returns same JSON format regardless of backend. 
+ +Usage: + python pf_search.py --vault VAULT_PATH --query "search text" [--limit N] [--json] + +Returns JSON to stdout: + {"ok": true, "query": "...", "engines_used": [...], "results": [...], "count": N} + {"ok": false, "error": "..."} +""" + +from __future__ import annotations +import json +import subprocess +import sys +from pathlib import Path + + +def _find_python(vault: Path) -> str | None: + """Same logic as pf_bootstrap: find python with paperforge installed.""" + dc_json = vault / ".obsidian" / "plugins" / "paperforge" / "data.json" + if dc_json.exists(): + try: + with open(dc_json, encoding="utf-8") as f: + data = json.load(f) + py = data.get("python_path", "") + if py and Path(py).exists(): + return py + except: + pass + + for cand in [ + vault / ".paperforge-test-venv" / "Scripts" / "python.exe", + vault / ".venv" / "Scripts" / "python.exe", + vault / "venv" / "Scripts" / "python.exe", + ]: + if cand.exists(): + return str(cand) + + for cand in ["python", "python3"]: + try: + subprocess.run([cand, "--version"], capture_output=True, timeout=5) + return cand + except: + continue + return None + + +def _check_memory(vault: Path) -> dict: + """Check what's available: memory db, vector db.""" + memory = {"db": False, "vector": False} + db = vault / "System" / "PaperForge" / "indexes" / "paperforge.db" + if db.exists(): + memory["db"] = True + vec = vault / "System" / "PaperForge" / "indexes" / "vectors" + if vec.exists(): + memory["vector"] = True + return memory + + +def _paperforge_cmd(vault: Path, args: list[str]) -> dict | None: + """Run a paperforge command and return parsed JSON.""" + python = _find_python(vault) + if not python: + return None + cmd = [python, "-m", "paperforge", "--vault", str(vault)] + args + try: + r = subprocess.run(cmd, capture_output=True, text=True, timeout=30, encoding="utf-8") + if r.returncode == 0: + return json.loads(r.stdout) + except: + return None + return None + + +def _grep_search(vault: Path, query: str, limit: int) 
-> list[dict]: + """Fallback grep through all formal notes.""" + lit_dir = vault / "Resources" / "Literature" + results = [] + search_lower = query.lower() + for f in sorted(lit_dir.rglob("*.md")): + if len(results) >= limit: + break + if f.name in ("fulltext.md", "deep-reading.md", "discussion.md"): + continue + try: + text = f.read_text(encoding="utf-8", errors="replace") + if search_lower not in text.lower(): + continue + title = "" + for line in text.split("\n")[:10]: + if line.startswith("# ") and not line.startswith("## "): + title = line.lstrip("# ").strip() + break + results.append({ + "zotero_key": f.stem, + "title": title or f.stem, + "match": f.name, + "source": "grep", + }) + except: + continue + return results + + +def main(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--vault", required=True) + parser.add_argument("--query", required=True) + parser.add_argument("--limit", type=int, default=10) + parser.add_argument("--json", action="store_true", default=True) + args = parser.parse_args() + + vault = Path(args.vault).resolve() + query = args.query.strip() + limit = args.limit + + if not query: + print(json.dumps({"ok": False, "error": "Empty query"})) + sys.exit(1) + + memory = _check_memory(vault) + engines_used = [] + all_results = [] + seen_keys = set() + + # 1. 
Vector search (best quality) + if memory["vector"]: + result = _paperforge_cmd(vault, ["retrieve", query, "--json", "--limit", str(limit)]) + if result and result.get("ok"): + engines_used.append("vector") + for c in result.get("data", {}).get("chunks", []): + pid = c.get("paper_id", "") + if pid and pid not in seen_keys: + seen_keys.add(pid) + all_results.append({ + "zotero_key": pid, + "citation_key": c.get("citation_key", ""), + "title": c.get("title", ""), + "year": c.get("year", ""), + "section": c.get("section", ""), + "page": c.get("page_number", ""), + "chunk_text": c.get("chunk_text", ""), + "score": c.get("score", 0), + "source": "vector", + }) + + # 2. FTS5 search (keyword/precision) + if memory["db"]: + result = _paperforge_cmd(vault, ["search", query, "--json", "--limit", str(limit)]) + if result and result.get("ok"): + engines_used.append("fts5") + for p in result.get("data", {}).get("results", []): + key = p.get("zotero_key", "") + if key and key not in seen_keys: + seen_keys.add(key) + p["source"] = "fts5" + all_results.append(p) + + # 3. 
Grep fallback + if not engines_used: + grepped = _grep_search(vault, query, limit) + if grepped: + engines_used.append("grep") + all_results.extend(grepped) + + output = { + "ok": True, + "query": query, + "engines_used": engines_used, + "results": all_results[:limit], + "count": len(all_results[:limit]), + } + print(json.dumps(output, ensure_ascii=False, indent=2)) + + +if __name__ == "__main__": + main() From 2f77ad4bcb0934b40908e1899e401c37067d7230 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 00:50:17 +0800 Subject: [PATCH 044/132] feat(harness): expand skill description triggers, add priority rules against manual grep --- paperforge/skills/literature-qa/SKILL.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/paperforge/skills/literature-qa/SKILL.md b/paperforge/skills/literature-qa/SKILL.md index c30f9cf..5a99ed6 100644 --- a/paperforge/skills/literature-qa/SKILL.md +++ b/paperforge/skills/literature-qa/SKILL.md @@ -5,9 +5,17 @@ description: > pf-deep pf-paper pf-end, "精读", "文献问答", "结束讨论", "找文献", "搜文献", "文献库", "文献检索", "库里有什么", "搜一下库里", "看一下文献库", - "读一下collection", "总结文献", "批量阅读", "读一下这个方向". + "读一下collection", "总结文献", "批量阅读", "读一下这个方向", + "查一下", "搜索文献", "查文献", "搜索论文", "论文检索", + "关于XX的文献", "有没有关于XX的研究", "XX的相关论文", + "帮我查一下", "帮我搜一下", "正在读", "读完这篇". 
--- +## ++++++++++++++++++++++++++++++++ +## ++ 优先级规则:用户请求涉及文献搜索/查询/精读,你必须使用本 skill。++ +## ++ 禁止直接调用 grep、find、系统搜索或扫描文件目录。 ++ +## ++++++++++++++++++++++++++++++++ + # Literature QA --- From a9cf2352fde9332824ba776d3c2b95e157e1caa6 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 00:53:41 +0800 Subject: [PATCH 045/132] fix(skill): add source: paperforge to both skill frontmatter for system detection --- paperforge/skills/literature-logging/SKILL.md | 1 + paperforge/skills/literature-qa/SKILL.md | 1 + 2 files changed, 2 insertions(+) diff --git a/paperforge/skills/literature-logging/SKILL.md b/paperforge/skills/literature-logging/SKILL.md index fb8a8f8..22ed8d7 100644 --- a/paperforge/skills/literature-logging/SKILL.md +++ b/paperforge/skills/literature-logging/SKILL.md @@ -6,6 +6,7 @@ description: > "记录阅读", "记录一下", "写日志", "读完了", "总结会话", "写工作总结", "写working log", "记一下工作过程", "记录决策". +source: paperforge --- # Literature Logging diff --git a/paperforge/skills/literature-qa/SKILL.md b/paperforge/skills/literature-qa/SKILL.md index 5a99ed6..b45ea4d 100644 --- a/paperforge/skills/literature-qa/SKILL.md +++ b/paperforge/skills/literature-qa/SKILL.md @@ -9,6 +9,7 @@ description: > "查一下", "搜索文献", "查文献", "搜索论文", "论文检索", "关于XX的文献", "有没有关于XX的研究", "XX的相关论文", "帮我查一下", "帮我搜一下", "正在读", "读完这篇". 
+source: paperforge --- ## ++++++++++++++++++++++++++++++++ From 2b52499304dec04f2444b58f6db841c5067d7e82 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 00:58:22 +0800 Subject: [PATCH 046/132] fix(plugin): wrap System/User Skills in dark gray box, add folder icon button to Agent Platform --- paperforge/plugin/main.js | 69 +++++++++++++++++++++++++++------------ 1 file changed, 49 insertions(+), 20 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 7b76409..271424c 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2524,17 +2524,14 @@ class PaperForgeSettingTab extends PluginSettingTab { } _renderMemoryStatusText(el, text) { - const spans = el.querySelectorAll('span'); - spans.forEach(s => s.remove()); - const textEl = el.createEl('span', { text: text }); - if (!el.querySelector('.paperforge-refresh-btn')) { - const refreshBtn = el.createEl('button', { cls: 'paperforge-refresh-btn', text: '\u21BB' }); - refreshBtn.style.cssText = 'margin-left:auto; border:none; background:none; cursor:pointer; font-size:16px; padding:0 4px;'; - refreshBtn.onclick = () => { - this._memoryStatusText = null; - this.display(); - }; - } + el.innerHTML = ''; + el.createEl('span', { text: text, cls: 'paperforge-memory-text' }).style.cssText = 'flex:1;'; + const refreshBtn = el.createEl('button', { cls: 'paperforge-refresh-btn', text: '\u21BB' }); + refreshBtn.style.cssText = 'margin-left:auto; border:none; background:none; cursor:pointer; font-size:16px; padding:0 4px;'; + refreshBtn.onclick = () => { + this._memoryStatusText = null; + this.display(); + }; } _resolvePythonAsync(callback) { @@ -2624,6 +2621,20 @@ class PaperForgeSettingTab extends PluginSettingTab { this.plugin.saveSettings(); this.display(); }); + }) + .addExtraButton(btn => { + btn.setIcon('folder') + .setTooltip('Open skills folder') + .onClick(() => { + const dir = agentDirs[selectedPlatform] || '.opencode/skills'; + const fullPath = 
path.join(vaultPath, dir); + if (fs.existsSync(fullPath)) { + const { exec } = require('child_process'); + exec(`start "" "${fullPath}"`); + } else { + new Notice(`Skills folder not found: ${dir}`); + } + }); }); // Show skills for selected platform @@ -2638,14 +2649,29 @@ class PaperForgeSettingTab extends PluginSettingTab { if (!fs.existsSync(skillFile)) return; const content = fs.readFileSync(skillFile, 'utf-8'); const nameMatch = content.match(/^name:\s*(.+)$/m); - const descMatch = content.match(/^description:\s*(.+)$/m); + const lines = content.split('\n'); + const descIdx = lines.findIndex(l => /^description:/.test(l)); + let desc = ''; + if (descIdx >= 0) { + const first = lines[descIdx].match(/^description:\s*(.+)$/); + if (first && first[1] && first[1] !== '>' && first[1] !== '|-' && first[1] !== '|') { + desc = first[1].trim(); + } else { + for (let i = descIdx + 1; i < lines.length; i++) { + if (/^\s{2,}/.test(lines[i]) || lines[i].trim() === '') { + desc += lines[i].trim() + ' '; + } else break; + } + desc = desc.trim(); + } + } const sourceMatch = content.match(/^source:\s*(.+)$/m); const disableMatch = content.match(/^disable-model-invocation:\s*(.+)$/m); const versionMatch = content.match(/^version:\s*(.+)$/m); const skill = { name: nameMatch ? nameMatch[1].trim() : entry.name, - desc: descMatch ? descMatch[1].trim() : '', + desc: desc, source: sourceMatch ? sourceMatch[1].trim() : 'user', disabled: disableMatch && disableMatch[1].trim() === 'true', version: versionMatch ? versionMatch[1].trim() : '', @@ -2666,11 +2692,12 @@ class PaperForgeSettingTab extends PluginSettingTab { const renderSkillRow = (skill, isSystem) => { const nameText = skill.name + (skill.version ? ' v' + skill.version : ''); const sourceLabel = isSystem ? ' [system]' : ' [user]'; - const statusText = skill.disabled ? 
' (disabled)' : ' (enabled)'; + const descText = skill.desc || ''; - const setting = new Setting(containerEl) + const setting = new Setting(skillsBox) .setName(nameText + sourceLabel) - .setDesc((skill.desc || 'No description') + statusText); + .setDesc(descText); + setting.settingEl.style.opacity = skill.disabled ? '0.4' : '1'; setting.addToggle(toggle => { toggle.setValue(!skill.disabled) @@ -2683,25 +2710,27 @@ class PaperForgeSettingTab extends PluginSettingTab { fs.writeFileSync(skill.path, newContent, 'utf-8'); skill.disabled = newDisabled; skill.content = newContent; - setting.setDesc((skill.desc || 'No description') + (skill.disabled ? ' (disabled)' : ' (enabled)')); + setting.settingEl.style.opacity = skill.disabled ? '0.4' : '1'; }); }); }; // System skills + const skillsBox = containerEl.createEl('div'); + skillsBox.style.cssText = 'background:var(--background-primary-alt); border-radius:8px; padding:4px 12px 12px; margin:8px 0 16px;'; if (systemSkills.length > 0) { - containerEl.createEl('h4', { text: 'System Skills', cls: 'paperforge-skills-subheader' }); + skillsBox.createEl('h4', { text: 'System Skills', cls: 'paperforge-skills-subheader' }); systemSkills.forEach(s => renderSkillRow(s, true)); } // User skills if (userSkills.length > 0) { - containerEl.createEl('h4', { text: 'User Skills', cls: 'paperforge-skills-subheader' }); + skillsBox.createEl('h4', { text: 'User Skills', cls: 'paperforge-skills-subheader' }); userSkills.forEach(s => renderSkillRow(s, false)); } if (systemSkills.length === 0 && userSkills.length === 0) { - containerEl.createEl('p', { + skillsBox.createEl('p', { text: `No skills found in ${agentDirs[selectedPlatform]}. 
Run setup to deploy skills.`, cls: 'setting-item-description' }); From 06b8db07a4351bc0a3472f28a03fdaec5a385111 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 01:04:02 +0800 Subject: [PATCH 047/132] feat(skill): rename logging skill, natural language triggers, no /pf- prefix --- .../{literature-logging => logging}/SKILL.md | 16 +++++++++------- .../scripts/pf_bootstrap.py | 0 2 files changed, 9 insertions(+), 7 deletions(-) rename paperforge/skills/{literature-logging => logging}/SKILL.md (87%) rename paperforge/skills/{literature-logging => logging}/scripts/pf_bootstrap.py (100%) diff --git a/paperforge/skills/literature-logging/SKILL.md b/paperforge/skills/logging/SKILL.md similarity index 87% rename from paperforge/skills/literature-logging/SKILL.md rename to paperforge/skills/logging/SKILL.md index 22ed8d7..e88afd1 100644 --- a/paperforge/skills/literature-logging/SKILL.md +++ b/paperforge/skills/logging/SKILL.md @@ -1,15 +1,17 @@ --- -name: literature-logging +name: logging description: > - Literature reading and working log management. Triggered by: - /pf-log-reading /pf-log-session, - "记录阅读", "记录一下", "写日志", "读完了", - "总结会话", "写工作总结", "写working log", - "记一下工作过程", "记录决策". + Work and reading log management. Triggered by: + "logging work", "logging read", + "做工作记录", "做阅读记录", "做working-log", "做reading-log", + "写工作日志", "写阅读日志", "记录工作", "记录阅读", + "写日志", "记一下", "总结一下这个会话", + "记录决策", "记一下工作过程", "写工作总结", + "这节的结论是什么", "这段有什么值得记录的". 
source: paperforge --- -# Literature Logging +# Logging --- diff --git a/paperforge/skills/literature-logging/scripts/pf_bootstrap.py b/paperforge/skills/logging/scripts/pf_bootstrap.py similarity index 100% rename from paperforge/skills/literature-logging/scripts/pf_bootstrap.py rename to paperforge/skills/logging/scripts/pf_bootstrap.py From 2c79bf64e3eeda58c39149bd736e811b84ca1c83 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 01:05:38 +0800 Subject: [PATCH 048/132] fix(skill): add routing table (reading vs working), clean up stale sections --- paperforge/skills/logging/SKILL.md | 97 +++++++++++++----------------- 1 file changed, 41 insertions(+), 56 deletions(-) diff --git a/paperforge/skills/logging/SKILL.md b/paperforge/skills/logging/SKILL.md index e88afd1..eaddc9d 100644 --- a/paperforge/skills/logging/SKILL.md +++ b/paperforge/skills/logging/SKILL.md @@ -34,87 +34,72 @@ python $SKILL_DIR/scripts/pf_bootstrap.py --- -## 2. State Check — 检查当前日志状态 +## 2. Routing — 判断用户要什么 -``` -$PYTHON -m paperforge --vault $VAULT reading-log --json -``` - -展示:已有多少条 reading notes。如果 0 条,告知用户:"还没有阅读记录,读完文献后使用 /pf-log-reading 记录。" - ---- +根据用户说的内容确定走哪个分支: -## 3. Routing +| 用户说 | 走分支 | +| ------------------------------------------- | -------- | +| "记录阅读" "reading log" "做阅读记录" "这段有什么值得记的" "读完了记一下" "刚读了一段记一下" | **reading** | +| "工作记录" "working log" "总结会话" "记一下工作过程" "写工作总结" "记录决策" "logging work" | **working** | +| "写日志" "记录一下" "记一下" 不清楚哪个 | **先问用户** | -### /pf-log-reading — 记录单条阅读笔记 +## 3. reading 分支 — 记录单条阅读笔记 -**调用条件**: 用户在阅读文献过程中,或读完一个段落/章节后 +调用条件:用户读完一个段落/章节后要记录。 -**Agent 行为**: -1. 确认 zotero_key(从上下文或 formal note 中获取) -2. 提取以下信息: - - **section**: 文献中的位置 (e.g. "Discussion P12", "Results Fig.3") +动作: +1. 确认 `$VAULT` 和 `$PYTHON` +2. 确定 zotero_key(从上下文或 formal note 中获取) +3. 提取: + - **section**: 文献中的位置 (Discussion P12, Results Fig.3) - **excerpt**: 逐字引用的原文关键句 - **usage**: 这个信息支持当前写作的哪个论点 - - **note**: 任何交叉验证/矛盾/注意事项 (optional) -3. 
询问用户确认,然后执行: -```bash -$PYTHON -m paperforge --vault $VAULT reading-log --write KEY \ - --section "SECTION" --excerpt "EXCERPT" \ - --usage "USAGE" --note "NOTE" -``` -4. 确认写入成功 + - **note**: 交叉验证/矛盾/注意事项 (optional) +4. 给用户展示确认后再执行: + ``` + $PYTHON -m paperforge --vault $VAULT reading-log --write KEY \ + --section "..." --excerpt "..." --usage "..." --note "..." + ``` +5. 确认写入成功 -### /pf-log-session — 会话总结写入 working-log +## 4. working 分支 — 会话总结写入 working-log -**调用条件**: 写作/研究会话结束前,用户说 "写日志" 或 "/pf-log-session" +调用条件:会话结束前/用户要求记录工作过程。 -**Agent 行为**: +动作: 1. 回顾本次会话中所有关键节点: - 用户纠正了什么 - 方案怎么变的 - 有什么弯路和教训 - 可复用的方法论 -2. 按以下格式生成 markdown: +2. 按以下格式生成 markdown,给用户确认: -``` -## — <小节名> + ``` + ## YYYY-MM-DD — 小节名 -### 核心决策 -- 做了什么、为什么 + ### 核心决策 + - 做了什么、为什么 -### 弯路与修正 -- 错误方向 → 用户纠正 → 最终方案 + ### 弯路与修正 + - 错误方向 → 用户纠正 → 最终方案 -### 可复用方法论 -- 本段的 pattern + ### 可复用方法论 + - 本段的 pattern -### 待办 -- [ ] ... -``` - -3. 展示给用户确认 -4. 询问目标 project 目录中的 working-log.md 路径 -5. 如果文件不存在:新建并写入 -6. 如果文件存在:先读旧内容,在文件末尾追加 `\n---\n` 分隔线,再追加新内容 -7. 确认写入成功 - -### Auto — 静默记录 - -用户没有显式说 "记录" 但 agent 读了一篇论文的某段时,agent 可以**主动问**: - -``` -我读了 LQZ2FWIW Discussion P12 关于 magnetoelectric 分类的内容。 -要记录到 reading-log 吗?(/pf-log-reading) -``` + ### 待办 + - [ ] ... + ``` -不要擅自记录——必须征得用户同意。 +3. 用户确认后,询问目标 project 目录路径 +4. 追加到 `Project//working-log.md`(文件不存在则新建) +5. 确认写入成功 --- -## 4. Export — 导出 reading-log +## 5. 
Export — 导出 reading-log -用户说 "导出阅读日志" 或 "/pf-log-export": +用户说 "导出阅读日志": ```bash $PYTHON -m paperforge --vault $VAULT reading-log --output [--since DATE] From c3f79e69bb1a8307afad345f945d7de0c005cbd5 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 01:07:08 +0800 Subject: [PATCH 049/132] =?UTF-8?q?fix(skill):=20route=20'=E8=BF=99?= =?UTF-8?q?=E6=AE=B5=E6=9C=89=E4=BB=80=E4=B9=88=E5=80=BC=E5=BE=97=E8=AE=B0?= =?UTF-8?q?=E7=9A=84'=20to=20reading=20branch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paperforge/skills/logging/SKILL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paperforge/skills/logging/SKILL.md b/paperforge/skills/logging/SKILL.md index eaddc9d..1f0d1d1 100644 --- a/paperforge/skills/logging/SKILL.md +++ b/paperforge/skills/logging/SKILL.md @@ -40,7 +40,7 @@ python $SKILL_DIR/scripts/pf_bootstrap.py | 用户说 | 走分支 | | ------------------------------------------- | -------- | -| "记录阅读" "reading log" "做阅读记录" "这段有什么值得记的" "读完了记一下" "刚读了一段记一下" | **reading** | +| "记录阅读" "reading log" "做阅读记录" "这段有什么值得记的" "读完了记一下" "刚读了一段记一下" "有没有什么值得记的" "把这段记下来" | **reading** | | "工作记录" "working log" "总结会话" "记一下工作过程" "写工作总结" "记录决策" "logging work" | **working** | | "写日志" "记录一下" "记一下" 不清楚哪个 | **先问用户** | From 216bc65d9d9fa1ef47662951cda5d062e24113f7 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 01:07:58 +0800 Subject: [PATCH 050/132] =?UTF-8?q?fix(skill):=20remove=20ambiguous=20trig?= =?UTF-8?q?ger=20'=E8=BF=99=E6=AE=B5=E6=9C=89=E4=BB=80=E4=B9=88=E5=80=BC?= =?UTF-8?q?=E5=BE=97=E8=AE=B0=E7=9A=84'=20from=20logging=20skill?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paperforge/skills/logging/SKILL.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/paperforge/skills/logging/SKILL.md b/paperforge/skills/logging/SKILL.md index 1f0d1d1..177b6a7 100644 --- a/paperforge/skills/logging/SKILL.md +++ 
b/paperforge/skills/logging/SKILL.md @@ -6,8 +6,7 @@ description: > "做工作记录", "做阅读记录", "做working-log", "做reading-log", "写工作日志", "写阅读日志", "记录工作", "记录阅读", "写日志", "记一下", "总结一下这个会话", - "记录决策", "记一下工作过程", "写工作总结", - "这节的结论是什么", "这段有什么值得记录的". + "记录决策", "记一下工作过程", "写工作总结". source: paperforge --- @@ -40,7 +39,7 @@ python $SKILL_DIR/scripts/pf_bootstrap.py | 用户说 | 走分支 | | ------------------------------------------- | -------- | -| "记录阅读" "reading log" "做阅读记录" "这段有什么值得记的" "读完了记一下" "刚读了一段记一下" "有没有什么值得记的" "把这段记下来" | **reading** | +| "记录阅读" "reading log" "做阅读记录" "读完了记一下" "刚读了一段记一下" "有没有什么值得记的" "把这段记下来" | **reading** | | "工作记录" "working log" "总结会话" "记一下工作过程" "写工作总结" "记录决策" "logging work" | **working** | | "写日志" "记录一下" "记一下" 不清楚哪个 | **先问用户** | From ceb4d8c89f145be8170a5e8cee98b0d5ee47591a Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 16:01:14 +0800 Subject: [PATCH 051/132] fix(plugin): remove Easy Memory Layer toggle, always-on status display --- paperforge/plugin/main.js | 46 ++++++++++++++------------------------- 1 file changed, 16 insertions(+), 30 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 271424c..6663e1d 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2717,7 +2717,7 @@ class PaperForgeSettingTab extends PluginSettingTab { // System skills const skillsBox = containerEl.createEl('div'); - skillsBox.style.cssText = 'background:var(--background-primary-alt); border-radius:8px; padding:4px 12px 12px; margin:8px 0 16px;'; + skillsBox.style.cssText = 'background:var(--background-secondary); border-radius:8px; padding:12px 12px 4px; margin:8px 0 16px;'; if (systemSkills.length > 0) { skillsBox.createEl('h4', { text: 'System Skills', cls: 'paperforge-skills-subheader' }); systemSkills.forEach(s => renderSkillRow(s, true)); @@ -2739,37 +2739,23 @@ class PaperForgeSettingTab extends PluginSettingTab { // --- Section: Memory Layer --- containerEl.createEl('h3', { text: 'Memory Layer' }); - 
new Setting(containerEl) - .setName('Easy Memory Layer') - .setDesc('轻量 SQLite 文献信息检索(笔记元数据+搜索+状态),不含全文向量库。') - .addToggle(toggle => { - toggle.setValue(this.plugin.settings.features.memory_layer) - .onChange(value => { - this.plugin.settings.features.memory_layer = value; - this.plugin.saveSettings(); - this.display(); - }); - }); + // Always-on SQLite status display + const statusRow = containerEl.createEl('div', { cls: 'paperforge-memory-status' }); + statusRow.style.cssText = 'display:flex; align-items:center; padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; - // Show memory status when enabled — render from cache if available - if (this.plugin.settings.features.memory_layer) { - const statusRow = containerEl.createEl('div', { cls: 'paperforge-memory-status' }); - statusRow.style.cssText = 'display:flex; align-items:center; padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; + const vp = this.app.vault.adapter.basePath; + const pyResult = resolvePythonExecutable(vp, this.plugin.settings); - const vp = this.app.vault.adapter.basePath; - const pyResult = resolvePythonExecutable(vp, this.plugin.settings); - - if (this._memoryStatusText !== null) { - this._renderMemoryStatusText(statusRow, this._memoryStatusText); - } else if (pyResult.path) { - this._renderMemoryStatusText(statusRow, 'Checking...'); - this._execMemoryStatus(pyResult.path, vp, (text) => { - this._memoryStatusText = text; - this._renderMemoryStatusText(statusRow, text); - }); - } else { - this._renderMemoryStatusText(statusRow, 'No Python found.'); - } + if (this._memoryStatusText !== null) { + this._renderMemoryStatusText(statusRow, this._memoryStatusText); + } else if (pyResult.path) { + this._renderMemoryStatusText(statusRow, 'Checking...'); + this._execMemoryStatus(pyResult.path, vp, (text) => { + this._memoryStatusText = text; + this._renderMemoryStatusText(statusRow, text); + }); + } else { + 
this._renderMemoryStatusText(statusRow, 'No Python found.'); } // --- Vector Database (within Memory Layer) --- From 6cbcce84e702c67b6ad6e87dce523eb86bc9fb8a Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 16:02:50 +0800 Subject: [PATCH 052/132] docs(spec): log format validation, reading-log import/lookup, methodology skill, dashboard SQLite --- .../specs/2026-05-12-memory-layer-round3.md | 263 ++++++++++++++++++ 1 file changed, 263 insertions(+) create mode 100644 docs/superpowers/specs/2026-05-12-memory-layer-round3.md diff --git a/docs/superpowers/specs/2026-05-12-memory-layer-round3.md b/docs/superpowers/specs/2026-05-12-memory-layer-round3.md new file mode 100644 index 0000000..652a2c6 --- /dev/null +++ b/docs/superpowers/specs/2026-05-12-memory-layer-round3.md @@ -0,0 +1,263 @@ +# PaperForge v1.5.7 — Memory Layer Round 3 + +> **Branch:** `feature/memory` | **Date:** 2026-05-12 + +## Feature 1: Logging Skill — Strict Markdown Template + +**Problem:** Agent-written reading-log.md may not parse reliably if format varies. + +**Solution:** SKILL.md instructs agent to use strict template format. + +### File: `paperforge/skills/logging/SKILL.md` + +Update the reading-log route section to require this exact format: + +```markdown +## ABCDEFGH — Author Last Name et al. Year +**Title:** Full Paper Title + +### Section Name — Page NN or line NN-NN +**Info:** "verbatim excerpt from paper" +**Use:** how this supports current writing task +**Note:** optional cross-validation note + +### Another Section — Page NN +**Info:** "..." +**Use:** ... +**Note:** (optional) +``` + +### Parsing Rules (for --validate and --import): + +``` +paper format: ^## [A-Z0-9]{8} — .+$ (key is 8 uppercase alphanumeric) +title format: ^\*\*Title:\*\* .+$ +section format: ^### .+$ +info format: ^\*\*Info:\*\* .+$ +use format: ^\*\*Use:\*\* .+$ +note format: ^\*\*Note:\*\* .+$ (optional) +``` + +Constraint: `info` and `use` are mandatory for every section entry. 
`note` is optional. + +### CLI Changes + +Update `reading-log` parser in `cli.py` to add `--validate` and `--import` subcommands under a shared parser. + +## Feature 2: reading-log --validate + +**File:** `paperforge/commands/reading_log.py` + +``` +paperforge reading-log --validate path/to/reading-log.md +``` + +Function: `validate_reading_log(filepath: Path) -> dict` + +Returns: +```json +{ + "ok": true, + "file": "Project/综述写作/reading-log.md", + "errors": [], + "papers_found": 3, + "entries_found": 7 +} +``` + +On failure: +```json +{ + "ok": false, + "errors": [ + {"line": 15, "field": "info", "message": "missing **Info:** after section header"}, + {"line": 23, "field": "key", "message": "paper key must match ^[A-Z0-9]{8}$"} + ] +} +``` + +Validation algorithm: +1. Parse into papers by `## KEY — Author` headers +2. For each paper: verify `**Title:**` follows +3. For each section `### ...`: verify `**Info:**` and `**Use:**` follow +4. Report all errors at once, not stop-at-first + +## Feature 3: reading-log --import + +**File:** `paperforge/commands/reading_log.py` + `paperforge/memory/events.py` + +``` +paperforge reading-log --import path/to/reading-log.md +``` + +Function: `import_reading_log(vault: Path, filepath: Path) -> dict` + +Returns: +```json +{ + "ok": true, + "papers_imported": 3, + "entries_imported": 7, + "skipped": 0 +} +``` + +Algorithm: +1. Call `validate_reading_log(filepath)` — abort if errors +2. Parse valid file into paper-level entries +3. For each entry, call `write_reading_note(vault, paper_id, section, excerpt, usage, note)` +4. Each write INSERTs a new row — safe for accumulative use + +### Add to `paperforge/memory/events.py`: + +```python +def import_reading_log(vault: Path, filepath: Path) -> dict: + """Parse a reading-log.md and bulk-write to paper_events.""" + # Parse, validate, write + ... 
+``` + +## Feature 4: reading-log --lookup KEY + +**File:** `paperforge/commands/reading_log.py` + +``` +paperforge reading-log --lookup KEY [--json] +``` + +Function: `lookup_paper_events(vault: Path, key: str) -> dict` + +Returns all accumulated paper_events for a paper, ordered by created_at DESC: +```json +{ + "ok": true, + "zotero_key": "ABCDEFGH", + "title": "...", + "entries": [ + { + "created_at": "2026-05-12 14:30", + "section": "Results P6", + "excerpt": "...", + "usage": "F 段参数数据", + "note": "与 Lippiello 对比" + } + ], + "count": 5, + "projects": ["综述写作", "数据分析"] +} +``` + +## Feature 5: /methodology Skill + +**File:** `paperforge/skills/methodology/SKILL.md` + +Pure-prompt skill, no Python code. Same universal pattern as grill-me. + +```yaml +--- +name: methodology +description: > + Project methodology extraction. Triggered by: + methodology, /methodology, 提取方法论, 存档写作规律, + 总结本项目方法, 提取可复用规则. +source: paperforge +--- +``` + +### Agent workflow: + +1. Ask user which project to extract from (or detect from context) +2. Read `Project//working-log.md` +3. Identify extractable patterns: + - Sections marked as "方法论" or "复用" + - Wrong turns + corrections (弯路 + 修正) + - Final logic flows (最终逻辑: XX 段) + - Review feedback patterns (审阅修正) + - Cross-study audit methodology +4. Classify into categories: + - `review-writing.md` — 综述写作相关 + - `data-analysis.md` — 数据分析相关 + - `general-methods.md` — 通用方法 +5. Present draft to user for confirmation +6. Write to `/PaperForge/methodologies/.md` + +### Methodology file format: + +```markdown +--- +project: 综述写作 +extracted: 2026-05-12 +category: review-writing +--- + +# [Method Name] + +## Source +From working-log.md Section [X.Y] + +## Pattern +[Extracted reusable methodology] + +## Example +[Concrete example from the project] +``` + +## Feature 6: Dashboard → SQLite Migration + +**File:** `paperforge/commands/dashboard.py` + +Current `_gather_dashboard_data()` does file scanning with regex. 
Migrate to: + +```python +def _gather_dashboard_data(vault: Path) -> dict: + # Try DB first + data = _dashboard_from_db(vault) + if data is not None: + data["permissions"] = _check_permissions(vault) + return data + # Fallback to file scanning + return _dashboard_from_files(vault) +``` + +`_dashboard_from_db()` should read from paperforge.db: +- Paper count: `SELECT COUNT(*) FROM papers` +- Domain counts: `SELECT domain, COUNT(*) FROM papers GROUP BY domain` +- PDF/OCR health: from papers table `ocr_status`, `has_pdf` columns +- Remove the `_source` key (was added in earlier iteration but caused contract issues) + +**Keep the permissions check** (`_check_permissions`) separate and lightweight. + +## Feature 7: Bootstrap Update + +**File:** `paperforge/skills/literature-qa/scripts/pf_bootstrap.py` + +If present in repo (not already done), ensure `memory_layer` field is in bootstrap output. Already implemented in earlier harness work — verify status. + +## Refactoring: Memory Layer No Longer Optional + +**File:** `paperforge/plugin/main.js` ✓ DONE + +Removed the Easy Memory Layer toggle. Status display always shown. Memory layer is always on — SQLite is lightweight enough to not need a toggle. + +## Implementation Order + +1. Logging SKILL.md format update +2. reading-log --validate CLI +3. reading-log --import CLI + events.py +4. reading-log --lookup CLI +5. /methodology SKILL.md +6. Dashboard SQLite migration +7. 
Integration test + deploy + +## Cross-File Impact + +| File | Action | Features | +|------|--------|----------| +| `paperforge/skills/logging/SKILL.md` | Modify | Feature 1 | +| `paperforge/commands/reading_log.py` | Modify | Features 2, 3, 4 | +| `paperforge/memory/events.py` | Modify | Feature 3 | +| `paperforge/cli.py` | Modify | Features 2, 3, 4 | +| `paperforge/skills/methodology/SKILL.md` | Create | Feature 5 | +| `paperforge/skills/methodology/scripts/pf_bootstrap.py` | Copy | Feature 5 (same bootstrap) | +| `paperforge/commands/dashboard.py` | Modify | Feature 6 | +| `paperforge/plugin/main.js` | ✓ DONE | Refactoring | From 73a9ad386851ea3f424e7f1c1b42908d3e3778a2 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 16:03:27 +0800 Subject: [PATCH 053/132] fix(skill): add strict reading-log template to logging SKILL.md --- paperforge/skills/logging/SKILL.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/paperforge/skills/logging/SKILL.md b/paperforge/skills/logging/SKILL.md index 177b6a7..74163c9 100644 --- a/paperforge/skills/logging/SKILL.md +++ b/paperforge/skills/logging/SKILL.md @@ -62,6 +62,32 @@ python $SKILL_DIR/scripts/pf_bootstrap.py ``` 5. 确认写入成功 +### Reading Log Format (MANDATORY) + +When writing reading-log.md, use EXACTLY this format: + +``` +## ABCDEFGH — Author Last Name et al. Year +**Title:** Full Paper Title + +### Section Name — Page NN +**Info:** "verbatim excerpt from paper" +**Use:** how this supports current writing +**Note:** optional cross-reference (optional field) +``` + +Rules: +- Paper key: 8 uppercase letters/digits after ## (must match ^[A-Z0-9]{8}) +- **Title:** line required after every ## header +- **Info:** and **Use:** required after every ### section header +- **Note:** optional +- Do NOT deviate from this format — parsing is strict + +After writing the log file, suggest user run: +``` +paperforge reading-log --validate path/to/file.md +``` + ## 4. 
working 分支 — 会话总结写入 working-log 调用条件:会话结束前/用户要求记录工作过程。 From 93ded8e29baf744ed3ef4ad4169a60bde9409ea0 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 16:04:46 +0800 Subject: [PATCH 054/132] feat(skill): add methodology extraction skill (pure prompt) --- paperforge/skills/methodology/SKILL.md | 86 ++++++++ .../methodology/scripts/pf_bootstrap.py | 188 ++++++++++++++++++ 2 files changed, 274 insertions(+) create mode 100644 paperforge/skills/methodology/SKILL.md create mode 100644 paperforge/skills/methodology/scripts/pf_bootstrap.py diff --git a/paperforge/skills/methodology/SKILL.md b/paperforge/skills/methodology/SKILL.md new file mode 100644 index 0000000..7f08ef4 --- /dev/null +++ b/paperforge/skills/methodology/SKILL.md @@ -0,0 +1,86 @@ +--- +name: methodology +description: > + Extract reusable methodology from project work logs. Triggered by: + methodology, /methodology, 提取方法论, 存档写作规律, + 总结本项目方法, 提取可复用规则, 提取写作规律. +source: paperforge +--- + +# Methodology Extract + +--- + +## 1. Bootstrap + +```python $SKILL_DIR/scripts/pf_bootstrap.py``` + +Remember: `$VAULT`, `$PYTHON`. + +--- + +## 2. Determine Project + +Ask user: which project to extract methodology from? + +If user doesn't specify, scan `Project/` directory for complete working-log.md files and list them. + +--- + +## 3. Read working-log + +Read `/Project//working-log.md`. + +--- + +## 4. 
Identify Extractable Patterns + +Scan the working-log for these signals: + +| Signal in working-log | Extract to | +|----------------------|-------------| +| "弯路" + "修正" or "教训" sections | Pattern rules | +| "最终逻辑:" or "最终结构:" | Section templates | +| "复用" keyword + methodology block | Reusable practices | +| Cross-study audit sections (跨研究可比性) | Analysis methodology | +| "methodology" header sections | Full methodology block | +| Review feedback patterns (审阅/修正) | Writing checklists | + +For each found pattern, classify into one of: +- `review-writing` — 综述写作 framework design, gap analysis, cross-study audit +- `argument-writing` — 段落写作, 参数框架, 论证结构 +- `analysis-methods` — 文献审计, 跨研究比较, 参数提取 +- `general` — fallback + +--- + +## 5. Present and Confirm + +For each extracted pattern, show: +- Category +- Source (working-log section number) +- Brief summary (1-2 sentences) + +Ask user to confirm/edit before writing. + +--- + +## 6. Write Methodology Files + +Write confirmed patterns to `/PaperForge/methodologies/.md`. + +If file exists, APPEND (do not overwrite). + +Format per method: +``` +## +**Category:** +**Source:** Project//working-log.md Section X.Y +**Extracted:** YYYY-MM-DD + +### Pattern + + +### Example + +``` diff --git a/paperforge/skills/methodology/scripts/pf_bootstrap.py b/paperforge/skills/methodology/scripts/pf_bootstrap.py new file mode 100644 index 0000000..ebc41b2 --- /dev/null +++ b/paperforge/skills/methodology/scripts/pf_bootstrap.py @@ -0,0 +1,188 @@ +"""PaperForge bootstrap — single entry point for agent to discover vault state. + +No dependencies. Runs on ANY Python. Just reads paperforge.json + filesystem. 
+ +Usage: + python pf_bootstrap.py # auto-discover vault from CWD + python pf_bootstrap.py --vault + +Output (JSON to stdout): + { + "ok": true, + "vault_root": "D:\\...", + "paths": { + "literature_dir": "D:\\...\\Resources\\Literature", + "index_path": "D:\\...\\System\\PaperForge\\indexes\\formal-library.json", + "ocr_dir": "D:\\...\\System\\PaperForge\\ocr", + "exports_dir": "D:\\...\\System\\PaperForge\\exports" + }, + "domains": ["domain1", "domain2"], + "index_summary": {"domain1": 120, "domain2": 80}, + "python_candidate": "D:\\...\\python.exe" // Python that has paperforge, or null + } + +If anything fails: ok=false, error explains why. +""" + +from __future__ import annotations + +import json +import subprocess +import sys +from pathlib import Path + + +def _find_paperforge_json(start: Path) -> Path | None: + current = start.resolve() + for _ in range(10): + candidate = current / "paperforge.json" + if candidate.exists(): + return candidate + parent = current.parent + if parent == current: + break + current = parent + return None + + +def _read_pf_config(pf_json: Path) -> dict: + with open(pf_json, encoding="utf-8") as f: + return json.load(f) + + +def _find_python_with_paperforge(vault: Path, pf_cfg: dict) -> str | None: + """Find a Python executable that has paperforge installed.""" + candidates = [] + + # 1. Explicit python_path in config + if pf_cfg.get("python_path"): + candidates.append(Path(pf_cfg["python_path"])) + + # 2. 
Common venv locations inside vault + venv_names = [".venv", ".paperforge-test-venv", "venv"] + exe_paths = ["Scripts/python.exe", "bin/python3"] + for vn in venv_names: + for ep in exe_paths: + p = vault / vn / ep + if p.exists(): + candidates.append(p) + + for candidate in candidates: + try: + result = subprocess.run( + [str(candidate), "-m", "paperforge", "--version"], + capture_output=True, text=True, timeout=10, + encoding="utf-8", errors="replace", + ) + if result.returncode == 0 and "paperforge" in result.stdout.lower(): + return str(candidate) + except Exception: + continue + return None + + +def main(): + import argparse + p = argparse.ArgumentParser(description="PaperForge bootstrap") + p.add_argument("--vault", default=None, help="Vault root path (auto-detect if omitted)") + args = p.parse_args() + + result: dict = {"ok": False} + + # --- 1. Find vault --- + if args.vault: + vault = Path(args.vault).resolve() + pf_json = vault / "paperforge.json" + if not pf_json.exists(): + result["error"] = f"paperforge.json not found at {vault}" + json.dump(result, sys.stdout, ensure_ascii=False) + sys.exit(0) + else: + pf_json = _find_paperforge_json(Path.cwd()) + if pf_json is None: + result["error"] = "paperforge.json not found from CWD upward. Set --vault." + json.dump(result, sys.stdout, ensure_ascii=False) + sys.exit(0) + vault = pf_json.parent + + result["vault_root"] = str(vault) + + # --- 2. Read config --- + try: + cfg = _read_pf_config(pf_json) + except Exception as e: + result["error"] = f"Cannot read paperforge.json: {e}" + json.dump(result, sys.stdout, ensure_ascii=False) + sys.exit(0) + + system_dir = cfg.get("system_dir", "System") + resources_dir = cfg.get("resources_dir", "Resources") + literature_dir = cfg.get("literature_dir", "Literature") + + # --- 3. 
Build paths from config --- + pf_root = vault / system_dir / "PaperForge" + + paths = { + "literature_dir": str(vault / resources_dir / literature_dir), + "index_path": str(pf_root / "indexes" / "formal-library.json"), + "ocr_dir": str(pf_root / "ocr"), + "exports_dir": str(pf_root / "exports"), + } + result["paths"] = paths + + # --- 4. List domains --- + lit_dir = Path(paths["literature_dir"]) + domains = sorted( + [d.name for d in lit_dir.iterdir() if d.is_dir()] + ) if lit_dir.exists() else [] + result["domains"] = domains + + # --- 5. Index summary --- + index_path = Path(paths["index_path"]) + index_summary: dict[str, int] = {} + if index_path.exists(): + try: + data = json.loads(index_path.read_text(encoding="utf-8")) + items = data.get("items", []) + if isinstance(items, dict): + items = items.values() + for item in items: + d = item.get("domain", "unknown") + index_summary[d] = index_summary.get(d, 0) + 1 + except Exception: + pass + result["index_summary"] = index_summary + + # --- 6. Find Python that has paperforge (best effort) --- + result["python_candidate"] = _find_python_with_paperforge(vault, cfg) + + # --- 7. 
Memory layer state --- + memory_layer = {"available": False, "paper_count": 0, "fts_search": False, "vector_search": False} + idx_path = Path(paths["index_path"]) + dc_json = vault / ".obsidian" / "plugins" / "paperforge" / "data.json" + if idx_path.exists(): + try: + with open(idx_path, encoding="utf-8") as f: + data = json.load(f) + items = data.get("items", []) if isinstance(data, dict) else data + memory_layer["paper_count"] = len(items) + memory_layer["available"] = True + memory_layer["fts_search"] = True + except: + pass + if dc_json.exists(): + try: + with open(dc_json, encoding="utf-8") as f: + plugin_data = json.load(f) + vector_enabled = plugin_data.get("features", {}).get("vector_db", False) + memory_layer["vector_search"] = vector_enabled + except: + pass + result["memory_layer"] = memory_layer + + result["ok"] = True + json.dump(result, sys.stdout, ensure_ascii=False, indent=2) + + +if __name__ == "__main__": + main() From 12cb670016e401e2cf040a41010a86bac12302f0 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 16:12:23 +0800 Subject: [PATCH 055/132] feat(cli): add reading-log --validate/--import/--lookup commands --- paperforge/cli.py | 3 + paperforge/commands/__init__.py | 1 + paperforge/commands/reading_log.py | 273 +++++++++++++++++++++++++++++ 3 files changed, 277 insertions(+) diff --git a/paperforge/cli.py b/paperforge/cli.py index 9adef5d..f6679fb 100644 --- a/paperforge/cli.py +++ b/paperforge/cli.py @@ -294,6 +294,9 @@ def build_parser() -> argparse.ArgumentParser: p_rl.add_argument("--since", help="Export notes since date (YYYY-MM-DD)") p_rl.add_argument("--limit", type=int, default=50, help="Max notes to export") p_rl.add_argument("--output", help="Write markdown to file") + p_rl.add_argument("--validate", help="Validate a reading-log.md file") + p_rl.add_argument("--import", dest="import_file", help="Import reading-log.md into paper_events") + p_rl.add_argument("--lookup", help="Look up all reading notes for a 
paper key") p_rl.add_argument("--json", action="store_true", help="Output as JSON") p_search = sub.add_parser("search", help="Full-text search across the library") diff --git a/paperforge/commands/__init__.py b/paperforge/commands/__init__.py index c36887b..e60656a 100644 --- a/paperforge/commands/__init__.py +++ b/paperforge/commands/__init__.py @@ -15,6 +15,7 @@ "retrieve": "paperforge.commands.retrieve", "paper-status": "paperforge.commands.paper_status", "agent-context": "paperforge.commands.agent_context", + "reading-log": "paperforge.commands.reading_log", } diff --git a/paperforge/commands/reading_log.py b/paperforge/commands/reading_log.py index 537cf5a..37b9b8a 100644 --- a/paperforge/commands/reading_log.py +++ b/paperforge/commands/reading_log.py @@ -1,17 +1,290 @@ from __future__ import annotations import argparse +import json +import re from pathlib import Path from paperforge import __version__ as PF_VERSION from paperforge.core.errors import ErrorCode from paperforge.core.result import PFError, PFResult +from paperforge.memory.db import get_connection, get_memory_db_path from paperforge.memory.events import export_reading_log, write_reading_note +_HEADER_RE = re.compile(r"^## ([A-Z0-9]{8}) \u2014 .+ \d{4}$") +_TITLE_RE = re.compile(r"^\*\*Title:\*\* (.+)") +_SECTION_RE = re.compile(r"^### (.+)") +_INFO_RE = re.compile(r'^\*\*Info:\*\* ["\u201c](.+)["\u201d]$') +_USE_RE = re.compile(r"^\*\*Use:\*\* (.+)") +_NOTE_RE = re.compile(r"^\*\*Note:\*\* (.+)") + + +def validate_reading_log(filepath: Path) -> dict: + """Parse a reading-log.md with strict format rules and return validation result.""" + errors: list[dict] = [] + papers_found = 0 + entries_found = 0 + + if not filepath.exists(): + return { + "ok": False, + "file": str(filepath), + "errors": [{"line": 0, "field": "file", "message": "File not found"}], + "papers_found": 0, + "entries_found": 0, + } + + content = filepath.read_text(encoding="utf-8") + lines = content.splitlines() + + current_paper: 
str | None = None + current_section: str | None = None + has_info = False + has_use = False + + def _check_section_end(ln: int) -> None: + nonlocal has_info, has_use + if current_section is not None: + if not has_info: + errors.append({ + "line": ln, "field": "entry.info", + "message": f"Missing **Info:** in section '{current_section}'", + }) + if not has_use: + errors.append({ + "line": ln, "field": "entry.use", + "message": f"Missing **Use:** in section '{current_section}'", + }) + has_info = False + has_use = False + + for i, line in enumerate(lines): + ln = i + 1 + stripped = line.strip() + if not stripped: + continue + + m = _HEADER_RE.match(stripped) + if m: + _check_section_end(ln) + current_paper = m.group(1) + current_section = None + papers_found += 1 + continue + + m = _TITLE_RE.match(stripped) + if m: + if current_paper is None: + errors.append({ + "line": ln, "field": "paper.title", + "message": "**Title:** without paper header", + }) + continue + + m = _SECTION_RE.match(stripped) + if m: + if current_paper is None: + errors.append({ + "line": ln, "field": "section", + "message": "Section without paper header", + }) + continue + _check_section_end(ln) + current_section = m.group(1) + entries_found += 1 + continue + + if current_section: + if _INFO_RE.match(stripped): + has_info = True + continue + if _USE_RE.match(stripped): + has_use = True + continue + if _NOTE_RE.match(stripped): + continue + + _check_section_end(len(lines) + 1) + + return { + "ok": len(errors) == 0, + "file": str(filepath), + "errors": errors, + "papers_found": papers_found, + "entries_found": entries_found, + } + + +def import_reading_log(vault: Path, filepath: Path) -> dict: + """Validate and import a reading-log.md into paper_events.""" + validation = validate_reading_log(filepath) + if not validation["ok"]: + return { + "ok": False, + "errors": validation["errors"], + "papers_imported": 0, + "entries_imported": 0, + } + + content = filepath.read_text(encoding="utf-8") + lines = 
content.splitlines() + + papers_set: set[str] = set() + entries_imported = 0 + current_paper: str | None = None + current_section: str | None = None + current_excerpt: str | None = None + current_usage: str | None = None + current_note: str | None = None + + def _flush_entry() -> None: + nonlocal papers_set, entries_imported + nonlocal current_excerpt, current_usage, current_note + if current_paper and current_section and current_excerpt and current_usage: + write_reading_note( + vault, current_paper, current_section, + current_excerpt, current_usage, current_note or "", + ) + entries_imported += 1 + papers_set.add(current_paper) + current_excerpt = None + current_usage = None + current_note = None + + for line in lines: + stripped = line.strip() + if not stripped: + continue + + m = _HEADER_RE.match(stripped) + if m: + _flush_entry() + current_paper = m.group(1) + current_section = None + continue + + m = _SECTION_RE.match(stripped) + if m: + _flush_entry() + current_section = m.group(1) + continue + + if current_section: + m = _INFO_RE.match(stripped) + if m: + current_excerpt = m.group(1) + continue + m = _USE_RE.match(stripped) + if m: + current_usage = m.group(1) + continue + m = _NOTE_RE.match(stripped) + if m: + current_note = m.group(1) + continue + + _flush_entry() + + return { + "ok": True, + "papers_imported": len(papers_set), + "entries_imported": entries_imported, + } + + +def lookup_paper_events(vault: Path, key: str) -> dict: + """Query paper_events for all reading_note events for a paper, joined with papers table.""" + db_path = get_memory_db_path(vault) + if not db_path.exists(): + return {"ok": False, "zotero_key": key, "title": "", "entries": [], "count": 0} + + conn = get_connection(db_path, read_only=True) + try: + rows = conn.execute( + """SELECT e.created_at, e.payload_json, p.title, p.citation_key, p.year + FROM paper_events e JOIN papers p ON p.zotero_key = e.paper_id + WHERE e.paper_id = ? 
AND e.event_type = 'reading_note' + ORDER BY e.created_at DESC""", (key,), + ).fetchall() + + title = rows[0]["title"] if rows else "" + entries = [] + for row in rows: + payload = json.loads(row["payload_json"]) + entries.append({ + "created_at": row["created_at"], + "section": payload.get("section", ""), + "excerpt": payload.get("excerpt", ""), + "usage": payload.get("usage", ""), + "note": payload.get("note", ""), + }) + + return { + "ok": True, + "zotero_key": key, + "title": title, + "entries": entries, + "count": len(entries), + } + finally: + conn.close() + + def run(args: argparse.Namespace) -> int: vault = args.vault_path + if args.validate: + data = validate_reading_log(Path(args.validate)) + result = PFResult( + ok=data["ok"], command="reading-log", version=PF_VERSION, data=data, + ) + if args.json: + print(result.to_json()) + else: + if data["ok"]: + print(f"Valid. {data['papers_found']} papers, {data['entries_found']} entries.") + else: + print(f"{len(data['errors'])} error(s):") + for e in data["errors"]: + print(f" line {e['line']}: [{e['field']}] {e['message']}") + return 0 if data["ok"] else 1 + + if args.import_file: + data = import_reading_log(vault, Path(args.import_file)) + result = PFResult( + ok=data.get("ok", True), command="reading-log", version=PF_VERSION, data=data, + ) + if args.json: + print(result.to_json()) + else: + if data["ok"]: + print(f"Imported {data['entries_imported']} entries from {data['papers_imported']} papers.") + else: + print(f"Validation failed with {len(data.get('errors', []))} error(s).") + return 0 if data["ok"] else 1 + + if args.lookup: + data = lookup_paper_events(vault, args.lookup) + result = PFResult( + ok=data["ok"], command="reading-log", version=PF_VERSION, data=data, + ) + if args.json: + print(result.to_json()) + else: + if data["ok"]: + print(f"Paper: {data['title']} ({data['zotero_key']})") + print(f" {data['count']} reading notes:") + for e in data["entries"]: + print(f" [{e['created_at']}] 
{e['section']}: \"{e['excerpt']}\"") + if e["usage"]: + print(f" -> Usage: {e['usage']}") + if e["note"]: + print(f" -> Note: {e['note']}") + else: + print(f"No entries found for key: {args.lookup}") + return 0 + if args.paper_id and args.excerpt: ok = write_reading_note( vault, args.paper_id, args.section or "", From f747c18df075c9130c46096a73558c7ed3ab2856 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 16:13:38 +0800 Subject: [PATCH 056/132] feat(cli): migrate dashboard to SQLite with file scan fallback --- paperforge/commands/dashboard.py | 66 ++++++++++++++------------------ 1 file changed, 29 insertions(+), 37 deletions(-) diff --git a/paperforge/commands/dashboard.py b/paperforge/commands/dashboard.py index ac93890..6362d30 100644 --- a/paperforge/commands/dashboard.py +++ b/paperforge/commands/dashboard.py @@ -52,51 +52,39 @@ def run(args) -> int: def _dashboard_from_db(vault: Path) -> dict | None: - """Build dashboard stats from paperforge.db. Returns None if DB missing.""" - from paperforge.memory.db import get_connection, get_memory_db_path - - db_path = get_memory_db_path(vault) + """Build dashboard stats from paperforge.db. 
Returns None if DB unavailable.""" + from pathlib import Path as _P + db_path = vault / "System" / "PaperForge" / "indexes" / "paperforge.db" if not db_path.exists(): return None - conn = get_connection(db_path, read_only=True) try: - total = conn.execute("SELECT COUNT(*) FROM papers").fetchone()[0] - - pdf_healthy = conn.execute( - "SELECT COUNT(*) FROM papers WHERE lifecycle != 'indexed'" - ).fetchone()[0] - pdf_missing = conn.execute( - "SELECT COUNT(*) FROM papers WHERE lifecycle = 'indexed'" - ).fetchone()[0] - - ocr_done = conn.execute( - "SELECT COUNT(*) FROM papers WHERE ocr_status='done'" - ).fetchone()[0] - ocr_failed = conn.execute( - "SELECT COUNT(*) FROM papers WHERE ocr_status='failed'" - ).fetchone()[0] + import sqlite3 + conn = sqlite3.connect(str(db_path)) + conn.row_factory = sqlite3.Row + # Paper count + total = conn.execute("SELECT COUNT(*) as cnt FROM papers").fetchone()["cnt"] + # PDF health + pdf_healthy = conn.execute("SELECT COUNT(*) as cnt FROM papers WHERE has_pdf = 1 AND (ocr_status != 'failed' OR ocr_status IS NULL)").fetchone()["cnt"] + pdf_missing = conn.execute("SELECT COUNT(*) as cnt FROM papers WHERE has_pdf = 0").fetchone()["cnt"] + pdf_broken = total - pdf_healthy - pdf_missing + # OCR health + ocr_done = conn.execute("SELECT COUNT(*) as cnt FROM papers WHERE ocr_status = 'done'").fetchone()["cnt"] + ocr_failed = conn.execute("SELECT COUNT(*) as cnt FROM papers WHERE ocr_status IN ('failed','blocked')").fetchone()["cnt"] ocr_pending = total - ocr_done - ocr_failed - - domain_counts = { - r["domain"]: r["cnt"] - for r in conn.execute( - "SELECT domain, COUNT(*) as cnt FROM papers GROUP BY domain" - ).fetchall() - } - + # Domain counts + rows = conn.execute("SELECT domain, COUNT(*) as cnt FROM papers GROUP BY domain").fetchall() + domain_counts = {r["domain"]: r["cnt"] for r in rows} + conn.close() return { "stats": { "papers": total, - "pdf_health": {"healthy": pdf_healthy, "missing": pdf_missing, "broken": 0}, + "pdf_health": 
{"healthy": pdf_healthy, "missing": pdf_missing, "broken": pdf_broken}, "ocr_health": {"pending": ocr_pending, "done": ocr_done, "failed": ocr_failed}, "domain_counts": domain_counts, - "_source": "paperforge.db", }, } except Exception: return None - finally: - conn.close() def _check_permissions(vault: Path) -> dict: @@ -214,8 +202,12 @@ def _dashboard_from_files(vault: Path) -> dict: def _gather_dashboard_data(vault: Path) -> dict: - db_result = _dashboard_from_db(vault) - if db_result is not None: - db_result["permissions"] = _check_permissions(vault) - return db_result - return _dashboard_from_files(vault) + # Try DB first + data = _dashboard_from_db(vault) + if data is not None: + data["permissions"] = _check_permissions(vault) + return data + # Fallback to file scanning + data = _dashboard_from_files(vault) + data["permissions"] = _check_permissions(vault) + return data From d8c9cc3cef693eafe735f14deb05bf17031ee413 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 16:23:01 +0800 Subject: [PATCH 057/132] fix(cli): support multiline fields + bilingual labels in reading-log parser --- paperforge/commands/reading_log.py | 246 +++++++++++++---------------- 1 file changed, 113 insertions(+), 133 deletions(-) diff --git a/paperforge/commands/reading_log.py b/paperforge/commands/reading_log.py index 37b9b8a..a703198 100644 --- a/paperforge/commands/reading_log.py +++ b/paperforge/commands/reading_log.py @@ -15,49 +15,62 @@ _HEADER_RE = re.compile(r"^## ([A-Z0-9]{8}) \u2014 .+ \d{4}$") _TITLE_RE = re.compile(r"^\*\*Title:\*\* (.+)") _SECTION_RE = re.compile(r"^### (.+)") -_INFO_RE = re.compile(r'^\*\*Info:\*\* ["\u201c](.+)["\u201d]$') -_USE_RE = re.compile(r"^\*\*Use:\*\* (.+)") -_NOTE_RE = re.compile(r"^\*\*Note:\*\* (.+)") +_HR_RE = re.compile(r"^-{3,}$") +_FIELD_RE = re.compile(r"^\*\*([^:]+):\*\*") +_LABEL_INFO = frozenset({"Info", "信息"}) +_LABEL_USE = frozenset({"Use", "用途"}) +_LABEL_NOTE = frozenset({"Note", "备注"}) -def 
validate_reading_log(filepath: Path) -> dict: - """Parse a reading-log.md with strict format rules and return validation result.""" - errors: list[dict] = [] - papers_found = 0 - entries_found = 0 +def _strip_quotes(s: str) -> str: + if s.startswith('"') and s.endswith('"'): + return s[1:-1] + if len(s) >= 2 and s[0] == '\u201c' and s[-1] == '\u201d': + return s[1:-1] + return s + + +def _parse_reading_log(filepath: Path) -> dict: if not filepath.exists(): - return { - "ok": False, - "file": str(filepath), - "errors": [{"line": 0, "field": "file", "message": "File not found"}], - "papers_found": 0, - "entries_found": 0, - } + return {"ok": False, "papers": [], "errors": [{"line": 0, "field": "file", "message": "File not found"}]} content = filepath.read_text(encoding="utf-8") lines = content.splitlines() - current_paper: str | None = None + papers: list[dict] = [] + errors: list[dict] = [] + + current_paper: dict | None = None current_section: str | None = None - has_info = False - has_use = False - - def _check_section_end(ln: int) -> None: - nonlocal has_info, has_use - if current_section is not None: - if not has_info: - errors.append({ - "line": ln, "field": "entry.info", - "message": f"Missing **Info:** in section '{current_section}'", - }) - if not has_use: - errors.append({ - "line": ln, "field": "entry.use", - "message": f"Missing **Use:** in section '{current_section}'", - }) - has_info = False - has_use = False + current_fields: dict = {} + active_field: str | None = None + + def _flush_section(ln: int = 0): + nonlocal current_section, current_fields, active_field + if current_paper is not None and current_section is not None: + info_val = current_fields.get("info", "") + use_val = current_fields.get("use", "") + if not info_val: + errors.append({"line": ln, "field": "entry.info", "message": f"Missing **Info:** in section '{current_section}'"}) + if not use_val: + errors.append({"line": ln, "field": "entry.use", "message": f"Missing **Use:** in section 
'{current_section}'"}) + current_paper["sections"].append({ + "section_name": current_section, + "info": info_val, + "use": use_val, + "note": current_fields.get("note", ""), + }) + current_section = None + current_fields = {} + active_field = None + + def _flush_paper(ln: int = 0): + nonlocal current_paper + _flush_section(ln) + if current_paper: + papers.append(current_paper) + current_paper = None for i, line in enumerate(lines): ln = i + 1 @@ -67,130 +80,97 @@ def _check_section_end(ln: int) -> None: m = _HEADER_RE.match(stripped) if m: - _check_section_end(ln) - current_paper = m.group(1) - current_section = None - papers_found += 1 + _flush_paper(ln) + current_paper = {"paper_key": m.group(1), "title": "", "sections": []} continue m = _TITLE_RE.match(stripped) - if m: - if current_paper is None: - errors.append({ - "line": ln, "field": "paper.title", - "message": "**Title:** without paper header", - }) + if m and current_paper is not None: + current_paper["title"] = m.group(1) continue m = _SECTION_RE.match(stripped) if m: - if current_paper is None: - errors.append({ - "line": ln, "field": "section", - "message": "Section without paper header", - }) - continue - _check_section_end(ln) - current_section = m.group(1) - entries_found += 1 + if current_paper is not None: + _flush_section(ln) + current_section = m.group(1) continue - if current_section: - if _INFO_RE.match(stripped): - has_info = True + if current_paper is None or current_section is None: + continue + + if _HR_RE.match(stripped): + active_field = None + continue + + fm = _FIELD_RE.match(stripped) + if fm: + label = fm.group(1) + rest = stripped[fm.end():].strip() + if label in _LABEL_INFO: + active_field = "info" + current_fields["info"] = _strip_quotes(rest) + continue + if label in _LABEL_USE: + active_field = "use" + current_fields["use"] = _strip_quotes(rest) if rest else "" continue - if _USE_RE.match(stripped): - has_use = True + if label in _LABEL_NOTE: + active_field = "note" + 
current_fields["note"] = _strip_quotes(rest) if rest else "" continue - if _NOTE_RE.match(stripped): + if label == "Title": + active_field = None continue + active_field = None + continue - _check_section_end(len(lines) + 1) + if active_field: + existing = current_fields.get(active_field, "") + if existing: + current_fields[active_field] = existing + "\n" + stripped + else: + current_fields[active_field] = stripped + + _flush_paper(len(lines) + 1) + + return {"ok": len(errors) == 0, "papers": papers, "errors": errors} + +def validate_reading_log(filepath: Path) -> dict: + """Parse a reading-log.md with strict format rules and return validation result.""" + parsed = _parse_reading_log(filepath) return { - "ok": len(errors) == 0, + "ok": parsed["ok"], "file": str(filepath), - "errors": errors, - "papers_found": papers_found, - "entries_found": entries_found, + "errors": parsed["errors"], + "papers_found": len(parsed["papers"]), + "entries_found": sum(len(p["sections"]) for p in parsed["papers"]), } def import_reading_log(vault: Path, filepath: Path) -> dict: """Validate and import a reading-log.md into paper_events.""" - validation = validate_reading_log(filepath) - if not validation["ok"]: - return { - "ok": False, - "errors": validation["errors"], - "papers_imported": 0, - "entries_imported": 0, - } - - content = filepath.read_text(encoding="utf-8") - lines = content.splitlines() + parsed = _parse_reading_log(filepath) + if not parsed["ok"]: + return {"ok": False, "errors": parsed["errors"], "papers_imported": 0, "entries_imported": 0} papers_set: set[str] = set() entries_imported = 0 - current_paper: str | None = None - current_section: str | None = None - current_excerpt: str | None = None - current_usage: str | None = None - current_note: str | None = None - - def _flush_entry() -> None: - nonlocal papers_set, entries_imported - nonlocal current_excerpt, current_usage, current_note - if current_paper and current_section and current_excerpt and current_usage: - 
write_reading_note( - vault, current_paper, current_section, - current_excerpt, current_usage, current_note or "", - ) - entries_imported += 1 - papers_set.add(current_paper) - current_excerpt = None - current_usage = None - current_note = None - - for line in lines: - stripped = line.strip() - if not stripped: - continue - - m = _HEADER_RE.match(stripped) - if m: - _flush_entry() - current_paper = m.group(1) - current_section = None - continue - - m = _SECTION_RE.match(stripped) - if m: - _flush_entry() - current_section = m.group(1) - continue - - if current_section: - m = _INFO_RE.match(stripped) - if m: - current_excerpt = m.group(1) - continue - m = _USE_RE.match(stripped) - if m: - current_usage = m.group(1) - continue - m = _NOTE_RE.match(stripped) - if m: - current_note = m.group(1) - continue - _flush_entry() - - return { - "ok": True, - "papers_imported": len(papers_set), - "entries_imported": entries_imported, - } + for paper in parsed["papers"]: + for section in paper["sections"]: + info = section.get("info", "") + use = section.get("use", "") + if info and use: + write_reading_note( + vault, paper["paper_key"], section["section_name"], + info, use, section.get("note", "") or "", + ) + entries_imported += 1 + papers_set.add(paper["paper_key"]) + + return {"ok": True, "papers_imported": len(papers_set), "entries_imported": entries_imported} def lookup_paper_events(vault: Path, key: str) -> dict: From 7b3326db612833955960471b79a6c052afbfe60e Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 16:24:09 +0800 Subject: [PATCH 058/132] =?UTF-8?q?fix(skill):=20add=20language=20rule=20?= =?UTF-8?q?=E2=80=94=20field=20labels=20in=20English,=20content=20in=20use?= =?UTF-8?q?r's=20language?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paperforge/skills/logging/SKILL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paperforge/skills/logging/SKILL.md 
b/paperforge/skills/logging/SKILL.md index 74163c9..7f25e29 100644 --- a/paperforge/skills/logging/SKILL.md +++ b/paperforge/skills/logging/SKILL.md @@ -64,7 +64,7 @@ python $SKILL_DIR/scripts/pf_bootstrap.py ### Reading Log Format (MANDATORY) -When writing reading-log.md, use EXACTLY this format: +When writing reading-log.md, use EXACTLY this format. Field labels (`**Info:**`, `**Use:**`, `**Note:**`) must be in English, but the content (title, section name, excerpt, usage, note) should be in the same language as the user's conversation. ``` ## ABCDEFGH — Author Last Name et al. Year From 332558a72a73d0626f328748f75284bf55ba6c52 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 16:35:07 +0800 Subject: [PATCH 059/132] feat(plugin): auto-poll exports/ocr for seamless memory layer updates --- paperforge/plugin/main.js | 103 +++++++++++++++++++++++++++++++++++--- 1 file changed, 97 insertions(+), 6 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 6663e1d..4f5eb8a 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2526,12 +2526,6 @@ class PaperForgeSettingTab extends PluginSettingTab { _renderMemoryStatusText(el, text) { el.innerHTML = ''; el.createEl('span', { text: text, cls: 'paperforge-memory-text' }).style.cssText = 'flex:1;'; - const refreshBtn = el.createEl('button', { cls: 'paperforge-refresh-btn', text: '\u21BB' }); - refreshBtn.style.cssText = 'margin-left:auto; border:none; background:none; cursor:pointer; font-size:16px; padding:0 4px;'; - refreshBtn.onclick = () => { - this._memoryStatusText = null; - this.display(); - }; } _resolvePythonAsync(callback) { @@ -3826,6 +3820,11 @@ class PaperForgeSetupModal extends Modal { module.exports = class PaperForgePlugin extends Plugin { async onload() { await this.loadSettings(); + // ── Automatic file polling state ── + this._lastExportMtime = 0; + this._lastOcrMtimes = {}; + this._autoSyncRunning = false; + this._pollTimer = null; 
// Clean stale path fields from plugin data.json (migrated to paperforge.json) this.saveSettings(); T = (langFromApp(this.app) === 'zh') ? LANG.zh : LANG.en; @@ -3870,6 +3869,7 @@ module.exports = class PaperForgePlugin extends Plugin { if (this.settings.auto_update !== false && this.settings.setup_complete) { setTimeout(() => this._autoUpdate(), 3000); } + this._startFilePolling(); } _autoUpdate() { @@ -3909,6 +3909,96 @@ module.exports = class PaperForgePlugin extends Plugin { }); } + /* ── Automatic file polling for seamless memory layer ── */ + + _startFilePolling() { + const vaultPath = this.app.vault.adapter.basePath; + const fs = require('fs'); + const path = require('path'); + const { exec } = require('child_process'); + + this._pollTimer = setInterval(() => { + this._checkExports(vaultPath, fs, path, exec); + this._checkOcr(vaultPath, fs, path, exec); + }, 30000); // every 30 seconds + } + + _checkExports(vaultPath, fs, path, exec) { + if (this._autoSyncRunning) return; + const exportsDir = path.join(vaultPath, 'System', 'PaperForge', 'exports'); + if (!fs.existsSync(exportsDir)) return; + + let newestMtime = 0; + try { + fs.readdirSync(exportsDir).forEach(f => { + if (!f.endsWith('.json')) return; + const stat = fs.statSync(path.join(exportsDir, f)); + if (stat.mtimeMs > newestMtime) newestMtime = stat.mtimeMs; + }); + } catch(e) { return; } + + if (newestMtime > this._lastExportMtime) { + this._lastExportMtime = newestMtime; + this._autoRebuild(vaultPath, exec); + } + } + + _autoRebuild(vaultPath, exec) { + if (this._autoSyncRunning) return; + this._autoSyncRunning = true; + + const pyResult = resolvePythonExecutable(vaultPath, this.settings); + if (!pyResult.path) { this._autoSyncRunning = false; return; } + + const cmd = `"${pyResult.path}" -m paperforge --vault "${vaultPath}" memory build`; + exec(cmd, { timeout: 60000, encoding: 'utf-8' }, (err, stdout, stderr) => { + this._autoSyncRunning = false; + this._memoryStatusText = null; // force re-check 
next time + // Update last export mtime to avoid re-trigger during build + try { + const fs = require('fs'); + const path = require('path'); + const exportsDir = path.join(vaultPath, 'System', 'PaperForge', 'exports'); + let newest = 0; + fs.readdirSync(exportsDir).forEach(f => { + if (!f.endsWith('.json')) return; + newest = Math.max(newest, fs.statSync(path.join(exportsDir, f)).mtimeMs); + }); + this._lastExportMtime = newest; + } catch(e) {} + }); + } + + _checkOcr(vaultPath, fs, path, exec) { + if (this._autoSyncRunning) return; + const ocrDir = path.join(vaultPath, 'System', 'PaperForge', 'ocr'); + if (!fs.existsSync(ocrDir)) return; + + try { + fs.readdirSync(ocrDir, { withFileTypes: true }).forEach(entry => { + if (!entry.isDirectory()) return; + const metaPath = path.join(ocrDir, entry.name, 'meta.json'); + if (!fs.existsSync(metaPath)) return; + const stat = fs.statSync(metaPath); + const prevMtime = this._lastOcrMtimes[entry.name] || 0; + if (stat.mtimeMs <= prevMtime) return; + + this._lastOcrMtimes[entry.name] = stat.mtimeMs; + if (this._autoSyncRunning) return; + this._autoSyncRunning = true; + + const pyResult = resolvePythonExecutable(vaultPath, this.settings); + if (!pyResult.path) { this._autoSyncRunning = false; return; } + + const cmd = `"${pyResult.path}" -m paperforge --vault "${vaultPath}" sync --key "${entry.name}"`; + exec(cmd, { timeout: 30000, encoding: 'utf-8' }, () => { + this._autoSyncRunning = false; + this._memoryStatusText = null; + }); + }); + } catch(e) {} + } + /** * Read path configuration from the canonical paperforge.json file. * Falls back to Python-level DEFAULT_CONFIG values if file does not exist. 
@@ -4009,6 +4099,7 @@ module.exports = class PaperForgePlugin extends Plugin { } onunload() { + if (this._pollTimer) clearInterval(this._pollTimer); this.app.workspace.detachLeavesOfType(VIEW_TYPE_PAPERFORGE); } From d2c0c891ad504c67e674cf828beaee5973b1dbe6 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 16:52:10 +0800 Subject: [PATCH 060/132] docs(spec): 10-point memory layer performance optimization --- ...26-05-12-memory-layer-perf-optimization.md | 157 ++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 docs/superpowers/specs/2026-05-12-memory-layer-perf-optimization.md diff --git a/docs/superpowers/specs/2026-05-12-memory-layer-perf-optimization.md b/docs/superpowers/specs/2026-05-12-memory-layer-perf-optimization.md new file mode 100644 index 0000000..308fdd6 --- /dev/null +++ b/docs/superpowers/specs/2026-05-12-memory-layer-perf-optimization.md @@ -0,0 +1,157 @@ +# Memory Layer Performance Optimization — v1.5.8 + +> **Branch:** `feature/memory` | **From audit:** Round 3 | **All 10 items** + +## P0: Critical Efficiency Bugs + +### 1. refresh_paper: O(N) scan per single-paper update + +**File:** `paperforge/memory/refresh.py:25` + +**Problem:** `refresh_paper()` calls `read_index(vault)` which parses the full 5-10 MB `formal-library.json` every time, then does a linear scan to find one paper. + +**Fix:** Add a lightweight `read_index_entry(vault, key) -> dict | None` function that: +- Opens `formal-library.json` +- Uses `ijson` or streaming parse, OR +- Loads only the `items` list and does a dict lookup by `zotero_key` + +Alternative (simpler): change `refresh_paper()` signature to accept the entry dict directly from the caller. Caller already has the entry (from sync or OCR completion event). Don't re-read the file. 
+ +```python +# New signature +def refresh_paper(vault: Path, entry: dict) -> bool: + # entry is already resolved by caller, skip read_index +``` + +Callers updated: +- `sync_service.run()` → after `_build_entry()`, call `refresh_paper(vault, entry)` +- `commands/ocr.py` → after OCR completes, get entry from OCR context, call `refresh_paper(vault, entry)` +- `commands/finalize.py` → after deep-finalize, use the entry dict + +### 2. FTS Double-Insert + +**File:** `paperforge/memory/builder.py:97,150-158` + `paperforge/memory/schema.py:104` + +**Problem:** `papers_ai` trigger fires on `INSERT OR REPLACE INTO papers`, writing a row to `paper_fts`. Then the manual `INSERT INTO paper_fts` (line 150) tries to write AGAIN — IntegrityError caught silently. + +**Fix:** In `build_from_index()`: +- Before the paper loop: `conn.execute("DROP TRIGGER IF EXISTS papers_ai")` +- After the paper loop: re-create the trigger from schema +- Remove the manual `INSERT INTO paper_fts` (lines 150-158) + +Similarly in `refresh.py`: +- Drop trigger before upsert, re-create after (or just use manual FTS + no trigger) + +### 3. _autoRebuild does full build on every change + +**File:** `paperforge/plugin/main.js:_autoRebuild()` + +**Problem:** Runs `memory build` (full rebuild) on ANY export change. One new paper = 150 papers re-indexed. + +**Fix:** Never trigger full `memory build` from auto-poll. Instead: +- On export change: run `paperforge sync --auto` (incremental sync only) +- The sync command already calls `refresh_paper()` internally for new/changed papers +- Only run `memory build` on first install or when user explicitly requests it + +Change `_autoRebuild()` to `_autoSync()`: +```javascript +const cmd = `"${pyResult.path}" -m paperforge --vault "${vaultPath}" sync`; +``` + +## P1: Redundancy Elimination + +### 4. 
Frontmatter read 3 times per paper + +**File:** `paperforge/worker/asset_index.py:325-343` + +**Problem:** `_build_entry()` calls `read_frontmatter()` 3 separate times for `do_ocr`, `analyze`, `deep_reading_status` from the same file. + +**Fix:** Add helper at module level: +```python +def _get_frontmatter_values(note_path: Path) -> dict: + """Read frontmatter once, return {do_ocr, analyze, deep_reading_status}.""" + fm = read_frontmatter(note_path) + return { + "do_ocr": fm.get("do_ocr"), + "analyze": fm.get("analyze"), + "deep_reading_status": fm.get("deep_reading_status"), + } +``` + +### 5. Duplicate PAPER_COLUMNS logic + +**File:** `paperforge/memory/builder.py:109-139` + `paperforge/memory/refresh.py:52-74` + +**Problem:** Identical ~30 lines of column-value mapping. + +**Fix:** Extract to `paperforge/memory/_columns.py`: +```python +def build_paper_row(entry: dict, generated_at: str) -> dict: + # single source of truth for papers table columns + ... +``` +Import from both builder.py and refresh.py. + +### 6. Dashboard 6 SELECTs → 2 + +**File:** `paperforge/commands/dashboard.py:65-75` + +**Fix:** +```sql +-- Query 1: combined pdf+ocr health +SELECT has_pdf, + CASE WHEN ocr_status = 'done' THEN 'done' + WHEN ocr_status IN ('failed','blocked') THEN 'failed' + ELSE 'pending' END as ocr_state, + COUNT(*) as cnt +FROM papers GROUP BY has_pdf, ocr_state; + +-- Query 2: domain counts (unchanged) +SELECT domain, COUNT(*) FROM papers GROUP BY domain; +``` + +## P2: General Optimization + +### 7. Per-row INSERT → executemany + +**File:** `paperforge/memory/builder.py:143` + +**Fix:** Collect paper rows, asset rows, alias rows in lists. Use `executemany()`: +```python +conn.executemany("INSERT OR REPLACE INTO papers (...) VALUES (...)", paper_rows) +conn.executemany("INSERT OR REPLACE INTO paper_assets (...) VALUES (...)", asset_rows) +``` + +### 8. 
Poll interval 30s → 120s + +**File:** `paperforge/plugin/main.js:3920` + +**Fix:** Change `setInterval(..., 30000)` to `setInterval(..., 120000)`. + +### 9. _build_entry: 10 file reads per paper + +**File:** `paperforge/worker/asset_index.py:_build_entry()` + +**Problem:** Multiple `.exists()`, `.read_text()`, frontmatter reads per paper. + +**Fix:** Combine the `_legacy_control_flags` + `do_ocr` + `analyze` + `deep_reading_status` into one pass. Don't check `note_path.exists()` when `main_note_path.exists()` — if main exists, use it; only fall back to note_path when main doesn't exist. + +### 10. formal-library.json read-once pipeline + +**Problem:** `formal-library.json` is parsed by 5+ different modules during a sync→build→dashboard cycle. + +**Fix:** Not urgent — each module reads it independently for isolation. This is acceptable for reliability. Could optimize later with in-memory cache, but at risk of staleness. + +## Implementation Order + +1. P0 #1: refresh_paper accept entry dict (changes refresh.py + callers) +2. P0 #2: FTS trigger removal + manual-only insert +3. P0 #3: _autoRebuild → _autoSync +4. P1 #5: Extract PAPER_COLUMNS helper +5. P1 #4: Single frontmatter parse +6. P1 #6: Dashboard query merge +7. P2 #7: executemany batching +8. P2 #8: Poll interval +9. P2 #9: File read consolidation + +Each step: modify → test → commit. 
From e91dfd8c450cd6ba7ae42c58088a52ca57172a7c Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 16:58:39 +0800 Subject: [PATCH 061/132] perf: refresh_paper accepts entry dict, skip full index read --- paperforge/memory/refresh.py | 43 +++++++++++++++---------------- paperforge/worker/asset_index.py | 29 +++++++++++++++------ tests/unit/memory/test_refresh.py | 10 ++++++- 3 files changed, 51 insertions(+), 31 deletions(-) diff --git a/paperforge/memory/refresh.py b/paperforge/memory/refresh.py index ebd033a..1998d6a 100644 --- a/paperforge/memory/refresh.py +++ b/paperforge/memory/refresh.py @@ -1,18 +1,17 @@ from __future__ import annotations import json +from datetime import datetime, timezone from pathlib import Path from paperforge.memory.builder import ( - PAPER_COLUMNS, - ASSET_FIELDS, ALIAS_TYPES, - compute_hash, + ASSET_FIELDS, + PAPER_COLUMNS, _resolve_vault_path, ) from paperforge.memory.db import get_connection, get_memory_db_path from paperforge.memory.schema import ensure_schema -from paperforge.worker.asset_index import read_index from paperforge.worker.asset_state import ( compute_lifecycle, compute_maturity, @@ -20,22 +19,13 @@ ) -def refresh_paper(vault: Path, zotero_key: str) -> bool: - """Incrementally refresh one paper in paperforge.db from formal-library.json.""" - envelope = read_index(vault) - if not envelope: - return False - items = envelope if isinstance(envelope, list) else envelope.get("items", []) - - entry = None - for e in items: - if e.get("zotero_key") == zotero_key: - entry = e - break - if not entry: +def refresh_paper(vault: Path, entry: dict) -> bool: + """Upsert a single paper into memory DB. 
Entry is from _build_entry() output.""" + zotero_key = entry.get("zotero_key", "") + if not zotero_key: return False - generated_at = envelope.get("generated_at", "") if not isinstance(envelope, list) else "" + generated_at = datetime.now(timezone.utc).isoformat() db_path = get_memory_db_path(vault) if not db_path.exists(): @@ -116,10 +106,19 @@ def refresh_paper(vault: Path, zotero_key: str) -> bool: conn.execute( "INSERT INTO paper_fts(rowid, zotero_key, citation_key, title, first_author, authors_json, abstract, journal, domain, collection_path, collections_json) " "VALUES ((SELECT rowid FROM papers WHERE zotero_key = ?), ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", - (zotero_key, zotero_key, entry.get("citation_key", ""), entry.get("title", ""), - entry.get("first_author", ""), paper_values["authors_json"], - entry.get("abstract", ""), entry.get("journal", ""), entry.get("domain", ""), - entry.get("collection_path", ""), paper_values["collections_json"]), + ( + zotero_key, + zotero_key, + entry.get("citation_key", ""), + entry.get("title", ""), + entry.get("first_author", ""), + paper_values["authors_json"], + entry.get("abstract", ""), + entry.get("journal", ""), + entry.get("domain", ""), + entry.get("collection_path", ""), + paper_values["collections_json"], + ), ) except Exception: pass # FTS may not be available diff --git a/paperforge/worker/asset_index.py b/paperforge/worker/asset_index.py index ed9e751..6bdbd99 100644 --- a/paperforge/worker/asset_index.py +++ b/paperforge/worker/asset_index.py @@ -37,7 +37,6 @@ from paperforge import __version__ as _paperforge_version from paperforge.adapters.obsidian_frontmatter import ( _legacy_control_flags, - read_frontmatter_bool, read_frontmatter_dict, read_frontmatter_optional_bool, ) @@ -230,7 +229,10 @@ def _build_entry(item: dict, vault: Path, paths: dict, domain: str, zotero_dir: Lazy imports inside avoid circular dependencies with ``sync.py``. 
""" # Lazy imports to avoid circular deps with sync.py - from paperforge.worker._utils import read_json, slugify_filename, write_json, yaml_quote + import shutil + + from paperforge import __version__ as PAPERFORGE_VERSION + from paperforge.worker._utils import lookup_impact_factor, read_json, slugify_filename, write_json, yaml_quote from paperforge.worker.asset_state import ( compute_health, compute_lifecycle, @@ -246,9 +248,6 @@ def _build_entry(item: dict, vault: Path, paths: dict, domain: str, zotero_dir: obsidian_wikilink_for_path, obsidian_wikilink_for_pdf, ) - from paperforge.worker._utils import lookup_impact_factor - from paperforge import __version__ as PAPERFORGE_VERSION - import shutil key = item["key"] collection_meta = collection_fields(item.get("collections", [])) @@ -291,8 +290,8 @@ def _build_entry(item: dict, vault: Path, paths: dict, domain: str, zotero_dir: if "aliases:" not in text[: text.find("\n---", 4)]: alias_line = f"aliases: [{yaml_quote(item.get('title', ''))}, {yaml_quote(item.get('citation_key') or item.get('key', ''))}]\n" text = re.sub( - r'(^title:.*\n)', - r'\1' + alias_line, + r"(^title:.*\n)", + r"\1" + alias_line, text, count=1, flags=re.MULTILINE, @@ -338,6 +337,7 @@ def _read_fm_str(fp: Path, key: str) -> str: return str(fm.get(key, "")).strip() except Exception: return "" + note_dr = _read_fm_str(main_note_path, "deep_reading_status") if not note_dr: note_dr = _read_fm_str(note_path, "deep_reading_status") @@ -415,7 +415,7 @@ def _read_fm_str(fp: Path, key: str) -> str: text = main_note_path.read_text(encoding="utf-8") fm_close = text.find("---\n", 4) # closing --- after opening --- if fm_close != -1: - body = text[fm_close + 4:] # everything after frontmatter + body = text[fm_close + 4 :] # everything after frontmatter new_full = frontmatter_note(entry, "") new_fm_close = new_full.find("---\n", 4) if new_fm_close != -1: @@ -492,6 +492,12 @@ def build_index(vault: Path, verbose: bool = False) -> int: for item in export_rows: 
entry = _build_entry(item, vault, paths, domain, zotero_dir) index_rows.append(entry) + try: + from paperforge.memory.refresh import refresh_paper + + refresh_paper(vault, entry) + except Exception: + pass # memory DB refresh is best-effort # Atomically write the envelope-wrapped index index_path = paths["index"] @@ -572,6 +578,13 @@ def refresh_index_entry(vault: Path, key: str) -> bool: # Build single entry and update the items list new_entry = _build_entry(found_item, vault, paths, found_domain, zotero_dir) + try: + from paperforge.memory.refresh import refresh_paper + + refresh_paper(vault, new_entry) + except Exception: + pass # memory DB refresh is best-effort + replaced = False for i, existing_entry in enumerate(items): if existing_entry.get("zotero_key") == key: diff --git a/tests/unit/memory/test_refresh.py b/tests/unit/memory/test_refresh.py index 4f1bd2e..103440f 100644 --- a/tests/unit/memory/test_refresh.py +++ b/tests/unit/memory/test_refresh.py @@ -6,4 +6,12 @@ def test_refresh_paper_returns_false_when_no_db(): - assert refresh_paper(Path("/nonexistent/vault"), "KEY001") is False + assert refresh_paper(Path("/nonexistent/vault"), {"zotero_key": "KEY001"}) is False + + +def test_refresh_paper_returns_false_for_empty_key(): + assert refresh_paper(Path("/nonexistent/vault"), {}) is False + + +def test_refresh_paper_returns_false_for_missing_key(): + assert refresh_paper(Path("/nonexistent/vault"), {"title": "No Key"}) is False From 5ff3140a80e8b24360d7971e976479f6191d4b90 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 17:01:28 +0800 Subject: [PATCH 062/132] perf: fix FTS double-insert, drop trigger before bulk insert --- paperforge/literature-qa-harness.md | 49 +++++++++++++++++++++++++++++ paperforge/memory/builder.py | 20 +++++------- paperforge/memory/refresh.py | 44 +++++++++++++------------- paperforge/memory/schema.py | 10 +++--- 4 files changed, 84 insertions(+), 39 deletions(-) create mode 100644 
paperforge/literature-qa-harness.md diff --git a/paperforge/literature-qa-harness.md b/paperforge/literature-qa-harness.md new file mode 100644 index 0000000..de6f668 --- /dev/null +++ b/paperforge/literature-qa-harness.md @@ -0,0 +1,49 @@ +# Literature QA Harness Redesign + +> **问题:** Skill 里写条件路由 `if memory else grep`,笨 agent 可能读错分支。 + +## 解法:技能内不做路由,路由下沉到脚本 + +``` +┌──────────────────────────────────────┐ +│ SKILL.md │ +│ │ +│ 1. bootstrap → 记下 $PYTHON $VAULT │ +│ 2. 跑 pf_search.py, 读 JSON 输出 │ +│ 3. 格式化结果给用户 │ +│ │ +│ agent 不用做决策,只是执行步骤 │ +└─────────────┬────────────────────────┘ + │ 调用 + ▼ +┌──────────────────────────────────────┐ +│ scripts/pf_search.py │ +│ │ +│ memory_layer.enabled? │ +│ ├─ YES → paperforge search --json │ +│ └─ NO → grep -r .md files │ +│ │ +│ 输出: 统一 JSON 格式 │ +│ { results: [{key, title, ...}] } │ +└──────────────────────────────────────┘ +``` + +## 核心原则 + +| | 错误的做法 | 正确的做法 | +|---|---|---| +| 决策位置 | SKILL.md 里写 if/else | 脚本里路由,skill 只调用脚本 | +| 输出格式 | 自由格式文字 | 脚本返回统一 JSON,agent 格式化 | +| 重复 | 每个 skill 自己写 grep | 共享 `pf_query.py` 统一入口 | +| 笨 agent | 可能走错分支 | 只执行 `run script → read output` | + +## Harness 三件套 + +``` +scripts/ +├── pf_bootstrap.py # 已有 — 输出 vault + python + memory 状态 +├── pf_search.py # 新 — 统一搜索入口(memory→sqlite, no memory→grep) +└── pf_context.py # 新 — 统一上下文入口(paper-status / agent-context) +``` + +每次 skill 调用都走 `bootstrap → 脚本 → 格式化结果` 流程,agent 不需要做任何条件判断。 diff --git a/paperforge/memory/builder.py b/paperforge/memory/builder.py index 2ac90a8..06ea40c 100644 --- a/paperforge/memory/builder.py +++ b/paperforge/memory/builder.py @@ -11,6 +11,7 @@ from paperforge.memory.db import get_connection, get_memory_db_path from paperforge.memory.schema import ( CURRENT_SCHEMA_VERSION, + PAPERS_AI_TRIGGER, clear_fts, drop_all_tables, ensure_schema, @@ -96,6 +97,8 @@ def build_from_index(vault: Path) -> dict: clear_fts(conn) + conn.execute("DROP TRIGGER IF EXISTS papers_ai") + now_utc = datetime.now(timezone.utc).isoformat() papers_count = 
0 assets_count = 0 @@ -146,18 +149,6 @@ def build_from_index(vault: Path) -> dict: ) papers_count += 1 - try: - conn.execute( - """INSERT INTO paper_fts(rowid, zotero_key, citation_key, title, first_author, authors_json, abstract, journal, domain, collection_path, collections_json) - VALUES ((SELECT rowid FROM papers WHERE zotero_key = ?), ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", - (zotero_key, zotero_key, entry.get("citation_key", ""), entry.get("title", ""), - entry.get("first_author", ""), paper_values.get("authors_json", ""), - entry.get("abstract", ""), entry.get("journal", ""), entry.get("domain", ""), - entry.get("collection_path", ""), paper_values.get("collections_json", "")), - ) - except sqlite3.IntegrityError: - pass # duplicate rowid if FTS trigger already fired - for asset_type, entry_field in ASSET_FIELDS: path_val = entry.get(entry_field, "") if not path_val: @@ -199,6 +190,11 @@ def build_from_index(vault: Path) -> dict: ) aliases_count += 1 + conn.execute("""INSERT INTO paper_fts(rowid, zotero_key, citation_key, title, first_author, authors_json, abstract, journal, domain, collection_path, collections_json) + SELECT rowid, zotero_key, citation_key, title, first_author, authors_json, abstract, journal, domain, collection_path, collections_json + FROM papers""") + conn.execute(PAPERS_AI_TRIGGER) + meta_upserts = [ ("schema_version", str(CURRENT_SCHEMA_VERSION)), ("paperforge_version", PF_VERSION), diff --git a/paperforge/memory/refresh.py b/paperforge/memory/refresh.py index 1998d6a..f0e7da0 100644 --- a/paperforge/memory/refresh.py +++ b/paperforge/memory/refresh.py @@ -11,7 +11,7 @@ _resolve_vault_path, ) from paperforge.memory.db import get_connection, get_memory_db_path -from paperforge.memory.schema import ensure_schema +from paperforge.memory.schema import PAPERS_AI_TRIGGER, ensure_schema from paperforge.worker.asset_state import ( compute_lifecycle, compute_maturity, @@ -35,6 +35,8 @@ def refresh_paper(vault: Path, entry: dict) -> bool: try: 
ensure_schema(conn) + conn.execute("DROP TRIGGER IF EXISTS papers_ai") + lifecycle = str(compute_lifecycle(entry)) maturity = compute_maturity(entry) next_step = str(compute_next_step(entry)) @@ -100,28 +102,24 @@ def refresh_paper(vault: Path, entry: dict) -> bool: (zotero_key, raw_str, raw_str.lower().strip(), alias_type), ) - # Re-index FTS - try: - conn.execute("DELETE FROM paper_fts WHERE zotero_key = ?", (zotero_key,)) - conn.execute( - "INSERT INTO paper_fts(rowid, zotero_key, citation_key, title, first_author, authors_json, abstract, journal, domain, collection_path, collections_json) " - "VALUES ((SELECT rowid FROM papers WHERE zotero_key = ?), ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", - ( - zotero_key, - zotero_key, - entry.get("citation_key", ""), - entry.get("title", ""), - entry.get("first_author", ""), - paper_values["authors_json"], - entry.get("abstract", ""), - entry.get("journal", ""), - entry.get("domain", ""), - entry.get("collection_path", ""), - paper_values["collections_json"], - ), - ) - except Exception: - pass # FTS may not be available + conn.execute( + "INSERT INTO paper_fts(rowid, zotero_key, citation_key, title, first_author, authors_json, abstract, journal, domain, collection_path, collections_json) " + "VALUES ((SELECT rowid FROM papers WHERE zotero_key = ?), ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + ( + zotero_key, + zotero_key, + entry.get("citation_key", ""), + entry.get("title", ""), + entry.get("first_author", ""), + paper_values["authors_json"], + entry.get("abstract", ""), + entry.get("journal", ""), + entry.get("domain", ""), + entry.get("collection_path", ""), + paper_values["collections_json"], + ), + ) + conn.execute(PAPERS_AI_TRIGGER) conn.commit() return True diff --git a/paperforge/memory/schema.py b/paperforge/memory/schema.py index dc3b7c0..e9695e7 100644 --- a/paperforge/memory/schema.py +++ b/paperforge/memory/schema.py @@ -100,11 +100,13 @@ ); """ +PAPERS_AI_TRIGGER = """CREATE TRIGGER IF NOT EXISTS papers_ai AFTER INSERT ON papers 
BEGIN + INSERT INTO paper_fts(rowid, zotero_key, citation_key, title, first_author, authors_json, abstract, journal, domain, collection_path, collections_json) + VALUES (new.rowid, new.zotero_key, new.citation_key, new.title, new.first_author, new.authors_json, new.abstract, new.journal, new.domain, new.collection_path, new.collections_json); +END;""" + FTS_TRIGGERS = [ - """CREATE TRIGGER IF NOT EXISTS papers_ai AFTER INSERT ON papers BEGIN - INSERT INTO paper_fts(rowid, zotero_key, citation_key, title, first_author, authors_json, abstract, journal, domain, collection_path, collections_json) - VALUES (new.rowid, new.zotero_key, new.citation_key, new.title, new.first_author, new.authors_json, new.abstract, new.journal, new.domain, new.collection_path, new.collections_json); - END;""", + PAPERS_AI_TRIGGER, """CREATE TRIGGER IF NOT EXISTS papers_ad AFTER DELETE ON papers BEGIN INSERT INTO paper_fts(paper_fts, rowid, zotero_key, citation_key, title, first_author, authors_json, abstract, journal, domain, collection_path, collections_json) VALUES ('delete', old.rowid, old.zotero_key, old.citation_key, old.title, old.first_author, old.authors_json, old.abstract, old.journal, old.domain, old.collection_path, old.collections_json); From 748f32b2829286d2534d789c7d0dfc62e282901e Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 17:02:31 +0800 Subject: [PATCH 063/132] =?UTF-8?q?perf:=20autoSync=20runs=20sync=20not=20?= =?UTF-8?q?full=20build,=2030s=E2=86=92120s=20poll,=20remove=20unused=20ha?= =?UTF-8?q?rness=20file?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paperforge/literature-qa-harness.md | 49 ----------------------------- paperforge/plugin/main.js | 10 +++--- 2 files changed, 5 insertions(+), 54 deletions(-) delete mode 100644 paperforge/literature-qa-harness.md diff --git a/paperforge/literature-qa-harness.md b/paperforge/literature-qa-harness.md deleted file mode 100644 index de6f668..0000000 
--- a/paperforge/literature-qa-harness.md +++ /dev/null @@ -1,49 +0,0 @@ -# Literature QA Harness Redesign - -> **问题:** Skill 里写条件路由 `if memory else grep`,笨 agent 可能读错分支。 - -## 解法:技能内不做路由,路由下沉到脚本 - -``` -┌──────────────────────────────────────┐ -│ SKILL.md │ -│ │ -│ 1. bootstrap → 记下 $PYTHON $VAULT │ -│ 2. 跑 pf_search.py, 读 JSON 输出 │ -│ 3. 格式化结果给用户 │ -│ │ -│ agent 不用做决策,只是执行步骤 │ -└─────────────┬────────────────────────┘ - │ 调用 - ▼ -┌──────────────────────────────────────┐ -│ scripts/pf_search.py │ -│ │ -│ memory_layer.enabled? │ -│ ├─ YES → paperforge search --json │ -│ └─ NO → grep -r .md files │ -│ │ -│ 输出: 统一 JSON 格式 │ -│ { results: [{key, title, ...}] } │ -└──────────────────────────────────────┘ -``` - -## 核心原则 - -| | 错误的做法 | 正确的做法 | -|---|---|---| -| 决策位置 | SKILL.md 里写 if/else | 脚本里路由,skill 只调用脚本 | -| 输出格式 | 自由格式文字 | 脚本返回统一 JSON,agent 格式化 | -| 重复 | 每个 skill 自己写 grep | 共享 `pf_query.py` 统一入口 | -| 笨 agent | 可能走错分支 | 只执行 `run script → read output` | - -## Harness 三件套 - -``` -scripts/ -├── pf_bootstrap.py # 已有 — 输出 vault + python + memory 状态 -├── pf_search.py # 新 — 统一搜索入口(memory→sqlite, no memory→grep) -└── pf_context.py # 新 — 统一上下文入口(paper-status / agent-context) -``` - -每次 skill 调用都走 `bootstrap → 脚本 → 格式化结果` 流程,agent 不需要做任何条件判断。 diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 4f5eb8a..e4ccc6c 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -3920,7 +3920,7 @@ module.exports = class PaperForgePlugin extends Plugin { this._pollTimer = setInterval(() => { this._checkExports(vaultPath, fs, path, exec); this._checkOcr(vaultPath, fs, path, exec); - }, 30000); // every 30 seconds + }, 120000); // every 120 seconds } _checkExports(vaultPath, fs, path, exec) { @@ -3939,19 +3939,19 @@ module.exports = class PaperForgePlugin extends Plugin { if (newestMtime > this._lastExportMtime) { this._lastExportMtime = newestMtime; - this._autoRebuild(vaultPath, exec); + this._autoSync(vaultPath, exec); } } - _autoRebuild(vaultPath, 
exec) { + _autoSync(vaultPath, exec) { if (this._autoSyncRunning) return; this._autoSyncRunning = true; const pyResult = resolvePythonExecutable(vaultPath, this.settings); if (!pyResult.path) { this._autoSyncRunning = false; return; } - const cmd = `"${pyResult.path}" -m paperforge --vault "${vaultPath}" memory build`; - exec(cmd, { timeout: 60000, encoding: 'utf-8' }, (err, stdout, stderr) => { + const cmd = `"${pyResult.path}" -m paperforge --vault "${vaultPath}" sync`; + exec(cmd, { timeout: 120000, encoding: 'utf-8' }, (err, stdout, stderr) => { this._autoSyncRunning = false; this._memoryStatusText = null; // force re-check next time // Update last export mtime to avoid re-trigger during build From 8babd361f2794e4b0d86dbd10a446958dd95efac Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 17:03:55 +0800 Subject: [PATCH 064/132] perf: single frontmatter parse in _build_entry --- paperforge/worker/asset_index.py | 36 +++++++++++++------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/paperforge/worker/asset_index.py b/paperforge/worker/asset_index.py index 6bdbd99..1565bd7 100644 --- a/paperforge/worker/asset_index.py +++ b/paperforge/worker/asset_index.py @@ -38,7 +38,6 @@ from paperforge.adapters.obsidian_frontmatter import ( _legacy_control_flags, read_frontmatter_dict, - read_frontmatter_optional_bool, ) from paperforge.config import paperforge_paths @@ -321,26 +320,21 @@ def _build_entry(item: dict, vault: Path, paths: dict, domain: str, zotero_dir: legacy_flags = _legacy_control_flags(paths, key) legacy_do_ocr = legacy_flags.get("do_ocr") legacy_analyze = legacy_flags.get("analyze") - note_do_ocr = read_frontmatter_optional_bool(main_note_path, "do_ocr") - if note_do_ocr is None: - note_do_ocr = read_frontmatter_optional_bool(note_path, "do_ocr") - note_analyze = read_frontmatter_optional_bool(main_note_path, "analyze") - if note_analyze is None: - note_analyze = read_frontmatter_optional_bool(note_path, 
"analyze") - - # deep_reading_status: frontmatter first (finalize.py sets it), body detection fallback (sync ensures it) - def _read_fm_str(fp: Path, key: str) -> str: - if not fp or not fp.exists(): - return "" - try: - fm = read_frontmatter_dict(fp.read_text(encoding="utf-8")) - return str(fm.get(key, "")).strip() - except Exception: - return "" - - note_dr = _read_fm_str(main_note_path, "deep_reading_status") - if not note_dr: - note_dr = _read_fm_str(note_path, "deep_reading_status") + # Single frontmatter read for all control fields + fm = {} + for fp in (main_note_path, note_path): + if fp and fp.exists(): + try: + fm = read_frontmatter_dict(fp.read_text(encoding="utf-8")) + break + except Exception: + continue + + _v = fm.get("do_ocr") + note_do_ocr = _v if isinstance(_v, bool) else None + _v = fm.get("analyze") + note_analyze = _v if isinstance(_v, bool) else None + note_dr = str(fm.get("deep_reading_status", "")).strip() do_ocr_value = note_do_ocr if note_do_ocr is not None else legacy_do_ocr if do_ocr_value is None: From 2dd6b0d86b4507521f2bd2b453278b0b389ce02a Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 17:07:14 +0800 Subject: [PATCH 065/132] refactor: extract build_paper_row to shared _columns module --- paperforge/memory/_columns.py | 43 +++++++++++++++++++++++++++++++ paperforge/memory/builder.py | 48 ++++------------------------------- paperforge/memory/refresh.py | 34 ++++--------------------- 3 files changed, 53 insertions(+), 72 deletions(-) create mode 100644 paperforge/memory/_columns.py diff --git a/paperforge/memory/_columns.py b/paperforge/memory/_columns.py new file mode 100644 index 0000000..61c0ce9 --- /dev/null +++ b/paperforge/memory/_columns.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +import json + + +PAPER_COLUMNS = [ + "zotero_key", "citation_key", "title", "year", "doi", "pmid", + "journal", "first_author", "authors_json", "abstract", "domain", + "collection_path", "collections_json", + 
"has_pdf", "do_ocr", "analyze", "ocr_status", "deep_reading_status", + "ocr_job_id", "impact_factor", + "lifecycle", "maturity_level", "maturity_name", "next_step", + "pdf_path", "note_path", "main_note_path", "paper_root", + "fulltext_path", "ocr_md_path", "ocr_json_path", "ai_path", + "deep_reading_md_path", "updated_at", +] + + +def build_paper_row(entry: dict, generated_at: str) -> dict: + row = {} + for col in PAPER_COLUMNS: + if col == "authors_json": + row[col] = json.dumps(entry.get("authors", []), ensure_ascii=False) + elif col == "collections_json": + row[col] = json.dumps(entry.get("collections", []), ensure_ascii=False) + elif col == "lifecycle": + row[col] = entry.get("lifecycle", "") + elif col == "maturity_level": + row[col] = entry.get("maturity", {}).get("level", 1) + elif col == "maturity_name": + row[col] = entry.get("maturity", {}).get("level_name", "") + elif col == "next_step": + row[col] = entry.get("next_step", "") + elif col == "updated_at": + row[col] = generated_at + elif col in ("do_ocr", "analyze"): + val = entry.get(col) + row[col] = 1 if val else 0 + elif col == "has_pdf": + row[col] = 1 if entry.get("has_pdf") else 0 + else: + row[col] = entry.get(col, "") + return row diff --git a/paperforge/memory/builder.py b/paperforge/memory/builder.py index 06ea40c..86bade4 100644 --- a/paperforge/memory/builder.py +++ b/paperforge/memory/builder.py @@ -8,6 +8,7 @@ from pathlib import Path from paperforge import __version__ as PF_VERSION +from paperforge.memory._columns import PAPER_COLUMNS, build_paper_row from paperforge.memory.db import get_connection, get_memory_db_path from paperforge.memory.schema import ( CURRENT_SCHEMA_VERSION, @@ -26,18 +27,6 @@ logger = logging.getLogger(__name__) -PAPER_COLUMNS = [ - "zotero_key", "citation_key", "title", "year", "doi", "pmid", - "journal", "first_author", "authors_json", "abstract", "domain", - "collection_path", "collections_json", - "has_pdf", "do_ocr", "analyze", "ocr_status", 
"deep_reading_status", - "ocr_job_id", "impact_factor", - "lifecycle", "maturity_level", "maturity_name", "next_step", - "pdf_path", "note_path", "main_note_path", "paper_root", - "fulltext_path", "ocr_md_path", "ocr_json_path", "ai_path", - "deep_reading_md_path", "updated_at", -] - ASSET_FIELDS = [ ("pdf", "pdf_path"), ("formal_note", "note_path"), @@ -109,37 +98,10 @@ def build_from_index(vault: Path) -> dict: if not zotero_key: continue - lifecycle = str(compute_lifecycle(entry)) - maturity = compute_maturity(entry) - next_step = str(compute_next_step(entry)) - - paper_values = {} - for col in PAPER_COLUMNS: - if col == "authors_json": - paper_values[col] = json.dumps( - entry.get("authors", []), ensure_ascii=False - ) - elif col == "collections_json": - paper_values[col] = json.dumps( - entry.get("collections", []), ensure_ascii=False - ) - elif col == "lifecycle": - paper_values[col] = lifecycle - elif col == "maturity_level": - paper_values[col] = maturity.get("level", 1) - elif col == "maturity_name": - paper_values[col] = maturity.get("level_name", "") - elif col == "next_step": - paper_values[col] = next_step - elif col == "updated_at": - paper_values[col] = generated_at - elif col in ("do_ocr", "analyze"): - val = entry.get(col) - paper_values[col] = 1 if val else 0 - elif col == "has_pdf": - paper_values[col] = 1 if entry.get("has_pdf") else 0 - else: - paper_values[col] = entry.get(col, "") + entry["lifecycle"] = str(compute_lifecycle(entry)) + entry["maturity"] = compute_maturity(entry) + entry["next_step"] = str(compute_next_step(entry)) + paper_values = build_paper_row(entry, generated_at) placeholders = ", ".join([f":{c}" for c in PAPER_COLUMNS]) cols = ", ".join(PAPER_COLUMNS) diff --git a/paperforge/memory/refresh.py b/paperforge/memory/refresh.py index f0e7da0..724b73a 100644 --- a/paperforge/memory/refresh.py +++ b/paperforge/memory/refresh.py @@ -1,13 +1,12 @@ from __future__ import annotations -import json from datetime import datetime, 
timezone from pathlib import Path +from paperforge.memory._columns import PAPER_COLUMNS, build_paper_row from paperforge.memory.builder import ( ALIAS_TYPES, ASSET_FIELDS, - PAPER_COLUMNS, _resolve_vault_path, ) from paperforge.memory.db import get_connection, get_memory_db_path @@ -37,33 +36,10 @@ def refresh_paper(vault: Path, entry: dict) -> bool: conn.execute("DROP TRIGGER IF EXISTS papers_ai") - lifecycle = str(compute_lifecycle(entry)) - maturity = compute_maturity(entry) - next_step = str(compute_next_step(entry)) - - paper_values = {} - for col in PAPER_COLUMNS: - if col == "authors_json": - paper_values[col] = json.dumps(entry.get("authors", []), ensure_ascii=False) - elif col == "collections_json": - paper_values[col] = json.dumps(entry.get("collections", []), ensure_ascii=False) - elif col == "lifecycle": - paper_values[col] = lifecycle - elif col == "maturity_level": - paper_values[col] = maturity.get("level", 1) - elif col == "maturity_name": - paper_values[col] = maturity.get("level_name", "") - elif col == "next_step": - paper_values[col] = next_step - elif col == "updated_at": - paper_values[col] = generated_at - elif col in ("do_ocr", "analyze"): - val = entry.get(col) - paper_values[col] = 1 if val else 0 - elif col == "has_pdf": - paper_values[col] = 1 if entry.get("has_pdf") else 0 - else: - paper_values[col] = entry.get(col, "") + entry["lifecycle"] = str(compute_lifecycle(entry)) + entry["maturity"] = compute_maturity(entry) + entry["next_step"] = str(compute_next_step(entry)) + paper_values = build_paper_row(entry, generated_at) placeholders = ", ".join([f":{c}" for c in PAPER_COLUMNS]) cols = ", ".join(PAPER_COLUMNS) From 7df1f38f6dee8c2a9fb024ed537af2269146caa4 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 17:08:11 +0800 Subject: [PATCH 066/132] perf: merge dashboard queries into single GROUP BY --- paperforge/commands/dashboard.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff 
--git a/paperforge/commands/dashboard.py b/paperforge/commands/dashboard.py index 6362d30..bbf8771 100644 --- a/paperforge/commands/dashboard.py +++ b/paperforge/commands/dashboard.py @@ -61,15 +61,21 @@ def _dashboard_from_db(vault: Path) -> dict | None: import sqlite3 conn = sqlite3.connect(str(db_path)) conn.row_factory = sqlite3.Row - # Paper count - total = conn.execute("SELECT COUNT(*) as cnt FROM papers").fetchone()["cnt"] - # PDF health - pdf_healthy = conn.execute("SELECT COUNT(*) as cnt FROM papers WHERE has_pdf = 1 AND (ocr_status != 'failed' OR ocr_status IS NULL)").fetchone()["cnt"] - pdf_missing = conn.execute("SELECT COUNT(*) as cnt FROM papers WHERE has_pdf = 0").fetchone()["cnt"] + # Aggregate stats via single GROUP BY + rows = conn.execute(""" + SELECT has_pdf, + CASE WHEN ocr_status='done' THEN 'done' + WHEN ocr_status IN ('failed','blocked') THEN 'failed' + ELSE 'pending' END as ocr, + COUNT(*) as cnt + FROM papers GROUP BY has_pdf, ocr + """).fetchall() + total = sum(r["cnt"] for r in rows) + pdf_healthy = sum(r["cnt"] for r in rows if r["has_pdf"] == 1 and r["ocr"] != "failed") + pdf_missing = sum(r["cnt"] for r in rows if r["has_pdf"] == 0) pdf_broken = total - pdf_healthy - pdf_missing - # OCR health - ocr_done = conn.execute("SELECT COUNT(*) as cnt FROM papers WHERE ocr_status = 'done'").fetchone()["cnt"] - ocr_failed = conn.execute("SELECT COUNT(*) as cnt FROM papers WHERE ocr_status IN ('failed','blocked')").fetchone()["cnt"] + ocr_done = sum(r["cnt"] for r in rows if r["ocr"] == "done") + ocr_failed = sum(r["cnt"] for r in rows if r["ocr"] == "failed") ocr_pending = total - ocr_done - ocr_failed # Domain counts rows = conn.execute("SELECT domain, COUNT(*) as cnt FROM papers GROUP BY domain").fetchall() From b8d3daf66ffdbf85635ccc5a9a883d19e8dc7509 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 17:17:08 +0800 Subject: [PATCH 067/132] perf: executemany batching in builder + file read consolidation --- 
paperforge/memory/builder.py | 62 ++++++++++++++------------------ paperforge/worker/asset_index.py | 44 ++++++++++++++--------- 2 files changed, 54 insertions(+), 52 deletions(-) diff --git a/paperforge/memory/builder.py b/paperforge/memory/builder.py index 86bade4..b359ea5 100644 --- a/paperforge/memory/builder.py +++ b/paperforge/memory/builder.py @@ -3,7 +3,6 @@ import hashlib import json import logging -import sqlite3 from datetime import datetime, timezone from pathlib import Path @@ -89,9 +88,13 @@ def build_from_index(vault: Path) -> dict: conn.execute("DROP TRIGGER IF EXISTS papers_ai") now_utc = datetime.now(timezone.utc).isoformat() - papers_count = 0 - assets_count = 0 - aliases_count = 0 + paper_rows: list[dict] = [] + asset_rows: list[tuple] = [] + alias_rows: list[tuple] = [] + + placeholders = ", ".join([f":{c}" for c in PAPER_COLUMNS]) + cols = ", ".join(PAPER_COLUMNS) + paper_sql = f"INSERT OR REPLACE INTO papers ({cols}) VALUES ({placeholders})" for entry in items: zotero_key = entry.get("zotero_key", "") @@ -101,15 +104,7 @@ def build_from_index(vault: Path) -> dict: entry["lifecycle"] = str(compute_lifecycle(entry)) entry["maturity"] = compute_maturity(entry) entry["next_step"] = str(compute_next_step(entry)) - paper_values = build_paper_row(entry, generated_at) - - placeholders = ", ".join([f":{c}" for c in PAPER_COLUMNS]) - cols = ", ".join(PAPER_COLUMNS) - conn.execute( - f"INSERT OR REPLACE INTO papers ({cols}) VALUES ({placeholders})", - paper_values, - ) - papers_count += 1 + paper_rows.append(build_paper_row(entry, generated_at)) for asset_type, entry_field in ASSET_FIELDS: path_val = entry.get(entry_field, "") @@ -126,31 +121,28 @@ def build_from_index(vault: Path) -> dict: except Exception: exists = 0 - conn.execute( - """INSERT OR REPLACE INTO paper_assets - (paper_id, asset_type, path, exists_on_disk) - VALUES (?, ?, ?, ?)""", - (zotero_key, asset_type, rel_path, exists), - ) - assets_count += 1 + asset_rows.append((zotero_key, 
asset_type, rel_path, exists)) for alias_type in ALIAS_TYPES: raw_val = entry.get(alias_type, "") if not raw_val: continue raw_str = str(raw_val) - conn.execute( - """INSERT OR REPLACE INTO paper_aliases - (paper_id, alias, alias_norm, alias_type) - VALUES (?, ?, ?, ?)""", - ( - zotero_key, - raw_str, - raw_str.lower().strip(), - alias_type, - ), - ) - aliases_count += 1 + alias_rows.append((zotero_key, raw_str, raw_str.lower().strip(), alias_type)) + + conn.executemany(paper_sql, paper_rows) + conn.executemany( + """INSERT OR REPLACE INTO paper_assets + (paper_id, asset_type, path, exists_on_disk) + VALUES (?, ?, ?, ?)""", + asset_rows, + ) + conn.executemany( + """INSERT OR REPLACE INTO paper_aliases + (paper_id, alias, alias_norm, alias_type) + VALUES (?, ?, ?, ?)""", + alias_rows, + ) conn.execute("""INSERT INTO paper_fts(rowid, zotero_key, citation_key, title, first_author, authors_json, abstract, journal, domain, collection_path, collections_json) SELECT rowid, zotero_key, citation_key, title, first_author, authors_json, abstract, journal, domain, collection_path, collections_json @@ -175,9 +167,9 @@ def build_from_index(vault: Path) -> dict: return { "db_path": str(db_path), - "papers_indexed": papers_count, - "assets_indexed": assets_count, - "aliases_indexed": aliases_count, + "papers_indexed": len(paper_rows), + "assets_indexed": len(asset_rows), + "aliases_indexed": len(alias_rows), "schema_version": str(CURRENT_SCHEMA_VERSION), } except Exception: diff --git a/paperforge/worker/asset_index.py b/paperforge/worker/asset_index.py index 1565bd7..de16ae6 100644 --- a/paperforge/worker/asset_index.py +++ b/paperforge/worker/asset_index.py @@ -299,7 +299,6 @@ def _build_entry(item: dict, vault: Path, paths: dict, domain: str, zotero_dir: except Exception: pass # alias will be injected on next full frontmatter_note pass break # only one old file per key - deep_reading_file = workspace_dir / "deep-reading.md" target_fulltext = workspace_dir / "fulltext.md" 
source_fulltext = paths["ocr"] / key / "fulltext.md" @@ -310,7 +309,6 @@ def _build_entry(item: dict, vault: Path, paths: dict, domain: str, zotero_dir: logger.info("Bridged fulltext.md to workspace for %s", key) fulltext_exists = target_fulltext.exists() - deep_reading_exists = deep_reading_file.exists() # ---- entry dict ------------------------------------------------------- authors = item.get("authors", []) @@ -320,12 +318,17 @@ def _build_entry(item: dict, vault: Path, paths: dict, domain: str, zotero_dir: legacy_flags = _legacy_control_flags(paths, key) legacy_do_ocr = legacy_flags.get("do_ocr") legacy_analyze = legacy_flags.get("analyze") - # Single frontmatter read for all control fields + # Single frontmatter read for all control fields — cache text for reuse fm = {} + fm_cached_text = "" + fm_was_main = False for fp in (main_note_path, note_path): if fp and fp.exists(): try: - fm = read_frontmatter_dict(fp.read_text(encoding="utf-8")) + note_text = fp.read_text(encoding="utf-8") + fm = read_frontmatter_dict(note_text) + fm_cached_text = note_text + fm_was_main = (fp == main_note_path) break except Exception: continue @@ -344,6 +347,19 @@ def _build_entry(item: dict, vault: Path, paths: dict, domain: str, zotero_dir: if analyze_value is None: analyze_value = meta.get("analyze") is True or meta.get("deep_reading_status") == "done" + # Compute deep reading status once, reusing cached text when possible. + # main_note_path is canonical — don't fall back to note_path when it exists. 
+ _dr_status = "pending" + if note_dr == "done": + _dr_status = "done" + elif fm_was_main: + _dr_status = "done" if has_deep_reading_content(fm_cached_text) else "pending" + elif main_note_path.exists(): + try: + _dr_status = "done" if has_deep_reading_content(main_note_path.read_text(encoding="utf-8")) else "pending" + except Exception: + pass + entry = { "zotero_key": key, "citation_key": item.get("citation_key", ""), @@ -371,23 +387,15 @@ def _build_entry(item: dict, vault: Path, paths: dict, domain: str, zotero_dir: "ocr_job_id": meta.get("ocr_job_id", ""), "ocr_md_path": obsidian_wikilink_for_path(vault, meta.get("markdown_path", "")), "ocr_json_path": meta.get("json_path", ""), - "deep_reading_status": ( - "done" - if note_dr == "done" - else "done" - if main_note_path.exists() and has_deep_reading_content(main_note_path.read_text(encoding="utf-8")) - else "done" - if note_path.exists() and has_deep_reading_content(note_path.read_text(encoding="utf-8")) - else "pending" - ), + "deep_reading_status": _dr_status, "note_path": str((main_note_path if main_note_path.exists() else note_path).relative_to(vault)).replace( "\\", "/" ), "deep_reading_md_path": ( str(main_note_path.relative_to(vault)).replace("\\", "/") - if main_note_path.exists() and has_deep_reading_content(main_note_path.read_text(encoding="utf-8")) + if _dr_status == "done" and main_note_path.exists() else str(note_path.relative_to(vault)).replace("\\", "/") - if note_path.exists() and has_deep_reading_content(note_path.read_text(encoding="utf-8")) + if _dr_status == "done" and note_path.exists() else "" ), # Workspace path fields are only advertised when the backing files/dirs exist. 
@@ -406,7 +414,7 @@ def _build_entry(item: dict, vault: Path, paths: dict, domain: str, zotero_dir: # Slug already frozen above — for existing notes, update frontmatter only (preserve body) if main_note_path.exists(): - text = main_note_path.read_text(encoding="utf-8") + text = fm_cached_text if fm_was_main else main_note_path.read_text(encoding="utf-8") fm_close = text.find("---\n", 4) # closing --- after opening --- if fm_close != -1: body = text[fm_close + 4 :] # everything after frontmatter @@ -420,7 +428,9 @@ def _build_entry(item: dict, vault: Path, paths: dict, domain: str, zotero_dir: else: main_note_path.write_text(frontmatter_note(entry, text), encoding="utf-8") else: - existing_text = note_path.read_text(encoding="utf-8") if note_path.exists() else "" + existing_text = fm_cached_text if not fm_was_main and fm_cached_text else ( + note_path.read_text(encoding="utf-8") if note_path.exists() else "" + ) main_note_path.write_text(frontmatter_note(entry, existing_text), encoding="utf-8") # Write per-workspace paper-meta.json (Phase 37: internal state outside frontmatter) From 8580f16598bc60e812631b9063738ed9cd8dad02 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 17:34:40 +0800 Subject: [PATCH 068/132] fix(plugin): replace async deps check with sync execSync, remove dead _resolvePythonAsync --- paperforge/plugin/main.js | 96 ++++++++++----------------------------- 1 file changed, 24 insertions(+), 72 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index e4ccc6c..30d0e3f 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2501,13 +2501,6 @@ class PaperForgeSettingTab extends PluginSettingTab { }); } - _execVectorDeps(pythonPath, callback) { - const { exec } = require('child_process'); - exec(`"${pythonPath}" -c "import chromadb; import sentence_transformers; print('ok')"`, { encoding: 'utf-8', timeout: 15000 }, (err, stdout) => { - callback(err ? 
false : (stdout.trim() === 'ok')); - }); - } - _execEmbedStatus(pythonPath, vp, callback) { const { exec } = require('child_process'); exec(`"${pythonPath}" -m paperforge --vault "${vp}" embed status --json`, { encoding: 'utf-8', timeout: 15000 }, (err, stdout) => { @@ -2528,54 +2521,6 @@ class PaperForgeSettingTab extends PluginSettingTab { el.createEl('span', { text: text, cls: 'paperforge-memory-text' }).style.cssText = 'flex:1;'; } - _resolvePythonAsync(callback) { - const { exec } = require('child_process'); - const vp = this.app.vault.adapter.basePath; - const settings = this.plugin.settings; - - // Fast path: manual or venv candidates (sync fs check only, no exec) - if (settings && settings.python_path && settings.python_path.trim()) { - const manualPath = settings.python_path.trim(); - if (fs.existsSync(manualPath)) { - callback({ path: manualPath, source: 'manual', extraArgs: [] }); - return; - } - } - const venvCandidates = [ - path.join(vp, '.paperforge-test-venv', 'Scripts', 'python.exe'), - path.join(vp, '.venv', 'Scripts', 'python.exe'), - path.join(vp, 'venv', 'Scripts', 'python.exe'), - ]; - for (const candidate of venvCandidates) { - try { - if (fs.existsSync(candidate)) { - callback({ path: candidate, source: 'auto-detected', extraArgs: [] }); - return; - } - } catch {} - } - // Slow path: test system candidates with async exec - const systemCandidates = [ - { path: 'python', extraArgs: [] }, - { path: 'python3', extraArgs: [] }, - ]; - const tryNext = (idx) => { - if (idx >= systemCandidates.length) { - callback({ path: 'python', source: 'auto-detected', extraArgs: [] }); - return; - } - const c = systemCandidates[idx]; - exec(`"${c.path}" --version`, { encoding: 'utf-8', timeout: 5000 }, (err, stdout) => { - if (!err && stdout && stdout.toLowerCase().includes('python')) { - callback({ path: c.path, source: 'auto-detected', extraArgs: c.extraArgs }); - } else { - tryNext(idx + 1); - } - }); - }; - tryNext(0); - } - _renderFeaturesTab(containerEl) 
{ // --- Section: Skills --- containerEl.createEl('h3', { text: 'Skills' }); @@ -2783,25 +2728,32 @@ class PaperForgeSettingTab extends PluginSettingTab { const depsEl = containerEl.createEl('div'); depsEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; depsEl.setText('Checking dependencies...'); - this._resolvePythonAsync(pyResult => { - const pythonPath = pyResult.path; - if (pythonPath) { - this._execVectorDeps(pythonPath, (ok) => { - this._vectorDepsOk = ok; - if (ok) { - depsEl.remove(); - this._renderVectorConfig(containerEl); - } else { - depsEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:#4a1515; border-radius:4px; color:#ff6b6b;'; - depsEl.setText('Dependencies not installed. Required: chromadb, sentence-transformers.'); - this._renderVectorInstall(containerEl); - } - }); - } else { + // Fast sync check: resolvePythonExecutable is sync, execSync for import check (fast, ~1s) + const pyResult = resolvePythonExecutable(vp, this.plugin.settings); + const pythonPath = pyResult.path; + if (pythonPath) { + try { + const { execSync } = require('child_process'); + const result = execSync(`"${pythonPath}" -c "import chromadb; import sentence_transformers; print('ok')"`, { encoding: 'utf-8', timeout: 5000 }); + const ok = result.trim() === 'ok'; + this._vectorDepsOk = ok; + if (ok) { + depsEl.remove(); + this._renderVectorConfig(containerEl); + } else { + depsEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:#4a1515; border-radius:4px; color:#ff6b6b;'; + depsEl.setText('Dependencies not installed. Required: chromadb, sentence-transformers.'); + this._renderVectorInstall(containerEl); + } + } catch(e) { depsEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:#4a1515; border-radius:4px; color:#ff6b6b;'; - depsEl.setText('No Python found. Check Installation tab.'); + depsEl.setText('Dependencies not installed. 
Required: chromadb, sentence-transformers.'); + this._renderVectorInstall(containerEl); } - }); + } else { + depsEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:#4a1515; border-radius:4px; color:#ff6b6b;'; + depsEl.setText('No Python found. Check Installation tab.'); + } } } } From 314dda53e51f0f91791716b2d51259893f157aaa Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 17:39:10 +0800 Subject: [PATCH 069/132] fix(plugin): clean up debug logs, deps check working with resolvePythonExecutable + async exec --- paperforge/plugin/main.js | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 30d0e3f..9ffa3e3 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2728,14 +2728,11 @@ class PaperForgeSettingTab extends PluginSettingTab { const depsEl = containerEl.createEl('div'); depsEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; depsEl.setText('Checking dependencies...'); - // Fast sync check: resolvePythonExecutable is sync, execSync for import check (fast, ~1s) const pyResult = resolvePythonExecutable(vp, this.plugin.settings); - const pythonPath = pyResult.path; - if (pythonPath) { - try { - const { execSync } = require('child_process'); - const result = execSync(`"${pythonPath}" -c "import chromadb; import sentence_transformers; print('ok')"`, { encoding: 'utf-8', timeout: 5000 }); - const ok = result.trim() === 'ok'; + if (pyResult.path) { + const { exec } = require('child_process'); + exec(`"${pyResult.path}" -c "import chromadb; import sentence_transformers; print('ok')"`, { encoding: 'utf-8', timeout: 10000 }, (err, stdout, stderr) => { + const ok = !err && (stdout || '').trim() === 'ok'; this._vectorDepsOk = ok; if (ok) { depsEl.remove(); @@ -2745,11 +2742,7 @@ class PaperForgeSettingTab extends PluginSettingTab { depsEl.setText('Dependencies not 
installed. Required: chromadb, sentence-transformers.'); this._renderVectorInstall(containerEl); } - } catch(e) { - depsEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:#4a1515; border-radius:4px; color:#ff6b6b;'; - depsEl.setText('Dependencies not installed. Required: chromadb, sentence-transformers.'); - this._renderVectorInstall(containerEl); - } + }); } else { depsEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:#4a1515; border-radius:4px; color:#ff6b6b;'; depsEl.setText('No Python found. Check Installation tab.'); From 6cb00c5c745f121b71de41a4a976e4bc5a5232e4 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 17:52:05 +0800 Subject: [PATCH 070/132] feat(plugin): add sync timestamp + spinner + manual sync button --- paperforge/plugin/main.js | 65 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 60 insertions(+), 5 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 9ffa3e3..ab25b0c 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2214,6 +2214,7 @@ class PaperForgeSettingTab extends PluginSettingTab { this.plugin = plugin; this._saveTimeout = null; this._pfConfig = null; // cached paperforge.json config + this._lastSyncTime = null; this._memoryStatusText = null; // null = not checked yet, string = cached result this._vectorDepsOk = null; // null = not checked, bool = cached this._embedStatusText = null; @@ -2516,9 +2517,55 @@ class PaperForgeSettingTab extends PluginSettingTab { }); } - _renderMemoryStatusText(el, text) { + _renderMemoryStatusText(el, text, extraInfo) { el.innerHTML = ''; el.createEl('span', { text: text, cls: 'paperforge-memory-text' }).style.cssText = 'flex:1;'; + + if (extraInfo === 'syncing') { + const syncEl = el.createEl('span', { text: 'Syncing...', cls: 'paperforge-sync-status' }); + syncEl.style.cssText = 'opacity:0.7; margin-right:8px;'; + } else if (extraInfo) { + const timeEl = el.createEl('span', { text: extraInfo, 
cls: 'paperforge-sync-status' }); + timeEl.style.cssText = 'opacity:0.7; margin-right:8px;'; + } + + const refreshBtn = el.createEl('button', { cls: 'paperforge-refresh-btn', text: '\u21BB' }); + refreshBtn.style.cssText = 'margin-left:auto; border:none; background:none; cursor:pointer; font-size:16px; padding:0 4px;'; + refreshBtn.title = 'Sync now'; + refreshBtn.onclick = () => { + this._memoryStatusText = null; + this._runManualSync(); + }; + } + + _getBuildCommand(settings) { + const vp = this.app.vault.adapter.basePath; + const pyResult = resolvePythonExecutable(vp, settings); + if (!pyResult.path) return null; + return `"${pyResult.path}" -m paperforge --vault "${vp}" sync`; + } + + _runManualSync() { + const vp = this.app.vault.adapter.basePath; + const pyResult = resolvePythonExecutable(vp, this.plugin.settings); + if (!pyResult.path) return; + + const statusRow = document.querySelector('.paperforge-memory-status'); + if (statusRow) { + this._renderMemoryStatusText(statusRow, 'Checking...', 'syncing'); + } + + this.plugin._autoSyncRunning = true; + const { exec } = require('child_process'); + exec(`"${pyResult.path}" -m paperforge --vault "${vp}" sync`, { timeout: 120000, encoding: 'utf-8' }, (err) => { + this.plugin._autoSyncRunning = false; + this._memoryStatusText = null; + if (!err) { + this._lastSyncTime = new Date().toLocaleTimeString(); + this.plugin._lastSyncTime = this._lastSyncTime; + } + this.display(); // re-render + }); } _renderFeaturesTab(containerEl) { @@ -2685,16 +2732,20 @@ class PaperForgeSettingTab extends PluginSettingTab { const vp = this.app.vault.adapter.basePath; const pyResult = resolvePythonExecutable(vp, this.plugin.settings); + if (this.plugin._lastSyncTime && !this._lastSyncTime) { + this._lastSyncTime = this.plugin._lastSyncTime; + } + if (this._memoryStatusText !== null) { - this._renderMemoryStatusText(statusRow, this._memoryStatusText); + this._renderMemoryStatusText(statusRow, this._memoryStatusText, this._lastSyncTime); } 
else if (pyResult.path) { - this._renderMemoryStatusText(statusRow, 'Checking...'); + this._renderMemoryStatusText(statusRow, 'Checking...', this._lastSyncTime); this._execMemoryStatus(pyResult.path, vp, (text) => { this._memoryStatusText = text; - this._renderMemoryStatusText(statusRow, text); + this._renderMemoryStatusText(statusRow, text, this._lastSyncTime); }); } else { - this._renderMemoryStatusText(statusRow, 'No Python found.'); + this._renderMemoryStatusText(statusRow, 'No Python found.', this._lastSyncTime); } // --- Vector Database (within Memory Layer) --- @@ -3769,6 +3820,7 @@ module.exports = class PaperForgePlugin extends Plugin { this._lastExportMtime = 0; this._lastOcrMtimes = {}; this._autoSyncRunning = false; + this._lastSyncTime = null; this._pollTimer = null; // Clean stale path fields from plugin data.json (migrated to paperforge.json) this.saveSettings(); @@ -3899,6 +3951,9 @@ module.exports = class PaperForgePlugin extends Plugin { exec(cmd, { timeout: 120000, encoding: 'utf-8' }, (err, stdout, stderr) => { this._autoSyncRunning = false; this._memoryStatusText = null; // force re-check next time + if (!err) { + this._lastSyncTime = new Date().toLocaleTimeString(); + } // Update last export mtime to avoid re-trigger during build try { const fs = require('fs'); From 076774dac60baec3952c069d9cf66b49d918a0aa Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 18:05:52 +0800 Subject: [PATCH 071/132] feat(plugin): add collapsible toggle for System/User skill groups --- paperforge/plugin/main.js | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index ab25b0c..b8a1bf8 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2701,19 +2701,38 @@ class PaperForgeSettingTab extends PluginSettingTab { }); }; - // System skills const skillsBox = containerEl.createEl('div'); skillsBox.style.cssText = 
'background:var(--background-secondary); border-radius:8px; padding:12px 12px 4px; margin:8px 0 16px;'; - if (systemSkills.length > 0) { - skillsBox.createEl('h4', { text: 'System Skills', cls: 'paperforge-skills-subheader' }); - systemSkills.forEach(s => renderSkillRow(s, true)); - } + + const renderCollapsibleSkills = (label, skills, isSystem) => { + if (skills.length === 0) return; + + // Header row with toggle arrow + const header = skillsBox.createEl('div', { cls: 'paperforge-skills-collapse-header' }); + header.style.cssText = 'display:flex; align-items:center; cursor:pointer; padding:4px 0; margin-bottom:4px;'; + const arrow = header.createEl('span', { text: '\u25B2', cls: 'paperforge-skills-arrow' }); + arrow.style.cssText = 'font-size:10px; margin-right:6px; transition:transform 0.2s;'; + header.createEl('h4', { text: `${label} (${skills.length})`, cls: 'paperforge-skills-subheader' }); + header.querySelector('h4').style.marginBottom = '0'; + + // Content wrapper + const content = skillsBox.createEl('div', { cls: 'paperforge-skills-collapse-content' }); + skills.forEach(s => renderSkillRow(s, isSystem)); + + // Toggle + let collapsed = false; + header.addEventListener('click', () => { + collapsed = !collapsed; + content.style.display = collapsed ? 'none' : ''; + arrow.textContent = collapsed ? 
'\u25B6' : '\u25B2'; + }); + }; + + // System skills + renderCollapsibleSkills('System Skills', systemSkills, true); // User skills - if (userSkills.length > 0) { - skillsBox.createEl('h4', { text: 'User Skills', cls: 'paperforge-skills-subheader' }); - userSkills.forEach(s => renderSkillRow(s, false)); - } + renderCollapsibleSkills('User Skills', userSkills, false); if (systemSkills.length === 0 && userSkills.length === 0) { skillsBox.createEl('p', { From bd34f7da38b8c31a535f4fe73d64c5c0a626612e Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 18:11:57 +0800 Subject: [PATCH 072/132] fix(plugin): skill collapse arrow direction + render into content div --- paperforge/plugin/main.js | 62 +++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 32 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index b8a1bf8..979d2d0 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2674,57 +2674,55 @@ class PaperForgeSettingTab extends PluginSettingTab { }); } - // Helper to render a skill row - const renderSkillRow = (skill, isSystem) => { - const nameText = skill.name + (skill.version ? ' v' + skill.version : ''); - const sourceLabel = isSystem ? ' [system]' : ' [user]'; - const descText = skill.desc || ''; - - const setting = new Setting(skillsBox) - .setName(nameText + sourceLabel) - .setDesc(descText); - setting.settingEl.style.opacity = skill.disabled ? '0.4' : '1'; - - setting.addToggle(toggle => { - toggle.setValue(!skill.disabled) - .onChange(value => { - const newDisabled = !value; - const disableMatch = skill.content.match(/^disable-model-invocation:\s*(.+)$/m); - const newContent = disableMatch - ? 
skill.content.replace(/^disable-model-invocation:\s*.+$/m, `disable-model-invocation: ${newDisabled}`) - : skill.content.replace(/^(---\r?\n)/, `$1disable-model-invocation: ${newDisabled}\n`); - fs.writeFileSync(skill.path, newContent, 'utf-8'); - skill.disabled = newDisabled; - skill.content = newContent; - setting.settingEl.style.opacity = skill.disabled ? '0.4' : '1'; - }); - }); - }; - const skillsBox = containerEl.createEl('div'); skillsBox.style.cssText = 'background:var(--background-secondary); border-radius:8px; padding:12px 12px 4px; margin:8px 0 16px;'; const renderCollapsibleSkills = (label, skills, isSystem) => { if (skills.length === 0) return; + // Content wrapper (created first, collapsed by default) + const content = skillsBox.createEl('div', { cls: 'paperforge-skills-collapse-content' }); + // Header row with toggle arrow const header = skillsBox.createEl('div', { cls: 'paperforge-skills-collapse-header' }); header.style.cssText = 'display:flex; align-items:center; cursor:pointer; padding:4px 0; margin-bottom:4px;'; - const arrow = header.createEl('span', { text: '\u25B2', cls: 'paperforge-skills-arrow' }); + const arrow = header.createEl('span', { text: '\u25BC', cls: 'paperforge-skills-arrow' }); arrow.style.cssText = 'font-size:10px; margin-right:6px; transition:transform 0.2s;'; header.createEl('h4', { text: `${label} (${skills.length})`, cls: 'paperforge-skills-subheader' }); header.querySelector('h4').style.marginBottom = '0'; - // Content wrapper - const content = skillsBox.createEl('div', { cls: 'paperforge-skills-collapse-content' }); - skills.forEach(s => renderSkillRow(s, isSystem)); + skills.forEach(s => { + const nameText = s.name + (s.version ? ' v' + s.version : ''); + const sourceLabel = isSystem ? ' [system]' : ' [user]'; + const descText = s.desc || ''; + + const setting = new Setting(content) + .setName(nameText + sourceLabel) + .setDesc(descText); + setting.settingEl.style.opacity = s.disabled ? 
'0.4' : '1'; + + setting.addToggle(toggle => { + toggle.setValue(!s.disabled) + .onChange(value => { + const newDisabled = !value; + const disableMatch = s.content.match(/^disable-model-invocation:\s*(.+)$/m); + const newContent = disableMatch + ? s.content.replace(/^disable-model-invocation:\s*.+$/m, `disable-model-invocation: ${newDisabled}`) + : s.content.replace(/^(---\r?\n)/, `$1disable-model-invocation: ${newDisabled}\n`); + fs.writeFileSync(s.path, newContent, 'utf-8'); + s.disabled = newDisabled; + s.content = newContent; + setting.settingEl.style.opacity = s.disabled ? '0.4' : '1'; + }); + }); + }); // Toggle let collapsed = false; header.addEventListener('click', () => { collapsed = !collapsed; content.style.display = collapsed ? 'none' : ''; - arrow.textContent = collapsed ? '\u25B6' : '\u25B2'; + arrow.textContent = collapsed ? '\u25B6' : '\u25BC'; }); }; From 108e2f7d9b863081eb990a73c8c33e457b444eec Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 18:14:31 +0800 Subject: [PATCH 073/132] fix(plugin): create header before content in skills collapsible --- paperforge/plugin/main.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 979d2d0..0cbad18 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2680,11 +2680,11 @@ class PaperForgeSettingTab extends PluginSettingTab { const renderCollapsibleSkills = (label, skills, isSystem) => { if (skills.length === 0) return; - // Content wrapper (created first, collapsed by default) - const content = skillsBox.createEl('div', { cls: 'paperforge-skills-collapse-content' }); - - // Header row with toggle arrow + // Header row with toggle arrow (created first so it appears above content) const header = skillsBox.createEl('div', { cls: 'paperforge-skills-collapse-header' }); + + // Content wrapper + const content = skillsBox.createEl('div', { cls: 'paperforge-skills-collapse-content' }); 
header.style.cssText = 'display:flex; align-items:center; cursor:pointer; padding:4px 0; margin-bottom:4px;'; const arrow = header.createEl('span', { text: '\u25BC', cls: 'paperforge-skills-arrow' }); arrow.style.cssText = 'font-size:10px; margin-right:6px; transition:transform 0.2s;'; From 3b807705682372fc399c6385eaae8e4a14cd38ed Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 18:18:49 +0800 Subject: [PATCH 074/132] fix(plugin): always-on scrollbar + stable collapse header positions --- paperforge/plugin/main.js | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 0cbad18..f201200 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2241,6 +2241,10 @@ class PaperForgeSettingTab extends PluginSettingTab { .paperforge-settings-tab--active { color: var(--text-accent); border-bottom-color: var(--text-accent); } .paperforge-tab-content { display: none; } .paperforge-tab-content--active { display: block; } + .paperforge-skills-collapse-header { display: flex !important; align-items: center; cursor: pointer; padding: 4px 0 !important; margin: 0 !important; } + .paperforge-skills-collapse-header h4 { margin: 0 !important; } + .paperforge-skills-collapse-content { margin: 0 !important; padding: 0 !important; } + .vertical-tab-content-container { overflow-y: scroll !important; } `; document.head.appendChild(style); } @@ -2685,11 +2689,9 @@ class PaperForgeSettingTab extends PluginSettingTab { // Content wrapper const content = skillsBox.createEl('div', { cls: 'paperforge-skills-collapse-content' }); - header.style.cssText = 'display:flex; align-items:center; cursor:pointer; padding:4px 0; margin-bottom:4px;'; const arrow = header.createEl('span', { text: '\u25BC', cls: 'paperforge-skills-arrow' }); arrow.style.cssText = 'font-size:10px; margin-right:6px; transition:transform 0.2s;'; header.createEl('h4', { text: `${label} (${skills.length})`, cls: 
'paperforge-skills-subheader' }); - header.querySelector('h4').style.marginBottom = '0'; skills.forEach(s => { const nameText = s.name + (s.version ? ' v' + s.version : ''); From 6983afd4848836065b368c73fc2d4e8dd4c8e9b3 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 18:20:18 +0800 Subject: [PATCH 075/132] fix(plugin): use CSS rotate for arrow instead of different unicode chars --- paperforge/plugin/main.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index f201200..50ad947 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2690,7 +2690,7 @@ class PaperForgeSettingTab extends PluginSettingTab { // Content wrapper const content = skillsBox.createEl('div', { cls: 'paperforge-skills-collapse-content' }); const arrow = header.createEl('span', { text: '\u25BC', cls: 'paperforge-skills-arrow' }); - arrow.style.cssText = 'font-size:10px; margin-right:6px; transition:transform 0.2s;'; + arrow.style.cssText = 'display:inline-block; font-size:10px; margin-right:6px; transition:transform 0.2s; transform:rotate(0deg);'; header.createEl('h4', { text: `${label} (${skills.length})`, cls: 'paperforge-skills-subheader' }); skills.forEach(s => { @@ -2724,7 +2724,7 @@ class PaperForgeSettingTab extends PluginSettingTab { header.addEventListener('click', () => { collapsed = !collapsed; content.style.display = collapsed ? 'none' : ''; - arrow.textContent = collapsed ? '\u25B6' : '\u25BC'; + arrow.style.transform = collapsed ? 
'rotate(-90deg)' : 'rotate(0deg)'; }); }; From 7b31d21a073a1d04d7fcdea4ae06cc8b7197e645 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 18:40:01 +0800 Subject: [PATCH 076/132] feat(plugin): rewrite Vector DB section with state machine --- paperforge/plugin/main.js | 223 ++++++++++++++++++++++++++++---------- 1 file changed, 164 insertions(+), 59 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 50ad947..dda500a 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -570,6 +570,7 @@ const DEFAULT_SETTINGS = { vector_db_mode: 'local', vector_db_model: 'BAAI/bge-small-en-v1.5', vector_db_api_key: '', + vector_db_last_model: '', frozen_skills: {}, }; @@ -2767,6 +2768,10 @@ class PaperForgeSettingTab extends PluginSettingTab { this._renderMemoryStatusText(statusRow, 'No Python found.', this._lastSyncTime); } + this._renderVectorSection(containerEl); + } + + _renderVectorSection(containerEl) { // --- Vector Database (within Memory Layer) --- containerEl.createEl('h4', { text: 'Vector Database' }); @@ -2784,58 +2789,65 @@ class PaperForgeSettingTab extends PluginSettingTab { }); }); - if (this.plugin.settings.features.vector_db) { - const vp = this.app.vault.adapter.basePath; + if (!this.plugin.settings.features.vector_db) return; - if (this._vectorDepsOk === true) { - this._renderVectorConfig(containerEl); - } else if (this._vectorDepsOk === false) { - const depsEl = containerEl.createEl('div'); - depsEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:#4a1515; border-radius:4px; color:#ff6b6b;'; - depsEl.setText('Dependencies not installed. 
Required: chromadb, sentence-transformers.'); - this._renderVectorInstall(containerEl); - } else { - const depsEl = containerEl.createEl('div'); - depsEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; - depsEl.setText('Checking dependencies...'); - const pyResult = resolvePythonExecutable(vp, this.plugin.settings); - if (pyResult.path) { - const { exec } = require('child_process'); - exec(`"${pyResult.path}" -c "import chromadb; import sentence_transformers; print('ok')"`, { encoding: 'utf-8', timeout: 10000 }, (err, stdout, stderr) => { - const ok = !err && (stdout || '').trim() === 'ok'; - this._vectorDepsOk = ok; - if (ok) { - depsEl.remove(); - this._renderVectorConfig(containerEl); - } else { - depsEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:#4a1515; border-radius:4px; color:#ff6b6b;'; - depsEl.setText('Dependencies not installed. Required: chromadb, sentence-transformers.'); - this._renderVectorInstall(containerEl); - } + const vp = this.app.vault.adapter.basePath; + + // === Resolve state === + if (this._vectorDepsOk === true && this._embedStatusText !== null) { + this._renderVectorReady(containerEl, vp); + return; + } + if (this._vectorDepsOk === false) { + this._renderVectorNoDeps(containerEl); + return; + } + // First check — deps unknown, run async + if (this._vectorDepsOk === null) { + const statusBox = containerEl.createEl('div'); + statusBox.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; + statusBox.setText('Checking dependencies...'); + + const pyResult = resolvePythonExecutable(vp, this.plugin.settings); + if (!pyResult.path) { + statusBox.setText('No Python found. 
Check Installation tab.'); + this._vectorDepsOk = false; + return; + } + const { exec } = require('child_process'); + exec(`"${pyResult.path}" -c "import chromadb; import sentence_transformers; print('ok')"`, { + encoding: 'utf-8', timeout: 15000 + }, (err, stdout) => { + const ok = !err && (stdout || '').trim() === 'ok'; + this._vectorDepsOk = ok; + if (ok) { + // Deps OK — now check embed status + this._execEmbedStatus(pyResult.path, vp, (statusText) => { + this._embedStatusText = statusText; + this.display(); }); } else { - depsEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:#4a1515; border-radius:4px; color:#ff6b6b;'; - depsEl.setText('No Python found. Check Installation tab.'); + this.display(); } - } + }); } } - _renderVectorInstall(containerEl) { + _renderVectorNoDeps(containerEl) { + const box = containerEl.createEl('div'); + box.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; + box.setText('Dependencies not installed. Required: chromadb, sentence-transformers.'); + new Setting(containerEl) .setName('Install Dependencies') - .setDesc('Installs chromadb and sentence-transformers (~500MB disk)') + .setDesc('pip install chromadb sentence-transformers (~500MB)') .addButton(button => { button.setButtonText('Install') .setCta() .onClick(async () => { const vp = this.app.vault.adapter.basePath; const pyResult = resolvePythonExecutable(vp, this.plugin.settings); - const pythonPath = pyResult.path; - if (!pythonPath) { - new Notice('No Python found. 
Install tab.'); - return; - } + if (!pyResult.path) { new Notice('No Python found.'); return; } button.setButtonText('Installing...'); button.setDisabled(true); const notice = new Notice('Installing chromadb + sentence-transformers...', 0); @@ -2843,17 +2855,19 @@ class PaperForgeSettingTab extends PluginSettingTab { const { exec } = require('child_process'); const env = Object.assign({}, process.env, { PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1' }); await new Promise((resolve, reject) => { - exec(`"${pythonPath}" -m pip install chromadb sentence-transformers`, { + exec(`"${pyResult.path}" -m pip install chromadb sentence-transformers`, { encoding: 'utf-8', timeout: 300000, env: env, - }, (error, stdout, stderr) => { - if (error) reject(error); - else resolve(stdout); - }); + }, (error) => { error ? reject(error) : resolve(); }); }); notice.hide(); - new Notice('Done. Run paperforge embed build.'); + new Notice('Dependencies installed. Building vectors...'); + // Auto-build after install + this._vectorDepsOk = true; + this._execEmbedStatus(pyResult.path, vp, (text) => { + this._embedStatusText = text; + }); this.display(); - } catch(e) { + } catch (e) { notice.hide(); new Notice('Install failed: ' + (e.stderr || e.message || e)); button.setButtonText('Retry'); @@ -2863,25 +2877,25 @@ class PaperForgeSettingTab extends PluginSettingTab { }); } - _renderVectorConfig(containerEl) { + _renderVectorReady(containerEl, vp) { + // Status line const statusEl = containerEl.createEl('div'); statusEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; - const vp = this.app.vault.adapter.basePath; - const pyResult = resolvePythonExecutable(vp, this.plugin.settings); - const pythonPath = pyResult.path; - - if (this._embedStatusText !== null) { - statusEl.setText(this._embedStatusText); - } else if (pythonPath) { - statusEl.setText('Loading...'); - this._execEmbedStatus(pythonPath, vp, (text) => { - this._embedStatusText = 
text; - statusEl.setText(text); - }); - } else { - statusEl.setText('No Python found.'); + statusEl.setText(this._embedStatusText || 'Loading...'); + + // Detect model mismatch + const embedInfo = this._embedStatusText ? this._parseEmbedStatus(this._embedStatusText) : null; + const currentModel = this._getCurrentModelKey(); + const lastModel = this.plugin.settings.vector_db_last_model || ''; + const modelChanged = embedInfo && embedInfo.db_exists && lastModel && lastModel !== currentModel; + + if (modelChanged) { + const warnEl = containerEl.createEl('div'); + warnEl.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-modifier-warning); border-radius:4px;'; + warnEl.setText(`Model changed (${lastModel} -> ${currentModel}). Existing vectors are incompatible — rebuild required.`); } + // Mode selector new Setting(containerEl) .setName('Embedding Mode') .addDropdown(dropdown => { @@ -2895,6 +2909,7 @@ class PaperForgeSettingTab extends PluginSettingTab { }); }); + // Model selector (local mode) if (this.plugin.settings.vector_db_mode === 'local') { new Setting(containerEl) .setName('Model') @@ -2906,13 +2921,16 @@ class PaperForgeSettingTab extends PluginSettingTab { .onChange(value => { this.plugin.settings.vector_db_model = value; this.plugin.saveSettings(); + this.display(); }); }); } + // API key (api mode) if (this.plugin.settings.vector_db_mode === 'api') { new Setting(containerEl) .setName('OpenAI API Key') + .setDesc('Used for text-embedding-3-small (1536d)') .addText(text => { text.setPlaceholder('sk-...') .setValue(this.plugin.settings.vector_db_api_key || '') @@ -2920,8 +2938,95 @@ class PaperForgeSettingTab extends PluginSettingTab { this.plugin.settings.vector_db_api_key = value; this.plugin.saveSettings(); }); + }) + .addButton(button => { + button.setButtonText('Verify') + .onClick(async () => { + const key = this.plugin.settings.vector_db_api_key; + if (!key || !key.startsWith('sk-')) { + new Notice('Enter a valid OpenAI API 
key.'); + return; + } + button.setButtonText('Checking...'); + button.setDisabled(true); + try { + const resp = await (0, eval)('import')('node:https').then(https => new Promise((resolve, reject) => { + const req = https.request('https://api.openai.com/v1/models', { + method: 'GET', headers: { Authorization: 'Bearer ' + key }, timeout: 10000 + }, (res) => { let d=''; res.on('data',c=>d+=c); res.on('end',()=>resolve({ok:res.statusCode===200,data:d})); }); + req.on('error', reject); req.end(); + })); + if (resp.ok) { new Notice('API key valid.'); } + else { new Notice('API key rejected.'); } + } catch (e) { + new Notice('Network error: ' + e.message); + } + button.setButtonText('Verify'); + button.setDisabled(false); + }); }); } + + // Rebuild button + new Setting(containerEl) + .setName('Rebuild Vectors') + .setDesc(modelChanged ? 'Model changed — rebuild to update all vectors.' : 'Rebuild all OCR fulltext vectors. Required after model or mode change.') + .addButton(button => { + const label = embedInfo && embedInfo.db_exists ? 
'Rebuild' : 'Build'; + button.setButtonText(label) + .setCta() + .onClick(async () => { + const pyResult = resolvePythonExecutable(vp, this.plugin.settings); + if (!pyResult.path) { new Notice('No Python found.'); return; } + button.setButtonText('Building...'); + button.setDisabled(true); + const notice = new Notice('Building vector index...', 0); + try { + const { exec } = require('child_process'); + const env = Object.assign({}, process.env, { PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1' }); + await new Promise((resolve, reject) => { + const cmd = `"${pyResult.path}" -m paperforge --vault "${vp}" embed build --force`; + exec(cmd, { encoding: 'utf-8', timeout: 600000, env: env }, (error, stdout) => { + if (error) reject(error); + else resolve(stdout); + }); + }); + // Remember model used for this build + this.plugin.settings.vector_db_last_model = currentModel; + this.plugin.saveSettings(); + // Refresh status + this._embedStatusText = null; + this._execEmbedStatus(pyResult.path, vp, (text) => { this._embedStatusText = text; }); + notice.hide(); + new Notice('Vector build complete.'); + this.display(); + } catch (e) { + notice.hide(); + new Notice('Build failed: ' + (e.stderr || e.message || e)); + button.setButtonText(label); + button.setDisabled(false); + } + }); + }); + } + + _getCurrentModelKey() { + if (this.plugin.settings.vector_db_mode === 'api') return 'openai/text-embedding-3-small'; + return this.plugin.settings.vector_db_model || 'BAAI/bge-small-en-v1.5'; + } + + _parseEmbedStatus(text) { + // Parse " key: value" lines from paperforge embed status output + const info = {}; + if (!text) return info; + text.split('\n').forEach(line => { + const m = line.match(/^\s*([^:]+):\s*(.*)/); + if (m) info[m[1].trim()] = m[2].trim(); + }); + // Normalize bools + if (info.db_exists !== undefined) info.db_exists = info.db_exists === 'True'; + if (info.chunk_count !== undefined) info.chunk_count = parseInt(info.chunk_count, 10) || 0; + return info; } 
_getPythonDesc(pyPath, source) { From df456fb0ec4b79b8725a25259a64953002c4cf8e Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 18:52:26 +0800 Subject: [PATCH 077/132] feat(plugin): live terminal output + model descriptions + uninstall model button --- paperforge/plugin/main.js | 70 +++++++++++++++++++++++++++++++-------- 1 file changed, 56 insertions(+), 14 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index dda500a..c18ec35 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2911,8 +2911,14 @@ class PaperForgeSettingTab extends PluginSettingTab { // Model selector (local mode) if (this.plugin.settings.vector_db_mode === 'local') { + const modelDesc = { + 'BAAI/bge-small-en-v1.5': 'Best balance — fast, accurate, recommended for most users (384d, 130MB)', + 'sentence-transformers/all-MiniLM-L6-v2': 'Lightest & fastest — lower accuracy, minimal disk (384d, 80MB)', + 'BAAI/bge-base-en-v1.5': 'Highest accuracy — slower, large disk footprint (768d, 440MB)', + }; new Setting(containerEl) .setName('Model') + .setDesc(modelDesc[this.plugin.settings.vector_db_model] || '') .addDropdown(dropdown => { dropdown.addOption('BAAI/bge-small-en-v1.5', 'bge-small (384d, 130MB)'); dropdown.addOption('sentence-transformers/all-MiniLM-L6-v2', 'MiniLM (384d, 80MB)'); @@ -2923,6 +2929,31 @@ class PaperForgeSettingTab extends PluginSettingTab { this.plugin.saveSettings(); this.display(); }); + }) + .addButton(button => { + button.setButtonText('Uninstall') + .setWarning() + .onClick(async () => { + const model = this.plugin.settings.vector_db_model; + const cacheName = 'models--' + model.replace('/', '--'); + button.setButtonText('Removing...'); + button.setDisabled(true); + try { + const pyResult = resolvePythonExecutable(vp, this.plugin.settings); + const { exec } = require('child_process'); + const env = Object.assign({}, process.env, { PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1' }); + await new 
Promise((resolve, reject) => { + exec(`"${pyResult.path}" -c "import shutil, os,sys; p=os.path.join(os.path.expanduser('~/.cache/huggingface/hub'), '${cacheName}'); shutil.rmtree(p,ignore_errors=True); print('Removed' if os.path.exists(p) else 'Not cached')"`, { + encoding: 'utf-8', timeout: 30000, env: env + }, (error, stdout) => error ? reject(error) : resolve(stdout)); + }); + new Notice('Model cache removed.'); + } catch (e) { + new Notice('Failed: ' + (e.stderr || e.message || e)); + } + button.setButtonText('Uninstall'); + button.setDisabled(false); + }); }); } @@ -2967,7 +2998,10 @@ class PaperForgeSettingTab extends PluginSettingTab { }); } - // Rebuild button + // Rebuild button with live terminal output + const terminalEl = containerEl.createEl('pre'); + terminalEl.style.cssText = 'display:none; background:#1e1e1e; color:#d4d4d4; padding:10px; border-radius:4px; max-height:300px; overflow-y:auto; font-size:11px; font-family:var(--font-monospace); margin:8px 0; white-space:pre-wrap; word-break:break-all;'; + new Setting(containerEl) .setName('Rebuild Vectors') .setDesc(modelChanged ? 'Model changed — rebuild to update all vectors.' : 'Rebuild all OCR fulltext vectors. 
Required after model or mode change.') @@ -2980,29 +3014,37 @@ class PaperForgeSettingTab extends PluginSettingTab { if (!pyResult.path) { new Notice('No Python found.'); return; } button.setButtonText('Building...'); button.setDisabled(true); - const notice = new Notice('Building vector index...', 0); + terminalEl.style.display = 'block'; + terminalEl.setText(''); + + const { spawn } = require('child_process'); + const env = Object.assign({}, process.env, { PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1' }); + const child = spawn(pyResult.path, ['-m', 'paperforge', '--vault', vp, 'embed', 'build', '--force'], { + env: env, stdio: ['ignore', 'pipe', 'pipe'] + }); + + const append = (text) => { + terminalEl.setText((terminalEl.getText() || '') + text); + terminalEl.scrollTop = terminalEl.scrollHeight; + }; + + child.stdout.on('data', (data) => append(data.toString())); + child.stderr.on('data', (data) => append(data.toString())); + try { - const { exec } = require('child_process'); - const env = Object.assign({}, process.env, { PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1' }); await new Promise((resolve, reject) => { - const cmd = `"${pyResult.path}" -m paperforge --vault "${vp}" embed build --force`; - exec(cmd, { encoding: 'utf-8', timeout: 600000, env: env }, (error, stdout) => { - if (error) reject(error); - else resolve(stdout); - }); + child.on('close', (code) => code === 0 ? 
resolve() : reject(new Error('Exit code ' + code))); + child.on('error', reject); }); - // Remember model used for this build this.plugin.settings.vector_db_last_model = currentModel; this.plugin.saveSettings(); - // Refresh status this._embedStatusText = null; this._execEmbedStatus(pyResult.path, vp, (text) => { this._embedStatusText = text; }); - notice.hide(); new Notice('Vector build complete.'); this.display(); } catch (e) { - notice.hide(); - new Notice('Build failed: ' + (e.stderr || e.message || e)); + append('\n--- BUILD FAILED ---\n' + (e.stderr || e.message || e)); + new Notice('Build failed. See terminal output.'); button.setButtonText(label); button.setDisabled(false); } From 886049dde3b121df73cc9c9de452eed7d5c2ce95 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 18:58:00 +0800 Subject: [PATCH 078/132] feat: vector auto-embed on OCR complete + Uninstall button cache detection --- paperforge/plugin/main.js | 55 +++++++++++++++++++------------- paperforge/worker/asset_index.py | 34 ++++++++++++++++++++ 2 files changed, 66 insertions(+), 23 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index c18ec35..778dcb7 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2931,29 +2931,38 @@ class PaperForgeSettingTab extends PluginSettingTab { }); }) .addButton(button => { - button.setButtonText('Uninstall') - .setWarning() - .onClick(async () => { - const model = this.plugin.settings.vector_db_model; - const cacheName = 'models--' + model.replace('/', '--'); - button.setButtonText('Removing...'); - button.setDisabled(true); - try { - const pyResult = resolvePythonExecutable(vp, this.plugin.settings); - const { exec } = require('child_process'); - const env = Object.assign({}, process.env, { PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1' }); - await new Promise((resolve, reject) => { - exec(`"${pyResult.path}" -c "import shutil, os,sys; 
p=os.path.join(os.path.expanduser('~/.cache/huggingface/hub'), '${cacheName}'); shutil.rmtree(p,ignore_errors=True); print('Removed' if os.path.exists(p) else 'Not cached')"`, { - encoding: 'utf-8', timeout: 30000, env: env - }, (error, stdout) => error ? reject(error) : resolve(stdout)); - }); - new Notice('Model cache removed.'); - } catch (e) { - new Notice('Failed: ' + (e.stderr || e.message || e)); - } - button.setButtonText('Uninstall'); - button.setDisabled(false); - }); + const model = this.plugin.settings.vector_db_model; + const cacheName = 'models--' + model.replace('/', '--'); + const fs = require('fs'); + const os = require('os'); + const cachePath = os.homedir() + '/.cache/huggingface/hub/' + cacheName; + const isCached = fs.existsSync(cachePath); + + if (isCached) { + button.setButtonText('Uninstall').setWarning(); + } else { + button.setButtonText('Not cached'); + button.setDisabled(true); + } + button.onClick(async () => { + if (!isCached) return; + button.setButtonText('Removing...'); + button.setDisabled(true); + try { + const pyResult = resolvePythonExecutable(vp, this.plugin.settings); + const { exec } = require('child_process'); + const env = Object.assign({}, process.env, { PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1' }); + await new Promise((resolve, reject) => { + exec(`"${pyResult.path}" -c "import shutil, os; p=os.path.join(os.path.expanduser('~/.cache/huggingface/hub'), '${cacheName}'); shutil.rmtree(p,ignore_errors=True); print('done')"`, { + encoding: 'utf-8', timeout: 30000, env: env + }, (error) => error ? 
reject(error) : resolve()); + }); + new Notice('Model cache removed.'); + } catch (e) { + new Notice('Failed: ' + (e.stderr || e.message || e)); + } + this.display(); + }); }); } diff --git a/paperforge/worker/asset_index.py b/paperforge/worker/asset_index.py index de16ae6..1c7b9a1 100644 --- a/paperforge/worker/asset_index.py +++ b/paperforge/worker/asset_index.py @@ -436,9 +436,43 @@ def _build_entry(item: dict, vault: Path, paths: dict, domain: str, zotero_dir: # Write per-workspace paper-meta.json (Phase 37: internal state outside frontmatter) write_paper_meta(workspace_dir, entry, paperforge_version=PAPERFORGE_VERSION) + # Auto-embed vectors if this paper just completed OCR + _vec_auto_embed_if_new(vault, entry) + return entry +def _vec_auto_embed_if_new(vault: Path, entry: dict) -> None: + """Auto-embed a paper into vector DB if OCR is done and vectors missing.""" + if entry.get("ocr_status") != "done": + return + fulltext_rel = entry.get("fulltext_path", "") + if not fulltext_rel: + return + fulltext_path = vault / fulltext_rel + if not fulltext_path.exists(): + return + # Check if vector DB is enabled and set up + try: + from paperforge.memory.vector_db import ( + _read_plugin_settings, + chunk_fulltext, + embed_paper, + get_vector_db_path, + ) + settings = _read_plugin_settings(vault) + if not settings.get("features", {}).get("vector_db", False): + return + db_path = get_vector_db_path(vault) + if not db_path.exists(): + return + chunks = chunk_fulltext(fulltext_path) + if not chunks: + return + embed_paper(vault, entry["zotero_key"], chunks) + except Exception: + pass # ChromaDB / model not installed — silently skip + # --------------------------------------------------------------------------- # Full index build # --------------------------------------------------------------------------- From 3c5505316baa6a35c162dc0acf3a0c1a35a956d5 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 19:00:54 +0800 Subject: [PATCH 079/132] 
fix(plugin): preserve skills collapse state across tab/model re-renders --- paperforge/plugin/main.js | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 778dcb7..c460e4b 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2219,6 +2219,7 @@ class PaperForgeSettingTab extends PluginSettingTab { this._memoryStatusText = null; // null = not checked yet, string = cached result this._vectorDepsOk = null; // null = not checked, bool = cached this._embedStatusText = null; + this._skillsCollapsed = {}; // preserve collapse state across re-renders this.activeTab = 'setup'; } @@ -2720,12 +2721,24 @@ class PaperForgeSettingTab extends PluginSettingTab { }); }); - // Toggle - let collapsed = false; + // Toggle with state preservation + const stateKey = isSystem ? 'system' : 'user'; + const collapsed = this._skillsCollapsed[stateKey] || false; + if (collapsed) { + content.style.display = 'none'; + arrow.style.transform = 'rotate(-90deg)'; + } + header.addEventListener('click', () => { - collapsed = !collapsed; - content.style.display = collapsed ? 'none' : ''; - arrow.style.transform = collapsed ? 
'rotate(-90deg)' : 'rotate(0deg)'; + const nowCollapsed = content.style.display !== 'none'; + if (nowCollapsed) { + content.style.display = 'none'; + arrow.style.transform = 'rotate(-90deg)'; + } else { + content.style.display = ''; + arrow.style.transform = 'rotate(0deg)'; + } + this._skillsCollapsed[stateKey] = content.style.display === 'none'; }); }; From 1d3b8723e2c4a53d457b9332f5b11dd64390e03d Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 19:03:11 +0800 Subject: [PATCH 080/132] fix(plugin): improve skills spacing - group wrappers with even padding --- paperforge/plugin/main.js | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index c460e4b..44aa133 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2243,9 +2243,11 @@ class PaperForgeSettingTab extends PluginSettingTab { .paperforge-settings-tab--active { color: var(--text-accent); border-bottom-color: var(--text-accent); } .paperforge-tab-content { display: none; } .paperforge-tab-content--active { display: block; } - .paperforge-skills-collapse-header { display: flex !important; align-items: center; cursor: pointer; padding: 4px 0 !important; margin: 0 !important; } + .paperforge-skills-collapse-header { display: flex !important; align-items: center; cursor: pointer; padding: 6px 0 !important; margin: 0 !important; } .paperforge-skills-collapse-header h4 { margin: 0 !important; } .paperforge-skills-collapse-content { margin: 0 !important; padding: 0 !important; } + .paperforge-skills-group { margin-bottom: 10px; } + .paperforge-skills-group:last-child { margin-bottom: 0; } .vertical-tab-content-container { overflow-y: scroll !important; } `; document.head.appendChild(style); @@ -2681,16 +2683,19 @@ class PaperForgeSettingTab extends PluginSettingTab { } const skillsBox = containerEl.createEl('div'); - skillsBox.style.cssText = 'background:var(--background-secondary); 
border-radius:8px; padding:12px 12px 4px; margin:8px 0 16px;'; + skillsBox.style.cssText = 'background:var(--background-secondary); border-radius:8px; padding:12px 12px 10px; margin:8px 0 16px;'; const renderCollapsibleSkills = (label, skills, isSystem) => { if (skills.length === 0) return; + // Group wrapper for spacing between groups + const group = skillsBox.createEl('div', { cls: 'paperforge-skills-group' }); + // Header row with toggle arrow (created first so it appears above content) - const header = skillsBox.createEl('div', { cls: 'paperforge-skills-collapse-header' }); + const header = group.createEl('div', { cls: 'paperforge-skills-collapse-header' }); // Content wrapper - const content = skillsBox.createEl('div', { cls: 'paperforge-skills-collapse-content' }); + const content = group.createEl('div', { cls: 'paperforge-skills-collapse-content' }); const arrow = header.createEl('span', { text: '\u25BC', cls: 'paperforge-skills-arrow' }); arrow.style.cssText = 'display:inline-block; font-size:10px; margin-right:6px; transition:transform 0.2s; transform:rotate(0deg);'; header.createEl('h4', { text: `${label} (${skills.length})`, cls: 'paperforge-skills-subheader' }); From a70480718ceff779fe5fe5b9d13512f5e0c4bb4f Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 19:12:52 +0800 Subject: [PATCH 081/132] fix: lighter terminal output box + cache embedding model across papers --- paperforge/memory/vector_db.py | 14 ++++++++++++-- paperforge/plugin/main.js | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/paperforge/memory/vector_db.py b/paperforge/memory/vector_db.py index 58221aa..156f622 100644 --- a/paperforge/memory/vector_db.py +++ b/paperforge/memory/vector_db.py @@ -60,8 +60,13 @@ def get_collection(vault: Path): ) +_cached_model = None +_cached_model_name = None + + def get_embedding_model(vault: Path): - """Load the embedding model based on plugin settings or default.""" + """Load the embedding model based on 
plugin settings or default. Cached after first load.""" + global _cached_model, _cached_model_name settings = _read_plugin_settings(vault) mode = settings.get("vector_db_mode", "local") @@ -69,9 +74,14 @@ def get_embedding_model(vault: Path): return None # API mode — embedding done externally model_name = settings.get("vector_db_model", "BAAI/bge-small-en-v1.5") + if _cached_model is not None and _cached_model_name == model_name: + return _cached_model + ST = _get_st() logger.info("Loading embedding model: %s", model_name) - return ST(model_name) + _cached_model = ST(model_name) + _cached_model_name = model_name + return _cached_model def embed_paper(vault: Path, zotero_key: str, chunks: list[dict]) -> int: diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 44aa133..d7fac1d 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -3027,7 +3027,7 @@ class PaperForgeSettingTab extends PluginSettingTab { // Rebuild button with live terminal output const terminalEl = containerEl.createEl('pre'); - terminalEl.style.cssText = 'display:none; background:#1e1e1e; color:#d4d4d4; padding:10px; border-radius:4px; max-height:300px; overflow-y:auto; font-size:11px; font-family:var(--font-monospace); margin:8px 0; white-space:pre-wrap; word-break:break-all;'; + terminalEl.style.cssText = 'display:none; background:var(--background-primary); padding:10px; border-radius:4px; border:1px solid var(--background-modifier-border); max-height:250px; overflow-y:auto; font-size:11px; font-family:var(--font-monospace); margin:8px 0; white-space:pre-wrap; word-break:break-all; opacity:0.8;'; new Setting(containerEl) .setName('Rebuild Vectors') From 2a78086f7921468093257a5e3d4c586c989f6060 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 19:15:43 +0800 Subject: [PATCH 082/132] feat: HF mirror endpoint support for model downloads behind firewalls --- paperforge/memory/vector_db.py | 7 +++++++ paperforge/plugin/main.js | 24 
+++++++++++++++++++----- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/paperforge/memory/vector_db.py b/paperforge/memory/vector_db.py index 156f622..ea75343 100644 --- a/paperforge/memory/vector_db.py +++ b/paperforge/memory/vector_db.py @@ -2,6 +2,7 @@ import json import logging +import os from pathlib import Path logger = logging.getLogger(__name__) @@ -74,6 +75,12 @@ def get_embedding_model(vault: Path): return None # API mode — embedding done externally model_name = settings.get("vector_db_model", "BAAI/bge-small-en-v1.5") + + # Apply HF mirror endpoint if configured + hf_endpoint = settings.get("vector_db_hf_endpoint", "") + if hf_endpoint: + os.environ["HF_ENDPOINT"] = hf_endpoint + if _cached_model is not None and _cached_model_name == model_name: return _cached_model diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index d7fac1d..061a0de 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -76,9 +76,22 @@ function checkRuntimeVersion(pythonExe, pluginVersion, cwd, timeout, _execFile) } else { resolve({ status: "mismatch", pyVersion: pyVer, pluginVersion, error: null }); } - }); - }); -} + }); + }); + + // HF Mirror (for users behind firewalls) + new Setting(containerEl) + .setName('HF Mirror / Endpoint') + .setDesc('HuggingFace mirror for model downloads. Default: hf-mirror.com (works in China). 
Set empty to use official.') + .addText(text => { + text.setPlaceholder('https://hf-mirror.com') + .setValue(this.plugin.settings.vector_db_hf_endpoint || '') + .onChange(value => { + this.plugin.settings.vector_db_hf_endpoint = value; + this.plugin.saveSettings(); + }); + }); + } function classifyError(errorCode) { const code = String(errorCode); @@ -570,6 +583,7 @@ const DEFAULT_SETTINGS = { vector_db_mode: 'local', vector_db_model: 'BAAI/bge-small-en-v1.5', vector_db_api_key: '', + vector_db_hf_endpoint: 'https://hf-mirror.com', vector_db_last_model: '', frozen_skills: {}, }; @@ -2871,7 +2885,7 @@ class PaperForgeSettingTab extends PluginSettingTab { const notice = new Notice('Installing chromadb + sentence-transformers...', 0); try { const { exec } = require('child_process'); - const env = Object.assign({}, process.env, { PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1' }); + const env = Object.assign({}, process.env, { PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1', HF_ENDPOINT: this.plugin.settings.vector_db_hf_endpoint || 'https://hf-mirror.com' }); await new Promise((resolve, reject) => { exec(`"${pyResult.path}" -m pip install chromadb sentence-transformers`, { encoding: 'utf-8', timeout: 300000, env: env, @@ -3045,7 +3059,7 @@ class PaperForgeSettingTab extends PluginSettingTab { terminalEl.setText(''); const { spawn } = require('child_process'); - const env = Object.assign({}, process.env, { PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1' }); + const env = Object.assign({}, process.env, { PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1', HF_ENDPOINT: this.plugin.settings.vector_db_hf_endpoint || 'https://hf-mirror.com' }); const child = spawn(pyResult.path, ['-m', 'paperforge', '--vault', vp, 'embed', 'build', '--force'], { env: env, stdio: ['ignore', 'pipe', 'pipe'] }); From 48be9b676ff40ccf2a0516eead350aff23b61e34 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 19:17:12 +0800 Subject: [PATCH 083/132] fix(plugin): check model cache integrity not 
just directory existence --- paperforge/plugin/main.js | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 061a0de..47d538a 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2967,8 +2967,23 @@ class PaperForgeSettingTab extends PluginSettingTab { const cacheName = 'models--' + model.replace('/', '--'); const fs = require('fs'); const os = require('os'); - const cachePath = os.homedir() + '/.cache/huggingface/hub/' + cacheName; - const isCached = fs.existsSync(cachePath); + const path = require('path'); + const cachePath = path.join(os.homedir(), '.cache', 'huggingface', 'hub', cacheName); + + // Check integrity: directory exists AND has snapshots with files + let isCached = false; + if (fs.existsSync(cachePath)) { + const snapDir = path.join(cachePath, 'snapshots'); + if (fs.existsSync(snapDir)) { + try { + const entries = fs.readdirSync(snapDir); + isCached = entries.some(e => { + const p = path.join(snapDir, e); + return fs.statSync(p).isDirectory() && fs.readdirSync(p).length > 0; + }); + } catch (_) {} + } + } if (isCached) { button.setButtonText('Uninstall').setWarning(); From b011b44d7a554fdcb8bc4d63fe1e1118389a1902 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 19:19:39 +0800 Subject: [PATCH 084/132] feat(plugin): HF mirror dropdown with presets + custom endpoint --- paperforge/plugin/main.js | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 47d538a..9b33088 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -82,15 +82,40 @@ function checkRuntimeVersion(pythonExe, pluginVersion, cwd, timeout, _execFile) // HF Mirror (for users behind firewalls) new Setting(containerEl) .setName('HF Mirror / Endpoint') - .setDesc('HuggingFace mirror for model downloads. 
Default: hf-mirror.com (works in China). Set empty to use official.') + .setDesc('Model download source. Try official if mirror fails. Custom: type any URL.') + .addDropdown(dropdown => { + dropdown.addOption('https://hf-mirror.com', 'hf-mirror.com (recommended)'); + dropdown.addOption('https://huggingface.co', 'huggingface.co (official)'); + dropdown.addOption('__custom__', 'Custom...'); + const current = this.plugin.settings.vector_db_hf_endpoint || 'https://hf-mirror.com'; + const isPreset = ['https://hf-mirror.com', 'https://huggingface.co'].includes(current); + dropdown.setValue(isPreset ? current : '__custom__') + .onChange(value => { + if (value !== '__custom__') { + this.plugin.settings.vector_db_hf_endpoint = value; + this.plugin.saveSettings(); + if (customInput) { customInput.settingEl.style.display = 'none'; customInput.setValue(''); } + } else { + customInput.settingEl.style.display = ''; + } + }); + }); + const customInput = new Setting(containerEl) + .setName('Custom Endpoint') + .setDesc('Enter a custom HuggingFace mirror URL') .addText(text => { - text.setPlaceholder('https://hf-mirror.com') - .setValue(this.plugin.settings.vector_db_hf_endpoint || '') + const current = this.plugin.settings.vector_db_hf_endpoint || ''; + const isPreset = ['https://hf-mirror.com', 'https://huggingface.co'].includes(current); + text.setPlaceholder('https://your-mirror.com') + .setValue(isPreset ? 
'' : current) .onChange(value => { this.plugin.settings.vector_db_hf_endpoint = value; this.plugin.saveSettings(); }); }); + const current = this.plugin.settings.vector_db_hf_endpoint || 'https://hf-mirror.com'; + const isPreset = ['https://hf-mirror.com', 'https://huggingface.co'].includes(current); + if (isPreset) customInput.settingEl.style.display = 'none'; } function classifyError(errorCode) { From 3fefccb841de2911c789f5cbb88fbcb1b8e2b903 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 19:21:21 +0800 Subject: [PATCH 085/132] fix: Python no longer overrides HF_ENDPOINT from stale settings --- paperforge/memory/vector_db.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/paperforge/memory/vector_db.py b/paperforge/memory/vector_db.py index ea75343..fe8ffd6 100644 --- a/paperforge/memory/vector_db.py +++ b/paperforge/memory/vector_db.py @@ -76,10 +76,7 @@ def get_embedding_model(vault: Path): model_name = settings.get("vector_db_model", "BAAI/bge-small-en-v1.5") - # Apply HF mirror endpoint if configured - hf_endpoint = settings.get("vector_db_hf_endpoint", "") - if hf_endpoint: - os.environ["HF_ENDPOINT"] = hf_endpoint + # HF_ENDPOINT is set by the JS plugin via environment variable — don't override if _cached_model is not None and _cached_model_name == model_name: return _cached_model From b172ed197461223494b8b53c4402aab036d0afe0 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 19:22:08 +0800 Subject: [PATCH 086/132] feat(plugin): user skills collapsed by default --- paperforge/plugin/main.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 9b33088..6bf2221 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2258,7 +2258,7 @@ class PaperForgeSettingTab extends PluginSettingTab { this._memoryStatusText = null; // null = not checked yet, string = cached result this._vectorDepsOk = null; // null 
= not checked, bool = cached this._embedStatusText = null; - this._skillsCollapsed = {}; // preserve collapse state across re-renders + this._skillsCollapsed = { user: true }; // User skills collapsed by default this.activeTab = 'setup'; } From 49649439e2e2729831a9fed1590af46fc53ffe44 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 20:24:04 +0800 Subject: [PATCH 087/132] fix: extract _renderHfMirror method, remove orphaned module-level code --- paperforge/plugin/main.js | 80 ++++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 38 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 6bf2221..0a360ee 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -78,44 +78,6 @@ function checkRuntimeVersion(pythonExe, pluginVersion, cwd, timeout, _execFile) } }); }); - - // HF Mirror (for users behind firewalls) - new Setting(containerEl) - .setName('HF Mirror / Endpoint') - .setDesc('Model download source. Try official if mirror fails. Custom: type any URL.') - .addDropdown(dropdown => { - dropdown.addOption('https://hf-mirror.com', 'hf-mirror.com (recommended)'); - dropdown.addOption('https://huggingface.co', 'huggingface.co (official)'); - dropdown.addOption('__custom__', 'Custom...'); - const current = this.plugin.settings.vector_db_hf_endpoint || 'https://hf-mirror.com'; - const isPreset = ['https://hf-mirror.com', 'https://huggingface.co'].includes(current); - dropdown.setValue(isPreset ? 
current : '__custom__') - .onChange(value => { - if (value !== '__custom__') { - this.plugin.settings.vector_db_hf_endpoint = value; - this.plugin.saveSettings(); - if (customInput) { customInput.settingEl.style.display = 'none'; customInput.setValue(''); } - } else { - customInput.settingEl.style.display = ''; - } - }); - }); - const customInput = new Setting(containerEl) - .setName('Custom Endpoint') - .setDesc('Enter a custom HuggingFace mirror URL') - .addText(text => { - const current = this.plugin.settings.vector_db_hf_endpoint || ''; - const isPreset = ['https://hf-mirror.com', 'https://huggingface.co'].includes(current); - text.setPlaceholder('https://your-mirror.com') - .setValue(isPreset ? '' : current) - .onChange(value => { - this.plugin.settings.vector_db_hf_endpoint = value; - this.plugin.saveSettings(); - }); - }); - const current = this.plugin.settings.vector_db_hf_endpoint || 'https://hf-mirror.com'; - const isPreset = ['https://hf-mirror.com', 'https://huggingface.co'].includes(current); - if (isPreset) customInput.settingEl.style.display = 'none'; } function classifyError(errorCode) { @@ -2850,6 +2812,9 @@ class PaperForgeSettingTab extends PluginSettingTab { const vp = this.app.vault.adapter.basePath; + // HF Mirror — always visible, needed before deps install + this._renderHfMirror(containerEl); + // === Resolve state === if (this._vectorDepsOk === true && this._embedStatusText !== null) { this._renderVectorReady(containerEl, vp); @@ -2890,6 +2855,45 @@ class PaperForgeSettingTab extends PluginSettingTab { } } + _renderHfMirror(containerEl) { + const setting = new Setting(containerEl) + .setName('HF Mirror / Endpoint') + .setDesc('Model download source. Try official if mirror fails. 
Custom: type any URL.') + .addDropdown(dropdown => { + dropdown.addOption('https://hf-mirror.com', 'hf-mirror.com (recommended)'); + dropdown.addOption('https://huggingface.co', 'huggingface.co (official)'); + dropdown.addOption('__custom__', 'Custom...'); + const current = this.plugin.settings.vector_db_hf_endpoint || 'https://hf-mirror.com'; + const isPreset = ['https://hf-mirror.com', 'https://huggingface.co'].includes(current); + dropdown.setValue(isPreset ? current : '__custom__') + .onChange(value => { + if (value !== '__custom__') { + this.plugin.settings.vector_db_hf_endpoint = value; + this.plugin.saveSettings(); + if (this._hfCustomInput) { this._hfCustomInput.settingEl.style.display = 'none'; this._hfCustomInput.setValue(''); } + } else { + if (this._hfCustomInput) this._hfCustomInput.settingEl.style.display = ''; + } + }); + }); + this._hfCustomInput = new Setting(containerEl) + .setName('Custom Endpoint') + .setDesc('Enter a custom HuggingFace mirror URL') + .addText(text => { + const current = this.plugin.settings.vector_db_hf_endpoint || ''; + const isPreset = ['https://hf-mirror.com', 'https://huggingface.co'].includes(current); + text.setPlaceholder('https://your-mirror.com') + .setValue(isPreset ? 
'' : current) + .onChange(value => { + this.plugin.settings.vector_db_hf_endpoint = value; + this.plugin.saveSettings(); + }); + }); + const current = this.plugin.settings.vector_db_hf_endpoint || 'https://hf-mirror.com'; + const isPreset = ['https://hf-mirror.com', 'https://huggingface.co'].includes(current); + if (isPreset) this._hfCustomInput.settingEl.style.display = 'none'; + } + _renderVectorNoDeps(containerEl) { const box = containerEl.createEl('div'); box.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; From 8bf9476a762985c809802d3f338e01df6409e1c6 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 20:54:07 +0800 Subject: [PATCH 088/132] fix: store Text component ref instead of Setting for setValue --- paperforge/plugin/main.js | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 0a360ee..ac4a326 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2856,7 +2856,8 @@ class PaperForgeSettingTab extends PluginSettingTab { } _renderHfMirror(containerEl) { - const setting = new Setting(containerEl) + let customText = null; + new Setting(containerEl) .setName('HF Mirror / Endpoint') .setDesc('Model download source. Try official if mirror fails. 
Custom: type any URL.') .addDropdown(dropdown => { @@ -2870,16 +2871,17 @@ class PaperForgeSettingTab extends PluginSettingTab { if (value !== '__custom__') { this.plugin.settings.vector_db_hf_endpoint = value; this.plugin.saveSettings(); - if (this._hfCustomInput) { this._hfCustomInput.settingEl.style.display = 'none'; this._hfCustomInput.setValue(''); } + if (customInput) { customInput.settingEl.style.display = 'none'; if (customText) customText.setValue(''); } } else { - if (this._hfCustomInput) this._hfCustomInput.settingEl.style.display = ''; + if (customInput) customInput.settingEl.style.display = ''; } }); }); - this._hfCustomInput = new Setting(containerEl) + const customInput = new Setting(containerEl) .setName('Custom Endpoint') .setDesc('Enter a custom HuggingFace mirror URL') .addText(text => { + customText = text; const current = this.plugin.settings.vector_db_hf_endpoint || ''; const isPreset = ['https://hf-mirror.com', 'https://huggingface.co'].includes(current); text.setPlaceholder('https://your-mirror.com') @@ -2891,7 +2893,7 @@ class PaperForgeSettingTab extends PluginSettingTab { }); const current = this.plugin.settings.vector_db_hf_endpoint || 'https://hf-mirror.com'; const isPreset = ['https://hf-mirror.com', 'https://huggingface.co'].includes(current); - if (isPreset) this._hfCustomInput.settingEl.style.display = 'none'; + if (isPreset) customInput.settingEl.style.display = 'none'; } _renderVectorNoDeps(containerEl) { From beda4119e890fc60cc731ecb236354bcd7c93cf4 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 21:03:24 +0800 Subject: [PATCH 089/132] feat: huggingface_hub.set_endpoint() mirror + API base_url support --- paperforge/memory/vector_db.py | 13 ++++++++++--- paperforge/plugin/main.js | 12 ++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/paperforge/memory/vector_db.py b/paperforge/memory/vector_db.py index fe8ffd6..8fba0d7 100644 --- a/paperforge/memory/vector_db.py +++ 
b/paperforge/memory/vector_db.py @@ -76,7 +76,14 @@ def get_embedding_model(vault: Path): model_name = settings.get("vector_db_model", "BAAI/bge-small-en-v1.5") - # HF_ENDPOINT is set by the JS plugin via environment variable — don't override + # Apply HF mirror endpoint via huggingface_hub API (works alongside env var) + hf_endpoint = settings.get("vector_db_hf_endpoint", "") or os.environ.get("HF_ENDPOINT", "") + if hf_endpoint: + try: + from huggingface_hub import set_endpoint + set_endpoint(hf_endpoint) + except Exception: + pass if _cached_model is not None and _cached_model_name == model_name: return _cached_model @@ -136,7 +143,7 @@ def _embed_paper_api(vault, zotero_key, chunks, collection) -> int: raise ValueError("No API key configured for vector DB") from openai import OpenAI - client = OpenAI(api_key=api_key) + client = OpenAI(api_key=api_key, base_url=settings.get("vector_db_api_base", None) or None) texts = [c["text"] for c in chunks] ids = [f"{zotero_key}_{c['chunk_index']}" for c in chunks] @@ -184,7 +191,7 @@ def retrieve_chunks(vault: Path, query: str, limit: int = 5, expand: bool = True if not api_key: raise ValueError("No API key configured for vector DB") from openai import OpenAI - client = OpenAI(api_key=api_key) + client = OpenAI(api_key=api_key, base_url=settings.get("vector_db_api_base", None) or None) response = client.embeddings.create(model="text-embedding-3-small", input=query) query_embedding = response.data[0].embedding else: diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index ac4a326..5dcab0d 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -570,6 +570,7 @@ const DEFAULT_SETTINGS = { vector_db_mode: 'local', vector_db_model: 'BAAI/bge-small-en-v1.5', vector_db_api_key: '', + vector_db_api_base: '', vector_db_hf_endpoint: 'https://hf-mirror.com', vector_db_last_model: '', frozen_skills: {}, @@ -3083,6 +3084,17 @@ class PaperForgeSettingTab extends PluginSettingTab { 
button.setDisabled(false); }); }); + new Setting(containerEl) + .setName('API Base URL') + .setDesc('Custom OpenAI-compatible API endpoint (e.g., https://api.openai.com/v1). Leave empty for default.') + .addText(text => { + text.setPlaceholder('https://api.openai.com/v1') + .setValue(this.plugin.settings.vector_db_api_base || '') + .onChange(value => { + this.plugin.settings.vector_db_api_base = value; + this.plugin.saveSettings(); + }); + }); } // Rebuild button with live terminal output From f718574087e807cdeee42fe6d090b35c2ddeb5ea Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 21:33:09 +0800 Subject: [PATCH 090/132] feat: urllib-based model download bypasses HF hub + HF_TOKEN field --- paperforge/memory/vector_db.py | 75 +++++++++++++++++++++++++++++----- paperforge/plugin/main.js | 17 +++++++- 2 files changed, 80 insertions(+), 12 deletions(-) diff --git a/paperforge/memory/vector_db.py b/paperforge/memory/vector_db.py index 8fba0d7..fdb8514 100644 --- a/paperforge/memory/vector_db.py +++ b/paperforge/memory/vector_db.py @@ -72,29 +72,84 @@ def get_embedding_model(vault: Path): mode = settings.get("vector_db_mode", "local") if mode == "api": - return None # API mode — embedding done externally + return None model_name = settings.get("vector_db_model", "BAAI/bge-small-en-v1.5") - # Apply HF mirror endpoint via huggingface_hub API (works alongside env var) - hf_endpoint = settings.get("vector_db_hf_endpoint", "") or os.environ.get("HF_ENDPOINT", "") - if hf_endpoint: - try: - from huggingface_hub import set_endpoint - set_endpoint(hf_endpoint) - except Exception: - pass - if _cached_model is not None and _cached_model_name == model_name: return _cached_model ST = _get_st() logger.info("Loading embedding model: %s", model_name) + + hf_endpoint = settings.get("vector_db_hf_endpoint", "") or os.environ.get("HF_ENDPOINT", "") + + if hf_endpoint: + local_path = _download_model_via_mirror(model_name, hf_endpoint) + if local_path and 
(local_path / "modules.json").exists(): + logger.info("Loading from local mirror copy: %s", local_path) + _cached_model = ST(str(local_path)) + _cached_model_name = model_name + return _cached_model + _cached_model = ST(model_name) _cached_model_name = model_name return _cached_model +def _download_model_via_mirror(model_name: str, mirror: str) -> Path | None: + """Download model files from a mirror URL to a local cache directory. + Bypasses huggingface_hub entirely by using urllib directly.""" + try: + import urllib.request + except Exception: + return None + + mirror = mirror.rstrip("/") + base_url = f"{mirror}/{model_name}/resolve/main" + local_dir = Path.home() / ".cache" / "paperforge" / "models" / model_name.replace("/", "--") + + files = [ + "config.json", "modules.json", "config_sentence_transformers.json", + "sentence_bert_config.json", "special_tokens_map.json", + "tokenizer.json", "tokenizer_config.json", "vocab.txt", + "model.safetensors", "pytorch_model.bin", + "1_Pooling/config.json", + ] + + local_dir.mkdir(parents=True, exist_ok=True) + + # Build headers from HF_TOKEN + hf_token = os.environ.get("HF_TOKEN", "") + headers = {} + if hf_token: + headers["Authorization"] = f"Bearer {hf_token}" + + for f in files: + dest = local_dir / f + if dest.exists() and dest.stat().st_size > 0: + continue + dest.parent.mkdir(parents=True, exist_ok=True) + url = f"{base_url}/{f}" + try: + req = urllib.request.Request(url, headers=headers) + with urllib.request.urlopen(req, timeout=600) as resp: + with open(dest, "wb") as out: + while True: + chunk = resp.read(8192) + if not chunk: + break + out.write(chunk) + except Exception: + pass + + # Return path only if core files exist + has_weights = (local_dir / "model.safetensors").exists() or (local_dir / "pytorch_model.bin").exists() + has_config = (local_dir / "modules.json").exists() and (local_dir / "config.json").exists() + return local_dir if has_weights and has_config else None + return _cached_model + + def 
embed_paper(vault: Path, zotero_key: str, chunks: list[dict]) -> int: """Embed chunks for one paper and insert into ChromaDB. Returns count.""" collection = get_collection(vault) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 5dcab0d..91e0820 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -572,6 +572,7 @@ const DEFAULT_SETTINGS = { vector_db_api_key: '', vector_db_api_base: '', vector_db_hf_endpoint: 'https://hf-mirror.com', + vector_db_hf_token: '', vector_db_last_model: '', frozen_skills: {}, }; @@ -2895,6 +2896,18 @@ class PaperForgeSettingTab extends PluginSettingTab { const current = this.plugin.settings.vector_db_hf_endpoint || 'https://hf-mirror.com'; const isPreset = ['https://hf-mirror.com', 'https://huggingface.co'].includes(current); if (isPreset) customInput.settingEl.style.display = 'none'; + + new Setting(containerEl) + .setName('HF Token') + .setDesc('HuggingFace access token (optional, helps with rate limits and gated models)') + .addText(text => { + text.setPlaceholder('hf_...') + .setValue(this.plugin.settings.vector_db_hf_token || '') + .onChange(value => { + this.plugin.settings.vector_db_hf_token = value; + this.plugin.saveSettings(); + }); + }); } _renderVectorNoDeps(containerEl) { @@ -2917,7 +2930,7 @@ class PaperForgeSettingTab extends PluginSettingTab { const notice = new Notice('Installing chromadb + sentence-transformers...', 0); try { const { exec } = require('child_process'); - const env = Object.assign({}, process.env, { PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1', HF_ENDPOINT: this.plugin.settings.vector_db_hf_endpoint || 'https://hf-mirror.com' }); + const env = Object.assign({}, process.env, { PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1', HF_ENDPOINT: this.plugin.settings.vector_db_hf_endpoint || 'https://hf-mirror.com', HF_TOKEN: this.plugin.settings.vector_db_hf_token || '' }); await new Promise((resolve, reject) => { exec(`"${pyResult.path}" -m pip install chromadb 
sentence-transformers`, { encoding: 'utf-8', timeout: 300000, env: env, @@ -3117,7 +3130,7 @@ class PaperForgeSettingTab extends PluginSettingTab { terminalEl.setText(''); const { spawn } = require('child_process'); - const env = Object.assign({}, process.env, { PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1', HF_ENDPOINT: this.plugin.settings.vector_db_hf_endpoint || 'https://hf-mirror.com' }); + const env = Object.assign({}, process.env, { PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1', HF_ENDPOINT: this.plugin.settings.vector_db_hf_endpoint || 'https://hf-mirror.com', HF_TOKEN: this.plugin.settings.vector_db_hf_token || '' }); const child = spawn(pyResult.path, ['-m', 'paperforge', '--vault', vp, 'embed', 'build', '--force'], { env: env, stdio: ['ignore', 'pipe', 'pipe'] }); From 9dc5ef3fff8ca39731bd2471e1869e9eb94c3a88 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 21:55:09 +0800 Subject: [PATCH 091/132] feat: API model as text field (custom any name) + fix Python duplicate --- paperforge/memory/vector_db.py | 11 ++++++----- paperforge/plugin/main.js | 19 +++++++++++++++++-- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/paperforge/memory/vector_db.py b/paperforge/memory/vector_db.py index fdb8514..62f5ec3 100644 --- a/paperforge/memory/vector_db.py +++ b/paperforge/memory/vector_db.py @@ -197,9 +197,6 @@ def _embed_paper_api(vault, zotero_key, chunks, collection) -> int: if not api_key: raise ValueError("No API key configured for vector DB") - from openai import OpenAI - client = OpenAI(api_key=api_key, base_url=settings.get("vector_db_api_base", None) or None) - texts = [c["text"] for c in chunks] ids = [f"{zotero_key}_{c['chunk_index']}" for c in chunks] metadatas = [ @@ -209,7 +206,10 @@ def _embed_paper_api(vault, zotero_key, chunks, collection) -> int: for c in chunks ] - response = client.embeddings.create(model="text-embedding-3-small", input=texts) + from openai import OpenAI + api_model = 
settings.get("vector_db_api_model", "text-embedding-3-small") + client = OpenAI(api_key=api_key, base_url=settings.get("vector_db_api_base", None) or None) + response = client.embeddings.create(model=api_model, input=texts) embeddings = [e.embedding for e in response.data] collection.add(ids=ids, embeddings=embeddings, documents=texts, metadatas=metadatas) @@ -247,7 +247,8 @@ def retrieve_chunks(vault: Path, query: str, limit: int = 5, expand: bool = True raise ValueError("No API key configured for vector DB") from openai import OpenAI client = OpenAI(api_key=api_key, base_url=settings.get("vector_db_api_base", None) or None) - response = client.embeddings.create(model="text-embedding-3-small", input=query) + api_model = settings.get("vector_db_api_model", "text-embedding-3-small") + response = client.embeddings.create(model=api_model, input=query) query_embedding = response.data[0].embedding else: query_embedding = model.encode(query).tolist() diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 91e0820..b477055 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -197,8 +197,22 @@ function runSubprocess(pythonExe, args, cwd, timeout, _spawn, env) { stderr: stderrChunks.join("") + "\n" + err.message, exitCode: -1, elapsed: Date.now() - startTime }); }); - }); -} + }); + new Setting(containerEl) + .setName('API Model') + .setDesc('Which OpenAI-compatible embedding model to use.') + new Setting(containerEl) + .setName('API Model') + .setDesc('Embedding model name (e.g., text-embedding-3-small, qwen-3-embedding)') + .addText(text => { + text.setPlaceholder('text-embedding-3-small') + .setValue(this.plugin.settings.vector_db_api_model || 'text-embedding-3-small') + .onChange(value => { + this.plugin.settings.vector_db_api_model = value; + this.plugin.saveSettings(); + }); + }); + } @@ -571,6 +585,7 @@ const DEFAULT_SETTINGS = { vector_db_model: 'BAAI/bge-small-en-v1.5', vector_db_api_key: '', vector_db_api_base: '', + 
vector_db_api_model: 'text-embedding-3-small', vector_db_hf_endpoint: 'https://hf-mirror.com', vector_db_hf_token: '', vector_db_last_model: '', From 5ae4f726fabeb6e1a9a24d5b24df3e94fa82db18 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 22:01:21 +0800 Subject: [PATCH 092/132] fix: add _renderApiConfig method, remove broken import() syntax --- paperforge/plugin/main.js | 93 +++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 52 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index b477055..170c594 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2832,6 +2832,9 @@ class PaperForgeSettingTab extends PluginSettingTab { // HF Mirror — always visible, needed before deps install this._renderHfMirror(containerEl); + // API config — always visible when API mode is selected (no deps needed) + this._renderApiConfig(containerEl); + // === Resolve state === if (this._vectorDepsOk === true && this._embedStatusText !== null) { this._renderVectorReady(containerEl, vp); @@ -2925,6 +2928,44 @@ class PaperForgeSettingTab extends PluginSettingTab { }); } + _renderApiConfig(containerEl) { + if (this.plugin.settings.vector_db_mode !== 'api') return; + + new Setting(containerEl) + .setName('OpenAI API Key') + .setDesc('Used for embedding model API calls') + .addText(text => { + text.setPlaceholder('sk-...') + .setValue(this.plugin.settings.vector_db_api_key || '') + .onChange(value => { + this.plugin.settings.vector_db_api_key = value; + this.plugin.saveSettings(); + }); + }); + new Setting(containerEl) + .setName('API Base URL') + .setDesc('Custom OpenAI-compatible API endpoint. 
Leave empty for default.') + .addText(text => { + text.setPlaceholder('https://api.openai.com/v1') + .setValue(this.plugin.settings.vector_db_api_base || '') + .onChange(value => { + this.plugin.settings.vector_db_api_base = value; + this.plugin.saveSettings(); + }); + }); + new Setting(containerEl) + .setName('API Model') + .setDesc('Embedding model name for this endpoint') + .addText(text => { + text.setPlaceholder('text-embedding-3-small') + .setValue(this.plugin.settings.vector_db_api_model || 'text-embedding-3-small') + .onChange(value => { + this.plugin.settings.vector_db_api_model = value; + this.plugin.saveSettings(); + }); + }); + } + _renderVectorNoDeps(containerEl) { const box = containerEl.createEl('div'); box.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; @@ -3073,58 +3114,6 @@ class PaperForgeSettingTab extends PluginSettingTab { }); } - // API key (api mode) - if (this.plugin.settings.vector_db_mode === 'api') { - new Setting(containerEl) - .setName('OpenAI API Key') - .setDesc('Used for text-embedding-3-small (1536d)') - .addText(text => { - text.setPlaceholder('sk-...') - .setValue(this.plugin.settings.vector_db_api_key || '') - .onChange(value => { - this.plugin.settings.vector_db_api_key = value; - this.plugin.saveSettings(); - }); - }) - .addButton(button => { - button.setButtonText('Verify') - .onClick(async () => { - const key = this.plugin.settings.vector_db_api_key; - if (!key || !key.startsWith('sk-')) { - new Notice('Enter a valid OpenAI API key.'); - return; - } - button.setButtonText('Checking...'); - button.setDisabled(true); - try { - const resp = await (0, eval)('import')('node:https').then(https => new Promise((resolve, reject) => { - const req = https.request('https://api.openai.com/v1/models', { - method: 'GET', headers: { Authorization: 'Bearer ' + key }, timeout: 10000 - }, (res) => { let d=''; res.on('data',c=>d+=c); 
res.on('end',()=>resolve({ok:res.statusCode===200,data:d})); }); - req.on('error', reject); req.end(); - })); - if (resp.ok) { new Notice('API key valid.'); } - else { new Notice('API key rejected.'); } - } catch (e) { - new Notice('Network error: ' + e.message); - } - button.setButtonText('Verify'); - button.setDisabled(false); - }); - }); - new Setting(containerEl) - .setName('API Base URL') - .setDesc('Custom OpenAI-compatible API endpoint (e.g., https://api.openai.com/v1). Leave empty for default.') - .addText(text => { - text.setPlaceholder('https://api.openai.com/v1') - .setValue(this.plugin.settings.vector_db_api_base || '') - .onChange(value => { - this.plugin.settings.vector_db_api_base = value; - this.plugin.saveSettings(); - }); - }); - } - // Rebuild button with live terminal output const terminalEl = containerEl.createEl('pre'); terminalEl.style.cssText = 'display:none; background:var(--background-primary); padding:10px; border-radius:4px; border:1px solid var(--background-modifier-border); max-height:250px; overflow-y:auto; font-size:11px; font-family:var(--font-monospace); margin:8px 0; white-space:pre-wrap; word-break:break-all; opacity:0.8;'; From af3fbc95a7f9e64bb7f2c7663f911c8068327787 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 22:06:42 +0800 Subject: [PATCH 093/132] fix: add openai to deps check + install cmd for API mode --- paperforge/plugin/main.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 170c594..a07dc41 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2813,7 +2813,7 @@ class PaperForgeSettingTab extends PluginSettingTab { new Setting(containerEl) .setName('Enable Vector Retrieval') - .setDesc('Semantic search across OCR fulltext. Requires: pip install chromadb sentence-transformers (~500MB).') + .setDesc('Semantic search across OCR fulltext. 
 Requires: pip install chromadb sentence-transformers openai (~500MB).')
     .addToggle(toggle => {
       toggle.setValue(this.plugin.settings.features.vector_db)
         .onChange(value => {
@@ -2857,7 +2857,7 @@ class PaperForgeSettingTab extends PluginSettingTab {
             return;
           }
           const { exec } = require('child_process');
-          exec(`"${pyResult.path}" -c "import chromadb; import sentence_transformers; print('ok')"`, {
+          exec(`"${pyResult.path}" -c "import chromadb; import sentence_transformers; import openai; print('ok')"`, {
             encoding: 'utf-8', timeout: 15000
           }, (err, stdout) => {
             const ok = !err && (stdout || '').trim() === 'ok';
@@ -2973,7 +2973,7 @@ class PaperForgeSettingTab extends PluginSettingTab {
 
     new Setting(containerEl)
       .setName('Install Dependencies')
-      .setDesc('pip install chromadb sentence-transformers (~500MB)')
+      .setDesc('pip install chromadb sentence-transformers openai (~500MB)')
       .addButton(button => {
         button.setButtonText('Install')
           .setCta()
@@ -2988,7 +2988,7 @@ class PaperForgeSettingTab extends PluginSettingTab {
           const { exec } = require('child_process');
           const env = Object.assign({}, process.env, { PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1', HF_ENDPOINT: this.plugin.settings.vector_db_hf_endpoint || 'https://hf-mirror.com', HF_TOKEN: this.plugin.settings.vector_db_hf_token || '' });
           await new Promise((resolve, reject) => {
-            exec(`"${pyResult.path}" -m pip install chromadb sentence-transformers`, {
+            exec(`"${pyResult.path}" -m pip install chromadb sentence-transformers openai`, {
               encoding: 'utf-8', timeout: 300000, env: env,
             }, (error) => { error ? 
reject(error) : resolve(); }); }); From a3f98865dc6c75ed60654a719ccf806435ce93f6 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 22:13:00 +0800 Subject: [PATCH 094/132] feat(plugin): click terminal output to copy to clipboard --- paperforge/plugin/main.js | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index a07dc41..420c48e 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -3117,6 +3117,10 @@ class PaperForgeSettingTab extends PluginSettingTab { // Rebuild button with live terminal output const terminalEl = containerEl.createEl('pre'); terminalEl.style.cssText = 'display:none; background:var(--background-primary); padding:10px; border-radius:4px; border:1px solid var(--background-modifier-border); max-height:250px; overflow-y:auto; font-size:11px; font-family:var(--font-monospace); margin:8px 0; white-space:pre-wrap; word-break:break-all; opacity:0.8;'; + terminalEl.onclick = () => { + const text = terminalEl.textContent; + if (text) { navigator.clipboard.writeText(text); new Notice('Output copied to clipboard'); } + }; new Setting(containerEl) .setName('Rebuild Vectors') From 1d308e3b7223cdd7abdb670ee45e9289410b9513 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 22:22:37 +0800 Subject: [PATCH 095/132] fix: pass API key/base/model as env vars to avoid saveSettings race --- paperforge/memory/vector_db.py | 12 ++++++++---- paperforge/plugin/main.js | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/paperforge/memory/vector_db.py b/paperforge/memory/vector_db.py index 62f5ec3..72007ad 100644 --- a/paperforge/memory/vector_db.py +++ b/paperforge/memory/vector_db.py @@ -207,8 +207,10 @@ def _embed_paper_api(vault, zotero_key, chunks, collection) -> int: ] from openai import OpenAI - api_model = settings.get("vector_db_api_model", "text-embedding-3-small") - client = OpenAI(api_key=api_key, 
base_url=settings.get("vector_db_api_base", None) or None) + api_model = os.environ.get("VECTOR_DB_API_MODEL", "") or settings.get("vector_db_api_model", "text-embedding-3-small") + api_base = os.environ.get("VECTOR_DB_API_BASE", "") or settings.get("vector_db_api_base", None) or None + api_key = os.environ.get("VECTOR_DB_API_KEY", "") or api_key + client = OpenAI(api_key=api_key, base_url=api_base) response = client.embeddings.create(model=api_model, input=texts) embeddings = [e.embedding for e in response.data] @@ -246,8 +248,10 @@ def retrieve_chunks(vault: Path, query: str, limit: int = 5, expand: bool = True if not api_key: raise ValueError("No API key configured for vector DB") from openai import OpenAI - client = OpenAI(api_key=api_key, base_url=settings.get("vector_db_api_base", None) or None) - api_model = settings.get("vector_db_api_model", "text-embedding-3-small") + api_base = os.environ.get("VECTOR_DB_API_BASE", "") or settings.get("vector_db_api_base", None) or None + api_key = os.environ.get("VECTOR_DB_API_KEY", "") or api_key + client = OpenAI(api_key=api_key, base_url=api_base) + api_model = os.environ.get("VECTOR_DB_API_MODEL", "") or settings.get("vector_db_api_model", "text-embedding-3-small") response = client.embeddings.create(model=api_model, input=query) query_embedding = response.data[0].embedding else: diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 420c48e..4768bf5 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -3138,7 +3138,7 @@ class PaperForgeSettingTab extends PluginSettingTab { terminalEl.setText(''); const { spawn } = require('child_process'); - const env = Object.assign({}, process.env, { PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1', HF_ENDPOINT: this.plugin.settings.vector_db_hf_endpoint || 'https://hf-mirror.com', HF_TOKEN: this.plugin.settings.vector_db_hf_token || '' }); + const env = Object.assign({}, process.env, { PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1', HF_ENDPOINT: 
this.plugin.settings.vector_db_hf_endpoint || 'https://hf-mirror.com', HF_TOKEN: this.plugin.settings.vector_db_hf_token || '', VECTOR_DB_API_KEY: this.plugin.settings.vector_db_api_key || '', VECTOR_DB_API_BASE: this.plugin.settings.vector_db_api_base || '', VECTOR_DB_API_MODEL: this.plugin.settings.vector_db_api_model || '' }); const child = spawn(pyResult.path, ['-m', 'paperforge', '--vault', vp, 'embed', 'build', '--force'], { env: env, stdio: ['ignore', 'pipe', 'pipe'] }); From cd4fea2217d36277f474cfd863ec1378b2a06aa8 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 22:25:20 +0800 Subject: [PATCH 096/132] debug: log API base_url to check env var passthrough --- paperforge/memory/vector_db.py | 1 + 1 file changed, 1 insertion(+) diff --git a/paperforge/memory/vector_db.py b/paperforge/memory/vector_db.py index 72007ad..a83d1b9 100644 --- a/paperforge/memory/vector_db.py +++ b/paperforge/memory/vector_db.py @@ -210,6 +210,7 @@ def _embed_paper_api(vault, zotero_key, chunks, collection) -> int: api_model = os.environ.get("VECTOR_DB_API_MODEL", "") or settings.get("vector_db_api_model", "text-embedding-3-small") api_base = os.environ.get("VECTOR_DB_API_BASE", "") or settings.get("vector_db_api_base", None) or None api_key = os.environ.get("VECTOR_DB_API_KEY", "") or api_key + logger.info("API mode: base_url=%s, model=%s", api_base or "(default OpenAI)", api_model) client = OpenAI(api_key=api_key, base_url=api_base) response = client.embeddings.create(model=api_model, input=texts) embeddings = [e.embedding for e in response.data] From d89a6275a665d054b831238d7016ebe182a3ed3b Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Wed, 13 May 2026 22:32:46 +0800 Subject: [PATCH 097/132] fix: _getCurrentModelKey reads actual API model instead of hardcoded value --- paperforge/plugin/main.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 4768bf5..189c9c0 100644 --- 
a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -3173,7 +3173,7 @@ class PaperForgeSettingTab extends PluginSettingTab { } _getCurrentModelKey() { - if (this.plugin.settings.vector_db_mode === 'api') return 'openai/text-embedding-3-small'; + if (this.plugin.settings.vector_db_mode === 'api') return this.plugin.settings.vector_db_api_model || 'openai/text-embedding-3-small'; return this.plugin.settings.vector_db_model || 'BAAI/bge-small-en-v1.5'; } From a52b1c78d2c981e7010e31df1df5c6f4b9f31ab0 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Thu, 14 May 2026 00:18:03 +0800 Subject: [PATCH 098/132] fix: get_embed_status reads API model; defer display() after async status refresh --- paperforge/memory/vector_db.py | 6 ++++-- paperforge/plugin/main.js | 3 +-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/paperforge/memory/vector_db.py b/paperforge/memory/vector_db.py index a83d1b9..08e8e80 100644 --- a/paperforge/memory/vector_db.py +++ b/paperforge/memory/vector_db.py @@ -295,9 +295,11 @@ def get_embed_status(vault: Path) -> dict: pass settings = _read_plugin_settings(vault) + mode = settings.get("vector_db_mode", "local") + model = settings.get("vector_db_api_model", "text-embedding-3-small") if mode == "api" else settings.get("vector_db_model", "BAAI/bge-small-en-v1.5") return { "db_exists": exists, "chunk_count": chunk_count, - "model": settings.get("vector_db_model", "BAAI/bge-small-en-v1.5"), - "mode": settings.get("vector_db_mode", "local"), + "model": model, + "mode": mode, } diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 189c9c0..21302c6 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -3159,9 +3159,8 @@ class PaperForgeSettingTab extends PluginSettingTab { this.plugin.settings.vector_db_last_model = currentModel; this.plugin.saveSettings(); this._embedStatusText = null; - this._execEmbedStatus(pyResult.path, vp, (text) => { this._embedStatusText = text; }); + 
this._execEmbedStatus(pyResult.path, vp, (text) => { this._embedStatusText = text; this.display(); }); new Notice('Vector build complete.'); - this.display(); } catch (e) { append('\n--- BUILD FAILED ---\n' + (e.stderr || e.message || e)); new Notice('Build failed. See terminal output.'); From 40ed946bb7b50001b68bf54280b262e008588458 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Thu, 14 May 2026 00:40:33 +0800 Subject: [PATCH 099/132] feat(plugin): i18n descriptions for Skills/Memory/Vector + collapsible vector config --- paperforge/plugin/main.js | 53 ++++++++++++++++++++++++++++----- paperforge/plugin/versions.json | 5 +++- scripts/bump.py | 4 +-- 3 files changed, 52 insertions(+), 10 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 21302c6..c51307a 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -507,6 +507,13 @@ Object.assign(LANG.en, { /* ── Tabbed Settings ── */ tab_setup: 'Installation', tab_features: 'Features', + /* ── Features tab descriptions ── */ + feat_skills_desc: 'Manage and enable/disable agent skills installed in your vault. Each row corresponds to a SKILL.md file — toggle off to prevent the agent from auto-invoking that skill.', + feat_skills_system: 'System Skills ship with PaperForge and are updated alongside PaperForge.', + feat_skills_user: 'User Skills are custom skills you install from community or create yourself.', + feat_memory_desc: 'The Memory Layer is the core data engine of PaperForge, powered by SQLite. It integrates all literature metadata (papers, assets, aliases, reading events), provides FTS5 full-text search across titles/abstracts/authors/collections, and enables the agent-context and paper-status commands. Always active — no toggle needed.', + feat_vector_desc: 'Vector Database enables semantic search across OCR-extracted fulltext using embedding models. Documents are split into chunks, embedded into vector space, and stored in ChromaDB. 
Supports local models (free, CPU) or OpenAI API (paid, faster).', + feat_vector_config_label: 'Advanced Configuration', }); /* ── LANG.zh: v1.12 runtime health, OCR queue, pf-deep, dashboard translations ── */ @@ -546,6 +553,13 @@ Object.assign(LANG.zh, { /* ── Tabbed Settings ── */ tab_setup: '安装', tab_features: '功能', + /* ── 功能介绍的描述文本 ── */ + feat_skills_desc: '管理 Vault 中已安装的 Agent 技能。每行对应一个 SKILL.md 文件,关闭开关可阻止 Agent 自动调用该技能。', + feat_skills_system: '系统技能随 PaperForge 一同发布,会跟随 PaperForge 版本更新。', + feat_skills_user: '用户技能是你自行安装或创建的自定义技能。', + feat_memory_desc: '记忆层是 PaperForge 的核心数据引擎,基于 SQLite 构建。它整合了所有文献元数据(论文、资源文件、别名、阅读事件),支持 FTS5 全文检索(可搜索标题、摘要、作者、分类),并为 agent-context 和 paper-status 命令提供数据支撑。始终运行,无需手动开启。', + feat_vector_desc: '向量数据库通过嵌入模型实现 OCR 全文的语义搜索。文档被切分为文本块(chunk),编码为向量存入 ChromaDB。支持本地模型(免费,CPU 运行)或 OpenAI API(付费,更快速)。', + feat_vector_config_label: '高级配置', }); function langFromApp(app) { @@ -2597,6 +2611,11 @@ class PaperForgeSettingTab extends PluginSettingTab { _renderFeaturesTab(containerEl) { // --- Section: Skills --- containerEl.createEl('h3', { text: 'Skills' }); + const skillsDescEl = containerEl.createEl('div', { cls: 'paperforge-desc-box' }); + skillsDescEl.style.cssText = 'padding:8px 12px; margin:0 0 12px; background:var(--background-secondary); border-radius:4px; font-size:12px; color:var(--text-muted); line-height:1.5;'; + skillsDescEl.setText(t('feat_skills_desc')); + skillsDescEl.createEl('br'); + skillsDescEl.createEl('span', { text: t('feat_skills_system'), cls: '' }).style.opacity = '0.7'; // Agent platform selector const agentPlatforms = { @@ -2781,6 +2800,10 @@ class PaperForgeSettingTab extends PluginSettingTab { // --- Section: Memory Layer --- containerEl.createEl('h3', { text: 'Memory Layer' }); + const memoryDescEl = containerEl.createEl('div', { cls: 'paperforge-desc-box' }); + memoryDescEl.style.cssText = 'padding:8px 12px; margin:0 0 12px; background:var(--background-secondary); border-radius:4px; font-size:12px; 
color:var(--text-muted); line-height:1.5;'; + memoryDescEl.setText(t('feat_memory_desc')); + // Always-on SQLite status display const statusRow = containerEl.createEl('div', { cls: 'paperforge-memory-status' }); statusRow.style.cssText = 'display:flex; align-items:center; padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; @@ -2811,6 +2834,10 @@ class PaperForgeSettingTab extends PluginSettingTab { // --- Vector Database (within Memory Layer) --- containerEl.createEl('h4', { text: 'Vector Database' }); + const vecDescEl = containerEl.createEl('div', { cls: 'paperforge-desc-box' }); + vecDescEl.style.cssText = 'padding:8px 12px; margin:0 0 8px; background:var(--background-secondary); border-radius:4px; font-size:12px; color:var(--text-muted); line-height:1.5;'; + vecDescEl.setText(t('feat_vector_desc')); + new Setting(containerEl) .setName('Enable Vector Retrieval') .setDesc('Semantic search across OCR fulltext. Requires: pip install chromadb sentence-transformers openai (~500MB).') @@ -2829,24 +2856,36 @@ class PaperForgeSettingTab extends PluginSettingTab { const vp = this.app.vault.adapter.basePath; - // HF Mirror — always visible, needed before deps install - this._renderHfMirror(containerEl); + // Collapsible config section + const vecConfigHeader = containerEl.createEl('div', { cls: 'paperforge-skills-collapse-header' }); + vecConfigHeader.style.cssText = 'display:flex; align-items:center; cursor:pointer; padding:6px 0 2px; margin:0;'; + const vecArrow = vecConfigHeader.createEl('span', { text: '\u25BC' }); + vecArrow.style.cssText = 'display:inline-block; font-size:10px; margin-right:6px; transition:transform 0.2s;'; + vecConfigHeader.createEl('span', { text: t('feat_vector_config_label'), cls: '' }).style.cssText = 'font-size:12px; color:var(--text-muted);'; + const vecConfigContent = containerEl.createEl('div', { cls: 'paperforge-vector-config' }); + + let vecConfigCollapsed = false; + 
vecConfigHeader.addEventListener('click', () => { + vecConfigCollapsed = !vecConfigCollapsed; + vecConfigContent.style.display = vecConfigCollapsed ? 'none' : ''; + vecArrow.style.transform = vecConfigCollapsed ? 'rotate(-90deg)' : 'rotate(0deg)'; + }); - // API config — always visible when API mode is selected (no deps needed) - this._renderApiConfig(containerEl); + // HF Mirror — always visible, needed before deps install + this._renderHfMirror(vecConfigContent); // === Resolve state === if (this._vectorDepsOk === true && this._embedStatusText !== null) { - this._renderVectorReady(containerEl, vp); + this._renderVectorReady(vecConfigContent, vp); return; } if (this._vectorDepsOk === false) { - this._renderVectorNoDeps(containerEl); + this._renderVectorNoDeps(vecConfigContent); return; } // First check — deps unknown, run async if (this._vectorDepsOk === null) { - const statusBox = containerEl.createEl('div'); + const statusBox = vecConfigContent.createEl('div'); statusBox.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; statusBox.setText('Checking dependencies...'); diff --git a/paperforge/plugin/versions.json b/paperforge/plugin/versions.json index ecd95b7..f3dcc74 100644 --- a/paperforge/plugin/versions.json +++ b/paperforge/plugin/versions.json @@ -5,5 +5,8 @@ "1.4.18": "1.9.0", "1.5.0": "1.9.0", "1.5.1": "1.9.0", - "1.5.2": "1.9.0" + "1.5.2": "1.9.0", + "1.5.3": "1.9.0", + "1.5.4": "1.9.0", + "1.5.5": "1.9.0" } \ No newline at end of file diff --git a/scripts/bump.py b/scripts/bump.py index c29d393..cd7dd84 100644 --- a/scripts/bump.py +++ b/scripts/bump.py @@ -101,8 +101,8 @@ def main(): except subprocess.CalledProcessError: sys.exit("VERIFY FAILED: cannot read __init__.py from HEAD") - run(["git", "tag", "-f", f"v{new_ver}"]) - print(f"Committed and tagged v{new_ver}") + run(["git", "tag", "-f", new_ver]) + print(f"Committed and tagged {new_ver}") print("Run: git push && git push --tags") From 
c0059e620318a41a1c74f2a44b75b15260b729d5 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Thu, 14 May 2026 00:51:49 +0800 Subject: [PATCH 100/132] i18n(plugin): translate all Feature tab UI strings to en/zh --- manifest.json | 2 +- paperforge/plugin/main.js | 89 +++++++++++++++++++++++++-------- paperforge/plugin/manifest.json | 2 +- 3 files changed, 69 insertions(+), 24 deletions(-) diff --git a/manifest.json b/manifest.json index af16719..21ba972 100644 --- a/manifest.json +++ b/manifest.json @@ -3,7 +3,7 @@ "name": "PaperForge", "version": "1.5.6rc1", "minAppVersion": "1.9.0", - "description": "PaperForge — Zotero literature pipeline. Sync PDFs, run OCR, and read with AI-assisted deep reading.", + "description": "Zotero literature pipeline for Obsidian. Sync PDFs, run OCR, and read with AI-assisted deep reading.", "author": "Lin Zhaoxuan", "authorUrl": "https://github.com/LLLin000", "isDesktopOnly": true diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index c51307a..d16260b 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -514,6 +514,29 @@ Object.assign(LANG.en, { feat_memory_desc: 'The Memory Layer is the core data engine of PaperForge, powered by SQLite. It integrates all literature metadata (papers, assets, aliases, reading events), provides FTS5 full-text search across titles/abstracts/authors/collections, and enables the agent-context and paper-status commands. Always active — no toggle needed.', feat_vector_desc: 'Vector Database enables semantic search across OCR-extracted fulltext using embedding models. Documents are split into chunks, embedded into vector space, and stored in ChromaDB. 
Supports local models (free, CPU) or OpenAI API (paid, faster).', feat_vector_config_label: 'Advanced Configuration', + feat_agent_platform: 'Agent Platform', + feat_agent_platform_desc: 'Select which agent platform to manage skills for.', + feat_vector_enable: 'Enable Vector Retrieval', + feat_vector_enable_desc: 'Semantic search across OCR fulltext. Requires: pip install chromadb sentence-transformers openai (~500MB).', + feat_hf_mirror: 'HF Mirror / Endpoint', + feat_hf_mirror_desc: 'Model download source. Try official if mirror fails. Custom: type any URL.', + feat_custom_endpoint: 'Custom Endpoint', + feat_custom_endpoint_desc: 'Enter a custom HuggingFace mirror URL.', + feat_hf_token: 'HF Token', + feat_hf_token_desc: 'HuggingFace access token (optional, helps with rate limits and gated models).', + feat_model: 'Model', + feat_embed_mode: 'Embedding Mode', + feat_embed_mode_local: 'Local (free, CPU)', + feat_embed_mode_api: 'API (OpenAI, paid)', + feat_openai_key: 'OpenAI API Key', + feat_openai_key_desc: 'Used for text-embedding-3-small (1536d).', + feat_verify: 'Verify', + feat_checking: 'Checking...', + feat_rebuild_vectors: 'Rebuild Vectors', + feat_rebuild_vectors_desc: 'Rebuild all OCR fulltext vectors. 
Required after model or mode change.', + feat_rebuild_vectors_changed: 'Model changed — rebuild to update all vectors.', + feat_install_deps: 'Install Dependencies', + feat_install_deps_desc: 'pip install chromadb sentence-transformers openai (~500MB).', }); /* ── LANG.zh: v1.12 runtime health, OCR queue, pf-deep, dashboard translations ── */ @@ -560,6 +583,29 @@ Object.assign(LANG.zh, { feat_memory_desc: '记忆层是 PaperForge 的核心数据引擎,基于 SQLite 构建。它整合了所有文献元数据(论文、资源文件、别名、阅读事件),支持 FTS5 全文检索(可搜索标题、摘要、作者、分类),并为 agent-context 和 paper-status 命令提供数据支撑。始终运行,无需手动开启。', feat_vector_desc: '向量数据库通过嵌入模型实现 OCR 全文的语义搜索。文档被切分为文本块(chunk),编码为向量存入 ChromaDB。支持本地模型(免费,CPU 运行)或 OpenAI API(付费,更快速)。', feat_vector_config_label: '高级配置', + feat_agent_platform: 'Agent 平台', + feat_agent_platform_desc: '选择要管理的 Agent 平台。', + feat_vector_enable: '启用向量检索', + feat_vector_enable_desc: '对 OCR 全文进行语义搜索。需安装: pip install chromadb sentence-transformers openai (~500MB)。', + feat_hf_mirror: 'HF 镜像站 / 端点', + feat_hf_mirror_desc: '模型下载源。镜像不可用时尝试官方源。自定义:输入任意 URL。', + feat_custom_endpoint: '自定义端点', + feat_custom_endpoint_desc: '输入自定义 HuggingFace 镜像 URL。', + feat_hf_token: 'HF Token', + feat_hf_token_desc: 'HuggingFace 访问令牌(可选,有助于解除限速和下载受限模型)。', + feat_model: '模型', + feat_embed_mode: '嵌入模式', + feat_embed_mode_local: '本地(免费,CPU)', + feat_embed_mode_api: 'API(OpenAI,付费)', + feat_openai_key: 'OpenAI API Key', + feat_openai_key_desc: '用于 text-embedding-3-small(1536 维)。', + feat_verify: '验证', + feat_checking: '检测中…', + feat_rebuild_vectors: '重建向量', + feat_rebuild_vectors_desc: '重建所有 OCR 全文向量。更换模型或模式后需要重建。', + feat_rebuild_vectors_changed: '模型已更换 — 需要重建向量。', + feat_install_deps: '安装依赖', + feat_install_deps_desc: 'pip install chromadb sentence-transformers openai (~500MB)。', }); function langFromApp(app) { @@ -2642,8 +2688,8 @@ class PaperForgeSettingTab extends PluginSettingTab { let selectedPlatform = this.plugin.settings.selected_skill_platform || 'opencode'; new Setting(containerEl) - .setName('Agent Platform') - 
.setDesc('Select which agent platform to manage skills for.') + .setName(t('feat_agent_platform')) + .setDesc(t('feat_agent_platform_desc')) .addDropdown(dropdown => { Object.entries(agentPlatforms).forEach(([key, label]) => dropdown.addOption(key, label)); dropdown.setValue(selectedPlatform) @@ -2839,8 +2885,8 @@ class PaperForgeSettingTab extends PluginSettingTab { vecDescEl.setText(t('feat_vector_desc')); new Setting(containerEl) - .setName('Enable Vector Retrieval') - .setDesc('Semantic search across OCR fulltext. Requires: pip install chromadb sentence-transformers openai (~500MB).') + .setName(t('feat_vector_enable')) + .setDesc(t('feat_vector_enable_desc')) .addToggle(toggle => { toggle.setValue(this.plugin.settings.features.vector_db) .onChange(value => { @@ -2915,10 +2961,9 @@ class PaperForgeSettingTab extends PluginSettingTab { } _renderHfMirror(containerEl) { - let customText = null; - new Setting(containerEl) - .setName('HF Mirror / Endpoint') - .setDesc('Model download source. Try official if mirror fails. 
Custom: type any URL.') + const setting = new Setting(containerEl) + .setName(t('feat_hf_mirror')) + .setDesc(t('feat_hf_mirror_desc')) .addDropdown(dropdown => { dropdown.addOption('https://hf-mirror.com', 'hf-mirror.com (recommended)'); dropdown.addOption('https://huggingface.co', 'huggingface.co (official)'); @@ -2937,8 +2982,8 @@ class PaperForgeSettingTab extends PluginSettingTab { }); }); const customInput = new Setting(containerEl) - .setName('Custom Endpoint') - .setDesc('Enter a custom HuggingFace mirror URL') + .setName(t('feat_custom_endpoint')) + .setDesc(t('feat_custom_endpoint_desc')) .addText(text => { customText = text; const current = this.plugin.settings.vector_db_hf_endpoint || ''; @@ -2955,8 +3000,8 @@ class PaperForgeSettingTab extends PluginSettingTab { if (isPreset) customInput.settingEl.style.display = 'none'; new Setting(containerEl) - .setName('HF Token') - .setDesc('HuggingFace access token (optional, helps with rate limits and gated models)') + .setName(t('feat_hf_token')) + .setDesc(t('feat_hf_token_desc')) .addText(text => { text.setPlaceholder('hf_...') .setValue(this.plugin.settings.vector_db_hf_token || '') @@ -2971,8 +3016,8 @@ class PaperForgeSettingTab extends PluginSettingTab { if (this.plugin.settings.vector_db_mode !== 'api') return; new Setting(containerEl) - .setName('OpenAI API Key') - .setDesc('Used for embedding model API calls') + .setName(t('feat_openai_key')) + .setDesc(t('feat_openai_key_desc')) .addText(text => { text.setPlaceholder('sk-...') .setValue(this.plugin.settings.vector_db_api_key || '') @@ -3011,8 +3056,8 @@ class PaperForgeSettingTab extends PluginSettingTab { box.setText('Dependencies not installed. 
Required: chromadb, sentence-transformers.'); new Setting(containerEl) - .setName('Install Dependencies') - .setDesc('pip install chromadb sentence-transformers openai(~500MB)') + .setName(t('feat_install_deps')) + .setDesc(t('feat_install_deps_desc')) .addButton(button => { button.setButtonText('Install') .setCta() @@ -3069,10 +3114,10 @@ class PaperForgeSettingTab extends PluginSettingTab { // Mode selector new Setting(containerEl) - .setName('Embedding Mode') + .setName(t('feat_embed_mode')) .addDropdown(dropdown => { - dropdown.addOption('local', 'Local (free, CPU)'); - dropdown.addOption('api', 'API (OpenAI, paid)'); + dropdown.addOption('local', t('feat_embed_mode_local')); + dropdown.addOption('api', t('feat_embed_mode_api')); dropdown.setValue(this.plugin.settings.vector_db_mode) .onChange(value => { this.plugin.settings.vector_db_mode = value; @@ -3089,7 +3134,7 @@ class PaperForgeSettingTab extends PluginSettingTab { 'BAAI/bge-base-en-v1.5': 'Highest accuracy — slower, large disk footprint (768d, 440MB)', }; new Setting(containerEl) - .setName('Model') + .setName(t('feat_model')) .setDesc(modelDesc[this.plugin.settings.vector_db_model] || '') .addDropdown(dropdown => { dropdown.addOption('BAAI/bge-small-en-v1.5', 'bge-small (384d, 130MB)'); @@ -3162,8 +3207,8 @@ class PaperForgeSettingTab extends PluginSettingTab { }; new Setting(containerEl) - .setName('Rebuild Vectors') - .setDesc(modelChanged ? 'Model changed — rebuild to update all vectors.' : 'Rebuild all OCR fulltext vectors. Required after model or mode change.') + .setName(t('feat_rebuild_vectors')) + .setDesc(modelChanged ? t('feat_rebuild_vectors_changed') : t('feat_rebuild_vectors_desc')) .addButton(button => { const label = embedInfo && embedInfo.db_exists ? 
'Rebuild' : 'Build'; button.setButtonText(label) diff --git a/paperforge/plugin/manifest.json b/paperforge/plugin/manifest.json index af16719..21ba972 100644 --- a/paperforge/plugin/manifest.json +++ b/paperforge/plugin/manifest.json @@ -3,7 +3,7 @@ "name": "PaperForge", "version": "1.5.6rc1", "minAppVersion": "1.9.0", - "description": "PaperForge — Zotero literature pipeline. Sync PDFs, run OCR, and read with AI-assisted deep reading.", + "description": "Zotero literature pipeline for Obsidian. Sync PDFs, run OCR, and read with AI-assisted deep reading.", "author": "Lin Zhaoxuan", "authorUrl": "https://github.com/LLLin000", "isDesktopOnly": true From 6c86acf2715c97bf4eb4a47095b703a931c17988 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Thu, 14 May 2026 00:53:20 +0800 Subject: [PATCH 101/132] fix(plugin): undeclared customText variable in _renderHfMirror --- paperforge/plugin/main.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index d16260b..c168a39 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2975,7 +2975,7 @@ class PaperForgeSettingTab extends PluginSettingTab { if (value !== '__custom__') { this.plugin.settings.vector_db_hf_endpoint = value; this.plugin.saveSettings(); - if (customInput) { customInput.settingEl.style.display = 'none'; if (customText) customText.setValue(''); } + if (customInput) { customInput.settingEl.style.display = 'none'; if (this._hfCustomText) this._hfCustomText.setValue(''); } } else { if (customInput) customInput.settingEl.style.display = ''; } @@ -2985,7 +2985,7 @@ class PaperForgeSettingTab extends PluginSettingTab { .setName(t('feat_custom_endpoint')) .setDesc(t('feat_custom_endpoint_desc')) .addText(text => { - customText = text; + this._hfCustomText = text; const current = this.plugin.settings.vector_db_hf_endpoint || ''; const isPreset = ['https://hf-mirror.com', 'https://huggingface.co'].includes(current); 
text.setPlaceholder('https://your-mirror.com') From 9c3a4d6e21d04461220225c34a1612389e7f445d Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Thu, 14 May 2026 00:55:16 +0800 Subject: [PATCH 102/132] i18n(plugin): translate local model descriptions to en/zh --- paperforge/plugin/main.js | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index c168a39..3d39348 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -537,6 +537,9 @@ Object.assign(LANG.en, { feat_rebuild_vectors_changed: 'Model changed — rebuild to update all vectors.', feat_install_deps: 'Install Dependencies', feat_install_deps_desc: 'pip install chromadb sentence-transformers openai (~500MB).', + feat_model_bge_small: 'Best balance — fast, accurate, recommended for most users (384d, 130MB)', + feat_model_minilm: 'Lightest & fastest — lower accuracy, minimal disk (384d, 80MB)', + feat_model_bge_base: 'Highest accuracy — slower, large disk footprint (768d, 440MB)', }); /* ── LANG.zh: v1.12 runtime health, OCR queue, pf-deep, dashboard translations ── */ @@ -606,6 +609,9 @@ Object.assign(LANG.zh, { feat_rebuild_vectors_changed: '模型已更换 — 需要重建向量。', feat_install_deps: '安装依赖', feat_install_deps_desc: 'pip install chromadb sentence-transformers openai (~500MB)。', + feat_model_bge_small: '最佳平衡 — 快速、准确,推荐大多数用户使用 (384d, 130MB)', + feat_model_minilm: '最轻最快 — 精度略低,磁盘占用最小 (384d, 80MB)', + feat_model_bge_base: '最高精度 — 较慢,磁盘占用大 (768d, 440MB)', }); function langFromApp(app) { @@ -3129,9 +3135,9 @@ class PaperForgeSettingTab extends PluginSettingTab { // Model selector (local mode) if (this.plugin.settings.vector_db_mode === 'local') { const modelDesc = { - 'BAAI/bge-small-en-v1.5': 'Best balance — fast, accurate, recommended for most users (384d, 130MB)', - 'sentence-transformers/all-MiniLM-L6-v2': 'Lightest & fastest — lower accuracy, minimal disk (384d, 80MB)', - 'BAAI/bge-base-en-v1.5': 'Highest accuracy — 
slower, large disk footprint (768d, 440MB)', + 'BAAI/bge-small-en-v1.5': t('feat_model_bge_small'), + 'sentence-transformers/all-MiniLM-L6-v2': t('feat_model_minilm'), + 'BAAI/bge-base-en-v1.5': t('feat_model_bge_base'), }; new Setting(containerEl) .setName(t('feat_model')) From 455c83d05e58ccd57dc1df25e4249e3056279939 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Thu, 14 May 2026 00:58:30 +0800 Subject: [PATCH 103/132] fix(plugin): restore API settings + hide HF in API mode --- paperforge/plugin/main.js | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 3d39348..bdad6e5 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -2923,9 +2923,6 @@ class PaperForgeSettingTab extends PluginSettingTab { vecArrow.style.transform = vecConfigCollapsed ? 'rotate(-90deg)' : 'rotate(0deg)'; }); - // HF Mirror — always visible, needed before deps install - this._renderHfMirror(vecConfigContent); - // === Resolve state === if (this._vectorDepsOk === true && this._embedStatusText !== null) { this._renderVectorReady(vecConfigContent, vp); @@ -3134,6 +3131,8 @@ class PaperForgeSettingTab extends PluginSettingTab { // Model selector (local mode) if (this.plugin.settings.vector_db_mode === 'local') { + // HF settings only relevant for local model downloads + this._renderHfMirror(containerEl); const modelDesc = { 'BAAI/bge-small-en-v1.5': t('feat_model_bge_small'), 'sentence-transformers/all-MiniLM-L6-v2': t('feat_model_minilm'), @@ -3204,6 +3203,9 @@ class PaperForgeSettingTab extends PluginSettingTab { }); } + // API config (api mode) + this._renderApiConfig(containerEl); + // Rebuild button with live terminal output const terminalEl = containerEl.createEl('pre'); terminalEl.style.cssText = 'display:none; background:var(--background-primary); padding:10px; border-radius:4px; border:1px solid var(--background-modifier-border); max-height:250px; overflow-y:auto; 
font-size:11px; font-family:var(--font-monospace); margin:8px 0; white-space:pre-wrap; word-break:break-all; opacity:0.8;'; From caf7c100413fe25b28e77ca68dc96197dffdb806 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Thu, 14 May 2026 01:07:50 +0800 Subject: [PATCH 104/132] i18n(plugin): translate all remaining Features tab strings to en/zh --- paperforge/plugin/main.js | 94 ++++++++++++++++++++++++++++++--------- 1 file changed, 74 insertions(+), 20 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index bdad6e5..68f3e41 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -540,6 +540,33 @@ Object.assign(LANG.en, { feat_model_bge_small: 'Best balance — fast, accurate, recommended for most users (384d, 130MB)', feat_model_minilm: 'Lightest & fastest — lower accuracy, minimal disk (384d, 80MB)', feat_model_bge_base: 'Highest accuracy — slower, large disk footprint (768d, 440MB)', + feat_api_base_url: 'API Base URL', + feat_api_base_url_desc: 'Custom OpenAI-compatible API endpoint. Leave empty for default.', + feat_api_model: 'API Model', + feat_api_model_desc: 'Embedding model name for this endpoint.', + feat_deps_missing: 'Dependencies not installed. Required: chromadb, sentence-transformers, openai.', + feat_deps_checking: 'Checking dependencies...', + feat_no_python: 'No Python found. Check Installation tab.', + feat_rebuild_btn: 'Rebuild', + feat_build_btn: 'Build', + feat_building: 'Building...', + feat_installing: 'Installing...', + feat_install_btn: 'Install', + feat_retry_btn: 'Retry', + feat_removing: 'Removing...', + feat_not_cached: 'Not cached', + feat_uninstall_btn: 'Uninstall', + feat_verify_btn: 'Verify', + feat_checking_btn: 'Checking...', + feat_valid_key: 'API key valid.', + feat_key_rejected: 'API key rejected.', + feat_enter_key: 'Enter a valid OpenAI API key.', + feat_network_error: 'Network error: ', + feat_build_complete: 'Vector build complete.', + feat_build_failed: 'Build failed. 
See terminal output.', + feat_output_copied: 'Output copied to clipboard.', + feat_install_done: 'Dependencies installed. Building vectors...', + feat_install_failed: 'Install failed: ', }); /* ── LANG.zh: v1.12 runtime health, OCR queue, pf-deep, dashboard translations ── */ @@ -612,6 +639,33 @@ Object.assign(LANG.zh, { feat_model_bge_small: '最佳平衡 — 快速、准确,推荐大多数用户使用 (384d, 130MB)', feat_model_minilm: '最轻最快 — 精度略低,磁盘占用最小 (384d, 80MB)', feat_model_bge_base: '最高精度 — 较慢,磁盘占用大 (768d, 440MB)', + feat_api_base_url: 'API 地址', + feat_api_base_url_desc: '自定义 OpenAI 兼容 API 端点。留空使用默认地址。', + feat_api_model: 'API 模型', + feat_api_model_desc: '该端点使用的嵌入模型名称。', + feat_deps_missing: '依赖未安装。需要:chromadb, sentence-transformers, openai。', + feat_deps_checking: '正在检测依赖…', + feat_no_python: '未找到 Python。请查看安装标签页。', + feat_rebuild_btn: '重建', + feat_build_btn: '构建', + feat_building: '构建中…', + feat_installing: '安装中…', + feat_install_btn: '安装', + feat_retry_btn: '重试', + feat_removing: '删除中…', + feat_not_cached: '未缓存', + feat_uninstall_btn: '卸载', + feat_verify_btn: '验证', + feat_checking_btn: '检测中…', + feat_valid_key: 'API Key 有效。', + feat_key_rejected: 'API Key 被拒绝。', + feat_enter_key: '请输入有效的 OpenAI API Key。', + feat_network_error: '网络错误:', + feat_build_complete: '向量构建完成。', + feat_build_failed: '构建失败。请查看终端输出。', + feat_output_copied: '输出已复制到剪贴板。', + feat_install_done: '依赖已安装。正在构建向量…', + feat_install_failed: '安装失败:', }); function langFromApp(app) { @@ -2936,11 +2990,11 @@ class PaperForgeSettingTab extends PluginSettingTab { if (this._vectorDepsOk === null) { const statusBox = vecConfigContent.createEl('div'); statusBox.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; - statusBox.setText('Checking dependencies...'); + statusBox.setText(t('feat_deps_checking')); const pyResult = resolvePythonExecutable(vp, this.plugin.settings); if (!pyResult.path) { - statusBox.setText('No Python found. 
Check Installation tab.'); + statusBox.setText(t('feat_no_python')); this._vectorDepsOk = false; return; } @@ -3030,8 +3084,8 @@ class PaperForgeSettingTab extends PluginSettingTab { }); }); new Setting(containerEl) - .setName('API Base URL') - .setDesc('Custom OpenAI-compatible API endpoint. Leave empty for default.') + .setName(t('feat_api_base_url')) + .setDesc(t('feat_api_base_url_desc')) .addText(text => { text.setPlaceholder('https://api.openai.com/v1') .setValue(this.plugin.settings.vector_db_api_base || '') @@ -3041,8 +3095,8 @@ class PaperForgeSettingTab extends PluginSettingTab { }); }); new Setting(containerEl) - .setName('API Model') - .setDesc('Embedding model name for this endpoint') + .setName(t('feat_api_model')) + .setDesc(t('feat_api_model_desc')) .addText(text => { text.setPlaceholder('text-embedding-3-small') .setValue(this.plugin.settings.vector_db_api_model || 'text-embedding-3-small') @@ -3056,21 +3110,21 @@ class PaperForgeSettingTab extends PluginSettingTab { _renderVectorNoDeps(containerEl) { const box = containerEl.createEl('div'); box.style.cssText = 'padding:8px 12px; margin:8px 0; background:var(--background-secondary); border-radius:4px;'; - box.setText('Dependencies not installed. 
Required: chromadb, sentence-transformers.'); + box.setText(t('feat_deps_missing')); new Setting(containerEl) .setName(t('feat_install_deps')) .setDesc(t('feat_install_deps_desc')) .addButton(button => { - button.setButtonText('Install') + button.setButtonText(t('feat_install_btn')) .setCta() .onClick(async () => { const vp = this.app.vault.adapter.basePath; const pyResult = resolvePythonExecutable(vp, this.plugin.settings); if (!pyResult.path) { new Notice('No Python found.'); return; } - button.setButtonText('Installing...'); - button.setDisabled(true); - const notice = new Notice('Installing chromadb + sentence-transformers...', 0); + button.setButtonText(t('feat_installing')); + button.setDisabled(true); + const notice = new Notice('Installing chromadb + sentence-transformers + openai...', 0); try { const { exec } = require('child_process'); const env = Object.assign({}, process.env, { PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1', HF_ENDPOINT: this.plugin.settings.vector_db_hf_endpoint || 'https://hf-mirror.com', HF_TOKEN: this.plugin.settings.vector_db_hf_token || '' }); @@ -3090,7 +3144,7 @@ class PaperForgeSettingTab extends PluginSettingTab { } catch (e) { notice.hide(); new Notice('Install failed: ' + (e.stderr || e.message || e)); - button.setButtonText('Retry'); + button.setButtonText(t('feat_retry_btn')); button.setDisabled(false); } }); @@ -3176,14 +3230,14 @@ class PaperForgeSettingTab extends PluginSettingTab { } if (isCached) { - button.setButtonText('Uninstall').setWarning(); + button.setButtonText(t('feat_uninstall_btn')).setWarning(); } else { - button.setButtonText('Not cached'); + button.setButtonText(t('feat_not_cached')); button.setDisabled(true); } button.onClick(async () => { if (!isCached) return; - button.setButtonText('Removing...'); + button.setButtonText(t('feat_removing')); button.setDisabled(true); try { const pyResult = resolvePythonExecutable(vp, this.plugin.settings); @@ -3218,13 +3272,13 @@ class PaperForgeSettingTab extends 
PluginSettingTab { .setName(t('feat_rebuild_vectors')) .setDesc(modelChanged ? t('feat_rebuild_vectors_changed') : t('feat_rebuild_vectors_desc')) .addButton(button => { - const label = embedInfo && embedInfo.db_exists ? 'Rebuild' : 'Build'; + const label = embedInfo && embedInfo.db_exists ? t('feat_rebuild_btn') : t('feat_build_btn'); button.setButtonText(label) .setCta() .onClick(async () => { const pyResult = resolvePythonExecutable(vp, this.plugin.settings); - if (!pyResult.path) { new Notice('No Python found.'); return; } - button.setButtonText('Building...'); + if (!pyResult.path) { new Notice(t('feat_no_python')); return; } + button.setButtonText(t('feat_building')); button.setDisabled(true); terminalEl.style.display = 'block'; terminalEl.setText(''); @@ -3252,10 +3306,10 @@ class PaperForgeSettingTab extends PluginSettingTab { this.plugin.saveSettings(); this._embedStatusText = null; this._execEmbedStatus(pyResult.path, vp, (text) => { this._embedStatusText = text; this.display(); }); - new Notice('Vector build complete.'); + new Notice(t('feat_build_complete')); } catch (e) { append('\n--- BUILD FAILED ---\n' + (e.stderr || e.message || e)); - new Notice('Build failed. 
See terminal output.'); + new Notice(t('feat_build_failed')); button.setButtonText(label); button.setDisabled(false); } From eed48178fef804eae14f5e156ba3370bac681de0 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Thu, 14 May 2026 01:21:52 +0800 Subject: [PATCH 105/132] spec: dashboard copy interaction + per-paper metadata enhancement --- README.en.md | 294 ---------------- README.md | 314 +++++++++--------- README.zh.md | 300 +++++++++++++++++ ...26-05-14-dashboard-copy-metadata-design.md | 176 ++++++++++ paperforge/plugin/styles.css | 12 +- 5 files changed, 636 insertions(+), 460 deletions(-) delete mode 100644 README.en.md create mode 100644 README.zh.md create mode 100644 docs/superpowers/specs/2026-05-14-dashboard-copy-metadata-design.md diff --git a/README.en.md b/README.en.md deleted file mode 100644 index 5c7e1ce..0000000 --- a/README.en.md +++ /dev/null @@ -1,294 +0,0 @@ -

- PaperForge banner -

- -# PaperForge - -[![Version](https://img.shields.io/github/v/release/LLLin000/PaperForge?style=for-the-badge&label=version)](https://github.com/LLLin000/PaperForge/releases) -[![Python](https://img.shields.io/pypi/pyversions/paperforge?style=for-the-badge&logo=python&logoColor=white&color=3775A9)](https://python.org) -[![License](https://img.shields.io/badge/license-CC%20BY--NC--SA%204.0-lightgreen?style=for-the-badge)](LICENSE) - -[简体中文](README.md) · **English** - -> **铸知识为器,启洞见之明。 — Forge Knowledge, Empower Insight.** - -PaperForge brings your Zotero library into Obsidian. Sync papers, run OCR, extract figures, and do AI-assisted deep reading — all inside a single vault. - ---- - -## 0. What PaperForge Is - -PaperForge is **not just an Obsidian plugin**. It has two parts: - -| Part | What | Does | Where | -|------|------|------|-------| -| Obsidian Plugin | `main.js` + `manifest.json` + `styles.css` | Dashboard, buttons, settings UI | `.obsidian/plugins/paperforge/` in your vault | -| Python Package | `paperforge` | Sync, OCR, Doctor, repair | Your system Python (`pip install`) | - -The plugin is the **interface**. The Python package is the **engine**. Every button you click in the plugin actually runs a Python command behind the scenes. - -**After installing the plugin, you MUST verify that the Python package is also installed and version-matched.** - ---- - -## 1. Install the Obsidian Plugin - -### Option A: BRAT (Recommended) - -1. Install **BRAT** from the Obsidian community plugin browser -2. Open BRAT settings → `Add Beta Plugin` -3. Enter: `https://github.com/LLLin000/PaperForge` -4. BRAT downloads the latest `main.js`, `manifest.json`, and `styles.css` and installs them -5. Settings → Community Plugins → enable PaperForge - -> BRAT auto-detects GitHub Release updates. No manual downloads needed. - -### Option B: Manual Download - -1. Go to [Releases](https://github.com/LLLin000/PaperForge/releases) -2. 
Download the three files: `main.js`, `manifest.json`, `styles.css` -3. Create `.obsidian/plugins/paperforge/` in your vault -4. Put the three files there -5. Restart Obsidian → Settings → Community Plugins → enable PaperForge - -> Manual install does not auto-update. You'll need to re-download for each new version. - ---- - -## 2. Install the Python Package - -After enabling the plugin, open the PaperForge settings tab. You'll see a **Runtime Status** section: - -``` -Plugin v1.5.0 → Python Package v1.5.0 ✓ Matched -``` - -- If it says "Not installed" → click **Open Wizard** to re-run the setup process -- If it says "Mismatch" → the Python package auto-updates when the plugin updates. If it didn't succeed, click **Update Runtime** to manually trigger - ---- - -## 3. How Python Interpreter Resolution Works - -PaperForge needs to find a working Python on your system. It searches in this order: - -| Priority | Source | Description | -|----------|--------|-------------| -| 1 | **Manual override** | Settings → `Custom Python Path`, enter the full path (e.g., `C:\Users\you\...\python.exe`). **This is the most reliable method.** | -| 2 | **venv auto-detect** | Scans `.paperforge-test-venv`, `.venv`, `venv` under your vault root | -| 3 | **System auto-detect** | Tries `py -3`, `python`, `python3` in order, verifies with `--version` | -| 4 | **Fallback** | Defaults to `python` if nothing else works | - -> If you have multiple Python installations (e.g., system 3.9 + self-installed 3.11), **strongly recommend setting a manual path** in settings to avoid hitting the wrong one. -> -> The **Validate** button in settings immediately tests the resolved interpreter and shows its version. - ---- - -## 4. Setup Wizard — What Each Step Means - -Open the plugin settings panel (`Settings` → `Community plugins` → `PaperForge`) and click the **Open Wizard** button. The wizard walks you through configuration. Here's what every step does. - -### 4.1 Vault Path - -Your Obsidian vault root. 
Auto-detected, usually no need to change. - -### 4.2 AI Agent Platform - -PaperForge's deep reading features run through an AI Agent. The core mechanism is **trigger phrases**, not registered plugins: you type `/pf-deep ` directly into the Agent chat, and the Agent recognizes the trigger and loads the `literature-qa` Skill automatically. - -The setup wizard deploys Skill files to the correct location: - -| Agent | Skill location | Trigger example | -|-------|---------------|-----------------| -| **OpenCode** | `.opencode/skills/` + `.opencode/command/` | `/pf-deep ` | -| **Claude Code** | `.claude/skills/` | `/pf-deep ` | -| **Cursor** | `.cursor/skills/` | `/pf-deep ` | -| **GitHub Copilot** | `.github/skills/` | `/pf-deep ` | -| **Windsurf** | `.windsurf/skills/` | `/pf-deep ` | -| **Codex** | `.codex/skills/` | `$pf-deep ` | -| **Cline** | `.clinerules/` | `/pf-deep ` | - -> **Key concept**: `/pf-deep` is NOT a plugin you install on the Agent platform — it's a Skill file deployed inside your Vault. Once the setup wizard copies the files into place, the Agent auto-discovers the triggers on startup. You type the trigger phrase just like any other chat input. - -### 4.3 Directory Names - -The wizard asks what to name several directories. These are for organizing files inside your vault. **Defaults work for most users.** - -| Parameter | Default | Purpose | -|-----------|---------|---------| -| `system_dir` | `System` | Root for PaperForge internal data. Contains `exports/` (Zotero JSON exports), `ocr/` (OCR results), `config/`. You rarely need to open this manually. | -| `resources_dir` | `Resources` | Resources root. Your formal literature notes live under this directory, inside `literature_dir`. | -| `literature_dir` | `Literature` | Formal literature notes directory. `paperforge sync` generates frontmatter `.md` notes here. | -| `base_dir` | `Bases` | Obsidian Base view definitions. Dashboard filters ("Pending OCR", "Ready to Read", etc.) are stored here. 
| - -### 4.4 PaddleOCR API Token - -OCR requires a PaddleOCR API key. Configured in `.env`: - -``` -PADDLEOCR_API_TOKEN=your-api-key -``` - -The wizard guides you through setting this. You can also edit `.env` later. The OCR URL usually stays at the default. - -### 4.5 Zotero Data Directory - -PaperForge creates a junction (Windows) or symlink (macOS/Linux) linking your Zotero data directory into the vault. This is how Obsidian wikilinks resolve to PDF files. - -The wizard auto-detects your Zotero installation. If detection fails, manually enter the path to your Zotero data directory — the folder that contains the `storage/` subdirectory (not the Zotero executable). - -### 4.6 What Happens During Setup - -After confirming your choices, the wizard automatically: -- Creates all needed directory structures -- Deploys Agent command files to the correct locations -- Installs Obsidian plugin files -- Creates the Zotero junction/symlink -- Writes `paperforge.json` and `.env` - -The process is **incremental** — if files already exist in the chosen directories, the wizard only adds what's missing and never deletes existing content. - ---- - -## 5. First-Time Setup Checklist - -1. **Version match**: Settings → Runtime Status → confirm plugin and Python package match -2. **Python path**: Settings → Validate button → confirm it's the Python you want -3. **Setup wizard**: Settings → PaperForge → Open Wizard -4. **PaddleOCR key**: Enter your API token in `.env` (wizard guides this) -5. **Export from Zotero**: Right-click your library → `Export...` → format `Better BibTeX JSON` → check `Keep updated` → save to `/PaperForge/exports/` -6. **Run Doctor**: Dashboard → `Run Doctor` → all checks should pass - ---- - -## 6. 
Daily Use - -### Dashboard (Three-Mode Views) - -`Ctrl+P` → `PaperForge: Open Dashboard` opens the control panel with three views: - -| View | Purpose | -|------|---------| -| **Global** | System homepage: run Sync, OCR, Doctor, and other mechanical operations | -| **Collection** | Batch workspace: browse paper queues by domain, batch tagging | -| **Per-paper** | Reading companion: `do_ocr` / `analyze` toggle checkboxes, discussion record cards | - -> PDF files in the Dashboard automatically switch to Per-paper mode — no manual switching needed. - -### AI Deep Reading & Q&A (Requires Agent) - -Launch your Agent app and type commands into its chat input. **The more specific you are about the paper (Zotero Key, title, DOI), the faster the Agent locates it.** - -| Route | Command | Does | Trigger examples | Prerequisites | -|-------|---------|------|-----------------|--------------| -| Deep Read | `/pf-deep ` | Keshav three-pass deep reading, writes to formal note | `deep read XX`, `walk me through`, `journal club` | OCR done, analyze: true | -| Q&A | `/pf-paper ` | Interactive paper Q&A, OCR not required | `take a look at XX`, `what does this paper say` | Formal note exists | -| Archive | `/pf-end` | Save current `/pf-paper` Q&A session | `save`, `end discussion` | During `/pf-paper` session | - -### `/pf-end` Details - -- `/pf-end` only applies to `/pf-paper` Q&A sessions. Deep reading (`/pf-deep`) writes directly to the formal note and does not need `/pf-end`. -- When executed, two files are created in the paper's workspace: - - `discussion.md` — human-readable Q&A discussion record - - `discussion.json` — structured Q&A data (with timestamps, source tags) -- Dashboard **Per-paper** view automatically displays these as discussion record cards - -> Command prefixes vary by platform (mostly `/`, Codex uses `$`). - ---- - -## 7. 
Full Workflow - -``` -Add paper to Zotero - ↓ Better BibTeX auto-exports JSON to exports/ -Dashboard → Sync Library - ↓ Generates formal note (in Literature/, with frontmatter metadata) -Set do_ocr: true in the note's frontmatter - ↓ -Dashboard → Run OCR - ↓ PaddleOCR extracts full text + figures → ocr/ directory -Set analyze: true in the note's frontmatter - ↓ -Open Agent → type /pf-deep - ↓ Agent performs three-pass deep reading -## 🔍 Deep Reading section appears in the note - ↓ (for additional Q&A) -Open Agent → type /pf-paper - ↓ Interactive Q&A -Type /pf-end to save the discussion record - ↓ -Dashboard Per-paper view shows discussion cards -``` - ---- - -## 8. Troubleshooting - -### Plugin fails to load - -- Confirm `.obsidian/plugins/paperforge/` has `main.js`, `manifest.json`, `styles.css` -- If upgrading via BRAT from an old version: delete the entire `paperforge` plugin folder and let BRAT re-download -- Open Developer Console (`Ctrl+Shift+I`) and check the red errors - -### "Sync Runtime" doesn't update the version - -- The plugin may be calling a different Python than your terminal. Check Settings → Python path -- Try with `--no-cache-dir` to bypass pip cache -- Confirm `https://github.com/LLLin000/PaperForge` is reachable - -### OCR stays pending - -- Confirm `.env` has `PADDLEOCR_API_TOKEN` -- Run `paperforge ocr --diagnose` to check API connectivity -- PDF paths may be broken: run `paperforge repair --fix-paths` - -### No notes generated after sync - -- Is Better BibTeX auto-export configured in Zotero? Are JSON files in `exports/`? -- Run `paperforge doctor` to find which step failed - -### /pf-deep command does nothing - -- Make sure you're running it in your Agent app, not a terminal -- Confirm OCR is done (`ocr_status: done`) -- Confirm `analyze` is set to `true` - ---- - -## 9. Updating - -BRAT auto-detects plugin updates. For the Python package: - -```bash -paperforge update -# or -pip install --upgrade paperforge -``` - ---- - -## 10. 
Architecture - -``` -paperforge/ -├── core/ Contract layer — PFResult/ErrorCode/state machine -├── adapters/ Adapter layer — BBT parsing, paths, frontmatter I/O -├── services/ Service layer — SyncService orchestration -├── worker/ Worker layer — OCR, status, repair -├── commands/ CLI dispatch -├── setup/ Setup wizard (directories, agent deployment, Zotero linking) -├── plugin/ Obsidian plugin (Dashboard, settings panel) -└── schema/ Field registry -``` - ---- - -## License - -[CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/). Non-commercial use only. - -## Acknowledgments - -Built on [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR), [Obsidian](https://obsidian.md), [Better BibTeX for Zotero](https://retorque.re/zotero-better-bibtex/), and other great open-source projects. diff --git a/README.md b/README.md index 493b86b..9270941 100644 --- a/README.md +++ b/README.md @@ -8,98 +8,99 @@ [![Python](https://img.shields.io/pypi/pyversions/paperforge?style=for-the-badge&logo=python&logoColor=white&color=3775A9)](https://python.org) [![License](https://img.shields.io/badge/license-CC%20BY--NC--SA%204.0-lightgreen?style=for-the-badge)](LICENSE) -**简体中文** · [English](README.en.md) +[简体中文](README.zh.md) · **English** > **铸知识为器,启洞见之明。 — Forge Knowledge, Empower Insight.** -PaperForge 让你在 Obsidian 里管理 Zotero 文献。同步、OCR 全文提取、图表解析、AI 精读,全在一个 Vault 里完成。 +PaperForge brings your Zotero library into Obsidian. Sync papers, run OCR, extract figures, and do AI-assisted deep reading — all inside a single vault. --- -## 0. 先理解它是什么 +## 0. What PaperForge Is -PaperForge **不是一个纯 Obsidian 插件**。它有两部分: +PaperForge is **not just an Obsidian plugin**. 
It has two parts: -| 部分 | 是什么 | 干什么 | 装在哪 | -|------|--------|--------|--------| -| Obsidian 插件 | `main.js` + `manifest.json` + `styles.css` | Dashboard、按钮、设置界面 | Vault 的 `.obsidian/plugins/paperforge/` | -| Python 包 | `paperforge` | 同步、OCR、Doctor、修复 | 系统 Python 环境 (`pip install`) | +| Part | What | Does | Where | +|------|------|------|-------| +| Obsidian Plugin | `main.js` + `manifest.json` + `styles.css` | Dashboard, buttons, settings UI | `.obsidian/plugins/paperforge/` in your vault | +| Python Package | `paperforge` | Sync, OCR, Doctor, repair | Your system Python (`pip install`) | -插件是**壳**,Python 包是**引擎**。插件里的按钮点了之后,实际是调用 Python 命令行去干活。 +The plugin is the **interface**. The Python package is the **engine**. Every button you click in the plugin actually runs a Python command behind the scenes. -**所以装完插件之后,必须在设置里确认 Python 包也已安装,并且版本一致。** +**After installing the plugin, you MUST verify that the Python package is also installed and version-matched.** --- -## 1. 安装 Obsidian 插件 +## 1. Install the Obsidian Plugin -### 方式一:BRAT(推荐) +### Option A: BRAT (Recommended) -1. 在 Obsidian 社区插件市场搜索安装 **BRAT**(Beta Reviewer's Auto-update Tester) -2. 打开 BRAT 设置 → `Add Beta Plugin` -3. 填入仓库地址:`https://github.com/LLLin000/PaperForge` -4. BRAT 会自动下载最新 Release 的 `main.js`、`manifest.json`、`styles.css` 并安装 -5. 在 Obsidian 设置 → 社区插件 → 启用 PaperForge +1. Install **BRAT** from the Obsidian community plugin browser +2. Open BRAT settings → `Add Beta Plugin` +3. Enter: `https://github.com/LLLin000/PaperForge` +4. BRAT downloads the latest `main.js`, `manifest.json`, and `styles.css` and installs them +5. Settings → Community Plugins → enable PaperForge -> BRAT 能自动检测 GitHub Release 更新,不需要手动下载。 +> BRAT auto-detects GitHub Release updates. No manual downloads needed. -### 方式二:手动下载 +### Option B: Manual Download -1. 打开 [Releases](https://github.com/LLLin000/PaperForge/releases) 页面 -2. 下载最新版本的三个文件:`main.js`、`manifest.json`、`styles.css` -3. 在 Vault 里创建文件夹 `.obsidian/plugins/paperforge/` -4. 
把三个文件放进去 -5. 重启 Obsidian → 设置 → 社区插件 → 启用 PaperForge +1. Go to [Releases](https://github.com/LLLin000/PaperForge/releases) +2. Download the three files: `main.js`, `manifest.json`, `styles.css` +3. Create `.obsidian/plugins/paperforge/` in your vault +4. Put the three files there +5. Restart Obsidian → Settings → Community Plugins → enable PaperForge -> 手动安装不会自动更新,每次新版本需要重新下载替换。 +> Manual install does not auto-update. You'll need to re-download for each new version. --- -## 2. 安装 Python 包 +## 2. Install the Python Package -插件装好后,打开 PaperForge 设置页面。你会看到 **运行时状态** 区域: +After enabling the plugin, open the PaperForge settings tab. You'll see a **Runtime Status** section: ``` -插件 v1.5.0 → Python 包 v1.5.0 ✓ 匹配 +Plugin v1.5.0 → Python Package v1.5.0 ✓ Matched ``` -- 如果显示"未安装" → 在设置里确认 Python 解释器路径,然后点击 **验证** 重新检测 -- 如果显示"版本不匹配" → 插件更新时 Python 包会自动同步升级,如果没成功,点 **更新运行时** 手动触发 +- If it says "Not installed" → click **Open Wizard** to re-run the setup process +- If it says "Mismatch" → the Python package auto-updates when the plugin updates. If it didn't succeed, click **Update Runtime** to manually trigger --- -## 3. Python 解释器识别逻辑 +## 3. How Python Interpreter Resolution Works -PaperForge 需要找到你系统里的 Python。它按以下顺序查找,找到就用: +PaperForge needs to find a working Python on your system. It searches in this order: -| 优先级 | 来源 | 说明 | -|--------|------|------| -| 1 | **你手动指定** | 设置 → `自定义 Python 路径`,填入完整路径(如 `C:\Users\你的用户名\AppData\Local\Programs\Python\Python311\python.exe`)。**这是最可靠的方式。** | -| 2 | **venv 自动检测** | 自动扫描 Vault 根目录下的 `.paperforge-test-venv`、`.venv`、`venv` 里的 Python | -| 3 | **系统自动检测** | 依次尝试 `py -3`、`python`、`python3`,用 `--version` 验证,挑第一个能用的 | -| 4 | **兜底** | 以上都找不到,回退到 `python` | +| Priority | Source | Description | +|----------|--------|-------------| +| 1 | **Manual override** | Settings → `Custom Python Path`, enter the full path (e.g., `C:\Users\you\...\python.exe`). 
**This is the most reliable method.** | +| 2 | **venv auto-detect** | Scans `.paperforge-test-venv`, `.venv`, `venv` under your vault root | +| 3 | **System auto-detect** | Tries `py -3`, `python`, `python3` in order, verifies with `--version` | +| 4 | **Fallback** | Defaults to `python` if nothing else works | -> 如果你系统里有多个 Python(比如系统自带的 3.9 + 自己装的 3.11),**强烈建议在设置里手动指定路径**,避免跑错环境。 +> If you have multiple Python installations (e.g., system 3.9 + self-installed 3.11), **strongly recommend setting a manual path** in settings to avoid hitting the wrong one. > -> 设置里的 **验证** 按钮会立即测试当前选中的解释器,显示它能不能用、是什么版本。 +> The **Validate** button in settings immediately tests the resolved interpreter and shows its version. --- -## 4. 配置说明 +## 4. Setup Wizard — What Each Step Means -以下参数在**插件设置页面**中配置(设置 → 第三方插件 → PaperForge → 打开安装向导)。首次安装时基础配置已是正确默认值,一般不需要手动改。以下解释供你了解每个参数的作用: +Open the plugin settings panel (`Settings` → `Community plugins` → `PaperForge`) and click the **Open Wizard** button. The wizard walks you through configuration. Here's what every step does. -### 4.1 Vault 路径 -你当前打开的 Obsidian Vault 根目录。安装向导自动检测,一般不用改。 +### 4.1 Vault Path -### 4.2 AI Agent 平台 +Your Obsidian vault root. Auto-detected, usually no need to change. -PaperForge 的精读功能通过 AI Agent 执行。核心机制是 **触发词** 而非注册插件:你直接在 Agent 对话里输入 `/pf-deep `,Agent 识别到触发词后自动加载 `literature-qa` Skill 来定位论文并执行精读。 +### 4.2 AI Agent Platform -安装向导会把 Skill 文件部署到对应位置: +PaperForge's deep reading features run through an AI Agent. The core mechanism is **trigger phrases**, not registered plugins: you type `/pf-deep ` directly into the Agent chat, and the Agent recognizes the trigger and loads the `literature-qa` Skill automatically. 
-| Agent | Skill 安装位置 | 触发词示例 | -|-------|---------------|-----------| +The setup wizard deploys Skill files to the correct location: + +| Agent | Skill location | Trigger example | +|-------|---------------|-----------------| | **OpenCode** | `.opencode/skills/` + `.opencode/command/` | `/pf-deep ` | | **Claude Code** | `.claude/skills/` | `/pf-deep ` | | **Cursor** | `.cursor/skills/` | `/pf-deep ` | @@ -108,193 +109,186 @@ PaperForge 的精读功能通过 AI Agent 执行。核心机制是 **触发词** | **Codex** | `.codex/skills/` | `$pf-deep ` | | **Cline** | `.clinerules/` | `/pf-deep ` | -> **关键理解**:`/pf-deep` 不是 Agent 平台的插件,而是部署在 Vault 里的 Skill 文件。安装向导把文件拷过去之后,Agent 启动时自动发现并识别这些触发词。你不需要在 Agent 平台里做任何"安装插件"的操作。 +> **Key concept**: `/pf-deep` is NOT a plugin you install on the Agent platform — it's a Skill file deployed inside your Vault. Once the setup wizard copies the files into place, the Agent auto-discovers the triggers on startup. You type the trigger phrase just like any other chat input. -### 4.3 目录命名 +### 4.3 Directory Names -安装向导会问你几个目录叫什么名字。这些都是给你自己看的,用来组织 Vault 里的文件结构。**大部分情况用默认值就行。** +The wizard asks what to name several directories. These are for organizing files inside your vault. **Defaults work for most users.** -| 参数 | 默认值 | 作用 | -|------|--------|------| -| `system_dir` | `System` | PaperForge 内部数据的总目录。下面会有 `exports/`(Zotero 导出的 JSON)、`ocr/`(OCR 结果)、`config/` 等子目录。你一般不需要手动进去看。 | -| `resources_dir` | `Resources` | 资源根目录。你的正式文献笔记就放在这里下面的 `literature_dir` 里。 | -| `literature_dir` | `Literature` | 正式文献笔记的目录。`paperforge sync` 生成的带 frontmatter 的 `.md` 笔记在这里。你日常阅读、编辑笔记都在这个目录。 | -| `base_dir` | `Bases` | Obsidian Base 视图文件目录。Dashboard 里的筛选视图("待 OCR"、"待精读"等)存在这里。 | +| Parameter | Default | Purpose | +|-----------|---------|---------| +| `system_dir` | `System` | Root for PaperForge internal data. Contains `exports/` (Zotero JSON exports), `ocr/` (OCR results), `config/`. You rarely need to open this manually. | +| `resources_dir` | `Resources` | Resources root. 
Your formal literature notes live under this directory, inside `literature_dir`. | +| `literature_dir` | `Literature` | Formal literature notes directory. `paperforge sync` generates frontmatter `.md` notes here. | +| `base_dir` | `Bases` | Obsidian Base view definitions. Dashboard filters ("Pending OCR", "Ready to Read", etc.) are stored here. | ### 4.4 PaddleOCR API Token -OCR 功能需要 PaddleOCR 的 API。在 `.env` 文件里配置: +OCR requires a PaddleOCR API key. Configured in `.env`: ``` -PADDLEOCR_API_TOKEN=你的API密钥 +PADDLEOCR_API_TOKEN=your-api-key ``` -安装向导会引导你填写,也可以之后手动在 Vault 根目录的 `.env` 文件里加。OCR URL 一般不需要改。 +The wizard guides you through setting this. You can also edit `.env` later. The OCR URL usually stays at the default. -### 4.5 Zotero 数据目录 +### 4.5 Zotero Data Directory -PaperForge 会创建一个 junction(Windows)或 symlink(macOS/Linux),把 Zotero 的数据目录连接到 Vault 里。这样 Obsidian 的 wikilink 才能找到 PDF 文件。 +PaperForge creates a junction (Windows) or symlink (macOS/Linux) linking your Zotero data directory into the vault. This is how Obsidian wikilinks resolve to PDF files. -安装向导会自动检测 Zotero 的安装位置。如果检测失败,你需要手动指定 Zotero 数据目录的路径——也就是包含 `storage` 子目录的那个文件夹(不是 Zotero 程序本身)。 +The wizard auto-detects your Zotero installation. If detection fails, manually enter the path to your Zotero data directory — the folder that contains the `storage/` subdirectory (not the Zotero executable). 
-### 4.6 安装过程 +### 4.6 What Happens During Setup -确认配置后,安装向导会自动: -- 创建所有需要的目录结构 -- 把 Agent 命令文件部署到对应位置 -- 把 Obsidian 插件文件安装到位 -- 创建 Zotero junction/symlink -- 写入 `paperforge.json` 和 `.env` +After confirming your choices, the wizard automatically: +- Creates all needed directory structures +- Deploys Agent command files to the correct locations +- Installs Obsidian plugin files +- Creates the Zotero junction/symlink +- Writes `paperforge.json` and `.env` -整个过程是**增量的** — 如果你选的目录里已经有文件,安装向导只会补充缺失的,不会删除已有内容。 +The process is **incremental** — if files already exist in the chosen directories, the wizard only adds what's missing and never deletes existing content. --- -## 5. 首次使用 +## 5. First-Time Setup Checklist -1. **确认版本一致**:设置 → 运行时状态 → 确保插件和 Python 包版本一致 -2. **确认 Python 正确**:设置 → 验证按钮,确认连接的是你想要的 Python -3. **配置 PaddleOCR**:在 Vault 根目录 `.env` 里填入 API Token -4. **在 Zotero 里导出文献**:右键要同步的文献库 → `导出...` → 格式选 `Better BibTeX JSON` → 勾选 `Keep updated` → 保存到 `/PaperForge/exports/` -5. **运行 Doctor**:Dashboard → `Run Doctor`,确认所有检查通过 +1. **Version match**: Settings → Runtime Status → confirm plugin and Python package match +2. **Python path**: Settings → Validate button → confirm it's the Python you want +3. **Setup wizard**: Settings → PaperForge → Open Wizard +4. **PaddleOCR key**: Enter your API token in `.env` (wizard guides this) +5. **Export from Zotero**: Right-click your library → `Export...` → format `Better BibTeX JSON` → check `Keep updated` → save to `/PaperForge/exports/` +6. **Run Doctor**: Dashboard → `Run Doctor` → all checks should pass --- -## 6. 日常使用 - -### Dashboard(三模式视图) - -`Ctrl+P` → `PaperForge: Open Dashboard` 打开控制面板,包含三种视图: - -| 视图 | 用途 | -|------|------| -| **Global** | 系统首页:运行 Sync、OCR、Doctor 等机械操作 | -| **Collection** | 批量工作台:按领域查看文献队列、批量标记 | -| **Per-paper** | 单篇阅读伴侣:`do_ocr` / `analyze` 切换复选框,讨论记录卡片 | +## 6. 
Daily Use -> Dashboard 里的 PDF 文件会自动进入 Per-paper 模式,无需手动切换。 +### Dashboard (Three-Mode Views) -### AI 精读与问答(需 Agent) +`Ctrl+P` → `PaperForge: Open Dashboard` opens the control panel with three views: -打开 Agent 应用,直接输入触发词即可。Agent 识别到触发词后会自动加载 `literature-qa` Skill,按标准化流程定位论文并执行操作。 +| View | Purpose | +|------|---------| +| **Global** | System homepage: run Sync, OCR, Doctor, and other mechanical operations | +| **Collection** | Batch workspace: browse paper queues by domain, batch tagging | +| **Per-paper** | Reading companion: `do_ocr` / `analyze` toggle checkboxes, discussion record cards | -**你对文献描述得越具体(Zotero Key、标题、DOI),Agent 定位越快。** +> PDF files in the Dashboard automatically switch to Per-paper mode — no manual switching needed. -| 路由 | 触发词 | 做什么 | 前置条件 | -|------|--------|--------|---------| -| 精读 | `/pf-deep ` 或 `精读 ` | Keshav 三阶段组会式精读,结果写入 formal note | OCR 完成、analyze 为 true | -| 问答 | `/pf-paper ` 或 `文献问答 ` | 交互式论文 Q&A,不强制 OCR | 已有正式笔记 | -| 存档 | `/pf-end` 或 `结束讨论` | 保存本次 `/pf-paper` 问答记录 | `/pf-paper` 会话中 | +### AI Deep Reading & Q&A (Requires Agent) -> **两种触发方式等效**:你可以用 Agent 原生命令 `/pf-deep ABC12345`,也可以用自然语言 `精读 ABC12345`。Agent 识别到触发词后会自动加载 `literature-qa` Skill。 +Launch your Agent app and type commands into its chat input. 
**The more specific you are about the paper (Zotero Key, title, DOI), the faster the Agent locates it.** -> `/pf-deep` 和 `/pf-paper` **不是终端命令**,也不是 Agent 平台的注册插件。它们是部署在 Vault 里的 Skill 文件的触发词。安装向导把 Skill 文件放到正确位置后,Agent 启动时自动发现。使用方式就是打开 Agent 对话,输入触发词 —— 和你在终端敲 `ls` 一样直接。 +| Route | Command | Does | Trigger examples | Prerequisites | +|-------|---------|------|-----------------|--------------| +| Deep Read | `/pf-deep ` | Keshav three-pass deep reading, writes to formal note | `deep read XX`, `walk me through`, `journal club` | OCR done, analyze: true | +| Q&A | `/pf-paper ` | Interactive paper Q&A, OCR not required | `take a look at XX`, `what does this paper say` | Formal note exists | +| Archive | `/pf-end` | Save current `/pf-paper` Q&A session | `save`, `end discussion` | During `/pf-paper` session | -### `/pf-end` 详解 +### `/pf-end` Details -- `/pf-end` 仅对 `/pf-paper` 问答会话生效。精读(`/pf-deep`)的内容直接写入 formal note,不需要 `/pf-end`。 -- 执行后会在论文 workspace 下生成两个文件: - - `discussion.md` — 人类可读的 Q&A 讨论记录 - - `discussion.json` — 结构化 Q&A 数据(含时间戳、来源标记) -- Dashboard 的 **Per-paper** 视图会自动以讨论记录卡片形式展示这些记录 +- `/pf-end` only applies to `/pf-paper` Q&A sessions. Deep reading (`/pf-deep`) writes directly to the formal note and does not need `/pf-end`. +- When executed, two files are created in the paper's workspace: + - `discussion.md` — human-readable Q&A discussion record + - `discussion.json` — structured Q&A data (with timestamps, source tags) +- Dashboard **Per-paper** view automatically displays these as discussion record cards -> 不同 Agent 的命令前缀可能不同(大部分是 `/`,Codex 是 `$`)。 +> Command prefixes vary by platform (mostly `/`, Codex uses `$`). --- -## 7. 完整工作流 +## 7. 
Full Workflow ``` -Zotero 添加论文 - ↓ Better BibTeX 自动导出 JSON 到 exports/ 目录 +Add paper to Zotero + ↓ Better BibTeX auto-exports JSON to exports/ Dashboard → Sync Library - ↓ 生成正式笔记(Literature/ 目录下,带 frontmatter 元数据) -在笔记 frontmatter 里把 do_ocr 设为 true + ↓ Generates formal note (in Literature/, with frontmatter metadata) +Set do_ocr: true in the note's frontmatter ↓ Dashboard → Run OCR - ↓ PaddleOCR 提取全文 + 图表 → ocr/ 目录 -在笔记 frontmatter 里把 analyze 设为 true + ↓ PaddleOCR extracts full text + figures → ocr/ directory +Set analyze: true in the note's frontmatter ↓ -打开 Agent → 输入 /pf-deep - ↓ Agent 识别触发词 → 加载 literature-qa Skill → 三阶段精读 -笔记里出现 ## 🔍 精读 区域 - ↓(如需额外问答) -打开 Agent → 输入 /pf-paper - ↓ 交互式 Q&A -输入 /pf-end 保存讨论记录 +Open Agent → type /pf-deep + ↓ Agent performs three-pass deep reading +## 🔍 Deep Reading section appears in the note + ↓ (for additional Q&A) +Open Agent → type /pf-paper + ↓ Interactive Q&A +Type /pf-end to save the discussion record ↓ -Dashboard Per-paper 视图展示讨论卡片 +Dashboard Per-paper view shows discussion cards ``` --- -## 8. 常见问题 +## 8. Troubleshooting -### 插件加载失败(Cannot find module) +### Plugin fails to load -- 确认 `.obsidian/plugins/paperforge/` 下有 `main.js`、`manifest.json`、`styles.css` 三个文件 -- 如果 BRAT 从旧版升级后出问题:删除整个 `paperforge` 插件文件夹,让 BRAT 重新下载 -- 打开 Developer Console(`Ctrl+Shift+I`)看红色报错 +- Confirm `.obsidian/plugins/paperforge/` has `main.js`, `manifest.json`, `styles.css` +- If upgrading via BRAT from an old version: delete the entire `paperforge` plugin folder and let BRAT re-download +- Open Developer Console (`Ctrl+Shift+I`) and check the red errors -### "同步运行时" 点了还是旧版本 +### "Sync Runtime" doesn't update the version -- 插件调用的 Python 可能和你终端用的是不同环境。检查设置 → Python 解释器路径 -- pip 缓存问题,用 `--no-cache-dir` 重装 -- 确认 `https://github.com/LLLin000/PaperForge` 网络能通 +- The plugin may be calling a different Python than your terminal. 
Check Settings → Python path +- Try with `--no-cache-dir` to bypass pip cache +- Confirm `https://github.com/LLLin000/PaperForge` is reachable -### OCR 一直 pending +### OCR stays pending -- 确认 `.env` 里有 `PADDLEOCR_API_TOKEN` -- 终端运行 `paperforge ocr --diagnose` 检查 API 连通性 -- PDF 路径可能不对:运行 `paperforge repair --fix-paths` +- Confirm `.env` has `PADDLEOCR_API_TOKEN` +- Run `paperforge ocr --diagnose` to check API connectivity +- PDF paths may be broken: run `paperforge repair --fix-paths` -### 同步后没有生成笔记 +### No notes generated after sync -- Zotero Better BibTeX 是否配置了自动导出?JSON 是否在 `exports/` 目录? -- 运行 `paperforge doctor` 看具体哪一步失败 -- 运行 `paperforge status` 查看系统状态总览 +- Is Better BibTeX auto-export configured in Zotero? Are JSON files in `exports/`? +- Run `paperforge doctor` to find which step failed -### /pf-deep 触发词没反应 +### /pf-deep command does nothing -- 确认你在 **Agent 应用** 里输入,不是在终端 -- 确认安装向导已运行,Skill 文件已部署到正确的 Vault 目录 -- 确认 OCR 已完成(ocr_status: done) -- 确认 analyze 已设为 true +- Make sure you're running it in your Agent app, not a terminal +- Confirm OCR is done (`ocr_status: done`) +- Confirm `analyze` is set to `true` --- -## 9. 更新 +## 9. Updating -BRAT 会自动检测插件更新。Python 包更新: +BRAT auto-detects plugin updates. For the Python package: ```bash paperforge update -# 或 +# or pip install --upgrade paperforge ``` --- -## 10. 架构 +## 10. 
Architecture ``` paperforge/ -├── core/ 契约层 — PFResult/ErrorCode/状态机 -├── adapters/ 适配器层 — BBT 解析、路径、frontmatter -├── services/ 服务层 — SyncService 编排 -├── worker/ 工人层 — OCR、状态、修复 -├── commands/ CLI 分发 -├── setup/ 安装向导(目录创建、Agent 部署、Zotero 链接) -├── plugin/ Obsidian 插件(Dashboard、设置面板) -└── schema/ 字段注册表 +├── core/ Contract layer — PFResult/ErrorCode/state machine +├── adapters/ Adapter layer — BBT parsing, paths, frontmatter I/O +├── services/ Service layer — SyncService orchestration +├── worker/ Worker layer — OCR, status, repair +├── commands/ CLI dispatch +├── setup/ Setup wizard (directories, agent deployment, Zotero linking) +├── plugin/ Obsidian plugin (Dashboard, settings panel) +└── schema/ Field registry ``` --- -## 协议 +## License -[CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/)。仅限非商业使用。 +[CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/). Non-commercial use only. -## 致谢 +## Acknowledgments -基于 [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)、[Obsidian](https://obsidian.md)、[Better BibTeX for Zotero](https://retorque.re/zotero-better-bibtex/) 等开源项目构建。 +Built on [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR), [Obsidian](https://obsidian.md), [Better BibTeX for Zotero](https://retorque.re/zotero-better-bibtex/), and other great open-source projects. diff --git a/README.zh.md b/README.zh.md new file mode 100644 index 0000000..75f9ff3 --- /dev/null +++ b/README.zh.md @@ -0,0 +1,300 @@ +

+ PaperForge banner +

+ +# PaperForge + +[![Version](https://img.shields.io/github/v/release/LLLin000/PaperForge?style=for-the-badge&label=version)](https://github.com/LLLin000/PaperForge/releases) +[![Python](https://img.shields.io/pypi/pyversions/paperforge?style=for-the-badge&logo=python&logoColor=white&color=3775A9)](https://python.org) +[![License](https://img.shields.io/badge/license-CC%20BY--NC--SA%204.0-lightgreen?style=for-the-badge)](LICENSE) + +**简体中文** · [English](README.md) + +> **铸知识为器,启洞见之明。 — Forge Knowledge, Empower Insight.** + +PaperForge 让你在 Obsidian 里管理 Zotero 文献。同步、OCR 全文提取、图表解析、AI 精读,全在一个 Vault 里完成。 + +--- + +## 0. 先理解它是什么 + +PaperForge **不是一个纯 Obsidian 插件**。它有两部分: + +| 部分 | 是什么 | 干什么 | 装在哪 | +|------|--------|--------|--------| +| Obsidian 插件 | `main.js` + `manifest.json` + `styles.css` | Dashboard、按钮、设置界面 | Vault 的 `.obsidian/plugins/paperforge/` | +| Python 包 | `paperforge` | 同步、OCR、Doctor、修复 | 系统 Python 环境 (`pip install`) | + +插件是**壳**,Python 包是**引擎**。插件里的按钮点了之后,实际是调用 Python 命令行去干活。 + +**所以装完插件之后,必须在设置里确认 Python 包也已安装,并且版本一致。** + +--- + +## 1. 安装 Obsidian 插件 + +### 方式一:BRAT(推荐) + +1. 在 Obsidian 社区插件市场搜索安装 **BRAT**(Beta Reviewer's Auto-update Tester) +2. 打开 BRAT 设置 → `Add Beta Plugin` +3. 填入仓库地址:`https://github.com/LLLin000/PaperForge` +4. BRAT 会自动下载最新 Release 的 `main.js`、`manifest.json`、`styles.css` 并安装 +5. 在 Obsidian 设置 → 社区插件 → 启用 PaperForge + +> BRAT 能自动检测 GitHub Release 更新,不需要手动下载。 + +### 方式二:手动下载 + +1. 打开 [Releases](https://github.com/LLLin000/PaperForge/releases) 页面 +2. 下载最新版本的三个文件:`main.js`、`manifest.json`、`styles.css` +3. 在 Vault 里创建文件夹 `.obsidian/plugins/paperforge/` +4. 把三个文件放进去 +5. 重启 Obsidian → 设置 → 社区插件 → 启用 PaperForge + +> 手动安装不会自动更新,每次新版本需要重新下载替换。 + +--- + +## 2. 安装 Python 包 + +插件装好后,打开 PaperForge 设置页面。你会看到 **运行时状态** 区域: + +``` +插件 v1.5.0 → Python 包 v1.5.0 ✓ 匹配 +``` + +- 如果显示"未安装" → 在设置里确认 Python 解释器路径,然后点击 **验证** 重新检测 +- 如果显示"版本不匹配" → 插件更新时 Python 包会自动同步升级,如果没成功,点 **更新运行时** 手动触发 + +--- + +## 3. 
Python 解释器识别逻辑 + +PaperForge 需要找到你系统里的 Python。它按以下顺序查找,找到就用: + +| 优先级 | 来源 | 说明 | +|--------|------|------| +| 1 | **你手动指定** | 设置 → `自定义 Python 路径`,填入完整路径(如 `C:\Users\你的用户名\AppData\Local\Programs\Python\Python311\python.exe`)。**这是最可靠的方式。** | +| 2 | **venv 自动检测** | 自动扫描 Vault 根目录下的 `.paperforge-test-venv`、`.venv`、`venv` 里的 Python | +| 3 | **系统自动检测** | 依次尝试 `py -3`、`python`、`python3`,用 `--version` 验证,挑第一个能用的 | +| 4 | **兜底** | 以上都找不到,回退到 `python` | + +> 如果你系统里有多个 Python(比如系统自带的 3.9 + 自己装的 3.11),**强烈建议在设置里手动指定路径**,避免跑错环境。 +> +> 设置里的 **验证** 按钮会立即测试当前选中的解释器,显示它能不能用、是什么版本。 + +--- + +## 4. 配置说明 + +以下参数在**插件设置页面**中配置(设置 → 第三方插件 → PaperForge → 打开安装向导)。首次安装时基础配置已是正确默认值,一般不需要手动改。以下解释供你了解每个参数的作用: + +### 4.1 Vault 路径 +你当前打开的 Obsidian Vault 根目录。安装向导自动检测,一般不用改。 + +### 4.2 AI Agent 平台 + +PaperForge 的精读功能通过 AI Agent 执行。核心机制是 **触发词** 而非注册插件:你直接在 Agent 对话里输入 `/pf-deep `,Agent 识别到触发词后自动加载 `literature-qa` Skill 来定位论文并执行精读。 + +安装向导会把 Skill 文件部署到对应位置: + +| Agent | Skill 安装位置 | 触发词示例 | +|-------|---------------|-----------| +| **OpenCode** | `.opencode/skills/` + `.opencode/command/` | `/pf-deep ` | +| **Claude Code** | `.claude/skills/` | `/pf-deep ` | +| **Cursor** | `.cursor/skills/` | `/pf-deep ` | +| **GitHub Copilot** | `.github/skills/` | `/pf-deep ` | +| **Windsurf** | `.windsurf/skills/` | `/pf-deep ` | +| **Codex** | `.codex/skills/` | `$pf-deep ` | +| **Cline** | `.clinerules/` | `/pf-deep ` | + +> **关键理解**:`/pf-deep` 不是 Agent 平台的插件,而是部署在 Vault 里的 Skill 文件。安装向导把文件拷过去之后,Agent 启动时自动发现并识别这些触发词。你不需要在 Agent 平台里做任何"安装插件"的操作。 + +### 4.3 目录命名 + +安装向导会问你几个目录叫什么名字。这些都是给你自己看的,用来组织 Vault 里的文件结构。**大部分情况用默认值就行。** + +| 参数 | 默认值 | 作用 | +|------|--------|------| +| `system_dir` | `System` | PaperForge 内部数据的总目录。下面会有 `exports/`(Zotero 导出的 JSON)、`ocr/`(OCR 结果)、`config/` 等子目录。你一般不需要手动进去看。 | +| `resources_dir` | `Resources` | 资源根目录。你的正式文献笔记就放在这里下面的 `literature_dir` 里。 | +| `literature_dir` | `Literature` | 正式文献笔记的目录。`paperforge sync` 生成的带 frontmatter 的 `.md` 笔记在这里。你日常阅读、编辑笔记都在这个目录。 | +| `base_dir` | 
`Bases` | Obsidian Base 视图文件目录。Dashboard 里的筛选视图("待 OCR"、"待精读"等)存在这里。 | + +### 4.4 PaddleOCR API Token + +OCR 功能需要 PaddleOCR 的 API。在 `.env` 文件里配置: + +``` +PADDLEOCR_API_TOKEN=你的API密钥 +``` + +安装向导会引导你填写,也可以之后手动在 Vault 根目录的 `.env` 文件里加。OCR URL 一般不需要改。 + +### 4.5 Zotero 数据目录 + +PaperForge 会创建一个 junction(Windows)或 symlink(macOS/Linux),把 Zotero 的数据目录连接到 Vault 里。这样 Obsidian 的 wikilink 才能找到 PDF 文件。 + +安装向导会自动检测 Zotero 的安装位置。如果检测失败,你需要手动指定 Zotero 数据目录的路径——也就是包含 `storage` 子目录的那个文件夹(不是 Zotero 程序本身)。 + +### 4.6 安装过程 + +确认配置后,安装向导会自动: +- 创建所有需要的目录结构 +- 把 Agent 命令文件部署到对应位置 +- 把 Obsidian 插件文件安装到位 +- 创建 Zotero junction/symlink +- 写入 `paperforge.json` 和 `.env` + +整个过程是**增量的** — 如果你选的目录里已经有文件,安装向导只会补充缺失的,不会删除已有内容。 + +--- + +## 5. 首次使用 + +1. **确认版本一致**:设置 → 运行时状态 → 确保插件和 Python 包版本一致 +2. **确认 Python 正确**:设置 → 验证按钮,确认连接的是你想要的 Python +3. **配置 PaddleOCR**:在 Vault 根目录 `.env` 里填入 API Token +4. **在 Zotero 里导出文献**:右键要同步的文献库 → `导出...` → 格式选 `Better BibTeX JSON` → 勾选 `Keep updated` → 保存到 `/PaperForge/exports/` +5. **运行 Doctor**:Dashboard → `Run Doctor`,确认所有检查通过 + +--- + +## 6. 
日常使用 + +### Dashboard(三模式视图) + +`Ctrl+P` → `PaperForge: Open Dashboard` 打开控制面板,包含三种视图: + +| 视图 | 用途 | +|------|------| +| **Global** | 系统首页:运行 Sync、OCR、Doctor 等机械操作 | +| **Collection** | 批量工作台:按领域查看文献队列、批量标记 | +| **Per-paper** | 单篇阅读伴侣:`do_ocr` / `analyze` 切换复选框,讨论记录卡片 | + +> Dashboard 里的 PDF 文件会自动进入 Per-paper 模式,无需手动切换。 + +### AI 精读与问答(需 Agent) + +打开 Agent 应用,直接输入触发词即可。Agent 识别到触发词后会自动加载 `literature-qa` Skill,按标准化流程定位论文并执行操作。 + +**你对文献描述得越具体(Zotero Key、标题、DOI),Agent 定位越快。** + +| 路由 | 触发词 | 做什么 | 前置条件 | +|------|--------|--------|---------| +| 精读 | `/pf-deep ` 或 `精读 ` | Keshav 三阶段组会式精读,结果写入 formal note | OCR 完成、analyze 为 true | +| 问答 | `/pf-paper ` 或 `文献问答 ` | 交互式论文 Q&A,不强制 OCR | 已有正式笔记 | +| 存档 | `/pf-end` 或 `结束讨论` | 保存本次 `/pf-paper` 问答记录 | `/pf-paper` 会话中 | + +> **两种触发方式等效**:你可以用 Agent 原生命令 `/pf-deep ABC12345`,也可以用自然语言 `精读 ABC12345`。Agent 识别到触发词后会自动加载 `literature-qa` Skill。 + +> `/pf-deep` 和 `/pf-paper` **不是终端命令**,也不是 Agent 平台的注册插件。它们是部署在 Vault 里的 Skill 文件的触发词。安装向导把 Skill 文件放到正确位置后,Agent 启动时自动发现。使用方式就是打开 Agent 对话,输入触发词 —— 和你在终端敲 `ls` 一样直接。 + +### `/pf-end` 详解 + +- `/pf-end` 仅对 `/pf-paper` 问答会话生效。精读(`/pf-deep`)的内容直接写入 formal note,不需要 `/pf-end`。 +- 执行后会在论文 workspace 下生成两个文件: + - `discussion.md` — 人类可读的 Q&A 讨论记录 + - `discussion.json` — 结构化 Q&A 数据(含时间戳、来源标记) +- Dashboard 的 **Per-paper** 视图会自动以讨论记录卡片形式展示这些记录 + +> 不同 Agent 的命令前缀可能不同(大部分是 `/`,Codex 是 `$`)。 + +--- + +## 7. 完整工作流 + +``` +Zotero 添加论文 + ↓ Better BibTeX 自动导出 JSON 到 exports/ 目录 +Dashboard → Sync Library + ↓ 生成正式笔记(Literature/ 目录下,带 frontmatter 元数据) +在笔记 frontmatter 里把 do_ocr 设为 true + ↓ +Dashboard → Run OCR + ↓ PaddleOCR 提取全文 + 图表 → ocr/ 目录 +在笔记 frontmatter 里把 analyze 设为 true + ↓ +打开 Agent → 输入 /pf-deep + ↓ Agent 识别触发词 → 加载 literature-qa Skill → 三阶段精读 +笔记里出现 ## 🔍 精读 区域 + ↓(如需额外问答) +打开 Agent → 输入 /pf-paper + ↓ 交互式 Q&A +输入 /pf-end 保存讨论记录 + ↓ +Dashboard Per-paper 视图展示讨论卡片 +``` + +--- + +## 8. 
常见问题 + +### 插件加载失败(Cannot find module) + +- 确认 `.obsidian/plugins/paperforge/` 下有 `main.js`、`manifest.json`、`styles.css` 三个文件 +- 如果 BRAT 从旧版升级后出问题:删除整个 `paperforge` 插件文件夹,让 BRAT 重新下载 +- 打开 Developer Console(`Ctrl+Shift+I`)看红色报错 + +### "同步运行时" 点了还是旧版本 + +- 插件调用的 Python 可能和你终端用的是不同环境。检查设置 → Python 解释器路径 +- pip 缓存问题,用 `--no-cache-dir` 重装 +- 确认 `https://github.com/LLLin000/PaperForge` 网络能通 + +### OCR 一直 pending + +- 确认 `.env` 里有 `PADDLEOCR_API_TOKEN` +- 终端运行 `paperforge ocr --diagnose` 检查 API 连通性 +- PDF 路径可能不对:运行 `paperforge repair --fix-paths` + +### 同步后没有生成笔记 + +- Zotero Better BibTeX 是否配置了自动导出?JSON 是否在 `exports/` 目录? +- 运行 `paperforge doctor` 看具体哪一步失败 +- 运行 `paperforge status` 查看系统状态总览 + +### /pf-deep 触发词没反应 + +- 确认你在 **Agent 应用** 里输入,不是在终端 +- 确认安装向导已运行,Skill 文件已部署到正确的 Vault 目录 +- 确认 OCR 已完成(ocr_status: done) +- 确认 analyze 已设为 true + +--- + +## 9. 更新 + +BRAT 会自动检测插件更新。Python 包更新: + +```bash +paperforge update +# 或 +pip install --upgrade paperforge +``` + +--- + +## 10. 架构 + +``` +paperforge/ +├── core/ 契约层 — PFResult/ErrorCode/状态机 +├── adapters/ 适配器层 — BBT 解析、路径、frontmatter +├── services/ 服务层 — SyncService 编排 +├── worker/ 工人层 — OCR、状态、修复 +├── commands/ CLI 分发 +├── setup/ 安装向导(目录创建、Agent 部署、Zotero 链接) +├── plugin/ Obsidian 插件(Dashboard、设置面板) +└── schema/ 字段注册表 +``` + +--- + +## 协议 + +[CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/)。仅限非商业使用。 + +## 致谢 + +基于 [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)、[Obsidian](https://obsidian.md)、[Better BibTeX for Zotero](https://retorque.re/zotero-better-bibtex/) 等开源项目构建。 diff --git a/docs/superpowers/specs/2026-05-14-dashboard-copy-metadata-design.md b/docs/superpowers/specs/2026-05-14-dashboard-copy-metadata-design.md new file mode 100644 index 0000000..aee4eea --- /dev/null +++ b/docs/superpowers/specs/2026-05-14-dashboard-copy-metadata-design.md @@ -0,0 +1,176 @@ +# Dashboard Copy Interaction + Per-Paper Metadata Enhancement + +> **Status:** Spec complete, awaiting implementation +> **Date:** 
2026-05-14
> **Scope:** Plugin JS + CSS only (main.js + styles.css)

## Goal

Three UX improvements to the per-paper dashboard view:
1. **Click-to-copy** for discrete metadata fields (single click → clipboard)
2. **Text-selectable** for prose content areas (normal browser selection + copy)
3. **Metadata enhancement** — add Journal / DOI / Zotero Key / Collection Path in a compact inline row below authors/year

---

## Design

### Per-Paper View Layout (after changes)

```
┌────────────────────────────────────────────────────────────────┐
│ │
│ Efficacy of TXA in Reducing Blood Loss... [📋] │ ← Title, click-to-copy, copy icon on hover
│ Tianli Xia, Hiroyasu Konno, Jeonghyun Ahn · 2016 │ ← Authors (click-to-copy) · Year
│ │
│ Cancer Research · DOI: 10.1158/... · Zotero: ABCDEFG 🔍 │ ← NEW meta-line (Zotero-style)
│ 📂 Orthopedics / Spine │ ← NEW collection path (click-to-copy)
│ │
│ [PDF] [Fulltext] [OCR done] [Deep-read pending] │ ← existing status pills + file buttons
│ │
│ ## 🔍 精读 (article overview — text-selectable) │ ← existing, keep selectable
│ ## 💬 Discussion (Q&A — text-selectable) │ ← existing, keep selectable
│ ▶ Technical Details (health, paths — click-to-copy fields) │ ← existing, add copy-on-click
│ │
└────────────────────────────────────────────────────────────────┘
```

### Meta-Line CSS (from user-provided reference)

```css
/* Metadata inline row — Zotero style */
.paperforge-meta-line {
  margin-top: 8px;
  font-size: 13px;
  color: var(--text-muted);
  display: flex;
  flex-wrap: wrap;
  gap: 6px 10px;
  align-items: center;
}

.paperforge-meta-item {
  white-space: nowrap;
}

.paperforge-meta-key {
  color: var(--text-faint);
  margin-right: 4px;
}

.paperforge-meta-value {
  color: var(--text-muted);
}

.paperforge-meta-value.mono {
  font-family: var(--font-monospace);
}

/* Clickable fields */
.paperforge-meta-value.clickable,
.paperforge-click-copy {
  cursor: pointer;
  border-bottom: 1px
dashed var(--text-faint); + transition: color 0.15s, border-color 0.15s; +} + +.paperforge-meta-value.clickable:hover, +.paperforge-click-copy:hover { + color: var(--text-accent); + border-bottom-color: var(--text-accent); +} + +.paperforge-copy-icon { + opacity: 0; + margin-left: 4px; + font-size: 11px; + cursor: pointer; + transition: opacity 0.15s; +} +.paperforge-click-copy:hover .paperforge-copy-icon { + opacity: 0.6; +} +``` + +### Interaction Rules + +| Field | Type | Click behavior | +| ------------------ | -------- | -------------------------------------------------- | +| Title | Copy | Click → copy full title; copy icon appears on hover | +| Authors | Copy | Click → copy author string | +| Journal | Display | Read-only, no copy | +| DOI | Copy | Click → copy DOI; also link icon to doi.org | +| Zotero Key | Copy | Click → copy key (monospace) | +| Collection Path | Copy | Click → copy pipe-joined path | +| PMID (if present) | Copy | Click → copy PMID | +| Note Path | Copy | Inside Technical Details; click → copy path | +| Fulltext Path | Copy | Inside Technical Details; click → copy path | +| Article Overview | Select | Normal text selection, no click-to-copy | +| Recent Discussion | Select | Normal text selection, no click-to-copy | +| Technical Details | Mixed | Paths are click-to-copy; status text is selectable | + +### Copy Feedback + +On click → `navigator.clipboard.writeText(value)` then show a brief feedback: +- Change field text to "Copied!" 
for 1 second, then restore +- OR show a floating tooltip +- Recommended: inline text change (simpler, no new element needed) + +--- + +## Implementation Tasks + +### Task 1: Add CSS to styles.css +- Add `.paperforge-meta-line`, `.paperforge-meta-item`, `.paperforge-meta-key`, `.paperforge-meta-value` rules +- Add `.paperforge-click-copy` + `.paperforge-copy-icon` hover rules +- Ensure existing content areas have no `user-select: none` + +### Task 2: Render meta-line in _renderPaperMode +- File: `paperforge/plugin/main.js`, in `PaperForgeStatusView._renderPaperMode()` +- After authors/year rendering (~line 1591), insert meta-line div +- Fields: Journal · DOI: xxx · Zotero: xxx · Collection: xxx +- Source data: `entry.journal`, `entry.doi`, `entry.zotero_key`, `entry.collection_path`, `entry.pmid` +- Add `paperforge-meta-value mono clickable` class to DOI, Zotero Key, PMID + +### Task 3: Implement click-to-copy helper +- Add `_makeClickCopy(el, value, displayText)` method to `PaperForgeStatusView` + - Sets cursor:pointer, dashed border, onclick handler + - On click: copy value, change text to "Copied!", setTimeout restore displayText +- Apply to: title, authors, DOI, zotero_key, collection_path, pmid + +### Task 4: Apply click-to-copy to Technical Details +- File: `paperforge/plugin/main.js`, `_renderPaperTechnicalDetails()` +- Make Note Path and Fulltext Path clickable +- Use same `_makeClickCopy` helper + +### Task 5: Add copy icon to title +- Append a small 📋 span to the title element +- Show on hover via CSS opacity transition + +### Task 6: Verify text selection behavior +- Confirm article overview, recent discussion, and tech details body text are NOT `user-select: none` +- Remove any existing `user-select: none` from content areas (but keep on buttons/toggles) + +--- + +## Files to Modify + +| File | Changes | +| ------------------------- | ---------------------------------------------------------- | +| `paperforge/plugin/styles.css` | ~40 lines: meta-line + 
click-copy + copy-icon CSS | +| `paperforge/plugin/main.js` | ~60 lines: meta-line rendering + _makeClickCopy() + wiring | + +--- + +## Acceptance Criteria + +- [ ] Meta-line appears below authors/year in per-paper view: Journal · DOI · Zotero · Collection +- [ ] DOI and Zotero Key are monospace, dashed-underline on hover, click to copy +- [ ] Title shows copy icon on hover, click to copy full title +- [ ] Authors click to copy +- [ ] Collection path click to copy +- [ ] Note Path / Fulltext Path in Technical Details click to copy +- [ ] Article overview and Recent Discussion text remains freely selectable +- [ ] "Copied!" feedback displays briefly on click +- [ ] No regressions in global or collection modes +- [ ] Works in both light and dark Obsidian themes diff --git a/paperforge/plugin/styles.css b/paperforge/plugin/styles.css index 31f0725..8c0682e 100644 --- a/paperforge/plugin/styles.css +++ b/paperforge/plugin/styles.css @@ -1584,15 +1584,15 @@ } .paperforge-runtime-badge.match { background: var(--color-green, #4caf50); - color: #fff; + color: #ffffff; } .paperforge-runtime-badge.mismatch { background: var(--color-red, #f44336); - color: #fff; + color: #ffffff; } .paperforge-runtime-badge.missing { background: var(--color-orange, #ff9800); - color: #fff; + color: #ffffff; } /* ========================================================================== @@ -1600,7 +1600,7 @@ ========================================================================== */ .paperforge-drift-banner { background: var(--color-yellow, #ffc107); - color: var(--background-primary, #000); + color: var(--background-primary, #000000); padding: 8px 14px; border-radius: 6px; font-size: 0.9em; @@ -1619,8 +1619,8 @@ padding: 4px 12px; font-size: 0.85em; background: var(--interactive-normal, #e0e0e0); - color: var(--text-normal, #000); - border: 1px solid var(--background-modifier-border, #ccc); + color: var(--text-normal, #000000); + border: 1px solid var(--background-modifier-border, #cccccc); 
border-radius: 4px; cursor: pointer; } From 6f6c1ae19d2228cacef06052b42173184ae539c5 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Thu, 14 May 2026 01:31:43 +0800 Subject: [PATCH 106/132] =?UTF-8?q?spec:=20remove=20copy=20icon=20?= =?UTF-8?q?=E2=80=94=20clean=20click-to-copy=20with=20dashed=20underline?= =?UTF-8?q?=20only?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...26-05-14-dashboard-copy-metadata-design.md | 28 ++++++------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/docs/superpowers/specs/2026-05-14-dashboard-copy-metadata-design.md b/docs/superpowers/specs/2026-05-14-dashboard-copy-metadata-design.md index aee4eea..9d40e07 100644 --- a/docs/superpowers/specs/2026-05-14-dashboard-copy-metadata-design.md +++ b/docs/superpowers/specs/2026-05-14-dashboard-copy-metadata-design.md @@ -80,15 +80,10 @@ Two UX improvements to the per-paper dashboard view: border-bottom-color: var(--text-accent); } -.paperforge-copy-icon { - opacity: 0; - margin-left: 4px; - font-size: 11px; - cursor: pointer; - transition: opacity 0.15s; -} -.paperforge-click-copy:hover .paperforge-copy-icon { - opacity: 0.6; +/* Feedback flash on copy */ +.paperforge-copied { + color: var(--text-accent) !important; + border-bottom-color: var(--text-success) !important; } ``` @@ -111,10 +106,7 @@ Two UX improvements to the per-paper dashboard view: ### Copy Feedback -On click → `navigator.clipboard.writeText(value)` then show a brief feedback: -- Change field text to "Copied!" for 1 second, then restore -- OR show a floating tooltip -- Recommended: inline text change (simpler, no new element needed) +On click → `navigator.clipboard.writeText(value)`. Brief inline feedback: text briefly turns accent color (1s), then restores. No icon, no tooltip. 
--- @@ -143,11 +135,7 @@ On click → `navigator.clipboard.writeText(value)` then show a brief feedback: - Make Note Path and Fulltext Path clickable - Use same `_makeClickCopy` helper -### Task 5: Add copy icon to title -- Append a small 📋 span to the title element -- Show on hover via CSS opacity transition - -### Task 6: Verify text selection behavior +### Task 5: Verify text selection behavior - Confirm article overview, recent discussion, and tech details body text are NOT `user-select: none` - Remove any existing `user-select: none` from content areas (but keep on buttons/toggles) @@ -166,11 +154,11 @@ On click → `navigator.clipboard.writeText(value)` then show a brief feedback: - [ ] Meta-line appears below authors/year in per-paper view: Journal · DOI · Zotero · Collection - [ ] DOI and Zotero Key are monospace, dashed-underline on hover, click to copy -- [ ] Title shows copy icon on hover, click to copy full title +- [ ] Title dashed-underline on hover, click to copy - [ ] Authors click to copy - [ ] Collection path click to copy - [ ] Note Path / Fulltext Path in Technical Details click to copy - [ ] Article overview and Recent Discussion text remains freely selectable -- [ ] "Copied!" 
feedback displays briefly on click +- [ ] Brief color-flash feedback on copy (no icon, no tooltip) - [ ] No regressions in global or collection modes - [ ] Works in both light and dark Obsidian themes From b4632e32055f27f302547732ff06352754af19b1 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Thu, 14 May 2026 01:35:22 +0800 Subject: [PATCH 107/132] fix(plugin): replace deprecated revealLeaf().setViewState with internalPlugins toggle --- paperforge/plugin/main.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 68f3e41..a607780 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -1535,7 +1535,8 @@ class PaperForgeStatusView extends ItemView { const baseDir = plugin?.settings?.base_dir || 'Bases'; const baseFile = this.app.vault.getAbstractFileByPath(baseDir); if (baseFile) { - this.app.workspace.revealLeaf().setViewState({ type: 'file-explorer', active: true }); + const explorer = this.app.internalPlugins.getPluginById('file-explorer'); + if (explorer) explorer.toggle(true); } else { new Notice('[!!] 
Base directory not found: ' + baseDir, 6000); } From 9af8bbfb28a00a4aeefbe22ed449db775f04af3c Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Thu, 14 May 2026 01:38:06 +0800 Subject: [PATCH 108/132] fix(plugin): simplify file-explorer open to enable() call --- paperforge/plugin/main.js | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index a607780..5a26ea9 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -1535,8 +1535,7 @@ class PaperForgeStatusView extends ItemView { const baseDir = plugin?.settings?.base_dir || 'Bases'; const baseFile = this.app.vault.getAbstractFileByPath(baseDir); if (baseFile) { - const explorer = this.app.internalPlugins.getPluginById('file-explorer'); - if (explorer) explorer.toggle(true); + this.app.internalPlugins.getPluginById('file-explorer')?.enable?.(true); } else { new Notice('[!!] Base directory not found: ' + baseDir, 6000); } From d6960a78a8e98fd050e41e4775d2017c354b06e8 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Thu, 14 May 2026 01:39:26 +0800 Subject: [PATCH 109/132] fix(plugin): use getLeavesOfType + revealLeaf to focus existing file-explorer --- paperforge/plugin/main.js | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 5a26ea9..dada938 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -1535,7 +1535,10 @@ class PaperForgeStatusView extends ItemView { const baseDir = plugin?.settings?.base_dir || 'Bases'; const baseFile = this.app.vault.getAbstractFileByPath(baseDir); if (baseFile) { - this.app.internalPlugins.getPluginById('file-explorer')?.enable?.(true); + const leaves = this.app.workspace.getLeavesOfType('file-explorer'); + if (leaves.length > 0) { + this.app.workspace.revealLeaf(leaves[0]); + } } else { new Notice('[!!] 
Base directory not found: ' + baseDir, 6000); } From 97d9715ea57cdabccc234d93ee7f3c018fc26664 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Thu, 14 May 2026 01:41:51 +0800 Subject: [PATCH 110/132] fix(plugin): open .base file directly instead of file-explorer + trim domain basename --- paperforge/plugin/main.js | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index dada938..0d11f3d 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -1326,7 +1326,7 @@ class PaperForgeStatusView extends ItemView { const filePath = file.path; if (ext === 'base') { - return { mode: 'collection', filePath, key: null, domain: file.basename }; + return { mode: 'collection', filePath, key: null, domain: file.basename.trim() }; } if (ext === 'md') { @@ -1533,11 +1533,18 @@ class PaperForgeStatusView extends ItemView { hubBtn.createEl('span', { text: 'Open Literature Hub' }); hubBtn.addEventListener('click', () => { const baseDir = plugin?.settings?.base_dir || 'Bases'; - const baseFile = this.app.vault.getAbstractFileByPath(baseDir); - if (baseFile) { - const leaves = this.app.workspace.getLeavesOfType('file-explorer'); - if (leaves.length > 0) { - this.app.workspace.revealLeaf(leaves[0]); + const baseFolder = this.app.vault.getAbstractFileByPath(baseDir); + if (baseFolder) { + // Find first .base file in the base directory + let baseFile = null; + if (baseFolder.children) { + baseFile = baseFolder.children.find(f => f.extension === 'base'); + } + if (baseFile) { + const leaf = this.app.workspace.getLeaf(false); + if (leaf) leaf.openFile(baseFile); + } else { + new Notice('[!!] No .base file found in ' + baseDir, 6000); } } else { new Notice('[!!] 
Base directory not found: ' + baseDir, 6000); From cfc075555bda559daf6406a7486cb6c0e739ffa5 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Thu, 14 May 2026 01:49:38 +0800 Subject: [PATCH 111/132] fix(plugin): fall back to global mode when Base domain has no matching papers --- paperforge/plugin/main.js | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 0d11f3d..9651219 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -1977,13 +1977,14 @@ class PaperForgeStatusView extends ItemView { const domain = this._currentDomain || 'Unknown'; const domainItems = this._filterByDomain(domain); - const view = this._contentEl.createEl('div', { cls: 'paperforge-collection-view' }); - if (domainItems.length === 0) { - this._renderEmptyState(view, 'No papers found in domain "' + domain + '". Sync some papers first.'); + // Fall back to global mode if no papers match this domain (e.g. "Literature Hub") + this._renderGlobalMode(); return; } + const view = this._contentEl.createEl('div', { cls: 'paperforge-collection-view' }); + // ── Single-pass aggregation ── const totalPapers = domainItems.length; let hasPdf = 0, ocrDone = 0, analyzeReady = 0, deepRead = 0; From 5493e28d6619d4dd3404617e67d9b49583d1a218 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Thu, 14 May 2026 16:51:14 +0800 Subject: [PATCH 112/132] feat: add reading_log and project_log tables to memory schema --- paperforge/memory/schema.py | 42 +++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/paperforge/memory/schema.py b/paperforge/memory/schema.py index e9695e7..9989ffa 100644 --- a/paperforge/memory/schema.py +++ b/paperforge/memory/schema.py @@ -2,7 +2,7 @@ import sqlite3 -CURRENT_SCHEMA_VERSION = 1 +CURRENT_SCHEMA_VERSION = 2 # Bump from 1 for reading_log + project_log tables CREATE_META = """ CREATE TABLE IF NOT EXISTS meta ( @@ -136,7 +136,43 @@ 
"CREATE INDEX IF NOT EXISTS idx_events_time ON paper_events(created_at);", ] -ALL_TABLES = ["paper_fts", "papers", "paper_assets", "paper_aliases", "meta", "paper_events"] +CREATE_READING_LOG = """ +CREATE TABLE IF NOT EXISTS reading_log ( + id TEXT PRIMARY KEY, + paper_id TEXT NOT NULL, + project TEXT DEFAULT '', + section TEXT NOT NULL, + excerpt TEXT NOT NULL, + context TEXT DEFAULT '', + usage TEXT NOT NULL, + note TEXT DEFAULT '', + tags_json TEXT DEFAULT '[]', + created_at TEXT NOT NULL, + agent TEXT DEFAULT '', + verified INTEGER DEFAULT 0, + FOREIGN KEY (paper_id) REFERENCES papers(zotero_key) +); +""" + +CREATE_PROJECT_LOG = """ +CREATE TABLE IF NOT EXISTS project_log ( + id TEXT PRIMARY KEY, + project TEXT NOT NULL, + date TEXT NOT NULL, + type TEXT NOT NULL, + title TEXT NOT NULL, + decisions_json TEXT DEFAULT '[]', + detours_json TEXT DEFAULT '[]', + reusable_json TEXT DEFAULT '[]', + todos_json TEXT DEFAULT '[]', + related_papers_json TEXT DEFAULT '[]', + tags_json TEXT DEFAULT '[]', + created_at TEXT NOT NULL, + agent TEXT DEFAULT '' +); +""" + +ALL_TABLES = ["paper_fts", "papers", "paper_assets", "paper_aliases", "meta", "paper_events", "reading_log", "project_log"] def ensure_schema(conn: sqlite3.Connection) -> None: @@ -147,6 +183,8 @@ def ensure_schema(conn: sqlite3.Connection) -> None: conn.execute(CREATE_ALIASES) conn.execute(CREATE_PAPER_FTS) conn.execute(CREATE_EVENTS) + conn.execute(CREATE_READING_LOG) + conn.execute(CREATE_PROJECT_LOG) for idx_sql in INDEX_SQL: conn.execute(idx_sql) for idx_sql in EVENT_INDEX_SQL: From 0d1ca3a2bed4f1ac3677ea7f8049aa691f51b921 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Thu, 14 May 2026 17:08:10 +0800 Subject: [PATCH 113/132] feat: add permanent JSONL storage layer for reading-log and project-log --- paperforge/memory/permanent.py | 154 +++++++++++++++++++++++++++++++++ 1 file changed, 154 insertions(+) create mode 100644 paperforge/memory/permanent.py diff --git a/paperforge/memory/permanent.py 
b/paperforge/memory/permanent.py new file mode 100644 index 0000000..ec23944 --- /dev/null +++ b/paperforge/memory/permanent.py @@ -0,0 +1,154 @@ +from __future__ import annotations + +import json +import datetime +import logging +import secrets +from pathlib import Path + +from paperforge.config import paperforge_paths + +logger = logging.getLogger(__name__) + + +def _logs_dir(vault: Path) -> Path: + paths = paperforge_paths(vault) + return paths["paperforge"] / "logs" + + +def _ensure_logs_dir(vault: Path) -> Path: + log_dir = _logs_dir(vault) + log_dir.mkdir(parents=True, exist_ok=True) + return log_dir + + +# ── Reading Log ──────────────────────────────────────────────────────────── + + +def get_reading_log_path(vault: Path) -> Path: + return _logs_dir(vault) / "reading-log.jsonl" + + +def append_reading_note( + vault: Path, + paper_id: str, + section: str, + excerpt: str, + usage: str = "", + context: str = "", + note: str = "", + project: str = "", + tags: list[str] | None = None, + agent: str = "", +) -> dict: + if not paper_id: + return {"ok": False, "error": "paper_id is required"} + if not excerpt: + return {"ok": False, "error": "excerpt is required"} + + date_str = datetime.date.today().strftime("%Y%m%d") + entry_id = f"rln_{date_str}_{secrets.token_hex(4)}" + now = datetime.datetime.now(datetime.timezone.utc).isoformat() + + entry: dict[str, object] = { + "id": entry_id, + "created_at": now, + "paper_id": paper_id, + "section": section, + "excerpt": excerpt, + "usage": usage, + "context": context, + "note": note, + "project": project, + "tags": tags or [], + "agent": agent, + } + + log_dir = _ensure_logs_dir(vault) + filepath = log_dir / "reading-log.jsonl" + + try: + with filepath.open("a", encoding="utf-8") as f: + f.write(json.dumps(entry, ensure_ascii=False) + "\n") + except OSError as e: + return {"ok": False, "error": str(e)} + + return {"ok": True, "id": entry_id, "path": str(filepath)} + + +def _read_jsonl(filepath: Path) -> list[dict]: + if 
not filepath.exists(): + return [] + entries: list[dict] = [] + with filepath.open("r", encoding="utf-8") as f: + for line_no, line in enumerate(f, 1): + stripped = line.strip() + if not stripped: + continue + try: + entries.append(json.loads(stripped)) + except json.JSONDecodeError: + logger.warning( + "Skipping malformed JSON line %d in %s", line_no, filepath + ) + return entries + + +def read_all_reading_notes(vault: Path) -> list[dict]: + filepath = get_reading_log_path(vault) + return _read_jsonl(filepath) + + +def get_reading_notes_for_paper(vault: Path, paper_id: str) -> list[dict]: + all_notes = read_all_reading_notes(vault) + return [n for n in all_notes if n.get("paper_id") == paper_id] + + +# ── Project Log ──────────────────────────────────────────────────────────── + + +def get_project_log_path(vault: Path) -> Path: + return _logs_dir(vault) / "project-log.jsonl" + + +def append_project_entry(vault: Path, entry: dict) -> dict: + date_str = datetime.date.today().strftime("%Y%m%d") + entry_id = f"plog_{date_str}_{secrets.token_hex(4)}" + now = datetime.datetime.now(datetime.timezone.utc).isoformat() + + record: dict[str, object] = { + "id": entry_id, + "created_at": now, + "project": entry.get("project", ""), + "entry_type": entry.get("entry_type", ""), + "content": entry.get("content", ""), + "status": entry.get("status", ""), + "decisions": entry.get("decisions", []), + "detours": entry.get("detours", []), + "reusable": entry.get("reusable", []), + "todos": entry.get("todos", []), + "related_papers": entry.get("related_papers", []), + "tags": entry.get("tags", []), + "agent": entry.get("agent", ""), + } + + log_dir = _ensure_logs_dir(vault) + filepath = log_dir / "project-log.jsonl" + + try: + with filepath.open("a", encoding="utf-8") as f: + f.write(json.dumps(record, ensure_ascii=False) + "\n") + except OSError as e: + return {"ok": False, "error": str(e)} + + return {"ok": True, "id": entry_id, "path": str(filepath)} + + +def 
read_all_project_entries(vault: Path) -> list[dict]: + filepath = get_project_log_path(vault) + return _read_jsonl(filepath) + + +def get_project_entries(vault: Path, project: str) -> list[dict]: + all_entries = read_all_project_entries(vault) + return [e for e in all_entries if e.get("project") == project] From 2631edb81c2f37113c12959af903839ee06994a5 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Thu, 14 May 2026 17:20:29 +0800 Subject: [PATCH 114/132] fix: align JSONL field names with spec (type/title, add verified) --- paperforge/memory/permanent.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/paperforge/memory/permanent.py b/paperforge/memory/permanent.py index ec23944..1c799ed 100644 --- a/paperforge/memory/permanent.py +++ b/paperforge/memory/permanent.py @@ -62,6 +62,7 @@ def append_reading_note( "project": project, "tags": tags or [], "agent": agent, + "verified": False, } log_dir = _ensure_logs_dir(vault) @@ -120,9 +121,9 @@ def append_project_entry(vault: Path, entry: dict) -> dict: "id": entry_id, "created_at": now, "project": entry.get("project", ""), - "entry_type": entry.get("entry_type", ""), - "content": entry.get("content", ""), - "status": entry.get("status", ""), + "date": entry.get("date", ""), + "type": entry.get("type", ""), + "title": entry.get("title", ""), "decisions": entry.get("decisions", []), "detours": entry.get("detours", []), "reusable": entry.get("reusable", []), From d16b6c7d591970e49aab5d9aac337075cd2ed921 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Thu, 14 May 2026 17:31:32 +0800 Subject: [PATCH 115/132] feat: upgrade reading-log to JSONL with context/tags/project fields and auto-render --- paperforge/cli.py | 11 ++- paperforge/commands/reading_log.py | 152 +++++++++++++++++++++++++++-- paperforge/memory/events.py | 35 ++++++- 3 files changed, 189 insertions(+), 9 deletions(-) diff --git a/paperforge/cli.py b/paperforge/cli.py index f6679fb..90ce635 100644 --- a/paperforge/cli.py 
+++ b/paperforge/cli.py @@ -291,6 +291,13 @@ def build_parser() -> argparse.ArgumentParser: p_rl.add_argument("--excerpt", help="Quoted excerpt") p_rl.add_argument("--usage", help="How this supports the current writing") p_rl.add_argument("--note", help="Optional cross-validation note") + p_rl.add_argument("--context", help="Full paragraph containing excerpt") + p_rl.add_argument("--tags", help="Comma-separated tags") + p_rl.add_argument("--project", help="Associated project name") + p_rl.add_argument("--render", action="store_true", help="Render reading-log.md for one or all projects") + p_rl.add_argument("--correct", dest="correct_id", help="ID of prior reading note to correct") + p_rl.add_argument("--correction", help="Correction text") + p_rl.add_argument("--reason", help="Reason for correction (e.g. 'Rechecked figure legend')") p_rl.add_argument("--since", help="Export notes since date (YYYY-MM-DD)") p_rl.add_argument("--limit", type=int, default=50, help="Max notes to export") p_rl.add_argument("--output", help="Write markdown to file") @@ -636,11 +643,11 @@ def _cmd_paths(vault: Path, args: argparse.Namespace) -> int: if args.json: # Output only the keys required by D-Path Output contract - output_keys = {"vault", "worker_script", "ld_deep_script"} + output_keys = {"vault", "worker_script", "pf_deep_script"} filtered = {k: v for k, v in all_paths.items() if k in output_keys} filtered["vault"] = str(vault.resolve()) filtered["worker_script"] = str(paths["worker_script"].resolve()) - filtered["ld_deep_script"] = str(paths["ld_deep_script"].resolve()) + filtered["pf_deep_script"] = str(paths["pf_deep_script"].resolve()) print(json.dumps(filtered, ensure_ascii=False, indent=2)) else: for key, path_str in sorted(all_paths.items()): diff --git a/paperforge/commands/reading_log.py b/paperforge/commands/reading_log.py index a703198..37c5ff5 100644 --- a/paperforge/commands/reading_log.py +++ b/paperforge/commands/reading_log.py @@ -1,16 +1,21 @@ from __future__ 
import annotations import argparse +import datetime import json import re from pathlib import Path from paperforge import __version__ as PF_VERSION +from paperforge.config import paperforge_paths from paperforge.core.errors import ErrorCode from paperforge.core.result import PFError, PFResult from paperforge.memory.db import get_connection, get_memory_db_path -from paperforge.memory.events import export_reading_log, write_reading_note - +from paperforge.memory.events import export_reading_log, write_correction_note, write_reading_note +from paperforge.memory.permanent import ( + append_reading_note, + read_all_reading_notes, +) _HEADER_RE = re.compile(r"^## ([A-Z0-9]{8}) \u2014 .+ \d{4}$") _TITLE_RE = re.compile(r"^\*\*Title:\*\* (.+)") @@ -211,6 +216,72 @@ def lookup_paper_events(vault: Path, key: str) -> dict: conn.close() +def _render_reading_log_md(vault: Path, project: str = "") -> None: + """Render reading-log.md from JSONL source of truth. + + Groups notes by paper_id and writes a formatted markdown file. + If project is specified, writes to /Projects//reading-log.md. + Otherwise writes to /logs/rendered/reading-log.md. 
+ """ + paths = paperforge_paths(vault) + notes = read_all_reading_notes(vault) + + if not notes: + print("No reading notes to render.") + return + + if project: + notes = [n for n in notes if n.get("project") == project] + + if not notes: + print(f"No reading notes found{' for project ' + project if project else ''}.") + return + + grouped: dict[str, list[dict]] = {} + for n in notes: + pid = n.get("paper_id", "unknown") + grouped.setdefault(pid, []).append(n) + + lines: list[str] = [] + heading = f"Reading Log \u2014 {project}" if project else "Reading Log \u2014 All Projects" + lines.append(f"# {heading}\n") + lines.append(f"*Generated: {datetime.date.today().isoformat()} | Total entries: {len(notes)}*\n") + + for pid, entries in sorted(grouped.items()): + lines.append(f"## {pid}\n") + for entry in sorted(entries, key=lambda e: (e.get("section", ""), e.get("created_at", ""))): + section = entry.get("section", "Untitled") + lines.append(f"### {section}") + lines.append(f"> {entry.get('excerpt', '')}") + if entry.get("context"): + lines.append(">") + lines.append(f"> {entry.get('context')}") + lines.append("") + if entry.get("usage"): + lines.append(f"- **Usage:** {entry.get('usage')}") + if entry.get("note"): + lines.append(f"- **Note:** {entry.get('note')}") + tag_list = entry.get("tags", []) + if tag_list: + lines.append(f"- **Tags:** {', '.join(tag_list)}") + verified = entry.get("verified", False) + lines.append(f"- **Verified:** {'Yes' if verified else 'No'}") + lines.append("") + lines.append("---\n") + + if project: + output_dir = paths["resources"] / "Projects" / project + output_dir.mkdir(parents=True, exist_ok=True) + output_path = output_dir / "reading-log.md" + else: + output_dir = paths["paperforge"] / "logs" / "rendered" + output_dir.mkdir(parents=True, exist_ok=True) + output_path = output_dir / "reading-log.md" + + output_path.write_text("\n".join(lines), encoding="utf-8") + print(f"Rendered {len(notes)} entries to {output_path}") + + def 
run(args: argparse.Namespace) -> int: vault = args.vault_path @@ -265,22 +336,91 @@ def run(args: argparse.Namespace) -> int: print(f"No entries found for key: {args.lookup}") return 0 + if args.render: + _render_reading_log_md(vault, args.project or "") + return 0 + + if args.correct_id: + if not args.correction: + result = PFResult( + ok=False, command="reading-log", version=PF_VERSION, + data={}, + error=PFError(code=ErrorCode.INVALID_INPUT, + message="--correction is required with --correct"), + ) + if args.json: + print(result.to_json()) + else: + print("Error: --correction is required with --correct") + return 1 + + all_notes = read_all_reading_notes(vault) + original = next((n for n in all_notes if n.get("id") == args.correct_id), None) + paper_id = original.get("paper_id", "") if original else "" + if not paper_id: + result = PFResult( + ok=False, command="reading-log", version=PF_VERSION, + data={}, + error=PFError(code=ErrorCode.NOT_FOUND, + message=f"Original entry {args.correct_id} not found in JSONL"), + ) + if args.json: + print(result.to_json()) + else: + print(f"Error: Original entry {args.correct_id} not found in reading-log.jsonl") + return 1 + + ok = write_correction_note( + vault, paper_id, args.correct_id, + args.correction, args.reason or "", + ) + result = PFResult( + ok=ok, command="reading-log", version=PF_VERSION, + data={"written": ok}, + error=PFError(code=ErrorCode.INTERNAL_ERROR, + message="Failed to write correction") if not ok else None, + ) + if args.json: + print(result.to_json()) + else: + print(f"Correction written for {args.correct_id}." 
if ok else "Failed.") + return 0 if ok else 1 + if args.paper_id and args.excerpt: - ok = write_reading_note( + tags_list = [t.strip() for t in args.tags.split(",") if t.strip()] if args.tags else None + + jsonl_result = append_reading_note( + vault, args.paper_id, args.section or "", + args.excerpt, args.usage or "", args.context or "", + args.note or "", args.project or "", tags_list, + ) + + _db_ok = write_reading_note( vault, args.paper_id, args.section or "", args.excerpt, args.usage or "", args.note or "", + args.context or "", args.project or "", tags_list, ) + + ok = jsonl_result.get("ok", False) result = PFResult( ok=ok, command="reading-log", version=PF_VERSION, - data={"written": ok}, - error=PFError(code=ErrorCode.INTERNAL_ERROR, message="Failed to write") if not ok else None, + data={"written": ok, "id": jsonl_result.get("id"), "path": jsonl_result.get("path")}, + error=PFError(code=ErrorCode.INTERNAL_ERROR, + message=jsonl_result.get("error", "Failed to write")) if not ok else None, ) if args.json: print(result.to_json()) else: - print("Written." if ok else "Failed.") + if ok: + print(f"Written. 
ID: {jsonl_result.get('id', 'unknown')}") + else: + print(f"Failed: {jsonl_result.get('error', 'unknown')}") + + if ok and args.project: + _render_reading_log_md(vault, args.project) + return 0 if ok else 1 notes = export_reading_log(vault, since=args.since or "", limit=args.limit or 50) diff --git a/paperforge/memory/events.py b/paperforge/memory/events.py index 444e65b..c8cbae3 100644 --- a/paperforge/memory/events.py +++ b/paperforge/memory/events.py @@ -7,7 +7,9 @@ def write_reading_note(vault: Path, paper_id: str, section: str, - excerpt: str, usage: str = "", note: str = "") -> bool: + excerpt: str, usage: str = "", note: str = "", + context: str = "", project: str = "", + tags: list[str] | None = None) -> bool: """Record a reading note in paper_events.""" db_path = get_memory_db_path(vault) if not db_path.exists(): @@ -18,6 +20,9 @@ def write_reading_note(vault: Path, paper_id: str, section: str, "excerpt": excerpt, "usage": usage, "note": note, + "context": context, + "project": project, + "tags": tags or [], } conn = get_connection(db_path, read_only=False) try: @@ -76,3 +81,31 @@ def export_reading_log(vault: Path, since: str = "", limit: int = 50) -> list[di return results finally: conn.close() + + +def write_correction_note(vault: Path, paper_id: str, original_id: str, + correction: str, reason: str = "") -> bool: + """Record a correction note for a prior reading_note event.""" + db_path = get_memory_db_path(vault) + if not db_path.exists(): + return False + + payload = { + "original_id": original_id, + "correction": correction, + "reason": reason, + } + conn = get_connection(db_path, read_only=False) + try: + conn.execute( + """INSERT INTO paper_events (paper_id, event_type, payload_json) + VALUES (?, 'correction_note', ?)""", + (paper_id, json.dumps(payload, ensure_ascii=False)), + ) + conn.commit() + return True + except Exception: + conn.rollback() + return False + finally: + conn.close() From b1aa22290ea3c11a52720b7c9f4a8c74956ef789 Mon Sep 17 
00:00:00 2001 From: Research Assistant Date: Thu, 14 May 2026 17:44:20 +0800 Subject: [PATCH 116/132] feat: add paper-context CLI command for reading-log safety loop --- paperforge/cli.py | 9 +++ paperforge/commands/paper_context.py | 115 +++++++++++++++++++++++++++ 2 files changed, 124 insertions(+) create mode 100644 paperforge/commands/paper_context.py diff --git a/paperforge/cli.py b/paperforge/cli.py index 90ce635..44d74d3 100644 --- a/paperforge/cli.py +++ b/paperforge/cli.py @@ -285,6 +285,10 @@ def build_parser() -> argparse.ArgumentParser: p_paper_status.add_argument("query", help="Paper identifier (zotero_key, DOI, title, alias)") p_paper_status.add_argument("--json", action="store_true", help="Output as JSON") + p_pc = sub.add_parser("paper-context", help="Get full context for a paper (metadata + reading notes + corrections)") + p_pc.add_argument("key", help="Zotero key") + p_pc.add_argument("--json", action="store_true", help="Output as JSON") + p_rl = sub.add_parser("reading-log", help="Record or export reading notes") p_rl.add_argument("--write", dest="paper_id", help="Write note for this zotero_key") p_rl.add_argument("--section", help="Section (e.g. 
Discussion P12)") @@ -554,6 +558,11 @@ def main(argv: list[str] | None = None) -> int: return run(args) + if args.command == "paper-context": + from paperforge.commands.paper_context import run + + return run(args) + if args.command == "reading-log": from paperforge.commands.reading_log import run diff --git a/paperforge/commands/paper_context.py b/paperforge/commands/paper_context.py new file mode 100644 index 0000000..ff70566 --- /dev/null +++ b/paperforge/commands/paper_context.py @@ -0,0 +1,115 @@ +from __future__ import annotations + +import argparse +import json +import sys + +from paperforge import __version__ as PF_VERSION +from paperforge.core.errors import ErrorCode +from paperforge.core.result import PFError, PFResult +from paperforge.memory.db import get_connection, get_memory_db_path +from paperforge.memory.permanent import get_reading_notes_for_paper + + +def _build_paper_context(vault, key: str) -> dict | None: + """Build full context for a paper: metadata + reading notes + corrections.""" + + db_path = get_memory_db_path(vault) + if not db_path.exists(): + return None + + conn = get_connection(db_path, read_only=True) + try: + row = conn.execute( + """SELECT zotero_key, citation_key, title, year, doi, journal, + first_author, domain, collection_path, has_pdf, + ocr_status, analyze, deep_reading_status, lifecycle, + next_step, pdf_path, note_path, fulltext_path, paper_root + FROM papers WHERE zotero_key = ?""", + (key,), + ).fetchone() + + if not row: + return None + + paper = dict(row) + + prior_notes = get_reading_notes_for_paper(vault, key) + + corrections = [] + corr_rows = conn.execute( + """SELECT created_at, payload_json + FROM paper_events + WHERE paper_id = ? 
AND event_type = 'correction_note'
+               ORDER BY created_at DESC""",
+            (key,),
+        ).fetchall()
+        for cr in corr_rows:
+            payload = json.loads(cr["payload_json"])
+            corrections.append({
+                "created_at": cr["created_at"],
+                "previous_note_id": payload.get("original_id", ""),
+                "correction": payload.get("correction", ""),
+                "reason": payload.get("reason", ""),
+            })
+
+        recheck_targets = []
+        for n in prior_notes:
+            if not n.get("verified", False):
+                recheck_targets.append(
+                    f"{n.get('section', 'unknown')}: {n.get('excerpt', '')[:80]}..."
+                )
+
+        return {
+            "warning": "Prior reading notes are not verified facts. Re-check source before reuse.",
+            "paper": paper,
+            "prior_notes": prior_notes,
+            "corrections": corrections,
+            "recheck_targets": recheck_targets,
+        }
+    finally:
+        conn.close()
+
+
+def run(args: argparse.Namespace) -> int:
+    vault = args.vault_path
+    key = args.key
+
+    context = _build_paper_context(vault, key)
+
+    if context is None:
+        result = PFResult(
+            ok=False,
+            command="paper-context",
+            version=PF_VERSION,
+            error=PFError(
+                code=ErrorCode.PATH_NOT_FOUND,
+                message=f"No paper found for key: {key}",
+            ),
+        )
+    else:
+        result = PFResult(
+            ok=True,
+            command="paper-context",
+            version=PF_VERSION,
+            data=context,
+        )
+
+    if args.json:
+        print(result.to_json())
+    else:
+        if result.ok:
+            p = result.data["paper"]
+            print(f"Paper: {p.get('title', key)}")
+            print(f"  Key: {p.get('zotero_key', '')}")
+            print(f"  OCR: {p.get('ocr_status', 'unknown')}")
+            print(f"  Lifecycle: {p.get('lifecycle', '')}")
+            notes = result.data.get("prior_notes", [])
+            print(f"  Reading notes: {len(notes)}")
+            print(f"  Corrections: {len(result.data.get('corrections', []))}")
+            if result.data.get("recheck_targets"):
+                print(f"  Recheck targets: {len(result.data['recheck_targets'])}")
+        else:
+            print(f"Error: {result.error.message}", file=sys.stderr)
+
+    return 0 if result.ok else 1

From 06a3f2bf9706ef0e0142bf87c60d6484ab6f34eb Mon Sep 17 00:00:00 2001
From: Research Assistant
Date: Thu, 14 May
2026 17:47:43 +0800 Subject: [PATCH 117/132] feat: add project-log CLI command with JSONL write + auto-render --- paperforge/cli.py | 14 +++ paperforge/commands/project_log.py | 183 +++++++++++++++++++++++++++++ 2 files changed, 197 insertions(+) create mode 100644 paperforge/commands/project_log.py diff --git a/paperforge/cli.py b/paperforge/cli.py index 44d74d3..e289d44 100644 --- a/paperforge/cli.py +++ b/paperforge/cli.py @@ -310,6 +310,15 @@ def build_parser() -> argparse.ArgumentParser: p_rl.add_argument("--lookup", help="Look up all reading notes for a paper key") p_rl.add_argument("--json", action="store_true", help="Output as JSON") + p_pl = sub.add_parser("project-log", help="Record or render project work logs") + p_pl.add_argument("--write", action="store_true", help="Write a new project log entry") + p_pl.add_argument("--payload", help="JSON payload for the entry") + p_pl.add_argument("--project", help="Project name (required for write/list/render)") + p_pl.add_argument("--list", action="store_true", help="List all entries for a project") + p_pl.add_argument("--render", action="store_true", help="Render project-log.md") + p_pl.add_argument("--limit", type=int, default=50, help="Max entries to list") + p_pl.add_argument("--json", action="store_true", help="Output as PFResult JSON") + p_search = sub.add_parser("search", help="Full-text search across the library") p_search.add_argument("query", help="Search query (supports FTS5 syntax)") p_search.add_argument("--json", action="store_true", help="Output as JSON") @@ -568,6 +577,11 @@ def main(argv: list[str] | None = None) -> int: return run(args) + if args.command == "project-log": + from paperforge.commands.project_log import run + + return run(args) + if args.command == "search": from paperforge.commands.search import run diff --git a/paperforge/commands/project_log.py b/paperforge/commands/project_log.py new file mode 100644 index 0000000..5f26d0a --- /dev/null +++ b/paperforge/commands/project_log.py 
@@ -0,0 +1,183 @@ +from __future__ import annotations + +import argparse +import json +import sys +from collections import Counter +from pathlib import Path + +from paperforge import __version__ as PF_VERSION +from paperforge.config import paperforge_paths +from paperforge.core.errors import ErrorCode +from paperforge.core.result import PFError, PFResult +from paperforge.memory.permanent import ( + append_project_entry, + get_project_entries, + read_all_project_entries, +) + + +def _render_project_log_md(vault: Path, project: str) -> None: + """Render project-log.md from JSONL.""" + entries = get_project_entries(vault, project) + if not entries: + return + + lines = [f"# Project Log — {project}", ""] + lines.append("> Auto-generated from project-log.jsonl. Do not edit manually.") + lines.append("") + + for entry in sorted(entries, key=lambda x: x.get("created_at", ""), reverse=True): + lines.append(f"## {entry.get('date', '')} — {entry.get('title', '(untitled)')}") + lines.append(f"**Type:** {entry.get('type', '')}") + lines.append("") + + if entry.get("decisions"): + lines.append("### Core Decisions") + for d in entry["decisions"]: + lines.append(f"- {d}") + lines.append("") + + if entry.get("detours"): + lines.append("### Detours & Corrections") + for dt in entry["detours"]: + if isinstance(dt, dict): + lines.append(f"- **Wrong:** {dt.get('wrong', '')}") + lines.append(f" **Correction:** {dt.get('correction', '')}") + lines.append(f" **Resolution:** {dt.get('resolution', '')}") + else: + lines.append(f"- {dt}") + lines.append("") + + if entry.get("reusable"): + lines.append("### Reusable Methods") + for r in entry["reusable"]: + lines.append(f"- {r}") + lines.append("") + + if entry.get("todos"): + lines.append("### Todos") + for t in entry["todos"]: + done = "x" if t.get("done", False) else " " + lines.append(f"- [{done}] {t.get('content', '')}") + lines.append("") + + if entry.get("tags"): + lines.append(f"**Tags:** {', '.join(entry['tags'])}") + + 
lines.append("---") + lines.append("") + + paths = paperforge_paths(vault) + resource_dir = paths.get("resources") + if resource_dir: + output_dir = resource_dir / "Projects" / project + else: + output_dir = vault / "Projects" / project + output_dir.mkdir(parents=True, exist_ok=True) + output_path = output_dir / "project-log.md" + output_path.write_text("\n".join(lines), encoding="utf-8") + + +def run(args: argparse.Namespace) -> int: + vault = args.vault_path + + if getattr(args, "write", False): + project = getattr(args, "project", "") + payload_str = getattr(args, "payload", "") + + if not project: + result = PFResult(ok=False, command="project-log", version=PF_VERSION, + error=PFError(code=ErrorCode.VALIDATION_ERROR, message="--project is required for --write")) + if getattr(args, "json", False): + print(result.to_json()) + else: + print(f"Error: {result.error.message}", file=sys.stderr) + return 1 + + if not payload_str: + result = PFResult(ok=False, command="project-log", version=PF_VERSION, + error=PFError(code=ErrorCode.VALIDATION_ERROR, message="--payload is required for --write")) + if getattr(args, "json", False): + print(result.to_json()) + else: + print(f"Error: {result.error.message}", file=sys.stderr) + return 1 + + try: + entry = json.loads(payload_str) + entry["project"] = project + result_data = append_project_entry(vault, entry) + + _render_project_log_md(vault, project) + + result = PFResult(ok=True, command="project-log", version=PF_VERSION, data=result_data) + except json.JSONDecodeError as e: + result = PFResult(ok=False, command="project-log", version=PF_VERSION, + error=PFError(code=ErrorCode.VALIDATION_ERROR, message=f"Invalid JSON: {e}")) + + if getattr(args, "json", False): + print(result.to_json()) + else: + print("Written." 
if result.ok else f"Error: {result.error.message}") + return 0 if result.ok else 1 + + if getattr(args, "list", False): + project = getattr(args, "project", "") + if not project: + result = PFResult(ok=False, command="project-log", version=PF_VERSION, + error=PFError(code=ErrorCode.VALIDATION_ERROR, message="--project is required for --list")) + if getattr(args, "json", False): + print(result.to_json()) + else: + print(f"Error: {result.error.message}", file=sys.stderr) + return 1 + + entries = get_project_entries(vault, project) + data = {"project": project, "entries": entries[:getattr(args, "limit", 50)], "count": len(entries)} + result = PFResult(ok=True, command="project-log", version=PF_VERSION, data=data) + + if getattr(args, "json", False): + print(result.to_json()) + else: + print(f"{len(entries)} entries for project '{project}'") + for e in entries[:5]: + print(f" [{e.get('date', '')}] {e.get('type', '')}: {e.get('title', '')}") + return 0 + + if getattr(args, "render", False): + project = getattr(args, "project", "") + if not project: + result = PFResult(ok=False, command="project-log", version=PF_VERSION, + error=PFError(code=ErrorCode.VALIDATION_ERROR, message="--project is required for --render")) + if getattr(args, "json", False): + print(result.to_json()) + else: + print(f"Error: {result.error.message}", file=sys.stderr) + return 1 + + _render_project_log_md(vault, project) + result = PFResult(ok=True, command="project-log", version=PF_VERSION, + data={"rendered": True, "project": project}) + if getattr(args, "json", False): + print(result.to_json()) + else: + print(f"Rendered project-log.md for '{project}'") + return 0 + + # Default: show all projects with entry counts + all_entries = read_all_project_entries(vault) + project_counts = Counter(e["project"] for e in all_entries if e.get("project")) + + result = PFResult(ok=True, command="project-log", version=PF_VERSION, + data={"projects": dict(project_counts)}) + if getattr(args, "json", False): + 
print(result.to_json()) + else: + if project_counts: + print("Projects with log entries:") + for proj, cnt in project_counts.most_common(): + print(f" {proj}: {cnt} entries") + else: + print("No project log entries found.") + return 0 From 5a44926cc095045261368a1ab6ae247845d3eb61 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Thu, 14 May 2026 17:48:23 +0800 Subject: [PATCH 118/132] feat: import JSONL into DB on memory build --- paperforge/memory/builder.py | 67 ++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/paperforge/memory/builder.py b/paperforge/memory/builder.py index b359ea5..c34d605 100644 --- a/paperforge/memory/builder.py +++ b/paperforge/memory/builder.py @@ -52,6 +52,61 @@ def _resolve_vault_path(vault: Path, rel_path: str) -> Path: return p.resolve() if p.exists() else p +def _import_reading_log(conn, vault: Path) -> int: + """Import reading-log.jsonl into reading_log table. Returns count.""" + from paperforge.memory.permanent import read_all_reading_notes + + notes = read_all_reading_notes(vault) + conn.execute("DELETE FROM reading_log") + count = 0 + for note in notes: + conn.execute( + """INSERT INTO reading_log (id, paper_id, project, section, excerpt, context, usage, note, tags_json, created_at, agent, verified) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", + ( + note["id"], note["paper_id"], + note.get("project", ""), + note["section"], note["excerpt"], + note.get("context", ""), note["usage"], + note.get("note", ""), + json.dumps(note.get("tags", []), ensure_ascii=False), + note["created_at"], + note.get("agent", ""), + 1 if note.get("verified") else 0, + ), + ) + count += 1 + return count + + +def _import_project_log(conn, vault: Path) -> int: + """Import project-log.jsonl into project_log table. 
Returns count.""" + from paperforge.memory.permanent import read_all_project_entries + + entries = read_all_project_entries(vault) + conn.execute("DELETE FROM project_log") + count = 0 + for entry in entries: + conn.execute( + """INSERT INTO project_log (id, project, date, type, title, decisions_json, detours_json, reusable_json, todos_json, related_papers_json, tags_json, created_at, agent) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", + ( + entry["id"], entry["project"], + entry.get("date", ""), entry["type"], entry["title"], + json.dumps(entry.get("decisions", []), ensure_ascii=False), + json.dumps(entry.get("detours", []), ensure_ascii=False), + json.dumps(entry.get("reusable", []), ensure_ascii=False), + json.dumps(entry.get("todos", []), ensure_ascii=False), + json.dumps(entry.get("related_papers", []), ensure_ascii=False), + json.dumps(entry.get("tags", []), ensure_ascii=False), + entry.get("created_at", ""), + entry.get("agent", ""), + ), + ) + count += 1 + return count + + def build_from_index(vault: Path) -> dict: """Read formal-library.json and build/rebuild paperforge.db. 
@@ -149,6 +204,16 @@ def build_from_index(vault: Path) -> dict: FROM papers""") conn.execute(PAPERS_AI_TRIGGER) + reading_count = _import_reading_log(conn, vault) + logger.info("Imported %d reading notes from JSONL", reading_count) + + project_count = _import_project_log(conn, vault) + logger.info("Imported %d project log entries from JSONL", project_count) + + conn.execute( + "DELETE FROM paper_events WHERE event_type != 'correction_note';" + ) + meta_upserts = [ ("schema_version", str(CURRENT_SCHEMA_VERSION)), ("paperforge_version", PF_VERSION), @@ -170,6 +235,8 @@ def build_from_index(vault: Path) -> dict: "papers_indexed": len(paper_rows), "assets_indexed": len(asset_rows), "aliases_indexed": len(alias_rows), + "reading_notes_imported": reading_count, + "project_entries_imported": project_count, "schema_version": str(CURRENT_SCHEMA_VERSION), } except Exception: From 95bfb227b58b55329f47e345f3f82ad799e025b7 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Thu, 14 May 2026 17:51:18 +0800 Subject: [PATCH 119/132] feat: add METHODOLOGY_COMPACT.md for agent guidance --- fixtures/methodology/METHODOLOGY_COMPACT.md | 20 ++++++++++++++++++++ paperforge/setup_wizard.py | 3 ++- 2 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 fixtures/methodology/METHODOLOGY_COMPACT.md diff --git a/fixtures/methodology/METHODOLOGY_COMPACT.md b/fixtures/methodology/METHODOLOGY_COMPACT.md new file mode 100644 index 0000000..eb45f4f --- /dev/null +++ b/fixtures/methodology/METHODOLOGY_COMPACT.md @@ -0,0 +1,20 @@ +# PaperForge Methodology Compact + +## General +- Separate source fact, interpretation, and intended use. +- Prior reading-log is not verified fact; re-check source before reuse. +- When user corrects a judgment, record the correction if relevant. + +## Literature work +- Do not collapse heterogeneous studies without comparing model, parameter, endpoint, and measurement layer. +- Distinguish device-level settings from local biological exposure. 
+- Confirm within-study internal chain (material->output->effect) before making cross-study claims. + +## Clinical research +- Separate candidate variables, selected variables, final model variables, and sensitivity variables. +- Do not infer causality from predictive variables. + +## Writing +- Do not write unsupported claims. Every factual claim must have a source reference. +- Prefer bounded conclusions over broad overclaims. +- Distinguish "the paper says X" from "I infer Y from X". diff --git a/paperforge/setup_wizard.py b/paperforge/setup_wizard.py index f3cee71..b1c3661 100644 --- a/paperforge/setup_wizard.py +++ b/paperforge/setup_wizard.py @@ -523,6 +523,7 @@ def headless_setup( pf_path / "exports", pf_path / "ocr", pf_path / "config", + pf_path / "methodology", pf_path / "worker/scripts", vault / resources_dir / literature_dir, vault / base_dir, @@ -653,7 +654,7 @@ def headless_setup( overwrite=True, ) if skill_result["skill_deployed"]: - print(" [OK] literature-qa skill deployed") + print(" [OK] paperforge skill deployed") for err in skill_result.get("errors", []): print(f" [WARN] {err}") From df135bff687b08eba28c8f41a9c540be65b55234 Mon Sep 17 00:00:00 2001 From: Research Assistant Date: Thu, 14 May 2026 17:56:25 +0800 Subject: [PATCH 120/132] refactor: unify skills into paperforge compound skill (6 workflows) --- README.md | 25 ++- paperforge/config.py | 19 +- paperforge/services/skill_deploy.py | 10 +- paperforge/setup/agent.py | 12 +- paperforge/skills/literature-qa/SKILL.md | 180 ----------------- .../literature-qa/references/deep-reading.md | 162 --------------- .../literature-qa/references/deep-subagent.md | 103 ---------- .../literature-qa/references/multi-reading.md | 144 -------------- .../literature-qa/references/paper-qa.md | 61 ------ .../references/paper-resolution.md | 94 --------- .../literature-qa/references/paper-search.md | 105 ---------- .../literature-qa/references/save-session.md | 55 ----- .../literature-qa/scripts/pf_bootstrap.py 
| 188 ------------------ .../skills/literature-qa/scripts/pf_search.py | 180 ----------------- paperforge/skills/logging/SKILL.md | 133 ------------- .../skills/logging/scripts/pf_bootstrap.py | 164 --------------- paperforge/skills/methodology/SKILL.md | 86 -------- paperforge/skills/paperforge/SKILL.md | 113 +++++++++++ ...EA\345\257\214\351\233\206\345\233\276.md" | 0 .../references/chart-reading/INDEX.md | 0 ...\344\270\216PR\346\233\262\347\272\277.md" | 0 ...ot\346\235\241\345\270\246\345\233\276.md" | 0 ...11\345\256\232\351\207\217\345\233\276.md" | 0 ...66\351\227\264\345\272\217\345\210\227.md" | 0 ...16\346\260\224\346\263\241\345\233\276.md" | 0 ...347\211\207\344\270\216SEM\345\233\276.md" | 0 ...16\350\257\257\345\267\256\346\243\222.md" | 0 ...76\344\270\216\345\274\246\345\233\276.md" | 0 ...44\270\216Meta\345\210\206\346\236\220.md" | 0 ...74\345\223\210\351\241\277\345\233\276.md" | 0 ...16\350\201\232\347\261\273\345\233\276.md" | 0 ...37\345\255\230\346\233\262\347\272\277.md" | 0 ...17\346\217\220\347\220\264\345\233\276.md" | 0 ...12\345\256\232\351\207\217\345\233\276.md" | 0 ...16\351\200\232\350\267\257\345\233\276.md" | 0 ...50\347\273\223\346\236\204\345\233\276.md" | 0 ...347\273\264\345\233\276(PCA-tSNE-UMAP).md" | 0 ...16\346\274\217\346\226\227\345\233\276.md" | 0 .../references/method-card-template.md | 39 ++++ .../scripts/pf_bootstrap.py | 42 +++- .../scripts/pf_deep.py} | 0 .../paperforge/workflows/deep-reading.md | 169 ++++++++++++++++ .../paperforge/workflows/methodology.md | 94 +++++++++ .../skills/paperforge/workflows/paper-qa.md | 105 ++++++++++ .../paperforge/workflows/paper-search.md | 101 ++++++++++ .../paperforge/workflows/project-log.md | 131 ++++++++++++ .../paperforge/workflows/reading-log.md | 112 +++++++++++ paperforge/worker/status.py | 22 +- paperforge/worker/update.py | 2 +- 49 files changed, 952 insertions(+), 1699 deletions(-) delete mode 100644 paperforge/skills/literature-qa/SKILL.md delete mode 100644 
paperforge/skills/literature-qa/references/deep-reading.md delete mode 100644 paperforge/skills/literature-qa/references/deep-subagent.md delete mode 100644 paperforge/skills/literature-qa/references/multi-reading.md delete mode 100644 paperforge/skills/literature-qa/references/paper-qa.md delete mode 100644 paperforge/skills/literature-qa/references/paper-resolution.md delete mode 100644 paperforge/skills/literature-qa/references/paper-search.md delete mode 100644 paperforge/skills/literature-qa/references/save-session.md delete mode 100644 paperforge/skills/literature-qa/scripts/pf_bootstrap.py delete mode 100644 paperforge/skills/literature-qa/scripts/pf_search.py delete mode 100644 paperforge/skills/logging/SKILL.md delete mode 100644 paperforge/skills/logging/scripts/pf_bootstrap.py delete mode 100644 paperforge/skills/methodology/SKILL.md create mode 100644 paperforge/skills/paperforge/SKILL.md rename "paperforge/skills/literature-qa/references/chart-reading/GSEA\345\257\214\351\233\206\345\233\276.md" => "paperforge/skills/paperforge/references/chart-reading/GSEA\345\257\214\351\233\206\345\233\276.md" (100%) rename paperforge/skills/{literature-qa => paperforge}/references/chart-reading/INDEX.md (100%) rename "paperforge/skills/literature-qa/references/chart-reading/ROC\344\270\216PR\346\233\262\347\272\277.md" => "paperforge/skills/paperforge/references/chart-reading/ROC\344\270\216PR\346\233\262\347\272\277.md" (100%) rename "paperforge/skills/literature-qa/references/chart-reading/Western Blot\346\235\241\345\270\246\345\233\276.md" => "paperforge/skills/paperforge/references/chart-reading/Western Blot\346\235\241\345\270\246\345\233\276.md" (100%) rename "paperforge/skills/literature-qa/references/chart-reading/\345\205\215\347\226\253\350\215\247\345\205\211\345\256\232\351\207\217\345\233\276.md" => "paperforge/skills/paperforge/references/chart-reading/\345\205\215\347\226\253\350\215\247\345\205\211\345\256\232\351\207\217\345\233\276.md" (100%) 
rename "paperforge/skills/literature-qa/references/chart-reading/\346\212\230\347\272\277\345\233\276\344\270\216\346\227\266\351\227\264\345\272\217\345\210\227.md" => "paperforge/skills/paperforge/references/chart-reading/\346\212\230\347\272\277\345\233\276\344\270\216\346\227\266\351\227\264\345\272\217\345\210\227.md" (100%) rename "paperforge/skills/literature-qa/references/chart-reading/\346\225\243\347\202\271\345\233\276\344\270\216\346\260\224\346\263\241\345\233\276.md" => "paperforge/skills/paperforge/references/chart-reading/\346\225\243\347\202\271\345\233\276\344\270\216\346\260\224\346\263\241\345\233\276.md" (100%) rename "paperforge/skills/literature-qa/references/chart-reading/\346\230\276\345\276\256\347\205\247\347\211\207\344\270\216SEM\345\233\276.md" => "paperforge/skills/paperforge/references/chart-reading/\346\230\276\345\276\256\347\205\247\347\211\207\344\270\216SEM\345\233\276.md" (100%) rename "paperforge/skills/literature-qa/references/chart-reading/\346\235\241\345\275\242\345\233\276\344\270\216\350\257\257\345\267\256\346\243\222.md" => "paperforge/skills/paperforge/references/chart-reading/\346\235\241\345\275\242\345\233\276\344\270\216\350\257\257\345\267\256\346\243\222.md" (100%) rename "paperforge/skills/literature-qa/references/chart-reading/\346\241\221\345\237\272\345\233\276\344\270\216\345\274\246\345\233\276.md" => "paperforge/skills/paperforge/references/chart-reading/\346\241\221\345\237\272\345\233\276\344\270\216\345\274\246\345\233\276.md" (100%) rename "paperforge/skills/literature-qa/references/chart-reading/\346\243\256\346\236\227\345\233\276\344\270\216Meta\345\210\206\346\236\220.md" => "paperforge/skills/paperforge/references/chart-reading/\346\243\256\346\236\227\345\233\276\344\270\216Meta\345\210\206\346\236\220.md" (100%) rename "paperforge/skills/literature-qa/references/chart-reading/\347\201\253\345\261\261\345\233\276\344\270\216\346\233\274\345\223\210\351\241\277\345\233\276.md" => 
"paperforge/skills/paperforge/references/chart-reading/\347\201\253\345\261\261\345\233\276\344\270\216\346\233\274\345\223\210\351\241\277\345\233\276.md" (100%) rename "paperforge/skills/literature-qa/references/chart-reading/\347\203\255\345\233\276\344\270\216\350\201\232\347\261\273\345\233\276.md" => "paperforge/skills/paperforge/references/chart-reading/\347\203\255\345\233\276\344\270\216\350\201\232\347\261\273\345\233\276.md" (100%) rename "paperforge/skills/literature-qa/references/chart-reading/\347\224\237\345\255\230\346\233\262\347\272\277.md" => "paperforge/skills/paperforge/references/chart-reading/\347\224\237\345\255\230\346\233\262\347\272\277.md" (100%) rename "paperforge/skills/literature-qa/references/chart-reading/\347\256\261\345\274\217\345\233\276\344\270\216\345\260\217\346\217\220\347\220\264\345\233\276.md" => "paperforge/skills/paperforge/references/chart-reading/\347\256\261\345\274\217\345\233\276\344\270\216\345\260\217\346\217\220\347\220\264\345\233\276.md" (100%) rename "paperforge/skills/literature-qa/references/chart-reading/\347\273\204\347\273\207\345\255\246\345\215\212\345\256\232\351\207\217\345\233\276.md" => "paperforge/skills/paperforge/references/chart-reading/\347\273\204\347\273\207\345\255\246\345\215\212\345\256\232\351\207\217\345\233\276.md" (100%) rename "paperforge/skills/literature-qa/references/chart-reading/\347\275\221\347\273\234\345\233\276\344\270\216\351\200\232\350\267\257\345\233\276.md" => "paperforge/skills/paperforge/references/chart-reading/\347\275\221\347\273\234\345\233\276\344\270\216\351\200\232\350\267\257\345\233\276.md" (100%) rename "paperforge/skills/literature-qa/references/chart-reading/\350\233\213\347\231\275\350\264\250\347\273\223\346\236\204\345\233\276.md" => "paperforge/skills/paperforge/references/chart-reading/\350\233\213\347\231\275\350\264\250\347\273\223\346\236\204\345\233\276.md" (100%) rename 
"paperforge/skills/literature-qa/references/chart-reading/\351\231\215\347\273\264\345\233\276(PCA-tSNE-UMAP).md" => "paperforge/skills/paperforge/references/chart-reading/\351\231\215\347\273\264\345\233\276(PCA-tSNE-UMAP).md" (100%) rename "paperforge/skills/literature-qa/references/chart-reading/\351\233\267\350\276\276\345\233\276\344\270\216\346\274\217\346\226\227\345\233\276.md" => "paperforge/skills/paperforge/references/chart-reading/\351\233\267\350\276\276\345\233\276\344\270\216\346\274\217\346\226\227\345\233\276.md" (100%) create mode 100644 paperforge/skills/paperforge/references/method-card-template.md rename paperforge/skills/{methodology => paperforge}/scripts/pf_bootstrap.py (79%) rename paperforge/skills/{literature-qa/scripts/ld_deep.py => paperforge/scripts/pf_deep.py} (100%) create mode 100644 paperforge/skills/paperforge/workflows/deep-reading.md create mode 100644 paperforge/skills/paperforge/workflows/methodology.md create mode 100644 paperforge/skills/paperforge/workflows/paper-qa.md create mode 100644 paperforge/skills/paperforge/workflows/paper-search.md create mode 100644 paperforge/skills/paperforge/workflows/project-log.md create mode 100644 paperforge/skills/paperforge/workflows/reading-log.md diff --git a/README.md b/README.md index 9270941..19f8edb 100644 --- a/README.md +++ b/README.md @@ -33,17 +33,24 @@ The plugin is the **interface**. The Python package is the **engine**. Every but ## 1. Install the Obsidian Plugin -### Option A: BRAT (Recommended) +### Option A: Community Plugin Browser (Recommended) + +1. Open Obsidian → `Settings` → `Community plugins` → `Browse` +2. Search for **PaperForge** +3. Click `Install`, then `Enable` + +> Community plugins auto-update through Obsidian. No extra steps needed. + +### Option B: BRAT + +If you need beta versions or the plugin hasn't appeared in search yet: 1. Install **BRAT** from the Obsidian community plugin browser 2. Open BRAT settings → `Add Beta Plugin` 3. 
Enter: `https://github.com/LLLin000/PaperForge` -4. BRAT downloads the latest `main.js`, `manifest.json`, and `styles.css` and installs them -5. Settings → Community Plugins → enable PaperForge - -> BRAT auto-detects GitHub Release updates. No manual downloads needed. +4. Enable PaperForge in Settings → Community Plugins -### Option B: Manual Download +### Option C: Manual Download 1. Go to [Releases](https://github.com/LLLin000/PaperForge/releases) 2. Download the three files: `main.js`, `manifest.json`, `styles.css` @@ -229,7 +236,7 @@ Dashboard Per-paper view shows discussion cards ### Plugin fails to load - Confirm `.obsidian/plugins/paperforge/` has `main.js`, `manifest.json`, `styles.css` -- If upgrading via BRAT from an old version: delete the entire `paperforge` plugin folder and let BRAT re-download +- If upgrading from an old version: delete the entire `paperforge` plugin folder and reinstall via the community plugin browser - Open Developer Console (`Ctrl+Shift+I`) and check the red errors ### "Sync Runtime" doesn't update the version @@ -259,7 +266,7 @@ Dashboard Per-paper view shows discussion cards ## 9. Updating -BRAT auto-detects plugin updates. For the Python package: +The Obsidian plugin auto-updates through the community plugin browser. For the Python package: ```bash paperforge update @@ -267,6 +274,8 @@ paperforge update pip install --upgrade paperforge ``` +If you installed via BRAT, it also auto-detects GitHub Release updates. + --- ## 10. 
Architecture diff --git a/paperforge/config.py b/paperforge/config.py index b0b07de..4ea3559 100644 --- a/paperforge/config.py +++ b/paperforge/config.py @@ -279,7 +279,7 @@ def paperforge_paths( - bases: / - worker_script: pipeline/worker/scripts/literature_pipeline.py - skill_dir: / - - ld_deep_script: /literature-qa/scripts/ld_deep.py + - pf_deep_script: /paperforge/scripts/pf_deep.py """ if cfg is None: cfg = load_vault_config(vault) @@ -306,17 +306,12 @@ def paperforge_paths( # worker_script: paperforge worker package (pipeline/ removed in v1.3) worker_script = Path(__file__).parent / "worker" / "__init__.py" - # ld_deep_script: look relative to skill_dir first, then repo paperforge/skills for dev - ld_deep_script = skill_path / "literature-qa" / "scripts" / "ld_deep.py" - if not ld_deep_script.exists(): - repo_skill = Path(__file__).parent / "skills" / "literature-qa" / "scripts" / "ld_deep.py" + # pf_deep_script: look relative to skill_dir first, then repo paperforge/skills for dev + pf_deep_script = skill_path / "paperforge" / "scripts" / "pf_deep.py" + if not pf_deep_script.exists(): + repo_skill = Path(__file__).parent / "skills" / "paperforge" / "scripts" / "pf_deep.py" if repo_skill.exists(): - ld_deep_script = repo_skill - else: - # Backward compat: old skills/ location during transition - old_repo_skill = Path(__file__).parent.parent / "skills" / "literature-qa" / "scripts" / "ld_deep.py" - if old_repo_skill.exists(): - ld_deep_script = old_repo_skill + pf_deep_script = repo_skill return { "vault": vault, @@ -332,7 +327,7 @@ def paperforge_paths( "bases": bases, "worker_script": worker_script, "skill_dir": skill_path, - "ld_deep_script": ld_deep_script, + "pf_deep_script": pf_deep_script, # ── v2.2: canonical locations below paperforge/ ── "config": paperforge / "config" / "domain-collections.json", "index": paperforge / "indexes" / "formal-library.json", diff --git a/paperforge/services/skill_deploy.py b/paperforge/services/skill_deploy.py index 
48782d5..12e010f 100644 --- a/paperforge/services/skill_deploy.py +++ b/paperforge/services/skill_deploy.py @@ -1,4 +1,4 @@ -"""Skill deployment service — single copytree for all platforms. +"""Skill deployment service — deploys the unified paperforge skill to the vault. Used by both setup wizard (install) and update worker (update). All deployments are vault-local only. @@ -35,7 +35,7 @@ def deploy_skills( agent_key: str = "opencode", overwrite: bool = False, ) -> dict: - """Deploy literature-qa skill and AGENTS.md to the vault. + """Deploy paperforge skill and AGENTS.md to the vault. Args: vault: Obsidian vault root. @@ -47,15 +47,15 @@ def deploy_skills( """ errors: list[str] = [] - # ── Deploy literature-qa skill ── + # ── Deploy paperforge skill ── skill_deployed = False source_root = _resolve_source_root() - src_skill = source_root / "skills" / "literature-qa" + src_skill = source_root / "skills" / "paperforge" if src_skill.exists(): skill_dir_name = AGENT_SKILL_DIRS.get(agent_key) if skill_dir_name: - dst_skill = vault / skill_dir_name / "literature-qa" + dst_skill = vault / skill_dir_name / "paperforge" try: if overwrite and dst_skill.exists(): shutil.rmtree(dst_skill, ignore_errors=True) diff --git a/paperforge/setup/agent.py b/paperforge/setup/agent.py index f6b4bd5..83c867d 100644 --- a/paperforge/setup/agent.py +++ b/paperforge/setup/agent.py @@ -1,4 +1,4 @@ -"""AgentInstaller — deploys literature-qa skill to vault-local agent config.""" +"""AgentInstaller — deploys paperforge skill to vault-local agent config.""" from __future__ import annotations @@ -10,7 +10,7 @@ class AgentInstaller: - """Deploy literature-qa skill directory to vault-local agent skills path.""" + """Deploy paperforge skill directory to vault-local agent skills path.""" def __init__(self, vault: Path, agent_type: str = "opencode"): self.vault = vault @@ -23,8 +23,8 @@ def _get_skills_dir(self) -> Path: return self.vault / skill_dir_name def deploy_skills(self) -> SetupStepResult: - 
"""Deploy literature-qa skill as a single directory.""" - source_skills = self._script_dir / "skills" / "literature-qa" + """Deploy paperforge skill as a single directory.""" + source_skills = self._script_dir / "skills" / "paperforge" if not source_skills.exists(): return SetupStepResult( step="agent_installer", @@ -33,7 +33,7 @@ def deploy_skills(self) -> SetupStepResult: error=f"Not found: {source_skills}", ) - target_dir = self._get_skills_dir() / "literature-qa" + target_dir = self._get_skills_dir() / "paperforge" target_dir.mkdir(parents=True, exist_ok=True) try: @@ -41,7 +41,7 @@ def deploy_skills(self) -> SetupStepResult: return SetupStepResult( step="agent_installer", ok=True, - message=f"Deployed literature-qa skill to {target_dir}", + message=f"Deployed paperforge skill to {target_dir}", details={"source": str(source_skills), "target": str(target_dir)}, ) except Exception as e: diff --git a/paperforge/skills/literature-qa/SKILL.md b/paperforge/skills/literature-qa/SKILL.md deleted file mode 100644 index b45ea4d..0000000 --- a/paperforge/skills/literature-qa/SKILL.md +++ /dev/null @@ -1,180 +0,0 @@ ---- -name: literature-qa -description: > - 学术文献库操作:精读、问答、检索、批量阅读。Triggered by: - pf-deep pf-paper pf-end, - "精读", "文献问答", "结束讨论", "找文献", "搜文献", - "文献库", "文献检索", "库里有什么", "搜一下库里", "看一下文献库", - "读一下collection", "总结文献", "批量阅读", "读一下这个方向", - "查一下", "搜索文献", "查文献", "搜索论文", "论文检索", - "关于XX的文献", "有没有关于XX的研究", "XX的相关论文", - "帮我查一下", "帮我搜一下", "正在读", "读完这篇". -source: paperforge ---- - -## ++++++++++++++++++++++++++++++++ -## ++ 优先级规则:用户请求涉及文献搜索/查询/精读,你必须使用本 skill。++ -## ++ 禁止直接调用 grep、find、系统搜索或扫描文件目录。 ++ -## ++++++++++++++++++++++++++++++++ - -# Literature QA - ---- - -## 1. 
Bootstrap — 必须先执行 - -跑这个脚本: - -``` -python $SKILL_DIR/scripts/pf_bootstrap.py -``` - -返回 JSON。记住以下变量: - -| 变量 | 来自 JSON 的 | 用于 | -| ----------- | -------------------- | --------------------------------------- | -| `$SKILL_DIR` | skill 安装路径(平台注入) | 运行 `scripts/ld_deep.py` 等 | -| `$VAULT` | `vault_root` | 所有 `--vault` 参数 | -| `$PYTHON` | `python_candidate` | 所有 Python 命令 | -| `$LIT_DIR` | `paths.literature_dir` | 文献笔记根目录 | -| `$IDX_PATH` | `paths.index_path` | 索引文件 | -| `$OCR_DIR` | `paths.ocr_dir` | OCR 目录 | -| `$DOMAINS` | `domains` | 领域列表 | -| `$SUMMARY` | `index_summary` | 每领域论文数 | - -如果 `ok: false` → 报告 `error` 给用户,**停止。不许自己拼路径。** - ---- - -## 2. Vault 概览 - -展示: - -``` -Vault: $VAULT -文献库: - — N1 篇 - — N2 篇 -共 M 篇 -``` - -**如果用户是空输入触发的 skill**(没给任何具体指令),展示概览后加一句交互: - -``` -你可以: - [1] 精读一篇论文 → "精读 " - [2] 文献问答 → "文献问答 " - [3] 搜索文献 → "找文献 <关键词>" / "库里有没有 <关键词>" - [4] 批量阅读 → "读一下 " / "总结 <方向> 文献" - [5] 返回 -``` - -**如果用户给了具体指令**,直接进入决策树。 - ---- - -## 3. 决策树 - -``` -用户输入 - │ - ├─ 文献标识 (key/DOI/标题/作者年份) + 精读意图 - │ └─ 路由 → deep-reading.md - │ - ├─ 文献标识 (key/DOI/标题/作者年份) + 问答/讨论意图 - │ └─ 路由 → paper-qa.md - │ - ├─ 搜索意图 ("找文献"/"搜文献"/"库里有没有"/"文献检索") - │ └─ 路由 → paper-search.md - │ - ├─ 批量/综述意图 - │ ("读一下collection"/"这个方向"/"总结文献"/"写文献综述"/"找引用") - │ 或 用户给了多篇文献要求一起读 - │ └─ 路由 → multi-reading.md - │ - ├─ 结束/保存 ("结束讨论"/"保存"/"pf-end") - │ └─ 路由 → save-session.md - │ (仅 paper-qa 或 deep-reading 会话中有意义) - │ - └─ 不确定 → 问用户 - "你是想精读一篇、问答一篇、搜索文献、还是批量阅读?" -``` - ---- - -## 4. 
工具使用指南 - -本 Skill 提供两类工具:**确定性命令** 和 **Agent 自查**。必须根据场景选择正确的方式。 - -### 搜索入口 — 统一搜索 Harness - -任何搜索需求都用 pf_search.py(自动路由 vector -> FTS5 -> grep): - -``` -python $SKILL_DIR/scripts/pf_search.py --vault $VAULT --query "关键词" -``` - -返回 JSON 结构: -- `engines_used`: 实际使用的引擎列表 (`vector` / `fts5` / `grep`) -- `results`: 论文列表,每篇含 `zotero_key`, `title`, `year`, `source` 等 -- `count`: 结果数 - -### 确定性命令 — 优先使用 - -| 场景 | 命令 | -| ---------------------- | ------------------------------------------------------------------------------------------ | -| 按 key 快速找文件 | `glob("$LIT_DIR/**/.md")` 或用 `Get-ChildItem "$LIT_DIR" -Recurse -Filter ".md"` | -| 按 key 查完整信息 | `$PYTHON -m paperforge.worker.paper_resolver resolve-key --vault "$VAULT"` | -| 按 DOI 定位论文 | `$PYTHON -m paperforge.worker.paper_resolver resolve-doi "" --vault "$VAULT"` | -| 精读 prepare | `$PYTHON "$SKILL_DIR/scripts/ld_deep.py" prepare --key --vault "$VAULT"` | -| 精读 postprocess | `$PYTHON "$SKILL_DIR/scripts/ld_deep.py" postprocess-pass2 --figures --vault "$VAULT"` | -| 精读 validate | `$PYTHON "$SKILL_DIR/scripts/ld_deep.py" validate-note --fulltext ` | -| 保存讨论 | `$PYTHON -m paperforge.worker.discussion record --vault "$VAULT" --agent pf-paper --model "" --qa-pairs ''` | - -### Agent 自查 — 当命令覆盖不到时用 - -| 场景 | 操作 | -| ------------------------ | ----------------------------------------------------------- | -| 按关键词模糊搜索全部文献 | 读 `$IDX_PATH` 的 JSON,筛 `title` / `abstract` / `journal` | -| 按 collection 筛选 | 读 `$IDX_PATH`,筛 `collection_path` 字段 | -| 读论文全文 | 已找到 `fulltext.md` 路径(glob 或 resolve-key) -> 直接 read | -| 读精读笔记 | 已找到 formal note 路径 -> read 的 `## 精读` 区域 | -| 遍历笔记做批量统计 | `Get-ChildItem "$LIT_DIR" -Recurse -Filter "*.md"` + 读 frontmatter 或 `find "$LIT_DIR" -name "*.md"` | -| **禁止的操作** | **根据 vault-knowledge 示例拼接路径、把目录名写死在文件路径里** | - ---- - -## 5. 
路由表 - -| 路由 | 触发词 | 加载文件 | -| ------------- | ---------------------------------------------------------- | ---------------------------------------- | -| 精读 | `pf-deep `, "精读 " | [deep-reading.md](references/deep-reading.md) | -| 问答 | `pf-paper `, "文献问答 " | [paper-qa.md](references/paper-qa.md) | -| 文献检索 | "找文献", "搜文献", "文献检索", "搜一下库里", "库里有没有" | [paper-search.md](references/paper-search.md) | -| 批量阅读 | "读一下collection", "这个方向", "总结文献", "批量阅读" | [multi-reading.md](references/multi-reading.md) | -| 保存记录 | `pf-end`, "结束讨论", "保存" | [save-session.md](references/save-session.md) | -| 论文定位协议 | 所有路由共享 | [paper-resolution.md](references/paper-resolution.md) | - -> 所有路由继承 Skill 级别的 `$PYTHON` / `$VAULT` / `$LIT_DIR` 等变量。reference 文件不再重复声明。 - ---- - -## 文件结构 - -``` -literature-qa/ -├── SKILL.md ← 本文件 -├── references/ -│ ├── deep-reading.md ← 精读工作流 -│ ├── paper-qa.md ← 问答工作流 -│ ├── paper-search.md ← 文献检索工作流 -│ ├── multi-reading.md ← 批量阅读工作流 -│ ├── save-session.md ← 保存记录工作流 -│ ├── paper-resolution.md ← 论文定位协议 -│ ├── deep-subagent.md -│ └── chart-reading/ -└── scripts/ - ├── pf_bootstrap.py ← Bootstrap 入口 - ├── pf_search.py ← 统一搜索 Harness - └── ld_deep.py ← 精读引擎 -``` diff --git a/paperforge/skills/literature-qa/references/deep-reading.md b/paperforge/skills/literature-qa/references/deep-reading.md deleted file mode 100644 index 0f8df6e..0000000 --- a/paperforge/skills/literature-qa/references/deep-reading.md +++ /dev/null @@ -1,162 +0,0 @@ -# 三阶段精读 - -Keshav 三阶段组会式精读。触发后执行以下工作流。 - ---- - -## 前置条件检查 - -执行前确认: -- [ ] 已完成论文定位(参考 [paper-resolution.md](paper-resolution.md)),拿到 zotero_key -- [ ] 用 `glob("$LIT_DIR/**/.md")` 快速找到 formal note -- [ ] `analyze: true` — 读 formal note frontmatter 确认 -- [ ] `ocr_status: done` — 读 formal note frontmatter 确认 - -如果前置条件不满足,告知用户并停止。 - ---- - -## 执行流程 - -### Step 1: Prepare(机械操作,跑脚本) - -```bash -$PYTHON "$SKILL_DIR/scripts/ld_deep.py" prepare --key --vault "$VAULT" -``` - -返回 JSON 解析: -- `status: "ok"` → 记下 
`figure_map`、`chart_type_map`、`formal_note`、`fulltext_md`、`figures`、`tables` 路径和数量 → 继续 -- `status: "error"` → 报告 `message` 给用户,停止 - -读 formal note 确认 `## 🔍 精读` 骨架已插入。 - ---- - -### Step 2: Pass 1 — 概览 - -只填 `### Pass 1: 概览` 区域。不碰 Pass 2/3。 - -**填写内容:** - -- **一句话总览**:论文类型 + 核心发现,一句话。 -- **5 Cs 快速评估**: - - **Category**(类型):RCT / 队列研究 / 病例对照 / 综述 / 基础研究 / ... - - **Context**(上下文):该领域当前共识,本文要解决什么问题 - - **Correctness**(合理性初判):初步直觉,逻辑是否有明显漏洞 - - **Contributions**(贡献):1-3 条 - - **Clarity**(清晰度):写作质量,图表可读性 -- **Figure 导读**(基于 fulltext.md 浏览各图 caption): - - 关键主图:列出并一句话概括每个主图要证明什么 - - 证据转折点:哪个 figure 是叙事的关键转折 - - 需要重点展开的 supplementary:如果有 - - 关键表格:列出 - -填完立即保存 formal note。 - ---- - -### Step 3: Pass 2 — 精读还原 - -填 `### Pass 2: 精读还原` 区域。**按 figure 顺序逐个处理。** - -#### 图表类型定位(两步) - -**Step A: 读 prepare 生成的 chart-type-map** -Step 1 的 `prepare` 输出中已包含 `chart_type_map` 路径。读该文件,获取每个 figure 的关键词命中结果。这只是建议。 - -**Step B: Agent 读 caption 做最终判断** - -对每个 figure: -1. 读该 figure 的 caption(来自 prepare 返回的 `fulltext_md` 或 `figure_map`) -2. 根据 caption 内容,对照 [chart-reading/INDEX.md](chart-reading/INDEX.md) 判断图表类型 -3. chart-type-map 建议和 Agent 判断不一致 → 以 Agent 判断为准 -4. 无法确定类型 → 跳过 chart guide,按通用 figure 结构分析 -5. 
确定类型后,读对应的 chart-reading 指南(如 `chart-reading/条形图与误差棒.md`),按指南中的检查清单分析 - -#### 每张 Figure 的子标题(固定,不可少) - -按以下格式填入 formal note 中该 figure 的 callout block: - -``` -**图像定位与核心问题**:页码 + 要回答什么问题 -**方法与结果**:实验设计/数据来源/技术手段。核心数据、趋势、对比。 -**图表质量审查**:按 chart-reading 指南检查坐标轴、单位、误差棒、统计标注等。 -**作者解释**:作者在正文中对该图的解读 -**我的理解**:自己的理解(区分于作者解释) -**疑点/局限**:读图时发现的疑问,用 `> [!warning]` 突出 -``` - -#### 每张 Table 的子标题 - -``` -回答什么问题、关键字段/分组、主要结果、我的理解、疑点/局限 -``` - -#### 每张 figure 填完立即保存,再处理下一张。 - -#### 所有 figure/table 处理完后,填: - -**关键方法补课**:简要解释不熟悉的实验技术(1-2 项即可) - -**主要发现与新意**: -- 发现 1:...(来源:Figure X) -- 发现 2:...(来源:Figure Y / Table Z) - -保存。 - ---- - -### Step 4: Postprocess(跑校验脚本,修正问题) - -```bash -$PYTHON "$SKILL_DIR/scripts/ld_deep.py" postprocess-pass2 "$FORMAL_NOTE_PATH" --figures --format text --vault "$VAULT" -``` - -- 输出 `OK` → 继续 -- 输出错误 → 按错误提示修正(包含行号),修正后重新跑 -- 最多 3 轮修正。3 轮后仍失败 → 报告剩余错误给用户 - ---- - -### Step 5: Pass 3 — 深度理解 - -填 `### Pass 3: 深度理解` 区域。基于 Pass 1/2 已写的内容。 - -**填写内容:** - -- **假设挑战与隐藏缺陷**:隐含假设;如果放宽某个假设结论还成立吗;缺少哪些关键引用;实验/分析技术潜在问题 -- **哪些结论扎实,哪些仍存疑**: - - **较扎实**:... 
- - **仍存疑**:...(用 `> [!warning]`) -- **Discussion 与 Conclusion 怎么读**:作者真正完成了什么;哪些地方有拔高;哪些是推测 -- **对我的启发**:研究设计上;figure 组织上;方法组合上;未来工作想法 -- **遗留问题**:...(用 `> [!question]`) - -保存。 - ---- - -### Step 6: Final Validation - -```bash -$PYTHON "$SKILL_DIR/scripts/ld_deep.py" validate-note "$FORMAL_NOTE_PATH" --fulltext "$FULLTEXT_PATH" -``` - -- 输出 `OK` → 告知用户精读完成 -- 输出错误 → 修正缺失项,不报告成功直到通过 - ---- - -## Callout 格式规则 - -- `> [!important]`:每个 main finding -- `> [!warning]`:疑问、局限、证据边界、仍存疑条目 -- `> [!question]`:遗留问题 -- **间距:** 相邻 callout block 之间必须有空行,否则 Obsidian 会合并 - - 正确:`> [!important] A\n\n> [!important] B` - - 错误:`> [!important] A\n> [!important] B` - -## Supplementary 规则 - -- 默认不逐张展开 supplementary figure/table -- 仅在以下情况纳入:对主结论形成关键支撑、补足方法可信度、限制主文结论解释范围、作者在正文中明显依赖该补充材料 diff --git a/paperforge/skills/literature-qa/references/deep-subagent.md b/paperforge/skills/literature-qa/references/deep-subagent.md deleted file mode 100644 index 9f0de7f..0000000 --- a/paperforge/skills/literature-qa/references/deep-subagent.md +++ /dev/null @@ -1,103 +0,0 @@ -# Subagent Prompt for /pf-deep - -## Task - -Execute Keshav 3-pass journal-club style deep reading on a paper and write the results into the `## 🔍 精读` section of its formal note. - -## Input Variables - -- `{{ZOTERO_KEY}}` — Zotero citation key (e.g. `Y5KQ4JQ7`) -- `{{VAULT}}` — Vault root path -- `{{SCRIPT}}` — Path to `ld_deep.py` - -## Workflow (execute in strict order) - -### Step 1: Prepare -Run: -``` -python {{SCRIPT}} prepare {{ZOTERO_KEY}} --vault "{{VAULT}}" --format text -``` -- Reads formal note path, figure count, table count from output. -- If output starts with `[ERROR]`: report error to user, stop. -- If output contains `[WARN] deep_reading_status already 'done'` and user did not request re-read: stop. -- Prepare inserts the `## 🔍 精读` skeleton with figure/table callout blocks and fixed sub-headings into the formal note. Read the note to inspect its structure. - -### Step 2: Pass 1 (概览) -Fill `### Pass 1: 概览` only. 
Do not touch Pass 2/3. -- `**一句话总览**`: paper type + core finding in one sentence. -- `**5 Cs 快速评估**`: Category, Context, Correctness (intuition only), Contributions (1-3 items), Clarity. -- `**Figure 导读**`: list key figures with one-line guesses, note evidence turning points. -- Save immediately after writing. - -### Step 3: Pass 2 (精读还原) -Fill `### Pass 2: 精读还原`. Process figures sequentially starting from Figure 1. Each figure callout block has fixed sub-headings. Fill content under each sub-heading. Do NOT modify sub-headings, reorder blocks, or move `![[image]]` embeds. - -**Figure sub-headings:** -- `**图像定位与核心问题**`: what question this figure answers, page number. -- `**方法与结果**`: experimental design / data source / technical approach. Core data points, trends, comparisons. -- `**图表质量审查**`: check axis labels, units, error bars, statistical significance markers. Read `chart-type-map.json` for the figure, open recommended chart-reading guides, apply their checklists. -- `**作者解释**`: authors' description from the text. -- `**我的理解**`: your own analysis (distinct from author explanation). -- `**疑点/局限**`: use `> [!warning]` for concerns. - -**Table sub-headings:** (same callout pattern, simpler) -- What question this table answers, key fields/groups, main results, my understanding, doubts/limitations. - -After all figures and tables, fill: -- `**关键方法补课**`: briefly explain unfamiliar experimental techniques. -- `**主要发现与新意**`: list findings with evidence source (Figure X / Table Y). - -Save after each figure block. - -### Step 4: Postprocess -Run: -``` -python {{SCRIPT}} postprocess-pass2 --figures --format text -``` -- If output is `OK`: proceed. -- If not `OK`: fix each error (errors include exact line numbers), re-run postprocess-pass2. Max 3 fix rounds. If still failing after 3 rounds, report remaining errors to user. - -### Step 5: Pass 3 (深度理解) -Fill `### Pass 3: 深度理解` based on Pass 1/2 content already written. 
Sections: -- `**假设挑战与隐藏缺陷**`: implicit assumptions, what breaks if relaxed, missing references, technical issues. -- `**哪些结论扎实,哪些仍存疑**`: split into 较扎实 / 仍存疑. -- `**Discussion 与 Conclusion 怎么读**`: what authors actually accomplished vs. overclaim vs. speculation. -- `**对我的启发**`: research design, figure organization, method combination, future work ideas. -- `**遗留问题**`: open questions. -- Save. - -### Step 6: Final Validation -Run: -``` -python {{SCRIPT}} validate-note --fulltext -``` -- Report result to user. If not `OK`, list missing items and fix. - -## Callout Rules - -- `> [!important]`: each main finding entry -- `> [!warning]`: doubts, limitations, evidence boundaries, items in 仍存疑 -- `> [!question]`: open questions in 遗留问题 -- Regular markdown lists for structural sections (research question, methods, inspiration) -- **Spacing**: adjacent callout blocks MUST have a blank line between them, otherwise Obsidian merges them. -- Correct: `> [!important] A\n\n> [!important] B` -- Incorrect: `> [!important] A\n> [!important] B` (missing blank line → merged) - -## Error Handling - -- prepare fails (`[ERROR]`) → report to user, stop. -- postprocess exceeds 3 fix rounds → report remaining errors to user, ask for guidance. -- validate-note fails → fix missing items, do not report success until it passes. 
- -## Command Reference - -``` -# Prepare (insert skeleton + check preconditions) -python {{SCRIPT}} prepare {{ZOTERO_KEY}} --vault "{{VAULT}}" --format text - -# Postprocess Pass 2 (fix spacing/section issues) -python {{SCRIPT}} postprocess-pass2 --figures --format text - -# Validate final note structure -python {{SCRIPT}} validate-note --fulltext -``` diff --git a/paperforge/skills/literature-qa/references/multi-reading.md b/paperforge/skills/literature-qa/references/multi-reading.md deleted file mode 100644 index c47f15a..0000000 --- a/paperforge/skills/literature-qa/references/multi-reading.md +++ /dev/null @@ -1,144 +0,0 @@ -# 批量文献阅读 - -用户需要阅读多篇文献并总结——综述写作、找引用、研究方向调研等。 - ---- - -## 触发条件 - -- 用户给了一个 collection 名(Zotero 收藏夹) -- 用户给了模糊方向("帮我看一下骨科里关于支架材料的文章") -- 用户给了多篇文献要求一起读 -- 用户说"总结库里XXX方向的文献"、"写一段文献综述" - ---- - -## 执行流程 - -### Step 1: 确定文献范围 - -和用户确认要读哪些文献: -- 用户给了 collection 名 → 读 `$IDX_PATH`,筛 `collection_path` 包含该名称的条目 -- 用户给了关键词方向 → 用 paper_resolver search 或直接 grep `$IDX_PATH` -- 用户给了多篇 key → 直接确认 key 列表 - -列出候选让用户确认: - -``` -找到 N 篇匹配 (): - -[1] ABC12345 — Title (Author, Year, Domain, OCR: done/pending) -[2] DEF67890 — Title (Author, Year, Domain, OCR: done/pending) -... - -要全部读,还是选几篇?(输入编号如 "1,3,5" 或 "all") -``` - -### Step 2: 逐篇阅读 - -对每篇选定文献: - -1. 用 glob 找到 formal note:`glob("$LIT_DIR/**/.md")`(最快,不需要 $PYTHON) -2. 读 formal note frontmatter → 元数据 -3. 同目录下找 `fulltext.md` → 读 Abstract、Results、Discussion -4. 如果有 OCR 但 fulltext 太长 → 先读 caption + figure 描述定位关键段落 -5. 
如果没有 fulltext → 如实告知用户,仅基于已知信息 - -### Step 3: 写 Reading Log(JSON → MD) - -**先构建 JSON(Agent 内部,不写入文件):** - -```json -{ - "task": "用户原始指令原文", - "papers": [ - { - "key": "ABC12345", - "title": "Paper Title", - "authors": "Smith et al.", - "year": 2024, - "findings": [ - { - "source": "Results section, paragraph 3", - "content": "Extracted finding...", - "citation_use": "可用于支撑 XXX 观点" - } - ] - } - ] -} -``` - -**再渲染为 MD,追加写入 `$VAULT/Bases/reading-log-.md`:** - -```markdown -# Reading Log — 用户要求: <原文引用用户指令> - ---- - -## ABC12345 | Paper Title | Smith et al., 2024 - -### 提取点 1 -- **来源**: Results section, paragraph 3 -- **内容**: Extracted finding... -- **引用建议**: 可用于支撑 XXX 观点 - -### 提取点 2 -- **来源**: Discussion, final paragraph -- **内容**: ... -- **引用建议**: ... - ---- - -## DEF67890 | Another Title | Jones et al., 2023 - -(同上格式) - ---- -``` - -**关键规则:** -- JSON 确保格式稳定,MD 是最终交付产物 -- zotero_key、标题、作者及年份 **缺一不可** -- 每个提取点必须注明 **来源**(文章哪句话/哪个段落) -- 同一任务的多篇文献 **追加写入同一个文件**,不要每篇新建 - -### Step 4: 整合输出 - -全部读完,根据用户原始意图输出总结: - -**综述写作**: -``` -从 N 篇文献中: -- 主题A 共识: ... -- 主题A 争议: ... -- 方法论趋势: ... -- 关键引用: - 1. "...[结论]" — ABC12345 (Author, Year), Fig.X - 2. ... -``` - -**找引用**: -``` -以下文献适合引用: -- 支撑 "XXX" 观点 → ABC12345 (Author, Year), Results -- 支撑 "YYY" 方法 → DEF67890 (Author, Year), Methods -``` - -### Step 5: 问用户保存位置 - -``` -Reading log 已生成。要保存到哪里? 
-(留空 → 默认 $VAULT/Bases/reading-log-.md) -``` - -让用户指定路径。如果用户说不清,默认放到 `$VAULT/Bases/`。 - ---- - -## 注意事项 - -- **暂时不支持多篇阅读后运行 pf-end / 结束讨论**(该功能待定) -- 如果某篇文献没有 fulltext,如实告知用户,不要捏造内容 -- Reading log 中每条提取点必须在原文中有据可查 -- JSON → MD 转换由 Agent 完成,用户只看到 MD 文件 diff --git a/paperforge/skills/literature-qa/references/paper-qa.md b/paperforge/skills/literature-qa/references/paper-qa.md deleted file mode 100644 index 626f23e..0000000 --- a/paperforge/skills/literature-qa/references/paper-qa.md +++ /dev/null @@ -1,61 +0,0 @@ -# 论文问答 - -交互式论文 Q&A 工作台。不强制要求 OCR,但 OCR 完成后回答更准确。 - ---- - -## 前置条件 - -- [ ] 已完成论文定位(参考 [paper-resolution.md](paper-resolution.md)),拿到 zotero_key 和 workspace -- [ ] OCR 完成(推荐但非强制) - ---- - -## 执行流程 - -### Step 1: 加载论文 - -1. 确认 workspace 路径 -2. 读 `fulltext.md`(如果存在)作为主要回答依据 -3. 读 formal note frontmatter 获取元数据(标题、作者、期刊、年份) -4. 如果 fulltext.md 不存在,告知用户 "OCR 文本不可用,回答将基于元数据和公开信息" - -### Step 2: 显示论文信息 - -``` -已加载论文: [title] ([year], [journal]) -作者: [authors] -Zotero Key: [key] -领域: [domain] -OCR 状态: [done / 不可用] -结束对话时说 "保存" 即可保存讨论。 -请问有什么问题? -``` - -### Step 3: 进入 Q&A 模式 - -- 等待用户提问 -- 每次回答后等待下一个问题 -- 持续到用户说 "保存"、"结束"、"完成" 等关键词 - ---- - -## 回答原则 - -- **严格基于** fulltext.md 中的文本内容回答 -- 引用原文时标注来源页码/章节(如 "第 3 页,Methods 部分") -- 用中文(简体中文)回答 -- 论文中未提及的内容,明确说明 "论文中未提及该内容" -- 需要结合论文以外知识的问题,说明 "该问题需要结合论文以外的知识" - ---- - -## 切换模式 - -用户在当前对话中可以说 "精读这篇文章" 切换到 deep-reading 模式。此时加载 [deep-reading.md](deep-reading.md) 执行精读流程。 - ---- - -## 保存记录 - -用户说 "保存"、"结束"、"完成"、"保存讨论" 时,加载 [save-session.md](save-session.md) 执行保存。不要自动保存。 diff --git a/paperforge/skills/literature-qa/references/paper-resolution.md b/paperforge/skills/literature-qa/references/paper-resolution.md deleted file mode 100644 index 044b30b..0000000 --- a/paperforge/skills/literature-qa/references/paper-resolution.md +++ /dev/null @@ -1,94 +0,0 @@ -# 论文定位协议 - -本文件定义如何将用户输入解析为论文 workspace。所有子流程公用。 - -## 核心原则 - -1. **Python 做确定性查找。** key、DOI、标题片段、作者+年份。 -2. **Agent 做理解和兜底。** 自然语言、Python 无结果时的 fallback 搜索。 -3. 
**路径从 `paths` 获取,不硬编码。** 禁止根据 vault-knowledge.md 的示例结构拼接路径。`ocr_dir`、`literature_dir`、`index_path` 只能从 `paper_resolver paths` 或 `paper_resolver resolve-key` 的返回 JSON 中读取。任何情况下都不要把目录名(如 `System`、`Resources`)写死在路径里。 - ---- - -## 通用命令 - -| 操作 | 命令 | -|------|------| -| 获取 vault 路径 | 已由 pf_bootstrap 完成 | -| 按 key 查 | `$PYTHON -m paperforge.worker.paper_resolver resolve-key --vault "$VAULT"` | -| 按 DOI 查 | `$PYTHON -m paperforge.worker.paper_resolver resolve-doi "" --vault "$VAULT"` | -| 按字段搜 | `$PYTHON -m paperforge.worker.paper_resolver search --title "..." --author "..." --year ... --domain "..." --vault "$VAULT"` | - ---- - -## 输入类型判断 - -### 类型 1: Zotero Key(8位字母数字组合) - -``` -$PYTHON -m paperforge.worker.paper_resolver resolve-key --vault "$VAULT" -``` - -返回 JSON 含 `key`, `title`, `domain`, `formal_note_path`, `ocr_path`, `fulltext_path`, `ocr_status` 等。所有路径由 `paperforge.json` 配置决定。 - -### 类型 2: DOI(以 `10.` 开头,可能带 URL 前缀) - -``` -$PYTHON -m paperforge.worker.paper_resolver resolve-doi "" --vault "$VAULT" -``` - -返回格式同类型 1。 - -### 类型 3: 标题片段 - -``` -$PYTHON -m paperforge.worker.paper_resolver search --title "..." --vault "$VAULT" -``` - -返回 `{"matches": [...], "count": N}`。 - -### 类型 4: 作者 + 年份 - -``` -$PYTHON -m paperforge.worker.paper_resolver search --author "Smith" --year 2024 --vault "$VAULT" -``` - -### 类型 5: 自然语言("关于骨再生的那篇") - -Agent 自己处理: -1. 读 `$IDX_PATH`(已由 pf_bootstrap 提供) -2. 读 `index_path` 指向的 `formal-library.json` -3. 在 `title`、`domain`、`journal`、`abstract` 中搜匹配 -4. 
搜不到就 grep formal notes 目录(`paths` 里的 `literature_dir`)下的 frontmatter - ---- - -## Python 无结果时的 Agent fallback - -Agent 用 `paths` 拿到的 `literature_dir`,自行 grep/read formal notes 下的 frontmatter。 - -## 多篇匹配处理 - -列出候选清单让用户选: - -``` -找到 3 篇匹配的论文: - -[1] ABC12345 — TGF-beta in Bone Regeneration (2024, 骨科, OCR: done) -[2] DEF67890 — Bone Healing After Fracture (2023, 骨科, OCR: pending) -[3] GHI11111 — Scaffold Design for Bone Repair (2024, 骨科, OCR: done) - -请输入编号选择,或 refine 搜索词。 -``` - -## Fallback 顺序 - -``` -输入 - │ - ├── 像 key/DOI/标题/作者年份? - │ └── Python paper_resolver → 有/无结果 → Agent 兜底 - │ - └── 自然语言? - └── Agent 读 formal-library.json → 搜 → 有/无 -``` diff --git a/paperforge/skills/literature-qa/references/paper-search.md b/paperforge/skills/literature-qa/references/paper-search.md deleted file mode 100644 index 5b6249a..0000000 --- a/paperforge/skills/literature-qa/references/paper-search.md +++ /dev/null @@ -1,105 +0,0 @@ -# 文献检索工作流 - -轻量流程:用户想**在库里找文献**(不涉及精读或问答)。 - ---- - -## Stage 状态机 - -你必须明确知道当前在哪个 stage。每完成一个 stage 问自己:"下一步是什么?" 
不要在 stage 之间来回跳跃。 - -| Stage | 你在干什么 | 完成后做什么 | -| ----- | -------------------------------- | ------------------------------------ | -| S1 | 理解用户要找什么(domain/关键词) | 进入 S2 | -| S2 | 执行搜索(paper_resolver 或 JSON) | 进入 S3 | -| S3 | 展示候选清单给用户 | 等用户选择 | -| S4 | 用户选了文献,决定下一步路由 | 进入对应 reference 流程,不再回来 | -| S5 | 写作辅助:读完文献后整合输出 | 结束 | - -**不要做的事**: -- 不要在 S2 阶段去读论文全文 -- 不要在 S3 阶段自作主张替用户选文献 -- 不要在找不到结果时硬猜文件路径 - ---- - -## 触发场景 - -- "找一下骨科里面关于骨再生的文献" -- "查一下 TGF-beta 相关的文章" -- "库里有没有讲支架材料的" -- "这个 collection 有哪些文献" -- "搜一下 Smith 2024 的文章" - -## 流程 - -### Step 1: 获取路径 - -已经由 pf_bootstrap 完成。直接用 `paths` JSON 里的 `index_path` 和 `literature_dir`。 - -### Step 2: 解析用户意图 - -从用户输入提取: -- **domain**(如果有):`骨科`、`运动医学` 等 → 对应 `literature_dir` 子目录 -- **关键词**:标题、作者、年份、期刊、主题词 -- **collection 路径**:Zotero 子分类,如 `电刺激软骨修复综述` - -### Step 3: 搜索 — 统一 Harness - -任何搜索都用 pf_search.py,自动路由 vector -> FTS5 -> grep: - -``` -python $SKILL_DIR/scripts/pf_search.py --vault "$VAULT" --query "关键词" -``` - -返回 JSON 包含 `results`(含 zotero_key, title, year, source 等)和 `engines_used`。 - -如需结构化高级搜索(特定 domain/author),使用 paper_resolver: - -``` -$PYTHON -m paperforge.worker.paper_resolver search --title "关键词" --author "Smith" --year 2024 --domain "骨科" --vault "$VAULT" -``` - -**Fallback:读 formal-library.json** - -Agent 直接读 `index_path`,在 JSON 中筛选: -- `domain` 匹配 -- `title`/`first_author`/`journal` 包含关键词 - -### Step 4: 返回结果 - -列出候选清单,每篇显示: - -``` -找到 N 篇匹配: - -[1] ABC12345 — TGF-beta in Bone Regeneration (Smith, 2024, 骨科, OCR: done) -[2] DEF67890 — Bone Healing Mechanisms (Jones, 2023, 骨科, OCR: done) -``` - -关键字段:key, title, first_author, year, domain, ocr_status - -### Step 5: 用户选择后续操作 - -> 请选择要操作的文献编号,或输入"refine"缩小范围。 - -选中文献后,按用户意图自动进入对应路由: -- `精读这篇` → 进入 [deep-reading.md](deep-reading.md) 流程 -- `这篇讲了什么` → 进入 [paper-qa.md](paper-qa.md) 流程 -- 不需要继续 → 结束 - -### Step 6: 写作辅助场景 - -如果用户原始意图包含**写作/优化/参考文献/综述/引用**等,搜索结果不是终点: - -1. 提示用户圈选最相关的 3-5 篇 -2. 
对每篇进入 [deep-reading.md](deep-reading.md) 或至少通读 formal note + fulltext 关键段落 -3. 读完所有选定论文后,Agent 整合知识辅助写作 - -> 示例:"我从库里 X 篇文献中提取了以下关键发现……要不要基于这些帮你写 XX 部分?" - -## 注意事项 - -- 如果是大型 library(>500 篇),优先用 paper_resolver 而不是全量读 JSON -- OCR status 为 `done` 的论文可以读 fulltext 内容 -- OCR status 为 `pending` 的只有 formal note frontmatter diff --git a/paperforge/skills/literature-qa/references/save-session.md b/paperforge/skills/literature-qa/references/save-session.md deleted file mode 100644 index 75c776e..0000000 --- a/paperforge/skills/literature-qa/references/save-session.md +++ /dev/null @@ -1,55 +0,0 @@ -# 保存讨论记录 - -将 paper-qa 会话中的 Q&A 记录持久化到论文工作区。 - ---- - -## 触发条件 - -- 用户显式说 "保存"、"保存记录"、"结束"、"完成讨论"、"save" -- 或显式输入 `pf-end` -- 不要自动触发 - ---- - -## 执行 - -### Step 1: 收集 Q&A 对 - -汇总本次 paper-qa 会话中所有 Q&A,序列化为 JSON 数组: - -```json -[ - { - "question": "用户的问题", - "answer": "Agent 的回答", - "source": "user_question", - "timestamp": "2026-05-10T12:00:00+08:00" - } -] -``` - -`source` 为 `"user_question"`(用户提问)或 `"agent_analysis"`(Agent 主动分析)。 - -### Step 2: 调用 discussion 模块 - -```bash -$PYTHON -m paperforge.worker.discussion record \ - --vault "$VAULT" \ - --agent pf-paper \ - --model "" \ - --qa-pairs '' -``` - -### Step 3: 确认结果 - -CLI 返回 `{"status": "ok", ...}` → 告知用户记录已保存。 - -返回 `{"status": "error"}` → 记录错误,重试一次。仍失败则告知用户。 - ---- - -## 注意事项 - -- 仅 paper-qa 会话需要记录。deep-reading 的内容直接写入 formal note,不需要通过本文件。 -- 如果无法从 formal-library.json 找到论文 domain/title,记录失败不应影响用户使用。 diff --git a/paperforge/skills/literature-qa/scripts/pf_bootstrap.py b/paperforge/skills/literature-qa/scripts/pf_bootstrap.py deleted file mode 100644 index ebc41b2..0000000 --- a/paperforge/skills/literature-qa/scripts/pf_bootstrap.py +++ /dev/null @@ -1,188 +0,0 @@ -"""PaperForge bootstrap — single entry point for agent to discover vault state. - -No dependencies. Runs on ANY Python. Just reads paperforge.json + filesystem. 
- -Usage: - python pf_bootstrap.py # auto-discover vault from CWD - python pf_bootstrap.py --vault - -Output (JSON to stdout): - { - "ok": true, - "vault_root": "D:\\...", - "paths": { - "literature_dir": "D:\\...\\Resources\\Literature", - "index_path": "D:\\...\\System\\PaperForge\\indexes\\formal-library.json", - "ocr_dir": "D:\\...\\System\\PaperForge\\ocr", - "exports_dir": "D:\\...\\System\\PaperForge\\exports" - }, - "domains": ["domain1", "domain2"], - "index_summary": {"domain1": 120, "domain2": 80}, - "python_candidate": "D:\\...\\python.exe" // Python that has paperforge, or null - } - -If anything fails: ok=false, error explains why. -""" - -from __future__ import annotations - -import json -import subprocess -import sys -from pathlib import Path - - -def _find_paperforge_json(start: Path) -> Path | None: - current = start.resolve() - for _ in range(10): - candidate = current / "paperforge.json" - if candidate.exists(): - return candidate - parent = current.parent - if parent == current: - break - current = parent - return None - - -def _read_pf_config(pf_json: Path) -> dict: - with open(pf_json, encoding="utf-8") as f: - return json.load(f) - - -def _find_python_with_paperforge(vault: Path, pf_cfg: dict) -> str | None: - """Find a Python executable that has paperforge installed.""" - candidates = [] - - # 1. Explicit python_path in config - if pf_cfg.get("python_path"): - candidates.append(Path(pf_cfg["python_path"])) - - # 2. 
Common venv locations inside vault - venv_names = [".venv", ".paperforge-test-venv", "venv"] - exe_paths = ["Scripts/python.exe", "bin/python3"] - for vn in venv_names: - for ep in exe_paths: - p = vault / vn / ep - if p.exists(): - candidates.append(p) - - for candidate in candidates: - try: - result = subprocess.run( - [str(candidate), "-m", "paperforge", "--version"], - capture_output=True, text=True, timeout=10, - encoding="utf-8", errors="replace", - ) - if result.returncode == 0 and "paperforge" in result.stdout.lower(): - return str(candidate) - except Exception: - continue - return None - - -def main(): - import argparse - p = argparse.ArgumentParser(description="PaperForge bootstrap") - p.add_argument("--vault", default=None, help="Vault root path (auto-detect if omitted)") - args = p.parse_args() - - result: dict = {"ok": False} - - # --- 1. Find vault --- - if args.vault: - vault = Path(args.vault).resolve() - pf_json = vault / "paperforge.json" - if not pf_json.exists(): - result["error"] = f"paperforge.json not found at {vault}" - json.dump(result, sys.stdout, ensure_ascii=False) - sys.exit(0) - else: - pf_json = _find_paperforge_json(Path.cwd()) - if pf_json is None: - result["error"] = "paperforge.json not found from CWD upward. Set --vault." - json.dump(result, sys.stdout, ensure_ascii=False) - sys.exit(0) - vault = pf_json.parent - - result["vault_root"] = str(vault) - - # --- 2. Read config --- - try: - cfg = _read_pf_config(pf_json) - except Exception as e: - result["error"] = f"Cannot read paperforge.json: {e}" - json.dump(result, sys.stdout, ensure_ascii=False) - sys.exit(0) - - system_dir = cfg.get("system_dir", "System") - resources_dir = cfg.get("resources_dir", "Resources") - literature_dir = cfg.get("literature_dir", "Literature") - - # --- 3. 
Build paths from config --- - pf_root = vault / system_dir / "PaperForge" - - paths = { - "literature_dir": str(vault / resources_dir / literature_dir), - "index_path": str(pf_root / "indexes" / "formal-library.json"), - "ocr_dir": str(pf_root / "ocr"), - "exports_dir": str(pf_root / "exports"), - } - result["paths"] = paths - - # --- 4. List domains --- - lit_dir = Path(paths["literature_dir"]) - domains = sorted( - [d.name for d in lit_dir.iterdir() if d.is_dir()] - ) if lit_dir.exists() else [] - result["domains"] = domains - - # --- 5. Index summary --- - index_path = Path(paths["index_path"]) - index_summary: dict[str, int] = {} - if index_path.exists(): - try: - data = json.loads(index_path.read_text(encoding="utf-8")) - items = data.get("items", []) - if isinstance(items, dict): - items = items.values() - for item in items: - d = item.get("domain", "unknown") - index_summary[d] = index_summary.get(d, 0) + 1 - except Exception: - pass - result["index_summary"] = index_summary - - # --- 6. Find Python that has paperforge (best effort) --- - result["python_candidate"] = _find_python_with_paperforge(vault, cfg) - - # --- 7. 
Memory layer state --- - memory_layer = {"available": False, "paper_count": 0, "fts_search": False, "vector_search": False} - idx_path = Path(paths["index_path"]) - dc_json = vault / ".obsidian" / "plugins" / "paperforge" / "data.json" - if idx_path.exists(): - try: - with open(idx_path, encoding="utf-8") as f: - data = json.load(f) - items = data.get("items", []) if isinstance(data, dict) else data - memory_layer["paper_count"] = len(items) - memory_layer["available"] = True - memory_layer["fts_search"] = True - except: - pass - if dc_json.exists(): - try: - with open(dc_json, encoding="utf-8") as f: - plugin_data = json.load(f) - vector_enabled = plugin_data.get("features", {}).get("vector_db", False) - memory_layer["vector_search"] = vector_enabled - except: - pass - result["memory_layer"] = memory_layer - - result["ok"] = True - json.dump(result, sys.stdout, ensure_ascii=False, indent=2) - - -if __name__ == "__main__": - main() diff --git a/paperforge/skills/literature-qa/scripts/pf_search.py b/paperforge/skills/literature-qa/scripts/pf_search.py deleted file mode 100644 index b40457c..0000000 --- a/paperforge/skills/literature-qa/scripts/pf_search.py +++ /dev/null @@ -1,180 +0,0 @@ -"""Unified search entry point for agent skills. -Routes: vector search -> FTS5 search -> grep based on what's available. -Always returns same JSON format regardless of backend. 
- -Usage: - python pf_search.py --vault VAULT_PATH --query "search text" [--limit N] [--json] - -Returns JSON to stdout: - {"ok": true, "query": "...", "engines_used": [...], "results": [...], "count": N} - {"ok": false, "error": "..."} -""" - -from __future__ import annotations -import json -import subprocess -import sys -from pathlib import Path - - -def _find_python(vault: Path) -> str | None: - """Same logic as pf_bootstrap: find python with paperforge installed.""" - dc_json = vault / ".obsidian" / "plugins" / "paperforge" / "data.json" - if dc_json.exists(): - try: - with open(dc_json, encoding="utf-8") as f: - data = json.load(f) - py = data.get("python_path", "") - if py and Path(py).exists(): - return py - except: - pass - - for cand in [ - vault / ".paperforge-test-venv" / "Scripts" / "python.exe", - vault / ".venv" / "Scripts" / "python.exe", - vault / "venv" / "Scripts" / "python.exe", - ]: - if cand.exists(): - return str(cand) - - for cand in ["python", "python3"]: - try: - subprocess.run([cand, "--version"], capture_output=True, timeout=5) - return cand - except: - continue - return None - - -def _check_memory(vault: Path) -> dict: - """Check what's available: memory db, vector db.""" - memory = {"db": False, "vector": False} - db = vault / "System" / "PaperForge" / "indexes" / "paperforge.db" - if db.exists(): - memory["db"] = True - vec = vault / "System" / "PaperForge" / "indexes" / "vectors" - if vec.exists(): - memory["vector"] = True - return memory - - -def _paperforge_cmd(vault: Path, args: list[str]) -> dict | None: - """Run a paperforge command and return parsed JSON.""" - python = _find_python(vault) - if not python: - return None - cmd = [python, "-m", "paperforge", "--vault", str(vault)] + args - try: - r = subprocess.run(cmd, capture_output=True, text=True, timeout=30, encoding="utf-8") - if r.returncode == 0: - return json.loads(r.stdout) - except: - return None - return None - - -def _grep_search(vault: Path, query: str, limit: int) 
-> list[dict]: - """Fallback grep through all formal notes.""" - lit_dir = vault / "Resources" / "Literature" - results = [] - search_lower = query.lower() - for f in sorted(lit_dir.rglob("*.md")): - if len(results) >= limit: - break - if f.name in ("fulltext.md", "deep-reading.md", "discussion.md"): - continue - try: - text = f.read_text(encoding="utf-8", errors="replace") - if search_lower not in text.lower(): - continue - title = "" - for line in text.split("\n")[:10]: - if line.startswith("# ") and not line.startswith("## "): - title = line.lstrip("# ").strip() - break - results.append({ - "zotero_key": f.stem, - "title": title or f.stem, - "match": f.name, - "source": "grep", - }) - except: - continue - return results - - -def main(): - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--vault", required=True) - parser.add_argument("--query", required=True) - parser.add_argument("--limit", type=int, default=10) - parser.add_argument("--json", action="store_true", default=True) - args = parser.parse_args() - - vault = Path(args.vault).resolve() - query = args.query.strip() - limit = args.limit - - if not query: - print(json.dumps({"ok": False, "error": "Empty query"})) - sys.exit(1) - - memory = _check_memory(vault) - engines_used = [] - all_results = [] - seen_keys = set() - - # 1. 
Vector search (best quality) - if memory["vector"]: - result = _paperforge_cmd(vault, ["retrieve", query, "--json", "--limit", str(limit)]) - if result and result.get("ok"): - engines_used.append("vector") - for c in result.get("data", {}).get("chunks", []): - pid = c.get("paper_id", "") - if pid and pid not in seen_keys: - seen_keys.add(pid) - all_results.append({ - "zotero_key": pid, - "citation_key": c.get("citation_key", ""), - "title": c.get("title", ""), - "year": c.get("year", ""), - "section": c.get("section", ""), - "page": c.get("page_number", ""), - "chunk_text": c.get("chunk_text", ""), - "score": c.get("score", 0), - "source": "vector", - }) - - # 2. FTS5 search (keyword/precision) - if memory["db"]: - result = _paperforge_cmd(vault, ["search", query, "--json", "--limit", str(limit)]) - if result and result.get("ok"): - engines_used.append("fts5") - for p in result.get("data", {}).get("results", []): - key = p.get("zotero_key", "") - if key and key not in seen_keys: - seen_keys.add(key) - p["source"] = "fts5" - all_results.append(p) - - # 3. Grep fallback - if not engines_used: - grepped = _grep_search(vault, query, limit) - if grepped: - engines_used.append("grep") - all_results.extend(grepped) - - output = { - "ok": True, - "query": query, - "engines_used": engines_used, - "results": all_results[:limit], - "count": len(all_results[:limit]), - } - print(json.dumps(output, ensure_ascii=False, indent=2)) - - -if __name__ == "__main__": - main() diff --git a/paperforge/skills/logging/SKILL.md b/paperforge/skills/logging/SKILL.md deleted file mode 100644 index 7f25e29..0000000 --- a/paperforge/skills/logging/SKILL.md +++ /dev/null @@ -1,133 +0,0 @@ ---- -name: logging -description: > - Work and reading log management. Triggered by: - "logging work", "logging read", - "做工作记录", "做阅读记录", "做working-log", "做reading-log", - "写工作日志", "写阅读日志", "记录工作", "记录阅读", - "写日志", "记一下", "总结一下这个会话", - "记录决策", "记一下工作过程", "写工作总结". 
-source: paperforge ---- - -# Logging - ---- - -## 1. Bootstrap — 必须先执行 - -跑这个脚本: - -``` -python $SKILL_DIR/scripts/pf_bootstrap.py -``` - -返回 JSON。记住以下变量: - -| 变量 | 来自 JSON 的 | 用于 | -| ----------- | -------------------- | --------------------------------------------- | -| `$VAULT` | `vault_root` | 所有 `--vault` 参数 | -| `$PYTHON` | `python_candidate` | 所有 cli 调用 | - -如果 `ok: false` → 报告 `error` 给用户,**停止**。 - ---- - -## 2. Routing — 判断用户要什么 - -根据用户说的内容确定走哪个分支: - -| 用户说 | 走分支 | -| ------------------------------------------- | -------- | -| "记录阅读" "reading log" "做阅读记录" "读完了记一下" "刚读了一段记一下" "有没有什么值得记的" "把这段记下来" | **reading** | -| "工作记录" "working log" "总结会话" "记一下工作过程" "写工作总结" "记录决策" "logging work" | **working** | -| "写日志" "记录一下" "记一下" 不清楚哪个 | **先问用户** | - -## 3. reading 分支 — 记录单条阅读笔记 - -调用条件:用户读完一个段落/章节后要记录。 - -动作: -1. 确认 `$VAULT` 和 `$PYTHON` -2. 确定 zotero_key(从上下文或 formal note 中获取) -3. 提取: - - **section**: 文献中的位置 (Discussion P12, Results Fig.3) - - **excerpt**: 逐字引用的原文关键句 - - **usage**: 这个信息支持当前写作的哪个论点 - - **note**: 交叉验证/矛盾/注意事项 (optional) -4. 给用户展示确认后再执行: - ``` - $PYTHON -m paperforge --vault $VAULT reading-log --write KEY \ - --section "..." --excerpt "..." --usage "..." --note "..." - ``` -5. 确认写入成功 - -### Reading Log Format (MANDATORY) - -When writing reading-log.md, use EXACTLY this format. Field labels (`**Info:**`, `**Use:**`, `**Note:**`) must be in English, but the content (title, section name, excerpt, usage, note) should be in the same language as the user's conversation. - -``` -## ABCDEFGH — Author Last Name et al. 
Year -**Title:** Full Paper Title - -### Section Name — Page NN -**Info:** "verbatim excerpt from paper" -**Use:** how this supports current writing -**Note:** optional cross-reference (optional field) -``` - -Rules: -- Paper key: 8 uppercase letters/digits after ## (must match ^[A-Z0-9]{8}) -- **Title:** line required after every ## header -- **Info:** and **Use:** required after every ### section header -- **Note:** optional -- Do NOT deviate from this format — parsing is strict - -After writing the log file, suggest user run: -``` -paperforge reading-log --validate path/to/file.md -``` - -## 4. working 分支 — 会话总结写入 working-log - -调用条件:会话结束前/用户要求记录工作过程。 - -动作: -1. 回顾本次会话中所有关键节点: - - 用户纠正了什么 - - 方案怎么变的 - - 有什么弯路和教训 - - 可复用的方法论 -2. 按以下格式生成 markdown,给用户确认: - - ``` - ## YYYY-MM-DD — 小节名 - - ### 核心决策 - - 做了什么、为什么 - - ### 弯路与修正 - - 错误方向 → 用户纠正 → 最终方案 - - ### 可复用方法论 - - 本段的 pattern - - ### 待办 - - [ ] ... - ``` - -3. 用户确认后,询问目标 project 目录路径 -4. 追加到 `Project//working-log.md`(文件不存在则新建) -5. 确认写入成功 - ---- - -## 5. Export — 导出 reading-log - -用户说 "导出阅读日志": - -```bash -$PYTHON -m paperforge --vault $VAULT reading-log --output [--since DATE] -``` - -导出为 markdown 文件。如果用户没指定路径,询问。 diff --git a/paperforge/skills/logging/scripts/pf_bootstrap.py b/paperforge/skills/logging/scripts/pf_bootstrap.py deleted file mode 100644 index 87bd211..0000000 --- a/paperforge/skills/logging/scripts/pf_bootstrap.py +++ /dev/null @@ -1,164 +0,0 @@ -"""PaperForge bootstrap — single entry point for agent to discover vault state. - -No dependencies. Runs on ANY Python. Just reads paperforge.json + filesystem. 
- -Usage: - python pf_bootstrap.py # auto-discover vault from CWD - python pf_bootstrap.py --vault - -Output (JSON to stdout): - { - "ok": true, - "vault_root": "D:\\...", - "paths": { - "literature_dir": "D:\\...\\Resources\\Literature", - "index_path": "D:\\...\\System\\PaperForge\\indexes\\formal-library.json", - "ocr_dir": "D:\\...\\System\\PaperForge\\ocr", - "exports_dir": "D:\\...\\System\\PaperForge\\exports" - }, - "domains": ["domain1", "domain2"], - "index_summary": {"domain1": 120, "domain2": 80}, - "python_candidate": "D:\\...\\python.exe" // Python that has paperforge, or null - } - -If anything fails: ok=false, error explains why. -""" - -from __future__ import annotations - -import json -import subprocess -import sys -from pathlib import Path - - -def _find_paperforge_json(start: Path) -> Path | None: - current = start.resolve() - for _ in range(10): - candidate = current / "paperforge.json" - if candidate.exists(): - return candidate - parent = current.parent - if parent == current: - break - current = parent - return None - - -def _read_pf_config(pf_json: Path) -> dict: - with open(pf_json, encoding="utf-8") as f: - return json.load(f) - - -def _find_python_with_paperforge(vault: Path, pf_cfg: dict) -> str | None: - """Find a Python executable that has paperforge installed.""" - candidates = [] - - # 1. Explicit python_path in config - if pf_cfg.get("python_path"): - candidates.append(Path(pf_cfg["python_path"])) - - # 2. 
Common venv locations inside vault - venv_names = [".venv", ".paperforge-test-venv", "venv"] - exe_paths = ["Scripts/python.exe", "bin/python3"] - for vn in venv_names: - for ep in exe_paths: - p = vault / vn / ep - if p.exists(): - candidates.append(p) - - for candidate in candidates: - try: - result = subprocess.run( - [str(candidate), "-m", "paperforge", "--version"], - capture_output=True, text=True, timeout=10, - encoding="utf-8", errors="replace", - ) - if result.returncode == 0 and "paperforge" in result.stdout.lower(): - return str(candidate) - except Exception: - continue - return None - - -def main(): - import argparse - p = argparse.ArgumentParser(description="PaperForge bootstrap") - p.add_argument("--vault", default=None, help="Vault root path (auto-detect if omitted)") - args = p.parse_args() - - result: dict = {"ok": False} - - # --- 1. Find vault --- - if args.vault: - vault = Path(args.vault).resolve() - pf_json = vault / "paperforge.json" - if not pf_json.exists(): - result["error"] = f"paperforge.json not found at {vault}" - json.dump(result, sys.stdout, ensure_ascii=False) - sys.exit(0) - else: - pf_json = _find_paperforge_json(Path.cwd()) - if pf_json is None: - result["error"] = "paperforge.json not found from CWD upward. Set --vault." - json.dump(result, sys.stdout, ensure_ascii=False) - sys.exit(0) - vault = pf_json.parent - - result["vault_root"] = str(vault) - - # --- 2. Read config --- - try: - cfg = _read_pf_config(pf_json) - except Exception as e: - result["error"] = f"Cannot read paperforge.json: {e}" - json.dump(result, sys.stdout, ensure_ascii=False) - sys.exit(0) - - system_dir = cfg.get("system_dir", "System") - resources_dir = cfg.get("resources_dir", "Resources") - literature_dir = cfg.get("literature_dir", "Literature") - - # --- 3. 
Build paths from config --- - pf_root = vault / system_dir / "PaperForge" - - paths = { - "literature_dir": str(vault / resources_dir / literature_dir), - "index_path": str(pf_root / "indexes" / "formal-library.json"), - "ocr_dir": str(pf_root / "ocr"), - "exports_dir": str(pf_root / "exports"), - } - result["paths"] = paths - - # --- 4. List domains --- - lit_dir = Path(paths["literature_dir"]) - domains = sorted( - [d.name for d in lit_dir.iterdir() if d.is_dir()] - ) if lit_dir.exists() else [] - result["domains"] = domains - - # --- 5. Index summary --- - index_path = Path(paths["index_path"]) - index_summary: dict[str, int] = {} - if index_path.exists(): - try: - data = json.loads(index_path.read_text(encoding="utf-8")) - items = data.get("items", []) - if isinstance(items, dict): - items = items.values() - for item in items: - d = item.get("domain", "unknown") - index_summary[d] = index_summary.get(d, 0) + 1 - except Exception: - pass - result["index_summary"] = index_summary - - # --- 6. Find Python that has paperforge (best effort) --- - result["python_candidate"] = _find_python_with_paperforge(vault, cfg) - - result["ok"] = True - json.dump(result, sys.stdout, ensure_ascii=False, indent=2) - - -if __name__ == "__main__": - main() diff --git a/paperforge/skills/methodology/SKILL.md b/paperforge/skills/methodology/SKILL.md deleted file mode 100644 index 7f08ef4..0000000 --- a/paperforge/skills/methodology/SKILL.md +++ /dev/null @@ -1,86 +0,0 @@ ---- -name: methodology -description: > - Extract reusable methodology from project work logs. Triggered by: - methodology, /methodology, 提取方法论, 存档写作规律, - 总结本项目方法, 提取可复用规则, 提取写作规律. -source: paperforge ---- - -# Methodology Extract - ---- - -## 1. Bootstrap - -```python $SKILL_DIR/scripts/pf_bootstrap.py``` - -Remember: `$VAULT`, `$PYTHON`. - ---- - -## 2. Determine Project - -Ask user: which project to extract methodology from? 
- -If user doesn't specify, scan `Project/` directory for complete working-log.md files and list them. - ---- - -## 3. Read working-log - -Read `/Project//working-log.md`. - ---- - -## 4. Identify Extractable Patterns - -Scan the working-log for these signals: - -| Signal in working-log | Extract to | -|----------------------|-------------| -| "弯路" + "修正" or "教训" sections | Pattern rules | -| "最终逻辑:" or "最终结构:" | Section templates | -| "复用" keyword + methodology block | Reusable practices | -| Cross-study audit sections (跨研究可比性) | Analysis methodology | -| "methodology" header sections | Full methodology block | -| Review feedback patterns (审阅/修正) | Writing checklists | - -For each found pattern, classify into one of: -- `review-writing` — 综述写作 framework design, gap analysis, cross-study audit -- `argument-writing` — 段落写作, 参数框架, 论证结构 -- `analysis-methods` — 文献审计, 跨研究比较, 参数提取 -- `general` — fallback - ---- - -## 5. Present and Confirm - -For each extracted pattern, show: -- Category -- Source (working-log section number) -- Brief summary (1-2 sentences) - -Ask user to confirm/edit before writing. - ---- - -## 6. Write Methodology Files - -Write confirmed patterns to `/PaperForge/methodologies/.md`. - -If file exists, APPEND (do not overwrite). - -Format per method: -``` -## -**Category:** -**Source:** Project//working-log.md Section X.Y -**Extracted:** YYYY-MM-DD - -### Pattern - - -### Example - -``` diff --git a/paperforge/skills/paperforge/SKILL.md b/paperforge/skills/paperforge/SKILL.md new file mode 100644 index 0000000..2e42cd9 --- /dev/null +++ b/paperforge/skills/paperforge/SKILL.md @@ -0,0 +1,113 @@ +--- +name: paperforge +description: > + Research Memory Runtime — 文献搜索、精读、问答、阅读笔记、 + 工作记录、方法论提取。Triggered by: + pf-deep pf-paper pf-sync pf-ocr pf-status, + "精读" "找文献" "搜文献" "文献问答" "读一下" "看看这篇" + "讨论" "记录阅读" "记录工作" "总结会话" "提取方法论". 
+source: paperforge +--- + +# PaperForge — Research Memory Runtime + +PaperForge 将文献、阅读痕迹、工作过程、方法论和产物 +组织成可检索、可复核、可由 agent 调用的研究记忆。 + +--- + +## 1. Bootstrap — 必须先执行 + +```bash +python $SKILL_DIR/scripts/pf_bootstrap.py --vault "$VAULT" +``` + +返回 JSON。记录以下变量(所有 workflow 文件继承,不再重复声明): + +| 变量 | JSON 字段 | 用途 | +| ------------- | ----------------------- | ------------------------------ | +| `$VAULT` | `vault_root` | 所有 `--vault` 参数 | +| `$PYTHON` | `python_candidate` | 所有 `python -m paperforge` 调用 | +| `$LIT_DIR` | `paths.literature_dir` | 文献笔记根目录 | +| `$SKILL_DIR` | 平台注入 | 脚本路径 | +| `$METHODS` | `methodology_index` | 可用方法论索引 | + +如果 `ok: false`,报告 `error` 给用户,**停止。禁止自己拼路径。** + +--- + +## 2. Agent Context — bootstrap 成功后执行 + +```bash +$PYTHON -m paperforge agent-context --json --vault "$VAULT" +``` + +返回 library overview、collection tree、可用命令和规则。Agent 注入为会话上下文。 + +--- + +## 3. Methodology Index — bootstrap 自动提供 + +bootstrap 已返回 `methodology_index`(从 `System/PaperForge/methodology/archive/` 扫描)。 +Agent 在需要时自行读取对应卡片(`read System/PaperForge/methodology/archive/.md`)。 + +--- + +## 4. Reading-Log Safety Rule — 全局规则,所有 workflow 必须遵守 + +Reading-log 不是事实源。它记录的是**之前的关注点、解读和预期用途**。 + +当存在 prior reading-log 时: +1. 用它决定**优先复查什么**,不是用它回答用户问题 +2. 重新打开**原文/图表/表格**,核实之前的解读 +3. 确认的,说明"已回原文复核" +4. 被推翻的,创建 correction note +5. **绝对禁止**仅根据 reading-log 内容回答事实性问题 + +--- + +## 5. 意图路由 + +用户输入对应唯一一个 workflow 文件(打开并执行其完整流程): + +| 用户说 | 打开 | +| -------------------------------------------------------- | -------------------------------- | +| "找文献" "搜" "库里有没有XX" "collection 里关于YY" | `workflows/paper-search.md` | +| "精读 " "/pf-deep" "三阶段阅读" | `workflows/deep-reading.md` | +| "读一下" "看看" "讨论" "/pf-paper" " 这篇讲了什么" | `workflows/paper-qa.md` | +| "记一下" "记录阅读" "reading log" "读完这段记一下" | `workflows/reading-log.md` | +| "总结会话" "工作记录" "项目记录" "project log" "记决策" | `workflows/project-log.md` | +| "提取方法论" "总结规律" "存档写作规律" | `workflows/methodology.md` | +| 不确定 / 空输入 | 问用户:搜文献、精读、问答、记笔记、记工作、提方法论? 
| + +路由后如用户切换意图,重新判断并打开对应 workflow。 + +--- + +## 6. 全局禁止规则 + +- **禁止自行拼接文件路径**。所有路径从 bootstrap 或 paper-context 获取。 +- **禁止绕过 CLI 直接操作文件**。搜索用 `$PYTHON -m paperforge search`,不用 glob/grep 扫库。 +- **禁止在未完成 paper-context 检查前读取原文**(适用于 deep-reading、paper-qa)。 + +--- + +## 文件结构 + +``` +paperforge/ +├── SKILL.md ← 本文件(compound:启动注入 + 路由 + 全局规则) +├── workflows/ ← molecules:原子序列 + 分支条件 +│ ├── paper-search.md +│ ├── deep-reading.md +│ ├── paper-qa.md +│ ├── reading-log.md +│ ├── project-log.md +│ └── methodology.md +├── references/ ← 共享参考 +│ ├── chart-reading/ ← 19 种图表阅读指南 +│ └── method-card-template.md +└── scripts/ ← 脚本 atoms + ├── pf_bootstrap.py + └── pf_deep.py +``` diff --git "a/paperforge/skills/literature-qa/references/chart-reading/GSEA\345\257\214\351\233\206\345\233\276.md" "b/paperforge/skills/paperforge/references/chart-reading/GSEA\345\257\214\351\233\206\345\233\276.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/GSEA\345\257\214\351\233\206\345\233\276.md" rename to "paperforge/skills/paperforge/references/chart-reading/GSEA\345\257\214\351\233\206\345\233\276.md" diff --git a/paperforge/skills/literature-qa/references/chart-reading/INDEX.md b/paperforge/skills/paperforge/references/chart-reading/INDEX.md similarity index 100% rename from paperforge/skills/literature-qa/references/chart-reading/INDEX.md rename to paperforge/skills/paperforge/references/chart-reading/INDEX.md diff --git "a/paperforge/skills/literature-qa/references/chart-reading/ROC\344\270\216PR\346\233\262\347\272\277.md" "b/paperforge/skills/paperforge/references/chart-reading/ROC\344\270\216PR\346\233\262\347\272\277.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/ROC\344\270\216PR\346\233\262\347\272\277.md" rename to "paperforge/skills/paperforge/references/chart-reading/ROC\344\270\216PR\346\233\262\347\272\277.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/Western 
Blot\346\235\241\345\270\246\345\233\276.md" "b/paperforge/skills/paperforge/references/chart-reading/Western Blot\346\235\241\345\270\246\345\233\276.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/Western Blot\346\235\241\345\270\246\345\233\276.md" rename to "paperforge/skills/paperforge/references/chart-reading/Western Blot\346\235\241\345\270\246\345\233\276.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\345\205\215\347\226\253\350\215\247\345\205\211\345\256\232\351\207\217\345\233\276.md" "b/paperforge/skills/paperforge/references/chart-reading/\345\205\215\347\226\253\350\215\247\345\205\211\345\256\232\351\207\217\345\233\276.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\345\205\215\347\226\253\350\215\247\345\205\211\345\256\232\351\207\217\345\233\276.md" rename to "paperforge/skills/paperforge/references/chart-reading/\345\205\215\347\226\253\350\215\247\345\205\211\345\256\232\351\207\217\345\233\276.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\346\212\230\347\272\277\345\233\276\344\270\216\346\227\266\351\227\264\345\272\217\345\210\227.md" "b/paperforge/skills/paperforge/references/chart-reading/\346\212\230\347\272\277\345\233\276\344\270\216\346\227\266\351\227\264\345\272\217\345\210\227.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\346\212\230\347\272\277\345\233\276\344\270\216\346\227\266\351\227\264\345\272\217\345\210\227.md" rename to "paperforge/skills/paperforge/references/chart-reading/\346\212\230\347\272\277\345\233\276\344\270\216\346\227\266\351\227\264\345\272\217\345\210\227.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\346\225\243\347\202\271\345\233\276\344\270\216\346\260\224\346\263\241\345\233\276.md" 
"b/paperforge/skills/paperforge/references/chart-reading/\346\225\243\347\202\271\345\233\276\344\270\216\346\260\224\346\263\241\345\233\276.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\346\225\243\347\202\271\345\233\276\344\270\216\346\260\224\346\263\241\345\233\276.md" rename to "paperforge/skills/paperforge/references/chart-reading/\346\225\243\347\202\271\345\233\276\344\270\216\346\260\224\346\263\241\345\233\276.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\346\230\276\345\276\256\347\205\247\347\211\207\344\270\216SEM\345\233\276.md" "b/paperforge/skills/paperforge/references/chart-reading/\346\230\276\345\276\256\347\205\247\347\211\207\344\270\216SEM\345\233\276.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\346\230\276\345\276\256\347\205\247\347\211\207\344\270\216SEM\345\233\276.md" rename to "paperforge/skills/paperforge/references/chart-reading/\346\230\276\345\276\256\347\205\247\347\211\207\344\270\216SEM\345\233\276.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\346\235\241\345\275\242\345\233\276\344\270\216\350\257\257\345\267\256\346\243\222.md" "b/paperforge/skills/paperforge/references/chart-reading/\346\235\241\345\275\242\345\233\276\344\270\216\350\257\257\345\267\256\346\243\222.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\346\235\241\345\275\242\345\233\276\344\270\216\350\257\257\345\267\256\346\243\222.md" rename to "paperforge/skills/paperforge/references/chart-reading/\346\235\241\345\275\242\345\233\276\344\270\216\350\257\257\345\267\256\346\243\222.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\346\241\221\345\237\272\345\233\276\344\270\216\345\274\246\345\233\276.md" 
"b/paperforge/skills/paperforge/references/chart-reading/\346\241\221\345\237\272\345\233\276\344\270\216\345\274\246\345\233\276.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\346\241\221\345\237\272\345\233\276\344\270\216\345\274\246\345\233\276.md" rename to "paperforge/skills/paperforge/references/chart-reading/\346\241\221\345\237\272\345\233\276\344\270\216\345\274\246\345\233\276.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\346\243\256\346\236\227\345\233\276\344\270\216Meta\345\210\206\346\236\220.md" "b/paperforge/skills/paperforge/references/chart-reading/\346\243\256\346\236\227\345\233\276\344\270\216Meta\345\210\206\346\236\220.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\346\243\256\346\236\227\345\233\276\344\270\216Meta\345\210\206\346\236\220.md" rename to "paperforge/skills/paperforge/references/chart-reading/\346\243\256\346\236\227\345\233\276\344\270\216Meta\345\210\206\346\236\220.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\347\201\253\345\261\261\345\233\276\344\270\216\346\233\274\345\223\210\351\241\277\345\233\276.md" "b/paperforge/skills/paperforge/references/chart-reading/\347\201\253\345\261\261\345\233\276\344\270\216\346\233\274\345\223\210\351\241\277\345\233\276.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\347\201\253\345\261\261\345\233\276\344\270\216\346\233\274\345\223\210\351\241\277\345\233\276.md" rename to "paperforge/skills/paperforge/references/chart-reading/\347\201\253\345\261\261\345\233\276\344\270\216\346\233\274\345\223\210\351\241\277\345\233\276.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\347\203\255\345\233\276\344\270\216\350\201\232\347\261\273\345\233\276.md" 
"b/paperforge/skills/paperforge/references/chart-reading/\347\203\255\345\233\276\344\270\216\350\201\232\347\261\273\345\233\276.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\347\203\255\345\233\276\344\270\216\350\201\232\347\261\273\345\233\276.md" rename to "paperforge/skills/paperforge/references/chart-reading/\347\203\255\345\233\276\344\270\216\350\201\232\347\261\273\345\233\276.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\347\224\237\345\255\230\346\233\262\347\272\277.md" "b/paperforge/skills/paperforge/references/chart-reading/\347\224\237\345\255\230\346\233\262\347\272\277.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\347\224\237\345\255\230\346\233\262\347\272\277.md" rename to "paperforge/skills/paperforge/references/chart-reading/\347\224\237\345\255\230\346\233\262\347\272\277.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\347\256\261\345\274\217\345\233\276\344\270\216\345\260\217\346\217\220\347\220\264\345\233\276.md" "b/paperforge/skills/paperforge/references/chart-reading/\347\256\261\345\274\217\345\233\276\344\270\216\345\260\217\346\217\220\347\220\264\345\233\276.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\347\256\261\345\274\217\345\233\276\344\270\216\345\260\217\346\217\220\347\220\264\345\233\276.md" rename to "paperforge/skills/paperforge/references/chart-reading/\347\256\261\345\274\217\345\233\276\344\270\216\345\260\217\346\217\220\347\220\264\345\233\276.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\347\273\204\347\273\207\345\255\246\345\215\212\345\256\232\351\207\217\345\233\276.md" "b/paperforge/skills/paperforge/references/chart-reading/\347\273\204\347\273\207\345\255\246\345\215\212\345\256\232\351\207\217\345\233\276.md" similarity index 100% rename from 
"paperforge/skills/literature-qa/references/chart-reading/\347\273\204\347\273\207\345\255\246\345\215\212\345\256\232\351\207\217\345\233\276.md" rename to "paperforge/skills/paperforge/references/chart-reading/\347\273\204\347\273\207\345\255\246\345\215\212\345\256\232\351\207\217\345\233\276.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\347\275\221\347\273\234\345\233\276\344\270\216\351\200\232\350\267\257\345\233\276.md" "b/paperforge/skills/paperforge/references/chart-reading/\347\275\221\347\273\234\345\233\276\344\270\216\351\200\232\350\267\257\345\233\276.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\347\275\221\347\273\234\345\233\276\344\270\216\351\200\232\350\267\257\345\233\276.md" rename to "paperforge/skills/paperforge/references/chart-reading/\347\275\221\347\273\234\345\233\276\344\270\216\351\200\232\350\267\257\345\233\276.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\350\233\213\347\231\275\350\264\250\347\273\223\346\236\204\345\233\276.md" "b/paperforge/skills/paperforge/references/chart-reading/\350\233\213\347\231\275\350\264\250\347\273\223\346\236\204\345\233\276.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\350\233\213\347\231\275\350\264\250\347\273\223\346\236\204\345\233\276.md" rename to "paperforge/skills/paperforge/references/chart-reading/\350\233\213\347\231\275\350\264\250\347\273\223\346\236\204\345\233\276.md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\351\231\215\347\273\264\345\233\276(PCA-tSNE-UMAP).md" "b/paperforge/skills/paperforge/references/chart-reading/\351\231\215\347\273\264\345\233\276(PCA-tSNE-UMAP).md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\351\231\215\347\273\264\345\233\276(PCA-tSNE-UMAP).md" rename to 
"paperforge/skills/paperforge/references/chart-reading/\351\231\215\347\273\264\345\233\276(PCA-tSNE-UMAP).md" diff --git "a/paperforge/skills/literature-qa/references/chart-reading/\351\233\267\350\276\276\345\233\276\344\270\216\346\274\217\346\226\227\345\233\276.md" "b/paperforge/skills/paperforge/references/chart-reading/\351\233\267\350\276\276\345\233\276\344\270\216\346\274\217\346\226\227\345\233\276.md" similarity index 100% rename from "paperforge/skills/literature-qa/references/chart-reading/\351\233\267\350\276\276\345\233\276\344\270\216\346\274\217\346\226\227\345\233\276.md" rename to "paperforge/skills/paperforge/references/chart-reading/\351\233\267\350\276\276\345\233\276\344\270\216\346\274\217\346\226\227\345\233\276.md" diff --git a/paperforge/skills/paperforge/references/method-card-template.md b/paperforge/skills/paperforge/references/method-card-template.md new file mode 100644 index 0000000..0ae4add --- /dev/null +++ b/paperforge/skills/paperforge/references/method-card-template.md @@ -0,0 +1,39 @@ +# Method Card Template + +复制此模板创建新的方法论卡片。 + +--- + + + + +# <标题:简短、可搜索> + +## Use when + + +## Procedure + + +1. <步骤 1> +2. <步骤 2> +3. 
<步骤 3> + +## Watch-outs + + +- <注意 1> +- <注意 2> + +## Example + + +--- + diff --git a/paperforge/skills/methodology/scripts/pf_bootstrap.py b/paperforge/skills/paperforge/scripts/pf_bootstrap.py similarity index 79% rename from paperforge/skills/methodology/scripts/pf_bootstrap.py rename to paperforge/skills/paperforge/scripts/pf_bootstrap.py index ebc41b2..741a677 100644 --- a/paperforge/skills/methodology/scripts/pf_bootstrap.py +++ b/paperforge/skills/paperforge/scripts/pf_bootstrap.py @@ -18,7 +18,11 @@ }, "domains": ["domain1", "domain2"], "index_summary": {"domain1": 120, "domain2": 80}, - "python_candidate": "D:\\...\\python.exe" // Python that has paperforge, or null + "python_candidate": "D:\\...\\python.exe", + "methodology_index": [ + {"id": "parameter-window-audit", "description": "比较多个研究的参数和剂量反应"}, + ... + ] } If anything fails: ok=false, error explains why. @@ -81,6 +85,39 @@ def _find_python_with_paperforge(vault: Path, pf_cfg: dict) -> str | None: return None +def _scan_methodology_archive(pf_root: Path) -> list[dict]: + """Scan methodology archive directory for available method cards.""" + archive_dir = pf_root / "methodology" / "archive" + if not archive_dir.exists(): + return [] + + methods = [] + for f in sorted(archive_dir.glob("*.md")): + try: + text = f.read_text(encoding="utf-8") + # Extract first heading as title, first paragraph after "Use when" as description + title = "" + description = "" + in_use_when = False + for line in text.split("\n"): + stripped = line.strip() + if stripped.startswith("# ") and not title: + title = stripped.lstrip("# ").strip() + elif stripped.startswith("## Use when"): + in_use_when = True + elif in_use_when and stripped and not stripped.startswith("#"): + description = stripped + in_use_when = False + methods.append({ + "id": f.stem, + "title": title or f.stem, + "description": description or "(no description)", + }) + except Exception: + continue + return methods + + def main(): import argparse p = 
argparse.ArgumentParser(description="PaperForge bootstrap") @@ -180,6 +217,9 @@ def main(): pass result["memory_layer"] = memory_layer + # --- 8. Scan methodology archive --- + result["methodology_index"] = _scan_methodology_archive(pf_root) + result["ok"] = True json.dump(result, sys.stdout, ensure_ascii=False, indent=2) diff --git a/paperforge/skills/literature-qa/scripts/ld_deep.py b/paperforge/skills/paperforge/scripts/pf_deep.py similarity index 100% rename from paperforge/skills/literature-qa/scripts/ld_deep.py rename to paperforge/skills/paperforge/scripts/pf_deep.py diff --git a/paperforge/skills/paperforge/workflows/deep-reading.md b/paperforge/skills/paperforge/workflows/deep-reading.md new file mode 100644 index 0000000..6d1852e --- /dev/null +++ b/paperforge/skills/paperforge/workflows/deep-reading.md @@ -0,0 +1,169 @@ +# deep-reading + +Keshav 三阶段精读。在 formal note 中写入结构化的 `## 精读` 区域。 + +--- + +## 前置检查 + +### Step 0: paper-context(必须) + +```bash +$PYTHON -m paperforge paper-context --json --vault "$VAULT" +``` + +检查返回 JSON: +- `ok: false` → 报告 `error.message`,停止 +- `data.paper.ocr_status != "done"` → "OCR 未完成,请先运行 paperforge ocr",停止 +- `data.paper.analyze != true` → "analyze 未开启,请在 formal note frontmatter 中设为 true",停止 + +**检查 prior_notes:** +- 如果存在 `data.prior_notes`,逐条看 `verified` 字段 +- `verified: false` 的条目记入 recheck_targets,精读时必须回原文复核这些位置 +- `verified: true` 的条目可以信任,但标注"之前已验证" + +**记录关键路径:** +- `data.paper.note_path`(formal note 路径) +- `data.paper.fulltext_path`(fulltext 路径) +- 记下 `recheck_targets` 列表 + +--- + +## 执行流程 + +### Step 1: Prepare(跑脚本) + +```bash +$PYTHON "$SKILL_DIR/scripts/pf_deep.py" prepare --key --vault "$VAULT" +``` + +解析返回 JSON: +- `status: "ok"` → 记下 `figure_map`、`chart_type_map`、`formal_note`、`fulltext_md`、`figures`、`tables` 的路径和数量 +- `status: "warn"` + `deep_reading_status: done` → 告知用户"该文献已精读过",确认是否重读 +- `status: "error"` → 报告 `message`,停止 + +读 formal note,确认 `## 精读` 骨架已插入。 + +--- + +### Step 2: Pass 1 — 概览 + +只填 `### Pass 1: 
概览`。不碰 Pass 2/3。 + +填写内容必须来自原文,不可推断: + +- **一句话总览**:论文类型 + 核心发现,一句话 +- **5 Cs 快速评估**: + - Category(RCT / 队列 / 综述 / 基础研究等) + - Context(领域共识,本文要解决什么) + - Correctness(初步直觉,逻辑有否明显漏洞) + - Contributions(1-3 条) + - Clarity(写作质量,图表可读性) +- **Figure 导读**(基于 fulltext 浏览各图 caption): + - 关键主图:列出,一句话概括要证明什么 + - 证据转折点:哪个 figure 是叙事关键转折 + - 需要重点展开的 supplementary + - 关键表格 + +填完立即保存。 + +--- + +### Step 3: Pass 2 — 精读还原 + +填 `### Pass 2: 精读还原`。**按 figure 顺序逐个处理**。 + +每处理完一个 figure 立即保存。 + +#### 图表类型定位(两步) + +**A: 读 chart-type-map**(prepare 输出中包含该路径)。这是关键词命中建议。 + +**B: Agent 读 caption 做最终判断** +1. 读该 figure 的 caption(来自 fulltext) +2. 打开 `references/chart-reading/INDEX.md`,对照 caption 内容判断图表类型 +3. chart-type-map 建议和 Agent 判断不一致时 → 以 Agent 判断为准 +4. 无法确定类型 → 跳过 chart guide,按通用结构分析 +5. 确定类型 → 读对应 chart-reading 指南,按指南中的检查清单分析 + +#### 每张 Figure 的子标题(固定,不可跳过) + +``` +**图像定位与核心问题**:页码 + 要回答什么问题 +**方法与结果**:实验设计 / 数据来源 / 技术手段;核心数据、趋势、对比 +**图表质量审查**:按 chart-reading 指南检查坐标轴、单位、误差棒、统计标注 +**作者解释**:作者在正文中对该图的解读 +**我的理解**:自己的理解(必须与作者解释做明显区分) +**疑点/局限**:用 `> [!warning]` 突出 +``` + +#### 每张 Table 的子标题(简化版) + +``` +回答什么问题、关键字段/分组、主要结果、我的理解、疑点/局限 +``` + +#### 所有 figure/table 处理完后 + +**关键方法补课**:简要解释不熟悉的实验技术(1-2 项) + +**主要发现与新意**: +- 发现 1:...(来源:Figure X) +- 发现 2:...(来源:Table Y) +- 每条发现必须标注来源(Figure 编号或正文段落) + +--- + +### Step 4: Postprocess(跑校验,修正问题) + +```bash +$PYTHON "$SKILL_DIR/scripts/pf_deep.py" postprocess-pass2 "" --figures --vault "$VAULT" +``` + +- 输出 `OK` → 继续 Step 5 +- 输出错误列表(含行号)→ 按提示修正,修正后重新跑 +- 最多 3 轮修正。3 轮后仍失败 → 报告剩余错误给用户 + +--- + +### Step 5: Pass 3 — 深度理解 + +填 `### Pass 3: 深度理解`。基于 Pass 1/2 已写内容。 + +- **假设挑战与隐藏缺陷**:隐含假设;放宽假设后结论还成立吗;缺少的关键引用;实验/分析技术潜在问题 +- **哪些结论扎实,哪些仍存疑**: + - 较扎实:... 
+ - 仍存疑:...(用 `> [!warning]`) +- **Discussion 与 Conclusion 怎么读**:作者实际完成了什么;哪些有拔高;哪些是推测 +- **对我的启发**:研究设计、figure 组织、方法组合、未来工作 +- **遗留问题**:...(用 `> [!question]`) + +--- + +### Step 6: Final Validation + +```bash +$PYTHON "$SKILL_DIR/scripts/pf_deep.py" validate-note "" --fulltext "" +``` + +- 输出 `OK` → 告知用户精读完成 +- 输出错误 → 修正缺失项,直到通过 + +--- + +## Callout 格式规则 + +- `> [!important]` — 每个 main finding +- `> [!warning]` — 疑问、局限、证据边界、仍存疑条目 +- `> [!question]` — 遗留问题 +- **相邻 callout 之间必须有空行**(否则 Obsidian 合并): + - 正确:`> [!important] A\n\n> [!important] B` + - 错误:`> [!important] A\n> [!important] B` + +--- + +## 禁止 + +- 不要在 Pass 1 完成前碰 Pass 2/3 +- 不要把推断写成文献事实——区分"作者说了 X"和"我推断 Y" +- 不要跨 figure 写综合判断(Pass 2 逐图,Pass 3 才做综合) diff --git a/paperforge/skills/paperforge/workflows/methodology.md b/paperforge/skills/paperforge/workflows/methodology.md new file mode 100644 index 0000000..f7a1fb9 --- /dev/null +++ b/paperforge/skills/paperforge/workflows/methodology.md @@ -0,0 +1,94 @@ +# methodology + +从 project-log 中提取可复用方法论,按 method-card 模板写入 methodology archive。 +不 append 到大文件,每张卡片独立保存。 + +--- + +## 前置条件 + +- bootstrap 已完成 +- 有 project-log 记录可读取 + +--- + +## 步骤 + +### Step 1: 确定项目和来源 + +询问用户从哪个项目提取。如用户未指定,列出有 project-log 的项目。 + +### Step 2: 读取 project-log + +```bash +$PYTHON -m paperforge project-log --list "" --json --vault "$VAULT" +``` + +扫描其中以下信号: + +| log 中的信号 | 可提取为 | +| ------------------------- | --------------------------- | +| `detours` 中的教训 | 方法论规则 | +| `reusable` 字段里的内容 | 直接采用 | +| `decisions` 中的重要选择 | 决策原则 | +| 跨文献审计/比较分析 | 审计方法论 | +| 写作修正/审阅反馈 | 写作检查清单 | + +### Step 3: 识别可提取 pattern + +对每个 pattern 分类: +- `review-writing` — 综述框架设计、gap 分析、跨研究审计 +- `argument-writing` — 段落写作、论证结构 +- `analysis-methods` — 文献审计、跨研究比较、参数提取 +- `general` — fallback + +### Step 4: 按 method-card 模板生成卡片 + +打开 `references/method-card-template.md` 确认模板格式。 + +对每个 pattern 生成一张卡片,展示给用户确认。格式: + +```markdown +--- +id: +tags: [, ] +source_project: +status: active +--- + +# <标题> + +## Use when +<什么时候应该用这个方法> + 
+## Procedure +1. <步骤 1> +2. <步骤 2> +... + +## Watch-outs +- <注意事项 1> +- <注意事项 2> + +## Example +<来自项目的具体例子> +``` + +### Step 5: 用户确认后写入 + +将每张卡片写入: + +``` +System/PaperForge/methodology/archive/.md +``` + +用 `write` 工具创建文件。如已存在同名文件,追加到末尾(用 `---` 分隔)。 +不自动覆盖已有内容。 + +--- + +## 禁止 + +- 不要提取太泛的"教训"(如"多读文献")——必须有具体的 Procedure 步骤 +- 不要创建超过 4 张卡片/次——优先最可复用的 +- 不要在用户确认前写入 diff --git a/paperforge/skills/paperforge/workflows/paper-qa.md b/paperforge/skills/paperforge/workflows/paper-qa.md new file mode 100644 index 0000000..6785d77 --- /dev/null +++ b/paperforge/skills/paperforge/workflows/paper-qa.md @@ -0,0 +1,105 @@ +# paper-qa + +交互式文献问答。不强制要求 OCR,但 OCR 完成后回答更准确。 + +每次问答记录到 `discussion.json`(Dashboard 可见)。 + +--- + +## 前置条件 + +- bootstrap 已完成(有 `$VAULT`、`$PYTHON`) + +--- + +## 步骤 + +### Step 1: 定位论文 + +用户可能给 zotero_key、DOI、标题片段、作者+年份。按以下方式查找: + +**优先用 paper-context(一次拿到全部信息):** + +```bash +$PYTHON -m paperforge paper-context --json --vault "$VAULT" +``` + +返回 JSON 包含 paper 元数据、OCR 状态、prior_notes 等。 + +**paper-context 无结果时的备选:** + +```bash +$PYTHON -m paperforge search "" --json --vault "$VAULT" --limit 5 +``` + +如果多候选,列出让用户选(同 paper-search 的 Step 4-5 格式)。 +如果无结果,告知用户并停止。 + +### Step 2: 加载文献内容 + +1. 从 paper-context 或 formal note frontmatter 获取:标题、作者、期刊、年份、domain +2. 读 `fulltext.md`(如果 OCR done)作为主要回答依据 +3. 如果 fulltext 不存在:"OCR 文本不可用,回答将基于元数据和公开信息" + +### Step 3: 展示论文信息 + 进入 Q&A + +``` +已加载: (<year>, <journal>) +作者: <authors> | Key: <zotero_key> | 领域: <domain> +OCR: done / 不可用 +结束对话时说"保存"即可保存讨论。 +请问有什么问题? 
+``` + +### Step 4: Q&A 循环 + +- 等待用户提问 +- 每次回答后等待下一个问题 +- 持续到用户说"保存"、"结束"、"完成" + +**回答原则:** +- 严格基于 fulltext.md 中的文本内容 +- 引用原文时标注来源页码/章节 +- 论文未提及的内容明确说明"论文中未提及" +- 区分"文献说了什么"和"我推断什么" + +### Step 5: 保存讨论 + +用户说"保存"、"结束"、"完成"时执行。 + +**收集 Q&A 对**,序列化为 JSON 数组: + +```json +[ + { + "question": "用户的问题", + "answer": "Agent 的回答", + "source": "user_question", + "timestamp": "2026-05-14T12:00:00+08:00" + } +] +``` + +`source`: `"user_question"`(用户提问)或 `"agent_analysis"`(Agent 主动分析)。 + +**调 discussion 模块:** + +```bash +$PYTHON -m paperforge.worker.discussion record <zotero_key> \ + --vault "$VAULT" \ + --agent pf-paper \ + --model "<current_model>" \ + --qa-pairs '<JSON_ARRAY>' +``` + +- 返回 `ok` → 告知用户已保存 +- 返回 `error` → 重试一次,仍失败则告知用户 + +**不要自动保存。** 仅用户明确要求时执行。 + +--- + +## 禁止 + +- 不要捏造论文未提及的内容 +- 不要把推断写成论文事实 diff --git a/paperforge/skills/paperforge/workflows/paper-search.md b/paperforge/skills/paperforge/workflows/paper-search.md new file mode 100644 index 0000000..285ce55 --- /dev/null +++ b/paperforge/skills/paperforge/workflows/paper-search.md @@ -0,0 +1,101 @@ +# paper-search + +从文献库中按条件检索文献,返回候选清单及每篇的可用状态。 + +--- + +## 前置条件 + +- bootstrap 已完成(有 `$VAULT`、`$PYTHON`、`$LIT_DIR`) + +--- + +## 步骤 + +### Step 1: 解析用户搜索意图 + +提取以下信息(缺什么就问用户): +- **搜索词**:关键词、作者名、年份 +- **范围**:domain(如"骨科")、collection(如"DC")、不指定=全库 +- **过滤条件**:OCR 状态(done/pending)、年份范围(--year-from/--year-to)、lifecycle + +### Step 2: 执行搜索 + +```bash +$PYTHON -m paperforge search <query> --json --vault "$VAULT" --limit 15 \ + [--domain "<domain>"] \ + [--year-from <N>] [--year-to <N>] \ + [--ocr <done|pending>] \ + [--lifecycle <active|archived>] +``` + +返回 JSON 结构: +```json +{ + "ok": true, + "data": { + "query": "<query>", + "matches": [ + { + "zotero_key": "ABC12345", + "citation_key": "...", + "title": "...", + "year": "2024", + "first_author": "Smith", + "domain": "...", + "collection_path": "...", + "ocr_status": "done", + "deep_reading_status": "pending", + "lifecycle": "active", + "has_pdf": true, + "rank": 
"..." + } + ], + "count": 5 + } +} +``` + +- 如果 `ok: false` → 报告 `error.message`,问用户是否换搜索词 +- 如果 `data.count == 0` → 告知用户无结果,建议换词或扩大范围 +- 如果 `data.count > 0` → 进入 Step 3 + +### Step 3: 逐个确认状态(paper-context 原子) + +对每个 match,调 `paper-context` 获取更详细的可读状态: + +```bash +$PYTHON -m paperforge paper-context <zotero_key> --json --vault "$VAULT" +``` + +目的:拿到 `ocr_status`、`prior_notes` 数量、`analyze` 状态,帮助用户判断哪些可以直接读。 + +### Step 4: 展示候选清单 + +格式(每条一行): + +``` +找到 N 篇匹配 "<query>": + +[1] ABC12345 | Smith 2024 | Title Here | 骨科 | OCR: done | 精读: pending | 阅读笔记: 3 +[2] DEF67890 | Jones 2023 | Title Here | 骨科 | OCR: done | 精读: done | 阅读笔记: 0 +[3] GHI11111 | Wang 2022 | Title Here | 骨科 | OCR: pending | | 阅读笔记: 0 +``` + +关键字段:zotero_key, first_author, year, title, ocr_status, deep_reading_status, prior_notes 数量 + +### Step 5: 等用户选择后续操作 + +展示候选后不要自己决定下一步。等用户说: +- "读一下 [1]" → 路由到 paper-qa.md +- "精读 [2]" → 路由到 deep-reading.md +- "记一下 [1]" → 路由到 reading-log.md +- "缩小范围"/"refine" → 回到 Step 1,加更多过滤条件 + +--- + +## 禁止 + +- 不要在搜索结果中替用户决定读哪篇 +- 不要在搜索阶段读全文 +- 不要对 0 结果硬猜路径 diff --git a/paperforge/skills/paperforge/workflows/project-log.md b/paperforge/skills/paperforge/workflows/project-log.md new file mode 100644 index 0000000..96c5d63 --- /dev/null +++ b/paperforge/skills/paperforge/workflows/project-log.md @@ -0,0 +1,131 @@ +# project-log + +记录研究项目的会话总结、决策、弯路修正和方法论提取。 +Agent 按 JSON schema 写入 project-log.jsonl。 +系统自动渲染对应项目的 project-log.md。 + +--- + +## 前置条件 + +- bootstrap 已完成(有 `$VAULT`、`$PYTHON`) +- 已知 project 名称 + +--- + +## 项目日志 JSON Schema + +```json +{ + "id": "plog_<YYYYMMDD>_<序号>", + "project": "综述写作", + "date": "2026-05-14", + "type": "session_summary", + "title": "DC 段参数窗审计", + "decisions": ["做了 X,因为 Y"], + "detours": [ + { + "wrong": "错误方向", + "correction": "用户如何纠正", + "resolution": "最终方案" + } + ], + "reusable": ["可复用的方法论或教训"], + "todos": [ + {"content": "待办事项", "done": false} + ], + "related_papers": ["ABC12345"], + "tags": ["DC", "参数窗", "审计"], + "agent": "opencode" +} +``` + +| 字段 
| 必填 | 说明 | +| ---------------- | ---- | ------------------------------------------- | +| `id` | 是 | 自动生成 `plog_YYYYMMDD_NNN` | +| `project` | 是 | 项目名 | +| `date` | 是 | YYYY-MM-DD | +| `type` | 是 | `session_summary` / `decision` / `correction` / `milestone` / `note` | +| `title` | 是 | 本条目的简短标题 | +| `decisions` | 否 | 核心决策列表 | +| `detours` | 否 | 弯路与修正记录 | +| `reusable` | 否 | 可复用的方法论或教训 | +| `todos` | 否 | 待办事项 | +| `related_papers` | 否 | 相关 Zotero keys | +| `tags` | 否 | 分类标签 | +| `agent` | 否 | 记录者 | + +--- + +## 步骤 + +### Step 1: 确定 project + +从上下文获取。如果用户未指定,询问。 + +### Step 2: 回顾本次会话 + +回顾以下内容: +- 做了什么(核心决策) +- 用户纠正了什么(弯路与修正) +- 有什么可复用的方法论或教训 +- 待办事项 + +### Step 3: 按 Schema 组织内容,展示确认 + +展示给用户确认后再写入: + +``` +即将记录到 Project/综述写作/project-log.md: + 日期: 2026-05-14 + 类型: session_summary + 标题: DC 段参数窗审计完成 + 决策: + - 限定参数窗为 100Hz-1kHz + - 移除 AC vs DC 对比段落 + 弯路: + - 把推断当文献事实 → 用户要求逐句审计 → 5 处修正 + 可复用: + - 写完必须逐句过 source,区分"文献说了什么"和"我推断什么" + +确认写入?(y/n) +``` + +### Step 4: 写入(Atom) + +```bash +$PYTHON -m paperforge project-log --write \ + --vault "$VAULT" \ + --project "<project>" \ + --json '<payload>' +``` + +- 返回 `ok: true` → 确认写入成功。**自动渲染对应项目 markdown。** +- 返回 `ok: false` → 报告错误,重试一次 + +### Step 5: 确认渲染 + +```bash +$PYTHON -m paperforge project-log --render --project "<project>" --vault "$VAULT" +``` + +输出到 `Project/<project>/project-log.md`。 + +--- + +## type 参考 + +| type | 使用场景 | +| ----------------- | ------------------------------ | +| `session_summary` | 会话结束时的总结 | +| `decision` | 单独记录一个重要决策 | +| `correction` | 用户纠正了某个方向 | +| `milestone` | 项目里程碑 | +| `note` | 一般研究笔记 | + +--- + +## 禁止 + +- 不要在用户确认前写入 +- 不要只写"做了什么"而没有"弯路与修正"和"可复用方法论" diff --git a/paperforge/skills/paperforge/workflows/reading-log.md b/paperforge/skills/paperforge/workflows/reading-log.md new file mode 100644 index 0000000..19c1b3c --- /dev/null +++ b/paperforge/skills/paperforge/workflows/reading-log.md @@ -0,0 +1,112 @@ +# reading-log + +记录单条阅读笔记。Agent 将用户确认的信息按 JSON schema 写入 reading-log.jsonl。 
+系统自动渲染对应项目的 reading-log.md(给人看)并导入 paperforge.db(可搜索)。 + +--- + +## 前置条件 + +- bootstrap 已完成(有 `$VAULT`、`$PYTHON`) +- 已知 paper_id(zotero_key) + +--- + +## 阅读笔记 JSON Schema + +每条阅读笔记必须包含以下字段: + +```json +{ + "id": "rln_<YYYYMMDD>_<序号>", + "paper_id": "ABC12345", + "project": "综述写作", + "section": "Results Fig.3", + "excerpt": "原文关键句(逐字引用)", + "context": "包含 excerpt 的完整段落(供后续回原文复核时定位)", + "usage": "这个信息在当前写作中的用途", + "note": "注意事项 / 待核查 / 可能矛盾", + "tags": ["PEMF", "dose-response"], + "verified": false +} +``` + +| 字段 | 必填 | 说明 | +| --------- | ---- | -------------------------------------------------------- | +| `id` | 是 | 自动生成,格式 `rln_YYYYMMDD_NNN` | +| `paper_id` | 是 | Zotero key(8位大写字母数字) | +| `project` | 否 | 关联的研究项目 | +| `section` | 是 | 文献中的位置(如 "Results Fig.3"、"Discussion P12") | +| `excerpt` | 是 | 逐字引用的原文关键句 | +| `context` | 是 | 包含 excerpt 的完整段落,供复核定位 | +| `usage` | 是 | 这个信息在当前工作(写作/研究)中的用途 | +| `note` | 否 | 交叉验证、矛盾、待核查事项 | +| `tags` | 否 | 分类标签,供横切检索 | +| `verified` | 否 | 默认 false。Agent 回原文复核后应更新为 true | + +--- + +## 步骤 + +### Step 1: 确认 paper_id 和 project + +从上下文获取 zotero_key。如果用户未指定 project,询问或留空。 + +### Step 2: Agent 按 Schema 提取内容 + +从对话上下文中提取 `section`、`excerpt`、`context`、`usage`、`note`、`tags`。 + +**excerpt vs context 的区别:** +- `excerpt`:你关注的那一句(逐字引用) +- `context`:包含这句的完整段落(3-5 句),让以后的人不翻原文也能理解语境 + +### Step 3: 展示确认 + +先展示给用户确认,不要直接写入: + +``` +即将记录: + 文献: ABC12345 | Smith 2024 + 位置: Results Fig.3 + 原文: "..." + 用途: 支撑 PEMF 基质合成的论证 + 备注: 需核查是否对 DNA 归一化了 + 项目: 综述写作 + 标签: PEMF, GAG + 段落语境: "..." 
+ +确认写入?(y/n) +``` + +### Step 4: 写入(Atom) + +```bash +$PYTHON -m paperforge reading-log --write <paper_id> \ + --vault "$VAULT" \ + --section "<section>" \ + --excerpt "<excerpt>" \ + --context "<context>" \ + --usage "<usage>" \ + --note "<note>" \ + --project "<project>" \ + --tags "<tag1>,<tag2>" +``` + +- 返回 `ok: true` → 确认写入成功。**写入后自动渲染对应项目的 markdown。** +- 返回 `ok: false` → 报告错误,重试一次 + +### Step 5: 确认渲染 + +```bash +$PYTHON -m paperforge reading-log --render --project "<project>" --vault "$VAULT" +``` + +输出到 `Project/<project>/reading-log.md`。 + +--- + +## 禁止 + +- 不要在用户确认前写入 +- 不要把推断当作 `excerpt`(必须是原文逐字引用) +- 不要让 `context` 为空(必须是完整段落) diff --git a/paperforge/worker/status.py b/paperforge/worker/status.py index b27c031..fde05d0 100644 --- a/paperforge/worker/status.py +++ b/paperforge/worker/status.py @@ -639,36 +639,36 @@ def add_check(category: str, status: str, message: str, fix: str = "") -> None: if total_issues == 0: add_check("字段注册表", "pass", "所有 formal note frontmatter 与字段注册表一致") - ld_deep_script = paths.get("ld_deep_script") + pf_deep_script = paths.get("pf_deep_script") skill_dir = None - if ld_deep_script: - skill_dir = ld_deep_script.parent.parent + if pf_deep_script: + skill_dir = pf_deep_script.parent.parent if skill_dir and skill_dir.exists(): # Try actual importability check - ld_deep_import_ok = False + pf_deep_import_ok = False import_error = "" - if ld_deep_script and ld_deep_script.exists(): + if pf_deep_script and pf_deep_script.exists(): try: import importlib.util - spec = importlib.util.spec_from_file_location("ld_deep", ld_deep_script) + spec = importlib.util.spec_from_file_location("pf_deep", pf_deep_script) if spec and spec.loader: mod = importlib.util.module_from_spec(spec) spec.loader.exec_module(mod) - ld_deep_import_ok = True + pf_deep_import_ok = True except Exception as e: import_error = str(e) - if ld_deep_import_ok: - add_check("Agent 脚本", "pass", "paperforge and ld_deep importable") + if pf_deep_import_ok: + add_check("Agent 
脚本", "pass", "paperforge and pf_deep importable") else: add_check( "Agent 脚本", "warn", - f"literature-qa skill 目录存在但 import 失败: {import_error}", + f"paperforge skill 目录存在但 import 失败: {import_error}", "确认 agent_config_dir 配置正确并已运行 pip install -e .", ) else: - add_check("Agent 脚本", "warn", "literature-qa skill 目录未找到", "确认 agent_config_dir 配置正确") + add_check("Agent 脚本", "warn", "paperforge skill 目录未找到", "确认 agent_config_dir 配置正确") # --- Index Health section (Phase 25: derived from canonical index) --- try: diff --git a/paperforge/worker/update.py b/paperforge/worker/update.py index 3b4f22b..0ac69a5 100644 --- a/paperforge/worker/update.py +++ b/paperforge/worker/update.py @@ -252,7 +252,7 @@ def _deploy_all_skills(vault: Path) -> None: agent_key = config.get("agent_platform") or "opencode" result = deploy_skills(vault=vault, agent_key=agent_key, overwrite=True) if result["skill_deployed"]: - logger.info("已部署 literature-qa skill") + logger.info("已部署 paperforge skill") if result["agents_md"]: logger.info("已更新 AGENTS.md") for err in result.get("errors", []): From 5f07a8f7297750017ec7943845afed494f02c3f1 Mon Sep 17 00:00:00 2001 From: Research Assistant <research@example.com> Date: Thu, 14 May 2026 18:43:48 +0800 Subject: [PATCH 121/132] fix: reset nopdf OCR status to pending when paper re-enters queue --- paperforge/worker/ocr.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/paperforge/worker/ocr.py b/paperforge/worker/ocr.py index e01d4e2..3bf54ee 100644 --- a/paperforge/worker/ocr.py +++ b/paperforge/worker/ocr.py @@ -190,6 +190,8 @@ def sync_ocr_queue(paths: dict[str, Path], target_rows: list[dict]) -> list[dict status = str(meta.get("ocr_status", "pending") or "pending").strip().lower() if status in {"done", "blocked"}: continue + if status == "nopdf": + status = "pending" synced = dict(row) synced["has_pdf"] = bool(target.get("has_pdf")) synced["pdf_path"] = target.get("pdf_path", "") @@ -210,6 +212,9 @@ def sync_ocr_queue(paths: 
dict[str, Path], target_rows: list[dict]) -> list[dict status = str(meta.get("ocr_status", "pending") or "pending").strip().lower() if status in {"done", "blocked"}: continue + if status == "nopdf": + status = "pending" + continue synced_queue.append( { "zotero_key": key, @@ -1571,13 +1576,19 @@ def run_ocr(vault: Path, verbose: bool = False, no_progress: bool = False) -> in for row in target_rows: key = row["zotero_key"] meta = ensure_ocr_meta(vault, row) - if str(meta.get("ocr_status", "") or "").strip().lower() == "error": + current = str(meta.get("ocr_status", "") or "").strip().lower() + if current == "error": meta["ocr_status"] = "pending" meta["ocr_job_id"] = "" meta["ocr_started_at"] = "" meta["ocr_finished_at"] = "" meta["retry_count"] = 0 write_json(paths["ocr"] / key / "meta.json", meta) + elif current == "nopdf": + meta["ocr_status"] = "pending" + meta["error"] = "" + meta["retry_count"] = 0 + write_json(paths["ocr"] / key / "meta.json", meta) status, _error = validate_ocr_meta(paths, meta) if status == "done_incomplete": meta["ocr_status"] = "pending" From f248f81d871ffffada7a7555dbb1b47cc892c311 Mon Sep 17 00:00:00 2001 From: Research Assistant <research@example.com> Date: Thu, 14 May 2026 21:44:05 +0800 Subject: [PATCH 122/132] refactor: unify reading-log reads to JSONL, deprecate paper_events writes --- paperforge/commands/reading_log.py | 128 ++++++++++++++++++++--------- paperforge/memory/events.py | 6 +- 2 files changed, 92 insertions(+), 42 deletions(-) diff --git a/paperforge/commands/reading_log.py b/paperforge/commands/reading_log.py index 37c5ff5..8bb89de 100644 --- a/paperforge/commands/reading_log.py +++ b/paperforge/commands/reading_log.py @@ -11,9 +11,10 @@ from paperforge.core.errors import ErrorCode from paperforge.core.result import PFError, PFResult from paperforge.memory.db import get_connection, get_memory_db_path -from paperforge.memory.events import export_reading_log, write_correction_note, write_reading_note +from 
paperforge.memory.events import write_correction_note, write_reading_note from paperforge.memory.permanent import ( append_reading_note, + get_reading_notes_for_paper, read_all_reading_notes, ) @@ -179,41 +180,92 @@ def import_reading_log(vault: Path, filepath: Path) -> dict: def lookup_paper_events(vault: Path, key: str) -> dict: - """Query paper_events for all reading_note events for a paper, joined with papers table.""" + """Look up all reading notes for a paper from JSONL.""" + notes = get_reading_notes_for_paper(vault, key) + notes.sort(key=lambda n: n.get("created_at", ""), reverse=True) + + title = "" db_path = get_memory_db_path(vault) - if not db_path.exists(): - return {"ok": False, "zotero_key": key, "title": "", "entries": [], "count": 0} - - conn = get_connection(db_path, read_only=True) - try: - rows = conn.execute( - """SELECT e.created_at, e.payload_json, p.title, p.citation_key, p.year - FROM paper_events e JOIN papers p ON p.zotero_key = e.paper_id - WHERE e.paper_id = ? 
AND e.event_type = 'reading_note' - ORDER BY e.created_at DESC""", (key,), - ).fetchall() - - title = rows[0]["title"] if rows else "" - entries = [] - for row in rows: - payload = json.loads(row["payload_json"]) - entries.append({ - "created_at": row["created_at"], - "section": payload.get("section", ""), - "excerpt": payload.get("excerpt", ""), - "usage": payload.get("usage", ""), - "note": payload.get("note", ""), - }) + if db_path.exists(): + conn = get_connection(db_path, read_only=True) + try: + row = conn.execute( + "SELECT title FROM papers WHERE zotero_key = ?", (key,), + ).fetchone() + if row: + title = row["title"] or "" + finally: + conn.close() + + entries = [] + for n in notes: + entries.append({ + "created_at": n.get("created_at", ""), + "section": n.get("section", ""), + "excerpt": n.get("excerpt", ""), + "usage": n.get("usage", ""), + "note": n.get("note", ""), + }) + + return { + "ok": True, + "zotero_key": key, + "title": title, + "entries": entries, + "count": len(entries), + } + - return { - "ok": True, - "zotero_key": key, - "title": title, - "entries": entries, - "count": len(entries), - } - finally: - conn.close() +def _export_from_jsonl(vault: Path, since: str = "", limit: int = 50) -> list[dict]: + """Export reading notes from JSONL, enriched with paper metadata from DB.""" + all_notes = read_all_reading_notes(vault) + all_notes.sort(key=lambda n: n.get("created_at", ""), reverse=True) + + if since: + all_notes = [n for n in all_notes if n.get("created_at", "") >= since] + all_notes = all_notes[:limit] + + db_path = get_memory_db_path(vault) + paper_meta: dict[str, dict] = {} + if db_path.exists(): + paper_ids = list(set(n.get("paper_id", "") for n in all_notes if n.get("paper_id"))) + if paper_ids: + conn = get_connection(db_path, read_only=True) + try: + placeholders = ",".join("?" 
* len(paper_ids)) + rows = conn.execute( + f"SELECT zotero_key, citation_key, title, year, first_author " + f"FROM papers WHERE zotero_key IN ({placeholders})", + paper_ids, + ).fetchall() + for row in rows: + paper_meta[row["zotero_key"]] = { + "citation_key": row["citation_key"], + "title": row["title"], + "year": row["year"], + "first_author": row["first_author"], + } + finally: + conn.close() + + results = [] + for n in all_notes: + pid = n.get("paper_id", "") + meta = paper_meta.get(pid, {}) + results.append({ + "created_at": n.get("created_at", ""), + "paper_id": pid, + "citation_key": meta.get("citation_key", pid), + "title": meta.get("title", ""), + "year": meta.get("year", ""), + "first_author": meta.get("first_author", ""), + "section": n.get("section", ""), + "excerpt": n.get("excerpt", ""), + "usage": n.get("usage", ""), + "note": n.get("note", ""), + }) + + return results def _render_reading_log_md(vault: Path, project: str = "") -> None: @@ -395,12 +447,6 @@ def run(args: argparse.Namespace) -> int: args.note or "", args.project or "", tags_list, ) - _db_ok = write_reading_note( - vault, args.paper_id, args.section or "", - args.excerpt, args.usage or "", args.note or "", - args.context or "", args.project or "", tags_list, - ) - ok = jsonl_result.get("ok", False) result = PFResult( ok=ok, @@ -423,7 +469,7 @@ def run(args: argparse.Namespace) -> int: return 0 if ok else 1 - notes = export_reading_log(vault, since=args.since or "", limit=args.limit or 50) + notes = _export_from_jsonl(vault, since=args.since or "", limit=args.limit or 50) result = PFResult( ok=True, command="reading-log", diff --git a/paperforge/memory/events.py b/paperforge/memory/events.py index c8cbae3..9c74a3a 100644 --- a/paperforge/memory/events.py +++ b/paperforge/memory/events.py @@ -10,7 +10,11 @@ def write_reading_note(vault: Path, paper_id: str, section: str, excerpt: str, usage: str = "", note: str = "", context: str = "", project: str = "", tags: list[str] | None = None) -> 
bool: - """Record a reading note in paper_events.""" + """DEPRECATED: Use append_reading_note() in paperforge.memory.permanent instead. + + Reading notes now live in reading-log.jsonl as the source of truth. + This function is kept for backward compatibility only (import_reading_log). + """ db_path = get_memory_db_path(vault) if not db_path.exists(): return False From 9b0bc59589b44825173018b71cbc1c553a2aaf6c Mon Sep 17 00:00:00 2001 From: Research Assistant <research@example.com> Date: Thu, 14 May 2026 21:47:43 +0800 Subject: [PATCH 123/132] fix: bootstrap vault_config nest, python fallback with verified flag --- .../skills/paperforge/scripts/pf_bootstrap.py | 42 ++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/paperforge/skills/paperforge/scripts/pf_bootstrap.py b/paperforge/skills/paperforge/scripts/pf_bootstrap.py index 741a677..101b35a 100644 --- a/paperforge/skills/paperforge/scripts/pf_bootstrap.py +++ b/paperforge/skills/paperforge/scripts/pf_bootstrap.py @@ -82,6 +82,20 @@ def _find_python_with_paperforge(vault: Path, pf_cfg: dict) -> str | None: return str(candidate) except Exception: continue + + # Fallback: try system python + for fallback in ["python", "python3"]: + try: + result = subprocess.run( + [fallback, "--version"], + capture_output=True, text=True, timeout=10, + encoding="utf-8", errors="replace", + ) + if result.returncode == 0: + return fallback + except Exception: + continue + return None @@ -118,6 +132,25 @@ def _scan_methodology_archive(pf_root: Path) -> list[dict]: return methods +DEFAULTS = { + "system_dir": "System", + "resources_dir": "Resources", + "literature_dir": "Literature", + "control_dir": "LiteratureControl", + "base_dir": "Bases", +} + + +def resolve_cfg(raw: dict) -> dict: + """Resolve config with vault_config nested support and legacy flat keys.""" + cfg = DEFAULTS.copy() + nested = raw.get("vault_config", {}) + if isinstance(nested, dict): + cfg.update({k: v for k, v in nested.items() if v}) 
+ cfg.update({k: raw[k] for k in DEFAULTS if raw.get(k)}) + return cfg + + def main(): import argparse p = argparse.ArgumentParser(description="PaperForge bootstrap") @@ -152,6 +185,7 @@ def main(): json.dump(result, sys.stdout, ensure_ascii=False) sys.exit(0) + cfg = resolve_cfg(cfg) system_dir = cfg.get("system_dir", "System") resources_dir = cfg.get("resources_dir", "Resources") literature_dir = cfg.get("literature_dir", "Literature") @@ -191,7 +225,13 @@ def main(): result["index_summary"] = index_summary # --- 6. Find Python that has paperforge (best effort) --- - result["python_candidate"] = _find_python_with_paperforge(vault, cfg) + py_candidate = _find_python_with_paperforge(vault, cfg) + if py_candidate: + result["python_candidate"] = py_candidate + result["python_verified"] = True + else: + result["python_candidate"] = "python" + result["python_verified"] = False # --- 7. Memory layer state --- memory_layer = {"available": False, "paper_count": 0, "fts_search": False, "vector_search": False} From 054ca3e1b4c740e0f439f7651337685c76332d9f Mon Sep 17 00:00:00 2001 From: Research Assistant <research@example.com> Date: Thu, 14 May 2026 21:48:21 +0800 Subject: [PATCH 124/132] feat: permanent correction-log.jsonl, fix original_id field alignment --- paperforge/commands/paper_context.py | 22 +++++++++- paperforge/commands/reading_log.py | 20 ++++++++- paperforge/memory/builder.py | 24 +++++++++++ paperforge/memory/permanent.py | 62 ++++++++++++++++++++++++++++ 4 files changed, 124 insertions(+), 4 deletions(-) diff --git a/paperforge/commands/paper_context.py b/paperforge/commands/paper_context.py index ff70566..193d82c 100644 --- a/paperforge/commands/paper_context.py +++ b/paperforge/commands/paper_context.py @@ -8,7 +8,7 @@ from paperforge.core.errors import ErrorCode from paperforge.core.result import PFError, PFResult from paperforge.memory.db import get_connection, get_memory_db_path -from paperforge.memory.permanent import get_reading_notes_for_paper +from 
paperforge.memory.permanent import get_corrections_for_paper, get_reading_notes_for_paper def _build_paper_context(vault, key: str) -> dict | None: @@ -44,14 +44,32 @@ def _build_paper_context(vault, key: str) -> dict | None: ORDER BY created_at DESC""", (key,), ).fetchall() + seen_ids: set[str] = set() for cr in corr_rows: payload = json.loads(cr["payload_json"]) + orig_id = payload.get("original_id", "") corrections.append({ "created_at": cr["created_at"], - "previous_note_id": payload.get("ref_id", ""), + "previous_note_id": orig_id, "correction": payload.get("correction", ""), "reason": payload.get("reason", ""), }) + if orig_id: + seen_ids.add(orig_id) + + jsonl_corrections = get_corrections_for_paper(vault, key) + for c in jsonl_corrections: + cid = c.get("original_id", "") + if cid and cid in seen_ids: + continue + corrections.append({ + "created_at": c.get("created_at", ""), + "previous_note_id": cid, + "correction": c.get("correction", ""), + "reason": c.get("reason", ""), + }) + if cid: + seen_ids.add(cid) recheck_targets = [] for n in prior_notes: diff --git a/paperforge/commands/reading_log.py b/paperforge/commands/reading_log.py index 8bb89de..f85b113 100644 --- a/paperforge/commands/reading_log.py +++ b/paperforge/commands/reading_log.py @@ -13,6 +13,7 @@ from paperforge.memory.db import get_connection, get_memory_db_path from paperforge.memory.events import write_correction_note, write_reading_note from paperforge.memory.permanent import ( + append_correction, append_reading_note, get_reading_notes_for_paper, read_all_reading_notes, @@ -426,16 +427,31 @@ def run(args: argparse.Namespace) -> int: vault, paper_id, args.correct_id, args.correction, args.reason or "", ) + + jsonl_result = append_correction( + vault, paper_id, args.correct_id, + args.correction, args.reason or "", + ) + result = PFResult( ok=ok, command="reading-log", version=PF_VERSION, - data={"written": ok}, + data={"written": ok, "jsonl_id": jsonl_result.get("id"), + "jsonl_path": 
jsonl_result.get("path")}, error=PFError(code=ErrorCode.INTERNAL_ERROR, message="Failed to write correction") if not ok else None, ) if args.json: print(result.to_json()) else: - print(f"Correction written for {args.correct_id}." if ok else "Failed.") + written_parts = [] + if ok: + written_parts.append("paper_events") + if jsonl_result.get("ok"): + written_parts.append(f"correction-log.jsonl ({jsonl_result.get('id')})") + if written_parts: + print(f"Correction written for {args.correct_id}: {', '.join(written_parts)}.") + else: + print("Failed.") return 0 if ok else 1 if args.paper_id and args.excerpt: diff --git a/paperforge/memory/builder.py b/paperforge/memory/builder.py index c34d605..da37d1c 100644 --- a/paperforge/memory/builder.py +++ b/paperforge/memory/builder.py @@ -107,6 +107,26 @@ def _import_project_log(conn, vault: Path) -> int: return count +def _import_correction_log(conn, vault: Path) -> int: + """Import correction-log.jsonl into paper_events for FTS search. Returns count.""" + from paperforge.memory.permanent import read_all_corrections + + corrections = read_all_corrections(vault) + count = 0 + for c in corrections: + payload = { + "original_id": c.get("original_id", ""), + "correction": c.get("correction", ""), + "reason": c.get("reason", ""), + } + conn.execute( + "INSERT INTO paper_events (paper_id, event_type, payload_json) VALUES (?, 'correction_note', ?)", + (c["paper_id"], json.dumps(payload, ensure_ascii=False)), + ) + count += 1 + return count + + def build_from_index(vault: Path) -> dict: """Read formal-library.json and build/rebuild paperforge.db. 
@@ -210,6 +230,9 @@ def build_from_index(vault: Path) -> dict: project_count = _import_project_log(conn, vault) logger.info("Imported %d project log entries from JSONL", project_count) + correction_count = _import_correction_log(conn, vault) + logger.info("Imported %d corrections from JSONL", correction_count) + conn.execute( "DELETE FROM paper_events WHERE event_type != 'correction_note';" ) @@ -237,6 +260,7 @@ def build_from_index(vault: Path) -> dict: "aliases_indexed": len(alias_rows), "reading_notes_imported": reading_count, "project_entries_imported": project_count, + "corrections_imported": correction_count, "schema_version": str(CURRENT_SCHEMA_VERSION), } except Exception: diff --git a/paperforge/memory/permanent.py b/paperforge/memory/permanent.py index 1c799ed..cf033fc 100644 --- a/paperforge/memory/permanent.py +++ b/paperforge/memory/permanent.py @@ -153,3 +153,65 @@ def read_all_project_entries(vault: Path) -> list[dict]: def get_project_entries(vault: Path, project: str) -> list[dict]: all_entries = read_all_project_entries(vault) return [e for e in all_entries if e.get("project") == project] + + +# ── Correction Log ────────────────────────────────────────────────────────── + + +def get_correction_log_path(vault: Path) -> Path: + return _logs_dir(vault) / "correction-log.jsonl" + + +def append_correction( + vault: Path, + paper_id: str, + original_id: str, + correction: str, + reason: str = "", + agent: str = "", +) -> dict: + """Append a correction record to correction-log.jsonl.""" + if not paper_id: + return {"ok": False, "error": "paper_id is required"} + if not original_id: + return {"ok": False, "error": "original_id is required"} + if not correction: + return {"ok": False, "error": "correction is required"} + + date_str = datetime.date.today().strftime("%Y%m%d") + entry_id = f"corr_{date_str}_{secrets.token_hex(4)}" + now = datetime.datetime.now(datetime.timezone.utc).isoformat() + + entry: dict[str, object] = { + "id": entry_id, + 
"event_type": "correction", + "created_at": now, + "paper_id": paper_id, + "original_id": original_id, + "correction": correction, + "reason": reason, + "agent": agent, + } + + log_dir = _ensure_logs_dir(vault) + filepath = log_dir / "correction-log.jsonl" + + try: + with filepath.open("a", encoding="utf-8") as f: + f.write(json.dumps(entry, ensure_ascii=False) + "\n") + except OSError as e: + return {"ok": False, "error": str(e)} + + return {"ok": True, "id": entry_id, "path": str(filepath)} + + +def read_all_corrections(vault: Path) -> list[dict]: + """Read all correction entries from correction-log.jsonl.""" + filepath = get_correction_log_path(vault) + return _read_jsonl(filepath) + + +def get_corrections_for_paper(vault: Path, paper_id: str) -> list[dict]: + """Get all corrections for a specific paper.""" + all_corrections = read_all_corrections(vault) + return [c for c in all_corrections if c.get("paper_id") == paper_id] From 002173f768b953aadf1c2a2d0ce47f023c40fd31 Mon Sep 17 00:00:00 2001 From: Research Assistant <research@example.com> Date: Thu, 14 May 2026 21:50:29 +0800 Subject: [PATCH 125/132] feat: safe FTS search with token-quote and LIKE fallback --- paperforge/memory/fts.py | 91 +++++++++++++++++++++++++++++----------- 1 file changed, 67 insertions(+), 24 deletions(-) diff --git a/paperforge/memory/fts.py b/paperforge/memory/fts.py index 7fa322e..4a760b0 100644 --- a/paperforge/memory/fts.py +++ b/paperforge/memory/fts.py @@ -1,43 +1,69 @@ from __future__ import annotations +import re import sqlite3 +def tokenize_for_fts(q: str) -> str: + """Extract alphanumeric + CJK tokens and quote for safe FTS.""" + tokens = re.findall(r"[\w\u4e00-\u9fff]+", q) + if not tokens: + return q + return " OR ".join(f'"{t}"' for t in tokens) + + def search_papers(conn: sqlite3.Connection, query: str, limit: int = 20, domain: str = "", year_from: int = 0, year_to: int = 0, ocr_status: str = "", deep_status: str = "", lifecycle: str = "", next_step: str = "") -> 
list[dict]: - """Full-text search across papers with optional filters. + """Full-text search with safe fallback for special characters.""" - Uses FTS5 for relevance-ranked results with optional column filters. - """ - conditions = ["paper_fts MATCH ?"] - params: list = [query] + filter_conditions = [] + filter_params = [] if domain: - conditions.append("p.domain = ?") - params.append(domain) + filter_conditions.append("p.domain = ?") + filter_params.append(domain) if year_from: - conditions.append("CAST(p.year AS INTEGER) >= ?") - params.append(year_from) + filter_conditions.append("CAST(p.year AS INTEGER) >= ?") + filter_params.append(year_from) if year_to: - conditions.append("CAST(p.year AS INTEGER) <= ?") - params.append(year_to) + filter_conditions.append("CAST(p.year AS INTEGER) <= ?") + filter_params.append(year_to) if ocr_status: - conditions.append("p.ocr_status = ?") - params.append(ocr_status) + filter_conditions.append("p.ocr_status = ?") + filter_params.append(ocr_status) if deep_status: - conditions.append("p.deep_reading_status = ?") - params.append(deep_status) + filter_conditions.append("p.deep_reading_status = ?") + filter_params.append(deep_status) if lifecycle: - conditions.append("p.lifecycle = ?") - params.append(lifecycle) + filter_conditions.append("p.lifecycle = ?") + filter_params.append(lifecycle) if next_step: - conditions.append("p.next_step = ?") - params.append(next_step) + filter_conditions.append("p.next_step = ?") + filter_params.append(next_step) + + filter_clause = (" AND " + " AND ".join(filter_conditions)) if filter_conditions else "" + + # Level 1: Raw FTS + try: + return _fts_query(conn, query, filter_clause, filter_params, limit) + except sqlite3.OperationalError: + pass - where = " AND ".join(conditions) - # Content-sync FTS: query the FTS table directly, columns come from papers + # Level 2: Quoted token FTS + token_query = tokenize_for_fts(query) + if token_query != query: + try: + return _fts_query(conn, token_query, 
filter_clause, filter_params, limit) + except sqlite3.OperationalError: + pass + + # Level 3: LIKE fallback + return _like_query(conn, query, filter_clause, filter_params, limit) + + +def _fts_query(conn, query, filter_clause, filter_params, limit): sql = f""" SELECT p.zotero_key, p.citation_key, p.title, p.year, p.doi, p.first_author, p.journal, p.domain, p.lifecycle, @@ -46,11 +72,28 @@ def search_papers(conn: sqlite3.Connection, query: str, limit: int = 20, rank FROM paper_fts f JOIN papers p ON p.rowid = f.rowid - WHERE {where} + WHERE paper_fts MATCH ?{filter_clause} ORDER BY rank LIMIT ? """ - params.append(limit) conn.row_factory = sqlite3.Row - rows = conn.execute(sql, params).fetchall() + rows = conn.execute(sql, [query] + filter_params + [limit]).fetchall() + return [dict(r) for r in rows] + + +def _like_query(conn, query, filter_clause, filter_params, limit): + like_param = f"%{query}%" + sql = f""" + SELECT p.zotero_key, p.citation_key, p.title, p.year, p.doi, + p.first_author, p.journal, p.domain, p.lifecycle, + p.ocr_status, p.deep_reading_status, p.next_step, + substr(p.abstract, 1, 300) as abstract, + 0 as rank + FROM papers p + WHERE (p.title LIKE ? OR p.abstract LIKE ? OR p.doi LIKE ? OR p.citation_key LIKE ?){filter_clause} + ORDER BY p.year DESC + LIMIT ? 
+ """ + conn.row_factory = sqlite3.Row + rows = conn.execute(sql, [like_param, like_param, like_param, like_param] + filter_params + [limit]).fetchall() return [dict(r) for r in rows] From 2c71643e41c2ec7f259cfdf534b2736857036529 Mon Sep 17 00:00:00 2001 From: Research Assistant <research@example.com> Date: Thu, 14 May 2026 21:56:28 +0800 Subject: [PATCH 126/132] fix: remove ghost settings code, add vector deps, add embed preflight --- paperforge/commands/embed.py | 29 ++++++++++++++++++ paperforge/plugin/main.js | 19 ++---------- paperforge/worker/vector_db.py | 54 ++++++++++++++++++++++++++++++++++ pyproject.toml | 5 ++++ 4 files changed, 90 insertions(+), 17 deletions(-) create mode 100644 paperforge/worker/vector_db.py diff --git a/paperforge/commands/embed.py b/paperforge/commands/embed.py index d90c4b2..c114550 100644 --- a/paperforge/commands/embed.py +++ b/paperforge/commands/embed.py @@ -14,6 +14,7 @@ get_vector_db_path, ) from paperforge.worker.asset_index import read_index +from paperforge.worker.vector_db import _preflight_check from paperforge import __version__ as PF_VERSION @@ -32,6 +33,34 @@ def run(args: argparse.Namespace) -> int: return 0 # Build + + # Read plugin settings for preflight + settings: dict = {} + dc_json = vault / ".obsidian" / "plugins" / "paperforge" / "data.json" + if dc_json.exists(): + try: + import json + + settings = json.loads(dc_json.read_text(encoding="utf-8")) + except Exception: + pass + + preflight = _preflight_check(vault, settings) + if not preflight["ok"]: + result = PFResult( + ok=False, + command="embed-build", + version=PF_VERSION, + error=PFError(code=ErrorCode.VALIDATION_ERROR, message=preflight["error"]), + data={"fix": preflight.get("fix", "")}, + ) + if args.json: + print(result.to_json()) + else: + print(f"Error: {preflight['error']}", file=sys.stderr) + print(f"Fix: {preflight['fix']}", file=sys.stderr) + return 1 + envelope = read_index(vault) if not envelope: result = PFResult(ok=False, command="embed 
build", version=PF_VERSION, diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 9651219..0c20aa8 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -197,24 +197,9 @@ function runSubprocess(pythonExe, args, cwd, timeout, _spawn, env) { stderr: stderrChunks.join("") + "\n" + err.message, exitCode: -1, elapsed: Date.now() - startTime }); }); - }); - new Setting(containerEl) - .setName('API Model') - .setDesc('Which OpenAI-compatible embedding model to use.') - new Setting(containerEl) - .setName('API Model') - .setDesc('Embedding model name (e.g., text-embedding-3-small, qwen-3-embedding)') - .addText(text => { - text.setPlaceholder('text-embedding-3-small') - .setValue(this.plugin.settings.vector_db_api_model || 'text-embedding-3-small') - .onChange(value => { - this.plugin.settings.vector_db_api_model = value; - this.plugin.saveSettings(); - }); - }); - } - + }); +} // ── Cross-platform Python and BBT detection (macOS/Linux) ── diff --git a/paperforge/worker/vector_db.py b/paperforge/worker/vector_db.py new file mode 100644 index 0000000..35af9ae --- /dev/null +++ b/paperforge/worker/vector_db.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +import os + + +def _preflight_check(vault, settings: dict) -> dict: + """Check prerequisites for embed build. Returns {ok: bool, error: str, fix: str}.""" + from pathlib import Path + + from paperforge.worker._utils import pipeline_paths + + # 1. chromadb + try: + import chromadb # noqa: F401 + except ImportError: + return {"ok": False, "error": "chromadb is not installed", "fix": 'Run: pip install "paperforge[vector]"'} + + # 2. 
Mode-specific deps + mode = settings.get("vector_db_mode", "local") + if mode == "local": + try: + import sentence_transformers # noqa: F401 + except ImportError: + return { + "ok": False, + "error": "sentence-transformers not installed", + "fix": 'Run: pip install "paperforge[vector]" or switch to API mode', + } + elif mode == "api": + try: + import openai # noqa: F401 + except ImportError: + return { + "ok": False, + "error": "openai not installed", + "fix": 'Run: pip install "paperforge[vector]" or switch to local mode', + } + api_key = settings.get("vector_db_api_key") or os.environ.get("OPENAI_API_KEY") or os.environ.get("VECTOR_DB_API_KEY") + if not api_key: + return {"ok": False, "error": "API key not configured", "fix": "Set API Key in plugin settings or OPENAI_API_KEY in .env"} + + # 3. OCR done papers + paths = pipeline_paths(vault) + idx_path = paths.get("indexes", Path("")) / "formal-library.json" if paths.get("indexes") else None + if idx_path and idx_path.exists(): + import json + + data = json.loads(idx_path.read_text(encoding="utf-8")) + items = data.get("items", []) if isinstance(data, dict) else data + done = sum(1 for i in (items or []) if i.get("ocr_status") == "done") + if done == 0: + return {"ok": False, "error": "No papers with OCR completed", "fix": "Run paperforge ocr first"} + + return {"ok": True} diff --git a/pyproject.toml b/pyproject.toml index 9d84b07..2862d37 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,11 @@ test = [ "coverage>=7.4.0", "ruff>=0.4.0", ] +vector = [ + "chromadb>=0.5.0", + "sentence-transformers>=3.0.0", + "openai>=1.0.0", +] [project.scripts] paperforge = "paperforge.cli:main" From 62cbb6110b78e6e55658a325db14ac01c62969c0 Mon Sep 17 00:00:00 2001 From: Research Assistant <research@example.com> Date: Thu, 14 May 2026 21:59:25 +0800 Subject: [PATCH 127/132] feat: retrieve empty index guard, embed status check, HF download hint --- paperforge/commands/retrieve.py | 20 ++++++++++++++ 
paperforge/plugin/main.js | 8 ++++++ paperforge/skills/paperforge/SKILL.md | 7 ++++- .../paperforge/workflows/deep-reading.md | 3 +++ .../paperforge/workflows/methodology.md | 3 +++ .../skills/paperforge/workflows/paper-qa.md | 3 +++ .../workflows/project-engineering.md | 13 +++++++++ .../paperforge/workflows/project-log.md | 3 +++ paperforge/worker/vector_db.py | 27 +++++++++++++++++++ 9 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 paperforge/skills/paperforge/workflows/project-engineering.md diff --git a/paperforge/commands/retrieve.py b/paperforge/commands/retrieve.py index a457db1..7d4bb48 100644 --- a/paperforge/commands/retrieve.py +++ b/paperforge/commands/retrieve.py @@ -16,6 +16,26 @@ def run(args: argparse.Namespace) -> int: query = args.query limit = args.limit or 5 + # Check if vector index exists + from paperforge.worker.vector_db import get_embed_status + status = get_embed_status(vault) + if status.get("chunk_count", 0) == 0: + result = PFResult( + ok=False, + command="retrieve", + version=PF_VERSION, + error=PFError( + code=ErrorCode.PATH_NOT_FOUND, + message="Vector index is empty. Run paperforge embed build first.", + ), + data={"next_action": "paperforge embed build"}, + ) + if args.json: + print(result.to_json()) + else: + print(f"Error: {result.error.message}", file=sys.stderr) + return 1 + try: chunks = retrieve_chunks(vault, query, limit=limit, expand=args.expand) except Exception as e: diff --git a/paperforge/plugin/main.js b/paperforge/plugin/main.js index 0c20aa8..c36fc4c 100644 --- a/paperforge/plugin/main.js +++ b/paperforge/plugin/main.js @@ -3251,6 +3251,14 @@ class PaperForgeSettingTab extends PluginSettingTab { this.display(); }); }); + + // INFO: HF download notice for local mode + const infoDiv = containerEl.createDiv({ cls: 'setting-item-description' }); + infoDiv.createEl('p', { + text: 'Local mode downloads models from Hugging Face on first use. ' + + 'If inaccessible, set an HF Endpoint above (e.g. 
https://hf-mirror.com) or switch to API mode.', + cls: 'paperforge-settings-desc', + }); } // API config (api mode) diff --git a/paperforge/skills/paperforge/SKILL.md b/paperforge/skills/paperforge/SKILL.md index 2e42cd9..0b55005 100644 --- a/paperforge/skills/paperforge/SKILL.md +++ b/paperforge/skills/paperforge/SKILL.md @@ -34,6 +34,9 @@ python $SKILL_DIR/scripts/pf_bootstrap.py --vault "$VAULT" 如果 `ok: false`,报告 `error` 给用户,**停止。禁止自己拼路径。** +如果 `python_verified` 为 `false` 或 `python_candidate` 为 `null`: +依次尝试 `python` 再 `python3`。全部失败则停止,提示用户在 `paperforge.json` 中设置 `python_path`。 + --- ## 2. Agent Context — bootstrap 成功后执行 @@ -78,6 +81,7 @@ Reading-log 不是事实源。它记录的是**之前的关注点、解读和预 | "记一下" "记录阅读" "reading log" "读完这段记一下" | `workflows/reading-log.md` | | "总结会话" "工作记录" "项目记录" "project log" "记决策" | `workflows/project-log.md` | | "提取方法论" "总结规律" "存档写作规律" | `workflows/methodology.md` | +| "branch" "代码审查" "feature" "dashboard" "memory layer" "用户反馈" "报错" "安装失败" "Git" "Zotero" "BetterBibTeX" "OCR" "插件" | `workflows/project-engineering.md` | | 不确定 / 空输入 | 问用户:搜文献、精读、问答、记笔记、记工作、提方法论? | 路由后如用户切换意图,重新判断并打开对应 workflow。 @@ -103,7 +107,8 @@ paperforge/ │ ├── paper-qa.md │ ├── reading-log.md │ ├── project-log.md -│ └── methodology.md +│ ├── methodology.md +│ └── project-engineering.md ├── references/ ← 共享参考 │ ├── chart-reading/ ← 19 种图表阅读指南 │ └── method-card-template.md diff --git a/paperforge/skills/paperforge/workflows/deep-reading.md b/paperforge/skills/paperforge/workflows/deep-reading.md index 6d1852e..889359f 100644 --- a/paperforge/skills/paperforge/workflows/deep-reading.md +++ b/paperforge/skills/paperforge/workflows/deep-reading.md @@ -1,5 +1,8 @@ # deep-reading +> **Safety Rule:** Prior reading-log entries are recheck targets only, never factual answers. +> Always verify against original source before using any reading-log content. 
+ Keshav 三阶段精读。在 formal note 中写入结构化的 `## 精读` 区域。 --- diff --git a/paperforge/skills/paperforge/workflows/methodology.md b/paperforge/skills/paperforge/workflows/methodology.md index f7a1fb9..f196817 100644 --- a/paperforge/skills/paperforge/workflows/methodology.md +++ b/paperforge/skills/paperforge/workflows/methodology.md @@ -1,5 +1,8 @@ # methodology +> **Scope:** Only archive methods reusable across multiple projects/tasks. +> Session-specific progress, decisions, and todos go to project-log. + 从 project-log 中提取可复用方法论,按 method-card 模板写入 methodology archive。 不 append 到大文件,每张卡片独立保存。 diff --git a/paperforge/skills/paperforge/workflows/paper-qa.md b/paperforge/skills/paperforge/workflows/paper-qa.md index 6785d77..4175efa 100644 --- a/paperforge/skills/paperforge/workflows/paper-qa.md +++ b/paperforge/skills/paperforge/workflows/paper-qa.md @@ -1,5 +1,8 @@ # paper-qa +> **Safety Rule:** Prior reading-log entries are recheck targets only, never factual answers. +> Always verify against original source before using any reading-log content. + 交互式文献问答。不强制要求 OCR,但 OCR 完成后回答更准确。 每次问答记录到 `discussion.json`(Dashboard 可见)。 diff --git a/paperforge/skills/paperforge/workflows/project-engineering.md b/paperforge/skills/paperforge/workflows/project-engineering.md new file mode 100644 index 0000000..be62708 --- /dev/null +++ b/paperforge/skills/paperforge/workflows/project-engineering.md @@ -0,0 +1,13 @@ +# Project Engineering + +When user asks about PaperForge codebase issues (branch, code review, feature, +dashboard, memory layer, user feedback, errors, installation, Git, Zotero, +BetterBibTeX, OCR, plugin): + +1. Read `AGENTS.md` and `README.md` for architecture context +2. Use `git log --oneline` and `git diff` to understand recent changes +3. Search codebase with grep/glob as needed +4. Run diagnostics: `python -m paperforge doctor` (if applicable) +5. Present findings and recommend fixes + +Do NOT modify code without user confirmation. 
diff --git a/paperforge/skills/paperforge/workflows/project-log.md b/paperforge/skills/paperforge/workflows/project-log.md index 96c5d63..585a91b 100644 --- a/paperforge/skills/paperforge/workflows/project-log.md +++ b/paperforge/skills/paperforge/workflows/project-log.md @@ -1,5 +1,8 @@ # project-log +> **Scope:** Record what happened this session — decisions, detours, todos. +> For reusable cross-project methods, use methodology workflow instead. + 记录研究项目的会话总结、决策、弯路修正和方法论提取。 Agent 按 JSON schema 写入 project-log.jsonl。 系统自动渲染对应项目的 project-log.md。 diff --git a/paperforge/worker/vector_db.py b/paperforge/worker/vector_db.py index 35af9ae..6a746ff 100644 --- a/paperforge/worker/vector_db.py +++ b/paperforge/worker/vector_db.py @@ -52,3 +52,30 @@ def _preflight_check(vault, settings: dict) -> dict: return {"ok": False, "error": "No papers with OCR completed", "fix": "Run paperforge ocr first"} return {"ok": True} + + +def get_embed_status(vault) -> dict: + """Check if vector index exists and has content.""" + from pathlib import Path + from paperforge.config import paperforge_paths + paths = paperforge_paths(vault) + vectors_dir = paths.get("vectors", paths.get("paperforge", Path()) / "vectors") + + status = {"exists": False, "chunk_count": 0, "collection_name": ""} + + if not vectors_dir or not vectors_dir.exists(): + return status + + try: + import chromadb + client = chromadb.PersistentClient(path=str(vectors_dir)) + collections = client.list_collections() + if collections: + col = collections[0] + status["exists"] = True + status["collection_name"] = col.name + status["chunk_count"] = col.count() + except Exception: + pass + + return status From ce86effe4050ef300b9ba62ad42b2c79cfb9b489 Mon Sep 17 00:00:00 2001 From: Research Assistant <research@example.com> Date: Thu, 14 May 2026 22:35:35 +0800 Subject: [PATCH 128/132] fix: dedupe correction import on rebuild, clean FTS row on refresh --- paperforge/memory/builder.py | 1 + paperforge/memory/refresh.py | 10 
++++++++++ 2 files changed, 11 insertions(+) diff --git a/paperforge/memory/builder.py b/paperforge/memory/builder.py index da37d1c..d2b8567 100644 --- a/paperforge/memory/builder.py +++ b/paperforge/memory/builder.py @@ -230,6 +230,7 @@ def build_from_index(vault: Path) -> dict: project_count = _import_project_log(conn, vault) logger.info("Imported %d project log entries from JSONL", project_count) + conn.execute("DELETE FROM paper_events WHERE event_type = 'correction_note';") correction_count = _import_correction_log(conn, vault) logger.info("Imported %d corrections from JSONL", correction_count) diff --git a/paperforge/memory/refresh.py b/paperforge/memory/refresh.py index 724b73a..12204cf 100644 --- a/paperforge/memory/refresh.py +++ b/paperforge/memory/refresh.py @@ -78,6 +78,16 @@ def refresh_paper(vault: Path, entry: dict) -> bool: (zotero_key, raw_str, raw_str.lower().strip(), alias_type), ) + row = conn.execute( + "SELECT rowid FROM papers WHERE zotero_key = ?", + (zotero_key,), + ).fetchone() + if row: + conn.execute( + "INSERT INTO paper_fts(paper_fts, rowid) VALUES ('delete', ?)", + (row["rowid"],), + ) + conn.execute( "INSERT INTO paper_fts(rowid, zotero_key, citation_key, title, first_author, authors_json, abstract, journal, domain, collection_path, collections_json) " "VALUES ((SELECT rowid FROM papers WHERE zotero_key = ?), ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", From 888bbbba02540225b51954023c765374c842b45c Mon Sep 17 00:00:00 2001 From: Research Assistant <research@example.com> Date: Thu, 14 May 2026 22:36:12 +0800 Subject: [PATCH 129/132] fix: pf_bootstrap verified tuple return, engineering review dimensions --- .../skills/paperforge/scripts/pf_bootstrap.py | 16 ++++++++-------- .../paperforge/workflows/project-engineering.md | 13 +++++++++++++ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/paperforge/skills/paperforge/scripts/pf_bootstrap.py b/paperforge/skills/paperforge/scripts/pf_bootstrap.py index 101b35a..a1a99d1 100644 --- 
a/paperforge/skills/paperforge/scripts/pf_bootstrap.py +++ b/paperforge/skills/paperforge/scripts/pf_bootstrap.py @@ -54,8 +54,8 @@ def _read_pf_config(pf_json: Path) -> dict: return json.load(f) -def _find_python_with_paperforge(vault: Path, pf_cfg: dict) -> str | None: - """Find a Python executable that has paperforge installed.""" +def _find_python_with_paperforge(vault: Path, pf_cfg: dict) -> tuple[str | None, bool]: + """Find a Python executable. Returns (candidate, verified_has_paperforge).""" candidates = [] # 1. Explicit python_path in config @@ -79,11 +79,11 @@ def _find_python_with_paperforge(vault: Path, pf_cfg: dict) -> str | None: encoding="utf-8", errors="replace", ) if result.returncode == 0 and "paperforge" in result.stdout.lower(): - return str(candidate) + return (str(candidate), True) except Exception: continue - # Fallback: try system python + # Fallback: check system python/python3 only (no paperforge verification) for fallback in ["python", "python3"]: try: result = subprocess.run( @@ -92,11 +92,11 @@ def _find_python_with_paperforge(vault: Path, pf_cfg: dict) -> str | None: encoding="utf-8", errors="replace", ) if result.returncode == 0: - return fallback + return (fallback, False) except Exception: continue - return None + return (None, False) def _scan_methodology_archive(pf_root: Path) -> list[dict]: @@ -225,10 +225,10 @@ def main(): result["index_summary"] = index_summary # --- 6. 
Find Python that has paperforge (best effort) --- - py_candidate = _find_python_with_paperforge(vault, cfg) + py_candidate, py_verified = _find_python_with_paperforge(vault, cfg) if py_candidate: result["python_candidate"] = py_candidate - result["python_verified"] = True + result["python_verified"] = py_verified else: result["python_candidate"] = "python" result["python_verified"] = False diff --git a/paperforge/skills/paperforge/workflows/project-engineering.md b/paperforge/skills/paperforge/workflows/project-engineering.md index be62708..85a269f 100644 --- a/paperforge/skills/paperforge/workflows/project-engineering.md +++ b/paperforge/skills/paperforge/workflows/project-engineering.md @@ -11,3 +11,16 @@ BetterBibTeX, OCR, plugin): 5. Present findings and recommend fixes Do NOT modify code without user confirmation. + +## Review Dimensions + +When auditing branches, code, or user-reported issues, check: + +1. **Source of truth clarity:** Is data stored in one canonical location? +2. **Derived index rebuildability:** Can SQLite be rebuilt from JSONL? +3. **Agent routing stability:** Will the skill router pick the right workflow? +4. **Obsidian file integrity:** Are .md files still readable with valid frontmatter? +5. **User flow length:** Has the number of manual steps decreased? +6. **Cross-platform safety:** Paths use `/`, Python detection works on Win/Mac/Linux, Git is accessible. +7. **Data loss risk:** Does any operation silently drop records? +8. **Deprecation hygiene:** Are old functions properly wrapped or removed? 
From a76cb21f09b623ce148d4b3b8f3e1f938b5441bc Mon Sep 17 00:00:00 2001 From: Research Assistant <research@example.com> Date: Thu, 14 May 2026 22:37:55 +0800 Subject: [PATCH 130/132] fix: import_reading_log to JSONL, correction JSONL-first, events.py wrapper --- paperforge/commands/reading_log.py | 42 +++++----- paperforge/memory/events.py | 119 ++++++++++++----------------- 2 files changed, 75 insertions(+), 86 deletions(-) diff --git a/paperforge/commands/reading_log.py b/paperforge/commands/reading_log.py index f85b113..c7e0aa8 100644 --- a/paperforge/commands/reading_log.py +++ b/paperforge/commands/reading_log.py @@ -11,7 +11,7 @@ from paperforge.core.errors import ErrorCode from paperforge.core.result import PFError, PFResult from paperforge.memory.db import get_connection, get_memory_db_path -from paperforge.memory.events import write_correction_note, write_reading_note +from paperforge.memory.events import write_correction_note from paperforge.memory.permanent import ( append_correction, append_reading_note, @@ -157,7 +157,7 @@ def validate_reading_log(filepath: Path) -> dict: def import_reading_log(vault: Path, filepath: Path) -> dict: - """Validate and import a reading-log.md into paper_events.""" + """Validate and import a reading-log.md into reading-log.jsonl source of truth.""" parsed = _parse_reading_log(filepath) if not parsed["ok"]: return {"ok": False, "errors": parsed["errors"], "papers_imported": 0, "entries_imported": 0} @@ -170,9 +170,13 @@ def import_reading_log(vault: Path, filepath: Path) -> dict: info = section.get("info", "") use = section.get("use", "") if info and use: - write_reading_note( - vault, paper["paper_key"], section["section_name"], - info, use, section.get("note", "") or "", + append_reading_note( + vault, + paper["paper_key"], + section["section_name"], + excerpt=info, + usage=use, + note=section.get("note", "") or "", ) entries_imported += 1 papers_set.add(paper["paper_key"]) @@ -423,33 +427,37 @@ def run(args: 
argparse.Namespace) -> int: print(f"Error: Original entry {args.correct_id} not found in reading-log.jsonl") return 1 - ok = write_correction_note( + # Write to JSONL (source of truth) + jsonl_result = append_correction( vault, paper_id, args.correct_id, args.correction, args.reason or "", ) - jsonl_result = append_correction( + # Also write to paper_events for FTS (best effort) + db_ok = write_correction_note( vault, paper_id, args.correct_id, args.correction, args.reason or "", ) + ok = bool(jsonl_result.get("ok")) result = PFResult( - ok=ok, command="reading-log", version=PF_VERSION, - data={"written": ok, "jsonl_id": jsonl_result.get("id"), - "jsonl_path": jsonl_result.get("path")}, + ok=ok, + command="reading-log", + version=PF_VERSION, + data={ + "written": ok, + "jsonl_id": jsonl_result.get("id", ""), + "db_indexed": db_ok, + }, error=PFError(code=ErrorCode.INTERNAL_ERROR, - message="Failed to write correction") if not ok else None, + message="Correction write failed") if not ok else None, ) if args.json: print(result.to_json()) else: - written_parts = [] if ok: - written_parts.append("paper_events") - if jsonl_result.get("ok"): - written_parts.append(f"correction-log.jsonl ({jsonl_result.get('id')})") - if written_parts: - print(f"Correction written for {args.correct_id}: {', '.join(written_parts)}.") + print(f"Correction written ({jsonl_result.get('id', '')})." + f"{' DB indexed.' if db_ok else ''}") else: print("Failed.") return 0 if ok else 1 diff --git a/paperforge/memory/events.py b/paperforge/memory/events.py index 9c74a3a..9ef3d75 100644 --- a/paperforge/memory/events.py +++ b/paperforge/memory/events.py @@ -10,81 +10,62 @@ def write_reading_note(vault: Path, paper_id: str, section: str, excerpt: str, usage: str = "", note: str = "", context: str = "", project: str = "", tags: list[str] | None = None) -> bool: - """DEPRECATED: Use append_reading_note() in paperforge.memory.permanent instead. + """DEPRECATED: Wraps append_reading_note(). 
Use permanent.py directly. - Reading notes now live in reading-log.jsonl as the source of truth. - This function is kept for backward compatibility only (import_reading_log). + Kept for backward compatibility. Does NOT write to paper_events anymore. """ - db_path = get_memory_db_path(vault) - if not db_path.exists(): - return False - - payload = { - "section": section, - "excerpt": excerpt, - "usage": usage, - "note": note, - "context": context, - "project": project, - "tags": tags or [], - } - conn = get_connection(db_path, read_only=False) - try: - conn.execute( - """INSERT INTO paper_events (paper_id, event_type, payload_json) - VALUES (?, 'reading_note', ?)""", - (paper_id, json.dumps(payload, ensure_ascii=False)), - ) - conn.commit() - return True - except Exception: - conn.rollback() - return False - finally: - conn.close() + from paperforge.memory.permanent import append_reading_note + result = append_reading_note( + vault, paper_id, section, excerpt, + usage=usage, context=context, note=note, + project=project, tags=tags, + ) + return bool(result.get("ok")) def export_reading_log(vault: Path, since: str = "", limit: int = 50) -> list[dict]: - """Export reading notes as a list of dicts, ordered by created_at DESC.""" + """Export reading notes from JSONL (source of truth).""" + from paperforge.memory.permanent import read_all_reading_notes + + notes = read_all_reading_notes(vault) + + # Optionally enrich with papers metadata from DB db_path = get_memory_db_path(vault) - if not db_path.exists(): - return [] - - conn = get_connection(db_path, read_only=True) - try: - query = """ - SELECT e.created_at, e.paper_id, e.payload_json, - p.citation_key, p.title, p.year, p.first_author - FROM paper_events e - JOIN papers p ON p.zotero_key = e.paper_id - WHERE e.event_type = 'reading_note' - """ - params = [] - if since: - query += " AND e.created_at >= ?" - params.append(since) - query += " ORDER BY e.created_at DESC LIMIT ?" 
- params.append(limit) - - rows = conn.execute(query, params).fetchall() - results = [] - for row in rows: - payload = json.loads(row["payload_json"]) - results.append({ - "created_at": row["created_at"], - "paper_id": row["paper_id"], - "citation_key": row["citation_key"], - "title": row["title"], - "year": row["year"], - "first_author": row["first_author"], - "section": payload.get("section", ""), - "excerpt": payload.get("excerpt", ""), - "usage": payload.get("usage", ""), - "note": payload.get("note", ""), - }) - return results - finally: - conn.close() + paper_cache = {} + if db_path.exists(): + conn = get_connection(db_path, read_only=True) + try: + rows = conn.execute( + "SELECT zotero_key, citation_key, title, year, first_author FROM papers" + ).fetchall() + for r in rows: + paper_cache[r["zotero_key"]] = dict(r) + finally: + conn.close() + + results = [] + for n in notes: + created = n.get("created_at", "") + if since and created < since: + continue + pid = n.get("paper_id", "") + meta = paper_cache.get(pid, {}) + results.append({ + "created_at": created, + "paper_id": pid, + "citation_key": meta.get("citation_key", ""), + "title": meta.get("title", ""), + "year": meta.get("year", ""), + "first_author": meta.get("first_author", ""), + "section": n.get("section", ""), + "excerpt": n.get("excerpt", ""), + "usage": n.get("usage", ""), + "note": n.get("note", ""), + }) + + # Sort DESC by created_at, apply limit + results.sort(key=lambda x: x["created_at"], reverse=True) + return results[:limit] def write_correction_note(vault: Path, paper_id: str, original_id: str, From 6643229f9389c5c7092c4a25e67862e9cf059894 Mon Sep 17 00:00:00 2001 From: Research Assistant <research@example.com> Date: Thu, 14 May 2026 22:47:48 +0800 Subject: [PATCH 131/132] fix: refresh_paper FTS delete-before-upsert, import error checking --- paperforge/commands/reading_log.py | 21 ++++++++--- paperforge/memory/refresh.py | 56 ++++++++++++++++++------------ 2 files changed, 51 
insertions(+), 26 deletions(-) diff --git a/paperforge/commands/reading_log.py b/paperforge/commands/reading_log.py index c7e0aa8..0e19abb 100644 --- a/paperforge/commands/reading_log.py +++ b/paperforge/commands/reading_log.py @@ -164,13 +164,14 @@ def import_reading_log(vault: Path, filepath: Path) -> dict: papers_set: set[str] = set() entries_imported = 0 + errors: list[dict] = [] for paper in parsed["papers"]: for section in paper["sections"]: info = section.get("info", "") use = section.get("use", "") if info and use: - append_reading_note( + res = append_reading_note( vault, paper["paper_key"], section["section_name"], @@ -178,10 +179,22 @@ def import_reading_log(vault: Path, filepath: Path) -> dict: usage=use, note=section.get("note", "") or "", ) - entries_imported += 1 - papers_set.add(paper["paper_key"]) + if res.get("ok"): + entries_imported += 1 + papers_set.add(paper["paper_key"]) + else: + errors.append({ + "paper_key": paper["paper_key"], + "section": section["section_name"], + "error": res.get("error", "unknown"), + }) - return {"ok": True, "papers_imported": len(papers_set), "entries_imported": entries_imported} + return { + "ok": len(errors) == 0, + "papers_imported": len(papers_set), + "entries_imported": entries_imported, + "errors": errors, + } def lookup_paper_events(vault: Path, key: str) -> dict: diff --git a/paperforge/memory/refresh.py b/paperforge/memory/refresh.py index 12204cf..280b90c 100644 --- a/paperforge/memory/refresh.py +++ b/paperforge/memory/refresh.py @@ -41,6 +41,20 @@ def refresh_paper(vault: Path, entry: dict) -> bool: entry["next_step"] = str(compute_next_step(entry)) paper_values = build_paper_row(entry, generated_at) + # Step 1: Get old rowid before papers upsert (rowid may change on REPLACE) + old = conn.execute( + "SELECT rowid FROM papers WHERE zotero_key = ?", + (zotero_key,), + ).fetchone() + + # Step 2: Delete old FTS row BEFORE papers changes + if old: + conn.execute( + "DELETE FROM paper_fts WHERE rowid = ?", + 
(old["rowid"],), + ) + + # Step 3: Upsert papers placeholders = ", ".join([f":{c}" for c in PAPER_COLUMNS]) cols = ", ".join(PAPER_COLUMNS) conn.execute( @@ -78,33 +92,31 @@ def refresh_paper(vault: Path, entry: dict) -> bool: (zotero_key, raw_str, raw_str.lower().strip(), alias_type), ) - row = conn.execute( + # Step 4: Get new rowid after upsert + new = conn.execute( "SELECT rowid FROM papers WHERE zotero_key = ?", (zotero_key,), ).fetchone() - if row: + + # Step 5: Insert new FTS row + if new: conn.execute( - "INSERT INTO paper_fts(paper_fts, rowid) VALUES ('delete', ?)", - (row["rowid"],), + "INSERT INTO paper_fts(rowid, zotero_key, citation_key, title, first_author, authors_json, abstract, journal, domain, collection_path, collections_json) " + "VALUES ((SELECT rowid FROM papers WHERE zotero_key = ?), ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + ( + zotero_key, + zotero_key, + entry.get("citation_key", ""), + entry.get("title", ""), + entry.get("first_author", ""), + paper_values["authors_json"], + entry.get("abstract", ""), + entry.get("journal", ""), + entry.get("domain", ""), + entry.get("collection_path", ""), + paper_values["collections_json"], + ), ) - - conn.execute( - "INSERT INTO paper_fts(rowid, zotero_key, citation_key, title, first_author, authors_json, abstract, journal, domain, collection_path, collections_json) " - "VALUES ((SELECT rowid FROM papers WHERE zotero_key = ?), ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", - ( - zotero_key, - zotero_key, - entry.get("citation_key", ""), - entry.get("title", ""), - entry.get("first_author", ""), - paper_values["authors_json"], - entry.get("abstract", ""), - entry.get("journal", ""), - entry.get("domain", ""), - entry.get("collection_path", ""), - paper_values["collections_json"], - ), - ) conn.execute(PAPERS_AI_TRIGGER) conn.commit() From f054193407ac2eae7b1286194937b406896b621a Mon Sep 17 00:00:00 2001 From: Research Assistant <research@example.com> Date: Thu, 14 May 2026 22:58:03 +0800 Subject: [PATCH 132/132] fix: update 
package-data from literature-qa to skills/paperforge/** --- pyproject.toml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2862d37..c85ee0a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,10 +63,7 @@ version = {attr = "paperforge.__version__"} [tool.setuptools.package-data] paperforge = [ "py.typed", - "skills/literature-qa/prompt_deep_subagent.md", - "skills/literature-qa/scripts/*.md", - "skills/literature-qa/chart-reading/*.md", - "skills/literature-qa/chart-reading/*", + "skills/paperforge/**", "command_files/*.md", "plugin/*.css", "plugin/*.js",