Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "napsack"
version = "0.1.2"
readme = "README.md"
description = "NAPsack records and aggregates your computer use — screenshots plus input events (click, keypress, scroll, cursor move). It groups activity into event bursts and uses a VLM pipeline to generate human-readable captions describing what happened."
requires-python = ">=3.11,<3.14"
Expand Down
4 changes: 4 additions & 0 deletions src/napsack/label/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def parse_args():
p.add_argument("--dedupe-threshold", type=int, default=1, help="Hamming distance threshold for deduplication (drop if <= threshold, default: 1)")
p.add_argument("--annotate", action="store_true", help="Annotate videos with cursor positions and clicks (only for standard processing)")
p.add_argument("--image-mode", action="store_true", help="Send frames as individual images instead of video (for models that don't support video input)")
p.add_argument("--dense-caption", action="store_true", help="Include a dense text caption per chunk describing important text the user focused on, for retrieval")
p.add_argument("--skip-existing", action="store_true", help="Skip sessions that have already been processed")
p.add_argument("--visualize", action="store_true", help="Create annotated video visualizations after processing")
p.add_argument("--encode-only", action="store_true", help="Only encode videos (create chunks), skip labeling. Useful for pre-processing before running the full pipeline.")
Expand Down Expand Up @@ -121,6 +122,7 @@ def process_with_litellm(args, configs):
hash_cache_path=args.hash_cache,
dedupe_threshold=args.dedupe_threshold,
image_mode=args.image_mode,
dense_caption=args.dense_caption,
)

return processor.process_sessions(
Expand Down Expand Up @@ -151,6 +153,7 @@ def process_with_bigquery(args, configs):
hash_cache_path=args.hash_cache,
dedupe_threshold=args.dedupe_threshold,
image_mode=args.image_mode,
dense_caption=args.dense_caption,
)

return processor.process_sessions(
Expand Down Expand Up @@ -179,6 +182,7 @@ def process_with_tinfoil(args, configs):
hash_cache_path=args.hash_cache,
dedupe_threshold=args.dedupe_threshold,
image_mode=args.image_mode,
dense_caption=args.dense_caption,
)

return processor.process_sessions(
Expand Down
4 changes: 3 additions & 1 deletion src/napsack/label/clients/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Re-export the client implementations and response schemas as the package API.
from napsack.label.clients.client import VLMClient, CAPTION_SCHEMA, IMAGE_CAPTION_SCHEMA, DENSE_CAPTION_SCHEMA, DENSE_IMAGE_CAPTION_SCHEMA
from napsack.label.clients.litellm import LiteLLMClient
from napsack.label.clients.bigquery import BigQueryClient, BigQueryResponse
from napsack.label.clients.tinfoil import TinfoilClient
Expand All @@ -23,5 +23,7 @@ def create_client(client_type: str, **kwargs) -> VLMClient:
"TinfoilClient",
"CAPTION_SCHEMA",
"IMAGE_CAPTION_SCHEMA",
"DENSE_CAPTION_SCHEMA",
"DENSE_IMAGE_CAPTION_SCHEMA",
"create_client",
]
17 changes: 17 additions & 0 deletions src/napsack/label/clients/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,23 @@
}
}

# JSON schema for video-chunk responses when dense captioning is enabled:
# the model must return the usual actions array (see CAPTION_SCHEMA) plus a
# single free-text "dense_caption" string intended for text retrieval.
DENSE_CAPTION_SCHEMA = {
    "type": "object",
    "properties": {
        "actions": CAPTION_SCHEMA,
        "dense_caption": {"type": "string"}
    },
    "required": ["actions", "dense_caption"]
}

# Same wrapper shape for image-mode requests, where the actions array uses
# frame numbers instead of timestamps (see IMAGE_CAPTION_SCHEMA).
DENSE_IMAGE_CAPTION_SCHEMA = {
    "type": "object",
    "properties": {
        "actions": IMAGE_CAPTION_SCHEMA,
        "dense_caption": {"type": "string"}
    },
    "required": ["actions", "dense_caption"]
}

class VLMClient(ABC):
@abstractmethod
Expand Down
14 changes: 11 additions & 3 deletions src/napsack/label/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,7 @@ class Caption:
end_seconds: float
text: str
chunk_index: int = 0
dense_caption: Optional[str] = None

@property
def start_formatted(self) -> str:
Expand All @@ -381,18 +382,22 @@ def from_dict(cls, data: Dict) -> Caption:
start_seconds=data['start_seconds'],
end_seconds=data['end_seconds'],
text=data['caption'],
chunk_index=data.get('chunk_index', 0)
chunk_index=data.get('chunk_index', 0),
dense_caption=data.get('dense_caption'),
)

def to_dict(self) -> Dict:
    """Serialize this caption to its JSON-friendly dict form.

    The optional ``dense_caption`` key is only emitted when the field is
    set, so output files produced without dense captioning keep their
    original shape.
    """
    d = {
        'start': self.start_formatted,
        'end': self.end_formatted,
        'start_seconds': self.start_seconds,
        'end_seconds': self.end_seconds,
        'caption': self.text,
        'chunk_index': self.chunk_index
    }
    if self.dense_caption is not None:
        d['dense_caption'] = self.dense_caption
    return d


@dataclass
Expand All @@ -417,7 +422,7 @@ def all_events(self) -> List[Event]:
return events

def to_dict(self) -> Dict:
return {
d = {
'start_time': self.aggregations[0].timestamp if self.aggregations else 0,
'end_time': self.aggregations[-1].timestamp if self.aggregations else 0,
'start_index': self.start_index,
Expand All @@ -430,6 +435,9 @@ def to_dict(self) -> Dict:
'end_formatted': self.caption.end_formatted,
'scale_factor': self.screenshot_scale_factor
}
if self.caption.dense_caption is not None:
d['dense_caption'] = self.caption.dense_caption
return d


@dataclass
Expand Down
31 changes: 26 additions & 5 deletions src/napsack/label/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from napsack.label.models import SessionConfig, ChunkTask, Caption, Aggregation, VideoPath, MatchedCaption
from napsack.label.video import create_video, split_video, compute_max_size
from napsack.label.clients import VLMClient, CAPTION_SCHEMA, IMAGE_CAPTION_SCHEMA
from napsack.label.clients import VLMClient, CAPTION_SCHEMA, IMAGE_CAPTION_SCHEMA, DENSE_CAPTION_SCHEMA, DENSE_IMAGE_CAPTION_SCHEMA


# ============================================================================
Expand Down Expand Up @@ -144,23 +144,35 @@ def __init__(
hash_cache_path: Optional[str] = None,
dedupe_threshold: int = 1,
image_mode: bool = False,
dense_caption: bool = False,
):
self.client = client
self.encode_workers = encode_workers
self.label_workers = label_workers
self.screenshots_only = screenshots_only
self.prompt = self._load_prompt(prompt_file)
self.max_time_gap = max_time_gap
self.dedupe_threshold = dedupe_threshold
self.hash_map = load_hash_cache(hash_cache_path) if hash_cache_path else None
self.image_mode = image_mode
self.dense_caption = dense_caption

base_prompt = self._load_prompt(prompt_file)
output_format = self._load_output_format(image_mode, dense_caption)
self.prompt = base_prompt.replace("{{OUTPUT_FORMAT}}", output_format)

def _load_prompt(self, path: str) -> str:
p = Path(path)
if not p.exists():
p = Path(__file__).parent / path
return p.read_text()

def _load_output_format(self, image_mode: bool, dense_caption: bool) -> str:
if dense_caption:
filename = "prompts/output/dense_image.txt" if image_mode else "prompts/output/dense.txt"
else:
filename = "prompts/output/standard_image.txt" if image_mode else "prompts/output/standard.txt"
return self._load_prompt(filename)

def process_sessions(
self,
configs: List[SessionConfig],
Expand Down Expand Up @@ -535,17 +547,19 @@ def _process_tasks(self, tasks: List[ChunkTask], config_map: dict) -> List[Tuple
def _process_single_task(self, task: ChunkTask) -> any:
    """Upload one chunk's media and request structured captions from the VLM.

    In image mode, frames are sent as individual images with optional
    per-frame event text built from the task's aggregations; otherwise the
    chunk's video file is uploaded. The response schema switches to the
    dense variant when dense captioning is enabled.
    """
    if self.image_mode:
        schema = DENSE_IMAGE_CAPTION_SCHEMA if self.dense_caption else IMAGE_CAPTION_SCHEMA
        per_frame_text = None
        if task.aggregations:
            per_frame_text = [agg.to_prompt(f"Frame {j + 1}") for j, agg in enumerate(task.aggregations)]
        file_desc = self.client.upload_images(
            [str(p) for p in task.image_paths], session_id=task.session_id,
            per_frame_text=per_frame_text,
        )
        response = self.client.generate(task.prompt, file_desc, schema=schema)
    else:
        schema = DENSE_CAPTION_SCHEMA if self.dense_caption else CAPTION_SCHEMA
        file_desc = self.client.upload_file(str(task.video_path.resolve()), session_id=task.session_id)
        response = self.client.generate(task.prompt, file_desc, schema=schema)

    return response

Expand Down Expand Up @@ -597,6 +611,12 @@ def _save_results(

def _extract_captions(self, result: any, task: ChunkTask, fps: int = 1) -> List[Caption]:
captions = []
dense_caption_text = None

# Unwrap dense caption response format
if isinstance(result, dict) and "actions" in result:
dense_caption_text = result.get("dense_caption") if self.dense_caption else None
result = result.get("actions", [])

if isinstance(result, str) or not isinstance(result, list):
return captions
Expand Down Expand Up @@ -636,7 +656,8 @@ def _extract_captions(self, result: any, task: ChunkTask, fps: int = 1) -> List[
start_seconds=abs_start,
end_seconds=abs_end,
text=item.get("caption", item.get("description", "")),
chunk_index=task.chunk_index
chunk_index=task.chunk_index,
dense_caption=dense_caption_text,
))

return captions
Expand Down
14 changes: 1 addition & 13 deletions src/napsack/label/prompts/default.txt
Original file line number Diff line number Diff line change
Expand Up @@ -83,16 +83,4 @@ Generated captions must be in past tense, and at the level of detail as the exam

You MUST quote specific things from the screen so it's easy to reproduce your steps.

## Output

A JSON array of objects:

```json
[
{
"start": "MM:SS",
"end": "MM:SS",
"caption": "..."
}
]
```
{{OUTPUT_FORMAT}}
14 changes: 1 addition & 13 deletions src/napsack/label/prompts/image_mode.txt
Original file line number Diff line number Diff line change
Expand Up @@ -77,16 +77,4 @@ Generated captions must be in past tense, and at the level of detail as the exam

You MUST quote specific things from the screen so it's easy to reproduce your steps.

## Output

A JSON array of objects. Use **frame numbers** (integers) for start and end:

```json
[
{
"start": 1,
"end": 3,
"caption": "..."
}
]
```
{{OUTPUT_FORMAT}}
33 changes: 33 additions & 0 deletions src/napsack/label/prompts/output/dense.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
## Dense Caption

In addition to the actions array, you MUST also produce a `dense_caption` field: a single string that captures the **important text content the user was focused on or interacting with** during this chunk. This includes:
- Text being read, written, or edited
- URLs, file paths, code snippets, search queries
- Names, labels, or data values visible and relevant to the user's activity

The dense caption should be a concise but information-rich summary optimized for text search and retrieval. Do NOT describe actions — instead, capture the **textual content** itself.

### Examples

- "VS Code editor open to src/utils/auth.py. Function `verify_jwt_token(token: str, secret: str) -> dict` on lines 42-58. Import statements: `from jose import jwt`, `from datetime import datetime, timedelta`. Error highlighted on line 51: `jwt.ExpiredSignatureError`."
- "Chrome browser on GitHub pull request #347 'Fix race condition in connection pool' in repo acme/backend. Files changed: pool.py (+23 -8), test_pool.py (+45). Review comment from @danl: 'Should we add a timeout here?' on line 89 of pool.py."
- "Terminal running `kubectl get pods -n staging`. Output shows 4 pods: api-server-7f8b (Running), worker-3a2c (CrashLoopBackOff), redis-cache-1d4e (Running), nginx-proxy-9b1f (Running). Error on worker pod: OOMKilled, restarts: 12."
- "Google Sheets spreadsheet 'Q2 Marketing Budget'. Editing cell D14 with formula `=SUM(D2:D13)`. Column D header: 'Spend ($)'. Visible values: D2=1200, D3=3400, D5=890. Sheet tab: 'Paid Ads'."
- "Slack conversation in #eng-incidents channel. User typed: 'looks like the redis cluster in us-east-1 is throwing CLUSTERDOWN errors — can someone check if the failover completed? cc @oncall-infra'. Replying to message from @mkhan: 'We're seeing elevated 5xx rates on the payments service since 2:14pm PT.' Channel topic: 'Production incident triage'."

## Output

Return a JSON object with two fields:

```json
{
"actions": [
{
"start": "MM:SS",
"end": "MM:SS",
"caption": "..."
}
],
"dense_caption": "A single string capturing important text content..."
}
```
33 changes: 33 additions & 0 deletions src/napsack/label/prompts/output/dense_image.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
## Dense Caption

In addition to the actions array, you MUST also produce a `dense_caption` field: a single string that captures the **important text content the user was focused on or interacting with** during this chunk. This includes:
- Text being read, written, or edited
- URLs, file paths, code snippets, search queries
- Names, labels, or data values visible and relevant to the user's activity

The dense caption should be a concise but information-rich summary optimized for text search and retrieval. Do NOT describe actions — instead, capture the **textual content** itself.

### Examples

- "VS Code editor open to src/utils/auth.py. Function `verify_jwt_token(token: str, secret: str) -> dict` on lines 42-58. Import statements: `from jose import jwt`, `from datetime import datetime, timedelta`. Error highlighted on line 51: `jwt.ExpiredSignatureError`."
- "Chrome browser on GitHub pull request #347 'Fix race condition in connection pool' in repo acme/backend. Files changed: pool.py (+23 -8), test_pool.py (+45). Review comment from @danl: 'Should we add a timeout here?' on line 89 of pool.py."
- "Terminal running `kubectl get pods -n staging`. Output shows 4 pods: api-server-7f8b (Running), worker-3a2c (CrashLoopBackOff), redis-cache-1d4e (Running), nginx-proxy-9b1f (Running). Error on worker pod: OOMKilled, restarts: 12."
- "Google Sheets spreadsheet 'Q2 Marketing Budget'. Editing cell D14 with formula `=SUM(D2:D13)`. Column D header: 'Spend ($)'. Visible values: D2=1200, D3=3400, D5=890. Sheet tab: 'Paid Ads'."
- "Slack conversation in #eng-incidents channel. User typed: 'looks like the redis cluster in us-east-1 is throwing CLUSTERDOWN errors — can someone check if the failover completed? cc @oncall-infra'. Replying to message from @mkhan: 'We're seeing elevated 5xx rates on the payments service since 2:14pm PT.' Channel topic: 'Production incident triage'."

## Output

Return a JSON object with two fields. Use **frame numbers** (integers) for start and end:

```json
{
"actions": [
{
"start": 1,
"end": 3,
"caption": "..."
}
],
"dense_caption": "A single string capturing important text content..."
}
```
13 changes: 13 additions & 0 deletions src/napsack/label/prompts/output/standard.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
## Output

A JSON array of objects:

```json
[
{
"start": "MM:SS",
"end": "MM:SS",
"caption": "..."
}
]
```
13 changes: 13 additions & 0 deletions src/napsack/label/prompts/output/standard_image.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
## Output

A JSON array of objects. Use **frame numbers** (integers) for start and end:

```json
[
{
"start": 1,
"end": 3,
"caption": "..."
}
]
```
14 changes: 1 addition & 13 deletions src/napsack/label/prompts/screenshots_only.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,16 +72,4 @@ Generated captions must be in past tense, and at the level of detail as the exam

You MUST quote specific things from the screen so it's easy to reproduce your steps.

## Output

A JSON array of objects:

```json
[
{
"start": "MM:SS",
"end": "MM:SS",
"caption": "..."
}
]
```
{{OUTPUT_FORMAT}}
Loading