Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "napsack"
version = "0.1.2"
readme = "README.md"
description = "NAPsack records and aggregates your computer use — screenshots plus input events (click, keypress, scroll, cursor move). It groups activity into event bursts and uses a VLM pipeline to generate human-readable captions describing what happened."
requires-python = ">=3.11,<3.14"
Expand Down
4 changes: 4 additions & 0 deletions src/napsack/label/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def parse_args():
p.add_argument("--dedupe-threshold", type=int, default=1, help="Hamming distance threshold for deduplication (drop if <= threshold, default: 1)")
p.add_argument("--annotate", action="store_true", help="Annotate videos with cursor positions and clicks (only for standard processing)")
p.add_argument("--image-mode", action="store_true", help="Send frames as individual images instead of video (for models that don't support video input)")
p.add_argument("--dense-caption", action="store_true", help="Include a dense text caption per chunk describing important text the user focused on, for retrieval")
p.add_argument("--skip-existing", action="store_true", help="Skip sessions that have already been processed")
p.add_argument("--visualize", action="store_true", help="Create annotated video visualizations after processing")
p.add_argument("--encode-only", action="store_true", help="Only encode videos (create chunks), skip labeling. Useful for pre-processing before running the full pipeline.")
Expand Down Expand Up @@ -121,6 +122,7 @@ def process_with_litellm(args, configs):
hash_cache_path=args.hash_cache,
dedupe_threshold=args.dedupe_threshold,
image_mode=args.image_mode,
dense_caption=args.dense_caption,
)

return processor.process_sessions(
Expand Down Expand Up @@ -151,6 +153,7 @@ def process_with_bigquery(args, configs):
hash_cache_path=args.hash_cache,
dedupe_threshold=args.dedupe_threshold,
image_mode=args.image_mode,
dense_caption=args.dense_caption,
)

return processor.process_sessions(
Expand Down Expand Up @@ -179,6 +182,7 @@ def process_with_tinfoil(args, configs):
hash_cache_path=args.hash_cache,
dedupe_threshold=args.dedupe_threshold,
image_mode=args.image_mode,
dense_caption=args.dense_caption,
)

return processor.process_sessions(
Expand Down
4 changes: 3 additions & 1 deletion src/napsack/label/clients/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Re-export the client implementations and response schemas as the package API.
from napsack.label.clients.client import VLMClient, CAPTION_SCHEMA, IMAGE_CAPTION_SCHEMA, DENSE_CAPTION_SCHEMA, DENSE_IMAGE_CAPTION_SCHEMA
from napsack.label.clients.litellm import LiteLLMClient
from napsack.label.clients.bigquery import BigQueryClient, BigQueryResponse
from napsack.label.clients.tinfoil import TinfoilClient
Expand All @@ -23,5 +23,7 @@ def create_client(client_type: str, **kwargs) -> VLMClient:
"TinfoilClient",
"CAPTION_SCHEMA",
"IMAGE_CAPTION_SCHEMA",
"DENSE_CAPTION_SCHEMA",
"DENSE_IMAGE_CAPTION_SCHEMA",
"create_client",
]
17 changes: 17 additions & 0 deletions src/napsack/label/clients/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,23 @@
}
}

# JSON schema for video-chunk responses when dense captioning is enabled:
# the model must return the usual actions array (see CAPTION_SCHEMA) plus a
# single free-text "dense_caption" string intended for text retrieval.
DENSE_CAPTION_SCHEMA = {
    "type": "object",
    "properties": {
        "actions": CAPTION_SCHEMA,
        "dense_caption": {"type": "string"}
    },
    "required": ["actions", "dense_caption"]
}

# Same wrapper shape for image-mode requests, where the actions array uses
# frame numbers instead of timestamps (see IMAGE_CAPTION_SCHEMA).
DENSE_IMAGE_CAPTION_SCHEMA = {
    "type": "object",
    "properties": {
        "actions": IMAGE_CAPTION_SCHEMA,
        "dense_caption": {"type": "string"}
    },
    "required": ["actions", "dense_caption"]
}

class VLMClient(ABC):
@abstractmethod
Expand Down
14 changes: 11 additions & 3 deletions src/napsack/label/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,7 @@ class Caption:
end_seconds: float
text: str
chunk_index: int = 0
dense_caption: Optional[str] = None

@property
def start_formatted(self) -> str:
Expand All @@ -381,18 +382,22 @@ def from_dict(cls, data: Dict) -> Caption:
start_seconds=data['start_seconds'],
end_seconds=data['end_seconds'],
text=data['caption'],
chunk_index=data.get('chunk_index', 0)
chunk_index=data.get('chunk_index', 0),
dense_caption=data.get('dense_caption'),
)

def to_dict(self) -> Dict:
    """Serialize this caption to its JSON-friendly dict form.

    The optional ``dense_caption`` key is only emitted when the field is
    set, so output files produced without dense captioning keep their
    original shape.
    """
    d = {
        'start': self.start_formatted,
        'end': self.end_formatted,
        'start_seconds': self.start_seconds,
        'end_seconds': self.end_seconds,
        'caption': self.text,
        'chunk_index': self.chunk_index
    }
    if self.dense_caption is not None:
        d['dense_caption'] = self.dense_caption
    return d


@dataclass
Expand All @@ -417,7 +422,7 @@ def all_events(self) -> List[Event]:
return events

def to_dict(self) -> Dict:
return {
d = {
'start_time': self.aggregations[0].timestamp if self.aggregations else 0,
'end_time': self.aggregations[-1].timestamp if self.aggregations else 0,
'start_index': self.start_index,
Expand All @@ -430,6 +435,9 @@ def to_dict(self) -> Dict:
'end_formatted': self.caption.end_formatted,
'scale_factor': self.screenshot_scale_factor
}
if self.caption.dense_caption is not None:
d['dense_caption'] = self.caption.dense_caption
return d


@dataclass
Expand Down
31 changes: 26 additions & 5 deletions src/napsack/label/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from napsack.label.models import SessionConfig, ChunkTask, Caption, Aggregation, VideoPath, MatchedCaption
from napsack.label.video import create_video, split_video, compute_max_size
from napsack.label.clients import VLMClient, CAPTION_SCHEMA, IMAGE_CAPTION_SCHEMA
from napsack.label.clients import VLMClient, CAPTION_SCHEMA, IMAGE_CAPTION_SCHEMA, DENSE_CAPTION_SCHEMA, DENSE_IMAGE_CAPTION_SCHEMA


# ============================================================================
Expand Down Expand Up @@ -144,23 +144,35 @@ def __init__(
hash_cache_path: Optional[str] = None,
dedupe_threshold: int = 1,
image_mode: bool = False,
dense_caption: bool = False,
):
self.client = client
self.encode_workers = encode_workers
self.label_workers = label_workers
self.screenshots_only = screenshots_only
self.prompt = self._load_prompt(prompt_file)
self.max_time_gap = max_time_gap
self.dedupe_threshold = dedupe_threshold
self.hash_map = load_hash_cache(hash_cache_path) if hash_cache_path else None
self.image_mode = image_mode
self.dense_caption = dense_caption

base_prompt = self._load_prompt(prompt_file)
output_format = self._load_output_format(image_mode, dense_caption)
self.prompt = base_prompt.replace("{{OUTPUT_FORMAT}}", output_format)

def _load_prompt(self, path: str) -> str:
p = Path(path)
if not p.exists():
p = Path(__file__).parent / path
return p.read_text()

def _load_output_format(self, image_mode: bool, dense_caption: bool) -> str:
if dense_caption:
filename = "prompts/output/dense_image.txt" if image_mode else "prompts/output/dense.txt"
else:
filename = "prompts/output/standard_image.txt" if image_mode else "prompts/output/standard.txt"
return self._load_prompt(filename)

def process_sessions(
self,
configs: List[SessionConfig],
Expand Down Expand Up @@ -535,17 +547,19 @@ def _process_tasks(self, tasks: List[ChunkTask], config_map: dict) -> List[Tuple
def _process_single_task(self, task: ChunkTask) -> any:
    """Upload one chunk's media and request structured captions from the VLM.

    In image mode, frames are sent as individual images with optional
    per-frame event text built from the task's aggregations; otherwise the
    chunk's video file is uploaded. The response schema switches to the
    dense variant when dense captioning is enabled.
    """
    if self.image_mode:
        schema = DENSE_IMAGE_CAPTION_SCHEMA if self.dense_caption else IMAGE_CAPTION_SCHEMA
        per_frame_text = None
        if task.aggregations:
            per_frame_text = [agg.to_prompt(f"Frame {j + 1}") for j, agg in enumerate(task.aggregations)]
        file_desc = self.client.upload_images(
            [str(p) for p in task.image_paths], session_id=task.session_id,
            per_frame_text=per_frame_text,
        )
        response = self.client.generate(task.prompt, file_desc, schema=schema)
    else:
        schema = DENSE_CAPTION_SCHEMA if self.dense_caption else CAPTION_SCHEMA
        file_desc = self.client.upload_file(str(task.video_path.resolve()), session_id=task.session_id)
        response = self.client.generate(task.prompt, file_desc, schema=schema)

    return response

Expand Down Expand Up @@ -597,6 +611,12 @@ def _save_results(

def _extract_captions(self, result: any, task: ChunkTask, fps: int = 1) -> List[Caption]:
captions = []
dense_caption_text = None

# Unwrap dense caption response format
if isinstance(result, dict) and "actions" in result:
dense_caption_text = result.get("dense_caption") if self.dense_caption else None
result = result.get("actions", [])

if isinstance(result, str) or not isinstance(result, list):
return captions
Expand Down Expand Up @@ -636,7 +656,8 @@ def _extract_captions(self, result: any, task: ChunkTask, fps: int = 1) -> List[
start_seconds=abs_start,
end_seconds=abs_end,
text=item.get("caption", item.get("description", "")),
chunk_index=task.chunk_index
chunk_index=task.chunk_index,
dense_caption=dense_caption_text,
))

return captions
Expand Down
14 changes: 1 addition & 13 deletions src/napsack/label/prompts/default.txt
Original file line number Diff line number Diff line change
Expand Up @@ -83,16 +83,4 @@ Generated captions must be in past tense, and at the level of detail as the exam

You MUST quote specific things from the screen so it's easy to reproduce your steps.

## Output

A JSON array of objects:

```json
[
{
"start": "MM:SS",
"end": "MM:SS",
"caption": "..."
}
]
```
{{OUTPUT_FORMAT}}
14 changes: 1 addition & 13 deletions src/napsack/label/prompts/image_mode.txt
Original file line number Diff line number Diff line change
Expand Up @@ -77,16 +77,4 @@ Generated captions must be in past tense, and at the level of detail as the exam

You MUST quote specific things from the screen so it's easy to reproduce your steps.

## Output

A JSON array of objects. Use **frame numbers** (integers) for start and end:

```json
[
{
"start": 1,
"end": 3,
"caption": "..."
}
]
```
{{OUTPUT_FORMAT}}
33 changes: 33 additions & 0 deletions src/napsack/label/prompts/output/dense.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
## Dense Caption

In addition to the actions array, you MUST also produce a `dense_caption` field: a single string that captures the **important text content the user was focused on or interacting with** during this chunk. This includes:
- Text being read, written, or edited
- URLs, file paths, code snippets, search queries
- Names, labels, or data values visible and relevant to the user's activity

The dense caption should be a concise but information-rich summary optimized for text search and retrieval. Do NOT describe actions — instead, capture the **textual content** itself.

### Examples

- "VS Code editor open to src/utils/auth.py. Function `verify_jwt_token(token: str, secret: str) -> dict` on lines 42-58. Import statements: `from jose import jwt`, `from datetime import datetime, timedelta`. Error highlighted on line 51: `jwt.ExpiredSignatureError`."
- "Chrome browser on GitHub pull request #347 'Fix race condition in connection pool' in repo acme/backend. Files changed: pool.py (+23 -8), test_pool.py (+45). Review comment from @danl: 'Should we add a timeout here?' on line 89 of pool.py."
- "Terminal running `kubectl get pods -n staging`. Output shows 4 pods: api-server-7f8b (Running), worker-3a2c (CrashLoopBackOff), redis-cache-1d4e (Running), nginx-proxy-9b1f (Running). Error on worker pod: OOMKilled, restarts: 12."
- "Google Sheets spreadsheet 'Q2 Marketing Budget'. Editing cell D14 with formula `=SUM(D2:D13)`. Column D header: 'Spend ($)'. Visible values: D2=1200, D3=3400, D5=890. Sheet tab: 'Paid Ads'."
- "Slack conversation in #eng-incidents channel. User typed: 'looks like the redis cluster in us-east-1 is throwing CLUSTERDOWN errors — can someone check if the failover completed? cc @oncall-infra'. Replying to message from @mkhan: 'We're seeing elevated 5xx rates on the payments service since 2:14pm PT.' Channel topic: 'Production incident triage'."

## Output

Return a JSON object with two fields:

```json
{
"actions": [
{
"start": "MM:SS",
"end": "MM:SS",
"caption": "..."
}
],
"dense_caption": "A single string capturing important text content..."
}
```
33 changes: 33 additions & 0 deletions src/napsack/label/prompts/output/dense_image.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
## Dense Caption

In addition to the actions array, you MUST also produce a `dense_caption` field: a single string that captures the **important text content the user was focused on or interacting with** during this chunk. This includes:
- Text being read, written, or edited
- URLs, file paths, code snippets, search queries
- Names, labels, or data values visible and relevant to the user's activity

The dense caption should be a concise but information-rich summary optimized for text search and retrieval. Do NOT describe actions — instead, capture the **textual content** itself.

### Examples

- "VS Code editor open to src/utils/auth.py. Function `verify_jwt_token(token: str, secret: str) -> dict` on lines 42-58. Import statements: `from jose import jwt`, `from datetime import datetime, timedelta`. Error highlighted on line 51: `jwt.ExpiredSignatureError`."
- "Chrome browser on GitHub pull request #347 'Fix race condition in connection pool' in repo acme/backend. Files changed: pool.py (+23 -8), test_pool.py (+45). Review comment from @danl: 'Should we add a timeout here?' on line 89 of pool.py."
- "Terminal running `kubectl get pods -n staging`. Output shows 4 pods: api-server-7f8b (Running), worker-3a2c (CrashLoopBackOff), redis-cache-1d4e (Running), nginx-proxy-9b1f (Running). Error on worker pod: OOMKilled, restarts: 12."
- "Google Sheets spreadsheet 'Q2 Marketing Budget'. Editing cell D14 with formula `=SUM(D2:D13)`. Column D header: 'Spend ($)'. Visible values: D2=1200, D3=3400, D5=890. Sheet tab: 'Paid Ads'."
- "Slack conversation in #eng-incidents channel. User typed: 'looks like the redis cluster in us-east-1 is throwing CLUSTERDOWN errors — can someone check if the failover completed? cc @oncall-infra'. Replying to message from @mkhan: 'We're seeing elevated 5xx rates on the payments service since 2:14pm PT.' Channel topic: 'Production incident triage'."

## Output

Return a JSON object with two fields. Use **frame numbers** (integers) for start and end:

```json
{
"actions": [
{
"start": 1,
"end": 3,
"caption": "..."
}
],
"dense_caption": "A single string capturing important text content..."
}
```
13 changes: 13 additions & 0 deletions src/napsack/label/prompts/output/standard.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
## Output

A JSON array of objects:

```json
[
{
"start": "MM:SS",
"end": "MM:SS",
"caption": "..."
}
]
```
13 changes: 13 additions & 0 deletions src/napsack/label/prompts/output/standard_image.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
## Output

A JSON array of objects. Use **frame numbers** (integers) for start and end:

```json
[
{
"start": 1,
"end": 3,
"caption": "..."
}
]
```
14 changes: 1 addition & 13 deletions src/napsack/label/prompts/screenshots_only.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,16 +72,4 @@ Generated captions must be in past tense, and at the level of detail as the exam

You MUST quote specific things from the screen so it's easy to reproduce your steps.

## Output

A JSON array of objects:

```json
[
{
"start": "MM:SS",
"end": "MM:SS",
"caption": "..."
}
]
```
{{OUTPUT_FORMAT}}
Loading