# Memoirr: Preprocessor + Chunker Pipeline Smoke Test

This notebook runs a minimal Haystack pipeline using the SRT preprocessor and the semantic chunker, then embeds the first resulting chunk with the text embedder as a final check.

Requirements:
- Ensure a local sentence-transformers model is available under `models/<EMBEDDING_MODEL_NAME>/` (with `model.safetensors` and tokenizer/config files).
- `.env` should set `EMBEDDING_MODEL_NAME` (default included in repo). Optionally set `EMBEDDING_DEVICE` (e.g., `cuda:0`).

Notes:
- The preprocessor emits cleaned JSONL lines, one per caption.
- The chunker uses Chonkie SemanticChunker with the self-hosted embeddings to create time-aware chunks.


In [1]:
import json
import pathlib
import textwrap

from haystack import Pipeline

from src.components.chunker.semantic_chunker import SemanticChunker
from src.components.preprocessor.srt_preprocessor import SRTPreprocessor
from src.core.config import get_settings

# Load the project settings and echo the embedding-related values so the
# reader can see which model/device the later cells will use.
settings = get_settings()
print('EMBEDDING_MODEL_NAME =', settings.embedding_model_name)
print('EMBEDDING_DEVICE     =', settings.device)

# Friendly pre-flight check: is the model folder where we expect it?
models_root = pathlib.Path('models')
model_path = models_root / settings.embedding_model_name
if not model_path.exists():
    # The exact path is missing, so look anywhere under models/ for a directory
    # whose name equals the last path segment of the model name, ignoring case
    # (per the original note, this is meant to resemble the runtime resolver).
    wanted_dir_name = settings.embedding_model_name.split('/')[-1].lower()
    matching_dirs = []
    for entry in models_root.rglob('*'):
        if entry.is_dir() and entry.name.lower() == wanted_dir_name:
            matching_dirs.append(entry)

    if not matching_dirs:
        print('WARNING: Expected model folder not found under models/. The chunker cell may fail.')
    else:
        print('Found candidate model dir at:', matching_dirs[0])


EMBEDDING_MODEL_NAME = qwen3-embedding-0.6B
EMBEDDING_DEVICE     = 
Found candidate model dir at: models/chunker/qwen3-embedding-0.6B


In [2]:
# Tiny three-caption SRT fixture used as pipeline input.
# The literal is indented for readability; textwrap.dedent strips the common
# indentation, so the resulting string starts with a newline and holds the
# captions flush-left, separated by blank lines.
sample_srt = textwrap.dedent(
    '''
    1
    00:00:01,000 --> 00:00:02,000
    - Hello there!

    2
    00:00:02,100 --> 00:00:03,000
    How are you doing?

    3
    00:00:03,100 --> 00:00:04,000
    I'm fine. Thanks!
    '''
)
print(sample_srt)



1
00:00:01,000 --> 00:00:02,000
- Hello there!

2
00:00:02,100 --> 00:00:03,000
How are you doing?

3
00:00:03,100 --> 00:00:04,000
I'm fine. Thanks!



In [3]:
# Assemble a two-stage Haystack pipeline: the SRT preprocessor feeds its
# JSONL lines into the semantic chunker.
pipe = Pipeline()

preprocessor = SRTPreprocessor(min_len=1, dedupe_window_ms=1000)
pipe.add_component('pre', preprocessor)

chunker = SemanticChunker()
pipe.add_component('chunk', chunker)

pipe.connect('pre.jsonl_lines', 'chunk.jsonl_lines')

# Only the first stage needs external input; the chunker is fed by the connection.
pipeline_inputs = {'pre': {'srt_text': sample_srt}}
result = pipe.run(pipeline_inputs)

# The result is keyed by component name; pull out the chunker's outputs.
chunk_outputs = result['chunk']
chunk_lines = chunk_outputs['chunk_lines']
chunk_stats = chunk_outputs['stats']

print('Chunks produced:', len(chunk_lines))
print('Stats:', chunk_stats)

# Show at most the first five chunk records, decoded from their JSONL form.
preview_lines = chunk_lines[:5]
for idx, raw_line in enumerate(preview_lines):
    print(idx, json.loads(raw_line))


Chunks produced: 1
Stats: {'input_captions': 3, 'output_chunks': 1, 'avg_tokens_per_chunk': 15.0}
0 {'text': "Hello there! How are you doing? I'm fine. Thanks!", 'start_ms': 1000, 'end_ms': 4000, 'token_count': 15, 'caption_indices': [1, 2, 3], 'chunker_params': {'threshold': '0.75', 'chunk_size': 512, 'similarity_window': 3, 'min_sentences': 2, 'min_characters_per_sentence': 24, 'delim': ['. ', '! ', '? ', '\n'], 'include_delim': 'prev', 'skip_window': 0}}


In [None]:
# Embed one piece of text with the project's TextEmbedder as a final smoke check.
# This cell's imports are grouped at its top so it can also be run on its own.
import json

from src.components.embedder.text_embedder import TextEmbedder

# Reuse the pipeline from the previous cell when it exists; otherwise create an
# empty one so this cell still works in isolation.
try:
    pipe
except NameError:
    from haystack import Pipeline
    pipe = Pipeline()

# Register the embedder only once. Haystack's Pipeline.add_component rejects a
# duplicate component name with ValueError, so re-running this cell used to
# fail. Pipeline.get_component raises ValueError when the name is absent, which
# tells us this is the first run; on later runs the existing instance is reused
# instead of constructing a new embedder.
try:
    emb = pipe.get_component('emb')
except ValueError:
    emb = TextEmbedder()
    pipe.add_component('emb', emb)

# Prefer the first chunk produced earlier; fall back to a fixed sample string
# when no chunks are available (e.g. the chunking cell was not run).
available_chunks = globals().get('chunk_lines')
if available_chunks:
    text = json.loads(available_chunks[0])['text']
    print('Embedding first chunk text:', text)
else:
    text = 'Hello world'
    print('Embedding sample text: Hello world')

# The embedder is invoked directly (not through pipe.run); its output dict is
# expected to expose the vector under the 'embedding' key.
out = emb.run(text)
print('Embedding length:', len(out['embedding']))
print('Embedding preview:', out['embedding'][:8])
