# Memoirr: Preprocessor + Chunker Pipeline Smoke Test

This notebook runs a small end-to-end Haystack pipeline on a sample SRT: the SRT preprocessor and the semantic chunker, followed by embedding and a write to Qdrant.

Requirements:
- Ensure a local sentence-transformers model is available under `models/<EMBEDDING_MODEL_NAME>/` (with `model.safetensors` and tokenizer/config files).
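- `.env` should set `EMBEDDING_MODEL_NAME` (default included in repo). Optionally set `EMBEDDING_DEVICE` (e.g., `cuda:0`).
- A Qdrant instance must be reachable, because the final cell writes to the `memoirr` collection (the recorded run used `http://localhost:6300`). Note that the recorded run deleted and recreated that collection before writing.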

Notes:
- The preprocessor emits cleaned JSONL lines, one per caption.
- The chunker uses Chonkie SemanticChunker with the self-hosted embeddings to create time-aware chunks.


In [1]:

import pathlib
import textwrap
from src.core.config import get_settings

# Load the project settings and echo the embedding configuration for this run.
settings = get_settings()
model_name = settings.embedding_model_name
print('EMBEDDING_MODEL_NAME =', model_name)
print('EMBEDDING_DEVICE     =', settings.device)

# Sanity check: is the model folder where we expect it (models/<name>)?
models_root = pathlib.Path('models')
model_path = models_root / model_name
if not model_path.exists():
    # Not at the direct path: look anywhere under models/ for a directory whose
    # name matches the last path segment of the model name, ignoring case.
    wanted = model_name.rsplit('/', 1)[-1].lower()
    matches = []
    for entry in models_root.rglob('*'):
        if entry.is_dir() and entry.name.lower() == wanted:
            matches.append(entry)

    if not matches:
        print('WARNING: Expected model folder not found under models/. The chunker cell may fail.')
    else:
        print('Found candidate model dir at:', matches[0])


EMBEDDING_MODEL_NAME = qwen3-embedding-0.6B
EMBEDDING_DEVICE     = 
Found candidate model dir at: models/chunker/qwen3-embedding-0.6B


In [2]:
# Tiny three-caption SRT sample used as pipeline input.
# Each caption is (index, time range, text); captions are separated by a blank
# line, and the whole text starts and ends with a newline.
sample_srt = '\n' + '\n\n'.join(
    '\n'.join(caption)
    for caption in (
        ('1', '00:00:01,000 --> 00:00:02,000', '- Hello there!'),
        ('2', '00:00:02,100 --> 00:00:03,000', 'How are you doing?'),
        ('3', '00:00:03,100 --> 00:00:04,000', "I'm fine. Thanks!"),
    )
) + '\n'
print(sample_srt)



1
00:00:01,000 --> 00:00:02,000
- Hello there!

2
00:00:02,100 --> 00:00:03,000
How are you doing?

3
00:00:03,100 --> 00:00:04,000
I'm fine. Thanks!



In [3]:
# Run the complete end-to-end pipeline: SRT → Preprocess → Chunk → Embed → Qdrant
#
# NOTE(review): the recorded output of this cell shows the writer issuing a
# DELETE on the target collection before recreating it, so running this cell
# appears to be destructive for existing data in that collection — confirm
# before pointing it at a shared Qdrant instance.
from src.pipelines.srt_to_qdrant import build_srt_to_qdrant_pipeline

print('Building the complete SRT-to-Qdrant pipeline...')
pipe = build_srt_to_qdrant_pipeline()

# List the pipeline's nodes and edges so the wiring is visible in the output.
print('Pipeline components:')
for component_name in pipe.graph.nodes:
    print(f'  - {component_name}')

# The same component pair can be listed more than once when several
# connections exist between the two components.
print('Pipeline connections:')
for edge in pipe.graph.edges:
    print(f'  - {edge[0]} → {edge[1]}')

print('Running pipeline on sample SRT...')
result = pipe.run({'pre': {'srt_text': sample_srt}})

print('Pipeline Results:')
print('=================')

# Show preprocessing stats
pre_stats = result['pre']['stats']
print(f'Preprocessor: {pre_stats}')

# Show chunking stats
chunk_stats = result['chunk']['stats']
print(f'Chunker: {chunk_stats}')

# Show write stats
write_stats = result['write']['stats']
print(f'Writer: {write_stats}')

# Only claim success when the writer actually stored something and skipped
# nothing; previously the banner was printed regardless of the outcome.
written_count = write_stats.get('written', 0)
skipped_count = write_stats.get('skipped', 0)
if written_count > 0 and skipped_count == 0:
    print('✅ SUCCESS! Check Qdrant UI at http://localhost:6300/dashboard to see the embedded chunks!')
    print('Collection name: memoirr')
else:
    print(
        '⚠️ Pipeline finished, but the write step looks incomplete '
        f'(written={written_count}, skipped={skipped_count}). Check the logs above.'
    )

[2m2025-09-20T14:53:10.723540Z[0m [[32m[1minfo     [0m] [1mPipelines module initialized  [0m [36mavailable_pipelines[0m=[35m['srt_to_qdrant'][0m [36menvironment[0m=[35mdevelopment[0m [36mmodule[0m=[35mpipelines[0m [36mservice[0m=[35mmemoirr[0m


[2m2025-09-20T14:53:10.943916Z[0m [[32m[1minfo     [0m] [1mBuilding SRT-to-Qdrant pipeline[0m [36mcomponent[0m=[35mpipeline_builder[0m [36mpipeline_type[0m=[35msrt_to_qdrant[0m


Building the complete SRT-to-Qdrant pipeline...
{'pipeline_type': 'srt_to_qdrant', 'component': 'pipeline_builder', 'event': 'Building SRT-to-Qdrant pipeline', 'level': 'info', 'timestamp': '2025-09-20T14:53:10.943916Z'}


[2m2025-09-20T14:53:14.258216Z[0m [[32m[1minfo     [0m] [1mSRTPreprocessor initialized   [0m [36mcomponent[0m=[35mpreprocessor[0m [36mdedupe_window_ms[0m=[35m1000[0m [36mmin_len[0m=[35m1[0m


{'min_len': 1, 'dedupe_window_ms': 1000, 'component': 'preprocessor', 'event': 'SRTPreprocessor initialized', 'level': 'info', 'timestamp': '2025-09-20T14:53:14.258216Z'}


[2m2025-09-20T14:53:14.259538Z[0m [[32m[1minfo     [0m] [1mSemanticChunker initialized   [0m [36mchunk_size[0m=[35m512[0m [36mcomponent[0m=[35mchunker[0m [36mmin_sentences[0m=[35m2[0m [36msimilarity_window[0m=[35m3[0m [36mthreshold[0m=[35m0.75[0m


{'threshold': '0.75', 'chunk_size': 512, 'similarity_window': 3, 'min_sentences': 2, 'component': 'chunker', 'event': 'SemanticChunker initialized', 'level': 'info', 'timestamp': '2025-09-20T14:53:14.259538Z'}


[2m2025-09-20T14:53:14.260471Z[0m [[32m[1minfo     [0m] [1mChunkJsonlToTexts initialized [0m [36mcomponent[0m=[35mpipeline_glue[0m


{'component': 'pipeline_glue', 'event': 'ChunkJsonlToTexts initialized', 'level': 'info', 'timestamp': '2025-09-20T14:53:14.260471Z'}


[2m2025-09-20T14:53:14.261846Z[0m [[32m[1minfo     [0m] [1mModel resolved via recursive search[0m [36mcomponent[0m=[35mmodel_utils[0m [36mmethod[0m=[35mrecursive[0m [36mmodel_name[0m=[35mqwen3-embedding-0.6B[0m [36mresolved_path[0m=[35mmodels/chunker/qwen3-embedding-0.6B[0m [36mtotal_candidates[0m=[35m1[0m


{'model_name': 'qwen3-embedding-0.6B', 'resolved_path': 'models/chunker/qwen3-embedding-0.6B', 'method': 'recursive', 'total_candidates': 1, 'component': 'model_utils', 'event': 'Model resolved via recursive search', 'level': 'info', 'timestamp': '2025-09-20T14:53:14.261846Z'}


[2m2025-09-20T14:53:14.262464Z[0m [[32m[1minfo     [0m] [1mEmbedding model resolved successfully[0m [36mcomponent[0m=[35membedder[0m [36mmodel_name[0m=[35mqwen3-embedding-0.6B[0m [36mmodel_path[0m=[35mmodels/chunker/qwen3-embedding-0.6B[0m


{'model_name': 'qwen3-embedding-0.6B', 'model_path': 'models/chunker/qwen3-embedding-0.6B', 'component': 'embedder', 'event': 'Embedding model resolved successfully', 'level': 'info', 'timestamp': '2025-09-20T14:53:14.262464Z'}


[2m2025-09-20T14:53:14.367553Z[0m [[32m[1minfo     [0m] [1mLoad pretrained SentenceTransformer: models/chunker/qwen3-embedding-0.6B[0m [36mlineno[0m=[35m227[0m [36mmodule[0m=[35msentence_transformers.SentenceTransformer[0m


Load pretrained SentenceTransformer: models/chunker/qwen3-embedding-0.6B


[2m2025-09-20T14:53:15.763432Z[0m [[32m[1minfo     [0m] [1m1 prompt is loaded, with the key: query[0m [36mlineno[0m=[35m378[0m [36mmodule[0m=[35msentence_transformers.SentenceTransformer[0m


1 prompt is loaded, with the key: query


[2m2025-09-20T14:53:15.764785Z[0m [[32m[1minfo     [0m] [1mTextEmbedder initialized successfully[0m [36mcomponent[0m=[35membedder[0m [36membedding_dimension[0m=[35m1024[0m [36mmodel_name[0m=[35mqwen3-embedding-0.6B[0m [36mmodel_path[0m=[35mmodels/chunker/qwen3-embedding-0.6B[0m


{'model_name': 'qwen3-embedding-0.6B', 'embedding_dimension': 1024, 'model_path': 'models/chunker/qwen3-embedding-0.6B', 'component': 'embedder', 'event': 'TextEmbedder initialized successfully', 'level': 'info', 'timestamp': '2025-09-20T14:53:15.764785Z'}


[2m2025-09-20T14:53:15.765700Z[0m [[32m[1minfo     [0m] [1mBuildDocuments initialized    [0m [36mcomponent[0m=[35mpipeline_glue[0m


{'component': 'pipeline_glue', 'event': 'BuildDocuments initialized', 'level': 'info', 'timestamp': '2025-09-20T14:53:15.765700Z'}


[2m2025-09-20T14:53:15.767632Z[0m [[32m[1minfo     [0m] [1mQdrantWriter initialized successfully[0m [36mcollection_name[0m=[35mmemoirr[0m [36mcomponent[0m=[35mqdrant_writer[0m [36membedding_dimension[0m=[35m1024[0m [36mqdrant_url[0m=[35mhttp://localhost:6300[0m


{'qdrant_url': 'http://localhost:6300', 'collection_name': 'memoirr', 'embedding_dimension': 1024, 'component': 'qdrant_writer', 'event': 'QdrantWriter initialized successfully', 'level': 'info', 'timestamp': '2025-09-20T14:53:15.767632Z'}


[2m2025-09-20T14:53:15.768619Z[0m [[32m[1minfo     [0m] [1mSRT-to-Qdrant pipeline built successfully[0m [36mcomponent[0m=[35mpipeline_builder[0m [36mcomponent_names[0m=[35m['pre', 'chunk', 'explode', 'embed', 'docs', 'write'][0m [36mpipeline_type[0m=[35msrt_to_qdrant[0m [36mtotal_components[0m=[35m6[0m [36mtotal_connections[0m=[35m7[0m


{'total_components': 6, 'component_names': ['pre', 'chunk', 'explode', 'embed', 'docs', 'write'], 'total_connections': 7, 'pipeline_type': 'srt_to_qdrant', 'component': 'pipeline_builder', 'event': 'SRT-to-Qdrant pipeline built successfully', 'level': 'info', 'timestamp': '2025-09-20T14:53:15.768619Z'}


[2m2025-09-20T14:53:15.787135Z[0m [[32m[1minfo     [0m] [1mRunning component pre         [0m [36mcomponent_name[0m=[35mpre[0m [36mlineno[0m=[35m67[0m [36mmodule[0m=[35mhaystack.core.pipeline.pipeline[0m


Pipeline components:
  - pre
  - chunk
  - explode
  - embed
  - docs
  - write
Pipeline connections:
  - pre → chunk
  - chunk → explode
  - explode → embed
  - explode → docs
  - explode → docs
  - embed → docs
  - docs → write
Running pipeline on sample SRT...
Running component pre


[2m2025-09-20T14:53:15.788504Z[0m [[32m[1minfo     [0m] [1msrt_preprocessing_started     [0m [36minput_length[0m=[35m151[0m [36moperation[0m=[35msrt_preprocessing[0m


{'operation': 'srt_preprocessing', 'input_length': 151, 'event': 'srt_preprocessing_started', 'level': 'info', 'timestamp': '2025-09-20T14:53:15.788504Z'}


[2m2025-09-20T14:53:15.789154Z[0m [[32m[1minfo     [0m] [1mStarting SRT preprocessing    [0m [36mcomponent[0m=[35mpreprocessor[0m [36mdedupe_window_ms[0m=[35m1000[0m [36minput_size_chars[0m=[35m151[0m [36mmin_len[0m=[35m1[0m


{'input_size_chars': 151, 'min_len': 1, 'dedupe_window_ms': 1000, 'component': 'preprocessor', 'event': 'Starting SRT preprocessing', 'level': 'info', 'timestamp': '2025-09-20T14:53:15.789154Z'}


[2m2025-09-20T14:53:15.974633Z[0m [[32m[1minfo     [0m] [1mCaption language filtering completed[0m [36mcomponent[0m=[35mlanguage_filter[0m [36mdropped_captions[0m=[35m0[0m [36mdropped_lines[0m=[35m0[0m [36mkept_captions[0m=[35m3[0m [36mkept_lines[0m=[35m3[0m [36mtotal_captions[0m=[35m3[0m [36mtotal_lines[0m=[35m3[0m


{'total_captions': 3, 'kept_captions': 3, 'dropped_captions': 0, 'total_lines': 3, 'kept_lines': 3, 'dropped_lines': 0, 'component': 'language_filter', 'event': 'Caption language filtering completed', 'level': 'info', 'timestamp': '2025-09-20T14:53:15.974633Z'}


[2m2025-09-20T14:53:15.975661Z[0m [[32m[1minfo     [0m] [1mmetric_counter                [0m [36mcomponent[0m=[35mlanguage_filter[0m [36mmetric_name[0m=[35mcaptions_language_filtered_total[0m [36mmetric_type[0m=[35mcounter[0m [36mvalue[0m=[35m3[0m


{'metric_name': 'captions_language_filtered_total', 'metric_type': 'counter', 'value': 3, 'component': 'language_filter', 'event': 'metric_counter', 'level': 'info', 'timestamp': '2025-09-20T14:53:15.975661Z'}


[2m2025-09-20T14:53:15.976279Z[0m [[32m[1minfo     [0m] [1mmetric_counter                [0m [36mcomponent[0m=[35mlanguage_filter[0m [36mmetric_name[0m=[35mcaptions_kept_after_language_filter[0m [36mmetric_type[0m=[35mcounter[0m [36mstatus[0m=[35mkept[0m [36mvalue[0m=[35m3[0m


{'metric_name': 'captions_kept_after_language_filter', 'metric_type': 'counter', 'value': 3, 'component': 'language_filter', 'status': 'kept', 'event': 'metric_counter', 'level': 'info', 'timestamp': '2025-09-20T14:53:15.976279Z'}


[2m2025-09-20T14:53:15.976839Z[0m [[32m[1minfo     [0m] [1mmetric_counter                [0m [36mcomponent[0m=[35mlanguage_filter[0m [36mmetric_name[0m=[35mcaptions_dropped_after_language_filter[0m [36mmetric_type[0m=[35mcounter[0m [36mstatus[0m=[35mdropped[0m [36mvalue[0m=[35m0[0m


{'metric_name': 'captions_dropped_after_language_filter', 'metric_type': 'counter', 'value': 0, 'component': 'language_filter', 'status': 'dropped', 'event': 'metric_counter', 'level': 'info', 'timestamp': '2025-09-20T14:53:15.976839Z'}


[2m2025-09-20T14:53:15.977330Z[0m [[32m[1minfo     [0m] [1mmetric_counter                [0m [36mcomponent[0m=[35mlanguage_filter[0m [36mmetric_name[0m=[35mlines_language_filtered_total[0m [36mmetric_type[0m=[35mcounter[0m [36mvalue[0m=[35m3[0m


{'metric_name': 'lines_language_filtered_total', 'metric_type': 'counter', 'value': 3, 'component': 'language_filter', 'event': 'metric_counter', 'level': 'info', 'timestamp': '2025-09-20T14:53:15.977330Z'}


[2m2025-09-20T14:53:15.977809Z[0m [[32m[1minfo     [0m] [1mmetric_counter                [0m [36mcomponent[0m=[35mlanguage_filter[0m [36mmetric_name[0m=[35mlines_kept_after_language_filter[0m [36mmetric_type[0m=[35mcounter[0m [36mstatus[0m=[35mkept[0m [36mvalue[0m=[35m3[0m


{'metric_name': 'lines_kept_after_language_filter', 'metric_type': 'counter', 'value': 3, 'component': 'language_filter', 'status': 'kept', 'event': 'metric_counter', 'level': 'info', 'timestamp': '2025-09-20T14:53:15.977809Z'}


[2m2025-09-20T14:53:15.978314Z[0m [[32m[1minfo     [0m] [1mmetric_counter                [0m [36mcomponent[0m=[35mlanguage_filter[0m [36mmetric_name[0m=[35mlines_dropped_after_language_filter[0m [36mmetric_type[0m=[35mcounter[0m [36mstatus[0m=[35mdropped[0m [36mvalue[0m=[35m0[0m


{'metric_name': 'lines_dropped_after_language_filter', 'metric_type': 'counter', 'value': 0, 'component': 'language_filter', 'status': 'dropped', 'event': 'metric_counter', 'level': 'info', 'timestamp': '2025-09-20T14:53:15.978314Z'}


[2m2025-09-20T14:53:15.978897Z[0m [[32m[1minfo     [0m] [1mmetric_counter                [0m [36mcomponent[0m=[35mpreprocessor[0m [36mmetric_name[0m=[35mcaptions_processed_total[0m [36mmetric_type[0m=[35mcounter[0m [36mvalue[0m=[35m0[0m


{'metric_name': 'captions_processed_total', 'metric_type': 'counter', 'value': 0, 'component': 'preprocessor', 'event': 'metric_counter', 'level': 'info', 'timestamp': '2025-09-20T14:53:15.978897Z'}


[2m2025-09-20T14:53:15.979398Z[0m [[32m[1minfo     [0m] [1mmetric_counter                [0m [36mcomponent[0m=[35mpreprocessor[0m [36mmetric_name[0m=[35mcaptions_kept_total[0m [36mmetric_type[0m=[35mcounter[0m [36mstatus[0m=[35mkept[0m [36mvalue[0m=[35m3[0m


{'metric_name': 'captions_kept_total', 'metric_type': 'counter', 'value': 3, 'component': 'preprocessor', 'status': 'kept', 'event': 'metric_counter', 'level': 'info', 'timestamp': '2025-09-20T14:53:15.979398Z'}


[2m2025-09-20T14:53:15.979886Z[0m [[32m[1minfo     [0m] [1mmetric_counter                [0m [36mcomponent[0m=[35mpreprocessor[0m [36mmetric_name[0m=[35mcaptions_dropped_total[0m [36mmetric_type[0m=[35mcounter[0m [36mreason[0m=[35mempty[0m [36mvalue[0m=[35m0[0m


{'metric_name': 'captions_dropped_total', 'metric_type': 'counter', 'value': 0, 'component': 'preprocessor', 'reason': 'empty', 'event': 'metric_counter', 'level': 'info', 'timestamp': '2025-09-20T14:53:15.979886Z'}


[2m2025-09-20T14:53:15.981320Z[0m [[32m[1minfo     [0m] [1mmetric_counter                [0m [36mcomponent[0m=[35mpreprocessor[0m [36mmetric_name[0m=[35mcaptions_dropped_total[0m [36mmetric_type[0m=[35mcounter[0m [36mreason[0m=[35mnon_english[0m [36mvalue[0m=[35m0[0m


{'metric_name': 'captions_dropped_total', 'metric_type': 'counter', 'value': 0, 'component': 'preprocessor', 'reason': 'non_english', 'event': 'metric_counter', 'level': 'info', 'timestamp': '2025-09-20T14:53:15.981320Z'}


[2m2025-09-20T14:53:15.981807Z[0m [[32m[1minfo     [0m] [1mmetric_counter                [0m [36mcomponent[0m=[35mpreprocessor[0m [36mmetric_name[0m=[35mcaptions_deduped_total[0m [36mmetric_type[0m=[35mcounter[0m [36mreason[0m=[35mduplicate[0m [36mvalue[0m=[35m0[0m


{'metric_name': 'captions_deduped_total', 'metric_type': 'counter', 'value': 0, 'component': 'preprocessor', 'reason': 'duplicate', 'event': 'metric_counter', 'level': 'info', 'timestamp': '2025-09-20T14:53:15.981807Z'}


[2m2025-09-20T14:53:15.982273Z[0m [[32m[1minfo     [0m] [1mSRT preprocessing completed   [0m [36mcomponent[0m=[35mpreprocessor[0m [36moutput_lines[0m=[35m3[0m [36mprocessing_stats[0m=[35m{'total_captions': 3, 'kept': 3, 'dropped_empty': 0, 'dropped_non_english': 0, 'deduped': 0}[0m


{'output_lines': 3, 'processing_stats': {'total_captions': 3, 'kept': 3, 'dropped_empty': 0, 'dropped_non_english': 0, 'deduped': 0}, 'component': 'preprocessor', 'event': 'SRT preprocessing completed', 'level': 'info', 'timestamp': '2025-09-20T14:53:15.982273Z'}


[2m2025-09-20T14:53:15.982738Z[0m [[32m[1minfo     [0m] [1msrt_preprocessing_completed   [0m [36mcaptions_dropped[0m=[35m-3[0m [36mcaptions_kept[0m=[35m3[0m [36mduration_ms[0m=[35m194[0m [36minput_length[0m=[35m151[0m [36moperation[0m=[35msrt_preprocessing[0m [36moutput_lines[0m=[35m3[0m


{'operation': 'srt_preprocessing', 'duration_ms': 194, 'input_length': 151, 'output_lines': 3, 'captions_kept': 3, 'captions_dropped': -3, 'event': 'srt_preprocessing_completed', 'level': 'info', 'timestamp': '2025-09-20T14:53:15.982738Z'}


[2m2025-09-20T14:53:15.983433Z[0m [[32m[1minfo     [0m] [1mRunning component chunk       [0m [36mcomponent_name[0m=[35mchunk[0m [36mlineno[0m=[35m67[0m [36mmodule[0m=[35mhaystack.core.pipeline.pipeline[0m


Running component chunk


[2m2025-09-20T14:53:15.984152Z[0m [[32m[1minfo     [0m] [1msemantic_chunking_started     [0m [36minput_captions[0m=[35m3[0m [36moperation[0m=[35msemantic_chunking[0m


{'operation': 'semantic_chunking', 'input_captions': 3, 'event': 'semantic_chunking_started', 'level': 'info', 'timestamp': '2025-09-20T14:53:15.984152Z'}


[2m2025-09-20T14:53:15.984652Z[0m [[32m[1minfo     [0m] [1mStarting semantic chunking    [0m [36mchunk_size[0m=[35m512[0m [36mcomponent[0m=[35mchunker[0m [36minput_captions[0m=[35m3[0m [36mthreshold[0m=[35m0.75[0m


{'input_captions': 3, 'threshold': '0.75', 'chunk_size': 512, 'component': 'chunker', 'event': 'Starting semantic chunking', 'level': 'info', 'timestamp': '2025-09-20T14:53:15.984652Z'}


[2m2025-09-20T14:53:15.987024Z[0m [[32m[1minfo     [0m] [1mUse pytorch device_name: cuda:0[0m [36mlineno[0m=[35m219[0m [36mmodule[0m=[35msentence_transformers.SentenceTransformer[0m


Use pytorch device_name: cuda:0


[2m2025-09-20T14:53:15.987683Z[0m [[32m[1minfo     [0m] [1mLoad pretrained SentenceTransformer: models/chunker/qwen3-embedding-0.6B[0m [36mlineno[0m=[35m227[0m [36mmodule[0m=[35msentence_transformers.SentenceTransformer[0m


Load pretrained SentenceTransformer: models/chunker/qwen3-embedding-0.6B


[2m2025-09-20T14:53:17.118402Z[0m [[32m[1minfo     [0m] [1m1 prompt is loaded, with the key: query[0m [36mlineno[0m=[35m378[0m [36mmodule[0m=[35msentence_transformers.SentenceTransformer[0m


1 prompt is loaded, with the key: query


[2m2025-09-20T14:53:17.121007Z[0m [[32m[1minfo     [0m] [1mmetric_counter                [0m [36mcomponent[0m=[35mchunker[0m [36mmetric_name[0m=[35mcaptions_chunked_total[0m [36mmetric_type[0m=[35mcounter[0m [36mvalue[0m=[35m3[0m


{'metric_name': 'captions_chunked_total', 'metric_type': 'counter', 'value': 3, 'component': 'chunker', 'event': 'metric_counter', 'level': 'info', 'timestamp': '2025-09-20T14:53:17.121007Z'}


[2m2025-09-20T14:53:17.121971Z[0m [[32m[1minfo     [0m] [1mmetric_counter                [0m [36mcomponent[0m=[35mchunker[0m [36mmetric_name[0m=[35mchunks_generated_total[0m [36mmetric_type[0m=[35mcounter[0m [36mvalue[0m=[35m1[0m


{'metric_name': 'chunks_generated_total', 'metric_type': 'counter', 'value': 1, 'component': 'chunker', 'event': 'metric_counter', 'level': 'info', 'timestamp': '2025-09-20T14:53:17.121971Z'}


[2m2025-09-20T14:53:17.122668Z[0m [[32m[1minfo     [0m] [1mmetric_histogram              [0m [36mcomponent[0m=[35mchunker[0m [36mmetric_name[0m=[35mavg_tokens_per_chunk[0m [36mmetric_type[0m=[35mhistogram[0m [36mvalue[0m=[35m15.0[0m


{'metric_name': 'avg_tokens_per_chunk', 'metric_type': 'histogram', 'value': 15.0, 'component': 'chunker', 'event': 'metric_histogram', 'level': 'info', 'timestamp': '2025-09-20T14:53:17.122668Z'}


[2m2025-09-20T14:53:17.123345Z[0m [[32m[1minfo     [0m] [1mSemantic chunking completed   [0m [36mavg_tokens_per_chunk[0m=[35m15.0[0m [36mcomponent[0m=[35mchunker[0m [36minput_captions[0m=[35m3[0m [36moutput_chunks[0m=[35m1[0m


{'input_captions': 3, 'output_chunks': 1, 'avg_tokens_per_chunk': 15.0, 'component': 'chunker', 'event': 'Semantic chunking completed', 'level': 'info', 'timestamp': '2025-09-20T14:53:17.123345Z'}


[2m2025-09-20T14:53:17.123778Z[0m [[32m[1minfo     [0m] [1msemantic_chunking_completed   [0m [36mavg_tokens_per_chunk[0m=[35m15.0[0m [36mduration_ms[0m=[35m1139[0m [36minput_captions[0m=[35m3[0m [36moperation[0m=[35msemantic_chunking[0m [36moutput_chunks[0m=[35m1[0m [36moutput_chunks_final[0m=[35m1[0m


{'operation': 'semantic_chunking', 'duration_ms': 1139, 'input_captions': 3, 'output_chunks': 1, 'avg_tokens_per_chunk': 15.0, 'output_chunks_final': 1, 'event': 'semantic_chunking_completed', 'level': 'info', 'timestamp': '2025-09-20T14:53:17.123778Z'}


[2m2025-09-20T14:53:17.124445Z[0m [[32m[1minfo     [0m] [1mRunning component explode     [0m [36mcomponent_name[0m=[35mexplode[0m [36mlineno[0m=[35m67[0m [36mmodule[0m=[35mhaystack.core.pipeline.pipeline[0m


Running component explode


[2m2025-09-20T14:53:17.126211Z[0m [[32m[1minfo     [0m] [1mchunk_jsonl_parsing_started   [0m [36minput_lines[0m=[35m1[0m [36moperation[0m=[35mchunk_jsonl_parsing[0m


{'operation': 'chunk_jsonl_parsing', 'input_lines': 1, 'event': 'chunk_jsonl_parsing_started', 'level': 'info', 'timestamp': '2025-09-20T14:53:17.126211Z'}


[2m2025-09-20T14:53:17.127412Z[0m [[32m[1minfo     [0m] [1mmetric_counter                [0m [36mcomponent[0m=[35mpipeline_glue[0m [36mmetric_name[0m=[35mchunk_lines_processed_total[0m [36mmetric_type[0m=[35mcounter[0m [36mvalue[0m=[35m1[0m


{'metric_name': 'chunk_lines_processed_total', 'metric_type': 'counter', 'value': 1, 'component': 'pipeline_glue', 'event': 'metric_counter', 'level': 'info', 'timestamp': '2025-09-20T14:53:17.127412Z'}


[2m2025-09-20T14:53:17.128447Z[0m [[32m[1minfo     [0m] [1mmetric_counter                [0m [36mcomponent[0m=[35mpipeline_glue[0m [36mmetric_name[0m=[35mtexts_extracted_total[0m [36mmetric_type[0m=[35mcounter[0m [36mstatus[0m=[35msuccess[0m [36mvalue[0m=[35m1[0m


{'metric_name': 'texts_extracted_total', 'metric_type': 'counter', 'value': 1, 'component': 'pipeline_glue', 'status': 'success', 'event': 'metric_counter', 'level': 'info', 'timestamp': '2025-09-20T14:53:17.128447Z'}


[2m2025-09-20T14:53:17.129591Z[0m [[32m[1minfo     [0m] [1mChunk JSONL parsing completed [0m [36mcomponent[0m=[35mpipeline_glue[0m [36minput_lines[0m=[35m1[0m [36moutput_texts[0m=[35m1[0m [36mskipped_lines[0m=[35m0[0m


{'input_lines': 1, 'output_texts': 1, 'skipped_lines': 0, 'component': 'pipeline_glue', 'event': 'Chunk JSONL parsing completed', 'level': 'info', 'timestamp': '2025-09-20T14:53:17.129591Z'}


[2m2025-09-20T14:53:17.130734Z[0m [[32m[1minfo     [0m] [1mchunk_jsonl_parsing_completed [0m [36mduration_ms[0m=[35m4[0m [36minput_lines[0m=[35m1[0m [36moperation[0m=[35mchunk_jsonl_parsing[0m [36moutput_texts[0m=[35m1[0m [36mskipped_lines[0m=[35m0[0m [36msuccess_rate[0m=[35m1.0[0m


{'operation': 'chunk_jsonl_parsing', 'duration_ms': 4, 'input_lines': 1, 'output_texts': 1, 'skipped_lines': 0, 'success_rate': 1.0, 'event': 'chunk_jsonl_parsing_completed', 'level': 'info', 'timestamp': '2025-09-20T14:53:17.130734Z'}


[2m2025-09-20T14:53:17.133282Z[0m [[32m[1minfo     [0m] [1mRunning component embed       [0m [36mcomponent_name[0m=[35membed[0m [36mlineno[0m=[35m67[0m [36mmodule[0m=[35mhaystack.core.pipeline.pipeline[0m


Running component embed


[2m2025-09-20T14:53:17.134413Z[0m [[32m[1minfo     [0m] [1mtext_embedding_started        [0m [36minput_texts[0m=[35m1[0m [36moperation[0m=[35mtext_embedding[0m


{'operation': 'text_embedding', 'input_texts': 1, 'event': 'text_embedding_started', 'level': 'info', 'timestamp': '2025-09-20T14:53:17.134413Z'}


[2m2025-09-20T14:53:17.135551Z[0m [[32m[1minfo     [0m] [1mUsing individual embedding processing[0m [36mcomponent[0m=[35membedder[0m [36mtext_count[0m=[35m1[0m


{'text_count': 1, 'component': 'embedder', 'event': 'Using individual embedding processing', 'level': 'info', 'timestamp': '2025-09-20T14:53:17.135551Z'}


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[2m2025-09-20T14:53:17.350831Z[0m [[32m[1minfo     [0m] [1mmetric_counter                [0m [36mcomponent[0m=[35membedder[0m [36mmetric_name[0m=[35membeddings_generated_total[0m [36mmetric_type[0m=[35mcounter[0m [36mmode[0m=[35mindividual[0m [36mstatus[0m=[35msuccess[0m [36mvalue[0m=[35m1[0m


{'metric_name': 'embeddings_generated_total', 'metric_type': 'counter', 'value': 1, 'component': 'embedder', 'mode': 'individual', 'status': 'success', 'event': 'metric_counter', 'level': 'info', 'timestamp': '2025-09-20T14:53:17.350831Z'}


[2m2025-09-20T14:53:17.352253Z[0m [[32m[1minfo     [0m] [1mtext_embedding_completed      [0m [36mduration_ms[0m=[35m217[0m [36mfailed_embeddings[0m=[35m0[0m [36minput_texts[0m=[35m1[0m [36moperation[0m=[35mtext_embedding[0m [36mprocessing_mode[0m=[35mindividual[0m [36msuccessful_embeddings[0m=[35m1[0m


{'operation': 'text_embedding', 'duration_ms': 217, 'input_texts': 1, 'successful_embeddings': 1, 'failed_embeddings': 0, 'processing_mode': 'individual', 'event': 'text_embedding_completed', 'level': 'info', 'timestamp': '2025-09-20T14:53:17.352253Z'}


[2m2025-09-20T14:53:17.367112Z[0m [[32m[1minfo     [0m] [1mRunning component docs        [0m [36mcomponent_name[0m=[35mdocs[0m [36mlineno[0m=[35m67[0m [36mmodule[0m=[35mhaystack.core.pipeline.pipeline[0m


Running component docs


[2m2025-09-20T14:53:17.368876Z[0m [[32m[1minfo     [0m] [1mdocument_building_started     [0m [36minput_embeddings[0m=[35m1[0m [36minput_metas[0m=[35m1[0m [36minput_texts[0m=[35m1[0m [36moperation[0m=[35mdocument_building[0m


{'operation': 'document_building', 'input_texts': 1, 'input_metas': 1, 'input_embeddings': 1, 'event': 'document_building_started', 'level': 'info', 'timestamp': '2025-09-20T14:53:17.368876Z'}


[2m2025-09-20T14:53:17.369664Z[0m [[32m[1minfo     [0m] [1mmetric_counter                [0m [36mcomponent[0m=[35mpipeline_glue[0m [36mmetric_name[0m=[35mdocuments_built_total[0m [36mmetric_type[0m=[35mcounter[0m [36mstatus[0m=[35msuccess[0m [36mvalue[0m=[35m1[0m


{'metric_name': 'documents_built_total', 'metric_type': 'counter', 'value': 1, 'component': 'pipeline_glue', 'status': 'success', 'event': 'metric_counter', 'level': 'info', 'timestamp': '2025-09-20T14:53:17.369664Z'}


[2m2025-09-20T14:53:17.371300Z[0m [[32m[1minfo     [0m] [1mmetric_histogram              [0m [36mcomponent[0m=[35mpipeline_glue[0m [36mmetric_name[0m=[35mdocument_build_batch_size[0m [36mmetric_type[0m=[35mhistogram[0m [36mvalue[0m=[35m1[0m


{'metric_name': 'document_build_batch_size', 'metric_type': 'histogram', 'value': 1, 'component': 'pipeline_glue', 'event': 'metric_histogram', 'level': 'info', 'timestamp': '2025-09-20T14:53:17.371300Z'}


[2m2025-09-20T14:53:17.372200Z[0m [[32m[1minfo     [0m] [1mDocument building completed   [0m [36mcomponent[0m=[35mpipeline_glue[0m [36minput_embeddings[0m=[35m1[0m [36minput_metas[0m=[35m1[0m [36minput_texts[0m=[35m1[0m [36moutput_documents[0m=[35m1[0m


{'input_texts': 1, 'input_metas': 1, 'input_embeddings': 1, 'output_documents': 1, 'component': 'pipeline_glue', 'event': 'Document building completed', 'level': 'info', 'timestamp': '2025-09-20T14:53:17.372200Z'}


[2m2025-09-20T14:53:17.373026Z[0m [[32m[1minfo     [0m] [1mdocument_building_completed   [0m [36mdocuments_built[0m=[35m1[0m [36mduration_ms[0m=[35m4[0m [36minput_alignment_ratio[0m=[35m1.0[0m [36minput_embeddings[0m=[35m1[0m [36minput_metas[0m=[35m1[0m [36minput_texts[0m=[35m1[0m [36moperation[0m=[35mdocument_building[0m [36moutput_documents[0m=[35m1[0m


{'operation': 'document_building', 'duration_ms': 4, 'input_texts': 1, 'input_metas': 1, 'input_embeddings': 1, 'output_documents': 1, 'input_alignment_ratio': 1.0, 'documents_built': 1, 'event': 'document_building_completed', 'level': 'info', 'timestamp': '2025-09-20T14:53:17.373026Z'}


[2m2025-09-20T14:53:17.381450Z[0m [[32m[1minfo     [0m] [1mRunning component write       [0m [36mcomponent_name[0m=[35mwrite[0m [36mlineno[0m=[35m67[0m [36mmodule[0m=[35mhaystack.core.pipeline.pipeline[0m


Running component write


[2m2025-09-20T14:53:17.382781Z[0m [[32m[1minfo     [0m] [1mdocument_writing_started      [0m [36moperation[0m=[35mdocument_writing[0m [36mtotal_documents[0m=[35m1[0m


{'operation': 'document_writing', 'total_documents': 1, 'event': 'document_writing_started', 'level': 'info', 'timestamp': '2025-09-20T14:53:17.382781Z'}


[2m2025-09-20T14:53:17.385166Z[0m [[32m[1minfo     [0m] [1mWriting documents to Qdrant   [0m [36mcomponent[0m=[35mqdrant_writer[0m [36mdocument_count[0m=[35m1[0m


{'document_count': 1, 'component': 'qdrant_writer', 'event': 'Writing documents to Qdrant', 'level': 'info', 'timestamp': '2025-09-20T14:53:17.385166Z'}


[2m2025-09-20T14:53:17.526088Z[0m [[32m[1minfo     [0m] [1mHTTP Request: GET http://localhost:6300 "HTTP/1.1 200 OK"[0m [36mlineno[0m=[35m1025[0m [36mmodule[0m=[35mhttpx[0m


HTTP Request: GET http://localhost:6300 "HTTP/1.1 200 OK"


[2m2025-09-20T14:53:17.530454Z[0m [[32m[1minfo     [0m] [1mHTTP Request: GET http://localhost:6300/collections/memoirr/exists "HTTP/1.1 200 OK"[0m [36mlineno[0m=[35m1025[0m [36mmodule[0m=[35mhttpx[0m


HTTP Request: GET http://localhost:6300/collections/memoirr/exists "HTTP/1.1 200 OK"


[2m2025-09-20T14:53:17.545917Z[0m [[32m[1minfo     [0m] [1mHTTP Request: DELETE http://localhost:6300/collections/memoirr "HTTP/1.1 200 OK"[0m [36mlineno[0m=[35m1025[0m [36mmodule[0m=[35mhttpx[0m


HTTP Request: DELETE http://localhost:6300/collections/memoirr "HTTP/1.1 200 OK"


[2m2025-09-20T14:53:17.602068Z[0m [[32m[1minfo     [0m] [1mHTTP Request: PUT http://localhost:6300/collections/memoirr "HTTP/1.1 200 OK"[0m [36mlineno[0m=[35m1025[0m [36mmodule[0m=[35mhttpx[0m


HTTP Request: PUT http://localhost:6300/collections/memoirr "HTTP/1.1 200 OK"


[2m2025-09-20T14:53:17.608438Z[0m [[32m[1minfo     [0m] [1mHTTP Request: POST http://localhost:6300/collections/memoirr/points "HTTP/1.1 200 OK"[0m [36mlineno[0m=[35m1025[0m [36mmodule[0m=[35mhttpx[0m


HTTP Request: POST http://localhost:6300/collections/memoirr/points "HTTP/1.1 200 OK"


  0%|          | 0/1 [00:00<?, ?it/s][2m2025-09-20T14:53:17.630377Z[0m [[32m[1minfo     [0m] [1mHTTP Request: PUT http://localhost:6300/collections/memoirr/points?wait=true "HTTP/1.1 200 OK"[0m [36mlineno[0m=[35m1025[0m [36mmodule[0m=[35mhttpx[0m


HTTP Request: PUT http://localhost:6300/collections/memoirr/points?wait=true "HTTP/1.1 200 OK"


100it [00:00, 6301.35it/s]           
[2m2025-09-20T14:53:17.633381Z[0m [[32m[1minfo     [0m] [1mDocuments written successfully[0m [36mcomponent[0m=[35mqdrant_writer[0m [36mwritten_count[0m=[35m1[0m


{'written_count': 1, 'component': 'qdrant_writer', 'event': 'Documents written successfully', 'level': 'info', 'timestamp': '2025-09-20T14:53:17.633381Z'}


[2m2025-09-20T14:53:17.635153Z[0m [[32m[1minfo     [0m] [1mmetric_counter                [0m [36mcomponent[0m=[35mqdrant_writer[0m [36mmetric_name[0m=[35mdocuments_written_total[0m [36mmetric_type[0m=[35mcounter[0m [36mstatus[0m=[35msuccess[0m [36mvalue[0m=[35m1[0m


{'metric_name': 'documents_written_total', 'metric_type': 'counter', 'value': 1, 'component': 'qdrant_writer', 'status': 'success', 'event': 'metric_counter', 'level': 'info', 'timestamp': '2025-09-20T14:53:17.635153Z'}


[2m2025-09-20T14:53:17.638352Z[0m [[32m[1minfo     [0m] [1mdocument_writing_completed    [0m [36mdocuments_skipped[0m=[35m0[0m [36mdocuments_written[0m=[35m1[0m [36mduration_ms[0m=[35m255[0m [36moperation[0m=[35mdocument_writing[0m [36msuccess_rate[0m=[35m1.0[0m [36mtotal_documents[0m=[35m1[0m


{'operation': 'document_writing', 'duration_ms': 255, 'total_documents': 1, 'documents_written': 1, 'documents_skipped': 0, 'success_rate': 1.0, 'event': 'document_writing_completed', 'level': 'info', 'timestamp': '2025-09-20T14:53:17.638352Z'}
Pipeline Results:
Preprocessor: {'total_captions': 3, 'kept': 3, 'dropped_empty': 0, 'dropped_non_english': 0, 'deduped': 0}
Chunker: {'input_captions': 3, 'output_chunks': 1, 'avg_tokens_per_chunk': 15.0}
Writer: {'written': 1, 'skipped': 0, 'total': 1}
✅ SUCCESS! Check Qdrant UI at http://localhost:6300/dashboard to see the embedded chunks!
Collection name: memoirr
