# Test pipeline for processing PDFs and storing in ChromaDB

In [1]:
%load_ext autoreload
%autoreload 2


import glob
import json
import os
import textwrap
from typing import List, Dict, Any
import sys

# Get the absolute path of the current notebook
current_dir = os.getcwd()

# Get the parent directory (which is the 'GenAI' folder)
parent_dir = os.path.dirname(current_dir)

# Add the parent directory to sys.path
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Verify it worked
print(f"Added to path: {parent_dir}")

import chromadb
from tqdm import tqdm

from embeddingModels.BaseEmbeddingModel import BaseEmbeddingModel
from embeddingModels.ModernBertEmbedder import ModernBertEmbedder
from embeddingModels.QwenEmbedder import QwenEmbedder
from pdfProcessing.doclingTest import setup_docling_converter, extract_sections_from_doc, extract_metadata

Added to path: c:\Users\kronask\OneDrive - TU Wien\TU Wien\3. Semester\GenAI\GenAI


In [2]:
import torch

# 1. Check if CUDA (NVIDIA GPU support) is available
gpu_available = torch.cuda.is_available()
print(f"Is GPU available? {gpu_available}")

# 2. If available, print the name of the GPU
if gpu_available:
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")
else:
    print("Running on CPU only.")

Is GPU available? True
GPU Name: NVIDIA GeForce RTX 2080 Ti
CUDA Version: 12.4


## Set up folders

In [3]:
CURRENT_MODEL = "bert"  # Select either qwen or bert
INPUT_FOLDER = "../data/testPDFs"
OUTPUT_FOLDER = "../data/testPDFOutput/pipelineTest"
CHROMA_DB_DIR = "./chroma_db"

In [4]:
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
pdf_files = glob.glob(os.path.join(INPUT_FOLDER, "*.pdf"))
collection_names = {"bert": "scientific_papers_bert", "qwen": "scientific_papers_qwen"}

## Set up ChromaDB Client, Collection and Document Converter

In [5]:
chroma_client = chromadb.PersistentClient(path=CHROMA_DB_DIR)
pipeline_test_collection = chroma_client.get_or_create_collection(
    name=collection_names[CURRENT_MODEL],
    metadata={"hnsw:space": "ip"}
)
converter = setup_docling_converter()
# I used docling from IBM, can also describe images

2026-01-05 20:13:23,703 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


‚úÖ CUDA detected. Using GPU.


### Docling
https://www.docling.ai/

## Convert PDFs and store in json

In [6]:
%%time
for pdf_path in tqdm(pdf_files):
    file_stem = os.path.splitext(os.path.basename(pdf_path))[0]
    try:
        result = converter.convert(pdf_path)

        sections = extract_sections_from_doc(result.document)

        metadata = extract_metadata(sections)

        final_output = {
            "filename": os.path.basename(pdf_path),
            "metadata": metadata,
            "sections": sections
        }

        out_path = os.path.join(OUTPUT_FOLDER, f"{file_stem}_converted.json")
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(final_output, f, indent=2)

        print(f"‚úÖ Processed: {file_stem}")
        print(f"   found ID: {metadata.get('arxiv_id')}")
        print(f"   found {len(metadata.get('authors', []))} authors")

    except Exception as e:
        print(f"‚ùå Failed {file_stem}: {e}")


  0%|          | 0/14 [00:00<?, ?it/s]2026-01-05 20:13:24,252 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-01-05 20:13:24,287 - INFO - Going to convert document batch...
2026-01-05 20:13:24,289 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 1064fff70b16649e2a9cc84da931292b
2026-01-05 20:13:24,307 - INFO - Loading plugin 'docling_defaults'
2026-01-05 20:13:24,311 - INFO - Registered picture descriptions: ['vlm', 'api']
2026-01-05 20:13:24,325 - INFO - Loading plugin 'docling_defaults'
2026-01-05 20:13:24,334 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2026-01-05 20:13:24,670 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2026-01-05 20:13:24,695 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2026-01-05 20:13:24,704 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\kronask\.conda\envs\genai_env\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.o

‚úÖ Processed: Kandel et al. - 2023 - Demonstration of an AI-driven workflow for autonomous high-resolution scanning microscopy
   found ID: None
   found 1 authors


2026-01-05 20:13:51,583 - INFO - Finished converting document Kuprikov et al. - 2022 - Deep reinforcement learning for self-tuning laser source of dissipative solitons.pdf in 13.20 sec.
 14%|‚ñà‚ñç        | 2/14 [00:27<02:43, 13.59s/it]2026-01-05 20:13:51,596 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-01-05 20:13:51,604 - INFO - Going to convert document batch...
2026-01-05 20:13:51,605 - INFO - Processing document MacLeod et al. - 2022 - A self-driving laboratory advances the Pareto front for material properties.pdf


‚úÖ Processed: Kuprikov et al. - 2022 - Deep reinforcement learning for self-tuning laser source of dissipative solitons
   found ID: None
   found 25 authors


2026-01-05 20:14:00,483 - INFO - Finished converting document MacLeod et al. - 2022 - A self-driving laboratory advances the Pareto front for material properties.pdf in 8.89 sec.
 21%|‚ñà‚ñà‚ñè       | 3/14 [00:36<02:05, 11.45s/it]2026-01-05 20:14:00,493 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-01-05 20:14:00,497 - INFO - Going to convert document batch...
2026-01-05 20:14:00,497 - INFO - Processing document Mareev et al. - 2023 - Self-Adjusting Optical Systems Based on Reinforcement Learning.pdf


‚úÖ Processed: MacLeod et al. - 2022 - A self-driving laboratory advances the Pareto front for material properties
   found ID: None
   found 2 authors


2026-01-05 20:14:12,124 - INFO - Finished converting document Mareev et al. - 2023 - Self-Adjusting Optical Systems Based on Reinforcement Learning.pdf in 11.64 sec.
 29%|‚ñà‚ñà‚ñä       | 4/14 [00:47<01:55, 11.53s/it]2026-01-05 20:14:12,140 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-01-05 20:14:12,147 - INFO - Going to convert document batch...
2026-01-05 20:14:12,149 - INFO - Processing document Morgado et al. - 2024 - The rise of data‚Äêdriven microscopy powered by machine learning.pdf


‚úÖ Processed: Mareev et al. - 2023 - Self-Adjusting Optical Systems Based on Reinforcement Learning
   found ID: None
   found 6 authors


2026-01-05 20:14:29,045 - INFO - Finished converting document Morgado et al. - 2024 - The rise of data‚Äêdriven microscopy powered by machine learning.pdf in 16.91 sec.
 36%|‚ñà‚ñà‚ñà‚ñå      | 5/14 [01:04<02:01, 13.47s/it]2026-01-05 20:14:29,059 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-01-05 20:14:29,070 - INFO - Going to convert document batch...
2026-01-05 20:14:29,072 - INFO - Processing document Morris et al. - 2024 - A general Bayesian algorithm for the autonomous alignment of beamlines.pdf


‚úÖ Processed: Morgado et al. - 2024 - The rise of data‚Äêdriven microscopy powered by machine learning
   found ID: None
   found 4 authors


2026-01-05 20:14:58,594 - INFO - Finished converting document Morris et al. - 2024 - A general Bayesian algorithm for the autonomous alignment of beamlines.pdf in 29.55 sec.
 43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 6/14 [01:34<02:31, 18.94s/it]2026-01-05 20:14:58,605 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-01-05 20:14:58,611 - INFO - Going to convert document batch...
2026-01-05 20:14:58,612 - INFO - Processing document Nousiainen et al. - 2024 - Laboratory experiments of model-based reinforcement learning for adaptive optics control.pdf


‚úÖ Processed: Morris et al. - 2024 - A general Bayesian algorithm for the autonomous alignment of beamlines
   found ID: None
   found 2 authors


2026-01-05 20:15:37,343 - INFO - Finished converting document Nousiainen et al. - 2024 - Laboratory experiments of model-based reinforcement learning for adaptive optics control.pdf in 38.75 sec.
 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 7/14 [02:13<02:57, 25.42s/it]2026-01-05 20:15:37,360 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-01-05 20:15:37,365 - INFO - Going to convert document batch...
2026-01-05 20:15:37,367 - INFO - Processing document Rebuffi et al. - 2023 - AutoFocus AI-driven alignment of nanofocusing X-ray mirror systems.pdf


‚úÖ Processed: Nousiainen et al. - 2024 - Laboratory experiments of model-based reinforcement learning for adaptive optics control
   found ID: None
   found 2 authors


2026-01-05 20:16:10,008 - INFO - Finished converting document Rebuffi et al. - 2023 - AutoFocus AI-driven alignment of nanofocusing X-ray mirror systems.pdf in 32.64 sec.
 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 8/14 [02:45<02:46, 27.72s/it]2026-01-05 20:16:10,029 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-01-05 20:16:10,038 - INFO - Going to convert document batch...
2026-01-05 20:16:10,040 - INFO - Processing document Schloz et al. - 2023 - Deep reinforcement learning for data-driven adaptive scanning in ptychography.pdf


‚úÖ Processed: Rebuffi et al. - 2023 - AutoFocus AI-driven alignment of nanofocusing X-ray mirror systems
   found ID: None
   found 3 authors


2026-01-05 20:16:21,936 - INFO - Finished converting document Schloz et al. - 2023 - Deep reinforcement learning for data-driven adaptive scanning in ptychography.pdf in 11.92 sec.
 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 9/14 [02:57<01:53, 22.78s/it]2026-01-05 20:16:21,954 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-01-05 20:16:21,971 - INFO - Going to convert document batch...
2026-01-05 20:16:21,972 - INFO - Processing document Szymanski et al. - 2023 - An autonomous laboratory for the accelerated synthesis of novel materials.pdf


‚úÖ Processed: Schloz et al. - 2023 - Deep reinforcement learning for data-driven adaptive scanning in ptychography
   found ID: None
   found 45 authors


2026-01-05 20:16:32,090 - INFO - Finished converting document Szymanski et al. - 2023 - An autonomous laboratory for the accelerated synthesis of novel materials.pdf in 10.14 sec.
 71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 10/14 [03:07<01:15, 18.89s/it]2026-01-05 20:16:32,122 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-01-05 20:16:32,217 - INFO - Going to convert document batch...
2026-01-05 20:16:32,219 - INFO - Processing document Tom et al. - 2024 - Self-Driving Laboratories for Chemistry and Materials Science.pdf


‚úÖ Processed: Szymanski et al. - 2023 - An autonomous laboratory for the accelerated synthesis of novel materials
   found ID: None
   found 245 authors


2026-01-05 20:18:50,346 - INFO - Finished converting document Tom et al. - 2024 - Self-Driving Laboratories for Chemistry and Materials Science.pdf in 138.25 sec.
 79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 11/14 [05:26<02:46, 55.42s/it]2026-01-05 20:18:50,364 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-01-05 20:18:50,369 - INFO - Going to convert document batch...
2026-01-05 20:18:50,370 - INFO - Processing document Volk and Abolhasani - 2024 - Performance metrics to unleash the power of self-driving labs in chemistry and materials science.pdf


‚úÖ Processed: Tom et al. - 2024 - Self-Driving Laboratories for Chemistry and Materials Science
   found ID: None
   found 3 authors


2026-01-05 20:19:00,989 - INFO - Finished converting document Volk and Abolhasani - 2024 - Performance metrics to unleash the power of self-driving labs in chemistry and materials science.pdf in 10.62 sec.
 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 12/14 [05:36<01:23, 41.82s/it]2026-01-05 20:19:01,083 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-01-05 20:19:01,093 - INFO - Going to convert document batch...
2026-01-05 20:19:01,095 - INFO - Processing document Xie et al. - 2023 - Inverse design of chiral functional films by a robotic AI-guided system.pdf


‚úÖ Processed: Volk and Abolhasani - 2024 - Performance metrics to unleash the power of self-driving labs in chemistry and materials science
   found ID: None
   found 3 authors


2026-01-05 20:19:23,759 - INFO - Finished converting document Xie et al. - 2023 - Inverse design of chiral functional films by a robotic AI-guided system.pdf in 22.67 sec.
 93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 13/14 [05:59<00:36, 36.03s/it]2026-01-05 20:19:23,771 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-01-05 20:19:23,785 - INFO - Going to convert document batch...
2026-01-05 20:19:23,787 - INFO - Processing document Zhang et al. - 2024 - Precision autofocus in optical microscopy with liquid lenses controlled by deep reinforcement learni.pdf


‚úÖ Processed: Xie et al. - 2023 - Inverse design of chiral functional films by a robotic AI-guided system
   found ID: None
   found 1 authors


2026-01-05 20:19:38,070 - INFO - Finished converting document Zhang et al. - 2024 - Precision autofocus in optical microscopy with liquid lenses controlled by deep reinforcement learni.pdf in 14.30 sec.
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14/14 [06:13<00:00, 26.70s/it]

‚úÖ Processed: Zhang et al. - 2024 - Precision autofocus in optical microscopy with liquid lenses controlled by deep reinforcement learni
   found ID: None
   found 1 authors
CPU times: total: 27min 46s
Wall time: 6min 13s





In [7]:
del converter

## Embed and store in ChromaDB

In [8]:
def ingest_papers_to_chroma(
        json_folder: str,
        collection: chromadb.Collection,
        embedding_model: BaseEmbeddingModel
):
    """
    Reads structured JSON papers and ingests them into ChromaDB.
    """

    json_files = glob.glob(os.path.join(json_folder, "*.json"))
    print(f"Found {len(json_files)} JSON files to ingest.")

    for json_file in tqdm(json_files, desc="Processing Papers"):
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # --- A. Determine Parent ID ---
        # Prefer arXiv ID, fallback to filename if missing
        parent_id = data['metadata'].get('arxiv_id')
        if not parent_id:
            parent_id = data['filename']
            # TODO: get ID from sematic scholar

        # Clean ID (Chroma requires IDs to be strings, usually safe chars)
        parent_id = parent_id.replace(" ", "_").replace(":", "_")

        # --- B. Prepare Batches for this Document ---
        documents: List[str] = []
        metadatas: List[Dict[str, Any]] = []
        ids: List[str] = []

        global_meta = {
            "parent_id": parent_id,
            "filename": data['filename'],
            "title": data['metadata'].get('title', "Unknown"),
            "authors": ", ".join(data['metadata'].get('authors', [])),
            "arxiv_id": data['metadata'].get('arxiv_id', "N/A")
        }

        for section_header, content in tqdm(data['sections'].items(), desc="Processing Sections"):
            if not content.strip():
                continue

            # 1. Create Unique ID for this chunk
            safe_header = section_header.replace(" ", "_")[:50]
            chunk_id = f"{parent_id}#{safe_header}"

            # 2. Create Metadata for this chunk
            chunk_meta = global_meta.copy()
            chunk_meta["section"] = section_header
            chunk_meta["is_preamble"] = (section_header == "Preamble")

            # removing \n from content
            content = content.replace("\n", " ")
            documents.append(content)
            metadatas.append(chunk_meta)
            ids.append(chunk_id)

        # --- D. Generate Embeddings ---
        if documents:
            # Use your custom class to encode
            embeddings_np = embedding_model.encode(documents)
            # Convert numpy to python list for Chroma
            embeddings_list = embeddings_np.tolist()

            # --- E. Upsert to Chroma ---
            # using upsert handles re-runs gracefully (updates existing IDs)
            print('Generating embeddings')
            collection.upsert(
                ids=ids,
                embeddings=embeddings_list,
                documents=documents,
                metadatas=metadatas
            )

    print("Ingestion Complete.")

In [9]:
match CURRENT_MODEL:
    case "bert":
        embedder = ModernBertEmbedder(
            model_name="Alibaba-NLP/gte-modernbert-base",
            normalize=True
        )
    case "qwen":
        embedder = QwenEmbedder("Qwen/Qwen3-Embedding-8B", use_fp16=True)


Loading Alibaba-NLP/gte-modernbert-base on cuda...


In [10]:
%%time
ingest_papers_to_chroma(
    json_folder=OUTPUT_FOLDER,
    collection=pipeline_test_collection,
    embedding_model=embedder
)

Found 14 JSON files to ingest.


Processing Sections: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 21/21 [00:00<?, ?it/s]
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Processing Papers:   7%|‚ñã         | 1/14 [00:02<00:29,  2.23s/it]

Generating embeddings


Processing Sections: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12/12 [00:00<?, ?it/s]
Processing Papers:  14%|‚ñà‚ñç        | 2/14 [00:03<00:17,  1.50s/it]

Generating embeddings


Processing Sections: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 13/13 [00:00<?, ?it/s]
Processing Papers:  21%|‚ñà‚ñà‚ñè       | 3/14 [00:10<00:44,  4.08s/it]

Generating embeddings


Processing Sections: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12/12 [00:00<?, ?it/s]
Processing Papers:  29%|‚ñà‚ñà‚ñä       | 4/14 [00:11<00:29,  2.91s/it]

Generating embeddings


Processing Sections: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11/11 [00:00<?, ?it/s]
Processing Papers:  36%|‚ñà‚ñà‚ñà‚ñå      | 5/14 [00:12<00:19,  2.22s/it]

Generating embeddings


Processing Sections: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 31/31 [00:00<?, ?it/s]
Processing Papers:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 6/14 [00:15<00:19,  2.42s/it]

Generating embeddings


Processing Sections: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 33/33 [00:00<00:00, 61846.31it/s]
Processing Papers:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 7/14 [00:17<00:15,  2.20s/it]

Generating embeddings


Processing Sections: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:00<?, ?it/s]
Processing Papers:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 8/14 [00:19<00:12,  2.16s/it]

Generating embeddings


Processing Sections: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12/12 [00:00<00:00, 11599.83it/s]
Processing Papers:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 9/14 [00:22<00:12,  2.57s/it]

Generating embeddings


Processing Sections: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18/18 [00:00<?, ?it/s]
Processing Papers:  71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 10/14 [00:24<00:09,  2.31s/it]

Generating embeddings


Processing Sections: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 53/53 [00:00<00:00, 27252.43it/s]
Processing Papers:  71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 10/14 [05:28<02:11, 32.85s/it]


CPU times: total: 5min 29s
Wall time: 5min 28s


OutOfMemoryError: CUDA out of memory. Tried to allocate 48.11 GiB. GPU 0 has a total capacity of 11.00 GiB of which 8.96 GiB is free. Of the allocated memory 766.01 MiB is allocated by PyTorch, and 115.99 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
print(f"Collection Count: {pipeline_test_collection.count()}")

Collection Count: 0


In [None]:
def query_chroma(
        collection: chromadb.Collection,
        query_text: str,
        model: BaseEmbeddingModel,
        n_results: int = 5,
):
    print(f"--- üîç Querying for: '{query_text}' ---")

    try:
        query_vector_np = model.encode([query_text])
        query_vector_list = query_vector_np.tolist()

        results = collection.query(
            query_embeddings=query_vector_list,
            n_results=n_results,
            # Optional: Filter by metadata (e.g., only from specific paper)
            # where={"parent_id": "arXiv:1706.03762v7"}
        )

        # 5. Display Results
        if not results['ids'][0]:
            print("No results found.")
            return

        print(f"\n‚úÖ Found {len(results['ids'][0])} relevant chunks:\n")

        for i in range(len(results['ids'][0])):
            doc_id = results['ids'][0][i]
            score = results['distances'][0][i]  # Lower is better (L2 distance)
            content = results['documents'][0][i]
            metadata = results['metadatas'][0][i]

            print(f"Result #{i + 1} (Distance: {score:.4f})")
            print(f"üìÑ Paper: {metadata.get('title', 'Unknown')}")
            print(f"üìå Section: {metadata.get('section', 'Unknown')}")
            print(f"üîó ID: {doc_id}")
            print("-" * 40)
            print("üìù Content Snippet:")
            print(textwrap.fill(content[:300] + "...", width=80))  # Preview first 300 chars
            print("\n" + "=" * 60 + "\n")

        return results
    except Exception as e:
        print(e)

In [None]:
results = query_chroma(
    collection=pipeline_test_collection,
    query_text="transformer architecture details",
    model=embedder,
    n_results=3
)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


--- üîç Querying for: 'transformer architecture details' ---
No results found.
