# Test pipeline for processing PDFs and storing in ChromaDB

In [6]:
%load_ext autoreload
%autoreload 2

import glob
import json
import os
import textwrap
from typing import List, Dict, Any

import chromadb
from tqdm import tqdm

from embeddingModels.BaseEmbeddingModel import BaseEmbeddingModel
from embeddingModels.ModernBertEmbedder import ModernBertEmbedder
from embeddingModels.QwenEmbedder import QwenEmbedder
from pdfProcessing.doclingTest import setup_docling_converter, extract_sections_from_doc, extract_metadata

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Set up folders

In [7]:
# Embedding backend for this run. Must be one of the keys of `collection_names`
# ("bert" or "qwen"); it selects both the Chroma collection and the embedder.
CURRENT_MODEL = "bert"  # Select either qwen or bert
# Folder that is globbed for the source *.pdf files.
INPUT_FOLDER = "../data/testPDFs"
# Folder that receives one "<stem>_converted.json" per PDF and is read back for ingestion.
OUTPUT_FOLDER = "../data/testPDFOutput/pipelineTest"
# Path handed to chromadb.PersistentClient (relative to the notebook's working directory).
CHROMA_DB_DIR = "./chroma_db"

In [8]:
# The conversion step writes its JSON files here, so the folder has to exist first.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# All PDFs located directly inside the input folder (the pattern is not recursive).
pdf_pattern = os.path.join(INPUT_FOLDER, "*.pdf")
pdf_files = glob.glob(pdf_pattern)

# One Chroma collection per embedding backend, keyed by the CURRENT_MODEL value
# (presumably so vectors from different models are never mixed - confirm).
collection_names = dict(
    bert="scientific_papers_bert",
    qwen="scientific_papers_qwen",
)

## Set up ChromaDB Client, Collection and Document Converter

In [9]:
# Persistent client: the database lives on disk under CHROMA_DB_DIR.
chroma_client = chromadb.PersistentClient(path=CHROMA_DB_DIR)

# Open (or create on first run) the collection that belongs to the selected model.
# "hnsw:space": "ip" asks Chroma to rank by inner product instead of its default metric.
pipeline_test_collection = chroma_client.get_or_create_collection(
    metadata={"hnsw:space": "ip"},
    name=collection_names[CURRENT_MODEL],
)

# PDF -> structured document converter, built on IBM's Docling
# (Docling can also generate descriptions for images).
converter = setup_docling_converter()

✅ CUDA detected. Using GPU.


### Docling
https://www.docling.ai/

## Convert PDFs and store in json

In [4]:
%%time
for pdf_path in tqdm(pdf_files):
    file_stem = os.path.splitext(os.path.basename(pdf_path))[0]
    try:
        result = converter.convert(pdf_path)

        sections = extract_sections_from_doc(result.document)

        metadata = extract_metadata(sections)

        final_output = {
            "filename": os.path.basename(pdf_path),
            "metadata": metadata,
            "sections": sections
        }

        out_path = os.path.join(OUTPUT_FOLDER, f"{file_stem}_converted.json")
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(final_output, f, indent=2)

        print(f"‚úÖ Processed: {file_stem}")
        print(f"   found ID: {metadata.get('arxiv_id')}")
        print(f"   found {len(metadata.get('authors', []))} authors")

    except Exception as e:
        print(f"‚ùå Failed {file_stem}: {e}")


  0%|          | 0/3 [00:00<?, ?it/s]2025-12-08 15:58:13,861 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-08 15:58:13,916 - INFO - Going to convert document batch...
2025-12-08 15:58:13,917 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 1064fff70b16649e2a9cc84da931292b
2025-12-08 15:58:13,954 - INFO - Loading plugin 'docling_defaults'
2025-12-08 15:58:13,956 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-12-08 15:58:13,995 - INFO - Loading plugin 'docling_defaults'
2025-12-08 15:58:14,000 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-12-08 15:58:14,305 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2025-12-08 15:58:14,318 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-12-08 15:58:14,324 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\tnkru\anaconda3\envs\GenAI\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx

✅ Processed: Attention is all you need
   found ID: arXiv:1706.03762v7
   found 9 authors


2025-12-08 15:58:32,199 - INFO - Finished converting document BERT.pdf in 7.47 sec.
 67%|██████▋   | 2/3 [00:18<00:08,  8.87s/it]2025-12-08 15:58:32,205 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-08 15:58:32,206 - INFO - Going to convert document batch...
2025-12-08 15:58:32,207 - INFO - Processing document sentence bert.pdf


✅ Processed: BERT
   found ID: arXiv:1810.04805v2
   found 3 authors


2025-12-08 15:58:36,094 - INFO - Finished converting document sentence bert.pdf in 3.89 sec.
100%|██████████| 3/3 [00:22<00:00,  7.41s/it]

✅ Processed: sentence bert
   found ID: arXiv:1908.10084v1
   found 3 authors
CPU times: total: 1min 9s
Wall time: 22.2 s





In [10]:
# Drop the notebook's reference to the Docling converter now that conversion is done
# (presumably so its models can be released before the embedder is loaded - confirm).
# NOTE(review): this cell is not idempotent; re-running it without re-creating
# `converter` raises NameError.
del converter

## Embed and store in ChromaDB

In [14]:
def ingest_papers_to_chroma(
        json_folder: str,
        collection: chromadb.Collection,
        embedding_model: BaseEmbeddingModel
):
    """
    Read the structured JSON papers in ``json_folder`` and upsert them into ChromaDB.

    Every non-empty section of a paper becomes one chunk. All chunks of a paper
    are embedded with a single ``embedding_model.encode`` call and written with
    a single ``collection.upsert`` call, so re-running the ingestion updates
    existing chunks instead of duplicating them.

    Args:
        json_folder: Folder whose ``*.json`` files are ingested. Each file must
            provide the keys ``filename``, ``metadata`` (a dict) and
            ``sections`` (a dict mapping section header -> section text).
        collection: Target Chroma collection.
        embedding_model: Embedder; the result of ``encode(list_of_texts)`` must
            offer ``.tolist()`` (presumably a NumPy array - confirm against the
            embedder classes).

    Returns:
        None. Progress and a final status line are printed.
    """

    json_files = glob.glob(os.path.join(json_folder, "*.json"))
    print(f"Found {len(json_files)} JSON files to ingest.")

    for json_file in tqdm(json_files, desc="Processing Papers"):
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        paper_meta = data['metadata']
        arxiv_id = paper_meta.get('arxiv_id')

        # --- A. Determine Parent ID ---
        # Prefer arXiv ID, fallback to filename if missing
        # TODO: get ID from Semantic Scholar
        parent_id = arxiv_id or data['filename']

        # Clean ID (Chroma requires IDs to be strings, usually safe chars)
        parent_id = parent_id.replace(" ", "_").replace(":", "_")

        # --- B. Prepare Batches for this Document ---
        documents: List[str] = []
        metadatas: List[Dict[str, Any]] = []
        ids: List[str] = []

        # `x or default` instead of `.get(key, default)`: a key that is present
        # but null/empty gets the same fallback as a missing key, matching the
        # parent-ID rule above (and `", ".join(None)` would raise).
        global_meta = {
            "parent_id": parent_id,
            "filename": data['filename'],
            "title": paper_meta.get('title') or "Unknown",
            "authors": ", ".join(paper_meta.get('authors') or []),
            "arxiv_id": arxiv_id or "N/A"
        }

        seen_ids = set()
        for section_header, content in tqdm(data['sections'].items(), desc="Processing Sections"):
            if not content.strip():
                continue

            # 1. Create Unique ID for this chunk
            safe_header = section_header.replace(" ", "_")[:50]
            chunk_id = f"{parent_id}#{safe_header}"
            # Two headers can map to the same ID (identical first 50 characters,
            # or differing only in space vs. underscore). Duplicate IDs inside
            # one upsert are rejected, so later occurrences get a numeric suffix.
            # Section order in the JSON is stable, hence so are the suffixes.
            suffix = 2
            while chunk_id in seen_ids:
                chunk_id = f"{parent_id}#{safe_header}~{suffix}"
                suffix += 1
            seen_ids.add(chunk_id)

            # 2. Create Metadata for this chunk
            chunk_meta = global_meta.copy()
            chunk_meta["section"] = section_header
            chunk_meta["is_preamble"] = (section_header == "Preamble")

            # 3. Store the text with newlines replaced by spaces
            documents.append(content.replace("\n", " "))
            metadatas.append(chunk_meta)
            ids.append(chunk_id)

        # Nothing to embed for a paper without non-empty sections
        if not documents:
            continue

        # --- C. Generate Embeddings ---
        # Announce the slow step before it starts, not after it has finished
        print('Generating embeddings')
        # Chroma is given plain Python lists, hence .tolist()
        embeddings_list = embedding_model.encode(documents).tolist()

        # --- D. Upsert to Chroma ---
        # using upsert handles re-runs gracefully (updates existing IDs)
        collection.upsert(
            ids=ids,
            embeddings=embeddings_list,
            documents=documents,
            metadatas=metadatas
        )

    print("Ingestion Complete.")

In [12]:
match CURRENT_MODEL:
    case "bert":
        embedder = ModernBertEmbedder(
            model_name="Alibaba-NLP/gte-modernbert-base",
            normalize=True
        )
    case "qwen":
        embedder = QwenEmbedder("Qwen/Qwen3-Embedding-8B", use_fp16=True)


Loading Qwen/Qwen3-Embedding-8B on cuda...


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [13]:
%%time
ingest_papers_to_chroma(
    json_folder=OUTPUT_FOLDER,
    collection=pipeline_test_collection,
    embedding_model=embedder
)

Found 3 JSON files to ingest.


Processing Papers:   0%|          | 0/3 [00:00<?, ?it/s]
Processing Sections: 100%|██████████| 28/28 [00:00<?, ?it/s][A
Processing Papers:  33%|███▎      | 1/3 [01:35<03:11, 95.85s/it]
Processing Sections: 100%|██████████| 31/31 [00:00<?, ?it/s][A
Processing Papers:  67%|██████▋   | 2/3 [04:51<02:34, 154.56s/it]
Processing Sections: 100%|██████████| 18/18 [00:00<00:00, 35985.45it/s]
Processing Papers: 100%|██████████| 3/3 [06:47<00:00, 135.86s/it]

Ingestion Complete.
CPU times: total: 6min 31s
Wall time: 6min 47s





In [15]:
# Sanity check: number of chunks currently stored in the collection.
chunk_count = pipeline_test_collection.count()
print(f"Collection Count: {chunk_count}")

Collection Count: 77


In [16]:
def query_chroma(
        collection: chromadb.Collection,
        query_text: str,
        model: BaseEmbeddingModel,
        n_results: int = 5,
):
    """
    Embed ``query_text``, fetch the closest chunks from ``collection`` and print them.

    Args:
        collection: Chroma collection to search.
        query_text: Free-text query.
        model: Embedder used for the query; it should be the same model that
            produced the stored embeddings. ``encode([text])`` must return an
            object offering ``.tolist()``.
        n_results: Maximum number of chunks to retrieve.

    Returns:
        The raw result of ``collection.query`` when at least one chunk was
        found. ``None`` when nothing was found or when an error occurred; the
        error is printed rather than raised, so the notebook keeps running.
    """
    print(f"--- 🔍 Querying for: '{query_text}' ---")

    try:
        query_vector_list = model.encode([query_text]).tolist()

        results = collection.query(
            query_embeddings=query_vector_list,
            n_results=n_results,
            # Optional: Filter by metadata (e.g., only from specific paper).
            # Stored parent IDs have ":" replaced by "_":
            # where={"parent_id": "arXiv_1706.03762v7"}
        )

        # Only one query was sent, so index 0 holds all of its hits
        hit_ids = results['ids'][0]
        if not hit_ids:
            print("No results found.")
            return None

        print(f"\n✅ Found {len(hit_ids)} relevant chunks:\n")

        for i, doc_id in enumerate(hit_ids):
            # Lower is better. The metric is whatever the collection was created
            # with; this notebook's collection uses "hnsw:space": "ip" (inner
            # product), so these values are not L2 distances.
            score = results['distances'][0][i]
            # Guard against chunks stored without a document or without metadata
            content = results['documents'][0][i] or ""
            metadata = results['metadatas'][0][i] or {}

            # Preview first 300 chars; mark the cut only if something was cut
            snippet = content[:300]
            if len(content) > 300:
                snippet += "..."

            print(f"Result #{i + 1} (Distance: {score:.4f})")
            print(f"📄 Paper: {metadata.get('title', 'Unknown')}")
            print(f"📌 Section: {metadata.get('section', 'Unknown')}")
            print(f"🔗 ID: {doc_id}")
            print("-" * 40)
            print("📝 Content Snippet:")
            print(textwrap.fill(snippet, width=80))
            print("\n" + "=" * 60 + "\n")

        return results
    except Exception as e:
        # Best effort, but say what failed: a bare KeyError prints only the key
        print(f"Query failed: {type(e).__name__}: {e}")
        return None

In [18]:
# Example retrieval: the three chunks closest to the query text.
results = query_chroma(
    pipeline_test_collection,
    "transformer architecture details",
    embedder,
    n_results=3,
)

--- 🔍 Querying for: 'Sentence embeddings' ---

✅ Found 3 relevant chunks:

Result #1 (Distance: 0.2980)
📄 Paper: Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks
📌 Section: References
🔗 ID: arXiv_1908.10084v1#References
----------------------------------------
📝 Content Snippet:
Eneko Agirre, Carmen Banea, Claire Cardie, Daniel Cer, Mona Diab, Aitor
Gonzalez-Agirre, Weiwei Guo, Inigo Lopez-Gazpio, Montse Maritxalar, Rada
Mihalcea, German Rigau, Larraitz Uria, and Janyce Wiebe. 2015. SemEval-2015 Task
2: Semantic Textual Similarity, English, Spanish and Pilot on Interpretabi...


Result #2 (Distance: 0.3155)
📄 Paper: BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding
📌 Section: References
🔗 ID: arXiv_1810.04805v2#References
----------------------------------------
📝 Content Snippet:
Alan Akbik, Duncan Blythe, and Roland Vollgraf. 2018. Contextual string
embeddings for sequence labeling. In Proceedings of the 27th I