In [1]:
from docling.document_converter import DocumentConverter
from chonkie import RecursiveChunker
from verbatim_rag.document import (
    Document,
    Chunk,
    ProcessedChunk,
    DocumentType,
    ChunkType,
)
from verbatim_rag.index import VerbatimIndex

# URL of the paper's PDF; it is handed to the converter and recorded as the Document's source
paper_url = "https://aclanthology.org/2020.lrec-1.448.pdf"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Convert the paper referenced by paper_url with docling
converter = DocumentConverter()
result = converter.convert(paper_url)

# Keep the Markdown export: later cells use it as the raw text to wrap and chunk
converted_document = result.document
result_md = converted_document.export_to_markdown()



In [3]:
# Bibliographic details stored alongside the document
paper_metadata = {
    "authors": ["Ádám Kovács", "Judit Ács", "András Kornai", "Gábor Recski"],
    "venue": "LREC 2020",
    "year": 2020,
}

# Wrap the Markdown export in a Document, keeping the original URL as its source
document = Document(
    title="Better Together: Modern Methods Plus Traditional Thinking in NP Alignment",
    source=paper_url,
    content_type=DocumentType.PDF,
    raw_content=result_md,
    metadata=paper_metadata,
)

In [4]:
# Let's chunk the raw Markdown with chonkie: build a recursive chunker from the
# "markdown" recipe (English) and call it on the exported text.
chunker = RecursiveChunker.from_recipe("markdown", lang="en")
chunks = chunker(result_md)

In [5]:
# Peek at the first chunk to sanity-check the split
first_chunk = chunks[0]
first_chunk

RecursiveChunk(text=c © European Language Resources Association (ELRA), licensed under CC-BY-NC

## Better Together: Modern Methods Plus Traditional Thinking in NP Alignment

´ Ad´ am Kov´ acs 1 , 2 , Judit ´ Acs 1 , 2 , Andr´ as Kornai 2 , G´ abor Recski 1 , 3

1 BME Dept. of Automation and Applied Informatics, 2 SZTAKI Institute of Computer Science, 3 Apollo.AI lastname.firstname@aut.bme.hu, andras@kornai.com, gabor@apollo.ai

## Abstract

We study a typical intermediary task to Machine Translation, the alignment of NPs in the bitext. After arguing that the task remains relevant even in an end-to-end paradigm, we present simple, dictionary- and word vector-based baselines and a BERT-based system. Our results make clear that even state of the art systems relying on the best end-to-end methods can be improved by bringing in old-fashioned methods such as stopword removal, lemmatization, and dictionaries

Keywords: NP-alignment, rule-based, BERT, hybrid

, start_index=0, end_index=947, t

In [6]:
# Register every chonkie chunk on the document.
# NOTE(review): this cell mutates `document` in place; re-running it presumably
# appends the same chunks a second time — confirm add_chunk semantics.
for position, piece in enumerate(chunks):
    # Plain Chunk tied to the document; every chunk is tagged as a paragraph
    base_chunk = Chunk(
        document_id=document.id,
        content=piece.text,
        chunk_number=position,
        chunk_type=ChunkType.PARAGRAPH,
    )

    # No enhancement step here: the processed chunk reuses the chunk text verbatim
    base_chunk.add_processed_chunk(
        ProcessedChunk(
            chunk_id=base_chunk.id,
            enhanced_content=piece.text,
        )
    )

    document.add_chunk(base_chunk)

In [7]:
# Build a sparse-only index: no dense model is configured, only the SPLADE model
index = VerbatimIndex(
    dense_model=None,
    sparse_model="naver/splade-v3",
    db_path="./index.db",
)

In [8]:
# Index the prepared document (add_documents is given a list)
documents_to_index = [document]
index.add_documents(documents_to_index)

In [9]:
# Run a few sample queries against the index and print a short preview of each hit.
queries = [
    "main contributions",
    "methodology approach",
    "experimental results",
]

top_k = 2  # number of hits requested per query
preview_chars = 100  # how many characters of each hit's text are printed

for query in queries:
    print(f"\n🔍 '{query}':")
    results = index.search(query, k=top_k)

    # The loop variable is `hit`, not `result`, so the conversion result bound to
    # `result` in the conversion cell is not overwritten in the kernel namespace.
    for rank, hit in enumerate(results, start=1):
        print(f"  {rank}. Score: {hit.score:.2f}")
        print(f"     Text: {hit.text[:preview_chars]}...")


🔍 'main contributions':
  1. Score: 4.25
     Text: In this paper we argue that using traditional, structure-based thinking, even about tasks such as MT...
  2. Score: 2.89
     Text: ## 5. Conclusion

The results presented in Table 2 make clear that even SOTA systems relying on the ...

🔍 'methodology approach':
  1. Score: 9.28
     Text: c © European Language Resources Association (ELRA), licensed under CC-BY-NC

## Better Together: Mod...
  2. Score: 8.46
     Text: ## 5. Conclusion

The results presented in Table 2 make clear that even SOTA systems relying on the ...

🔍 'experimental results':
  1. Score: 9.50
     Text: ## 3.2. BERT

Our second method maps English and Hungarian NPs to vectors using the multilingual BER...
  2. Score: 9.41
     Text: ## 4. Results

We split the set of labeled NP pairs extracted from the 1984 dataset into train and t...
