In [1]:
from docling.document_converter import DocumentConverter
# Create a single converter instance; the conversion cell below calls converter.convert().
converter = DocumentConverter()

In [None]:
from docling_core.transforms.chunker import HierarchicalChunker

# Relative path of the sample PDF that is converted in this notebook.
sample_pdf = "./resources/JerarquiaDocs.pdf"
# most resource-intensive part
converted_result = converter.convert(sample_pdf) # interprets the content and parses it into a structured format

# Keep the parsed document; it is the input handed to the chunkers.
doc = converted_result.document

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


In [3]:
# Display the converted document (bare last expression of the cell).
doc

DoclingDocument(schema_name='DoclingDocument', version='1.0.0', name='JerarquiaDocs', origin=DocumentOrigin(mimetype='application/pdf', binary_hash=7167666031040985369, filename='JerarquiaDocs.pdf', uri=None), furniture=GroupItem(self_ref='#/furniture', parent=None, children=[], name='_root_', label=<GroupLabel.UNSPECIFIED: 'unspecified'>), body=GroupItem(self_ref='#/body', parent=None, children=[RefItem(cref='#/texts/0'), RefItem(cref='#/texts/1'), RefItem(cref='#/texts/2'), RefItem(cref='#/texts/3'), RefItem(cref='#/groups/0'), RefItem(cref='#/texts/6'), RefItem(cref='#/texts/7'), RefItem(cref='#/texts/8'), RefItem(cref='#/texts/9'), RefItem(cref='#/texts/10')], name='_root_', label=<GroupLabel.UNSPECIFIED: 'unspecified'>), groups=[GroupItem(self_ref='#/groups/0', parent=RefItem(cref='#/body'), children=[RefItem(cref='#/texts/4'), RefItem(cref='#/texts/5')], name='list', label=<GroupLabel.LIST: 'list'>)], texts=[SectionHeaderItem(self_ref='#/texts/0', parent=RefItem(cref='#/body'), c

La variable `doc` es la que se pasará como entrada a nuestro chunker avanzado. Sin embargo, podemos probar primero a hacer un chunking jerárquico de ese `doc` (`hier_chunks` no se usa en el resto del documento: el chunker avanzado hará de forma automática este mismo chunking sobre `doc` y después aplicará el refinado por límite de tokens).

In [4]:
# Run a default HierarchicalChunker over the document and materialize its chunks in a list.
hier_chunks = list(HierarchicalChunker().chunk(doc))

In [5]:
# Display the hierarchical chunks (bare last expression of the cell).
hier_chunks

[DocChunk(text='Even if you set Temperature as Zero, it is still possible to get different answers because you are using GPUs.', meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[TextItem(self_ref='#/texts/1', parent=RefItem(cref='#/body'), children=[], label=<DocItemLabel.TEXT: 'text'>, prov=[ProvenanceItem(page_no=1, bbox=BoundingBox(l=84.44147491455078, t=722.78662109375, r=511.21490478515625, b=694.7868041992188, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 110))], orig='Even if you set Temperature as Zero, it is still possible to get different answers because you are using GPUs.', text='Even if you set Temperature as Zero, it is still possible to get different answers because you are using GPUs.')], headings=['Non-deterministic nature of llms'], captions=None, origin=DocumentOrigin(mimetype='application/pdf', binary_hash=7167666031040985369, filename='JerarquiaDocs.pdf', uri=None))),
 DocChunk(text='In this case, 

In [6]:
from copy import deepcopy
from typing import Iterable, Iterator

from docling_core.transforms.chunker import (
    BaseChunk,
    BaseChunker,
    DocMeta,
    HierarchicalChunker,
)
from docling_core.types.doc import DoclingDocument as DLDocument
from pydantic import ConfigDict, PositiveInt
from transformers import AutoTokenizer

In [7]:
class MaxTokenLimitingChunker(BaseChunker):
    """Chunker that refines another chunker's output with a token budget.

    ``chunk`` first delegates to ``inner_chunker`` and then post-processes each
    resulting chunk: the chunk's headings and captions are serialized and
    prepended to its text, and a chunk whose text does not fit in the budget
    left after that prefix is split into several chunks, each repeating the
    same prefix.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # First-stage chunker; its chunks are the input of the token-limit pass.
    inner_chunker: BaseChunker = HierarchicalChunker()
    # Tokenizer used to count tokens and to get per-token character offsets.
    # Note: this default expression runs when the class body is executed.
    tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5")
    # Token budget shared by the serialized metadata, the delimiter and the text.
    max_tokens: PositiveInt = 512
    # Separator placed between headings, captions and the chunk text.
    delim: str = "\n"

    def _serialize_meta_to_include(self, meta: DocMeta) -> str:
        """Return the metadata text that is prepended to a chunk.

        Args:
            meta: Chunk metadata; only ``headings`` and ``captions`` are read.

        Returns:
            The headings followed by the captions, all joined with ``delim``;
            an empty string when the chunk has neither.
        """
        meta_parts = []
        headings_part = self.delim.join(meta.headings or [])  # section headings
        if headings_part:
            meta_parts.append(headings_part)
        captions_part = self.delim.join(meta.captions or [])  # captions, if any
        if captions_part:
            meta_parts.append(captions_part)
        return self.delim.join(meta_parts)

    def _token_offsets(self, text: str):
        """Return the tokenizer's ``offset_mapping`` for ``text``.

        Special tokens are not added, so the number of entries is the number
        of tokens the text itself produces. The chunk text is later sliced
        with the first and second element of each entry.
        """
        return self.tokenizer(
            text, return_offsets_mapping=True, add_special_tokens=False
        )["offset_mapping"]

    def _split_above_max_tokens(
        self, chunk_iter: Iterable[BaseChunk]
    ) -> Iterator[BaseChunk]:
        """Second chunking pass: enforce the token budget on every chunk.

        Args:
            chunk_iter: Chunks produced by the inner chunker.

        Yields:
            Deep copies of the incoming chunks whose ``text`` is the metadata
            prefix joined with the chunk text (or with one slice of it when
            the text had to be split).

        Raises:
            ValueError: If a chunk's text needs splitting but its metadata
                prefix already uses the whole ``max_tokens`` budget.
        """
        for chunk in chunk_iter:
            meta = DocMeta.model_validate(chunk.meta)
            meta_text = self._serialize_meta_to_include(meta=meta)  # headings + captions
            meta_list = [meta_text] if meta_text else []
            full_ser = self.delim.join(meta_list + ([chunk.text] if chunk.text else []))

            meta_tokens = self._token_offsets(meta_text)
            # The delimiter only costs tokens when there is a prefix to join.
            delim_tokens = self._token_offsets(self.delim) if meta_text else []
            num_tokens_avail_for_text = self.max_tokens - (
                len(meta_tokens) + len(delim_tokens)
            )

            text_tokens = self._token_offsets(chunk.text)
            num_text_tokens = len(text_tokens)

            if num_text_tokens <= num_tokens_avail_for_text:
                # Chunk already within the token limit: emit it unsplit.
                c = deepcopy(chunk)
                c.text = full_ser
                yield c
                continue

            if num_tokens_avail_for_text <= 0:
                # Without this guard a zero step makes range() raise an opaque
                # error and a negative step yields an empty range, which would
                # silently drop the chunk.
                raise ValueError(
                    "Chunk metadata (headings/captions) and delimiter use "
                    f"{len(meta_tokens) + len(delim_tokens)} tokens, which leaves "
                    f"no room for text within max_tokens={self.max_tokens}; "
                    "increase max_tokens."
                )

            # Chunk requires further splitting: walk the text in windows of
            # `num_tokens_avail_for_text` tokens and slice the original string
            # from the first token's start offset to the last token's end offset.
            for base in range(0, num_text_tokens, num_tokens_avail_for_text):
                last = min(base + num_tokens_avail_for_text, num_text_tokens) - 1
                text = chunk.text[text_tokens[base][0] : text_tokens[last][1]]
                c = deepcopy(chunk)
                c.text = self.delim.join(meta_list + [text])
                yield c

    def chunk(self, dl_doc: DLDocument, **kwargs) -> Iterator[BaseChunk]:
        """Chunk ``dl_doc`` with the inner chunker, then apply the token limit.

        Args:
            dl_doc: Document to chunk.
            **kwargs: Forwarded unchanged to ``inner_chunker.chunk``.

        Yields:
            The chunks produced by ``_split_above_max_tokens``.
        """
        # First pass: delegate to the inner chunker.
        chunk_iter = self.inner_chunker.chunk(dl_doc=dl_doc, **kwargs)
        # Second pass: split whatever exceeds the token budget.
        yield from self._split_above_max_tokens(chunk_iter=chunk_iter)

In [8]:
# Build the two-stage chunker: hierarchical chunking followed by the token-limit pass.
complete_chunker = MaxTokenLimitingChunker(
    inner_chunker=HierarchicalChunker(),
    max_tokens=150,
)

# Materialize the chunks produced by both passes.
final_chunks = list(complete_chunker.chunk(doc))

# Report every final chunk: token count, raw metadata, a field-by-field
# breakdown of the metadata, and the chunk text.
for chunk in final_chunks:
    meta = DocMeta.model_validate(chunk.meta)
    # Token count of the chunk text, special tokens excluded.
    token_length = len(
        complete_chunker.tokenizer(
            chunk.text, return_offsets_mapping=True, add_special_tokens=False
        )["offset_mapping"]
    )

    print(f"Token Length: {token_length}")
    print("RAW Metadata:", meta)
    print("PARSED Metadata Fields:")
    for field, value in meta.__dict__.items():
        # Every field except doc_items is printed on a single line.
        if field != "doc_items":
            print(f"  {field}: {value}")
            continue
        print(f"  {field}:")
        for idx, item in enumerate(value):
            print(f"    Item {idx+1}:")
            for item_field, item_value in item.__dict__.items():
                # Only the provenance list is expanded further.
                if item_field != "prov":
                    print(f"      {item_field}: {item_value}")
                    continue
                print(f"      {item_field}:")
                for prov_idx, prov_item in enumerate(item_value):
                    print(f"        Provenance {prov_idx+1}:")
                    for prov_field, prov_value in prov_item.__dict__.items():
                        # The bounding box is expanded attribute by attribute.
                        if prov_field != "bbox":
                            print(f"          {prov_field}: {prov_value}")
                            continue
                        print(f"          {prov_field}:")
                        for bbox_field, bbox_value in prov_value.__dict__.items():
                            print(f"            {bbox_field}: {bbox_value}")
    print("Chunk Text:", chunk.text)
    print("-" * 50)  # visual separator between chunks

Token Length: 32
RAW Metadata: schema_name='docling_core.transforms.chunker.DocMeta' version='1.0.0' doc_items=[TextItem(self_ref='#/texts/1', parent=RefItem(cref='#/body'), children=[], label=<DocItemLabel.TEXT: 'text'>, prov=[ProvenanceItem(page_no=1, bbox=BoundingBox(l=84.44147491455078, t=722.78662109375, r=511.21490478515625, b=694.7868041992188, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 110))], orig='Even if you set Temperature as Zero, it is still possible to get different answers because you are using GPUs.', text='Even if you set Temperature as Zero, it is still possible to get different answers because you are using GPUs.')] headings=['Non-deterministic nature of llms'] captions=None origin=DocumentOrigin(mimetype='application/pdf', binary_hash=7167666031040985369, filename='JerarquiaDocs.pdf', uri=None)
PARSED Metadata Fields:
  schema_name: docling_core.transforms.chunker.DocMeta
  version: 1.0.0
  doc_items:
    Item 1:
      self_ref: #/texts/1
  