AVISO: Este notebook fue ejecutado en Google Colab Pro con una GPU A100.

In [2]:
from docling.document_converter import DocumentConverter
# Single converter instance; a later cell calls converter.convert() on the sample PDF.
converter = DocumentConverter()

In [None]:
from docling_core.transforms.chunker import HierarchicalChunker

# Path (relative to the notebook's working directory) of the manual to convert.
sample_pdf = "./resources/manuals/Archer AX11000.pdf"
# Most resource-intensive part of the notebook.
# The converter interprets the PDF content and parses it into a structured format.
converted_result = converter.convert(sample_pdf)

# The structured document; this is the object handed to the chunkers in later cells.
doc = converted_result.document

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

otslp_all_standard_094_clean.check:   0%|          | 0.00/213M [00:00<?, ?B/s]

otslp_all_fast.check:   0%|          | 0.00/146M [00:00<?, ?B/s]

model.pt:   0%|          | 0.00/202M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

.gitignore:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.49k [00:00<?, ?B/s]

(…)artifacts/tableformer/fat/tm_config.json:   0%|          | 0.00/7.09k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.71k [00:00<?, ?B/s]

(…)del_artifacts/tableformer/tm_config.json:   0%|          | 0.00/7.09k [00:00<?, ?B/s]



Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |██████████████████████████████████████████████████| 100.0% Complete

In [4]:
# Bare expression: shows the converted document's repr as the cell output.
doc



La variable `doc` es la que se pasará como entrada a nuestro chunker avanzado. Antes, sin embargo, podemos probar a hacer un chunking jerárquico de ese `doc`. (`hier_chunks` no se usa en el resto del notebook: el chunker avanzado hará de forma automática este mismo chunking sobre `doc` y después lo refinará según el límite de tokens.)

In [5]:
# Exploratory only: materialise the purely hierarchical chunks of `doc` for inspection.
hier_chunks = [*HierarchicalChunker().chunk(doc)]

In [6]:
# Bare expression: shows the repr of every hierarchical chunk as the cell output.
hier_chunks

[DocChunk(text='User Guide', meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[TextItem(self_ref='#/texts/0', parent=RefItem(cref='#/body'), children=[], label=<DocItemLabel.TEXT: 'text'>, prov=[ProvenanceItem(page_no=1, bbox=BoundingBox(l=189.30670166015625, t=485.01727294921875, r=415.20269775390625, b=397.41326904296875, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 10))], orig='User Guide', text='User Guide')], headings=None, captions=None, origin=DocumentOrigin(mimetype='application/pdf', binary_hash=13203266288581993525, filename='Archer AX11000.pdf', uri=None))),
 DocChunk(text='AX11000 MU-MIMO Tri-Band Gaming Router Archer AX11000', meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[TextItem(self_ref='#/texts/1', parent=RefItem(cref='#/body'), children=[], label=<DocItemLabel.TEXT: 'text'>, prov=[ProvenanceItem(page_no=1, bbox=BoundingBox(l=162.2770233154297, t=420.93

In [7]:
from copy import deepcopy
from typing import Iterable, Iterator

from docling_core.transforms.chunker import (
    BaseChunk,
    BaseChunker,
    DocMeta,
    HierarchicalChunker,
)
from docling_core.types.doc import DoclingDocument as DLDocument
from pydantic import ConfigDict, PositiveInt
from transformers import AutoTokenizer

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [8]:
class MaxTokenLimitingChunker(BaseChunker):
    """Two-stage chunker that caps the token count of every emitted chunk.

    The document is first chunked by ``inner_chunker``. Each resulting chunk is
    then prefixed with its headings/captions and, when its text does not fit in
    the remaining token budget, cut into consecutive token windows that each
    repeat the same prefix.

    Token counts are taken without special tokens, and the prefix and the text
    are tokenized separately.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # First-pass chunker whose output is refined by the token limit.
    inner_chunker: BaseChunker = HierarchicalChunker()
    # Tokenizer used for counting/splitting; the default is loaded when this
    # class body executes.
    # NOTE(review): `from_pretrained` presumably returns a concrete tokenizer
    # class rather than an `AutoTokenizer` instance -- confirm that passing a
    # custom `tokenizer=` passes pydantic validation.
    tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5")
    # Token budget per emitted chunk (metadata prefix + text).
    max_tokens: PositiveInt = 512
    # Separator placed between headings, captions and the chunk text.
    delim: str = "\n"

    def _serialize_meta_to_include(self, meta: DocMeta) -> str:
        """Join the chunk's headings and captions into one prefix string.

        Args:
            meta: Validated chunk metadata.

        Returns:
            Headings followed by captions, joined with ``delim``; an empty
            string when the chunk has neither.
        """
        meta_parts = []
        headings_part = self.delim.join(meta.headings or [])  # section titles
        if headings_part:
            meta_parts.append(headings_part)
        captions_part = self.delim.join(meta.captions or [])  # captions, if any
        if captions_part:
            meta_parts.append(captions_part)
        return self.delim.join(meta_parts)

    def _token_offsets(self, text: str):
        """Return the tokenizer's offset mapping for ``text`` (no special tokens)."""
        return self.tokenizer(
            text, return_offsets_mapping=True, add_special_tokens=False
        )["offset_mapping"]

    def _split_above_max_tokens(
        self, chunk_iter: Iterable[BaseChunk]
    ) -> Iterator[BaseChunk]:
        """Second pass: enforce ``max_tokens`` on every incoming chunk.

        Args:
            chunk_iter: Chunks produced by the inner chunker.

        Yields:
            Deep copies of the incoming chunks whose ``text`` is the metadata
            prefix plus either the whole chunk text (when it fits) or one
            token window of it.

        Warns:
            UserWarning: when the metadata prefix alone uses up the whole token
                budget; the chunk text is then emitted without the prefix.
        """
        import warnings  # local import so the class does not depend on another cell

        for chunk in chunk_iter:
            meta = DocMeta.model_validate(chunk.meta)
            meta_text = self._serialize_meta_to_include(meta=meta)
            meta_list = [meta_text] if meta_text else []

            meta_tokens = self._token_offsets(meta_text)
            # The delimiter only costs tokens when there is a prefix to separate.
            delim_tokens = self._token_offsets(self.delim) if meta_text else []
            num_tokens_avail_for_text = self.max_tokens - (
                len(meta_tokens) + len(delim_tokens)
            )

            text_tokens = self._token_offsets(chunk.text)
            num_text_tokens = len(text_tokens)

            if num_text_tokens and num_tokens_avail_for_text <= 0:
                # The prefix alone fills (or exceeds) the budget. A window size
                # of 0 would make range() raise ValueError, and a negative one
                # would silently drop the text. Fall back to emitting the text
                # without the prefix; the copied chunk still carries `meta`.
                warnings.warn(
                    f"Headings/captions use {len(meta_tokens) + len(delim_tokens)} "
                    f"tokens, leaving no room for text within "
                    f"max_tokens={self.max_tokens}; emitting the chunk text "
                    "without the metadata prefix."
                )
                meta_list = []
                num_tokens_avail_for_text = self.max_tokens

            if num_text_tokens <= num_tokens_avail_for_text:
                # Chunk already within the token limit: emit prefix + full text.
                c = deepcopy(chunk)
                c.text = self.delim.join(
                    meta_list + ([chunk.text] if chunk.text else [])
                )
                yield c
            else:
                # Chunk requires further splitting: slice the original string
                # using the character offsets of the first and last token of
                # each window. (A text-less chunk whose prefix exceeds the
                # budget produces an empty range here and is skipped.)
                for base in range(0, num_text_tokens, num_tokens_avail_for_text):
                    last = min(base + num_tokens_avail_for_text, num_text_tokens) - 1
                    window = chunk.text[text_tokens[base][0] : text_tokens[last][1]]
                    c = deepcopy(chunk)
                    c.text = self.delim.join(meta_list + [window])
                    yield c

    def chunk(self, dl_doc: DLDocument, **kwargs) -> Iterator[BaseChunk]:
        """Chunk ``dl_doc`` with the inner chunker, then apply the token limit.

        Args:
            dl_doc: Document to chunk.
            **kwargs: Forwarded unchanged to ``inner_chunker.chunk``.

        Yields:
            Chunks refined by ``_split_above_max_tokens``.
        """
        chunk_iter = self.inner_chunker.chunk(dl_doc=dl_doc, **kwargs)
        yield from self._split_above_max_tokens(chunk_iter=chunk_iter)

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [9]:
# Build the two-stage chunker: hierarchical split first, token cap second.
complete_chunker = MaxTokenLimitingChunker(
    max_tokens=150,  # token limit per chunk
    inner_chunker=HierarchicalChunker(),  # first-pass hierarchical chunker
)

# Run both stages over the converted document and keep every resulting chunk.
final_chunks = [*complete_chunker.chunk(doc)]

# Report, for every final chunk, its token length, metadata and text.
for chunk in final_chunks:
    meta = DocMeta.model_validate(chunk.meta)  # validated metadata of this chunk
    # Size of the chunk, measured with the same tokenizer the chunker uses.
    encoding = complete_chunker.tokenizer(
        chunk.text, return_offsets_mapping=True, add_special_tokens=False
    )
    token_length = len(encoding["offset_mapping"])

    print(f"Token Length: {token_length}")
    print("RAW Metadata:", meta)
    print("PARSED Metadata Fields:")
    for field, value in vars(meta).items():
        if field != "doc_items":
            print(f"  {field}: {value}")
            continue
        # doc_items is expanded item by item instead of printed as one repr.
        print(f"  {field}:")
        for item_no, item in enumerate(value, start=1):
            print(f"    Item {item_no}:")
            for item_field, item_value in vars(item).items():
                if item_field != "prov":
                    print(f"      {item_field}: {item_value}")
                    continue
                # Provenance entries are expanded one level further.
                print(f"      {item_field}:")
                for prov_no, prov_item in enumerate(item_value, start=1):
                    print(f"        Provenance {prov_no}:")
                    for prov_field, prov_value in vars(prov_item).items():
                        if prov_field != "bbox":
                            print(f"          {prov_field}: {prov_value}")
                            continue
                        # Bounding box: one line per attribute.
                        print(f"          {prov_field}:")
                        for bbox_field, bbox_value in vars(prov_value).items():
                            print(f"            {bbox_field}: {bbox_value}")
    print("Chunk Text:", chunk.text)
    print("-" * 50)  # visual separator between chunks

Token indices sequence length is longer than the specified maximum sequence length for this model (2655 > 512). Running this sequence through the model will result in indexing errors


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
      text: 4 ) Click SAVE .
      enumerated: False
      marker: -
  headings: ['Note:']
  captions: None
  origin: mimetype='application/pdf' binary_hash=13203266288581993525 filename='Archer AX11000.pdf' uri=None
Chunk Text: Note:
1 ) Visit http://tplinkwifi.net , and log in with your TP-Link ID or the password you set for the router.
3 ) Modify the LAN IP address as the follow picture shows. Here we take 192.168.2.1 as an example.
2 ) Go to Advanced > Network > LAN .
4 ) Click SAVE .
--------------------------------------------------
Token Length: 150
RAW Metadata: schema_name='docling_core.transforms.chunker.DocMeta' version='1.0.0' doc_items=[ListItem(self_ref='#/texts/1338', parent=RefItem(cref='#/groups/279'), children=[], label=<DocItemLabel.LIST_ITEM: 'list_item'>, prov=[ProvenanceItem(page_no=118, bbox=BoundingBox(l=84.46641540527344, t=483.1624755859375, r=264.2479553222656, b=461.261474609375, coord_origin=<