In [1]:
import torch
from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.document import DocTagsDocument
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image
import requests
from PIL import Image
from io import BytesIO
import IPython.display as display

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Checkpoint id used for both the processor and the model.
MODEL_ID = "ds4sd/SmolDocling-256M-preview"

processor = AutoProcessor.from_pretrained(MODEL_ID)

# Request bfloat16 weights, then move the model to the selected device.
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
)
model = model.to(DEVICE)

# A single user turn: an image entry followed by the text instruction.
# The image itself is supplied separately when the processor is called.
_image_entry = {"type": "image"}
_instruction_entry = {"type": "text", "text": "convert this page to docling."}
messages = [
    {"role": "user", "content": [_image_entry, _instruction_entry]},
]

In [4]:
def prompt(image_path):
    """Convert one page image to DocTags and print the DocTags and Markdown.

    The image is opened with PIL and converted to RGB, combined with the
    module-level chat ``messages`` by ``processor``, and fed to ``model``.
    The decoded DocTags string is printed first, followed by the Markdown
    export of the ``DoclingDocument`` loaded from those tags.

    Args:
        image_path: Forwarded unchanged to ``Image.open``.

    Returns:
        None. Everything is written to stdout.
    """
    page_image = Image.open(image_path).convert("RGB")

    # Build the prompt text from the chat messages, then let the processor
    # combine it with the image into PyTorch tensors placed on DEVICE.
    chat_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    model_inputs = processor(text=chat_text, images=[page_image], return_tensors="pt")
    model_inputs = model_inputs.to(DEVICE)

    output_ids = model.generate(**model_inputs, max_new_tokens=8192)

    # Decode only the positions past the prompt length; special tokens are
    # kept in the decoded text (skip_special_tokens=False).
    n_prompt_tokens = model_inputs.input_ids.shape[1]
    new_token_ids = output_ids[:, n_prompt_tokens:]
    decoded_batch = processor.batch_decode(
        new_token_ids,
        skip_special_tokens=False,
    )
    doctags = decoded_batch[0].lstrip()

    # Pair the DocTags string with its source image before loading the document.
    doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [page_image])
    print(doctags)

    doc = DoclingDocument(name="Document")
    # NOTE(review): relies on load_from_doctags populating `doc` in place —
    # confirm against the installed docling-core version.
    doc.load_from_doctags(doctags_doc)

    print("\n\n")
    print(f"**MD Output:**\n\n{doc.export_to_markdown()}")

In [30]:
# Convert page-13.png; prompt() prints the raw DocTags and then the Markdown export.
# NOTE(review): the saved output of this cell stops partway through the <formula>
# element and shows no "**MD Output:**" section — re-run to confirm the call completed.
prompt("page-13.png")

<doctag><section_header_level_1><loc_97><loc_14><loc_412><loc_33>Closed loop transfer function of drive in z domain (cont'd)</section_header_level_1>
<picture><loc_22><loc_63><loc_498><loc_170><other></picture>
<section_header_level_1><loc_187><loc_180><loc_291><loc_194>Position feedback loop</section_header_level_1>
<section_header_level_1><loc_15><loc_207><loc_230><loc_222>Closed loop transfer function of the machine:</section_header_level_1>
<formula><loc_15><loc_240><loc_485><loc_284>G _ { c l } ( z ) = \frac { D ( z ) G _ { c } ( z ) } { 1 + D ( z ) G _ { c } ( z ) } = \frac { K _ { p } z + b } { 1 + K _ { p } z + a } G _ { c } ( z ) = \frac { K _ { c l } z ^ { - 1 } + a _ { 2 } z ^ { 2 } + a _ { 3 } z + a _ { 0 } } { z ^ { 4 } + \beta _ { 3 } z ^ { 3 } + \beta _ { 2 } z ^ { 2 } + \beta _ { 1 } z + \beta _ { 0 } } = \frac { z ^ { - 1 } ( 1 + a _ { 2 } z ^ { - 1 } + a _ { 3 } z ^ { - 2 } + a _ { 0 } z ^ { - 3 } ) } { z ^ { 1 } + \beta _ { 3 } z ^ { - 1 } + \beta _ { 2 } z ^ { - 2 }

In [5]:
# Convert page-7.png; prompt() prints the raw DocTags and then the Markdown export.
prompt("page-7.png")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


<doctag><section_header_level_1><loc_25><loc_14><loc_427><loc_40>Article 3.2 - ZOH Equivalent of Continuous Systems</section_header_level_1>
<picture><loc_15><loc_88><loc_480><loc_460><other></picture>
<picture><loc_4><loc_469><loc_35><loc_498><logo></picture>
<page_footer><loc_25><loc_481><loc_317><loc_492>© University of British Columbia - Manufacturing Automation Laboratory Prof. Yusuf Altintas</page_footer>
<page_footer><loc_462><loc_471><loc_470><loc_489>8</page_footer>
</doctag><end_of_utterance>



**MD Output:**

## Article 3.2 - ZOH Equivalent of Continuous Systems

<!-- image -->

<!-- image -->
