In [12]:
import base64
import re
import textwrap
from io import BytesIO
from pathlib import Path

import numpy as np
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    RapidOcrOptions,
    smolvlm_picture_description,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from dotenv import load_dotenv
from IPython.display import HTML, display
from ollama import chat
from PIL import Image
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

load_dotenv()

True

In [28]:
pipeline_options = PdfPipelineOptions(
    generate_page_images=True,
    images_scale=1.00,
    do_ocr=True,
    do_picture_description=True,
    ocr_options=RapidOcrOptions(),
    picture_description_options=smolvlm_picture_description,
)

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)

In [56]:
#document_path = Path("test_data/01. House Rules - Current Version.pdf")
document_path = Path("test_data/House Rules.pdf")
document_path

PosixPath('test_data/House Rules.pdf')

In [57]:
%%time

result = converter.convert(document_path)

CPU times: user 53.1 s, sys: 3.27 s, total: 56.4 s
Wall time: 23.5 s


In [58]:
document = result.document

In [None]:
print(document.export_to_markdown())

In [61]:
#print(document.pictures[0].annotations)
print(document.pictures[0])

self_ref='#/pictures/0' parent=RefItem(cref='#/body') children=[RefItem(cref='#/texts/2')] content_layer=<ContentLayer.BODY: 'body'> label=<DocItemLabel.PICTURE: 'picture'> prov=[ProvenanceItem(page_no=1, bbox=BoundingBox(l=110.65599822998047, t=489.8112487792969, r=494.0281982421875, b=200.79742431640625, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 0))] captions=[] references=[] footnotes=[] image=None annotations=[DescriptionAnnotation(kind='description', text='In this image we can see a building with windows and balconies. We can also see a tree and a bicycle. We can also see the sky with clouds.', provenance='HuggingFaceTB/SmolVLM-256M-Instruct')]


In [60]:
print("Number of pictures:", len(document.pictures))
for i, pic in enumerate(document.pictures):
    print(f"Picture {i}: loaded image: {pic.image is not None}")

Number of pictures: 1
Picture 0: loaded image: False


In [62]:
annotations = []
for picture in document.pictures:
    for annotation in picture.annotations:
            annotations.append(annotation.text)
assert len(annotations) == len(document.pictures)

In [67]:
def replace_occurences(text, target, replacements):
    for replacement in replacements:
        if target in text:
            text = text.replace(target, replacement, 1)
        else:
            raise ValueError(
                f"No more occurences of '{target}' found in the text for replacement ({replacement})."
            )
    return text

In [63]:
IMAGE_PLACEHOLDER = "<!-- image_placeholder -->"
PAGE_BREAK_PLACEHOLDER = "<!-- page_break -->"
text = document.export_to_markdown(
    page_break_placeholder=PAGE_BREAK_PLACEHOLDER, image_placeholder=IMAGE_PLACEHOLDER
)

In [None]:
print(text)

In [None]:
print(replace_occurences(text, IMAGE_PLACEHOLDER, annotations))

In [78]:
def process_document(
    document_path: Path, converter: DocumentConverter, n_pages: int = -1
) -> str:
    result = converter.convert(document_path)
    document = result.document

    annotations = []
    for picture in document.pictures:
        print(picture)
        for annotation in picture.annotations:
            annotations.append(annotation.text)

    if(len(annotations) == len(document.pictures)):
        console.log("mismatch in number of annotations and number or pictures")
    text = document.export_to_markdown(
        page_break_placeholder=PAGE_BREAK_PLACEHOLDER,
        image_placeholder=IMAGE_PLACEHOLDER,
    )
    text = replace_occurences(text, IMAGE_PLACEHOLDER, annotations)
    if n_pages == -1:
        return text
    return PAGE_BREAK_PLACEHOLDER.join(text.split(PAGE_BREAK_PLACEHOLDER)[:n_pages])

In [79]:
%%time

document_path = Path("test_data/House Rules.pdf")
document_text = process_document(document_path, converter, n_pages=12)

self_ref='#/pictures/0' parent=RefItem(cref='#/body') children=[RefItem(cref='#/texts/2')] content_layer=<ContentLayer.BODY: 'body'> label=<DocItemLabel.PICTURE: 'picture'> prov=[ProvenanceItem(page_no=1, bbox=BoundingBox(l=110.65599822998047, t=489.8112487792969, r=494.0281982421875, b=200.79742431640625, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 0))] captions=[] references=[] footnotes=[] image=None annotations=[DescriptionAnnotation(kind='description', text='In this image we can see a building with windows and balconies. We can also see a tree and a bicycle. We can also see the sky with clouds.', provenance='HuggingFaceTB/SmolVLM-256M-Instruct')]


In [80]:
print(len(document_text.split(" ")))

1615


In [None]:
print(document_text)

In [82]:
SPLIT_PATTERN = "\n"
chunks = document_text.split(SPLIT_PATTERN)

In [85]:
chunked_text = ""
for i, chunk in enumerate(chunks):
    if chunk.startswith("#"):
        chunk = f"#{chunk}"
    chunked_text += f"<|start_chunk_{i}>\n{chunk}<|end_chunk_{i}|>"

In [None]:
print(chunked_text)

In [92]:
MODEL = "hf.co/google/gemma-3-12b-it-qat-q4_0-gguf:latest"
TEMPERATURE = 0.0
MIN_P = 0.0
REPEAT_PENALTY = 1.0
TOP_K = 64
TOP_P = 0.95

In [96]:
def call_model(prompt: str) -> str:
    response = chat(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
        keep_alive="1h",
        options={
            "num_ctx": 16384,
            "temperature": TEMPERATURE,
            "min_p": MIN_P,
            "repeat_penalty": REPEAT_PENALTY,
            "top_k": TOP_K,
            "top_p": TOP_P,
        },
    )
    return response.message.content

In [94]:
CHUNKING_PROMPT = """
You are an assistant specialized in splitting text into semantically consistent sections.

<instructions>
    <instruction>The text has been divided into chunks, each marked with <|start_chunk_X|> and <|end_chunk_X|> tags, where X is the chunk number</instruction>
    <instruction>Identify points where splits should occur, such that consecutive chunks of similar themese stay together</instruction>
    <instruction>Each chunk must be between 200 and 1000 words</instruction>
    <instruction>If chunks 1 and 2 belong together but chunk 3 starts a new topic, suggest a split after chunk 2</instruction>
    <instruction>The chunks must be listed in ascending order</instruction>
    <instruction>Provide your response in the form: 'split_after: 3, 5'</instruction>
</instructions>"

This is the document text:
<document>
{document_text}
</document>

Respond only with the IDs of the chunks where you believe a split should occur.
YOU MUST RESPOND WITH AT LEAST ONE SPLIT
""".strip()

In [None]:
prompt = CHUNKING_PROMPT.format(document_text=chunked_text)
print(prompt)

In [97]:
%%time
response = call_model(prompt)

CPU times: user 1.45 ms, sys: 2.2 ms, total: 3.66 ms
Wall time: 3.23 ms


ConnectionError: Failed to connect to Ollama. Please check that Ollama is downloaded, running and accessible. https://ollama.com/download