In [158]:
import base64
import re
import textwrap
from io import BytesIO
from pathlib import Path

import numpy as np
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    RapidOcrOptions,
    smolvlm_picture_description,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from dotenv import load_dotenv
from IPython.display import HTML, display
from ollama import chat
from PIL import Image
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

load_dotenv()

True

In [159]:
pipeline_options = PdfPipelineOptions(
    generate_page_images=True,
    images_scale=1.00,
    do_ocr=True,
    do_picture_description=True,
    ocr_options=RapidOcrOptions(),
    picture_description_options=smolvlm_picture_description,
)

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)

In [160]:
#document_path = Path("test_data/01. House Rules - Current Version.pdf")
document_path = Path("test_data/House Rules.pdf")
document_path

PosixPath('test_data/House Rules.pdf')

In [161]:
%%time

result = converter.convert(document_path)



CPU times: user 1min 12s, sys: 22.6 s, total: 1min 35s
Wall time: 36.5 s


In [162]:
document = result.document

In [None]:
print(document.export_to_markdown())

In [61]:
#print(document.pictures[0].annotations)
print(document.pictures[0])

self_ref='#/pictures/0' parent=RefItem(cref='#/body') children=[RefItem(cref='#/texts/2')] content_layer=<ContentLayer.BODY: 'body'> label=<DocItemLabel.PICTURE: 'picture'> prov=[ProvenanceItem(page_no=1, bbox=BoundingBox(l=110.65599822998047, t=489.8112487792969, r=494.0281982421875, b=200.79742431640625, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 0))] captions=[] references=[] footnotes=[] image=None annotations=[DescriptionAnnotation(kind='description', text='In this image we can see a building with windows and balconies. We can also see a tree and a bicycle. We can also see the sky with clouds.', provenance='HuggingFaceTB/SmolVLM-256M-Instruct')]


In [60]:
print("Number of pictures:", len(document.pictures))
for i, pic in enumerate(document.pictures):
    print(f"Picture {i}: loaded image: {pic.image is not None}")

Number of pictures: 1
Picture 0: loaded image: False


In [163]:
annotations = []
for picture in document.pictures:
    for annotation in picture.annotations:
            annotations.append(annotation.text)
assert len(annotations) == len(document.pictures)

In [164]:
def replace_occurences(text, target, replacements):
    for replacement in replacements:
        if target in text:
            text = text.replace(target, replacement, 1)
        else:
            raise ValueError(
                f"No more occurences of '{target}' found in the text for replacement ({replacement})."
            )
    return text

In [165]:
IMAGE_PLACEHOLDER = "<!-- image_placeholder -->"
PAGE_BREAK_PLACEHOLDER = "<!-- page_break -->"
text = document.export_to_markdown(
    page_break_placeholder=PAGE_BREAK_PLACEHOLDER, image_placeholder=IMAGE_PLACEHOLDER
)

In [None]:
print(text)

In [None]:
print(replace_occurences(text, IMAGE_PLACEHOLDER, annotations))

In [98]:
def process_document(
    document_path: Path, converter: DocumentConverter, n_pages: int = -1
) -> str:
    result = converter.convert(document_path)
    document = result.document

    annotations = []
    for picture in document.pictures:
        print(picture)
        for annotation in picture.annotations:
            annotations.append(annotation.text)

    if(len(annotations) == len(document.pictures)):
        print("mismatch in number of annotations and number or pictures")
    text = document.export_to_markdown(
        page_break_placeholder=PAGE_BREAK_PLACEHOLDER,
        image_placeholder=IMAGE_PLACEHOLDER,
    )
    text = replace_occurences(text, IMAGE_PLACEHOLDER, annotations)
    if n_pages == -1:
        return text
    return PAGE_BREAK_PLACEHOLDER.join(text.split(PAGE_BREAK_PLACEHOLDER)[:n_pages])

In [79]:
%%time

document_path = Path("test_data/House Rules.pdf")
document_text = process_document(document_path, converter, n_pages=12)

self_ref='#/pictures/0' parent=RefItem(cref='#/body') children=[RefItem(cref='#/texts/2')] content_layer=<ContentLayer.BODY: 'body'> label=<DocItemLabel.PICTURE: 'picture'> prov=[ProvenanceItem(page_no=1, bbox=BoundingBox(l=110.65599822998047, t=489.8112487792969, r=494.0281982421875, b=200.79742431640625, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 0))] captions=[] references=[] footnotes=[] image=None annotations=[DescriptionAnnotation(kind='description', text='In this image we can see a building with windows and balconies. We can also see a tree and a bicycle. We can also see the sky with clouds.', provenance='HuggingFaceTB/SmolVLM-256M-Instruct')]


In [80]:
print(len(document_text.split(" ")))

1615


In [157]:
print(document_text)

## House Rules Of The

## University Court A OAO

In this image we can see a building with windows and balconies. We can also see a tree and a bicycle. We can also see the sky with clouds.

In this image we can see a building with windows and balconies. We can also see a tree and a bicycle. We can also see the sky with clouds.

<!-- page_break -->

## HOUSERULESOFUNIVERSITYCOURT

## A. TERMINOLOGY

- 1)Agents.Any real estate broker,corporation,firm or individual empoweredtoactonbehalfofanyapartmentowner.
- 2)Board.TheBoard ofDirectorsof theAssociation ofOwners.
- 3)Bylaws.TheBylawsof the Association ofOwners,asamended fromtimetotime.
- 4)Declaration.The Declaration of Horizontal Property Regime of thiscondominiumproject,asitmaybeamendedfrom time to time.
- 5)Guest. A person who resides other than at the project,and visits thepremisesforaperiodof time atinvitation of aresident.
- 6)ManagingAgent.A firmwhich maybeappointedand/or employeehired by theBoard tomanage theproject andwhose duti

In [82]:
SPLIT_PATTERN = "\n"
chunks = document_text.split(SPLIT_PATTERN)

In [85]:
chunked_text = ""
for i, chunk in enumerate(chunks):
    if chunk.startswith("#"):
        chunk = f"#{chunk}"
    chunked_text += f"<|start_chunk_{i}>\n{chunk}<|end_chunk_{i}|>"

In [None]:
print(chunked_text)

In [100]:
#MODEL = "hf.co/google/gemma-3-12b-it-qat-q4_0-gguf:latest"
MODEL = "llama3:8b"
TEMPERATURE = 0.0
MIN_P = 0.0
REPEAT_PENALTY = 1.0
TOP_K = 64
TOP_P = 0.95

In [96]:
def call_model(prompt: str) -> str:
    response = chat(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
        keep_alive="1h",
        options={
            "num_ctx": 16384,
            "temperature": TEMPERATURE,
            "min_p": MIN_P,
            "repeat_penalty": REPEAT_PENALTY,
            "top_k": TOP_K,
            "top_p": TOP_P,
        },
    )
    return response.message.content

In [94]:
CHUNKING_PROMPT = """
You are an assistant specialized in splitting text into semantically consistent sections.

<instructions>
    <instruction>The text has been divided into chunks, each marked with <|start_chunk_X|> and <|end_chunk_X|> tags, where X is the chunk number</instruction>
    <instruction>Identify points where splits should occur, such that consecutive chunks of similar themese stay together</instruction>
    <instruction>Each chunk must be between 200 and 1000 words</instruction>
    <instruction>If chunks 1 and 2 belong together but chunk 3 starts a new topic, suggest a split after chunk 2</instruction>
    <instruction>The chunks must be listed in ascending order</instruction>
    <instruction>Provide your response in the form: 'split_after: 3, 5'</instruction>
</instructions>"

This is the document text:
<document>
{document_text}
</document>

Respond only with the IDs of the chunks where you believe a split should occur.
YOU MUST RESPOND WITH AT LEAST ONE SPLIT
""".strip()

In [None]:
prompt = CHUNKING_PROMPT.format(document_text=chunked_text)
print(prompt)

In [101]:
%%time
response = call_model(prompt)

CPU times: user 4.59 ms, sys: 9.32 ms, total: 13.9 ms
Wall time: 39.3 s


In [106]:
print(response)

Based on the text, I suggest the following splits:

split_after: 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, 66, 70, 74, 78, 82, 86, 90, 94, 98, 102, 106, 110, 114, 118, 122, 126


In [None]:
def split_text_by_llm_suggestions(chunked_text, llm_response):
    split_after = []
    if "split_after:" in llm_response:
        split_points = llm_response.split("split_after:")[1].strip()
        split_after = [int(x.strip()) for x in split_points.split(",")]

        print("split after:", split_after)

        #return whole text as one chunk if no splits were suggested
        if not split_after:
            print("returned whole chunk")
            return [chunked_text]

        chunk_pattern = r"<\|start_chunk_(\d+)\|?>(.*?)<\|end_chunk_\1\|>"
        chunks = re.findall(chunk_pattern, chunked_text, re.DOTALL)
        print("chunks: ", chunks)
        sections = []
        current_section = []

        for chunk_id, chunk_text in chunks:
            current_section.append(chunk_text)
            if int(chunk_id) in split_after:
                sections.append("".join(current_section).strip())
                current_section = []

        # add the last section if it's not empty
        if current_section:
            sections.append("".join(current_section).strip())

        return sections


In [None]:
llm_chunks = split_text_by_llm_suggestions(chunked_text, response)

In [128]:
len(llm_chunks)

33

In [142]:
print(llm_chunks[19])

- c)Installationsforelectrical power,telephone,television, radio,airconditioning,awningsor any otherpurpose whichshallbevisible fromorprojectfrom,orprotrude outside thephysical confinesofan apartment t orbe attached in anyway to the exterior of thebuilding are prohibitedwithout theprior approval of theBoard or ManagingAgent.
- d)Nameplate,signs,signalsorlettersvisibleoutsidean apartmentshall notbe inscribed,placedorexposed on or atanywindow,door,orparking stall unless approved by the Board or the Managing Agent."For Sale","For Rent" and"Open House" signs will be regulated by the Board or theManagingAgent.


In [208]:
from __future__ import annotations
import re, time, logging, concurrent.futures, pathlib, urllib.request
from functools import lru_cache
from typing import List

import spacy, kenlm
from huggingface_hub import hf_hub_download
from symspellpy import SymSpell
from wordfreq import zipf_frequency
from tqdm import tqdm

In [None]:
logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s",
                    level=logging.INFO)

logging.info("Loading KenLM 5-gram …")
KENLM_PATH = hf_hub_download("BramVanroy/kenlm_wikipedia_en",
                             "wiki_en_token.arpa.bin")
kenlm_model = kenlm.Model(KENLM_PATH)
logging.info("KenLM loaded.")

@lru_cache(maxsize=50_000)
def kenlm_score(txt: str) -> float:
    """Per-token cached perplexity."""
    return kenlm_model.perplexity(txt)

nlp = spacy.load("en_core_web_sm", disable=["ner", "parser", "tagger"])

FREQ_URL = ("https://raw.githubusercontent.com/mammothb/symspellpy/master/"
            "symspellpy/frequency_dictionary_en_82_765.txt")
FREQ_FILE = pathlib.Path("frequency_dictionary_en_82_765.txt")
if not FREQ_FILE.exists():
    logging.info("⬇️  downloading 3-MB SymSpell frequency list …")
    urllib.request.urlretrieve(FREQ_URL, FREQ_FILE)

sym = SymSpell(max_dictionary_edit_distance=0, prefix_length=7)
sym.load_dictionary(str(FREQ_FILE), 0, 1)

for term in ["bylaws", "universitycourt", "lanai"]:
    sym.create_dictionary_entry(term, 1)


def load_word_set(dict_path, min_zipf=2):
    words = set()
    with open(dict_path, "r", encoding="utf8") as f:
        for line in f:
            word = line.strip().split()[0]
            words.add(word.lower())
    return words

EN_WORDS = load_word_set(str(FREQ_FILE))

CAPS_SPLIT_RE = re.compile(r"(?<!^)(?=[A-Z][a-z])")  # ABCDef → ABCD ef

def _reconstruct_sentence_with_split(sentence_tokens: List[str], word_index: int, split_words: List[str]) -> str:
    """Reconstruct sentence with a specific word split for perplexity scoring"""
    reconstructed = sentence_tokens[:word_index] + split_words + sentence_tokens[word_index + 1:]
    return " ".join(reconstructed)

@lru_cache(maxsize=50_000)
def best_recursive_split_sentence_aware(word: str, sentence_context: str = "", word_position: int = -1) -> str:
    """
    Try all splits using sentence-level perplexity scoring instead of just Zipf frequency.
    Returns the split that results in the lowest sentence perplexity while maintaining constraints.
    """
    word_l = word.lower()
    if word_l in EN_WORDS:
        return word  # Already English

    # Parse sentence context if provided
    sentence_tokens = sentence_context.split() if sentence_context else [word]
    if word_position == -1:
        # Try to find word position in sentence
        try:
            word_position = sentence_tokens.index(word)
        except ValueError:
            # Word not found in sentence, fall back to isolated scoring
            sentence_tokens = [word]
            word_position = 0

    def _helper(s, non_english_used):
        if not s:
            return (float('inf'), [])  # No more to split, return high perplexity

        best_here = None

        # Try all splits (from 1 to len-1)
        for i in range(1, len(s)):  # i is split point
            left, right = s[:i], s[i:]
            left_l = left.lower()
            right_l = right.lower()

            left_in = left_l in EN_WORDS
            right_in = right_l in EN_WORDS

            # Count non-English segments
            non_eng = non_english_used
            if not left_in:
                non_eng += 1
            if not right_in:
                non_eng += 1

            # If more than one non-English segment, skip
            if non_eng > 1:
                continue

            # Must have at least one English subword in this split
            if not (left_in or right_in):
                continue

            # For each side, if it's not English, try splitting further
            left_best = (0, [left]) if left_in else _helper(left, non_eng)
            right_best = (0, [right]) if right_in else _helper(right, non_eng)

            if left_best[1] and right_best[1]:
                # Reconstruct sentence with this split
                split_words = left_best[1] + right_best[1]
                test_sentence = _reconstruct_sentence_with_split(sentence_tokens, word_position, split_words)

                # Score with sentence-level perplexity
                sentence_perplexity = kenlm_score(test_sentence)

                # Also consider word-level validity (hybrid approach)
                word_level_score = sum(zipf_frequency(w.lower(), "en") for w in split_words)

                # Combined score: prioritize sentence perplexity but consider word validity
                # Lower perplexity is better, higher Zipf is better
                combined_score = sentence_perplexity - (word_level_score * 0.1)  # Small word-level bonus

                if best_here is None or combined_score < best_here[0]:
                    best_here = (combined_score, split_words)

        # If no split found, return self (OOV, only if not already used non-English)
        if best_here is None and non_english_used == 0:
            # Score the original word in sentence context
            original_sentence = _reconstruct_sentence_with_split(sentence_tokens, word_position, [s])
            original_perplexity = kenlm_score(original_sentence)
            return (original_perplexity, [s])
        return best_here if best_here else (float('inf'), [])

    # Try all possible splits, return lowest perplexity
    result = _helper(word, 0)
    if result and result[1] and len(result[1]) > 1:
        # Compare with original word in sentence context
        original_sentence = _reconstruct_sentence_with_split(sentence_tokens, word_position, [word])
        original_perplexity = kenlm_score(original_sentence)

        # Only accept split if it significantly improves sentence perplexity
        if result[0] < original_perplexity * 0.85:  # 15% improvement threshold
            return " ".join(result[1])

    return word  # Could not split or no significant improvement

# Fallback to original function for backward compatibility
@lru_cache(maxsize=50_000)
def best_recursive_split(word: str) -> str:
    """
    Original Zipf-based splitting for backward compatibility.
    Use best_recursive_split_sentence_aware for better results.
    """
    return best_recursive_split_sentence_aware(word)

def split_caps(token: str) -> str:
    """HOUSEOFCOURT → HOUSE OF COURT (greedy ALL-CAPS splitter)."""
    pieces, buf = [], ""
    for c in token:
        if buf and buf[-1].isupper() and c.isupper():
            buf += c
        elif buf:
            pieces.append(buf); buf = c
        else:
            buf = c
    pieces.append(buf)
    return " ".join(CAPS_SPLIT_RE.sub(" ", p) for p in pieces)

CAMEL_RE = re.compile(r"(?<=[a-z])(?=[A-Z])")
def split_camel(token: str) -> str:
    """theBoardbut → the Boardbut  (first pass – SymSpell can refine later)."""
    return CAMEL_RE.sub(" ", token)

def symspell_segments(word: str, k: int = 3) -> List[str]:
    comp = sym.word_segmentation(word.lower())          # ← NO extra kwargs
    seg  = comp.segmented_string if hasattr(comp, "segmented_string") else comp
    return [seg]

def beam_split(word: str,
               beam_size: int = 4,
               max_depth: int = 4,
               min_zipf: float = 2) -> str:
    L = len(word)
    beams = [(0, "", 0.0)]  # (idx, sofar, ppl)
    for _ in range(max_depth):
        nxt = []
        for idx, sofar, _ in beams:
            for j in range(idx + 3, min(idx + 16, L)):
                part = word[idx:j]
                if zipf_frequency(part, "en") < min_zipf:
                    continue
                candidate = (sofar + " " + part).strip()
                ppl = kenlm_score(" ".join(t.text for t in nlp(candidate)))
                nxt.append((j, candidate, ppl))
        if not nxt:
            break
        beams = sorted(nxt, key=lambda x: x[2])[:beam_size]
        for idx, cand, _ in beams:
            if idx == L:
                return cand
    return word

def candidate_splits(token: str, sentence_context: str = "", word_position: int = -1, use_beam: bool = True) -> List[str]:
    """Return several candidate splits (best first) with sentence context awareness."""
    # Use sentence-aware splitting when context is available
    if sentence_context and word_position >= 0:
        best_rec = best_recursive_split_sentence_aware(token, sentence_context, word_position)
    else:
        best_rec = best_recursive_split(token)

    if best_rec != token and " " in best_rec:
        return [best_rec]

    if token.isupper():
        return [split_caps(token)]

    if CAMEL_RE.search(token):
        camel = split_camel(token)
        # if camel already yields at least one space, let SymSpell refine
        if " " in camel:
            return [camel]

    segs = symspell_segments(token, k=3)
    good = [s for s in segs
            if " " in s and all(zipf_frequency(w, "en") > 2
                                for w in s.split())]

    if use_beam and not good and len(token) > 25:
        beam = beam_split(token)
        if beam != token:
            good.append(beam)

    if token not in good:
        good.append(token)

    return good

def unsmash_sentence(sent: str,
                     ratio_thresh: float = 0.65,
                     abs_thresh: float = 40) -> str:
    """Per-sentence unsmasher with sentence-aware splitting"""
    doc = nlp(sent)
    sentence_tokens = [tk.text for tk in doc]
    out = []

    for i, tk in enumerate(doc):
        w = tk.text
        if w.isalpha() and len(w) > 8:
            # Use sentence context for better splitting decisions
            cands = candidate_splits(w, sent, i)

            # accept immediately if all parts are common words
            if " " in cands[0] and all(zipf_frequency(p, "en") > 2 for p in cands[0].split()):
                out.append(cands[0])
                continue

            # Score candidates in sentence context
            scored = []
            for c in cands:
                # Reconstruct sentence with this candidate
                test_tokens = sentence_tokens[:i] + c.split() + sentence_tokens[i+1:]
                test_sentence = " ".join(test_tokens)
                ppl = kenlm_score(test_sentence)
                scored.append((ppl, c))

            best_ppl, best = min(scored, key=lambda x: x[0])

            # Compare with original sentence
            orig_ppl = kenlm_score(sent)

            if best_ppl < orig_ppl * ratio_thresh or orig_ppl - best_ppl > abs_thresh:
                out.append(best)
            else:
                out.append(w)
        else:
            out.append(w)
    return " ".join(out)

def clean_ocr_text(text: str, max_workers: int = 8) -> str:
    sentences = re.split(r"(?<=[.?!])\s+", text)
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as ex:
        cleaned = list(ex.map(unsmash_sentence, sentences))
    return "\n".join(cleaned)

IMAGE_PLACEHOLDER = "<!-- image_placeholder -->"
PAGE_BREAK_PLACEHOLDER = "<!-- page_break -->"
PLACEHOLDERS = {IMAGE_PLACEHOLDER, PAGE_BREAK_PLACEHOLDER}
PLACEHOLDER_RE = re.compile(
    rf"({re.escape(IMAGE_PLACEHOLDER)}|{re.escape(PAGE_BREAK_PLACEHOLDER)})"
)

def clean_with_placeholders(doc_text: str) -> str:
    parts = PLACEHOLDER_RE.split(doc_text)
    cleaned_parts = []
    for part in parts:
        if part in PLACEHOLDERS:
            cleaned_parts.append(part)
        elif part.strip():
            cleaned_parts.append(clean_ocr_text(part))
        else:
            cleaned_parts.append(part)
    return "".join(cleaned_parts)

2025-07-15 11:45:32,037 | INFO | Loading KenLM 5-gram …
2025-07-15 11:45:36,836 | INFO | KenLM loaded.


In [228]:
cleaned = clean_ocr_text(text)
print(cleaned)

# # House Rules Of The 

 # # University Court A OAO 

 In this image we can see a building with windows and balconies .
We can also see a tree and a bicycle .
We can also see the sky with clouds .
< ! -- image_placeholder -- > 

 < ! -- page_break -- > 

 # # HOUSERULESOFUNIVERSITYCOURT 

 # # A.
TERMINOLOGY 

 - 1)Agents . Any real estate broker , corporation , firm or individual empowered to acton behalf of any apartment owner .
- 2)Board . TheBoard of Directorsof the Association ofOwners .
- 3)Bylaws . The Bylawsof the Association ofOwners , as amended from time to time .
- 4)Declaration . The Declaration of Horizontal Property Regime of this condominium project , as it maybe amended from time to time .
- 5)Guest .
A person who resides other than at the project , and visits the premises for a period of time at invitation of a resident .
- 6)ManagingAgent . A firm which maybe appointed and / or employee hired by theBoard tomanage the project andwhose duties are outlined inthe Bylaws