In [3]:
import base64
import re
import textwrap
from io import BytesIO
from pathlib import Path

import numpy as np
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    RapidOcrOptions,
    smolvlm_picture_description,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from dotenv import load_dotenv
from IPython.display import HTML, display
from ollama import chat
from PIL import Image
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

load_dotenv()

True

In [4]:
pipeline_options = PdfPipelineOptions(
    generate_page_images=True,
    images_scale=1.00,
    do_ocr=True,
    do_picture_description=True,
    ocr_options=RapidOcrOptions(),
    picture_description_options=smolvlm_picture_description,
)

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)

In [5]:
#document_path = Path("test_data/01. House Rules - Current Version.pdf")
document_path = Path("test_data/House Rules.pdf")
document_path

PosixPath('test_data/House Rules.pdf')

In [6]:
%%time

result = converter.convert(document_path)



CPU times: user 1min 6s, sys: 15.6 s, total: 1min 22s
Wall time: 39.1 s


In [7]:
document = result.document

In [None]:
print(document.export_to_markdown())

In [61]:
#print(document.pictures[0].annotations)
print(document.pictures[0])

self_ref='#/pictures/0' parent=RefItem(cref='#/body') children=[RefItem(cref='#/texts/2')] content_layer=<ContentLayer.BODY: 'body'> label=<DocItemLabel.PICTURE: 'picture'> prov=[ProvenanceItem(page_no=1, bbox=BoundingBox(l=110.65599822998047, t=489.8112487792969, r=494.0281982421875, b=200.79742431640625, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 0))] captions=[] references=[] footnotes=[] image=None annotations=[DescriptionAnnotation(kind='description', text='In this image we can see a building with windows and balconies. We can also see a tree and a bicycle. We can also see the sky with clouds.', provenance='HuggingFaceTB/SmolVLM-256M-Instruct')]


In [60]:
print("Number of pictures:", len(document.pictures))
for i, pic in enumerate(document.pictures):
    print(f"Picture {i}: loaded image: {pic.image is not None}")

Number of pictures: 1
Picture 0: loaded image: False


In [163]:
annotations = []
for picture in document.pictures:
    for annotation in picture.annotations:
            annotations.append(annotation.text)
assert len(annotations) == len(document.pictures)

In [164]:
def replace_occurences(text, target, replacements):
    for replacement in replacements:
        if target in text:
            text = text.replace(target, replacement, 1)
        else:
            raise ValueError(
                f"No more occurences of '{target}' found in the text for replacement ({replacement})."
            )
    return text

In [9]:
IMAGE_PLACEHOLDER = "<!-- image_placeholder -->"
PAGE_BREAK_PLACEHOLDER = "<!-- page_break -->"
text = document.export_to_markdown(
    page_break_placeholder=PAGE_BREAK_PLACEHOLDER, image_placeholder=IMAGE_PLACEHOLDER
)

In [None]:
print(text)

In [None]:
print(replace_occurences(text, IMAGE_PLACEHOLDER, annotations))

In [98]:
def process_document(
    document_path: Path, converter: DocumentConverter, n_pages: int = -1
) -> str:
    result = converter.convert(document_path)
    document = result.document

    annotations = []
    for picture in document.pictures:
        print(picture)
        for annotation in picture.annotations:
            annotations.append(annotation.text)

    if(len(annotations) == len(document.pictures)):
        print("mismatch in number of annotations and number or pictures")
    text = document.export_to_markdown(
        page_break_placeholder=PAGE_BREAK_PLACEHOLDER,
        image_placeholder=IMAGE_PLACEHOLDER,
    )
    text = replace_occurences(text, IMAGE_PLACEHOLDER, annotations)
    if n_pages == -1:
        return text
    return PAGE_BREAK_PLACEHOLDER.join(text.split(PAGE_BREAK_PLACEHOLDER)[:n_pages])

In [79]:
%%time

document_path = Path("test_data/House Rules.pdf")
document_text = process_document(document_path, converter, n_pages=12)

self_ref='#/pictures/0' parent=RefItem(cref='#/body') children=[RefItem(cref='#/texts/2')] content_layer=<ContentLayer.BODY: 'body'> label=<DocItemLabel.PICTURE: 'picture'> prov=[ProvenanceItem(page_no=1, bbox=BoundingBox(l=110.65599822998047, t=489.8112487792969, r=494.0281982421875, b=200.79742431640625, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 0))] captions=[] references=[] footnotes=[] image=None annotations=[DescriptionAnnotation(kind='description', text='In this image we can see a building with windows and balconies. We can also see a tree and a bicycle. We can also see the sky with clouds.', provenance='HuggingFaceTB/SmolVLM-256M-Instruct')]


In [80]:
print(len(document_text.split(" ")))

1615


In [None]:
print(document_text)

In [82]:
SPLIT_PATTERN = "\n"
chunks = document_text.split(SPLIT_PATTERN)

In [85]:
chunked_text = ""
for i, chunk in enumerate(chunks):
    if chunk.startswith("#"):
        chunk = f"#{chunk}"
    chunked_text += f"<|start_chunk_{i}>\n{chunk}<|end_chunk_{i}|>"

In [None]:
print(chunked_text)

In [100]:
#MODEL = "hf.co/google/gemma-3-12b-it-qat-q4_0-gguf:latest"
MODEL = "llama3:8b"
TEMPERATURE = 0.0
MIN_P = 0.0
REPEAT_PENALTY = 1.0
TOP_K = 64
TOP_P = 0.95

In [96]:
def call_model(prompt: str) -> str:
    response = chat(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
        keep_alive="1h",
        options={
            "num_ctx": 16384,
            "temperature": TEMPERATURE,
            "min_p": MIN_P,
            "repeat_penalty": REPEAT_PENALTY,
            "top_k": TOP_K,
            "top_p": TOP_P,
        },
    )
    return response.message.content

In [94]:
CHUNKING_PROMPT = """
You are an assistant specialized in splitting text into semantically consistent sections.

<instructions>
    <instruction>The text has been divided into chunks, each marked with <|start_chunk_X|> and <|end_chunk_X|> tags, where X is the chunk number</instruction>
    <instruction>Identify points where splits should occur, such that consecutive chunks of similar themese stay together</instruction>
    <instruction>Each chunk must be between 200 and 1000 words</instruction>
    <instruction>If chunks 1 and 2 belong together but chunk 3 starts a new topic, suggest a split after chunk 2</instruction>
    <instruction>The chunks must be listed in ascending order</instruction>
    <instruction>Provide your response in the form: 'split_after: 3, 5'</instruction>
</instructions>"

This is the document text:
<document>
{document_text}
</document>

Respond only with the IDs of the chunks where you believe a split should occur.
YOU MUST RESPOND WITH AT LEAST ONE SPLIT
""".strip()

In [None]:
prompt = CHUNKING_PROMPT.format(document_text=chunked_text)
print(prompt)

In [101]:
%%time
response = call_model(prompt)

CPU times: user 4.59 ms, sys: 9.32 ms, total: 13.9 ms
Wall time: 39.3 s


In [106]:
print(response)

Based on the text, I suggest the following splits:

split_after: 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, 66, 70, 74, 78, 82, 86, 90, 94, 98, 102, 106, 110, 114, 118, 122, 126


In [None]:
def split_text_by_llm_suggestions(chunked_text, llm_response):
    split_after = []
    if "split_after:" in llm_response:
        split_points = llm_response.split("split_after:")[1].strip()
        split_after = [int(x.strip()) for x in split_points.split(",")]

        print("split after:", split_after)

        #return whole text as one chunk if no splits were suggested
        if not split_after:
            print("returned whole chunk")
            return [chunked_text]

        chunk_pattern = r"<\|start_chunk_(\d+)\|?>(.*?)<\|end_chunk_\1\|>"
        chunks = re.findall(chunk_pattern, chunked_text, re.DOTALL)
        print("chunks: ", chunks)
        sections = []
        current_section = []

        for chunk_id, chunk_text in chunks:
            current_section.append(chunk_text)
            if int(chunk_id) in split_after:
                sections.append("".join(current_section).strip())
                current_section = []

        # add the last section if it's not empty
        if current_section:
            sections.append("".join(current_section).strip())

        return sections


In [None]:
llm_chunks = split_text_by_llm_suggestions(chunked_text, response)

In [128]:
len(llm_chunks)

33

In [142]:
print(llm_chunks[19])

- c)Installationsforelectrical power,telephone,television, radio,airconditioning,awningsor any otherpurpose whichshallbevisible fromorprojectfrom,orprotrude outside thephysical confinesofan apartment t orbe attached in anyway to the exterior of thebuilding are prohibitedwithout theprior approval of theBoard or ManagingAgent.
- d)Nameplate,signs,signalsorlettersvisibleoutsidean apartmentshall notbe inscribed,placedorexposed on or atanywindow,door,orparking stall unless approved by the Board or the Managing Agent."For Sale","For Rent" and"Open House" signs will be regulated by the Board or theManagingAgent.


In [208]:
from __future__ import annotations
import re, time, logging, concurrent.futures, pathlib, urllib.request
from functools import lru_cache
from typing import List

import spacy, kenlm
from huggingface_hub import hf_hub_download
from symspellpy import SymSpell
from wordfreq import zipf_frequency
from tqdm import tqdm

In [15]:
logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s",
                    level=logging.INFO)

logging.info("Loading KenLM 5-gram …")
KENLM_PATH = hf_hub_download("BramVanroy/kenlm_wikipedia_en",
                             "wiki_en_token.arpa.bin")
kenlm_model = kenlm.Model(KENLM_PATH)
logging.info("KenLM loaded.")

@lru_cache(maxsize=50_000)
def kenlm_score(txt: str) -> float:
    """Per-token cached perplexity."""
    return kenlm_model.perplexity(txt)

nlp = spacy.load("en_core_web_sm", disable=["ner", "parser", "tagger"])

FREQ_URL = ("https://raw.githubusercontent.com/mammothb/symspellpy/master/"
            "symspellpy/frequency_dictionary_en_82_765.txt")
FREQ_FILE = pathlib.Path("frequency_dictionary_en_82_765.txt")
if not FREQ_FILE.exists():
    logging.info("⬇️  downloading 3-MB SymSpell frequency list …")
    urllib.request.urlretrieve(FREQ_URL, FREQ_FILE)

sym = SymSpell(max_dictionary_edit_distance=0, prefix_length=7)
sym.load_dictionary(str(FREQ_FILE), 0, 1)

for term in ["bylaws", "universitycourt", "lanai"]:
    sym.create_dictionary_entry(term, 1)


def load_word_set(dict_path, min_zipf=2):
    words = set()
    with open(dict_path, "r", encoding="utf8") as f:
        for line in f:
            word = line.strip().split()[0]
            words.add(word.lower())
    return words

EN_WORDS = load_word_set(str(FREQ_FILE))

CAPS_SPLIT_RE = re.compile(r"(?<!^)(?=[A-Z][a-z])")  # ABCDef → ABCD ef

def _reconstruct_sentence_with_split(sentence_tokens: List[str], word_index: int, split_words: List[str]) -> str:
    """Reconstruct sentence with a specific word split for perplexity scoring"""
    reconstructed = sentence_tokens[:word_index] + split_words + sentence_tokens[word_index + 1:]
    return " ".join(reconstructed)

@lru_cache(maxsize=50_000)
def best_recursive_split_sentence_aware(word: str, sentence_context: str = "", word_position: int = -1) -> str:
    # Parse sentence context
    sentence_tokens = sentence_context.split() if sentence_context else [word]
    if word_position == -1:
        try:
            word_position = sentence_tokens.index(word)
        except ValueError:
            sentence_tokens = [word]
            word_position = 0

    # Score original word in context
    original_sentence = _reconstruct_sentence_with_split(sentence_tokens, word_position, [word])
    original_perplexity = kenlm_score(original_sentence)
    
    # Try all possible splits
    best_split = word
    best_perplexity = original_perplexity
    
    def _try_all_splits(s, start_pos=0):
        nonlocal best_split, best_perplexity
        
        for i in range(1, len(s)):
            left, right = s[:i], s[i:]
            
            # Only require that split parts are reasonable words (Zipf > 1.0)
            if (zipf_frequency(left.lower(), "en") > 1.0 and 
                zipf_frequency(right.lower(), "en") > 1.0):
                
                split_words = [left, right]
                test_sentence = _reconstruct_sentence_with_split(sentence_tokens, word_position, split_words)
                perplexity = kenlm_score(test_sentence)
                
                if perplexity < best_perplexity:
                    best_perplexity = perplexity
                    best_split = " ".join(split_words)
    
    _try_all_splits(word)
    
    # Only return split if it significantly improves perplexity
    if best_perplexity < original_perplexity * 0.90:  # 10% improvement threshold
        return best_split
    
    return word

# Fallback to original function for backward compatibility
@lru_cache(maxsize=50_000)
def best_recursive_split(word: str) -> str:
    """
    Original Zipf-based splitting for backward compatibility.
    Use best_recursive_split_sentence_aware for better results.
    """
    return best_recursive_split_sentence_aware(word)

def split_caps(token: str) -> str:
    """HOUSEOFCOURT → HOUSE OF COURT (greedy ALL-CAPS splitter)."""
    pieces, buf = [], ""
    for c in token:
        if buf and buf[-1].isupper() and c.isupper():
            buf += c
        elif buf:
            pieces.append(buf); buf = c
        else:
            buf = c
    pieces.append(buf)
    return " ".join(CAPS_SPLIT_RE.sub(" ", p) for p in pieces)

CAMEL_RE = re.compile(r"(?<=[a-z])(?=[A-Z])")
def split_camel(token: str) -> str:
    """theBoardbut → the Boardbut  (first pass – SymSpell can refine later)."""
    return CAMEL_RE.sub(" ", token)

def symspell_segments(word: str, k: int = 3) -> List[str]:
    comp = sym.word_segmentation(word.lower())          # ← NO extra kwargs
    seg  = comp.segmented_string if hasattr(comp, "segmented_string") else comp
    return [seg]

def beam_split(word: str,
               beam_size: int = 4,
               max_depth: int = 4,
               min_zipf: float = 2) -> str:
    L = len(word)
    beams = [(0, "", 0.0)]  # (idx, sofar, ppl)
    for _ in range(max_depth):
        nxt = []
        for idx, sofar, _ in beams:
            for j in range(idx + 3, min(idx + 16, L)):
                part = word[idx:j]
                if zipf_frequency(part, "en") < min_zipf:
                    continue
                candidate = (sofar + " " + part).strip()
                ppl = kenlm_score(" ".join(t.text for t in nlp(candidate)))
                nxt.append((j, candidate, ppl))
        if not nxt:
            break
        beams = sorted(nxt, key=lambda x: x[2])[:beam_size]
        for idx, cand, _ in beams:
            if idx == L:
                return cand
    return word

def candidate_splits(token: str, sentence_context: str = "", word_position: int = -1, use_beam: bool = True) -> List[str]:
    """Return several candidate splits (best first) with sentence context awareness."""
    # Use sentence-aware splitting when context is available
    if sentence_context and word_position >= 0:
        best_rec = best_recursive_split_sentence_aware(token, sentence_context, word_position)
    else:
        best_rec = best_recursive_split(token)

    if best_rec != token and " " in best_rec:
        return [best_rec]

    if token.isupper():
        return [split_caps(token)]

    if CAMEL_RE.search(token):
        camel = split_camel(token)
        # if camel already yields at least one space, let SymSpell refine
        if " " in camel:
            return [camel]

    segs = symspell_segments(token, k=3)
    good = [s for s in segs
            if " " in s and all(zipf_frequency(w, "en") > 2
                                for w in s.split())]

    if use_beam and not good and len(token) > 25:
        beam = beam_split(token)
        if beam != token:
            good.append(beam)

    if token not in good:
        good.append(token)

    return good

def unsmash_sentence(sent: str,
                     ratio_thresh: float = 0.65,
                     abs_thresh: float = 40) -> str:
    """Per-sentence unsmasher with sentence-aware splitting"""
    doc = nlp(sent)
    sentence_tokens = [tk.text for tk in doc]
    out = []

    for i, tk in enumerate(doc):
        w = tk.text
        if w.isalpha() and len(w) > 8:
            # Use sentence context for better splitting decisions
            cands = candidate_splits(w, sent, i)

            # accept immediately if all parts are common words
            if " " in cands[0] and all(zipf_frequency(p, "en") > 2 for p in cands[0].split()):
                out.append(cands[0])
                continue

            # Score candidates in sentence context
            scored = []
            for c in cands:
                # Reconstruct sentence with this candidate
                test_tokens = sentence_tokens[:i] + c.split() + sentence_tokens[i+1:]
                test_sentence = " ".join(test_tokens)
                ppl = kenlm_score(test_sentence)
                scored.append((ppl, c))

            best_ppl, best = min(scored, key=lambda x: x[0])

            # Compare with original sentence
            orig_ppl = kenlm_score(sent)

            if best_ppl < orig_ppl * ratio_thresh or orig_ppl - best_ppl > abs_thresh:
                out.append(best)
            else:
                out.append(w)
        else:
            out.append(w)
    return " ".join(out)

def clean_ocr_text(text: str, max_workers: int = 8) -> str:
    sentences = re.split(r"(?<=[.?!])\s+", text)
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as ex:
        cleaned = list(ex.map(unsmash_sentence, sentences))
    return "\n".join(cleaned)

IMAGE_PLACEHOLDER = "<!-- image_placeholder -->"
PAGE_BREAK_PLACEHOLDER = "<!-- page_break -->"
PLACEHOLDERS = {IMAGE_PLACEHOLDER, PAGE_BREAK_PLACEHOLDER}
PLACEHOLDER_RE = re.compile(
    rf"({re.escape(IMAGE_PLACEHOLDER)}|{re.escape(PAGE_BREAK_PLACEHOLDER)})"
)

def clean_with_placeholders(doc_text: str) -> str:
    parts = PLACEHOLDER_RE.split(doc_text)
    cleaned_parts = []
    for part in parts:
        if part in PLACEHOLDERS:
            cleaned_parts.append(part)
        elif part.strip():
            cleaned_parts.append(clean_ocr_text(part))
        else:
            cleaned_parts.append(part)
    return "".join(cleaned_parts)

2025-07-16 21:42:22,678 | INFO | Loading KenLM 5-gram …
2025-07-16 21:42:26,579 | INFO | KenLM loaded.
2025-07-16 21:42:26,823 | INFO | ⬇️  downloading 3-MB SymSpell frequency list …


URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1010)>

In [None]:
cleaned = clean_ocr_text(text)
print(cleaned)

In [16]:
import concurrent.futures
import logging
import os
import pathlib
import re
import urllib.request
from functools import lru_cache
from typing import List, Optional

import kenlm
import spacy
from huggingface_hub import hf_hub_download
from symspellpy import SymSpell
from wordfreq import zipf_frequency

logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(message)s", level=logging.INFO
)


class TextCleaningService:

    def __init__(self, models_dir: str = "app/models"):
        """Initialize the text cleaning service with model loading."""
        self.models_dir = pathlib.Path(models_dir)
        self.models_dir.mkdir(exist_ok=True)

        # Model paths
        self.kenlm_path = None
        self.freq_file = self.models_dir / "frequency_dictionary_en_82_765.txt"

        # Models
        self.kenlm_model = None
        self.nlp = None
        self.sym = None
        self.en_words = None

        # Regex patterns
        self.camel_re = re.compile(r"(?<=[a-z])(?=[A-Z])")

        # Placeholders for document structure
        self.image_placeholder = "<!-- image_placeholder -->"
        self.page_break_placeholder = "<!-- page_break -->"
        self.placeholders = {self.image_placeholder, self.page_break_placeholder}
        self.placeholder_re = re.compile(
            rf"({re.escape(self.image_placeholder)}|{re.escape(self.page_break_placeholder)})"
        )

        # Initialize models
        self._load_models()

    def _load_models(self):
        self._load_kenlm_model()
        self._load_spacy_model()
        self._load_symspell_model()

    def _load_kenlm_model(self):
        """Load KenLM model, downloading if not present."""
        try:
            logging.info("Loading KenLM 5-gram model...")

            # Check if model exists in models directory (direct path)
            kenlm_local_path = self.models_dir / "wiki_en_token.arpa.bin"

            # Check if model exists in HuggingFace cache structure
            hf_cache_pattern = (
                self.models_dir
                / "models--BramVanroy--kenlm_wikipedia_en"
                / "snapshots"
                / "*"
                / "wiki_en_token.arpa.bin"
            )

            # Try to find existing model in HF cache structure
            import glob

            hf_cached_files = glob.glob(str(hf_cache_pattern))

            if kenlm_local_path.exists():
                logging.info(f"Using local KenLM model: {kenlm_local_path}")
                self.kenlm_path = str(kenlm_local_path)
            elif hf_cached_files:
                # Use the first found cached model
                self.kenlm_path = hf_cached_files[0]
                logging.info(f"Using cached HuggingFace KenLM model: {self.kenlm_path}")
            else:
                logging.info("Downloading KenLM model from HuggingFace...")
                self.kenlm_path = hf_hub_download(
                    "BramVanroy/kenlm_wikipedia_en",
                    "wiki_en_token.arpa.bin",
                    cache_dir=str(self.models_dir),
                )
                logging.info(f"KenLM model downloaded to: {self.kenlm_path}")

            self.kenlm_model = kenlm.Model(self.kenlm_path)
            logging.info("KenLM model loaded successfully.")

        except Exception as e:
            logging.error(f"Failed to load KenLM model: {e}")
            raise

    def _load_spacy_model(self):
        """Load spaCy model for tokenization."""
        try:
            logging.info("Loading spaCy model...")
            self.nlp = spacy.load(
                "en_core_web_sm", disable=["ner", "parser", "tagger", "lemmatizer"]
            )
            logging.info("spaCy model loaded successfully.")
        except Exception as e:
            logging.error(f"Failed to load spaCy model: {e}")
            raise

    def _load_symspell_model(self):
        """Load SymSpell model and frequency dictionary."""
        try:
            logging.info("Loading SymSpell model...")

            # Download frequency dictionary if not present
            if not self.freq_file.exists():
                freq_url = (
                    "https://raw.githubusercontent.com/mammothb/symspellpy/master/"
                    "symspellpy/frequency_dictionary_en_82_765.txt"
                )
                logging.info("Downloading SymSpell frequency dictionary...")
                urllib.request.urlretrieve(freq_url, self.freq_file)
                logging.info("Frequency dictionary downloaded.")

            # Initialize SymSpell
            self.sym = SymSpell(max_dictionary_edit_distance=0, prefix_length=7)
            self.sym.load_dictionary(str(self.freq_file), 0, 1)

            # Add custom terms
            for term in ["bylaws", "universitycourt", "lanai"]:
                self.sym.create_dictionary_entry(term, 1)

            # Load word set
            self.en_words = self._load_word_set(str(self.freq_file))

            logging.info("SymSpell model loaded successfully.")

        except Exception as e:
            logging.error(f"Failed to load SymSpell model: {e}")
            raise

    def _load_word_set(self, dict_path: str, min_zipf: float = 2) -> set:
        """Load word set from frequency dictionary."""
        words = set()
        with open(dict_path, "r", encoding="utf8") as f:
            for line in f:
                word = line.strip().split()[0]
                words.add(word.lower())
        return words

    @lru_cache(maxsize=10000)
    def _kenlm_score(self, txt: str) -> float:
        """Per-token cached perplexity scoring."""
        return self.kenlm_model.perplexity(txt)

    def _reconstruct_sentence_with_split(
        self, sentence_tokens: List[str], word_index: int, split_words: List[str]
    ) -> str:
        """Reconstruct sentence with a specific word split for perplexity scoring."""
        reconstructed = (
            sentence_tokens[:word_index]
            + split_words
            + sentence_tokens[word_index + 1 :]
        )
        return " ".join(reconstructed)

    @lru_cache(maxsize=50_000)
    def _recursive_split_with_context(
        self, word: str, sentence_context: str = "", word_position: int = -1
    ) -> str:
        """
        Recursive splitting with sentence context awareness and perplexity checking.
        This is the main splitting strategy.
        """
        # Parse sentence context
        sentence_tokens = sentence_context.split() if sentence_context else [word]
        if word_position == -1:
            try:
                word_position = sentence_tokens.index(word)
            except ValueError:
                sentence_tokens = [word]
                word_position = 0

        # Score original word in context
        original_sentence = self._reconstruct_sentence_with_split(
            sentence_tokens, word_position, [word]
        )
        original_perplexity = self._kenlm_score(original_sentence)

        # Try all possible splits
        best_split = word
        best_perplexity = original_perplexity

        def _try_all_splits(s, start_pos=0):
            nonlocal best_split, best_perplexity

            for i in range(1, len(s)):
                left, right = s[:i], s[i:]

                # Only require that split parts are reasonable words (Zipf > 1.0)
                if (
                    zipf_frequency(left.lower(), "en") > 1.0
                    and zipf_frequency(right.lower(), "en") > 1.0
                ):

                    split_words = [left, right]
                    test_sentence = self._reconstruct_sentence_with_split(
                        sentence_tokens, word_position, split_words
                    )
                    perplexity = self._kenlm_score(test_sentence)

                    if perplexity < best_perplexity:
                        best_perplexity = perplexity
                        best_split = " ".join(split_words)

        _try_all_splits(word)

        # Only return split if it significantly improves perplexity
        if best_perplexity < original_perplexity * 0.90:  # 10% improvement threshold
            return best_split

        return word

    def _split_camel_case(self, token: str) -> str:
        """Split camel case: theBoardbut → the Boardbut"""
        return self.camel_re.sub(" ", token)

    def _symspell_segments(self, word: str) -> List[str]:
        """Get SymSpell segmentation suggestions."""
        comp = self.sym.word_segmentation(word.lower())
        seg = comp.segmented_string if hasattr(comp, "segmented_string") else comp
        return [seg]

    def _get_candidate_splits(
        self, token: str, sentence_context: str = "", word_position: int = -1
    ) -> List[str]:
        """
        Return candidate splits using recursive splitting, camel case, and SymSpell.
        Beam search and all caps splitting have been removed.
        """
        # Use sentence-aware recursive splitting when context is available
        if sentence_context and word_position >= 0:
            best_recursive = self._recursive_split_with_context(
                token, sentence_context, word_position
            )
        else:
            best_recursive = self._recursive_split_with_context(token)

        # if a split was made
        if best_recursive != token and " " in best_recursive:
            return [best_recursive]

        # Try camel case splitting
        if self.camel_re.search(token):
            camel_split = self._split_camel_case(token)
            # if camel already yields at least one space, return it
            if " " in camel_split:
                return [camel_split]

        # Try SymSpell segmentation as fallback
        segs = self._symspell_segments(token)
        good = [
            s
            for s in segs
            if " " in s and all(zipf_frequency(w, "en") > 2 for w in s.split())
        ]

        if good:
            return good

        # Return original token if no good splits found
        return [token]

    def clean_sentence(
        self, sentence: str, ratio_thresh: float = 0.65, abs_thresh: float = 40
    ) -> str:
        """
        Clean a single sentence using recursive splitting and camel case splitting only.
        """
        doc = self.nlp(sentence)
        sentence_tokens = [tk.text for tk in doc]
        out = []

        for i, tk in enumerate(doc):
            w = tk.text

            if w.isalpha() and len(w) > 8:
                candidates = self._get_candidate_splits(w, sentence, i)

                # Accept immediately if all parts are common words
                if " " in candidates[0] and all(
                    zipf_frequency(p, "en") > 2 for p in candidates[0].split()
                ):
                    out.append(candidates[0])
                    continue

                # Score candidates in sentence context
                scored = []
                for c in candidates:
                    # Reconstruct sentence with this candidate
                    test_tokens = (
                        sentence_tokens[:i] + c.split() + sentence_tokens[i + 1 :]
                    )
                    test_sentence = " ".join(test_tokens)
                    ppl = self._kenlm_score(test_sentence)
                    scored.append((ppl, c))

                best_ppl, best = min(scored, key=lambda x: x[0])

                # Compare with original sentence
                orig_ppl = self._kenlm_score(sentence)

                if (
                    best_ppl < orig_ppl * ratio_thresh
                    or orig_ppl - best_ppl > abs_thresh
                ):
                    out.append(best)
                else:
                    out.append(w)
            else:
                out.append(w)

        return " ".join(out)

    def clean_text(self, text: str, max_workers: int = 8) -> str:
        sentences = re.split(r"(?<=[.?!])\s+", text)
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            cleaned = list(executor.map(self.clean_sentence, sentences))
        return "\n".join(cleaned)

    def clean_document_with_placeholders(self, doc_text: str) -> str:
        """
        Clean document text while preserving image and page break placeholders.
        """
        parts = self.placeholder_re.split(doc_text)
        cleaned_parts = []

        for part in parts:
            if part in self.placeholders:
                cleaned_parts.append(part)
            elif part.strip():
                cleaned_parts.append(self.clean_text(part))
            else:
                cleaned_parts.append(part)

        return "".join(cleaned_parts)


# Global service instance
_text_cleaning_service = None


def get_text_cleaning_service() -> TextCleaningService:
    """Get or create the global text cleaning service instance."""
    global _text_cleaning_service
    if _text_cleaning_service is None:
        _text_cleaning_service = TextCleaningService()
    return _text_cleaning_service


In [17]:
cleaningService = get_text_cleaning_service()

2025-07-16 21:47:05,489 | INFO | Loading KenLM 5-gram model...
2025-07-16 21:47:05,491 | INFO | Using cached HuggingFace KenLM model: app/models/models--BramVanroy--kenlm_wikipedia_en/snapshots/744f3005d30510c09ae7302baf62d3cdc0a2bdec/wiki_en_token.arpa.bin
2025-07-16 21:47:09,706 | INFO | KenLM model loaded successfully.
2025-07-16 21:47:09,710 | INFO | Loading spaCy model...
2025-07-16 21:47:09,997 | INFO | spaCy model loaded successfully.
2025-07-16 21:47:09,997 | INFO | Loading SymSpell model...
2025-07-16 21:47:10,596 | INFO | SymSpell model loaded successfully.


In [20]:
cleanedText = cleaningService.clean_document_with_placeholders(text)

In [21]:
print(cleanedText)

# # House Rules Of The 

 # # University Court A OAO 

 In this image we can see a building with windows and balconies .
We can also see a tree and a bicycle .
We can also see the sky with clouds .
<!-- image_placeholder -->

<!-- page_break -->

 # # house rules of university court 

 # # A.
TERMINOLOGY 

 - 1)Agents . Any real estate broker , corporation , firm or individual empowered to acton behalf of any apartment owner .
- 2)Board . TheBoard of Directorsof the Association ofOwners .
- 3)Bylaws . The Bylawsof the Association ofOwners , as amended from time to time .
- 4)Declaration . The Declaration of Horizontal Property Regime of this condominium project , as it maybe amended from time to time .
- 5)Guest .
A person who resides other than at the project , and visits the premises for a period of time at invitation of a resident .
- 6)ManagingAgent . A firm which maybe appointed and / or employee hired by theBoard tomanage the project andwhose duties are outlined inthe Bylaws .
- 

In [22]:
print(text)

## House Rules Of The

## University Court A OAO

In this image we can see a building with windows and balconies. We can also see a tree and a bicycle. We can also see the sky with clouds.

<!-- image_placeholder -->

<!-- page_break -->

## HOUSERULESOFUNIVERSITYCOURT

## A. TERMINOLOGY

- 1)Agents.Any real estate broker,corporation,firm or individual empoweredtoactonbehalfofanyapartmentowner.
- 2)Board.TheBoard ofDirectorsof theAssociation ofOwners.
- 3)Bylaws.TheBylawsof the Association ofOwners,asamended fromtimetotime.
- 4)Declaration.The Declaration of Horizontal Property Regime of thiscondominiumproject,asitmaybeamendedfrom time to time.
- 5)Guest. A person who resides other than at the project,and visits thepremisesforaperiodof time atinvitation of aresident.
- 6)ManagingAgent.A firmwhich maybeappointedand/or employeehired by theBoard tomanage theproject andwhose dutiesareoutlined intheBylaws.
- 7)Owner.The person or person holding the Fee Simple title to an apartment and the c

In [None]:
rbkrdliifdgcltfukbejittlhnkvlrvd