## Docling

In [1]:
import json
import logging
import time
from pathlib import Path

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
_log = logging.getLogger(__name__)


NameError: name 'logging' is not defined

In [None]:
logging.basicConfig(level=logging.INFO)

file_path = "../data/mmo_87050793_1630397705_64_10689.pdf"

input_doc_path = Path(file_path)

###########################################################################

# The following sections contain a combination of PipelineOptions
# and PDF Backends for various configurations.
# Uncomment one section at the time to see the differences in the output.

# PyPdfium without EasyOCR
# --------------------
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = False
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = False

# doc_converter = DocumentConverter(
#     format_options={
#         InputFormat.PDF: PdfFormatOption(
#             pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
#         )
#     }
# )

# PyPdfium with EasyOCR
# -----------------
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True

# doc_converter = DocumentConverter(
#     format_options={
#         InputFormat.PDF: PdfFormatOption(
#             pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
#         )
#     }
# )

# Docling Parse without EasyOCR
# -------------------------
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = False
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True

# doc_converter = DocumentConverter(
#     format_options={
#         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
#     }
# )

# Docling Parse with EasyOCR
# ----------------------
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True
# pipeline_options.ocr_options.lang = ["es"]
# pipeline_options.accelerator_options = AcceleratorOptions(
#     num_threads=4, device=AcceleratorDevice.AUTO
# )

# doc_converter = DocumentConverter(
#     format_options={
#         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
#     }
# )

# Docling Parse with EasyOCR (CPU only)
# ----------------------
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.ocr_options.use_gpu = False  # <-- set this.
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True

# doc_converter = DocumentConverter(
#     format_options={
#         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
#     }
# )

# Docling Parse with Tesseract
# ----------------------
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True
# pipeline_options.ocr_options = TesseractOcrOptions()

# doc_converter = DocumentConverter(
#     format_options={
#         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
#     }
# )

# Docling Parse with Tesseract CLI
# ----------------------
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True
# pipeline_options.ocr_options = TesseractCliOcrOptions()

# doc_converter = DocumentConverter(
#     format_options={
#         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
#     }
# )

# Docling Parse with ocrmac(Mac only)
# ----------------------
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True
# pipeline_options.ocr_options = OcrMacOptions()

# doc_converter = DocumentConverter(
#     format_options={
#         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
#     }
# )

###########################################################################



NameError: name 'TesseractOcrOptions' is not defined

In [None]:
start_time = time.time()
conv_result = doc_converter.convert(input_doc_path)
end_time = time.time() - start_time

_log.info(f"Document converted in {end_time:.2f} seconds.")

## Export results
output_dir = Path("scratch")
output_dir.mkdir(parents=True, exist_ok=True)
doc_filename = conv_result.input.file.stem

# # Export Deep Search document JSON format:
# with (output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp:
#     fp.write(json.dumps(conv_result.document.export_to_dict()))

# Export Text format:
with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
    fp.write(conv_result.document.export_to_text())

# Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
    fp.write(conv_result.document.export_to_markdown())

# # Export Document Tags format:
# with (output_dir / f"{doc_filename}.doctags").open("w", encoding="utf-8") as fp:
#     fp.write(conv_result.document.export_to_document_tokens())

INFO:docling.document_converter:Going to convert document batch...
INFO:docling.document_converter:Initializing pipeline for StandardPdfPipeline with options hash dd4327e08668463153bee7a9220b4c04
INFO:docling.models.factories.base_factory:Loading plugin 'docling_defaults'
INFO:docling.models.factories:Registered ocr engines: ['easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'


## Also Docling (Langchain)

In [1]:
from langchain_docling import DoclingLoader
from docling.chunking import HybridChunker
from langchain_docling.loader import ExportType

# EXPORT_TYPE = ExportType.MARKDOWN
EXPORT_TYPE = ExportType.DOC_CHUNKS
EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
file_path = "Washer_Manuals/WAV28KH3GB.pdf"


loader = DoclingLoader(
    file_path=file_path,
    export_type=EXPORT_TYPE,
    # chunker=HybridChunker(tokenizer=EMBED_MODEL_ID),
)

docs = loader.load()

  from .autonotebook import tqdm as notebook_tqdm
Token indices sequence length is longer than the specified maximum sequence length for this model (2935 > 512). Running this sequence through the model will result in indexing errors


Determining splits

In [2]:
if EXPORT_TYPE == ExportType.DOC_CHUNKS:
    splits = docs
elif EXPORT_TYPE == ExportType.MARKDOWN:
    from langchain_text_splitters import MarkdownHeaderTextSplitter

    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("#", "Header_1"),
            ("##", "Header_2"),
            ("###", "Header_3"),
        ],
    )
    splits = [split for doc in docs for split in splitter.split_text(doc.page_content)]
else:
    raise ValueError(f"Unexpected export type: {EXPORT_TYPE}")

In [3]:
for d in splits:
    print(f"- {d=}")
print("...")

- d=Document(metadata={'source': 'Washer_Manuals/WAV28KH3GB.pdf', 'dl_meta': {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/11', 'parent': {'$ref': '#/body'}, 'children': [], 'content_layer': 'body', 'label': 'text', 'prov': [{'page_no': 1, 'bbox': {'l': 31.465, 't': 243.685, 'r': 49.915, 'b': 235.471, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 4]}]}, {'self_ref': '#/texts/12', 'parent': {'$ref': '#/body'}, 'children': [], 'content_layer': 'body', 'label': 'text', 'prov': [{'page_no': 1, 'bbox': {'l': 68.315, 't': 243.685, 'r': 191.039, 'b': 224.471, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 41]}]}], 'headings': ['WAV28KH3GB'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 9722454755316797560, 'filename': 'WAV28KH3GB.pdf'}}}, page_content='WAV28KH3GB\n[en]\nUser manual and installation instructions')
- d=Document(metadata={'source': 'Washer_Manuals/WAV28KH3GB.pdf', 'dl_meta': {'schema_name': 'doclin

In [14]:
for d in splits:
    print(f"- {d=}")
print("...")

- d=Document(metadata={'Header_2': 'WASHING MACHINE DRUM TYPE'}, page_content='The model code is on the next page.')
- d=Document(metadata={'Header_2': 'CONTENTS'}, page_content='- 1. Safety Instructions\n- 2. FeaturesandSpecifications\n- 3. Disassembly and Reassembly\n- 4. Troubleshooting\n- 5. Wiring Diagram\n- 6. Reference')
- d=Document(metadata={'Header_2': 'MODEL CODE'}, page_content='| WW1*T******             | WW7*T******             | WW8*T******             | WW8*T******             | WW9*T******             | WW9*T******                         |\n|-------------------------|-------------------------|-------------------------|-------------------------|-------------------------|-------------------------------------|\n| WW10T4040CH             | WW70T4020CE             |                         | WW80T754ABH             | WW90T4020CE             | WW90T734DBT                         |\n| WW10T4040CN             | WW70T4020CH             | WW80T4020CE WW80T4020CX | WW80T754ABT  

## PyMuPDF 

In [28]:
import pymupdf4llm
import pymupdf4llm
from langchain.text_splitter import MarkdownTextSplitter


md_text = pymupdf4llm.to_markdown(file_path)

splitter = MarkdownTextSplitter(chunk_size=600, chunk_overlap=200)

documents = splitter.create_documents([md_text])

print(len(documents))
print(documents)


125


## PDFMiner

In [7]:
from langchain.text_splitter import MarkdownTextSplitter

from PyPDF2 import PdfReader
# file_path = "../data/mmo_87050793_1630397705_64_10689.pdf"
file_path = "Washer_Manuals/WAV28KH3GB.pdf"
reader = PdfReader(file_path)
number_of_pages = len(reader.pages)
pages = reader.pages
text = ""
for page in pages:
    text+= page.extract_text() + "\n\n"
    
splitter = MarkdownTextSplitter(chunk_size=3000, chunk_overlap=200)

documents = splitter.create_documents([text])

In [15]:
print(documents[31])

page_content='Troubleshooting  en
57Fault Cause and troubleshooting
"E:30 / -80" Unapproved extension installed on the water drain
hose.
▶Remove any unapproved extensions from the water
hose. Connecting the appliance
"E:36 / -25 / -26" Drain pump is blocked.
▶→ "Cleaning the drain pump",  Page  50
"E:38 / -25 / -26" Drain pump is blocked.
▶→ "Cleaning the drum",  Page  48
if the fault is displayed again,
▶→ "Cleaning the drain pump",  Page  50
Inlet opening of the rubber gasket is blocked.
▶→ "Cleaning the intake opening in the rubber gas-
ket", Page  53
"E:10 / -00 / -10 /
-20"The pump for the intelligent dosing system is blocked.
1.Switch off the appliance.
2.Clean the pump unit. 
→ "Cleaning the detergent drawer",  Page  48
3.If this indicator appears again, call our after-sales
service. 
→ "Customer Service",  Page  69
Note:  You can start the washing programme if you de-
activate the intelligent dosing system and dose manu-
ally. 
→ "Buttons",  Page  23
The water pressure is low.


## PyPDF2

In [25]:
import PyPDF2
import re
from PyPDF2 import PdfReader

def extract_error_codes(text):
    # Match 'E:XX' followed by optional whitespace and '/-XX' parts
    pattern = r'E:\d{2}(?:\s*/\s*-\d{2})*'
    matches = re.findall(pattern, text, re.DOTALL)
    # Normalize slashes by stripping extra whitespace
    return [re.sub(r'\s*/\s*', '/', match) for match in matches]


with open(file_path, 'rb') as file:
    reader = PdfReader(file)
    print(repr(reader))
    document = ""
    for page_number in range(len(reader.pages)):
        page = reader.pages[page_number]
        text = page.extract_text()
        document += text + "\n"
        # print(text)

print(extract_error_codes(document))


<PyPDF2._reader.PdfReader object at 0x0000027947E41B80>
['E:35/-10', 'E:30/-80', 'E:30/-80', 'E:36/-25/-26', 'E:38/-25/-26', 'E:10/-00/-10/-20']


## pdfminer

In [None]:
from pdfminer.high_level import extract_text

text = extract_text(file_path)
print(text)


## Fitz 

In [None]:
import fitz

doc = fitz.open(file_path)
for page_num in range(doc.page_count):
    page = doc.load_page(page_num)
    text = page.get_text()
    print(text)


## pdfquery

In [None]:
from pdfquery import PDFQuery

pdf = PDFQuery(file_path)
pdf.load()
text = pdf.pq('LTTextLineHorizontal:contains("Your Text Here")').text()
print(text)


## Slate

In [None]:
import slate3k as slate

with open(file_path, 'rb') as file:
    doc = slate.PDF(file)
    for page in doc:
        print(page)


In [None]:
pip uninstall pdfminer
pip uninstall pdfminer-six
pip install pdfminer-six

## colqwen2-v1.0

In [1]:
import torch
from PIL import Image
from transformers.utils.import_utils import is_flash_attn_2_available

from colpali_engine.models import ColQwen2, ColQwen2Processor

model = ColQwen2.from_pretrained(
    "vidore/colqwen2-v1.0",
    torch_dtype=torch.bfloat16,
    device_map="cuda:0",  # or "mps" if on Apple Silicon
    attn_implementation="flash_attention_2" if is_flash_attn_2_available() else None,
).eval()
processor = ColQwen2Processor.from_pretrained("vidore/colqwen2-v1.0")

# Your inputs
images = [
    Image.new("RGB", (128, 128), color="white"),
    Image.new("RGB", (64, 32), color="black"),
]
queries = [
    "Is attention really all you need?",
    "What is the amount of bananas farmed in Salvador?",
]

# Process the inputs
batch_images = processor.process_images(images).to(model.device)
batch_queries = processor.process_queries(queries).to(model.device)

# Forward pass
with torch.no_grad():
    image_embeddings = model(**batch_images)
    query_embeddings = model(**batch_queries)

scores = processor.score_multi_vector(query_embeddings, image_embeddings)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Fetching 2 files: 100%|██████████| 2/2 [02:47<00:00, 83.56s/it] 
Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

AssertionError: Torch not compiled with CUDA enabled