In [1]:
%pip install language-tool-python

Collecting language-tool-python
  Downloading language_tool_python-2.9.0-py3-none-any.whl.metadata (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.5/54.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting toml (from language-tool-python)
  Downloading toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Downloading language_tool_python-2.9.0-py3-none-any.whl (49 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.2/49.2 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading toml-0.10.2-py2.py3-none-any.whl (16 kB)
Installing collected packages: toml, language-tool-python
Successfully installed language-tool-python-2.9.0 toml-0.10.2
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
%pip uninstall lxml --yes # remove the preinstalled lxml first; it needs to be reinstalled by the docling install

Found existing installation: lxml 4.8.0
Uninstalling lxml-4.8.0:
  Successfully uninstalled lxml-4.8.0
[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
# NOTE(review): this install is unpinned — consider pinning to the version actually in use for reproducibility.
%pip install transformers

[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
%pip install docling

Collecting docling
  Downloading docling-2.26.0-py3-none-any.whl.metadata (8.8 kB)
Collecting certifi>=2024.7.4 (from docling)
  Downloading certifi-2025.1.31-py3-none-any.whl.metadata (2.5 kB)
Collecting docling-core<3.0.0,>=2.19.0 (from docling-core[chunking]<3.0.0,>=2.19.0->docling)
  Downloading docling_core-2.23.1-py3-none-any.whl.metadata (5.8 kB)
Collecting docling-ibm-models<4.0.0,>=3.4.0 (from docling)
  Downloading docling_ibm_models-3.4.1-py3-none-any.whl.metadata (7.4 kB)
Collecting docling-parse<4.0.0,>=3.3.0 (from docling)
  Downloading docling_parse-3.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.5 kB)
Collecting easyocr<2.0,>=1.7 (from docling)
  Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting filetype<2.0.0,>=1.2.0 (from docling)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting huggingface_hub<1,>=0.23 (from docling)
  Downloading huggingface_hub-0.29.3-py3-none-any.whl.metadata (13 kB)
C

In [1]:
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
    TesseractCliOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.document import ConversionResult
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling_core.types.doc import ImageRefMode

import language_tool_python
from pathlib import Path
import json
from typing import List
from tqdm.auto import tqdm

2025-03-17 14:26:53.275011: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-17 14:26:53.275063: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-17 14:26:53.276012: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-17 14:26:53.281658: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
class Postprocessing:
    """Select viable PDFs from metadata files and convert them with docling.

    The instance holds two long-lived helpers created in ``__init__``:

    * ``self.tool`` -- a ``language_tool_python.LanguageTool`` for the target
      language (not used by any method yet; see ``spelling_correction``).
    * ``self.doc_converter`` -- a docling ``DocumentConverter`` configured for
      PDF input with OCR and table-structure extraction enabled.
    """

    def __init__(
        self,
        target_language: str = "de-DE",
        num_threads: int = 8,
        document_timeout: float = 20.0,
    ):
        """Create the LanguageTool instance and the docling PDF converter.

        Args:
            target_language: Language code handed to ``LanguageTool``.
            num_threads: Thread count passed to docling's ``AcceleratorOptions``.
            document_timeout: Value assigned to
                ``PdfPipelineOptions.document_timeout`` (presumably seconds
                per document -- confirm against the docling documentation).
        """
        self.tool = language_tool_python.LanguageTool(target_language)

        pipeline_options = PdfPipelineOptions()
        # The accelerator device is fixed to CUDA, as in the original setup.
        pipeline_options.accelerator_options = AcceleratorOptions(
            num_threads=num_threads, device=AcceleratorDevice.CUDA
        )

        # Alternative OCR backend, currently disabled:
        # pipeline_options.ocr_options = TesseractCliOcrOptions(lang=["de"])

        pipeline_options.do_ocr = True
        pipeline_options.do_table_structure = True
        pipeline_options.table_structure_options.do_cell_matching = True

        pipeline_options.document_timeout = document_timeout

        self.doc_converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
            }
        )

    def filter_viable_pdf_stems(
        self, metadata: Path, language: str = "de", min_words: int = 100
    ):
        """Return the stems of all metadata files describing a viable document.

        A ``*.json`` file in ``metadata`` is viable when its
        ``["language_metadata"]["language"]`` equals ``language`` and its
        ``["ocr_text_statistics"]["num_words"]`` is strictly greater than
        ``min_words``.

        Args:
            metadata: Directory containing one JSON metadata file per document.
            language: Required value of the detected-language field.
            min_words: Exclusive lower bound on the OCR word count.

        Returns:
            List of file stems (file names without ``.json``), sorted by path
            so the result does not depend on directory iteration order.

        Raises:
            KeyError: If a metadata file lacks one of the expected keys.
            json.JSONDecodeError: If a metadata file is not valid JSON.
        """
        viable_pdf_stems = []
        for file in sorted(metadata.glob("*.json")):
            # Close every handle right away; the folder may hold many thousands of files.
            with file.open("r", encoding="utf-8") as handle:
                record = json.load(handle)
            if not record["language_metadata"]["language"] == language:
                continue
            if not record["ocr_text_statistics"]["num_words"] > min_words:
                continue
            viable_pdf_stems.append(file.stem)
        return viable_pdf_stems

    def docling_processing(self, pdf_folder: Path, viable_pdf_stems: List[str], extraction_folder_path: Path = Path("docling")):
        """Convert the not-yet-processed PDFs and save Markdown and JSON exports.

        A stem counts as already processed when ``<stem>.md`` exists in
        ``extraction_folder_path``; such stems are skipped, which makes the
        call resumable after an interruption.

        Args:
            pdf_folder: Directory containing ``<stem>.pdf`` files.
            viable_pdf_stems: Stems of the documents to convert.
            extraction_folder_path: Output directory (created if missing).

        Returns:
            None. Results are written to ``extraction_folder_path``; progress
            and success/error counts are shown on the tqdm bar.
        """
        extraction_folder_path.mkdir(parents=True, exist_ok=True)

        # Set membership keeps the resume check linear in the number of stems.
        processed_stems = {path.stem for path in extraction_folder_path.glob("*.md")}
        pdf_files = [
            pdf_folder / f"{stem}.pdf"
            for stem in viable_pdf_stems
            if stem not in processed_stems
        ]

        res_conv = self.doc_converter.convert_all(pdf_files, raises_on_error=False)

        success_count = 0
        failed_count = 0

        # The context manager closes the bar even when the run is interrupted.
        with tqdm(res_conv, desc="Processing PDFs", total=len(pdf_files)) as pbar:
            for res in pbar:
                if res.status == ConversionStatus.SUCCESS:
                    stem = res.input.file.stem
                    # Write the JSON first and the Markdown last: the ".md" file is the
                    # "already processed" marker, so it must only appear once both
                    # exports exist.
                    res.document.save_as_json(extraction_folder_path / f"{stem}.json", image_mode=ImageRefMode.PLACEHOLDER)
                    res.document.save_as_markdown(extraction_folder_path / f"{stem}.md", image_mode=ImageRefMode.PLACEHOLDER)
                    success_count += 1
                else:
                    failed_count += 1

                pbar.set_postfix_str(f"Success: {success_count} | Error: {failed_count}")

    def spelling_correction(self, metadata: Path):
        """Placeholder: not implemented yet; currently does nothing and returns None."""
        pass

    def postprocess(self, metadata: Path):
        """Placeholder: not implemented yet; currently does nothing and returns None."""
        pass

In [3]:
# Build the pipeline object once; this creates the LanguageTool instance and the docling converter.
postprocessing = Postprocessing(target_language="de-DE")

In [12]:
# Scan the metadata folder and keep the stems of documents that pass the language and word-count filters.
viable_pdf_stems = postprocessing.filter_viable_pdf_stems(metadata=Path("metadata"))

In [13]:
# Number of documents that passed the filters in filter_viable_pdf_stems.
len(viable_pdf_stems)

18798

In [4]:
# Reload the previously saved stem list. Path.read_text closes the file itself,
# unlike the bare open() it replaces, which never closed its handle.
# NOTE(review): no visible cell writes viable_pdf_stems.json — confirm where it is produced.
viable_pdf_stems = json.loads(Path("viable_pdf_stems.json").read_text(encoding="utf-8"))

In [5]:
# Convert every viable PDF under pdfs/ that has no Markdown export yet in the default "docling" output folder.
postprocessing.docling_processing(Path("pdfs"), viable_pdf_stems)
#974
# NOTE(review): the meaning of "974" above is undocumented — presumably a progress count from an earlier run; confirm or remove.

Processing PDFs:   0%|          | 0/18789 [00:00<?, ?it/s]

If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].


KeyboardInterrupt: 