In [None]:
!apt update

In [3]:
!apt-get install tesseract-ocr --yes

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 100 not upgraded.


In [10]:
!apt-get install poppler-utils --yes

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libpoppler118 poppler-data
Suggested packages:
  ghostscript fonts-japanese-mincho | fonts-ipafont-mincho
  fonts-japanese-gothic | fonts-ipafont-gothic fonts-arphic-ukai
  fonts-arphic-uming fonts-nanum
The following NEW packages will be installed:
  libpoppler118 poppler-data poppler-utils
0 upgraded, 3 newly installed, 0 to remove and 233 not upgraded.
Need to get 3428 kB of archives.
After this operation, 17.7 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 poppler-data all 0.4.11-1 [2171 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libpoppler118 amd64 22.02.0-2ubuntu0.6 [1071 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.6 [186 kB]
Fetched 3428 kB in 0s (35.2 MB/s)       
Selecting previously unselecte

In [None]:
%pip uninstall lxml --yes # needs to be reinstalled by docling

In [None]:
# transformers is not imported directly in this notebook; presumably it is
# needed by docling's models -- TODO confirm and pin a version.
%pip install transformers

In [None]:
# docling provides DocumentConverter and the PDF pipeline option classes
# imported in the import cell of this notebook. TODO: pin a version.
%pip install docling

In [1]:
%pip install pymupdf
%pip install pytesseract
%pip install pdf2image

[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0
[0mNote: you may need to restart the kernel to use updated packages.


In [1]:
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
    EasyOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.document import ConversionResult
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling_core.types.doc import ImageRefMode


import pytesseract
import pymupdf
from pathlib import Path
import json
from tqdm.auto import tqdm
from pdf2image import convert_from_path

In [2]:
class PDFProcessor:
    """Batch text extraction from a folder of PDFs.

    Two extraction back ends are offered:

    * ``extract_text_docling`` -- docling's ``DocumentConverter`` with EasyOCR
      options, writing one ``.md`` and one ``.json`` file per PDF.
    * ``extract_text_tesseract`` -- pdf2image rendering followed by
      pytesseract, writing one ``.txt`` file per PDF.

    Files that fail are recorded by file name in a JSON list
    (``rejected_ocr_files.json`` by default) so later runs skip them.

    NOTE(review): the rejection list is shared by both back ends, so a file
    rejected by one extractor is also skipped by the other -- confirm that
    this is intended.
    """

    def __init__(self):
        """Build the docling converter and load the persisted rejection list."""
        pipeline_options = PdfPipelineOptions()
        pipeline_options.accelerator_options = AcceleratorOptions(
            num_threads=8, device=AcceleratorDevice.CUDA
        )
        # Per-document conversion budget (presumably seconds -- TODO confirm).
        pipeline_options.document_timeout = 20.0
        pipeline_options.ocr_options = EasyOcrOptions()

        self.doc_converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
            }
        )

        self.rejected_files = self._load_rejected_files()

    def _load_rejected_files(self, file_path: Path = Path("rejected_ocr_files.json")):
        """Return the previously rejected files as a list of ``Path`` objects.

        Args:
            file_path: JSON file holding a list of rejected file names.

        Returns:
            One ``Path`` per stored entry, or an empty list when the file
            does not exist.
        """
        if not file_path.exists():
            return []
        # Context manager so the handle is closed deterministically.
        with file_path.open("r", encoding="utf-8") as handle:
            return [Path(entry) for entry in json.load(handle)]

    def _reject_file(self, rejected_file: Path, storage_path: Path = Path("rejected_ocr_files.json")):
        """Record ``rejected_file`` as failed and persist the whole list.

        Only the file name (not the directory) is stored. A file whose name
        is already on the list is not added a second time.

        Args:
            rejected_file: Path of the PDF that could not be processed.
            storage_path: JSON file the rejection list is written to.
        """
        known_names = {entry.name for entry in self.rejected_files}
        if rejected_file.name not in known_names:
            self.rejected_files.append(rejected_file)

        serialized = [str(entry.name) for entry in self.rejected_files]
        # Rewrite the full list each time so progress survives an interrupted run.
        with storage_path.open("w", encoding="utf-8") as handle:
            json.dump(serialized, handle)

    def _pending_pdfs(self, pdf_folder_path: Path, extraction_folder_path: Path,
                      done_pattern: str, verbose: bool = False):
        """List the PDFs in ``pdf_folder_path`` that still need processing.

        A PDF is skipped when its stem matches either an existing output file
        (matched by ``done_pattern`` inside ``extraction_folder_path``) or an
        entry of the rejection list.

        Args:
            pdf_folder_path: Folder searched (non-recursively) for ``*.pdf``.
            extraction_folder_path: Folder holding earlier outputs.
            done_pattern: Glob pattern of the output files, e.g. ``"*.txt"``.
            verbose: When true, print how many files were previously
                processed and previously rejected.

        Returns:
            List of PDF paths that have neither an output nor a rejection.
        """
        extraction_stems = [x.stem for x in extraction_folder_path.glob(done_pattern)]
        rejected_stems = [x.stem for x in self.rejected_files]
        if verbose:
            print("Number of previously processed files: ", len(extraction_stems))
            print("Number of previously rejected files: ", len(rejected_stems))

        # Set lookup keeps the filter linear even for very large PDF folders.
        skip_stems = set(extraction_stems) | set(rejected_stems)
        return [pdf for pdf in pdf_folder_path.glob("*.pdf") if pdf.stem not in skip_stems]

    def extract_text_docling(self, pdf_folder_path: Path, extraction_folder_path: Path = Path("docling")):
        """Convert every pending PDF with docling.

        For each successful conversion a ``<stem>.md`` and a ``<stem>.json``
        file are written to ``extraction_folder_path``; every other result is
        added to the rejection list.

        Args:
            pdf_folder_path: Folder containing the input ``*.pdf`` files.
            extraction_folder_path: Output folder (created if missing).
        """
        extraction_folder_path.mkdir(parents=True, exist_ok=True)

        pdf_files = self._pending_pdfs(pdf_folder_path, extraction_folder_path, "*.md")

        res_conv = self.doc_converter.convert_all(pdf_files, raises_on_error=False)

        success_count = 0
        failed_count = 0

        pbar = tqdm(res_conv, desc="Processing PDFs", total=len(pdf_files))
        for res in pbar:
            if res.status == ConversionStatus.SUCCESS:
                stem = res.input.file.stem
                res.document.save_as_markdown(extraction_folder_path / f"{stem}.md", image_mode=ImageRefMode.PLACEHOLDER, strict_text=True)
                res.document.save_as_json(extraction_folder_path / f"{stem}.json", image_mode=ImageRefMode.PLACEHOLDER)
                success_count += 1
            else:
                self._reject_file(res.input.file)
                failed_count += 1

            pbar.set_postfix_str(f"Success: {success_count} | Error: {failed_count}")

    def extract_text_tesseract(self, pdf_folder_path: Path, extraction_folder_path: Path = Path("tesseract"),
                               pdf_timeout: int = 10, page_timeout: int = 10):
        """OCR every pending PDF with pdf2image + pytesseract.

        Each PDF is rendered to page images, every page is passed through
        Tesseract, and the concatenated text is written to ``<stem>.txt`` in
        ``extraction_folder_path``. Any exception raised while rendering or
        recognising a file (including timeouts) puts that file on the
        rejection list and processing continues with the next one.

        Args:
            pdf_folder_path: Folder containing the input ``*.pdf`` files.
            extraction_folder_path: Output folder (created if missing).
            pdf_timeout: ``timeout`` passed to ``convert_from_path``
                (presumably seconds -- TODO confirm).
            page_timeout: ``timeout`` passed to ``pytesseract.image_to_string``
                for each page (presumably seconds -- TODO confirm).
        """
        extraction_folder_path.mkdir(parents=True, exist_ok=True)

        pdf_files = self._pending_pdfs(pdf_folder_path, extraction_folder_path, "*.txt", verbose=True)

        success_cnt = 0
        error_cnt = 0

        pbar = tqdm(pdf_files, desc="Processing PDFs", total=len(pdf_files))

        for pdf_file in pbar:
            pbar.set_postfix_str(f"[Success: {success_cnt} | Error: {error_cnt}] » Processing: {pdf_file.stem}")

            # Pages are rendered with pdf2image; pymupdf was tried earlier but
            # gets stuck on some files.
            try:
                images = convert_from_path(pdf_file, timeout=pdf_timeout)
                page_texts = [
                    pytesseract.image_to_string(image, timeout=page_timeout)
                    for image in images
                ]
            except Exception as e:
                # Deliberate best-effort: remember the failure and move on.
                self._reject_file(pdf_file)
                error_cnt += 1
                print(f"Error: {e}")
                continue

            # Explicit UTF-8 so non-ASCII OCR output does not depend on the locale.
            output_path = extraction_folder_path / f"{pdf_file.stem}.txt"
            output_path.write_text("".join(page_texts), encoding="utf-8")

            success_cnt += 1

        # The postfix above is refreshed before each file only, so report the
        # final tally explicitly.
        print(f"Finished. Success: {success_cnt} | Error: {error_cnt}")

In [3]:
# Run the Tesseract back end over every PDF in ./pdfs; results go to the
# method's default output folder.
pdf_processor = PDFProcessor()
pdf_processor.extract_text_tesseract(pdf_folder_path=Path("pdfs"))

Number of previously processed files:  1102
Number of previously rejected files:  17


Processing PDFs:   0%|          | 0/51829 [00:00<?, ?it/s]

Error: Tesseract process timeout
