In [1]:
from typing import Optional, List, Tuple, Dict, Union
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    TesseractCliOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption


def get_text_from_pdf(pdf_path: str, lang: str = "eng") -> str:
    """
    Use OCR to extract text from all pages of the PDF.

    Every page is run through the Tesseract CLI with full-page OCR forced on.

    Args:
        pdf_path: Path of the PDF file to convert.
        lang: Tesseract language code used for recognition. Several codes may
            be combined with "+" (for example "eng+fra").

    Returns:
        The converted document exported as a single Markdown string.
    """
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = False
    # NOTE(review): table structure is disabled above, so this setting is
    # presumably inert; it is kept so re-enabling tables needs one change only.
    pipeline_options.table_structure_options.do_cell_matching = True

    # Forward the requested language(s) to Tesseract. Previously `lang` was
    # accepted but ignored, so the library's default language list was used.
    languages = [code for code in lang.split("+") if code]
    ocr_kwargs = {"force_full_page_ocr": True}
    if languages:
        ocr_kwargs["lang"] = languages
    pipeline_options.ocr_options = TesseractCliOcrOptions(**ocr_kwargs)

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
            )
        }
    )
    result = converter.convert(pdf_path)

    return result.document.export_to_markdown()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Convert one sample volume and keep the resulting text for inspection.
text = get_text_from_pdf(
    pdf_path="./phil_reports_pdf_first5/PR_Vol_nan_Volume_151-A.pdf",
)

2025-11-14 08:52:39,182 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-14 08:52:39,734 - INFO - Going to convert document batch...
2025-11-14 08:52:39,735 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 8ddd7950785f4643f0483fc13c09b07a
2025-11-14 08:52:39,743 - INFO - Loading plugin 'docling_defaults'
2025-11-14 08:52:39,748 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-11-14 08:52:39,756 - INFO - Loading plugin 'docling_defaults'
2025-11-14 08:52:39,769 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-11-14 08:52:39,850 - INFO - command: tesseract --list-langs
2025-11-14 08:52:41,920 - INFO - Accelerator device: 'cuda:0'
2025-11-14 08:52:57,581 - INFO - Processing document PR_Vol_nan_Volume_151-A.pdf
2025-11-14 08:52:58,155 - INFO - command: tesseract --psm 0 -l osd /tmp/tmp2knv7k73.png stdout
2025-11-14 08:52:58,532 - ERROR - OSD failed (doc PR_Vol_nan_Volume_151-A.pdf, pa

In [2]:
from paddleocr import PaddleOCR
import fitz  # PyMuPDF

# Build the OCR pipeline once; the first run downloads the model weights.
ocr = PaddleOCR(
    # `use_angle_cls` is the deprecated spelling of this switch in PaddleOCR 3.x;
    # it enables the text-line orientation classifier (helps with rotated text).
    use_textline_orientation=True,
    lang="en",  # recognition language, e.g. "en" or "ch"
)

  ocr = PaddleOCR(
[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_doc_ori), the model files will be automatically downloaded and saved in `/root/.paddlex/official_models/PP-LCNet_x1_0_doc_ori`.[0m
Fetching 6 files: 100%|██████████| 6/6 [00:02<00:00,  2.10it/s]
[32mCreating model: ('UVDoc', None)[0m
[32mUsing official model (UVDoc), the model files will be automatically downloaded and saved in `/root/.paddlex/official_models/UVDoc`.[0m
Fetching 6 files: 100%|██████████| 6/6 [00:03<00:00,  1.63it/s]
[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_textline_ori), the model files will be automatically downloaded and saved in `/root/.paddlex/official_models/PP-LCNet_x1_0_textline_ori`.[0m
Fetching 6 files: 100%|██████████| 6/6 [00:01<00:00,  3.91it/s]
[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mUsing official model (PP-OCRv5_server_det), the model files will be auto

In [5]:
import os
import tempfile
from pathlib import Path

pdf_path = Path("./phil_reports_pdf_first5/PR_Vol_nan_Volume_151-A.pdf")


def _recognized_lines(ocr_output) -> list:
    """Return the recognised text lines contained in one `ocr.predict` output."""
    lines = []
    for page_result in ocr_output or []:
        if page_result is None:
            continue
        if hasattr(page_result, "get"):
            # PaddleOCR 3.x: dict-like result whose recognised strings are
            # stored under "rec_texts". Iterating it directly would yield the
            # key names, not the OCR text.
            lines.extend(page_result.get("rec_texts") or [])
        else:
            # Legacy layout: [[box, (text, confidence)], ...]
            lines.extend(entry[1][0] for entry in page_result)
    return lines


all_pages_text = []

# Both context managers guarantee cleanup (PDF handle, temporary images) even
# when the long-running loop is interrupted part-way through.
with fitz.open(str(pdf_path)) as doc, tempfile.TemporaryDirectory() as tmp_dir:
    page_count = len(doc)

    for page_index in range(page_count):
        page = doc[page_index]

        # Render the PDF page to an image (increase dpi for better quality if needed)
        pix = page.get_pixmap(dpi=200)
        img_path = os.path.join(tmp_dir, f"page_{page_index+1}.png")
        pix.save(img_path)

        # Run OCR on the page image, then drop the image right away so the
        # temporary directory never holds more than one page.
        try:
            result = ocr.predict(img_path)
        finally:
            os.remove(img_path)

        page_text = "\n".join(_recognized_lines(result))
        all_pages_text.append(page_text)

        print(f"✅ Finished OCR for page {page_index+1}/{page_count}")


✅ Finished OCR for page 1/978
✅ Finished OCR for page 2/978
✅ Finished OCR for page 3/978
✅ Finished OCR for page 4/978
✅ Finished OCR for page 5/978
✅ Finished OCR for page 6/978
✅ Finished OCR for page 7/978
✅ Finished OCR for page 8/978
✅ Finished OCR for page 9/978
✅ Finished OCR for page 10/978
✅ Finished OCR for page 11/978
✅ Finished OCR for page 12/978
✅ Finished OCR for page 13/978
✅ Finished OCR for page 14/978
✅ Finished OCR for page 15/978
✅ Finished OCR for page 16/978
✅ Finished OCR for page 17/978
✅ Finished OCR for page 18/978
✅ Finished OCR for page 19/978
✅ Finished OCR for page 20/978
✅ Finished OCR for page 21/978
✅ Finished OCR for page 22/978
✅ Finished OCR for page 23/978
✅ Finished OCR for page 24/978
✅ Finished OCR for page 25/978
✅ Finished OCR for page 26/978
✅ Finished OCR for page 27/978
✅ Finished OCR for page 28/978
✅ Finished OCR for page 29/978
✅ Finished OCR for page 30/978
✅ Finished OCR for page 31/978
✅ Finished OCR for page 32/978
✅ Finished OCR fo

KeyboardInterrupt: 

In [None]:
# Stitch the per-page strings together, each preceded by a numbered banner.
combined_text = "".join(
    f"\n\n===== PAGE {page_number} =====\n\n{page_body}"
    for page_number, page_body in enumerate(all_pages_text, start=1)
)

# Show only the beginning; the full text can be very long.
print(combined_text[:3000])

# Save next to the source PDF, swapping its suffix for ".ocr.txt".
output_txt = pdf_path.with_suffix(".ocr.txt")
output_txt.write_text(combined_text, encoding="utf-8")

output_txt
