# Preprocess PDFs using Docling

This notebook describes how to use Docling to convert PDFs to text (as well as JSON, Markdown and DocTags).
Because Docling relies on AI models, it benefits from GPU power. It is recommended to create a Jupyter environment in watsonx and choose a GPU instance.

For additional details on Docling, please refer to the [Docling GitHub repository](https://github.com/docling-project/docling).


In [1]:
!pip install docling

Collecting docling
  Downloading docling-2.26.0-py3-none-any.whl.metadata (8.8 kB)
Collecting docling-core<3.0.0,>=2.19.0 (from docling-core[chunking]<3.0.0,>=2.19.0->docling)
  Downloading docling_core-2.23.0-py3-none-any.whl.metadata (5.8 kB)
Collecting docling-ibm-models<4.0.0,>=3.4.0 (from docling)
  Downloading docling_ibm_models-3.4.1-py3-none-any.whl.metadata (7.4 kB)
Collecting docling-parse<4.0.0,>=3.3.0 (from docling)
  Downloading docling_parse-3.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.5 kB)
Collecting easyocr<2.0,>=1.7 (from docling)
  Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting filetype<2.0.0,>=1.2.0 (from docling)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting marko<3.0.0,>=2.1.2 (from docling)
  Downloading marko-2.1.2-py3-none-any.whl.metadata (4.5 kB)
Collecting openpyxl<4.0.0,>=3.1.5 (from docling)
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting

## 1. Download PDFs from cloud storage to local storage for processing

In [5]:
import os
import ibm_boto3
from botocore.client import Config

def download_files_to_local_storage(api_key, bucket_name, temp_dir="tmp_folder"):
    """
    Download every PDF object of an IBM COS bucket into a local directory.

    Objects whose key does not end in ".pdf" (case-insensitive) are ignored.
    Each object is stored under its base name only, so two keys that share a
    base name (e.g. "a/x.pdf" and "b/x.pdf") map to the same local file; the
    second one is then skipped because the file already exists.

    Args:
        api_key (str): IBM Cloud API key
        bucket_name (str): Name of the bucket to read from
        temp_dir (str): Local directory to download into (created if missing)

    Returns:
        tuple: (local_files, temp_dir) where local_files maps the object key of
        every file downloaded by THIS call to its local path. Files that were
        already present locally are skipped and not included.
    """
    # IBM COS client setup
    cos_client = ibm_boto3.client(
        service_name='s3',
        ibm_api_key_id=api_key,
        ibm_auth_endpoint="https://iam.cloud.ibm.com/identity/token",
        config=Config(signature_version='oauth'),
        endpoint_url='https://s3.direct.us-south.cloud-object-storage.appdomain.cloud'
    )

    # Ensure the directory exists
    os.makedirs(temp_dir, exist_ok=True)
    print(f"Using temporary directory: {temp_dir}")

    # Dictionary to store file paths
    local_files = {}

    # A single list_objects_v2 call only returns one page of keys, so keep
    # requesting pages until the listing is no longer truncated.
    list_kwargs = {"Bucket": bucket_name}
    while True:
        page = cos_client.list_objects_v2(**list_kwargs)

        for obj in page.get('Contents', []):
            key = obj['Key']

            # Skip non-PDF files
            if not key.lower().endswith('.pdf'):
                continue

            # Extract filename from key (remove any path components)
            filename = os.path.basename(key)
            local_path = os.path.join(temp_dir, filename)

            # Check if file already exists to avoid redundant downloads
            if os.path.exists(local_path):
                print(f"Skipping {key}, already exists at {local_path}")
                continue

            print(f"Downloading {key}...")
            cos_client.download_file(bucket_name, key, local_path)
            local_files[key] = local_path
            print(f"Downloaded {key} to {local_path}")

        next_token = page.get('NextContinuationToken')
        if not page.get('IsTruncated') or not next_token:
            break
        list_kwargs['ContinuationToken'] = next_token

    return local_files, temp_dir

# Example usage
# Read the credentials from the environment so that real secrets are never
# saved inside the notebook; the literal fallbacks are placeholders only.
api_key = os.environ.get('IBM_CLOUD_API_KEY', 'PLACEHOLDER')
bucket = os.environ.get('COS_BUCKET_NAME', 'PLACEHOLDER')

downloaded_files, temp_directory = download_files_to_local_storage(api_key, bucket)
print(f"Downloaded {len(downloaded_files)} files to {temp_directory}")


Using temporary directory: tmp_folder
Downloading admindmbook.pdf...
Downloaded admindmbook.pdf to tmp_folder/admindmbook.pdf
Downloading partitionbook.pdf...
Downloaded partitionbook.pdf to tmp_folder/partitionbook.pdf
Downloading perfbook.pdf...
Downloaded perfbook.pdf to tmp_folder/perfbook.pdf
Downloading secbook.pdf...
Downloaded secbook.pdf to tmp_folder/secbook.pdf
Downloading textsearchbook.pdf...
Downloaded textsearchbook.pdf to tmp_folder/textsearchbook.pdf
Downloaded 5 files to tmp_folder


## 2. Process the pdfs using docling

### 2.1. The actual Docling functionality

Feel free to change this according to your needs. This is an iterative process: adjust the pipeline options and re-run to improve the results.


In [7]:
import json
import time
from pathlib import Path
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption


# Converters are cached per OCR language so that a batch of PDFs reuses one
# converter instead of repeating its setup (and presumably model loading) for
# every file. Re-running this cell resets the cache, so edited pipeline
# options take effect.
_PDF_CONVERTERS = {}


def _get_pdf_converter(language):
    """Return the cached DocumentConverter for ``language``, building it on first use."""
    converter = _PDF_CONVERTERS.get(language)
    if converter is None:
        # Set up pipeline options
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_ocr = True
        pipeline_options.do_table_structure = True
        pipeline_options.table_structure_options.do_cell_matching = True
        pipeline_options.ocr_options.lang = [language]
        pipeline_options.ocr_options.force_full_page_ocr = True
        pipeline_options.accelerator_options = AcceleratorOptions(
            num_threads=10, device=AcceleratorDevice.AUTO
        )

        # Initialize document converter with custom options
        converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
            }
        )
        _PDF_CONVERTERS[language] = converter
    return converter


def convert_pdf_with_options(input_path, output_dir=None, language="en"):
    """
    Convert a PDF file to various formats using Docling with enhanced options.

    The conversion runs with OCR forced on every page, table-structure
    recognition with cell matching, and automatic accelerator selection.

    Args:
        input_path (str or Path): Path to the PDF file
        output_dir (str or Path, optional): Directory to save output files.
            Defaults to None (no saving).
        language (str, optional): Language code passed to the OCR engine.
            Defaults to 'en'.

    Returns:
        dict: Conversion results keyed by format: "json" (dict), "text",
        "markdown" and "doctags" (strings).
    """
    doc_converter = _get_pdf_converter(language)

    # Convert the document; perf_counter is the appropriate clock for durations
    input_path = Path(input_path)
    start_time = time.perf_counter()
    conv_result = doc_converter.convert(input_path)
    elapsed = time.perf_counter() - start_time

    print(f"Document converted in {elapsed:.2f} seconds: {input_path.name}")

    # Prepare results
    document = conv_result.document
    results = {
        "json": document.export_to_dict(),
        "text": document.export_to_text(),
        "markdown": document.export_to_markdown(),
        "doctags": document.export_to_document_tokens()
    }

    # Save results if output directory is provided
    if output_dir:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        doc_filename = input_path.stem  # Get file name without extension

        # Export Deep Search document JSON format
        with (output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp:
            json.dump(results["json"], fp, indent=4)

        # Export the plain-text, Markdown and Document Tags formats
        for extension, result_key in (("txt", "text"), ("md", "markdown"), ("doctags", "doctags")):
            with (output_dir / f"{doc_filename}.{extension}").open("w", encoding="utf-8") as fp:
                fp.write(results[result_key])

        print(f"All output files saved for {input_path.name} in {output_dir}")

    return results

### 2.2. Process all PDF files in the download folder and upload the results

In [None]:
def process_and_upload_pdfs(folder_path, api_key, bucket_name, output_dir="output", language="en", folder_prefix="parsed_pdfs"):
    """
    Processes PDF files and uploads each file immediately after processing.

    Every PDF directly inside ``folder_path`` (extension matched
    case-insensitively, processed in sorted order) is converted with
    ``convert_pdf_with_options``; the generated .json/.txt/.md/.doctags files
    are then uploaded to the bucket under ``folder_prefix``. Upload errors are
    printed and do not stop the run; conversion errors propagate.

    Args:
        folder_path (str or Path): Path to the folder containing PDF files
        api_key (str): IBM Cloud API key
        bucket_name (str): Name of the bucket
        output_dir (str or Path): Directory to save output files
        language (str): Language for OCR
        folder_prefix (str): Folder path inside the bucket; an empty string
            uploads to the bucket root

    Returns:
        dict: Maps each PDF file name to the result of
        ``convert_pdf_with_options``; empty if ``folder_path`` does not exist.
    """
    # Set up COS client
    cos_client = ibm_boto3.client(
        service_name='s3',
        ibm_api_key_id=api_key,
        ibm_auth_endpoint="https://iam.cloud.ibm.com/identity/token",
        config=Config(signature_version='oauth'),
        endpoint_url='https://s3.direct.us-south.cloud-object-storage.appdomain.cloud'
    )

    folder_path = Path(folder_path)
    output_dir = Path(output_dir)

    if not folder_path.exists():
        print(f"Folder {folder_path} does not exist.")
        return {}

    # Ensure a non-empty folder_prefix ends with "/". An empty prefix is left
    # alone, otherwise every object key would start with a stray "/".
    if folder_prefix and not folder_prefix.endswith("/"):
        folder_prefix += "/"

    # Match the extension case-insensitively (the download step also accepts
    # ".PDF") and sort so the processing order is deterministic.
    candidates = folder_path.iterdir() if folder_path.is_dir() else []
    pdf_files = sorted(
        path for path in candidates
        if path.is_file() and path.suffix.lower() == ".pdf"
    )
    print(f"Found {len(pdf_files)} PDF files in {folder_path}")

    results = {}
    for pdf_file in pdf_files:
        print(f"Processing file: {pdf_file}")

        # Process the PDF
        result = convert_pdf_with_options(pdf_file, output_dir=output_dir, language=language)
        results[pdf_file.name] = result

        # Upload the generated files for this PDF
        doc_filename = pdf_file.stem
        generated_files = [
            (f"{doc_filename}.json", "application/json"),
            (f"{doc_filename}.txt", "text/plain"),
            (f"{doc_filename}.md", "text/markdown"),
            (f"{doc_filename}.doctags", "text/plain")
        ]

        # Upload each generated file
        for filename, content_type in generated_files:
            local_path = output_dir / filename
            if not local_path.exists():
                continue

            object_key = folder_prefix + filename
            print(f"Uploading {filename} to {bucket_name}/{object_key}...")

            try:
                cos_client.upload_file(
                    str(local_path),
                    bucket_name,
                    object_key,
                    ExtraArgs={'ContentType': content_type}
                )
                print(f"Successfully uploaded {filename}")
            except Exception as e:
                # Best effort: report the failure and continue with the next file
                print(f"Error uploading {filename}: {str(e)}")

        print(f"Completed processing and uploading files for {pdf_file.name}")

    print(f"Finished processing and uploading {len(pdf_files)} files.")
    return results

# Example usage
if __name__ == "__main__":
    import os  # local import so this cell also runs on its own

    tmp_folder = "tmp_folder"
    output_dir = "output"
    # Read the credentials from the environment so that real secrets are never
    # saved inside the notebook; the literal fallbacks are placeholders only.
    api_key = os.environ.get('IBM_CLOUD_API_KEY', 'PLACEHOLDER')
    bucket_name = os.environ.get('COS_BUCKET_NAME', 'PLACEHOLDER')

    all_results = process_and_upload_pdfs(
        tmp_folder,
        api_key,
        bucket_name,
        output_dir=output_dir,
        language="en",
        folder_prefix="parsed_pdfs"
    )

    print(f"Processed {len(all_results)} PDF files. Results saved and uploaded.")