In [1]:
import os
import fitz  # PyMuPDF is for splitting pdf docs
import difflib
import pandas as pd
from google.api_core.client_options import ClientOptions
from google.cloud import documentai

# --- Document AI configuration ---
# project_id, location and processor_id are combined into the processor resource path below.
project_id = "nyc-ddc-cdc4"
location = "us"
processor_id = "12fdd9cc0a336340" # Create processor before running sample
# NOTE(review): file_path is assigned again in the driver cell before process_pdf is called,
# so this value is not the one processed by the cells shown here.
file_path = "2A-033_ConEd_cdd.2019-02G.wtd.pdf"
# NOTE(review): mime_type, field_mask and processor_version_id are not referenced by the
# cells shown here (process_page sends 'image/png' and builds its request from processor_name only).
mime_type = "application/pdf" # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types
field_mask = "entities" # Optional. The fields to return in the Document object.
processor_version_id = "pretrained-foundation-model-v1.0-2023-08-22" # Optional. Processor version to use

# The API endpoint host is derived from the location, e.g. "us-documentai.googleapis.com".
opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
client = documentai.DocumentProcessorServiceClient(client_options=opts)
# Processor resource path used as the `name` of every ProcessRequest.
processor_name = client.processor_path(project_id, location, processor_id)

# Field names collected for each page. process_page starts every row with one
# blank value per name, so this order is also the order of the row's keys.
columns = [
    # waste description
    "type_debris", "waste_quantity",
    # pickup_* fields
    "pickup_name", "pickup_address", "pickup_city", "pickup_state", "pickup_zip",
    "pickup_lat", "pickup_lng",
    # generator_* fields
    "generator_name", "generator_address", "generator_city", "generator_state",
    "generator_zip",
    # transporter
    "transporter_name",
    # receiving_* fields
    "receiving_name", "receiving_address", "receiving_city", "receiving_state",
    "receiving_zip", "receiving_lat", "receiving_lng",
]

In [2]:
def process_type(text, cutoff=0.6):
    """Map free-form debris-type text onto one of the known category labels.

    The input is whitespace-normalized (newlines and repeated spaces collapse
    to single spaces) and compared case-insensitively, so text such as
    "CONSTRUCTION WASTE" or "Restricted-Use\\nFill" still resolves to its
    canonical label instead of falling through to 'Mix'.

    Args:
        text: Raw text to classify. ``None`` is treated as an empty string.
        cutoff: Minimum difflib similarity ratio (0.0-1.0) for a candidate to
            count as a match.

    Returns:
        The canonical candidate label closest to ``text``, or 'Mix' when no
        candidate reaches ``cutoff``.
    """
    candidates = ['Limited-Use Fill',
                  'Restricted-Use Fill',
                  'Contaminated Fill',
                  'Fill Material - Unknown',
                  'General Fill',
                  'Residue',
                  'Construction Waste',
                  'Demolition Waste'
                 ]
    # Compare on a case-folded form but return the original spelling.
    canonical_by_key = {label.casefold(): label for label in candidates}

    raw = "" if text is None else str(text)
    # str.split() with no argument drops newlines, tabs and repeated spaces.
    key = " ".join(raw.split()).casefold()

    matches = difflib.get_close_matches(key, list(canonical_by_key), n=1, cutoff=cutoff)
    if matches:
        return canonical_by_key[matches[0]]  # Return the closest match found
    return 'Mix'  # Return 'Mix' if no close match is found
    

def process_data(text: str) -> str:
    """Return ``text`` with every newline character replaced by a single space."""
    segments = text.split("\n")
    return " ".join(segments)

def process_page(page):
    """Extract the configured fields from one PDF page via Document AI.

    The page is rendered to a PNG image, sent to the processor identified by
    the module-level ``processor_name`` using the module-level ``client``, and
    the returned entities are copied into a row keyed by ``columns``.

    Args:
        page: A page object exposing ``get_pixmap()`` (as yielded when
            iterating the document opened in ``process_pdf``).

    Returns:
        dict mapping every name in ``columns`` to the extracted text, or to
        '' when no entity of that type was returned.
    """
    # Render the page and serialize it as PNG bytes.
    png_bytes = page.get_pixmap().tobytes("png")

    # Wrap the image in a raw document and build the processing request.
    payload = documentai.RawDocument(content=png_bytes, mime_type='image/png')
    process_request = documentai.ProcessRequest(name=processor_name, raw_document=payload)

    response = client.process_document(request=process_request)

    # Start from a blank row so fields missing from the response stay ''.
    row = dict.fromkeys(columns, '')
    for entity in response.document.entities:
        field = entity.type_
        if field not in row:
            continue  # Entity type is not one of the requested columns
        # The debris type is snapped to a known category; everything else is
        # only flattened to a single line.
        clean = process_type if field == 'type_debris' else process_data
        row[field] = clean(entity.mention_text)
    return row

def process_pdf(file_path, csv_file_path):
    """Run every page of a PDF through ``process_page`` and append the rows to a CSV.

    Args:
        file_path: Path of the PDF to open with PyMuPDF.
        csv_file_path: CSV file the rows are appended to. Rows are written
            without a header line and without the DataFrame index, so calling
            this repeatedly accumulates rows in the same file.

    Returns:
        None. One progress line is printed per page.
    """
    data = []
    # The context manager closes the document even if a page fails mid-run,
    # so the file handle is not leaked.
    with fitz.open(file_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            print(f"Processing page {page_num}")
            page_data = process_page(page)
            data.append(page_data)

    # Create DataFrame and save it to CSV
    df = pd.DataFrame(data)
    df.to_csv(csv_file_path, mode='a', header=False, index=False)

In [3]:
# CSV that process_pdf appends to (it writes with mode='a' and no header row).
csv_file_path = 'interim_csv.csv'
# Rebinds the module-level file_path from the configuration cell to the PDF processed in this run.
file_path = "1A-301_Park_Trucking_cdd.2020-02B.wtd.pdf"

# NOTE: because the CSV is opened in append mode, re-running this cell adds the same pages again.
process_pdf(file_path, csv_file_path)

Processing page 1
