In [1]:
import os
import json
import re
import pandas as pd
import numpy as np

from google.cloud import storage
from google.cloud import documentai_v1beta2 as documentai

# Service-account key file for the Google Cloud clients used below.
# setdefault keeps a value that is already exported in the environment, so the
# notebook can run on another machine without editing this cell; the
# workstation-specific path is only a fallback.
# NOTE(review): move this path out of the notebook (env var / config file).
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "C:/Users/hsingh151/My_Work/Uber_IACOE_OCR_ML_Pipeline_v1.1.0/iacoe_sa.json",
)

# Remove pandas' display limits so DataFrames are shown in full.
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000

def parse_ner_json_to_string(parsed_ner):
    """Flatten an entity mapping into a single ``value|key`` string.

    Every ``key -> value`` item of ``parsed_ner`` is rendered as ``value|key``
    and the items are joined with ``;`` in the mapping's iteration order.
    An empty mapping gives an empty string.
    """
    pairs = [value + "|" + key for key, value in parsed_ner.items()]
    return ";".join(pairs)

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to a Cloud Storage bucket and return its ``gs://`` URI.

    Args:
        bucket_name: Name of the bucket to upload into.
        source_file_name: Path of the local file to upload.
        destination_blob_name: Folder-like prefix inside the bucket; the
            file's base name is appended to it to form the object name.

    Returns:
        ``gs://<bucket_name>/<destination_blob_name>/<file name>``.
    """
    # os.path.basename honours the platform's path separators (both "/" and
    # "\\" on Windows); splitting on "/" alone kept the entire path as the
    # object name when the path was written with backslashes.
    file_name = os.path.basename(source_file_name)
    blob_name = destination_blob_name + "/" + file_name

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    bucket.blob(blob_name).upload_from_filename(source_file_name)

    # Build the URI from the bucket that was actually written to. It used to
    # be hard-coded to "gs://iacoe-cloud-bucket", which returned a wrong URI
    # for any other bucket_name.
    return "gs://" + bucket_name + "/" + blob_name

def extract_data_doc_ai(input_uri, project_id='concrete-tuner-241417'):
    """Run one PDF stored in Cloud Storage through the Document AI API.

    Form, table and entity extraction are all enabled on the request.

    Returns a dict with these keys:
        "raw_text"  -- the document text encoded as UTF-8 bytes
        "entities"  -- {text anchored by the entity: entity.mention_text}
        "form_data" -- {(field name text, confidence):
                        (field value text, confidence)} over all pages
        "tables"    -- one list per table; each holds the table's header rows
                       followed by its body rows, every row being a list of
                       cell texts
        "labels"    -- {label.name: label.confidence}
    """
    types = documentai.types
    client = documentai.DocumentUnderstandingServiceClient()

    request = types.ProcessDocumentRequest(
        parent='projects/{}/locations/us'.format(project_id),
        input_config=types.InputConfig(
            gcs_source=types.GcsSource(uri=input_uri),
            mime_type='application/pdf'),
        form_extraction_params=types.FormExtractionParams(enabled=True),
        table_extraction_params=types.TableExtractionParams(enabled=True),
        entity_extraction_params=types.EntityExtractionParams(enabled=True))

    document = client.process_document(request=request)

    def _get_text(el):
        """Join the slices of document.text referenced by el's text anchor."""
        return ''.join(
            document.text[segment.start_index:segment.end_index]
            for segment in el.text_anchor.text_segments)

    def _row_texts(row):
        """Text of every cell in one table row."""
        return [_get_text(cell.layout) for cell in row.cells]

    # Keyed by the anchored text, so entities sharing the same text collapse
    # into a single item (the last one wins).
    entities = {_get_text(entity): entity.mention_text
                for entity in document.entities}

    form_fields = {
        (_get_text(field.field_name), field.field_name.confidence):
            (_get_text(field.field_value), field.field_value.confidence)
        for page in document.pages
        for field in page.form_fields}

    tables = [
        [_row_texts(row) for row in table.header_rows]
        + [_row_texts(row) for row in table.body_rows]
        for page in document.pages
        for table in page.tables]

    labels = {label.name: label.confidence for label in document.labels}

    return {
        "raw_text": document.text.encode('utf-8'),
        "entities": entities,
        "form_data": form_fields,
        "tables": tables,
        "labels": labels,
    }

In [2]:
pwd

'C:\\Users\\hsingh151\\My_Work\\BOL\\02_NER_Bol_processing\\01_Format_1\\04_jupyter_notebook'

In [3]:
import time

In [4]:
data_files = {}

In [5]:
# Batch run: upload every PDF found in file_loc to the bucket, run Document AI
# on the uploaded object, and keep each result in data_files keyed by file name.
GCS_BUCKET = "iacoe-cloud-bucket"
DESTINATION_BLOB_NAME = "iacoe_ocr_ml"
# NOTE(review): machine-specific absolute path; must end with "/" because the
# file name is appended by plain string concatenation below.
file_loc = 'C:/Users/hsingh151/My_Work/BOL/02_NER_Bol_processing/01_Format_1/02_samples/pdf_samples/'

# File names ending in ".pdf" or ".PDF" (other case mixes are not picked up).
# NOTE(review): os.listdir() returns entries in arbitrary order — wrap in
# sorted() if the positional lookups in later cells (e.g. [*data_files][19])
# have to be stable across runs.
arr_txt =  [x for x in os.listdir(file_loc) if x.endswith(".pdf") or x.endswith(".PDF")]
# t1/t2 bracket the whole batch; t3 times a single file (upload + extraction).
t1 = time.time()

for i in arr_txt:
    file_path = file_loc + i
    t3 = time.time()
    gcs_uri = upload_blob(GCS_BUCKET, file_path, DESTINATION_BLOB_NAME)
    data_files[i] = extract_data_doc_ai(gcs_uri)
    print("File {} : time {}".format(i, time.time() - t3))
    
t2 = time.time()

File 00_Format1-pepsi.pdf : time 8.29396915435791
File 01_48.pdf : time 9.507373571395874
File 02_2915691946-PROOF_OF_DELIVERY.PDF : time 7.982613801956177
File 03_2952221748-PROOF_OF_DELIVERY.PDF : time 6.374773025512695
File 04_3660595532bol.pdf : time 6.917149305343628
File 05_7969705634.pdf : time 7.169848442077637
File 06_9861340886.pdf : time 7.138998985290527
File 07_BILL OF LADING_1629281603.pdf : time 8.027692794799805
File 08_mar4.pdf : time 8.209413766860962
File 09_PEPSI_4_PROOF_OF_DELIVERY.PDF : time 7.908060789108276
File 10_PEPSI_16-PROOF_OF_DELIVERY.PDF : time 7.868931770324707
File 11_PEPSI_21-PROOF_OF_DELIVERY.PDF : time 6.737341403961182
File 12_PEPSI_35_PROOF_OF_DELIVERY (1).PDF : time 8.81954026222229
File 13_PEPSI_38_PROOF_OF_DELIVERY.PDF : time 7.4525909423828125
File 14_PEPSI_40_PROOF_OF_DELIVERY (1).PDF : time 8.418945789337158
File 15_PEPSI_42_PROOF_OF_DELIVERY.PDF : time 6.895565986633301
File 16_PEPSI_43_PROOF_OF_DELIVERY.PDF : time 7.316099405288696
File 17

In [6]:
print("total time to process {} files is {}".format(len(arr_txt), t2 - t1))

total time to process 21 files is 159.75716423988342


In [7]:
[*data_files]

['00_Format1-pepsi.pdf',
 '01_48.pdf',
 '02_2915691946-PROOF_OF_DELIVERY.PDF',
 '03_2952221748-PROOF_OF_DELIVERY.PDF',
 '04_3660595532bol.pdf',
 '05_7969705634.pdf',
 '06_9861340886.pdf',
 '07_BILL OF LADING_1629281603.pdf',
 '08_mar4.pdf',
 '09_PEPSI_4_PROOF_OF_DELIVERY.PDF',
 '10_PEPSI_16-PROOF_OF_DELIVERY.PDF',
 '11_PEPSI_21-PROOF_OF_DELIVERY.PDF',
 '12_PEPSI_35_PROOF_OF_DELIVERY (1).PDF',
 '13_PEPSI_38_PROOF_OF_DELIVERY.PDF',
 '14_PEPSI_40_PROOF_OF_DELIVERY (1).PDF',
 '15_PEPSI_42_PROOF_OF_DELIVERY.PDF',
 '16_PEPSI_43_PROOF_OF_DELIVERY.PDF',
 '17_PEPSI_PROOF_OF_DELIVERY.PDF',
 '18_2543502120-proof_of_delivery.pdf',
 '19_2690224223-proof_of_delivery.pdf',
 '20_8532313429-proof_of_delivery_0.pdf']

In [8]:
data_files[[*data_files][19]]['raw_text']

b'LADING\nPepsi Beverages Company\nBILL OF\nBOL#: 280767631849\nOrder ID:\nCustomer PO: RAMIC\nAustin, TX Warehouse\n9010 Wall Street\nFrom: 2807 St. Louis, MO Plant\nOne Union \n70 Center Drive\nSt. Louis\nMO \n63120\nShip Date: 09/28/2021 19:12\nTX 78754\nShip\nPallet\nWeight\nExtended\nWeight\n44064\n44684\nPallets\nID\n1,248\nCS\nDescription\n31321 200Z PL 1/245 AQUA WTR\n26.0\npallet-plastic-full pallet (#57582 ):\n44.912\nTotal:\n1,274.0\nGrand Total:\n26.0\nBOL Comment 20oz Aqua support for F1 TR# 15142\nSeal #: 02806092\nCarrier: UNASSIGNED\nLoaded By: Smith, Mathew J.\nDriver Name: UNSSIGNED\n9/30\nPrint Date: 09/28/2021 19:13\n'

In [9]:
data_files[[*data_files][3]]['entities']

{'BOL#: 20210678942': '20210678942',
 'From: 202 Fresno, CA Plant\n1150 East North Ave\nFresno\nCA 93725': '202 Fresno, CA Plant\n1150 East North Ave\nFresno\nCA 93725',
 'Shin Date: 08/06/2021 22:48': '08/06/2021 22:48',
 'Order ID: CC': 'CC',
 'Customer PO: CC': 'CC',
 'To: 222 Greeley, CO Warehouse\n2323 117th Avenue': '222 Greeley, CO Warehouse\n2323 117th Avenue',
 'Greeley\nCO 80634': 'CO 80634',
 'Arrival Date: 08/09/2021 23:59': '08/09/2021 23:59',
 'Total:\n42,553': '42,553',
 'Grand Total:\n22.0': '22.0',
 'BOL Comment; PLCB Trailer# 559': 'PLCB Trailer# 559',
 'Trailer #: CARRIER': 'CARRIER',
 'Seal #: 239942': '239942',
 'Carrier Common Carrier': 'Common Carrier',
 'Loaded By: White, Brandon C.': 'White, Brandon C.',
 'Driver Name: CARRIER-DRIVER': 'CARRIER-DRIVER',
 'Received By:\nDriver Name: CARRIER-DRIVER\nBryan Arkan': 'Driver Name: CARRIER-DRIVER\nBryan Arkan',
 'Print Date: 08/06/2021 19:50': '08/06/2021 19:50'}

In [10]:
def parse_ner_json_to_string(parsed_ner):
    """Serialise parsed entities as ``value|key`` items joined by ``;``.

    Args:
        parsed_ner: Mapping of entity name -> extracted string value.

    Returns:
        A string such as ``"123|PO Number;CC|Customer Po"``, or ``""`` for an
        empty mapping. Items follow the mapping's iteration order.
    """
    # The debug print of the whole mapping was removed: it dumped every parsed
    # document into the cell output on each call.
    return ";".join(value + "|" + key for key, value in parsed_ner.items())

def pepsi_format1_doc_ai_ner_parser(raw_text, entities_json):
    """Pull PO number, customer PO, pickup and drop-off address out of one
    extracted document.

    The PO number and customer PO come from ``entities_json``; the addresses
    (and the PO number when no BOL entity exists) are cut out of ``raw_text``
    between fixed marker strings.

    Args:
        raw_text: Full document text as ``str``.
        entities_json: Mapping of entity text -> mention text. Only keys
            starting with "BOL#:", "BOL #:", "Customer PO:" or "From:" are
            used; non-string values are ignored.

    Returns:
        ``[200, ner_string]`` where ``ner_string`` is produced by
        ``parse_ner_json_to_string`` from the parsed fields (200 is presumably
        a status code for the caller — confirm). A field whose markers are
        missing from ``raw_text`` is reported as "NA".
    """
    # Prefixes of the entity texts that are looked up in entities_json.
    BOL_KEYS = ("BOL#:", "BOL #:")
    PICK_UP_ADDRESS_KEY = "From:"
    CUSTOMER_PO_KEY = "Customer PO:"

    # Field names used in the result.
    PO_NUMBER = "PO Number"
    DROPOFF_ADDRESS = "Dropoff Address"
    PICKUP_ADDRESS = "Pickup Address"
    CUSTOMER_PO_NUMBER = "Customer Po"

    NOT_AVAILABLE = "NA"
    CUSTOMER_PO_LEN_LIMIT = 25   # a customer PO this long or longer is rejected
    ADDRESS_LEN_LIMIT = 90       # an address this long or longer over-captured
    PICKUP_TRAILING_CHARS = 5    # presumably the " Ship" left before " Date: "
    DROPOFF_TRAILING_CHARS = 33  # tail cut from an over-long drop-off capture

    def _between(text, start_key, end_key):
        """Return the stripped text between the first start_key and the first
        end_key, or None when either marker is absent."""
        start_idx = text.find(start_key)
        end_idx = text.find(end_key)
        # str.find returns -1 for "not found"; using that as a slice index
        # silently produced unrelated text, so report the miss instead.
        if start_idx == -1 or end_idx == -1:
            return None
        return text[start_idx + len(start_key):end_idx].strip()

    parsed_data = {}
    pickup_zip = None  # zip code seen in the "From:" entity, if any

    for key, value in entities_json.items():
        if not isinstance(value, str):
            continue  # nothing can be parsed from a non-string value

        # The first BOL entity wins.
        if PO_NUMBER not in parsed_data and key.startswith(BOL_KEYS):
            parsed_data[PO_NUMBER] = value.strip()

        # The first customer PO entity wins as well. Previously the "done"
        # flag was only cleared for over-long values, so a later entity could
        # overwrite a valid customer PO.
        if CUSTOMER_PO_NUMBER not in parsed_data and key.startswith(CUSTOMER_PO_KEY):
            customer_po = value.strip()
            if len(customer_po) >= CUSTOMER_PO_LEN_LIMIT:
                customer_po = NOT_AVAILABLE
            parsed_data[CUSTOMER_PO_NUMBER] = customer_po

        # Remember the zip of the pickup entity (last match wins); it is
        # appended to the pickup address in the over-long case below.
        if key.startswith(PICK_UP_ADDRESS_KEY):
            zip_match = re.search(r'(\s\d{5,})', value)
            if zip_match:
                pickup_zip = zip_match.group(1).strip()

    # No BOL entity: take whatever sits between "BOL" and "From" in the text.
    if PO_NUMBER not in parsed_data:
        bol = _between(raw_text, "BOL", "From")
        parsed_data[PO_NUMBER] = (
            NOT_AVAILABLE if bol is None else bol.replace('#: ', ''))

    def _pickup_address(text):
        """Pickup address from the raw text; None when the markers are
        missing, "NA" when an over-long capture cannot be repaired."""
        captured = _between(text, "From", " Date: ")
        if captured is None:
            return None
        address = captured.replace('\n', ' ').replace(':', '')
        address = address[:-PICKUP_TRAILING_CHARS].strip()
        if len(address) < ADDRESS_LEN_LIMIT:
            return address

        # Over-long capture: retry with the tighter "From:" .. "Order ID:"
        # window and append the zip taken from the entity. Without a zip the
        # original code failed here (unbound variable) and reported "NA".
        if pickup_zip is None:
            return NOT_AVAILABLE
        captured = _between(text, "From:", "Order ID:")
        if captured is None:
            return NOT_AVAILABLE
        return captured.replace('\n', ' ').replace(':', '') + " " + pickup_zip

    pickup_address = _pickup_address(raw_text)
    if not pickup_address:
        # Nothing captured (markers missing or in the wrong order): retry on
        # the single-line text with the "From: " .. "Ship Date:" window.
        retry = _between(raw_text.replace('\n', ' '), "From: ", "Ship Date:")
        pickup_address = NOT_AVAILABLE if retry is None else retry
    parsed_data[PICKUP_ADDRESS] = pickup_address

    def _dropoff_address(text):
        """Drop-off address: the text between "To:" and "Arrival"."""
        captured = _between(text, "To:", "Arrival")
        if captured is None:
            return NOT_AVAILABLE
        address = captured.replace('\n', ' ').strip()
        if len(address) >= ADDRESS_LEN_LIMIT:
            address = address[:-DROPOFF_TRAILING_CHARS].strip()
        return address

    parsed_data[DROPOFF_ADDRESS] = _dropoff_address(raw_text)

    ner_string = parse_ner_json_to_string(parsed_data)
    return [200, ner_string]
    

In [11]:
# Try the parser on a single document (third entry of data_files).
data = data_files[[*data_files][2]]
# NOTE(review): pepsi_format1_doc_ai_ner_parser2 is not defined in any cell
# visible here — the parser defined in this notebook is
# pepsi_format1_doc_ai_ner_parser. On a fresh kernel this call raises
# NameError unless the "2" variant is defined elsewhere; confirm and restore
# or rename.
a = pepsi_format1_doc_ai_ner_parser2(data['raw_text'].decode(), data['entities'])

In [12]:
extract_data = []

In [13]:
# Parse every extracted document and collect the results in extract_data.
# NOTE(review): pepsi_format1_doc_ai_ner_parser2 is not defined in any cell
# visible here (only pepsi_format1_doc_ai_ner_parser is) — this loop depends
# on kernel state and fails with NameError after Restart & Run All unless the
# "2" variant is defined elsewhere.
for file,data in data_files.items():
    a = pepsi_format1_doc_ai_ner_parser2(data['raw_text'].decode(), data['entities'])
    extract_data.append(a)

In [14]:
df = pd.DataFrame(extract_data)

In [15]:
df

Unnamed: 0,PO Number,Customer Po,Pickup Address,Dropoff Address
0,20210678942,CC,"202 Fresno, CA Plant 1150 East North Ave Fresn...","222 Greeley, CO Warehouse 2323 117th Avenue Gr..."
1,OT4946069264,5824766113,"494 Mesquite, TX Plant 4532 Highway 67 E. Mesq...",999 S8261 SAMS CLUB #18-8261 455 31ST. STREET ...
2,512Q0340674,7608385498,512 New River Plant/Wytheville 200 Pepsi Way W...,999 QTGEK 30901 W 185th ST EDGERTON KS 66021
3,20210678942,CC,"202 Fresno, CA Plant 1150 East North Ave Fresn...","222 Greeley, CO Warehouse 2323 117th Avenue Gr..."
4,280767625857,FITZ PATRICK,"2807 St. Louis, MO Plant One Union 70 Center D...","971 Burnsville, MN Plant 11601 12th Avenue So...."
5,29967626906,,"299 Wichita, KS Plant 101 West 48th Street Wic...",494 4532 Highway 67 E. Mesquite TX 75150
6,190Q0341719,7610101233,"190 Phoenix, AZ Plant 4242 East Raymond Street...",999 QTGTS 410 South 104th Ave. Tolleson AZ 85353
7,280767626741,CADENCE,"2807 St. Louis, MO Plant One Union 70 Center D...","2815 Tupelo, MS Warehouse 620 East President S..."
8,50613657558,,"506 San Antonio, TX Plant 6100 NE Loop 410 San...","2615 Abilene, TX Warehouse 1850 Clack Street A..."
9,231210458749,,"2312 Stone Mountain, GA Plant 1644 Rock Mounta...","275 Cordele, GA Warehouse 501 Burnette Blvd Co..."


In [16]:
df["Pickup Address"][22]

KeyError: 22

In [None]:
df["Dropoff Address"][22]

In [None]:
def get_zip_from_raw_text(text):
    """Find every ``<word> <two capitals> <five digits>`` run in ``text``.

    Newlines are turned into spaces before matching. Returns the list of
    matched strings, or an empty string when nothing matches.
    """
    flattened = text.replace('\n', ' ')
    found = re.findall(r'(\w+\s[A-Z]{2}\s\d{5})', flattened)
    return found if found else ""

In [None]:
data_files[[*data_files][24]]['raw_text']

In [None]:
data_files[[*data_files][24]]['entities']