In [15]:
import os
import json
import re
import pandas as pd
import numpy as np

from google.cloud import storage
from google.cloud import documentai_v1beta2 as documentai

# os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=r"C:\Users\ksaluj1\Documents\work\dev\gv\iacoe_sa.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]='/Users/jghosh2/Documents/my-notebook/BOL_CODE_REF/iacoe_sa.json'
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000

def parse_ner_json_to_string(parsed_ner):
    ner_string = ""
    flag = True
    for key, value in parsed_ner.items():
        if flag:
            ner_string = ner_string + value + "|" + key
            flag = False
        else:
            ner_string = ner_string + ";" + value + "|" + key
    return ner_string

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Uploads a file to the bucket."""
    file_name = source_file_name.split('/')[-1]
    destination_blob_name = destination_blob_name + "/" + file_name
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_filename(source_file_name)
    gs_bucket_url = "gs://iacoe-cloud-bucket"
    gcs_uri = gs_bucket_url + '/' + destination_blob_name
    return gcs_uri

def extract_data_doc_ai(input_uri, project_id='concrete-tuner-241417'):
         # input_uri='gs://iacoe-cloud-bucket/k'):
    """Process a single document with the Document AI API, including
    text extraction and entity extraction."""

    client = documentai.DocumentUnderstandingServiceClient()
    gcs_source = documentai.types.GcsSource(uri=input_uri)

    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type='application/pdf')

    form_extraction_params = documentai.types.FormExtractionParams(
        enabled=True)
    table_extraction_params = documentai.types.TableExtractionParams(
        enabled=True)
    entity_extraction_params = documentai.types.EntityExtractionParams(
        enabled=True)

    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        parent=parent,
        input_config=input_config,
        form_extraction_params=form_extraction_params,
        table_extraction_params=table_extraction_params,
        entity_extraction_params=entity_extraction_params)

    document = client.process_document(request=request)
    data = {}
    data["raw_text"] = (document.text).encode('utf-8')

    def _get_text(el):
        """Convert text offset indexes into text snippets.
        """
        response = ''
        for segment in el.text_anchor.text_segments:
            start_index = segment.start_index
            end_index = segment.end_index
            response += document.text[start_index:end_index]
        return response
    ent_dict = {}
    for entity in document.entities:
        ent_dict[_get_text(entity)] = (entity.mention_text)

    data["entities"] = ent_dict
    form_ent = {}
    for page in document.pages:
        for form_field in page.form_fields:
            key = (_get_text(form_field.field_name), form_field.field_name.confidence)
            value = (_get_text(form_field.field_value), form_field.field_value.confidence)
            form_ent[key] = value
    data["form_data"] = form_ent
    
    out_tables = []
    for page in document.pages:
        for table_num, table in enumerate(page.tables):
            out_table = []
            for row_num, row in enumerate(table.header_rows):
                out_header_row = [_get_text(cell.layout) for cell in row.cells]
                out_table.append(out_header_row)
            for row_num, row in enumerate(table.body_rows):
                out_body_row = [_get_text(cell.layout) for cell in row.cells]
                out_table.append(out_body_row)
            out_tables.append(out_table)

    data["tables"] = out_tables
    
    labels = {}
    for label in document.labels:
        labels[label.name] = label.confidence
    data["labels"] = labels

    return data
def doc_ai_data_extractor(file_base64, doctype):

    file = base64.b64decode(file_base64)

    # Upload file to GCS Bucket and get GCS Uri for Doc AI
    destination_blob_name = DESTINATION_BLOB_NAME + '/' + doctype
    try:
        gcs_uri = upload_blob(GCS_BUCKET, file, destination_blob_name)
        print(gcs_uri)
    except:
        tb = traceback.format_exc()
        return [DOC_UPLOAD_EXCEPTION + "\n" + tb]
    try:
        data = extract_data_doc_ai(gcs_uri)
        print(data)
    except:
        tb = traceback.format_exc()
        return [DOC_AI_EXCEPTION + "\n" + tb]
    return [200, data]

In [16]:
pwd

'/Users/jghosh2/Documents/my-notebook/BOL_CODE_REF/02_NER_Bol_processing/01_Format_1/04_jupyter_notebook'

In [17]:
import time

In [18]:
data_files = {}

In [22]:
GCS_BUCKET = "iacoe-cloud-bucket"
DESTINATION_BLOB_NAME = "iacoe_ocr_ml"
#file_loc = 'C:/Users/hsingh151/My_Work/BOL/02_NER_Bol_processing/01_Format_1/02_samples/pdf_samples/'
file_loc='/Users/jghosh2/Documents/my-notebook/BOL_CODE_REF/02_NER_Bol_processing/01_Format_1/Format1/pdf_samples/'

arr_txt =  [x for x in os.listdir(file_loc) if x.endswith(".pdf") or x.endswith(".PDF")]
print(arr_txt)
t1 = time.time()

for i in arr_txt:
    file_path = file_loc + i
    t3 = time.time()
    gcs_uri = upload_blob(GCS_BUCKET, file_path, DESTINATION_BLOB_NAME)
    print(gcs_uri)
    data_files[i] = extract_data_doc_ai(gcs_uri)
    print(data_files[i]['raw_text'])
    print("File {} : time {}".format(i, time.time() - t3))
    
t2 = time.time()

['48.pdf']
gs://iacoe-cloud-bucket/iacoe_ocr_ml/48.pdf
{'raw_text': b"TRUCK#182142\nLOAD#0528065\nPepsi Beverages Company\nBILL OF LADING\nBOL#: OT4946069264\nFrom: 494 Mesquite, TX Plant\n4532 Highway 67 E.\nMesquite\nTX 75150-\nShip Date: 08/16/2021 14:51\nOrder ID:\nCustomer PO: 5824766113\nTo: 999 S8261 SAMS CLUB #18-8261\n455 31ST. STREET\nKENNER\nLA 70065\nArrival Date: 08/17/2021 04:00\nItem\nCS\n1\n1.0\nDescription\n88973 12OZ CN 36/1CB BRSK SWL\n19716 12OZ CN 36/1CB PEPSI\n106551 16.90Z PL PK 24/1 DT LPT GRN BRY\n95207 16.9OZ PL PK 24/1 DT LPT GT CIT\n95206 - 16.90Z PL PK 24/1 LIT GRN CTRS\n125294 16.90Z PL PK 24/1 LIT LMND\n144297 16.9OZ PL PK 24/1 LIT PCH\n95903 16.9OZ PLPK32/1 AQUA WTR\n82697 6.50Z CN 1/12 STRBK DS\n57933 9.5OZ NR 15/1 FRAP MCH\n120\n60\nCS\nShip\nPallet Extended\nPallets Qty UOMA Weight Weight\nComment\n2.0 150\n2,400\n4,800\n3.0 225 CS 2,355 7,065\n2.0 120\n1,665 3,330\n60 CS 1,790 1,790\n1.0\n60 CS 1,817 1,817\n2.0 120 CS 1,710 3,420\n2.0\nCS\n1,890\n3,7

In [37]:
print("total time to process {} files is {}".format(len(arr_txt), t2 - t1))

total time to process 21 files is 164.14141941070557


In [28]:
data_files['48.pdf']['raw_text']

b"TRUCK#182142\nLOAD#0528065\nPepsi Beverages Company\nBILL OF LADING\nBOL#: OT4946069264\nFrom: 494 Mesquite, TX Plant\n4532 Highway 67 E.\nMesquite\nTX 75150-\nShip Date: 08/16/2021 14:51\nOrder ID:\nCustomer PO: 5824766113\nTo: 999 S8261 SAMS CLUB #18-8261\n455 31ST. STREET\nKENNER\nLA 70065\nArrival Date: 08/17/2021 04:00\nItem\nCS\n1\n1.0\nDescription\n88973 12OZ CN 36/1CB BRSK SWL\n19716 12OZ CN 36/1CB PEPSI\n106551 16.90Z PL PK 24/1 DT LPT GRN BRY\n95207 16.9OZ PL PK 24/1 DT LPT GT CIT\n95206 - 16.90Z PL PK 24/1 LIT GRN CTRS\n125294 16.90Z PL PK 24/1 LIT LMND\n144297 16.9OZ PL PK 24/1 LIT PCH\n95903 16.9OZ PLPK32/1 AQUA WTR\n82697 6.50Z CN 1/12 STRBK DS\n57933 9.5OZ NR 15/1 FRAP MCH\n120\n60\nCS\nShip\nPallet Extended\nPallets Qty UOMA Weight Weight\nComment\n2.0 150\n2,400\n4,800\n3.0 225 CS 2,355 7,065\n2.0 120\n1,665 3,330\n60 CS 1,790 1,790\n1.0\n60 CS 1,817 1,817\n2.0 120 CS 1,710 3,420\n2.0\nCS\n1,890\n3,780\n6.0 360 CS 2,278 13,669\n0.6 180 CS 1,710 1,026\n1.0 126\n2,129 

In [27]:
data_files['48.pdf']['entities']

{'BOL#: OT4946069264': 'OT4946069264',
 'Ship Date: 08/16/2021 14:51': '08/16/2021 14:51',
 'Customer PO: 5824766113': '5824766113',
 'To: 999 S8261 SAMS CLUB #18-8261\n455 31ST. STREET': '999 S8261 SAMS CLUB #18-8261\n455 31ST. STREET',
 'KENNER\nLA 70065': 'LA 70065',
 'Arrival Date: 08/17/2021 04:00': '08/17/2021 04:00',
 'Total: 44,275': '44,275',
 'Grand Total:\n21.0 1,542.0': '21.0 1,542.0',
 'Received By\n44,275\nDate 8-17-21\nManer TR #': '44,275\nDate 8-17-21\nManer TR #',
 'Trailer #: Prointment.\nUNKNOWN Pale\n3482816\nDate': 'Prointment.\nUNKNOWN Pale\n3482816\nDate',
 'Checked By:\nBad Bad\nPallet Comment!\nthrich Good': 'Bad Bad\nPallet Comment!\nthrich Good',
 'Driver Signature:\nES': 'ES',
 'PO 582476643': '582476643',
 "Total Rec'd 21413² 1521": '21413² 1521',
 'BOL Comment: TR# 6250': 'TR# 6250',
 'Carrier: Common Carrier': 'Common Carrier',
 'Loaded By: Torres, Jose J.': 'Torres, Jose J.',
 'Driver Name: UNASSIGNED': 'UNASSIGNED',
 'Print Date: 08/16/2021 14:52': '08

In [29]:
data_files['48.pdf']['form_data']

{('Loaded By: ', 0.9995680451393127): ('Torres, Jose J.\n',
  0.9995680451393127),
 ('Driver Signature:\n', 0.9994579553604126): ('ES\n', 0.9994579553604126),
 ('Driver Name: ', 0.9994081854820251): ('UNASSIGNED\n', 0.9994081854820251),
 ('Ship Date: ', 0.9891992211341858): ('08/16/2021 14:51\n',
  0.9891992211341858),
 ('BOL#: ', 0.9565322399139404): ('OT4946069264\n', 0.9565322399139404),
 ('Arrival Date: ', 0.9493228793144226): ('08/17/2021 04:00\n',
  0.9493228793144226),
 ('Customer PO: ', 0.9470323324203491): ('5824766113\n', 0.9470323324203491),
 ('Print Date: ', 0.9389381408691406): ('08/16/2021 14:52\n',
  0.9389381408691406),
 ('BOL Comment: ', 0.9062824845314026): ('TR# 6250\n', 0.9062824845314026),
 ('Checked By:\n',
  0.8233382105827332): ('Bad Bad\nPallet Comment!\nthrich Good\n', 0.8233382105827332),
 ('PO ', 0.7955166101455688): ('582476643\n', 0.7955166101455688),
 ('Trailer #: ',
  0.7832170724868774): ('Prointment.\nUNKNOWN Pale\n3482816\nDate\n', 0.7832170724868774)

In [30]:
data_files['48.pdf']['tables']

[[['', '', 'Ship\n', '', '', 'Pallet ', 'Extended\n'],
  ['Item\n',
   'Description\n',
   'Pallets ',
   'Qty ',
   'UOMA ',
   'Weight ',
   'Weight\n'],
  ['88973 ',
   '12OZ CN 36/1CB BRSK SWL\n',
   '2.0 ',
   '150\n',
   'CS\n',
   '2,400\n',
   '4,800\n'],
  ['1\n19716 ',
   '12OZ CN 36/1CB PEPSI\n',
   '3.0 ',
   '225 ',
   'CS ',
   '2,355 ',
   '7,065\n'],
  ['106551 ',
   '16.90Z PL PK 24/1 DT LPT GRN BRY\n',
   '2.0 ',
   '120\n',
   '',
   '1,665 ',
   '3,330\n'],
  ['95207 ',
   '16.9OZ PL PK 24/1 DT LPT GT CIT\n',
   '1.0\n',
   '60 ',
   'CS ',
   '1,790 ',
   '1,790\n'],
  ['95206 - ',
   '16.90Z PL PK 24/1 LIT GRN CTRS\n',
   '1.0\n',
   '60 ',
   'CS ',
   '1,817 ',
   '1,817\n'],
  ['125294 ',
   '16.90Z PL PK 24/1 LIT LMND\n',
   '2.0 ',
   '120 ',
   'CS ',
   '1,710 ',
   '3,420\n'],
  ['144297 ',
   '16.9OZ PL PK 24/1 LIT PCH\n',
   '2.0\n',
   '120\n',
   'CS\n',
   '1,890\n',
   '3,780\n'],
  ['95903 ',
   '16.9OZ PLPK32/1 AQUA WTR\n',
   '6.0 ',
   '360 ',
  

In [24]:
data_files[[*data_files][0]]['raw_text']

b"TRUCK#182142\nLOAD#0528065\nPepsi Beverages Company\nBILL OF LADING\nBOL#: OT4946069264\nFrom: 494 Mesquite, TX Plant\n4532 Highway 67 E.\nMesquite\nTX 75150-\nShip Date: 08/16/2021 14:51\nOrder ID:\nCustomer PO: 5824766113\nTo: 999 S8261 SAMS CLUB #18-8261\n455 31ST. STREET\nKENNER\nLA 70065\nArrival Date: 08/17/2021 04:00\nItem\nCS\n1\n1.0\nDescription\n88973 12OZ CN 36/1CB BRSK SWL\n19716 12OZ CN 36/1CB PEPSI\n106551 16.90Z PL PK 24/1 DT LPT GRN BRY\n95207 16.9OZ PL PK 24/1 DT LPT GT CIT\n95206 - 16.90Z PL PK 24/1 LIT GRN CTRS\n125294 16.90Z PL PK 24/1 LIT LMND\n144297 16.9OZ PL PK 24/1 LIT PCH\n95903 16.9OZ PLPK32/1 AQUA WTR\n82697 6.50Z CN 1/12 STRBK DS\n57933 9.5OZ NR 15/1 FRAP MCH\n120\n60\nCS\nShip\nPallet Extended\nPallets Qty UOMA Weight Weight\nComment\n2.0 150\n2,400\n4,800\n3.0 225 CS 2,355 7,065\n2.0 120\n1,665 3,330\n60 CS 1,790 1,790\n1.0\n60 CS 1,817 1,817\n2.0 120 CS 1,710 3,420\n2.0\nCS\n1,890\n3,780\n6.0 360 CS 2,278 13,669\n0.6 180 CS 1,710 1,026\n1.0 126\n2,129 

In [25]:
data_files[[*data_files][0]]['entities']

{'BOL#: OT4946069264': 'OT4946069264',
 'Ship Date: 08/16/2021 14:51': '08/16/2021 14:51',
 'Customer PO: 5824766113': '5824766113',
 'To: 999 S8261 SAMS CLUB #18-8261\n455 31ST. STREET': '999 S8261 SAMS CLUB #18-8261\n455 31ST. STREET',
 'KENNER\nLA 70065': 'LA 70065',
 'Arrival Date: 08/17/2021 04:00': '08/17/2021 04:00',
 'Total: 44,275': '44,275',
 'Grand Total:\n21.0 1,542.0': '21.0 1,542.0',
 'Received By\n44,275\nDate 8-17-21\nManer TR #': '44,275\nDate 8-17-21\nManer TR #',
 'Trailer #: Prointment.\nUNKNOWN Pale\n3482816\nDate': 'Prointment.\nUNKNOWN Pale\n3482816\nDate',
 'Checked By:\nBad Bad\nPallet Comment!\nthrich Good': 'Bad Bad\nPallet Comment!\nthrich Good',
 'Driver Signature:\nES': 'ES',
 'PO 582476643': '582476643',
 "Total Rec'd 21413² 1521": '21413² 1521',
 'BOL Comment: TR# 6250': 'TR# 6250',
 'Carrier: Common Carrier': 'Common Carrier',
 'Loaded By: Torres, Jose J.': 'Torres, Jose J.',
 'Driver Name: UNASSIGNED': 'UNASSIGNED',
 'Print Date: 08/16/2021 14:52': '08

In [166]:
def parse_ner_json_to_string(parsed_ner):
    ner_string = ""
    flag = True
    print(parsed_ner)
    for key, value in parsed_ner.items():
        if flag:
            ner_string = ner_string + value + "|" + key
            flag = False
        else:
            ner_string = ner_string + ";" + value + "|" + key
    return ner_string

def pepsi_format1_doc_ai_ner_parser(raw_text, entities_json):
    parsed_data = {}
    
    #Look up keys
    BOL_KEY = "BOL#:"
    BOL_KEY2 = "BOL #:"
    DROPOFF_LOCATION_KEY = "To:"
    PICK_UP_ADDRESS_KEY = "From:"
    
    # Entities to return
    PO_NUMBER = "PO Number"
    DROPOFF_ADDRESS = "Dropoff Address"
    PICKUP_ADDRESS = "Pickup Address"
    
    PO_FLAG = True
    PICKUP_FLAG = True
    DROPOFF_FLAG = True
    
    for key, value in entities_json.items():
        
        if key.startswith(BOL_KEY) or key.startswith(BOL_KEY2):
            if PO_FLAG:
                parsed_data[PO_NUMBER] = value
                PO_FLAG = False
                
    def get_zip_from_raw_text(text):
        pattern = re.compile(r'(\w+\s[A-Z]{2}\s\d{5})')
        match = pattern.findall(text.replace('\n', ' '))
        #print(match)
        if match:
            return match
        else:
            return ""
        
            
    def get_bol_number_from_raw_text(text):
        search_key = "BOL"
        start_idx = text.find(search_key)
        end_idx = text.find('From')
        bol_str = text[start_idx + len(search_key):end_idx].strip()
        try:
            bol = bol_str.strip()
        except:
            bol = ''
            
        return bol
    
    def get_pickup_address_from_raw_text(text):
        text = text.replace('\n', ' ')
        print(text)
        search_key = "From"
        start_idx = text.find(search_key)
        end_idx = text.find(' Date: ')
        pickup_str = text[start_idx + len(search_key):end_idx].strip()
        print("pickup str", pickup_str)
        zip_ = get_zip_from_raw_text(raw_text)
        pickup = pickup_str.strip()
        pickup = pickup.replace('\n', ' ')
        pickup = pickup.replace(':', '')
        zip_check_pickup = get_zip_from_raw_text(pickup)
        print(zip_check_pickup)
        if len(pickup) < 85 and zip_check_pickup:
            print(pickup)
            return pickup
        else:
            print(pickup)
            return pickup[0:49] + ' ' +zip_[0]
        
    def get_dropoff_address_from_raw_text(text):
        zip_ = get_zip_from_raw_text(raw_text)
        search_key = "To:"
        start_idx = text.find(search_key)
        end_idx = text.find('Arrival')
        dropoff_str = text[start_idx + len(search_key):end_idx].strip()
        dropoff = dropoff_str.strip()
        dropoff = dropoff.replace('\n', ' ')
        zip_check_dropoff = get_zip_from_raw_text(dropoff)
        if dropoff and zip_check_dropoff:
            return dropoff
        else:
            return dropoff + ' ' + zip_[1]
        
        try:
            dropoff = dropoff_str.strip()
            dropoff = dropoff.replace('\n', ' ')
        except:
            dropoff = ''
            
        return dropoff 
           
    
    if PO_FLAG:
        bol_num = get_bol_number_from_raw_text(raw_text)
        bol_num = bol_num.replace('#: ', '')
        parsed_data[PO_NUMBER] = bol_num
        
    if PICKUP_FLAG:
        pickup_add = get_pickup_address_from_raw_text(raw_text)
        parsed_data[PICKUP_ADDRESS] = pickup_add
        
    if DROPOFF_FLAG:
        dropadd = get_dropoff_address_from_raw_text(raw_text)
        parsed_data[DROPOFF_ADDRESS] = dropadd
    
            
    #ner_string = parse_ner_json_to_string(parsed_data)
    #return [200, ner_string]
    return parsed_data
    

In [167]:
data = data_files[[*data_files][5]]

In [168]:
a = pepsi_format1_doc_ai_ner_parser(data['raw_text'].decode(), data['entities'])

Pepsi Beverages Company BILL OF LADING BOL#: 29967626906 Order ID: Customer PO: To: 494 4532 Highway 67 E. Mesquite TX 75150 Arrival Date: 08/18/2021 23:59 From: 299 Wichita, KS Plant 101 West 48th Street Wichita KS 67217 Ship Date: 08/17/2021 19:33 Mesquite, TX Plant 6 Comment Item Description 197330 16OZ CN 1/12 BUBLY BNC TRP BRY 159298 1L PL 1/12 PRM LIFEWTR 160143 7.5OZ CN 15/2 DRPEP 165343 7.5OZ CN 15/2 DT PEPSI ORIG Ship Pallets Qty 0.2 21 9.0 648 1.0 150 1.0 150 11.2 969 12 Pallet UOM Weight CS 1,457 CS 2,016 CS 2,460 CS 2,595 Extended Weight 291 18,144 2,460 2,595 23,490 828 pallet-wood-chep ( #90197): Qty.. Comment Item Description 151578 1.25L PL 1/12 DRPEP 166044 1.25L PL 1/12 DT PEPSI ORIG Ship Pallets 4.0 9.0 180 405 UOM CS CS Total: 24,318 Pallet Extended Weight Weight 1,647 6,588 1,566 14,094 20,682 429 13.0 585 13 pallet-plastic-full pallet (#57582 ): Total: 21,111 Grand Total: 25.0 1,579.0 45,429 BOL Comment: nelson#801 Carrier: Common Carrier Trailer #: UNKNOWN Seal #

In [165]:
data_files[[*data_files][5]]['raw_text']

b'Pepsi Beverages Company\nBILL OF LADING\nBOL#: 29967626906\nOrder ID:\nCustomer PO:\nTo: 494\n4532 Highway 67 E.\nMesquite\nTX 75150\nArrival Date: 08/18/2021 23:59\nFrom: 299\nWichita, KS Plant\n101 West 48th Street\nWichita\nKS 67217\nShip Date: 08/17/2021 19:33\nMesquite, TX Plant\n6\nComment\nItem\nDescription\n197330 16OZ CN 1/12 BUBLY BNC TRP BRY\n159298 1L PL 1/12 PRM LIFEWTR\n160143 7.5OZ CN 15/2 DRPEP\n165343 7.5OZ CN 15/2 DT PEPSI ORIG\nShip\nPallets Qty\n0.2\n21\n9.0\n648\n1.0 150\n1.0\n150\n11.2 969\n12\nPallet\nUOM Weight\nCS 1,457\nCS 2,016\nCS 2,460\nCS 2,595\nExtended\nWeight\n291\n18,144\n2,460\n2,595\n23,490\n828\npallet-wood-chep ( #90197):\nQty..\nComment\nItem\nDescription\n151578 1.25L PL 1/12 DRPEP\n166044 1.25L PL 1/12 DT PEPSI ORIG\nShip\nPallets\n4.0\n9.0\n180\n405\nUOM\nCS\nCS\nTotal: 24,318\nPallet Extended\nWeight Weight\n1,647 6,588\n1,566 14,094\n20,682\n429\n13.0\n585\n13\npallet-plastic-full pallet (#57582 ):\nTotal:\n21,111\nGrand Total:\n25.0\n1,579

In [25]:
data_files[[*data_files][14]]['raw_text']

b'Pepsi Beverages Company\nBILL OF LADING\nBOL#: 264862237714\nOrder ID:\nCustomer PO:\nTo: 2312 Stone Mountain, GA Plant\n1644 Rock Mountain Blvd\nStone Mountain\nArrival Date: 10/30/2020 23:59\nFrom: 2648 Riviera Beach, FL Plant\n7305 Garden Rd\nRiviera Beach FL 33404\nShip Date: 10/28/2020 14:40\nGA 30083\nComment\nItem\n3130\nDescription\n2L PL 1/8S MDEW\nShip\nPallet\nExtended\nPallets Qty\nUOM Weight Weight\n21.0\n1,050 CS 2,050 43,050\n21.0 1050\n43,050\n21\n1,050\npallet-wood-full pallet (#14961):\nTotal:\n44,100\nGrand Total:\n21.0\n1,071.0\n44,100\nBOL Comment: Trailer# 178616\nCarrier: Common Carrier\nTrailer #: UNKNOWN\nSeal #: 00152306\nLoaded By: VPICK\nChecked By: S\nDriver Name: UNASSIGNED\nSSIGNE\nDriver Signature:\nReceived By:\nG\nRA\n10-29-20\nThe property described above has been accepted on the date hereof in apparent good order (except as noted)for carriage subject \nto individually determined rates or contracts agreed to between the carrier and shipper.\n'

In [26]:
extract_data = []

In [27]:
for file,data in data_files.items():
    a = pepsi_format1_doc_ai_ner_parser(data['raw_text'].decode(), data['entities'])
    extract_data.append(a)

pickup str : 202 Fresno, CA Plant
1150 East North Ave
Fresno
CA 93725
Shin
['Fresno CA 93725']
 202 Fresno, CA Plant 1150 East North Ave Fresno CA 93725 Shin
pickup str : 494 Mesquite, TX Plant
4532 Highway 67 E.
Mesquite
TX 75150-
Ship
['Mesquite TX 75150']
 494 Mesquite, TX Plant 4532 Highway 67 E. Mesquite TX 75150- Ship
pickup str : 512
New River Plant/Wytheville
200 Pepsi Way
Wytheville
VA 24382
Ship
['Wytheville VA 24382']
 512 New River Plant/Wytheville 200 Pepsi Way Wytheville VA 24382 Ship
pickup str : 202 Fresno, CA Plant
1150 East North Ave
Fresno
CA 93725
Shin
['Fresno CA 93725']
 202 Fresno, CA Plant 1150 East North Ave Fresno CA 93725 Shin
pickup str : 2807 St. Louis, MO Plant
One Union 70 Center Drive
St. Louis
MO 63120
Ship
['Louis MO 63120']
 2807 St. Louis, MO Plant One Union 70 Center Drive St. Louis MO 63120 Ship
pickup str 


pickup str : 190 Phoenix, AZ Plant
4242 East Raymond Street
Phoenix
AZ 85040
Ship
['Phoenix AZ 85040']
 190 Phoenix, AZ Plant 4242 East Raymo

IndexError: string index out of range

In [28]:
df = pd.DataFrame(extract_data)

In [29]:
df

Unnamed: 0,PO Number,Pickup Address,Dropoff Address
0,20210678942,"202 Fresno, CA Plant 1150 East North Ave Fres...","222 Greeley, CO Warehouse 2323 117th Avenue Gr..."
1,OT4946069264,"494 Mesquite, TX Plant 4532 Highway 67 E. Mes...",999 S8261 SAMS CLUB #18-8261 455 31ST. STREET ...
2,512Q0340674,512 New River Plant/Wytheville 200 Pepsi Way ...,999 QTGEK 30901 W 185th ST EDGERTON KS 66021
3,20210678942,"202 Fresno, CA Plant 1150 East North Ave Fres...","222 Greeley, CO Warehouse 2323 117th Avenue Gr..."
4,280767625857,"2807 St. Louis, MO Plant One Union 70 Center ...","971 Burnsville, MN Plant 11601 12th Avenue So...."
5,29967626906,Mesquite TX 75150,494 4532 Highway 67 E. Mesquite TX 75150
6,190Q0341719,"190 Phoenix, AZ Plant 4242 East Raymond Stree...",999 QTGTS 410 South 104th Ave. Tolleson AZ 85353
7,280767626741,"2807 St. Louis, MO Plant One Union 70 Center ...","2815 Tupelo, MS Warehouse 620 East President S..."
8,50613657558,"506 San Antonio, TX Plant 6100 NE Loop 410 Sa...","2615 Abilene, TX Warehouse 1850 Clack Street A..."
9,231210458749,"2312 Stone Mountain, GA Plant 1644 Rock Mount...","275 Cordele, GA Warehouse 501 Burnette Blvd Co..."


In [158]:
data_files[[*data_files][5]]['raw_text']

b'Pepsi Beverages Company\nBILL OF LADING\nBOL#: 29967626906\nOrder ID:\nCustomer PO:\nTo: 494\n4532 Highway 67 E.\nMesquite\nTX 75150\nArrival Date: 08/18/2021 23:59\nFrom: 299\nWichita, KS Plant\n101 West 48th Street\nWichita\nKS 67217\nShip Date: 08/17/2021 19:33\nMesquite, TX Plant\n6\nComment\nItem\nDescription\n197330 16OZ CN 1/12 BUBLY BNC TRP BRY\n159298 1L PL 1/12 PRM LIFEWTR\n160143 7.5OZ CN 15/2 DRPEP\n165343 7.5OZ CN 15/2 DT PEPSI ORIG\nShip\nPallets Qty\n0.2\n21\n9.0\n648\n1.0 150\n1.0\n150\n11.2 969\n12\nPallet\nUOM Weight\nCS 1,457\nCS 2,016\nCS 2,460\nCS 2,595\nExtended\nWeight\n291\n18,144\n2,460\n2,595\n23,490\n828\npallet-wood-chep ( #90197):\nQty..\nComment\nItem\nDescription\n151578 1.25L PL 1/12 DRPEP\n166044 1.25L PL 1/12 DT PEPSI ORIG\nShip\nPallets\n4.0\n9.0\n180\n405\nUOM\nCS\nCS\nTotal: 24,318\nPallet Extended\nWeight Weight\n1,647 6,588\n1,566 14,094\n20,682\n429\n13.0\n585\n13\npallet-plastic-full pallet (#57582 ):\nTotal:\n21,111\nGrand Total:\n25.0\n1,579

In [106]:
data_files[[*data_files][19]]['entities']

{'Pepsi Beverages Company': 'Company',
 'BOL#: 280767631849': '280767631849',
 'Customer PO: RAMIC': 'RAMIC',
 'From: 2807 St. Louis, MO Plant\nOne Union \n70 Center Drive': '2807 St. Louis, MO Plant\nOne Union \n70 Center Drive',
 'Ship Date: 09/28/2021 19:12': '09/28/2021 19:12',
 'Pallets\n26.0': '26.0',
 'Total:\n44.912': '44.912',
 'Grand Total:\n1,274.0\n26.0': '1,274.0\n26.0',
 '26.0\nGrand Total:\nBOL Comment 20oz Aqua support for F1 TR# 15142': 'Grand Total:\nBOL Comment 20oz Aqua support for F1 TR# 15142',
 'BOL Comment 20oz Aqua support for F1 TR# 15142': '20oz Aqua support for F1 TR# 15142',
 'Carrier: UNASSIGNED': 'UNASSIGNED',
 'Loaded By: Smith, Mathew J.': 'Smith, Mathew J.',
 'Driver Name: UNSSIGNED\n9/30': 'UNSSIGNED\n9/30'}

In [172]:
def parse_ner_json_to_string(parsed_ner):
    ner_string = ""
    flag = True
    print(parsed_ner)
    for key, value in parsed_ner.items():
        if flag:
            ner_string = ner_string + value + "|" + key
            flag = False
        else:
            ner_string = ner_string + ";" + value + "|" + key
    return ner_string

def pepsi_format1_doc_ai_ner_parser1(raw_text, entities_json):
    parsed_data = {}
    
    #Look up keys
    BOL_KEY = "BOL#:"
    BOL_KEY2 = "BOL #:"
    DROPOFF_LOCATION_KEY = "To:"
    PICK_UP_ADDRESS_KEY = "From:"
    CUSTOMER_PO_KEY = "Customer PO:"
    
    # Entities to return
    PO_NUMBER = "PO Number"
    DROPOFF_ADDRESS = "Dropoff Address"
    PICKUP_ADDRESS = "Pickup Address"
    CUSTOMER_PO_NUMBER = "Customer Po"
    
    PO_FLAG = True
    PICKUP_FLAG = True
    DROPOFF_FLAG = True
    CUSTOMER_PO_NUMBER_FLAG = True
    
    for key, value in entities_json.items():
        
        if key.startswith(BOL_KEY) or key.startswith(BOL_KEY2):
            if PO_FLAG:
                parsed_data[PO_NUMBER] = value
                PO_FLAG = False
                
        if key.startswith(CUSTOMER_PO_KEY):
            if CUSTOMER_PO_NUMBER_FLAG:
                parsed_data[CUSTOMER_PO_NUMBER] = value
                CUSTOMER_PO_NUMBER_FLAG = False
                
    def get_zip_from_raw_text(text):
        pattern = re.compile(r'(\w+\s[A-Z]{2}\s\d{5})')
        match = pattern.findall(text.replace('\n', ' '))
        #print(match)
        if match:
            return match
        else:
            return ""
        
            
    def get_bol_number_from_raw_text(text):
        search_key = "BOL"
        start_idx = text.find(search_key)
        end_idx = text.find('From')
        bol_str = text[start_idx + len(search_key):end_idx].strip()
        try:
            bol = bol_str.strip()
        except:
            bol = ''
            
        return bol
    
    
    if PO_FLAG:
        bol_num = get_bol_number_from_raw_text(raw_text)
        bol_num = bol_num.replace('#: ', '')
        parsed_data[PO_NUMBER] = bol_num
    
    def get_pickup_address_from_raw_text(text):
        search_key = "From"
        start_idx = text.find(search_key)
        end_idx = text.find(' Date: ')
        pickup_str = text[start_idx + len(search_key):end_idx].strip()
        #print("pickup str", pickup_str)
        #zip_ = get_zip_from_raw_text(raw_text)
        #zip_pickup = get_zip_from_raw_text(pickup)
        pickup = pickup_str.strip()
        pickup = pickup.replace('\n', ' ')
        pickup = pickup.replace(':', '')
        pickup_address = pickup[:-5].strip()
        # print(len(pickup_address))
        try:
            return pickup_address
        except:
            return "NA"
            
        #print(pickup_address)
        
    if PICKUP_FLAG:
        pickup_add = get_pickup_address_from_raw_text(raw_text)
        parsed_data[PICKUP_ADDRESS] = pickup_add
          
        
    def get_dropoff_address_from_raw_text(text):
        # zip_ = get_zip_from_raw_text(raw_text)
        search_key = "To:"
        start_idx = text.find(search_key)
        end_idx = text.find('Arrival')
        dropoff_str = text[start_idx + len(search_key):end_idx].strip()
        #print(dropoff_str)
        dropoff = dropoff_str.strip()
        dropoff = dropoff.replace('\n', ' ')
        #print(dropoff)
        dropoff_address = dropoff.strip()
        print(len(dropoff_address))
        try:
            return dropoff_address
        except:
            return "NA"
           
        
    if DROPOFF_FLAG:
        dropadd = get_dropoff_address_from_raw_text(raw_text)
        parsed_data[DROPOFF_ADDRESS] = dropadd
    
            
    ner_string = parse_ner_json_to_string(parsed_data)
    return [200, ner_string]
    #return parsed_data
    

In [173]:
data = data_files[[*data_files][18]]

In [174]:
a = pepsi_format1_doc_ai_ner_parser1(data['raw_text'].decode(), data['entities'])

92
{'PO Number': '32410421508', 'Customer Po': 'PLCB 1900', 'Pickup Address': '324 Detroit, MI Plant 1555 Mack Ave Detroit MI Order ID Customer PO PLCB 1900 To 2837 Elyria, OH Warehouse 925 Lorain Blvd. Elyria OH 44035 48207', 'Dropoff Address': '2837 Elyria, OH Warehouse 925 Lorain Blvd. Elyria OH 44035 48207 Ship Date: 09/28/2021 19:00'}


In [175]:
extract_data = []

In [176]:
for file,data in data_files.items():
    a = pepsi_format1_doc_ai_ner_parser1(data['raw_text'].decode(), data['entities'])
    extract_data.append(a)

60
{'PO Number': '20210678942', 'Customer Po': 'CC', 'Pickup Address': '202 Fresno, CA Plant 1150 East North Ave Fresno CA 93725', 'Dropoff Address': '222 Greeley, CO Warehouse 2323 117th Avenue Greeley CO 80634'}
61
{'PO Number': 'OT4946069264', 'Customer Po': '5824766113', 'Pickup Address': '494 Mesquite, TX Plant 4532 Highway 67 E. Mesquite TX 75150-', 'Dropoff Address': '999 S8261 SAMS CLUB #18-8261 455 31ST. STREET KENNER LA 70065'}
44
{'PO Number': '512Q0340674', 'Customer Po': '7608385498', 'Pickup Address': '512 New River Plant/Wytheville 200 Pepsi Way Wytheville VA 24382', 'Dropoff Address': '999 QTGEK 30901 W 185th ST EDGERTON KS 66021'}
60
{'PO Number': '20210678942', 'Customer Po': 'CC', 'Pickup Address': '202 Fresno, CA Plant 1150 East North Ave Fresno CA 93725', 'Dropoff Address': '222 Greeley, CO Warehouse 2323 117th Avenue Greeley CO 80634'}
66
{'PO Number': '280767625857', 'Customer Po': 'FITZ PATRICK', 'Pickup Address': '2807 St. Louis, MO Plant One Union 70 Center Dr

In [147]:
df = pd.DataFrame(extract_data)

In [152]:
df["Pickup Address"][18]

'324 Detroit, MI Plant 1555 Mack Ave Detroit MI Order ID Customer PO PLCB 1900 To 2837 Elyria, OH Warehouse 925 Lorain Blvd. Elyria OH 44035 48207'

In [151]:
df["Dropoff Address"]

0     222 Greeley, CO Warehouse 2323 117th Avenue Gr...
1     999 S8261 SAMS CLUB #18-8261 455 31ST. STREET ...
2          999 QTGEK 30901 W 185th ST EDGERTON KS 66021
3     222 Greeley, CO Warehouse 2323 117th Avenue Gr...
4     971 Burnsville, MN Plant 11601 12th Avenue So....
5              494 4532 Highway 67 E. Mesquite TX 75150
6      999 QTGTS 410 South 104th Ave. Tolleson AZ 85353
7     2815 Tupelo, MS Warehouse 620 East President S...
8     2615 Abilene, TX Warehouse 1850 Clack Street A...
9     275 Cordele, GA Warehouse 501 Burnette Blvd Co...
10    192 Harrison, AR Warehouse 229 Industrial Park...
11    497 Houston, TX Plant 9300 Laporte Freeway Hou...
12    2642 Maumelle, AR Warehouse 104 Champs BLVD Ma...
13    214 Riverside, CA Plant 6659 Sycamore Canyon B...
14    2312 Stone Mountain, GA Plant 1644 Rock Mounta...
15    506 San Antonio, TX Plant 6100 NE Loop 410 San...
16    506 San Antonio, TX Plant 6100 NE Loop 410 San...
17    2615 Abilene, TX Warehouse 1850 Clack Stre

In [141]:
if key.startswith(PICK_UP_ADDRESS_KEY):
            if PICKUP_FLAG:
                parsed_data[PICKUP_ADDRESS] = value
                PICKUP_FLAG = False
                
        if key.startswith(DROPOFF_LOCATION_KEY):
            if DROPOFF_FLAG:
                parsed_data[DROPOFF_ADDRESS] = value
                DROPOFF_FLAG = False

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 6)

In [None]:
if PICKUP_FLAG:
        pickup_add = get_pickup_address_from_raw_text(raw_text)
        parsed_data[PICKUP_ADDRESS] = pickup_add
        
    if DROPOFF_FLAG:
        dropadd = get_dropoff_address_from_raw_text(raw_text)
        parsed_data[DROPOFF_ADDRESS] = dropadd

In [None]:
def get_pickup_address_from_raw_text(text):
        search_key = "From"
        start_idx = text.find(search_key)
        end_idx = text.find('Ship ')
        pickup_str = text[start_idx + len(search_key):end_idx].strip()
        try:
            pickup = pickup_str.strip()
            #pickup = pickup.replace('\n', ' ')
            pickup = pickup[:-5]
            print(len(pickup))
        except:
            pickup = ''
            
        if len(pickup) < 85:
            return pickup
        else:
            return pickup[0:18]
        
    def get_dropoff_address_from_raw_text(text):
        search_key = "To:"
        start_idx = text.find(search_key)
        end_idx = text.find('Arrival')
        dropoff_str = text[start_idx + len(search_key):end_idx].strip()
        try:
            dropoff = dropoff_str.strip()
            dropoff = dropoff.replace('\n', ' ')
            #dropoff = dropoff[:-5]
            print(len(dropoff))
        except:
            dropoff = ''
            
        return dropoff 

In [None]:
def get_dropoff_address_from_raw_text(text):
        zip_ = get_zip_from_raw_text(raw_text)
        search_key = "To:"
        start_idx = text.find(search_key)
        end_idx = text.find('Arrival')
        dropoff_str = text[start_idx + len(search_key):end_idx].strip()
        print(dropoff_str)
        dropoff = dropoff_str.strip()
        dropoff = dropoff.replace('\n', ' ')
        zip_check_dropoff = get_zip_from_raw_text(dropoff)
        if dropoff and zip_check_dropoff:
            return dropoff
        else:
            return dropoff + ' ' + zip_[1]
        
        try:
            dropoff = dropoff_str.strip()
            dropoff = dropoff.replace('\n', ' ')
        except:
            dropoff = ''
            
        return dropoff 