In [1]:
import os
import json
import re
import pandas as pd
import numpy as np

from google.cloud import storage
from google.cloud import documentai_v1beta2 as documentai

# os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=r"C:\Users\ksaluj1\Documents\work\dev\gv\iacoe_sa.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="C:/Users/hsingh151/My_Work/Uber_IACOE_OCR_ML_Pipeline_v1.1.0/iacoe_sa.json"
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000

def parse_ner_json_to_string(parsed_ner):
    ner_string = ""
    flag = True
    for key, value in parsed_ner.items():
        if flag:
            ner_string = ner_string + value + "|" + key
            flag = False
        else:
            ner_string = ner_string + ";" + value + "|" + key
    return ner_string

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Uploads a file to the bucket."""
    file_name = source_file_name.split('/')[-1]
    destination_blob_name = destination_blob_name + "/" + file_name
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_filename(source_file_name)
    gs_bucket_url = "gs://iacoe-cloud-bucket"
    gcs_uri = gs_bucket_url + '/' + destination_blob_name
    return gcs_uri

def extract_data_doc_ai(input_uri, project_id='concrete-tuner-241417'):
         # input_uri='gs://iacoe-cloud-bucket/k'):
    """Process a single document with the Document AI API, including
    text extraction and entity extraction."""

    client = documentai.DocumentUnderstandingServiceClient()
    gcs_source = documentai.types.GcsSource(uri=input_uri)

    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type='application/pdf')

    form_extraction_params = documentai.types.FormExtractionParams(
        enabled=True)
    table_extraction_params = documentai.types.TableExtractionParams(
        enabled=True)
    entity_extraction_params = documentai.types.EntityExtractionParams(
        enabled=True)

    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        parent=parent,
        input_config=input_config,
        form_extraction_params=form_extraction_params,
        table_extraction_params=table_extraction_params,
        entity_extraction_params=entity_extraction_params)

    document = client.process_document(request=request)
    data = {}
    data["raw_text"] = (document.text).encode('utf-8')

    def _get_text(el):
        """Convert text offset indexes into text snippets.
        """
        response = ''
        for segment in el.text_anchor.text_segments:
            start_index = segment.start_index
            end_index = segment.end_index
            response += document.text[start_index:end_index]
        return response
    ent_dict = {}
    for entity in document.entities:
        ent_dict[_get_text(entity)] = (entity.mention_text)

    data["entities"] = ent_dict
    form_ent = {}
    for page in document.pages:
        for form_field in page.form_fields:
            key = (_get_text(form_field.field_name), form_field.field_name.confidence)
            value = (_get_text(form_field.field_value), form_field.field_value.confidence)
            form_ent[key] = value
    data["form_data"] = form_ent
    
    out_tables = []
    for page in document.pages:
        for table_num, table in enumerate(page.tables):
            out_table = []
            for row_num, row in enumerate(table.header_rows):
                out_header_row = [_get_text(cell.layout) for cell in row.cells]
                out_table.append(out_header_row)
            for row_num, row in enumerate(table.body_rows):
                out_body_row = [_get_text(cell.layout) for cell in row.cells]
                out_table.append(out_body_row)
            out_tables.append(out_table)

    data["tables"] = out_tables
    
    labels = {}
    for label in document.labels:
        labels[label.name] = label.confidence
    data["labels"] = labels

    return data

In [2]:
pwd

'C:\\Users\\hsingh151\\My_Work\\BOL\\02_NER_Bol_processing\\02_Format_2\\04_jupyter_notebook'

In [3]:
import time
import os

In [4]:
data_files = {}

In [5]:
GCS_BUCKET = "iacoe-cloud-bucket"
DESTINATION_BLOB_NAME = "iacoe_ocr_ml"
file_loc = 'C:/Users/hsingh151/My_Work/BOL/02_NER_Bol_processing/02_Format_2/02_samples/pdf_samples/'

arr_txt =  [x for x in os.listdir(file_loc) if x.endswith(".pdf") or x.endswith(".PDF")]
t1 = time.time()

for i in arr_txt:
    file_path = file_loc + i
    t3 = time.time()
    gcs_uri = upload_blob(GCS_BUCKET, file_path, DESTINATION_BLOB_NAME)
    data_files[i] = extract_data_doc_ai(gcs_uri)
    print("File {} : time {}".format(i, time.time() - t3))
    
t2 = time.time()

File 00_Format2 & Packing List.pdf : time 10.550351858139038
File 01_.32.pdf : time 9.005488395690918
File 02.pdf : time 8.425299167633057
File 03_.ll.pdf : time 9.526931762695312
File 04_PEPSI_5_PROOF_OF_DELIVERY.PDF : time 9.005971670150757
File 05_PEPSI_29_PROOF_OF_DELIVERY.PDF : time 16.507134199142456
File 06.pdf : time 8.904397010803223
File 07_done.pdf : time 8.61620044708252
File 08_pepsi.pdf : time 8.23430871963501
File 09.pdf : time 11.66556978225708
File 10_2_PEPSI-PROOF_OF_DELIVERY.PDF.pdf : time 8.099743127822876
File 11_PEPSI_33_-PROOF_OF_DELIVERY.PDF : time 10.888596057891846
File 12.pdf : time 8.243054628372192
File 13_5956134233.pdf : time 7.969057321548462
File 14.pdf : time 12.593974828720093
File 15_PEPSI_44_PROOF_OF_DELIVERY.PDF : time 8.543724536895752
File 16.PDF : time 13.74306607246399
File 17_6408420256-proof_of_delivery.pdf : time 13.148607015609741
File 18_8750899445b.pdf : time 8.090185165405273
File 19_9663840468-PROOF_OF_DELIVERY.pdf : time 8.842718839645

In [6]:
data_files[[*data_files][40]]['raw_text']

b'72\nPage 1\nDate: 8/11/2021 0:05:33\nBILL OF LADING\nSHIP FROM\nDocument Number:\n34844450248\nName: 3484-QTG TACOMA SC\nAddress: 2309 MILWAUKEE WAY\nAppt: 8/10/21 8:00 PM\nCity/State/Zip: TACOMA, WA 98421-2709\nCheckin: 8/10/21 7:13 PM\nLoaded: 8/10/21 11:49 PM\nSID/BOL#: 57088785\n(402) 34844450248\nFOB:\nDispatch: 8/11/21 12:05 AM\nSHIP TO\nCarrier Name: PEPSI LOGISTICS CO INC\nMOS: T\nName: FRED MEYER\nTrailer Number: LIVE775PLCB\nAddress: 349 VALLEY AVE NW\nSeal number(s): 646632\nCity/State/Zip: PUYALLUP, WA 98371-3314\nSCAC: PLCB\nCAR MOVE: 57088785\nID: C10001238\nFOB:\nPro Number:\nLOAD SEQ:\nTHIRD PARTY FREIGHT CHARGES BILL TO:\nFreight Charge (freight charges are prepaid unless marked\nName: PEPSICO C/O CASS INFO. SERVICES\notherwise)\nAddress: PO BOX 17608\nPrepaid\nCollect\n3rd Party\nCity/State/Zip: ST LOUIS, MO 63178-7608\nMaster Bill of Lading: with attached underlying Bills of\n(check box)\nLading\nSPECIAL INSTRUCTIONS: OTHERS-Carriers are to use One Network for appt

In [7]:
data_files[[*data_files][1]]['entities']

{'Date: 8/9/2021 19:12:26': '8/9/2021 19:12:26',
 'Document Number:\n32165665361': '32165665361',
 'Name: 3216-QTG-ATLANTA SC\nAddress: 747 DOUGLAS HILL BLVD': '3216-QTG-ATLANTA SC\nAddress: 747 DOUGLAS HILL BLVD',
 'Appt: 8/9/21 8:00 PM': '8/9/21 8:00 PM',
 'Checkin: 8/9/21 5:00 PM': '8/9/21 5:00 PM',
 'City/State/Zip: LITHIA SPRINGS, GA 30122-3606': 'LITHIA SPRINGS, GA 30122-3606',
 'Loaded: 8/9/21 6:34 PM': '8/9/21 6:34 PM',
 'SID/BOL#: 57129480': '57129480',
 'Dispatch: 8/9/21 7:12 PM': '8/9/21 7:12 PM',
 'Carrier Name: PEPSI LOGISTICS CO INC': 'PEPSI LOGISTICS CO INC',
 'Name: PBG-FT SMITH': 'PBG-FT SMITH',
 'Address: 3700 S ZERO': '3700 S ZERO',
 'City/State/Zip: FORT SMITH, AR 72908-6915': '72908-6915',
 'SCAC: PLCB': 'PLCB',
 'CAR MOVE: 57129480': '57129480',
 'ID: C10004506': 'C10004506',
 'Name: PEPSICO C/O CASS INFO. SERVICES': 'PEPSICO C/O CASS INFO. SERVICES',
 'Address: PO BOX 17608': 'PO BOX 17608',
 'City/State/Zip: ST LOUIS, MO 63178-7608': 'ST LOUIS, MO 63178-7608',
 

In [8]:
def parse_ner_json_to_string(parsed_ner):
    ner_string = ""
    flag = True
    print(parsed_ner)
    for key, value in parsed_ner.items():
        if flag:
            ner_string = ner_string + value + "|" + key
            flag = False
        else:
            ner_string = ner_string + ";" + value + "|" + key
    return ner_string

In [166]:
def pepsi_format2_doc_ai_ner_parser(raw_text, entities_json):
    parsed_data = {}
    
    # Entities to return
    PO_NUMBER = "PO Number"
    DROPOFF_ADDRESS = "Dropoff Address"
    PICKUP_ADDRESS = "Pickup Address"
    
    PO_FLAG = True
    PICKUP_FLAG = True
    DROPOFF_FLAG = True
    
    def get_bol_number_from_raw_text(text):
        start_search_key = 'CUSTOMER'
        end_search_key = 'ADDITIONAL SHIPPER INFO'
        start_idx = text.find(start_search_key)
        end_idx = text.find(end_search_key)
        search_str = text[start_idx + len(start_search_key):end_idx].strip()
        bol_list = re.findall(r'\b\d{6,}\b', search_str)
        if len(bol_list) >= 1:
            bol_num = bol_list[-1].strip()
            return bol_num
        elif len(bol_list) == 0:
            start_search_key = 'CUSTOMER'
            end_search_key = 'GRAND TOTAL'
            start_idx = text.find(start_search_key)
            end_idx = text.find(end_search_key)
            search_str = text[start_idx + len(start_search_key):end_idx].strip()
            po_num_list =[]
            data = search_str.split('\n')
            for po in data:
                if len(po)>= 5 and po[:1].isdigit():
                    if (' ' in po.strip()):
                        continue
                    po_num_list.append(po.strip())
            if len(po_num_list) == 1:
                return(po_num_list[0])
            else:
                return('@'.join(po_num_list))
            
            
    if PO_FLAG:
        bol_number = get_bol_number_from_raw_text(raw_text)
        parsed_data[PO_NUMBER] = bol_number
        
    # Checking Po Number in some extra cases
    if parsed_data[PO_NUMBER] == '':
        start_search_key = 'CUSTOMER'
        end_search_key = 'GRAND TOTAL'
        start_idx = raw_text.find(start_search_key)
        end_idx = raw_text.find(end_search_key)
        search_str = raw_text[start_idx + len(start_search_key):end_idx].strip()
            
        po_key = 'S-'
        po_idx = search_str.find(po_key)
        po_str = search_str[po_idx + len(po_key): po_idx + len(po_key) + 21].strip()
        po_list = po_str.split('\n')
        for po_num in po_list:
            if (len(po_num)>= 5 and not po_num[:1].isdigit()):
                if (' ' in po_num.strip()):
                    continue
                po_number = po_num.strip() 
                parsed_data[PO_NUMBER] = po_number
                
    if parsed_data[PO_NUMBER] == '':
        po_key = 'CUSTOMER ORDER NUMBER'
        po_idx = raw_text.find(po_key)
        po_str = raw_text[po_idx + len(po_key): po_idx + len(po_key) + 10].strip()
        po_list = po_str.split('\n')
        for po_num in po_list:
            if (len(po_num)>= 5 and not po_num[:1].isdigit()):
                if (' ' in po_num.strip() or '#' in po_num.strip()):
                    continue
                po_number = po_num.strip() 
                parsed_data[PO_NUMBER] = po_number
                
                     
    def get_pickup_address_from_raw_text(text):
        start_search_key = 'SHIP FROM'
        end_search_key = 'SHIP TO'        
        start_idx = text.find(start_search_key)
        end_idx = text.find(end_search_key)
        search_str = text[start_idx + len(start_search_key):end_idx].strip()
        
        name_key = 'Name:'
        name_idx = search_str.find(name_key)
        name_str = search_str[name_idx + len(name_key): name_idx + len(name_key) + 50]
        name_list = name_str.split('\n')
        name = name_list[0].strip()
        
        address_key = 'Address:'
        address_idx = search_str.find(address_key)
        address_str = search_str[address_idx + len(address_key): address_idx + len(address_key) + 50]
        address_list = address_str.split('\n')
        address = address_list[0].strip()
        
        city_state_zip_key = 'City/State/Zip:'
        city_state_zip_idx = search_str.find(city_state_zip_key)
        city_state_zip_str = search_str[city_state_zip_idx + len(city_state_zip_key): city_state_zip_idx + len(city_state_zip_key) + 50]
        city_state_zip_list = city_state_zip_str.split('\n')
        city_state_zip = city_state_zip_list[0].strip()
        
        pick_up_address = name + address + city_state_zip
        if pick_up_address:
            return pick_up_address
        else:
            return ''
        
    if PICKUP_FLAG:
        pickup_add = get_pickup_address_from_raw_text(raw_text)
        parsed_data[PICKUP_ADDRESS] = pickup_add
        PICKUP_FLAG = False
 
    def get_dropoff_address_from_raw_text(text):
        start_search_key = 'SHIP TO'
        end_search_key = 'THIRD PARTY FREIGHT'        
        start_idx = text.find(start_search_key)
        end_idx = text.find(end_search_key)
        search_str = text[start_idx + len(start_search_key):end_idx].strip()
        
        
        def get_name(search_str):
            start_search_key = 'Carrier Name:'
            end_search_key = 'THIRD PARTY FREIGHT'
            start_idx = search_str.find(start_search_key)
            end_idx = search_str.find(end_search_key)
            name_string = search_str[start_idx + len(start_search_key):end_idx].strip()
            name_key = 'Name:'
            name_idx = name_string.find(name_key)
            name_str = name_string[name_idx + len(name_key): name_idx + len(name_key) + 50]
            name_list = name_str.split('\n')
            name_ = name_list[0].strip()
            return name_
        
        name = get_name(search_str)
    
        address_key = 'Address:'
        address_idx = search_str.find(address_key)
        address_str = search_str[address_idx + len(address_key): address_idx + len(address_key) + 50]
        address_list = address_str.split('\n')
        address = address_list[0].strip()
        
        city_state_zip_key = 'City/State/Zip:'
        city_state_zip_idx = search_str.find(city_state_zip_key)
        city_state_zip_str = search_str[city_state_zip_idx + len(city_state_zip_key): city_state_zip_idx + len(city_state_zip_key) + 50]
        city_state_zip_list = city_state_zip_str.split('\n')
        city_state_zip = city_state_zip_list[0].strip()
        
        drop_off_address = name + address + city_state_zip
        if drop_off_address:
            return drop_off_address
        else:
            return ''
        
    if DROPOFF_FLAG:
        dropadd = get_dropoff_address_from_raw_text(raw_text)
        parsed_data[DROPOFF_ADDRESS] = dropadd
        DROPOFF_FLAG = False
    
            
    ner_string = parse_ner_json_to_string(parsed_data)
    return [200, ner_string]


In [167]:
data = data_files[[*data_files][24]]

In [168]:
a = pepsi_format2_doc_ai_ner_parser(data['raw_text'].decode(), data['entities'])

{'PO Number': 'SNN7ADVO', 'Pickup Address': '3399-QUAKER-CARLISLE SC1200 DISTRIBUTION DRCARLISLE, PA 17013-7456', 'Dropoff Address': 'AMAZON.COM SORT GREENWOOD IND91151 S GRAHAM RDGREENWOOD, IN 46143-7830'}


In [169]:
a

[200,
 'SNN7ADVO|PO Number;3399-QUAKER-CARLISLE SC1200 DISTRIBUTION DRCARLISLE, PA 17013-7456|Pickup Address;AMAZON.COM SORT GREENWOOD IND91151 S GRAHAM RDGREENWOOD, IN 46143-7830|Dropoff Address']

In [170]:
extract_data = []

In [171]:
for file,data in data_files.items():
    a = pepsi_format2_doc_ai_ner_parser(data['raw_text'].decode(), data['entities'])
    extract_data.append(a)

{'PO Number': 'BXGSTION', 'Pickup Address': '3264-QTG-TRACY SC1565 N MACARTHUR DRIVETRACY, CA 95376-2839', 'Dropoff Address': 'I LOGISTICS CO INC24300 NANDINA AVEMORENO VALLEY, CA 92551-9534'}
{'PO Number': '43571@00050415', 'Pickup Address': '3216-QTG-ATLANTA SC747 DOUGLAS HILL BLVDLITHIA SPRINGS, GA 30122-3606', 'Dropoff Address': 'PBG-FT SMITH3700 S ZERO STFORT SMITH, AR 72908-6915'}
{'PO Number': '00211738', 'Pickup Address': '3928-PCNA OXNARD DC2100 EASTMAN AVEOXNARD, CA 93030-7591', 'Dropoff Address': 'PBG-SEATTLE2300 26TH AVESSEATTLE, WA 98144-5339'}
{'PO Number': '002670723432', 'Pickup Address': '3389-QTG-INDIANAPOLIS SC9101 ORLY RDINDIANAPOLIS, IN 46241-9605', 'Dropoff Address': 'COSTCO MW MORRIS DPT #2673800 N DIVISION STMORRIS, IL 60450-9476'}
{'PO Number': '1568928700', 'Pickup Address': '3442-PCNA CARLISLE1301 DISTRIBUTION DRCARLISLE, PA 17013-7457', 'Dropoff Address': "SAM'S CLUB DC #6499140 FLEET DRVILLA RICA, GA 30180-1090"}
{'PO Number': '323912607946', 'Pickup Addres

In [172]:
df = pd.DataFrame(extract_data)

In [173]:
df

Unnamed: 0,0,1
0,200,BXGSTION|PO Number;3264-QTG-TRACY SC1565 N MAC...
1,200,43571@00050415|PO Number;3216-QTG-ATLANTA SC74...
2,200,00211738|PO Number;3928-PCNA OXNARD DC2100 EAS...
3,200,002670723432|PO Number;3389-QTG-INDIANAPOLIS S...
4,200,1568928700|PO Number;3442-PCNA CARLISLE1301 DI...
5,200,323912607946|PO Number;3442-PCNA CARLISLE1301 ...
6,200,2UZ4D7HS@75KY2CVN|PO Number;3264-QTG-TRACY SCA...
7,200,758Z36QF@39515|PO Number;ment Number:t Number:...
8,200,122397|PO Number;ment Number:1635 WESTGATE PKW...
9,200,2UZ4D7HS|PO Number;3264-QTG-TRACY SCAddress:TR...


In [179]:
def pepsi_format2_doc_ai_ner_parser2(raw_text, entities_json):
    parsed_data = {}
    
    # Entities to return
    PO_NUMBER = "PO Number"
    DROPOFF_ADDRESS = "Dropoff Address"
    PICKUP_ADDRESS = "Pickup Address"
    
    PO_FLAG = True
    PICKUP_FLAG = True
    DROPOFF_FLAG = True
    
    def get_bol_number_from_raw_text(text):
        start_search_key = 'CUSTOMER'
        end_search_key = 'ADDITIONAL SHIPPER INFO'
        start_idx = text.find(start_search_key)
        end_idx = text.find(end_search_key)
        search_str = text[start_idx + len(start_search_key):end_idx].strip()
        bol_list = re.findall(r'\b\d{6,}\b', search_str)
        if len(bol_list) >= 1:
            bol_num = bol_list[-1].strip()
            return bol_num
        elif len(bol_list) == 0:
            start_search_key = 'CUSTOMER'
            end_search_key = 'GRAND TOTAL'
            start_idx = text.find(start_search_key)
            end_idx = text.find(end_search_key)
            search_str = text[start_idx + len(start_search_key):end_idx].strip()
            po_num_list =[]
            data = search_str.split('\n')
            for po in data:
                if len(po)>= 5 and po[:1].isdigit():
                    if (' ' in po.strip()):
                        continue
                    po_num_list.append(po.strip())
            if len(po_num_list) == 1:
                return(po_num_list[0])
            else:
                return('@'.join(po_num_list))
            
            
    if PO_FLAG:
        bol_number = get_bol_number_from_raw_text(raw_text)
        parsed_data[PO_NUMBER] = bol_number
        
    if parsed_data[PO_NUMBER] == '':
        start_search_key = 'CUSTOMER'
        end_search_key = 'GRAND TOTAL'
        start_idx = raw_text.find(start_search_key)
        end_idx = raw_text.find(end_search_key)
        search_str = raw_text[start_idx + len(start_search_key):end_idx].strip()
            
        po_key = 'S-'
        po_idx = search_str.find(po_key)
        po_str = search_str[po_idx + len(po_key): po_idx + len(po_key) + 21].strip()
        po_list = po_str.split('\n')
        for po_num in po_list:
            if (len(po_num)>= 5 and not po_num[:1].isdigit()):
                if (' ' in po_num.strip()):
                    continue
                po_number = po_num.strip() 
                parsed_data[PO_NUMBER] = po_number
                
    if parsed_data[PO_NUMBER] == '':
        po_key = 'CUSTOMER ORDER NUMBER'
        po_idx = raw_text.find(po_key)
        po_str = raw_text[po_idx + len(po_key): po_idx + len(po_key) + 10].strip()
        po_list = po_str.split('\n')
        for po_num in po_list:
            if (len(po_num)>= 5 and not po_num[:1].isdigit()):
                if (' ' in po_num.strip() or '#' in po_num.strip()):
                    continue
                po_number = po_num.strip() 
                parsed_data[PO_NUMBER] = po_number
                
                     
    def get_pickup_address_from_raw_text(text):
        start_search_key = 'SHIP FROM'
        end_search_key = 'SHIP TO'        
        start_idx = text.find(start_search_key)
        end_idx = text.find(end_search_key)
        search_str = text[start_idx + len(start_search_key):end_idx].strip()
        
        name_key = 'Name:'
        name_idx = search_str.find(name_key)
        name_str = search_str[name_idx + len(name_key): name_idx + len(name_key) + 50]
        name_list = name_str.split('\n')
        name = name_list[0].strip()
        
        address_key = 'Address:'
        address_idx = search_str.find(address_key)
        address_str = search_str[address_idx + len(address_key): address_idx + len(address_key) + 50]
        address_list = address_str.split('\n')
        address = address_list[0].strip()
        
        city_state_zip_key = 'City/State/Zip:'
        city_state_zip_idx = search_str.find(city_state_zip_key)
        city_state_zip_str = search_str[city_state_zip_idx + len(city_state_zip_key): city_state_zip_idx + len(city_state_zip_key) + 50]
        city_state_zip_list = city_state_zip_str.split('\n')
        city_state_zip = city_state_zip_list[0].strip()
        
        pick_up_address = name + address + city_state_zip
        if pick_up_address:
            return pick_up_address
        else:
            return ''
        
    if PICKUP_FLAG:
        pickup_add = get_pickup_address_from_raw_text(raw_text)
        parsed_data[PICKUP_ADDRESS] = pickup_add
        PICKUP_FLAG = False
 
    def get_dropoff_address_from_raw_text(text):
        start_search_key = 'SHIP TO'
        end_search_key = 'THIRD PARTY FREIGHT'        
        start_idx = text.find(start_search_key)
        end_idx = text.find(end_search_key)
        search_str = text[start_idx + len(start_search_key):end_idx].strip()
        
        
        def get_name(search_str):
            start_search_key = 'Carrier Name:'
            end_search_key = 'THIRD PARTY FREIGHT'
            start_idx = search_str.find(start_search_key)
            end_idx = search_str.find(end_search_key)
            name_string = search_str[start_idx + len(start_search_key):end_idx].strip()
            name_key = 'Name:'
            name_idx = name_string.find(name_key)
            name_str = name_string[name_idx + len(name_key): name_idx + len(name_key) + 50]
            name_list = name_str.split('\n')
            name_ = name_list[0].strip()
            return name_
        
        name = get_name(search_str)
    
        address_key = 'Address:'
        address_idx = search_str.find(address_key)
        address_str = search_str[address_idx + len(address_key): address_idx + len(address_key) + 50]
        address_list = address_str.split('\n')
        address = address_list[0].strip()
        
        city_state_zip_key = 'City/State/Zip:'
        city_state_zip_idx = search_str.find(city_state_zip_key)
        city_state_zip_str = search_str[city_state_zip_idx + len(city_state_zip_key): city_state_zip_idx + len(city_state_zip_key) + 50]
        city_state_zip_list = city_state_zip_str.split('\n')
        city_state_zip = city_state_zip_list[0].strip()
        
        drop_off_address = name + address + city_state_zip
        if drop_off_address:
            return drop_off_address
        else:
            return ''
        
    if DROPOFF_FLAG:
        dropadd = get_dropoff_address_from_raw_text(raw_text)
        parsed_data[DROPOFF_ADDRESS] = dropadd
        DROPOFF_FLAG = False
    
            
    #ner_string = parse_ner_json_to_string(parsed_data)
    #return [200, ner_string]
    #print(parsed_data)
    return parsed_data

In [180]:
extract_data = []

In [181]:
for file,data in data_files.items():
    a = pepsi_format2_doc_ai_ner_parser2(data['raw_text'].decode(), data['entities'])
    extract_data.append(a)

In [182]:
df = pd.DataFrame(extract_data)

In [183]:
df

Unnamed: 0,PO Number,Pickup Address,Dropoff Address
0,BXGSTION,"3264-QTG-TRACY SC1565 N MACARTHUR DRIVETRACY, ...",I LOGISTICS CO INC24300 NANDINA AVEMORENO VALL...
1,43571@00050415,3216-QTG-ATLANTA SC747 DOUGLAS HILL BLVDLITHIA...,"PBG-FT SMITH3700 S ZERO STFORT SMITH, AR 72908..."
2,00211738,"3928-PCNA OXNARD DC2100 EASTMAN AVEOXNARD, CA ...","PBG-SEATTLE2300 26TH AVESSEATTLE, WA 98144-5339"
3,002670723432,3389-QTG-INDIANAPOLIS SC9101 ORLY RDINDIANAPOL...,COSTCO MW MORRIS DPT #2673800 N DIVISION STMOR...
4,1568928700,3442-PCNA CARLISLE1301 DISTRIBUTION DRCARLISLE...,"SAM'S CLUB DC #6499140 FLEET DRVILLA RICA, GA ..."
5,323912607946,3442-PCNA CARLISLE1301 DISTRIBUTION DRCARLISLE...,"PBG-ERIE5820 EVANS ROADERIE, PA 16509"
6,2UZ4D7HS@75KY2CVN,3264-QTG-TRACY SCAddress: 1565 N1565 NTRACY,AMAZON.COM NON-SORT PHOENIX GYR38181 WPEPSI LO...
7,758Z36QF@39515,ment Number:t Number:r:,I LOGISTICS COMPANY INC.3450 E HOLMES RDMEMPHI...
8,122397,"ment Number:1635 WESTGATE PKWY SWATLANTA, GA 3...","WALMART DC #605745346 PARKWAY BLVDROBERT, LA, ..."
9,2UZ4D7HS,3264-QTG-TRACY SCAddress:TRACY,I LOGISTICS COMPANY INC MOS: T8181 WOM NON-SOR...
