# Entity extraction
Run spaCy NER out of the box on OCR text from card scans. Compare the extracted entities to RegEx and LayoutParser.

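Mappings - the spaCy entity label expected for each column
- AGENCY :: `ORG`
- AMOUNT :: `MONEY`
- ID :: `PRODUCT`
- LOCATION :: `GPE` (`extract_loc` filters on `GPE`; the variant that also accepts `LOC` is commented out)
- NAME :: `PERSON`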

In [1]:
import os
import glob
import re
import pandas as pd
import spacy
from spacy import displacy
import en_core_web_sm
from pandarallel import pandarallel

In [2]:
# Print the installed spaCy version so the outputs below are tied to a specific library release.
print(spacy.__version__)

3.1.1


In [3]:
# Load the en_core_web_sm pipeline once; `nlp` is the shared pipeline object that the demo cells
# and every extract_* function below call.
nlp = en_core_web_sm.load()

In [4]:
# Initialise pandarallel (needed before the `parallel_apply` calls below); progress_bar=True
# displays progress widgets while the workers run.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


Demonstrate that the formatting of the input text (one flattened line vs. the text with its line breaks kept) changes the NER output

In [5]:
# Sample card text flattened onto a single line (spaces where test_2 has line breaks).
test_1 = """Aaron, Samuel H. and Pearl C. LHG~L0124 Wichita, Kansas $9,000. CC-64 Kansas City Agency  The Reserve Building & Loan Association Wichita, Kansas  (VA) FIRST MORTGAGE LOAN GUARANTEED """
print(test_1)

Aaron, Samuel H. and Pearl C. LHG~L0124 Wichita, Kansas $9,000. CC-64 Kansas City Agency  The Reserve Building & Loan Association Wichita, Kansas  (VA) FIRST MORTGAGE LOAN GUARANTEED 


In [6]:
# The same sample card text as test_1, but with its line breaks kept.
test_2 = """Aaron, Samuel H. and Pearl C. LHG~L0124
Wichita, Kansas $9,000.
CC-64
Kansas City Agency

The Reserve Building & Loan Association
Wichita, Kansas

(VA)
FIRST MORTGAGE LOAN GUARANTEED"""
print(test_2)

Aaron, Samuel H. and Pearl C. LHG~L0124
Wichita, Kansas $9,000.
CC-64
Kansas City Agency

The Reserve Building & Loan Association
Wichita, Kansas

(VA)
FIRST MORTGAGE LOAN GUARANTEED


In [7]:
# Run the pipeline on the single-line text and render the recognised entities with displaCy.
print(test_1)
doc_1 = nlp(test_1)
displacy.render(doc_1, style='ent')

Aaron, Samuel H. and Pearl C. LHG~L0124 Wichita, Kansas $9,000. CC-64 Kansas City Agency  The Reserve Building & Loan Association Wichita, Kansas  (VA) FIRST MORTGAGE LOAN GUARANTEED 


In [8]:
# Run the pipeline on the multi-line text and render the recognised entities with displaCy,
# for comparison with the single-line rendering of test_1.
print(test_2)
doc_2 = nlp(test_2)
displacy.render(doc_2, style='ent')

Aaron, Samuel H. and Pearl C. LHG~L0124
Wichita, Kansas $9,000.
CC-64
Kansas City Agency

The Reserve Building & Loan Association
Wichita, Kansas

(VA)
FIRST MORTGAGE LOAN GUARANTEED


Get the text from the file directory into a dataframe

In [9]:
%%time

# Directory holding the Tesseract OCR text files; switch the active line to change environment.
# TXT_DIR = '/nfs/turbo/isr-jtalexnonsen/tesseract_txt' # TURBO
TXT_DIR = '/Users/slafia/Documents/layout-parser/tesseract_txt' # LOCAL

# os.path.join builds the pattern without the doubled slash that string concatenation produced;
# sorting keeps the row order stable from run to run.
txt_files = sorted(glob.glob(os.path.join(TXT_DIR, '*.txt')))
# txt_files = txt_files[:10] # subset for testing

# File names (without the directory), in the same order as txt_files.
name_list = [os.path.basename(path) for path in txt_files]

# Full text of each file. Line breaks are kept as-is rather than replaced with spaces.
content_list = []
for path in txt_files:
    # Explicit encoding so the result does not depend on the machine's locale default.
    with open(path, encoding='utf-8') as f:
        content_list.append(f.read())

# Build the frame in one step: one row per file, columns in a fixed order.
df = pd.DataFrame({'file_name': name_list, 'file_content': content_list})
df

CPU times: user 1.04 s, sys: 2.73 s, total: 3.77 s
Wall time: 12.4 s


Unnamed: 0,file_name,file_content
0,783095-01-0001.txt,"Aaron, Samuel H. and Pearl C. LHG~L0124\nWichi..."
1,783095-01-0002.txt,\n
2,783095-01-0003.txt,"Abbenga, Arnold N, & Geraldine LH-31228\nDetro..."
3,783095-01-0004.txt,"Abbey, Leonard Ray and Barbara Joan, LE-6746\n..."
4,783095-01-0005.txt,"Abbey, Newton Warren\nAtlanta, Ga.\n\nRoy D. W..."
...,...,...
25739,783095-23-0261.txt,"Zvolanek, Harry & Ruby M. LHG-9598\nWichita, K..."
25740,783095-23-0262.txt,"Zwijac, Walter C, & Sara A, LH-3293\nHammond, ..."
25741,783095-23-0263.txt,"Zwinklis, Victor C. LHG-7715\nBaltimore, Md. $..."
25742,783095-23-0264.txt,"Zygler, Francis H, & Margaret T. CC-162\nPerth..."


In [10]:
def _entities_with_labels(text, *labels):
    """Return the text of every entity in `text` whose spaCy label is one of `labels`.

    Shared implementation behind the extract_* functions, which differ only in the
    label they keep. Entities are returned in the order they appear in `doc.ents`.

    Parameters
    ----------
    text : str
        Text to run through the module-level `nlp` pipeline.
    *labels : str
        Entity labels to keep (e.g. 'ORG').

    Returns
    -------
    list of str
        Matching entity texts; an empty list when there are none or when `text`
        is not a string.
    """
    # Missing cells (None / NaN) and other non-string values have nothing to parse.
    if not isinstance(text, str):
        return []
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ in labels]


def extract_org(text):
    """Return the `ORG` entity texts found in `text` (used for the AGENCY column)."""
    return _entities_with_labels(text, 'ORG')


def extract_money(text):
    """Return the `MONEY` entity texts found in `text` (used for the AMOUNT column)."""
    return _entities_with_labels(text, 'MONEY')


def extract_loc(text):
    """Return the `GPE` entity texts found in `text` (used for the LOCATION column)."""
    # Only 'GPE' is kept; pass 'LOC' as an additional label here to accept it as well.
    return _entities_with_labels(text, 'GPE')


def extract_person(text):
    """Return the `PERSON` entity texts found in `text` (used for the NAME column)."""
    return _entities_with_labels(text, 'PERSON')


def extract_id(text):
    """Return the `PRODUCT` entity texts found in `text` (used for the ID column)."""
    return _entities_with_labels(text, 'PRODUCT')

In [11]:
# Store the ORG entities found in each row's text as a list in AGENCY, computed in parallel.
# Note: each extract_* call runs nlp(text) itself, so every one of these column cells
# re-runs the spaCy pipeline over all rows.
df['AGENCY'] = df['file_content'].parallel_apply(extract_org)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6436), Label(value='0 / 6436'))), …

In [12]:
# Store the MONEY entities found in each row's text as a list in AMOUNT.
# NOTE(review): the sample rows displayed further down include fragments such as 'CC-78\n$'
# and '7', so these values need cleaning before being treated as numbers.
df['AMOUNT'] = df['file_content'].parallel_apply(extract_money)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6436), Label(value='0 / 6436'))), …

In [13]:
# Store the PRODUCT entities found in each row's text as a list in ID.
# NOTE(review): in the sample rows displayed further down only the first row has a value
# ('CC-64'); PRODUCT may be a weak match for the card identifiers — confirm on the full data.
df['ID'] = df['file_content'].parallel_apply(extract_id)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6436), Label(value='0 / 6436'))), …

In [14]:
# Store the GPE entities found in each row's text as a list in LOCATION
# (extract_loc keeps 'GPE' only, not 'LOC').
df['LOCATION'] = df['file_content'].parallel_apply(extract_loc)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6436), Label(value='0 / 6436'))), …

In [15]:
# Store the PERSON entities found in each row's text as a list in NAME.
df['NAME'] = df['file_content'].parallel_apply(extract_person)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6436), Label(value='0 / 6436'))), …

In [16]:
# df.to_csv('/nfs/turbo/isr-jtalexnonsen/extracts/tesseract_txt_NER.csv', index=False) # TURBO
# Write the text plus the extracted entity columns to CSV, without the index column.
# NOTE(review): the entity columns hold Python lists, which CSV stores as their string form;
# after read_csv they come back as plain strings and need parsing (e.g. ast.literal_eval)
# if lists are required.
df.to_csv('/Users/slafia/Documents/layout-parser/tesseract_txt_NER.csv', index=False) # LOCAL
df

Unnamed: 0,file_name,file_content,AGENCY,AMOUNT,ID,LOCATION,NAME
0,783095-01-0001.txt,"Aaron, Samuel H. and Pearl C. LHG~L0124\nWichi...",[The Reserve Building & Loan Association],"[9,000]",[CC-64],"[Wichita, Kansas, Kansas City Agency, Wichita,...","[Aaron, Samuel H., Pearl C. LHG, L0124]"
1,783095-01-0002.txt,\n,[],[],[],[],[]
2,783095-01-0003.txt,"Abbenga, Arnold N, & Geraldine LH-31228\nDetro...","[Abbenga, Arnold N, & Geraldine LH-31228, Detr...","[4,100]",[],"[Detroit, Mich., Detroit]","[James T. Barnes & Company, Mich]"
3,783095-01-0004.txt,"Abbey, Leonard Ray and Barbara Joan, LE-6746\n...",[W. R. Johnston & Co.],"[8,000]",[],"[Tulsa, Oklahoma, Oklahoma City, Oklahoma]","[Abbey, Leonard Ray, Barbara Joan]"
4,783095-01-0005.txt,"Abbey, Newton Warren\nAtlanta, Ga.\n\nRoy D. W...","[Newton Warren, Roy D. Werren Co., Inc.]","[CC-78\n$, 6,500]",[],"[Atlanta, Ga., Atlanta, Ga., Va]",[]
...,...,...,...,...,...,...,...
25739,783095-23-0261.txt,"Zvolanek, Harry & Ruby M. LHG-9598\nWichita, K...","[Zvolanek,, Harry & Ruby M. LHG-9598, Kans, Th...","[3,650]",[],"[Wichita, Coffeyville, Kans.]",[]
25740,783095-23-0262.txt,"Zwijac, Walter C, & Sara A, LH-3293\nHammond, ...",[Irvin Jacobs & Company],"[5,000]",[],"[Chicago, Ill., VA]","[Zwijac, Walter C]"
25741,783095-23-0263.txt,"Zwinklis, Victor C. LHG-7715\nBaltimore, Md. $...","[Zwinklis, Victor C. LHG-7715, Richmond Agency...",[7],[],"[Baltimore, Md., Baltimore, Md., VA]",[]
25742,783095-23-0264.txt,"Zygler, Francis H, & Margaret T. CC-162\nPerth...","[Francis H, &, New York Agency, Union County T...","[5,250]",[],[],"[Margaret T., Perth Amboy, Elizabeth, N. J.\n\n]"


Workflow for applying NER to Layout Parser output (already structured). The code cells in this section are currently commented out.

In [17]:
# df = pd.read_csv('/nfs/turbo/isr-jtalexnonsen/extracts/flat_images_all.csv')
# df.info()
# df = df.sample(n=20) # subset for testing

Look up the description of a label in spaCy

In [18]:
# spacy.explain('ORG')

In [19]:
# NOTE(review): this disabled helper returns from inside the loop, so it would yield only the
# first entity (and None when the text has no entities), not "each" entity as the docstring
# says. Collect the (label, text) tuples into a list before re-enabling it.
# def extract_entities(text):
#     """
#     If a cell contains one or more entities, extract and store each as a tuple in a new column
#     """
#     doc = nlp(text)
#     for ent in doc.ents:
#         return ent.label_,ent.text

In [20]:
# df['AGENCY_ORG'] = df['AGENCY'].astype(str).parallel_apply(extract_entities)

In [21]:
# df['AMOUNT_MONEY'] = df['AMOUNT'].astype(str).parallel_apply(extract_entities)

In [22]:
# df['LOCATION_GPE'] = df['LOCATION'].astype(str).parallel_apply(extract_entities)

In [23]:
# df['NAME_PERSON'] = df['NAME'].astype(str).parallel_apply(extract_entities)

Compare the entity type of each record to the entity type of its column

In [24]:
# df['AGENCY_ORG'].astype(str).str.contains("ORG") \
# .value_counts().plot(kind = 'barh', title='AGENCY contains ORG label');

In [25]:
# df['AMOUNT_MONEY'].astype(str).str.contains("MONEY") \
# .value_counts().plot(kind = 'barh', title='AMOUNT contains MONEY label');

In [26]:
# df['LOCATION_GPE'].astype(str).str.contains("GPE") \
# .value_counts().plot(kind = 'barh', title='LOCATION contains GPE label');

In [27]:
# df['NAME_PERSON'].astype(str).str.contains("PERSON") \
# .value_counts().plot(kind = 'barh', title='NAME contains PERSON label');

In [28]:
# df.to_csv('/nfs/turbo/isr-jtalexnonsen/extracts/flat_images_all_NER.csv', index=False)