# Entity extraction
Run spaCy NER out of the box on OCR text from card scans. Compare the extracted entities to RegEx and LayoutParser.

Mappings - expected entity type for each column
- AGENCY :: `ORG`
- AMOUNT :: `MONEY`
- LOCATION :: `LOC`
- NAME :: `PERSON`

In [1]:
import os
import glob
import re
import pandas as pd
import spacy
from spacy import displacy
import en_core_web_sm
from pandarallel import pandarallel

In [2]:
nlp = en_core_web_sm.load()



In [3]:
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 36 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


Get the text from the file directory into a dataframe

In [4]:
%%time

txt_files = sorted(glob.glob('/nfs/turbo/isr-jtalexnonsen/tesseract_txt/'+'/*.txt'))
# txt_files = txt_files[:10] # subset for testing

name_list = []
content_list = []

for file in txt_files:
    name = os.path.basename(file)
    name_list.append(name)
    with open(file) as f:
        content = f.read().replace('\n',' ') # replace new line character with a space
        content_list.append(content)
           
df = pd.DataFrame()
df['file_name'] = name_list
df['file_content'] = content_list
df

CPU times: user 490 ms, sys: 703 ms, total: 1.19 s
Wall time: 1min 13s


Unnamed: 0,file_name,file_content
0,783095-01-0001.txt,"Aaron, Samuel H. and Pearl C. LHG~L0124 Wichit..."
1,783095-01-0002.txt,
2,783095-01-0003.txt,"Abbenga, Arnold N, & Geraldine LH-31228 Detroi..."
3,783095-01-0004.txt,"Abbey, Leonard Ray and Barbara Joan, LE-6746 T..."
4,783095-01-0005.txt,"Abbey, Newton Warren Atlanta, Ga. Roy D. Werr..."
...,...,...
25739,783095-23-0261.txt,"Zvolanek, Harry & Ruby M. LHG-9598 Wichita, Ka..."
25740,783095-23-0262.txt,"Zwijac, Walter C, & Sara A, LH-3293 Hammond, I..."
25741,783095-23-0263.txt,"Zwinklis, Victor C. LHG-7715 Baltimore, Md. $7..."
25742,783095-23-0264.txt,"Zygler, Francis H, & Margaret T. CC-162 Perth ..."


In [5]:
def extract_org(text):
    doc = nlp(text)
    results = [(ent.text) for ent in doc.ents if ent.label_ == 'ORG']
    return results

def extract_money(text):
    doc = nlp(text)
    results = [(ent.text) for ent in doc.ents if ent.label_ == 'MONEY']
    return results

def extract_loc(text):
    doc = nlp(text)
    results = [(ent.text) for ent in doc.ents if ent.label_ == 'LOC']
    return results

def extract_person(text):
    doc = nlp(text)
    results = [(ent.text) for ent in doc.ents if ent.label_ == 'PERSON']
    return results

In [6]:
df['AGENCY'] = df['file_content'].parallel_apply(extract_org)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=716), Label(value='0 / 716'))), HB…

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceed

In [7]:
df['AMOUNT'] = df['file_content'].parallel_apply(extract_money)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=716), Label(value='0 / 716'))), HB…

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceed

In [8]:
df['LOCATION'] = df['file_content'].parallel_apply(extract_loc)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=716), Label(value='0 / 716'))), HB…

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceed

In [9]:
df['NAME'] = df['file_content'].parallel_apply(extract_person)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=716), Label(value='0 / 716'))), HB…

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceed

In [10]:
df.to_csv('/nfs/turbo/isr-jtalexnonsen/extracts/tesseract_txt_NER.csv', index=False)
df

Unnamed: 0,file_name,file_content,AGENCY,AMOUNT,LOCATION,NAME
0,783095-01-0001.txt,"Aaron, Samuel H. and Pearl C. LHG~L0124 Wichit...","[Kansas City Agency, The Reserve Building & Lo...","[9,000]",[],"[Aaron, Samuel H., Pearl C. LHG~, L0124 Wichita]"
1,783095-01-0002.txt,,[],[],[],[]
2,783095-01-0003.txt,"Abbenga, Arnold N, & Geraldine LH-31228 Detroi...","[Abbenga, Arnold N, & Geraldine LH-31228 Detro...","[4,100]",[],[James T. Barnes & Company]
3,783095-01-0004.txt,"Abbey, Leonard Ray and Barbara Joan, LE-6746 T...",[],"[8,000]",[],"[Abbey, Leonard Ray, Barbara Joan, W. R. Johns..."
4,783095-01-0005.txt,"Abbey, Newton Warren Atlanta, Ga. Roy D. Werr...","[Newton Warren, Roy D. Werren Co.]","[CC-78 $6,500]",[],[Abbey]
...,...,...,...,...,...,...
25739,783095-23-0261.txt,"Zvolanek, Harry & Ruby M. LHG-9598 Wichita, Ka...","[Harry & Ruby M., Serial No, The Coffeyville L...","[3,650]",[],[]
25740,783095-23-0262.txt,"Zwijac, Walter C, & Sara A, LH-3293 Hammond, I...","[Walter C, LH-3293 Hammond, Ind, Irvin Jacobs ...","[5,000]",[],[]
25741,783095-23-0263.txt,"Zwinklis, Victor C. LHG-7715 Baltimore, Md. $7...","[Victor C. LHG-7715 Baltimore, Richmond Agency...",[7],[],[]
25742,783095-23-0264.txt,"Zygler, Francis H, & Margaret T. CC-162 Perth ...","[New York Agency, Union County Trust Company]","[5,250]",[],"[Zygler, Francis H, Margaret T., Perth Amboy, ..."


Workflow for applying NER to Layout Parser output (already structured)

In [11]:
# df = pd.read_csv('/nfs/turbo/isr-jtalexnonsen/extracts/flat_images_all.csv')
# df.info()
# df = df.sample(n=20) # subset for testing

Look up the description of a label in spaCy

In [12]:
# spacy.explain('ORG')

In [13]:
# def extract_entities(text):
#     """
#     If a cell contains one or more entities, extract and store each as a tuple in a new column
#     """
#     doc = nlp(text)
#     for ent in doc.ents:
#         return ent.label_,ent.text

In [14]:
# df['AGENCY_ORG'] = df['AGENCY'].astype(str).parallel_apply(extract_entities)

In [15]:
# df['AMOUNT_MONEY'] = df['AMOUNT'].astype(str).parallel_apply(extract_entities)

In [16]:
# df['LOCATION_GPE'] = df['LOCATION'].astype(str).parallel_apply(extract_entities)

In [17]:
# df['NAME_PERSON'] = df['NAME'].astype(str).parallel_apply(extract_entities)

NameError: name 'extract_entities' is not defined

Compare the entity type of each record to the entity type of its column

In [None]:
# df['AGENCY_ORG'].astype(str).str.contains("ORG") \
# .value_counts().plot(kind = 'barh', title='AGENCY contains ORG label');

In [None]:
# df['AMOUNT_MONEY'].astype(str).str.contains("MONEY") \
# .value_counts().plot(kind = 'barh', title='AMOUNT contains MONEY label');

In [None]:
# df['LOCATION_GPE'].astype(str).str.contains("GPE") \
# .value_counts().plot(kind = 'barh', title='LOCATION contains GPE label');

In [None]:
# df['NAME_PERSON'].astype(str).str.contains("PERSON") \
# .value_counts().plot(kind = 'barh', title='NAME contains PERSON label');

In [None]:
# df.to_csv('/nfs/turbo/isr-jtalexnonsen/extracts/flat_images_all_NER.csv', index=False)