# NER Models Builder

This notebook compares the effectiveness of different NER libraries. By comparing each library's results to the names authority list, it should be possible to determine which library works best with the corpus.

#### Sources:

Christina, "[Named Entity Recognition in Python with Stanford-NER and Spacy](https://lvngd.com/blog/named-entity-recognition-in-python-with-stanford-ner-and-spacy/)," <i>LVNGD</i>, Accessed 10/04/2020.

In [1]:
# Import necessary libraries.
import re, warnings, glob, csv, sys, os, nltk, spacy
import pandas as pd
import numpy as np
import seaborn as sns
import xml.etree.ElementTree as ET
from itertools import chain
from nltk import word_tokenize, pos_tag, ne_chunk, Tree
from nltk.tag.stanford import StanfordNERTagger
from fuzzywuzzy import fuzz, process


# Declare directory location to shorten filepaths later.
abs_dir = "/Users/quinn.wi/Documents/SemanticData/"


# Import StanfordNER model & jar.
PATH_TO_MODEL = '/Users/quinn.wi/stanfordNLP/stanford-ner-4.0.0/classifiers/english.all.3class.distsim.crf.ser.gz'

PATH_TO_JAR = '/Users/quinn.wi/stanfordNLP/stanford-ner-4.0.0/stanford-ner-4.0.0.jar'

# Instantiate StanfordNERTagger.
tagger = StanfordNERTagger(model_filename = PATH_TO_MODEL,
                           path_to_jar = PATH_TO_JAR,
                           encoding = 'utf-8')

# Instantiate Custom StanfordNER.
custom_model = '/Users/quinn.wi/stanfordNLP/stanford-ner-4.0.0/jqa-ner-model.ser.gz'

custom_ner_tagger = StanfordNERTagger(custom_model,
                                      path_to_jar = PATH_TO_JAR,
                                      encoding='utf8')

# Ignore warnings related to deprecated functions.
warnings.simplefilter("ignore", DeprecationWarning)

# Import spaCy language model.
nlp = spacy.load('en_core_web_sm')

# Import custom spaCy model.
jqa_spacy = spacy.load(abs_dir + 'Output/NER/ner-spacyCustom-training-model')

## Parse XML for Encoded Entities

#### Define functions and declare variables.

In [2]:
%%time

# Declare regex to simplify file paths below
regex = re.compile(r'.*/(.*).xml')

# Get plain text of every element (designated by first argument).
def get_textContent(ancestor, xpath_as_string, namespace):
    text_list = []
    for elem in ancestor.findall(xpath_as_string, namespace):
        text = ''.join(ET.tostring(elem, encoding='unicode', method='text'))

#         Add text (cleaned of additional whitespace) to text_list.
        text_list.append(re.sub(r'\s+', ' ', text))

#     Return concetanate text list.
    return ' '.join(text_list)

CPU times: user 88 µs, sys: 2 µs, total: 90 µs
Wall time: 90.8 µs


#### Choose dataset ('all' or 'testing') and run functions.

In [3]:
%%time

# Choose either all .xml files or training set by select dataset = 'all' or 'training'.
# Selection will parse the XML for different elements.

# dataset = 'all'
dataset = 'testing'


# Conditionally choose directory and create dataframe.
if dataset == 'all':
    # Gather all .xml files using glob.
    list_of_files = glob.glob(abs_dir + "Data/JQA/*/*.xml")
    
    # Create dataframe to store results.
    entities = pd.DataFrame(columns = ['file', 'entry', 'text',
                                       'element', 'refKey', 'entity'])

elif dataset == "testing":
    # Or, use training document(s) alone.
#     list_of_files = glob.glob(abs_dir + "Data/TestEncoding/TestingData/*.xml")
    list_of_files = glob.glob(abs_dir + "Data/TestEncoding/TestingData/*.xml")
    
    # Create dataframe to store results.
    entities = pd.DataFrame(columns = ['file', 'entry', 'text',
                                       'element', 'entity'])

else:
    print ('Dataset not found.')

    
# Loop through each file within a directory.
for file in list_of_files:
    tree = ET.parse(file)
    root = tree.getroot()
    namespace = re.match(r"{(.*)}", str(root.tag))
    ns = {"ns":namespace.group(1)}
    reFile = str(regex.match(file).group(1))
    
    for eachDoc in root.findall('.//ns:div/[@type="entry"]', ns):
        entry = eachDoc.get('{http://www.w3.org/XML/1998/namespace}id')
        text = get_textContent(eachDoc, './ns:div/[@type="docbody"]/ns:p', ns)
        
        if dataset == 'all':
            for elem in eachDoc.findall('.//ns:p//ns:persRef/[@ref]', ns):
                name = elem.text
                try:
                    entity = re.sub(r'\s+', ' ', name)
                except TypeError:
                    entity = name

                entities = entities.append({'file':reFile,
                                'entry':entry,
                                'text':text,
                                'element':re.sub(r'.*}(.*)', '\\1', elem.tag),
                                'refKey':elem.get('ref'),
                                'entity':entity},
                               ignore_index = True)

        elif dataset == 'testing':
            for xpath in ['.//ns:p//ns:persName', './/ns:p//ns:placeName']:
                for elem in eachDoc.findall(xpath, ns):
                    name = elem.text
                    try:
                        entity = re.sub(r'\s+', ' ', name)
                    except TypeError:
                        entity = name

                    entities = entities.append({'file':reFile,
                                    'entry':entry,
                                    'text':text,
                                    'element':re.sub(r'.*}(.*)', '\\1', elem.tag),
                                    'entity':entity},
                                   ignore_index = True)

            
        else:
            print ('Selected dataset not found.')
        
                
entities.head()

CPU times: user 1.22 s, sys: 10.4 ms, total: 1.23 s
Wall time: 1.24 s


Unnamed: 0,file,entry,text,element,entity
0,TestData_JQADiaries-v23-1825-01-p403,jqadiaries-v23-1825-01-02,B 2 VII:15. Heard Lynd at the Capitol. At Bake...,persName,Lynd
1,TestData_JQADiaries-v23-1825-01-p403,jqadiaries-v23-1825-01-02,B 2 VII:15. Heard Lynd at the Capitol. At Bake...,persName,Baker’s
2,TestData_JQADiaries-v23-1825-01-p403,jqadiaries-v23-1825-01-02,B 2 VII:15. Heard Lynd at the Capitol. At Bake...,persName,Matthew
3,TestData_JQADiaries-v23-1825-01-p403,jqadiaries-v23-1825-01-02,B 2 VII:15. Heard Lynd at the Capitol. At Bake...,persName,Garnett
4,TestData_JQADiaries-v23-1825-01-p403,jqadiaries-v23-1825-01-02,B 2 VII:15. Heard Lynd at the Capitol. At Bake...,persName,G. Hay


## Declare Functions to Find Entities

In [4]:
%%time

# NLTK
def get_nltk_entities(text, label_list):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    prev = None
    continuous_chunk = []
    current_chunk = []

    for subtree in chunked:
        if type(subtree) == Tree and subtree.label() in label_list:
            current_chunk.append(" ".join([token for token, pos in subtree.leaves()]))
        if current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append((named_entity, subtree.label()))
                current_chunk = []
        else:
            continue

    return continuous_chunk

# spaCy
def get_spacy_entities(text, label_list):
    sp_entities_l = []
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ in label_list:
            sp_entities_l.append((str(ent), ent.label_))
        else:
            pass
    return sp_entities_l

# CREATE CUSTOM SPACY FUNCTION
def get_custom_spacy_entities(text, label_list):
    sp_entities_l = []
    doc = jqa_spacy(text)
    for ent in doc.ents:
        if ent.label_ in label_list:
            sp_entities_l.append((str(ent), ent.label_))
        else:
            pass
    return sp_entities_l


# Stanford (return full-PERSON name as single string)
# From alvas, StackOverflow
def chunk_stanfordNER(ner_output, label_list):
    chunked, pos = [], ""
    for i, word_pos in enumerate(ner_output):
        word, pos = word_pos
        if pos in label_list and pos == prev_tag:
            chunked[-1]+=word_pos
        else:
            chunked.append(word_pos)
        prev_tag = pos

    clean_chunked = [tuple([" ".join(wordpos[::2]),
                            wordpos[-1]]) if len(wordpos)!=2 else wordpos for wordpos in chunked]
    chunks_subset = [tup for tup in clean_chunked if tup[1] in label_list]
    
    return chunks_subset

# Stanford
def get_stanford_entities(text, label_list):
    tokenized_text = word_tokenize(text)
    tagged_text = tagger.tag(tokenized_text)
    chunked_entities = chunk_stanfordNER(tagged_text, label_list)

    return chunked_entities

# Custom Stanford
def get_custom_stanford_NER(text, label_list):
    tokenized_text = word_tokenize(text)
    tagged_text = custom_ner_tagger.tag(tokenized_text)
    chunked_entities = chunk_stanfordNER(tagged_text, label_list)
    
    return chunked_entities

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 4.77 µs


## Find Entities

In [5]:
%%time

# Add or substract labels to list for NER to find.
label_list = ['PERSON']

entities_sub = entities[['entry', 'text']].drop_duplicates()

# Apply function to find NLTK-based entities.
entities_sub['nltk'] = entities_sub.apply(lambda row: get_nltk_entities(row['text'],
                                                                        label_list),
                                          axis = 1)
print ("NLTK entities found.")

# Apply function to find spaCy-based entities.
entities_sub['spacy'] = entities_sub.apply(lambda row: get_spacy_entities(row['text'],
                                                                          label_list),
                                           axis = 1)
print ("Spacy entities found.")


# Apply function to find custom spaCy-based entities.
entities_sub['custom_spacy'] = entities_sub \
    .apply(lambda row: get_custom_spacy_entities(row['text'], label_list),
           axis = 1)

print ("Custom spaCy entities found.")

# Apply function to find Stanford-based entities.
entities_sub['stanford'] = entities_sub.apply(lambda row: get_stanford_entities(row['text'],
                                                                                label_list),
                                           axis = 1)
print ("Stanford entities found.")


# Apply function to find Stanford-based entities.
entities_sub['custom_stanford'] = entities_sub \
    .apply(lambda row: get_custom_stanford_NER(row['text'], label_list),
           axis = 1)

print ("Custom stanford entities found.")


# Merge found entities with metadata & tidy up data.
if dataset == "all":
    entities = pd.merge(entities[['file', 'entry', 'element', 'refKey', 'entity']],
                    entities_sub[['entry', 'nltk', 'spacy', 'custom_spacy',
                                  'stanford', 'custom_stanford']],
                    on = 'entry', how = 'left')
    
    entities = pd.melt(entities, id_vars = ['file', 'entry', 'element', 'refKey', 'entity'],
                       value_vars = ['nltk', 'spacy', 'custom_spacy',
                                     'stanford', 'custom_stanford'],
                       var_name = 'model', value_name = 'found_entities')
elif dataset == "testing":
    entities = pd.merge(entities[['file', 'entry', 'element', 'entity']],
                    entities_sub[['entry', 'nltk', 'spacy', 'custom_spacy',
                                  'stanford', 'custom_stanford']],
                    on = 'entry', how = 'left')
    
    entities = pd.melt(entities, id_vars = ['file', 'entry', 'element', 'entity'],
                       value_vars = ['nltk', 'spacy', 'custom_spacy',
                                     'stanford', 'custom_stanford'],
                       var_name = 'model', value_name = 'found_entities')
else:
    "Dataset not found."


entities.head()

NLTK entities found.
Spacy entities found.
Custom spaCy entities found.
Stanford entities found.
Custom stanford entities found.
CPU times: user 2.35 s, sys: 2.92 s, total: 5.27 s
Wall time: 1min 42s


Unnamed: 0,file,entry,element,entity,model,found_entities
0,TestData_JQADiaries-v23-1825-01-p403,jqadiaries-v23-1825-01-02,persName,Lynd,nltk,"[(Heard Lynd, PERSON), (Garnett, PERSON)]"
1,TestData_JQADiaries-v23-1825-01-p403,jqadiaries-v23-1825-01-02,persName,Baker’s,nltk,"[(Heard Lynd, PERSON), (Garnett, PERSON)]"
2,TestData_JQADiaries-v23-1825-01-p403,jqadiaries-v23-1825-01-02,persName,Matthew,nltk,"[(Heard Lynd, PERSON), (Garnett, PERSON)]"
3,TestData_JQADiaries-v23-1825-01-p403,jqadiaries-v23-1825-01-02,persName,Garnett,nltk,"[(Heard Lynd, PERSON), (Garnett, PERSON)]"
4,TestData_JQADiaries-v23-1825-01-p403,jqadiaries-v23-1825-01-02,persName,G. Hay,nltk,"[(Heard Lynd, PERSON), (Garnett, PERSON)]"


## Fuzzy Match Entities

Fuzzy Matching has four modes of calculating likeness.

1. ratio
2. partial_ratio
3. token_sort_ratio
4. token_set_ratio

In [6]:
%%time

# Unnest found_entities.
entities = entities.explode('found_entities')

# Un-tuple 'found_entities'.
entities[['found_entity', 'entity_label']] = pd.DataFrame(entities['found_entities'] \
                                                          .values.tolist(),
                                                     index = entities.index)

# Determine quality of fuzzy match.
entities['confidence'] = entities \
    .apply(lambda row: fuzz.token_sort_ratio(row['entity'], row['found_entity']),
           axis = 1)

# Define function to return the dataframe row with the highest value in a group.
def grp_func(group, column):
    return group.loc[group[column] == group[column].max()]


# Select ner_match with highest confidence score for model, entity in each entry.
entities = entities.groupby(['entry', 'entity', 'model'], as_index = False) \
    .apply(grp_func, 'confidence').reset_index(drop = True)

entities.head()

CPU times: user 2.64 s, sys: 30 ms, total: 2.67 s
Wall time: 2.69 s


Unnamed: 0,file,entry,element,entity,model,found_entities,found_entity,entity_label,confidence
0,TestData_JQADiaries-v23-1825-01-p403,jqadiaries-v23-1825-01-02,persName,Baker’s,custom_spacy,,,,20
1,TestData_JQADiaries-v23-1825-01-p403,jqadiaries-v23-1825-01-02,persName,Baker’s,custom_stanford,,,,20
2,TestData_JQADiaries-v23-1825-01-p403,jqadiaries-v23-1825-01-02,persName,Baker’s,nltk,"(Heard Lynd, PERSON)",Heard Lynd,PERSON,35
3,TestData_JQADiaries-v23-1825-01-p403,jqadiaries-v23-1825-01-02,persName,Baker’s,spacy,"(Baker, PERSON)",Baker,PERSON,83
4,TestData_JQADiaries-v23-1825-01-p403,jqadiaries-v23-1825-01-02,persName,Baker’s,stanford,"(Baker, PERSON)",Baker,PERSON,83


## Write File for Model Comparisons

The final dataset compares every found entity (within an entry and model) and selects the best match for the encoded entity.

In some cases, the best match is actually not a match (e.g. NLTK considers "La Forêt" the closest match to "A. Gallatin," presumably because it doesn't find a "Gallatin"). Further filtering and adjustments will happen in JQA_NER-ModelsEvaluator because it's easier to remove data than re-run programs to collect more.

In [7]:
%%time

entities.to_csv(abs_dir + 'Output/NER/ner-model-comparisons_' + str(dataset) + '.csv',
                sep = ',', index = False)

CPU times: user 20.4 ms, sys: 2.26 ms, total: 22.7 ms
Wall time: 22.6 ms


## Notes

* Additional found entities would appear as found entities NOT in list of (element) content
* Rather than rely on encoded data, NER could refer to names authority
    * Each found entity would be compared to entire list of names authority and any matches would be assumed to be an entity
    
## Examine Matched & Unmatched Entities

In [8]:
# %%time

# Identify matched elements & entities.
matches = entities.query('confidence >= 55')

# # unmatched elements == elements without a found_entity > 70
# Subset elements that are not matched.
unmatched_elems = entities[~entities.isin(matches)].dropna()


# # unmatched found_entities == found_entities without an element > 70

unmatched_elems

Unnamed: 0,file,entry,element,entity,model,found_entities,found_entity,entity_label,confidence
2,TestData_JQADiaries-v23-1825-01-p403,jqadiaries-v23-1825-01-02,persName,Baker’s,nltk,"(Heard Lynd, PERSON)",Heard Lynd,PERSON,35.0
7,TestData_JQADiaries-v23-1825-01-p403,jqadiaries-v23-1825-01-02,persName,G. Hay,nltk,"(Heard Lynd, PERSON)",Heard Lynd,PERSON,40.0
22,TestData_JQADiaries-v23-1825-01-p403,jqadiaries-v23-1825-01-02,persName,Matthew,nltk,"(Garnett, PERSON)",Garnett,PERSON,43.0
23,TestData_JQADiaries-v23-1825-01-p403,jqadiaries-v23-1825-01-02,persName,Matthew,spacy,"(Garnett, PERSON)",Garnett,PERSON,43.0
27,TestData_JQADiaries-v23-1825-01-p403,jqadiaries-v23-1825-01-02,persName,Tracy,nltk,"(Heard Lynd, PERSON)",Heard Lynd,PERSON,27.0
...,...,...,...,...,...,...,...,...,...
3755,TestData_JQADiaries-v33-1821-12-p001,jqadiaries-v33-1821-12-31,placeName,Vera-Cruz,nltk,"(Morel, PERSON)",Morel,PERSON,29.0
3756,TestData_JQADiaries-v33-1821-12-p001,jqadiaries-v33-1821-12-31,placeName,Vera-Cruz,nltk,"(Eaton, PERSON)",Eaton,PERSON,29.0
3757,TestData_JQADiaries-v33-1821-12-p001,jqadiaries-v33-1821-12-31,placeName,Vera-Cruz,spacy,"(Russell, PERSON)",Russell,PERSON,38.0
3758,TestData_JQADiaries-v33-1821-12-p001,jqadiaries-v33-1821-12-31,placeName,Vera-Cruz,stanford,"(Morel, PERSON)",Morel,PERSON,29.0


In [9]:

# entities.query('entry == "jqadiaries-v23-1821-05-02"')

entities.query('45 <= confidence <= 49')

Unnamed: 0,file,entry,element,entity,model,found_entities,found_entity,entity_label,confidence
137,TestData_JQADiaries-v23-1825-01-p403,jqadiaries-v23-1825-01-08,persName,General Brown’s,nltk,"(Baron Tuyll, PERSON)",Baron Tuyll,PERSON,46
152,TestData_JQADiaries-v23-1825-01-p403,jqadiaries-v23-1825-01-09,persName,Little,nltk,"(Heard Little Eccles, PERSON)",Heard Little Eccles,PERSON,48
227,TestData_JQADiaries-v23-1825-01-p403,jqadiaries-v23-1825-01-13,persName,Wingate,nltk,"(Met R. King, PERSON)",Met R. King,PERSON,47
228,TestData_JQADiaries-v23-1825-01-p403,jqadiaries-v23-1825-01-13,persName,Wingate,spacy,"(Met R. King, PERSON)",Met R. King,PERSON,47
229,TestData_JQADiaries-v23-1825-01-p403,jqadiaries-v23-1825-01-13,persName,Wingate,stanford,"(R. King, PERSON)",R. King,PERSON,46
...,...,...,...,...,...,...,...,...,...
3655,TestData_JQADiaries-v33-1821-12-p001,jqadiaries-v33-1821-12-31,placeName,Connecticut,nltk,"(Consul, PERSON)",Consul,PERSON,47
3726,TestData_JQADiaries-v33-1821-12-p001,jqadiaries-v33-1821-12-31,persName,R. Ingersoll,stanford,"(R. M. Johnson, PERSON)",R. M. Johnson,PERSON,45
3740,TestData_JQADiaries-v33-1821-12-p001,jqadiaries-v33-1821-12-31,persName,Spanish Minister Anduaga,nltk,"(Anduaga, PERSON)",Anduaga,PERSON,45
3741,TestData_JQADiaries-v33-1821-12-p001,jqadiaries-v33-1821-12-31,persName,Spanish Minister Anduaga,spacy,"(Anduaga, PERSON)",Anduaga,PERSON,45
