# Training Custom spaCy NER Models

#### Steps

1. Import parsed plain XML as dataframe.
2. "Senticize" and tokenize texts.
3. Count distance from beginning of sentence (in tokens).


#### Sources

Christina, "[Named Entity Recognition in Python with Stanford-NER and Spacy](https://lvngd.com/blog/named-entity-recognition-in-python-with-stanford-ner-and-spacy/)," <i>LVNG</i>, Accessed 10/26/2020.

Nishanth, N. "[Training Custom NER](https://towardsdatascience.com/train-ner-with-custom-training-data-using-spacy-525ce748fab7)," <i>towards data science</i>, Accessed 10/26/2020.

In [1]:
# Import necessary libraries.
import re, glob, random, csv, sys, os, warnings
import pandas as pd
import xml.etree.ElementTree as ET
import spacy

# Import spaCy language model.
nlp = spacy.load('en_core_web_sm', disable = ['ner', 'parser', 'tagger'])
nlp.add_pipe(nlp.create_pipe('sentencizer'))

# Ignore warnings related to deprecated functions.
warnings.simplefilter("ignore")

# Declare directory location to shorten filepaths later.
abs_dir = "/Users/quinn.wi/Documents/SemanticData/"

## 1. Parse XML

In [2]:
%%time

# Declare regex to simplify file paths below
regex = re.compile(r'.*/(.*).xml')

# Get plain text of every element (designated by first argument).
def get_textContent(ancestor, xpath_as_string, namespace):
    text_list = []
    for elem in ancestor.findall(xpath_as_string, namespace):
        text = ''.join(ET.tostring(elem, encoding='unicode', method='text'))

#         Add text (cleaned of additional whitespace) to text_list.
        text_list.append(re.sub(r'\s+', ' ', text))

#     Return concetanate text list.
    return ' '.join(text_list)


# Choose either all .xml files or training set by select dataset = 'all' or 'training'.
# Selection will parse the XML for different elements.

# dataset = 'all'
dataset = 'training'

# Conditionally choose directory and create dataframe.
if dataset == 'all':
    # Gather all .xml files using glob.
    list_of_files = glob.glob(abs_dir + "Data/JQA/*/*.xml")
    
    # Create dataframe to store results.
    data = pd.DataFrame(columns = ['file', 'entry', 'text',
                                       'element', 'refKey', 'entity'])

elif dataset == "training":
    # Or, use training document(s) alone.
    list_of_files = glob.glob(abs_dir + "Data/TestEncoding/TrainingData/*.xml")
    
    # Create dataframe to store results.
    data = pd.DataFrame(columns = ['file', 'entry', 'text',
                                       'element', 'entity'])

else:
    print ('Dataset not found.')

    
# Loop through each file within a directory.
for file in list_of_files:
    tree = ET.parse(file)
    root = tree.getroot()
    namespace = re.match(r"{(.*)}", str(root.tag))
    ns = {"ns":namespace.group(1)}
    reFile = str(regex.match(file).group(1))
    
    for eachDoc in root.findall('.//ns:div/[@type="entry"]', ns):
        entry = eachDoc.get('{http://www.w3.org/XML/1998/namespace}id')
        text = get_textContent(eachDoc, './ns:div/[@type="docbody"]/ns:p', ns)
        
        if dataset == 'all':
            for elem in eachDoc.findall('.//ns:p/ns:persRef/[@ref]', ns):
                name = elem.text
                try:
                    entity = re.sub(r'\s+', ' ', name)
                except TypeError:
                    entity = name

                data = data.append({'file':reFile,
                                'entry':entry,
                                'text':text,
                                'element':re.sub(r'.*}(.*)', '\\1', elem.tag),
                                'refKey':elem.get('ref'),
                                'entity':entity},
                               ignore_index = True)

        elif dataset == 'training':
            for xpath in ['.//ns:p//ns:persName', './/ns:p//ns:placeName']:
                for elem in eachDoc.findall(xpath, ns):
                    name = elem.text
                    try:
                        entity = re.sub(r'\s+', ' ', name)
                    except TypeError:
                        entity = name

                    data = data.append({'file':reFile,
                                    'entry':entry,
                                    'text':text,
                                    'element':re.sub(r'.*}(.*)', '\\1', elem.tag),
                                    'entity':entity},
                                   ignore_index = True)

            
        else:
            print ('Selected dataset not found.')


# Create a dictionary to change element tags to NER labels.
element_ner_dictionary = {'persName':'PERSON', 'placeName':'LOC'}

# Change elements to NER labels.
labels_for_sentences = (data['element'].map(element_ner_dictionary))

# Attach labels as a column.
data['label'] = labels_for_sentences
                
data.head()

CPU times: user 249 ms, sys: 2.72 ms, total: 252 ms
Wall time: 255 ms


Unnamed: 0,file,entry,text,element,entity,label
0,TrainCopy_JQADiaries-v23-1821-05-p359,jqadiaries-v23-1821-05-01,1 V:15. Tuesday. W. A. Schoolfield at the Offi...,persName,W. A. Schoolfield,PERSON
1,TrainCopy_JQADiaries-v23-1821-05-p359,jqadiaries-v23-1821-05-02,2. VI: Mrs. Adams unwell. Despatches to A. Gal...,persName,Adams,PERSON
2,TrainCopy_JQADiaries-v23-1821-05-p359,jqadiaries-v23-1821-05-02,2. VI: Mrs. Adams unwell. Despatches to A. Gal...,persName,A. Gallatin,PERSON
3,TrainCopy_JQADiaries-v23-1821-05-p359,jqadiaries-v23-1821-05-02,2. VI: Mrs. Adams unwell. Despatches to A. Gal...,persName,La Forêt,PERSON
4,TrainCopy_JQADiaries-v23-1821-05-p359,jqadiaries-v23-1821-05-02,2. VI: Mrs. Adams unwell. Despatches to A. Gal...,persName,Canning,PERSON


## 2. "Senticize" and Tokenize Texts

In [3]:
%%time

# Parse plain text for sentences.
data['sents'] = data['text'].apply(lambda x: list(nlp(x).sents))

# Unnest list of sentences.
sentences = data.explode('sents')

# Subset dataframe by columns. (Unless way to automate, leave out 'element', 'entity')
sentences = sentences[['file', 'entry', 'element', 'entity', 'label', 'sents']]

# Join nlp.sents into strings.
sentences['sents'] = sentences['sents'].apply(lambda x: ' '.join(i.text for i in x))

# Replace m-dash with space for tokenizing words.
sentences['sents'] = sentences['sents'].str.replace(r'—', ' ', regex = True)

# Decalare function to tokenize sentences.
# Then, return entities and their start and end position within a string.
def tokenize_sents(sentence_column, entity_column, label_column):
    token_l = []
    sent = sentence_column
    
#     Replace whitespace with underscore in entity strings so they're considered continuous.
    if entity_column in sentence_column:
        re_entity = re.sub('\s', '_', entity_column)
        sent = re.sub(entity, re_entity, sent)
        
#     Find string positions of each word.
    for w in sent.split(' '):
        token_start = sent.find(w)
        token_end = token_start + len(w)
        
#         If condition to filter out words that are not entities.
        if re.sub('_', ' ', w) == entity_column:
            token_l.append((token_start, token_end, label_column))
        else:
            pass
        
    return token_l
        

# Tokenize sentences with string position (beginning and end).
sentences['tokens'] = sentences \
    .apply(lambda row: tokenize_sents(row['sents'], row['entity'], row['label']),
           axis = 1)

# Unnest 'tokens' columns so only tuples remain.
sentences = sentences.explode('tokens')

# Drop null values (rows without an entity).
sentences = sentences.dropna()

# For each sentences, gather 'tokens' tuples into single list.
sentences = sentences.groupby(['file', 'entry', 'sents'])['tokens'].apply(list) \
    .reset_index(name = 'entities')


sentences.head()
# sentences.query('sents == "Mrs. Adams to Alexandria ."')
sentences.query('entry == "jqadiaries-v23-1821-05-28"')

CPU times: user 104 ms, sys: 5.23 ms, total: 109 ms
Wall time: 113 ms


Unnamed: 0,file,entry,sents,entities
42,TrainCopy_JQADiaries-v23-1821-05-p359,jqadiaries-v23-1821-05-28,Bailey here .,"[(0, 6, PERSON)]"
43,TrainCopy_JQADiaries-v23-1821-05-p359,jqadiaries-v23-1821-05-28,Mrs. Adams to Alexandria .,"[(5, 10, PERSON), (14, 24, LOC)]"
44,TrainCopy_JQADiaries-v23-1821-05-p359,jqadiaries-v23-1821-05-28,P.U.S. to Loudoun .,"[(10, 17, LOC)]"
45,TrainCopy_JQADiaries-v23-1821-05-p359,jqadiaries-v23-1821-05-28,"S. Thompson , Parish , Calhoun at Office .","[(14, 20, PERSON), (23, 30, PERSON)]"


## Reshape Data for spaCy NER


#### Data Sample for Custom NER (spaCy)

```json
TRAIN_DATA = [
    ('Who is Nishanth?', {
        'entities': [(7, 15, 'PERSON')]
    }),
     ('Who is Kamal Khumar?', {
        'entities': [(7, 19, 'PERSON')]
    }),
    ('I like London and Berlin.', {
        'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]
    })
]
```

In [4]:
%%time

# Gather tuples for each sentence into single list.
training_data = sentences[['sents', 'entities']]

# Convert dataframe to json format.
TRAIN_DATA = []

for index, row in training_data.iterrows():
    sentence_data = (row['sents'], {'entities': row['entities']})
    TRAIN_DATA.append(sentence_data)
    
TRAIN_DATA

CPU times: user 6.08 ms, sys: 30 µs, total: 6.11 ms
Wall time: 6.12 ms


[('Notes to Canning & c.', {'entities': [(9, 16, 'PERSON')]}),
 ('VI : Mrs. Adams unwell .', {'entities': [(10, 15, 'PERSON')]}),
 ('Mrs. A. to Alexandria .',
  {'entities': [(5, 7, 'PERSON'), (11, 21, 'LOC')]}),
 ('Philip went away .', {'entities': [(0, 6, 'PERSON')]}),
 ('E. Wyer at the Office with his Report   P. Lanman , and E. Patterson   Letter to Carysfort .',
  {'entities': [(81, 90, 'LOC')]}),
 ('Poletica , Wyer , Rodgers at the Office .',
  {'entities': [(0, 8, 'PERSON'), (11, 15, 'PERSON'), (18, 25, 'PERSON')]}),
 ('Heard Little , at the Bath Room .', {'entities': [(6, 12, 'PERSON')]}),
 ('Hyde here   Seamen from Martinique .', {'entities': [(0, 4, 'PERSON')]}),
 ('7 VI : Jacob Adams , Wyer , Connell at the Office .',
  {'entities': [(21, 25, 'PERSON'), (28, 35, 'PERSON')]}),
 ('Dr Tucker .', {'entities': [(3, 9, 'PERSON')]}),
 ('9 VI : S. Kean here   Connell at the Office .',
  {'entities': [(22, 29, 'PERSON')]}),
 ('Draft of Note to Hyde .', {'entities': [(17, 21, 'PERSON'

## Save spaCy Model

In [5]:
%%time

nlp = spacy.blank("en")

optimizer = nlp.begin_training()

for i in range(20):
    random.shuffle(TRAIN_DATA)
    for text, annotations in TRAIN_DATA:
        nlp.update([text], [annotations], sgd=optimizer)
        
nlp.to_disk(abs_dir + 'Output/NER/ner-spacyCustom-' + str(dataset) + '-model')

CPU times: user 300 ms, sys: 5.6 ms, total: 306 ms
Wall time: 308 ms
