# Operations to Sync Up XML & NER

These functions will make encoding suggestions and, if accepted, will completely re-write XML documents. Only run after version control or backup.

* make_ner_suggestions(): ensure well-formed

With suggested encoding as is ("exploded" entities), writing will after to be recursive...
1. Previous encoding becomes new encoding with first change.
2. New encoding (1 change) becomes new encoding with second change.
3. New encoding (n changes) becomes new encoding with n+1 change.
    
The most recent change becomes the working text.

In [1]:
import warnings, re, glob, datetime, csv, sys, os, base64, io
import spacy
import pandas as pd
import numpy as np
from lxml import etree

# Import spaCy language model.
nlp = spacy.load('en_core_web_sm')

# Ignore simple warnings.
warnings.simplefilter('ignore', DeprecationWarning)

# Declare directory location to shorten filepaths later.
abs_dir = "/Users/quinn.wi/Documents/SemanticData/"

## Declare Functions

#### Functions for Suggesting New Encoding

In [2]:
%%time

"""
XML Parsing Function: Get Namespaces
"""
def get_namespace(root):
    namespace = re.match(r"{(.*)}", str(root.tag))
    ns = {"ns":namespace.group(1)}
    return ns


"""
XML Parsing Function: Retrieve XPaths
"""
def get_abridged_xpath(elem):
    while elem.getparent().get('{http://www.w3.org/XML/1998/namespace}id') is None:
        elem = elem.getparent()
        
        if elem.getparent().get('{http://www.w3.org/XML/1998/namespace}id') is not None:    
            ancestor = elem.getparent().tag
            xml_id = elem.getparent().get('{http://www.w3.org/XML/1998/namespace}id')
            
            abridged_xpath = f'.//ns:body//{ancestor}[@xml:id="{xml_id}"]'
            return abridged_xpath
        

"""
XML Parsing Function: Convert to String
"""
def get_text(elem):
    text_list = []
    text = ''.join(etree.tostring(elem, encoding='unicode', method='text', with_tail=False))
    text_list.append(re.sub(r'\s+', ' ', text))
    return ' '.join(text_list)

        
"""
XML Parsing Function: Get Encoded Content
"""    
def get_encoding(elem):
    encoding = etree.tostring(elem, pretty_print = True).decode('utf-8')
    encoding = re.sub('\s+', ' ', encoding) # remove additional whitespace
    return encoding


"""
XML Parsing Function: Intersperse Entity with Likely TEI Information for Capacious Regex
"""
def intersperse(lst, item):
    result = [item] * (len(lst) * 2 - 0)
    result[0::2] = lst
    return result


"""
XML Parsing Function: Write New Encoding
"""
def make_ner_suggestions(previous_encoding, entities, label_dict):
    previous_encoding = re.sub('\s+', ' ', previous_encoding, re.MULTILINE)
    entity = entities[0]
    label = label_dict[entities[1]]
    
    try:
    #     Create regex that anticipates additional encoding anywhere in tag content.
    #     Break up entity by character to intersperse possible TEI interruptions.
        expanded_entity = [c for c in entity]
        expanded_regex = '[' + "|".join(['<.*>', '</.*>', '\s*']) + ']*'

    #     Intersperse possible encoding within entity.
        expanded_regex =  r''.join(intersperse(expanded_entity, expanded_regex))
        match = re.search(expanded_regex, previous_encoding, re.VERBOSE|re.DOTALL)

    #     If expanded regex is in previous encoding, find & replace it with new encoding.
        if match:
            new_encoding = re.sub(f'{match.group(0)}',
                                  f'<{label}>{match.group(0)}</{label}>',
                                  previous_encoding)

            return new_encoding # Check if encoding is well formed?


        else:
            pass
    
    except:
        return 'Error Occurred with Regex.'
        
        

"""
NER Function
"""
# spaCy
def get_spacy_entities(text, label_dict):
    sp_entities_l = []
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ in label_dict.keys():
            sp_entities_l.append((str(ent), ent.label_))
        else:
            pass
    return sp_entities_l


"""
XML & NER: Retrieve Contents
"""
def get_contents(ancestor, xpath_as_string, namespace):
    
    textContent = get_text(ancestor) # Get plain text.
    encodedContent = get_encoding(ancestor) # Get encoded content.
    sp_entities_l = get_spacy_entities(textContent, label_dict) # Get named entities from plain text.
    
    return (sp_entities_l, encodedContent)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 5.96 µs


#### Functions for Re-Writing XML with Accepted Changes

In [3]:
%%time

"""
XML Parsing Function: Suggest New Encoding with Hand Edits

Similar to make_ner_suggestions(), this function folds in revision using regular expressions.
The outcome is the previous encoding with additional encoded information determined by user input.

Expected Columns:
    previous_encoding
    entities
    accept_change
    make_hand_edits
    add_unique_identifier
"""
def revise_with_hand_edits(label_dict, row):
    previous_encoding = re.sub('\s+', ' ', row['previous_encoding'], re.MULTILINE)
    
#     Using re.sub because pandas is converting tuple to string.
    entity = re.sub("\(\'(.*)\',\'(.*)'\)", '\g<1>', row['entities'])
    label = label_dict[re.sub("\(\'(.*)\',\'(.*)'\)", '\g<2>', row['entities'])]

#     Create regex that anticipates additional encoding anywhere in tag content.
#     Break up entity by character to intersperse possible TEI interruptions.
    expanded_entity = [c for c in entity]
    expanded_regex = '[' + "|".join(['<.*>', '</.*>', '\s*']) + ']*'

#     Intersperse possible encoding within entity.
#     row['previous_encoding'] requires [0] to grab contents.
    expanded_regex =  r''.join(intersperse(expanded_entity, expanded_regex))
    match = re.search(expanded_regex, previous_encoding, re.VERBOSE|re.DOTALL)
    
#     If expanded regex is in previous encoding, find & replace it with new encoding.
    if match != None:

#             If there is a unique id to add & hand edits...
        if row['add_unique_identifier'] != '' and row['make_hand_edits'] != '':
            identifier_regex = re.search('(<.+)>.+</.+>', row['make_hand_edits'], re.VERBOSE|re.DOTALL)
            new_edit = identifier_regex.group(1) + 'xml:id="{}"'.format(row['add_unique_identifier'])

            new_match = re.sub(f'{identifier_regex.group(1)}',
                               f'{new_edit}',
                               row['make_hand_edits'])
            
            revised_encoding = re.sub(f'{match.group(0)}',
                          new_match + ' ',
                          row['previous_encoding'])

    #             Clean up any additional whitespace.
            revised_encoding = re.sub('\s+', ' ', revised_encoding, re.MULTILINE)

            return revised_encoding # Check if encoding is well formed?



#             If there are ONLY unique ids to add an NO hand edits...
        elif row['add_unique_identifier'] != '' and row['make_hand_edits'] == '':
            identifier_regex = re.search('(<.+)>.+</.+>', match.group(0), re.VERBOSE|re.DOTALL)
            new_edit = identifier_regex.group(1) + 'xml:id="{}"'.format(row['add_unique_identifier'])

            new_match = re.sub(f'{identifier_regex.group(1)}',
                               f'{new_edit}',
                               identifier_regex.group(0))
            
            revised_encoding = re.sub(f'{match.group(0)}',
                                      new_match + ' ',
                                      row['previous_encoding'])

    #             Clean up any additional whitespace.
            revised_encoding = re.sub('\s+', ' ', revised_encoding, re.MULTILINE)

            return revised_encoding # Check if encoding is well formed?
    
        else:
            pass

    else:
        pass


"""
XML & NER: Update/Inherit Accepted Changes
Expects a dataframe (from a .csv) with these columns:
    file
    abridged_xpath
    previous_encoding
    entities
    new_encoding
    accept_change
    make_hand_edits
    add_unique_identifier
"""
def inherit_changes(label_dict, dataframe):
    for index, row in dataframe.iterrows():
        
#         If HAND changes are accepted...
        if row['accept_change'] == 'y' and (row['make_hand_edits'] != '' or row['add_unique_identifier'] != ''):
        
            revised_by_hand = revise_with_hand_edits(label_dict, row)
            dataframe.loc[index, 'new_encoding'] = revised_by_hand
            
            try:
                if dataframe.loc[index + 1, 'abridged_xpath'] == row['abridged_xpath']:
                    dataframe.loc[index + 1, 'previous_encoding'] = row['new_encoding']
            except KeyError as e:
                dataframe.loc[index, 'new_encoding'] = revised_by_hand
        
#         If NER suggestions are accepted as-is...
        elif row['accept_change'] == 'y' and row['make_hand_edits'] == '' and row['add_unique_identifier'] == '':
        
            try:
                if dataframe.loc[index + 1, 'abridged_xpath'] == row['abridged_xpath']:
                    dataframe.loc[index + 1, 'previous_encoding'] = row['new_encoding']
            except KeyError as e:
                dataframe.loc[index, 'new_encoding'] = row['new_encoding']
                
#         If changes are rejected...
        else:
            try:
                if dataframe.loc[index + 1, 'abridged_xpath'] == row['abridged_xpath']:
                    dataframe.loc[index + 1, 'previous_encoding'] = dataframe.loc[index, 'previous_encoding']
            except KeyError as e:
                dataframe.loc[index, 'new_encoding'] = dataframe.loc[index, 'previous_encoding']

        
    dataframe = dataframe.groupby('abridged_xpath').tail(1)
    
    return dataframe


"""
XML & NER: Write New XML File with Accepted Revisions
Expects:
    XML File with Original Encoding
    CSV File with Accepted Changes
    Label Dictionary
"""
def revise_xml(xml_in, csv_df, label_dict):
    with open(input_filename, 'r') as xml_in: #, open(output_filename, 'wb') as xml_out:
    #     First, update data to reflect accepted changes.
        new_data = inherit_changes(label_dict, csv_df)

        tree = etree.parse(xml_in)
        root = tree.getroot()
        ns = get_namespace(root)

        tree_as_string = etree.tostring(tree, pretty_print = True).decode('utf-8')
        tree_as_string = re.sub('\s+', ' ', tree_as_string) # remove additional whitespace

    #     Declare accepted encoding to be written.
    #     For each entry in file...
        for child in root.findall('.//ns:p', ns):

    #         Store original encoding.
            original_encoding_as_string = get_encoding(child)
    #         Removing namespace information embedded in <p> tags.
            original_encoding_as_string = re.sub('(<p)(.*1.0")(>)',
                                                 '\\1\\3',
                                                 original_encoding_as_string)

    #         Get xpath of child and write full xpath with namespaces using dictionary
            abridged_xpath = get_abridged_xpath(child)
            for key, value in ns.items():
                full_xpath = re.sub('(.*)(xml:)(.*)', '\\1{http://www.w3.org/XML/1998/namespace}\\3', abridged_xpath)

            accepted_encoding_as_string = new_data.loc[new_data['abridged_xpath'] == abridged_xpath, 'new_encoding'][1]
            accepted_encoding_as_string = re.sub('(<p)(.*1.0")(>)',
                                                 '\\1\\3',
                                                 accepted_encoding_as_string)


            tree_as_string = re.sub(original_encoding_as_string,
                                    accepted_encoding_as_string,
                                    tree_as_string)

    #     Check well-formedness (will fail if not well-formed)
        doc = etree.fromstring(tree_as_string)

    #     Write changed XML.
        et = etree.ElementTree(doc)
        return (et)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.15 µs


## 1. Suggest NER


#### Intakes XML File & Produces CSV with Suggested Changes

In [4]:
%%time

# filename = abs_dir + 'Data/TestEncoding/EditingData/JQADiaries-v33-1821-12-p001 copy.xml'
filename = abs_dir + 'Data/TestEncoding/EditingData/test_xml-before.xml'

# Add or substract labels to list for NER to find.
# Complete list of NER labels: https://spacy.io/api/annotation
label_dict = {
    'PERSON':'persName',
    'LOC':'placeName', # Non-GPE locations, mountain ranges, bodies of water.
#     'GPE':'placeName', # Countries, cities, states.
#     'FAC':'placeName', # Buildings, airports, highways, bridges, etc.
#     'ORG':'orgName', # Companies, agencies, institutions, etc.
#     'NORP':'name', # Nationalities or religious or political groups.
#     'EVENT':'name', # Named hurricanes, battles, wars, sports events, etc.
#     'WORK_OF_ART':'name', # Titles of books, songs, etc.
#     'LAW':'name', # Named documents made into laws.
}


df = pd.DataFrame(columns = ['file', 'abridged_xpath', 'previous_encoding', 'entities'])


with open(filename, 'r') as xml_file:
    tree = etree.parse(xml_file)
    root = tree.getroot()
    ns = get_namespace(root)
    
    for child in root.findall('.//ns:p', ns):
        
        abridged_xpath = get_abridged_xpath(child)
        entities, previous_encoding = get_contents(child, './/ns:p', ns)        
        
        df = df.append({
            'file':re.sub('.*/(.*.xml)', '\\1', filename),
            'abridged_xpath':abridged_xpath,
            'previous_encoding': previous_encoding,
            'entities':entities
        },
            ignore_index = True)
        
df = df \
    .explode('entities') \
    .dropna()

df['new_encoding'] = df \
    .apply(lambda row: make_ner_suggestions(row['previous_encoding'], row['entities'], label_dict), axis = 1)


# Add additional columns for user input.
df['accept_change'] = 'y'  # Temporarily accepting all changes for testing.
df['make_hand_edits'] = ''
df['add_unique_identifier'] = ''

df.to_csv(abs_dir + 'Data/TestEncoding/EditingData/make_ner_suggestions.csv', sep = ',', index = False)
df.head()

CPU times: user 58.8 ms, sys: 5.47 ms, total: 64.3 ms
Wall time: 64.6 ms


Unnamed: 0,file,abridged_xpath,previous_encoding,entities,new_encoding,accept_change,make_hand_edits,add_unique_identifier
0,test_xml-before.xml,.//ns:body//{http://www.tei-c.org/ns/1.0}div[@...,"<p xmlns=""http://www.tei-c.org/ns/1.0"" xmlns:m...","(W. A. Schoolfield, PERSON)","<p xmlns=""http://www.tei-c.org/ns/1.0"" xmlns:m...",y,,
0,test_xml-before.xml,.//ns:body//{http://www.tei-c.org/ns/1.0}div[@...,"<p xmlns=""http://www.tei-c.org/ns/1.0"" xmlns:m...","(Abel, PERSON)","<p xmlns=""http://www.tei-c.org/ns/1.0"" xmlns:m...",y,,


## 2. Make NER Edits

#### Intakes CSV & Re-Writes XML Chunks According to User Input Added to CSV

#### Manually Accept/Reject Changes First

In [5]:
%%time

csv_df = pd.read_csv(abs_dir + 'Data/TestEncoding/EditingData/make_ner_suggestions.csv', sep = ',').fillna('')

csv_df.loc[1, 'accept_change'] = 'n'

new_data = inherit_changes(label_dict, csv_df)

print (new_data.shape)
new_data

(1, 8)
CPU times: user 7.93 ms, sys: 1.45 ms, total: 9.37 ms
Wall time: 8.27 ms


Unnamed: 0,file,abridged_xpath,previous_encoding,entities,new_encoding,accept_change,make_hand_edits,add_unique_identifier
1,test_xml-before.xml,.//ns:body//{http://www.tei-c.org/ns/1.0}div[@...,"<p xmlns=""http://www.tei-c.org/ns/1.0"" xmlns:m...","('Abel', 'PERSON')","<p xmlns=""http://www.tei-c.org/ns/1.0"" xmlns:m...",n,,


## Conditions of Accepting Changes & Encoding Inheritance

In inherit_changes(), include decision tree if there are hand edits to be made.

In [6]:
%%time

test = pd.DataFrame({'file':[0,0,0,0,0,0,0,0,0,0],
                     'abridged_xpath':[1, 1, 1, 2, 2, 2, 3, 3, 4, 4],
                     'previous_encoding':['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'Going to Boston for the holiday'],
                     'new_encoding':['k','l','m','n', 'o','p','q c', 'r', 's', 't'],
                     'accept_change':['y','n','y','n','y','y','y','y','y','y'],
                     'entities':["('a','LOC')", "('a','LOC')", "('a','LOC')", "('b','LOC')", "('b','LOC')", "('b','LOC')", "('c','LOC')", "('c','LOC')", "('d','LOC')", "('Boston','LOC')"],
                     'make_hand_edits':['','','','','','','','','','<placeName>Boston</placeName>'],
                     'add_unique_identifier':['','','','','','','','','','boston-1']})


inherit_changes(label_dict, test)

# Shouldn't accepted changes (no edits) wrap found entity in tags?
# No, because inherit_changes does not re-write xml UNLESS there are handmade edits.
# Entity to encoding changes happens in make_ner_suggestions(), which is separate from inherit_changes().
test

CPU times: user 14.2 ms, sys: 880 µs, total: 15.1 ms
Wall time: 14.5 ms


Unnamed: 0,file,abridged_xpath,previous_encoding,new_encoding,accept_change,entities,make_hand_edits,add_unique_identifier
0,0,1,a,k,y,"('a','LOC')",,
1,0,1,k,l,n,"('a','LOC')",,
2,0,1,k,m,y,"('a','LOC')",,
3,0,2,b,n,n,"('b','LOC')",,
4,0,2,b,o,y,"('b','LOC')",,
5,0,2,o,p,y,"('b','LOC')",,
6,0,3,c,q c,y,"('c','LOC')",,
7,0,3,q c,r,y,"('c','LOC')",,
8,0,4,d,s,y,"('d','LOC')",,
9,0,4,s,"Going to <placeNamexml:id=""boston-1"">Boston</p...",y,"('Boston','LOC')",<placeName>Boston</placeName>,boston-1


## 3. Write Changes to XML

Validating XML to schema after changes possible with accessible schema: http://emredjan.github.io/blog/2017/04/08/validating-xml/

In [7]:
%%time

# App Required Inputs (Drag and Drop Files).
input_filename = abs_dir + 'Data/TestEncoding/EditingData/test_xml-before.xml'
csv_df = pd.read_csv(abs_dir + 'Data/TestEncoding/EditingData/make_ner_suggestions.csv', sep = ',').fillna('')

csv_df.loc[1, 'accept_change'] = 'n' # Making temporary changes to avoid opening/saving file each time.

# App Created Output.
output_filename = abs_dir + 'Data/TestEncoding/EditingData/test_xml-after.xml'

with open(output_filename, 'wb') as xml_out:
    revised_tree = revise_xml(input_filename, csv_df, label_dict)
    revised_tree.write(xml_out, encoding = 'utf-8', pretty_print = True, xml_declaration = True)

CPU times: user 13.8 ms, sys: 3.33 ms, total: 17.1 ms
Wall time: 14.9 ms


In [8]:
# print (revised_tree.tostring(root, pretty_print=True))

print (etree.tostring(revised_tree, encoding='utf8', method='xml', pretty_print = True))

b'<?xml-model href="http://www.masshist.org/publications/pub/schema/codem-0.2-djqa.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>\n<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:mhs="http://www.masshist.org/ns/1.0" xmlns:tei="http://www.tei-c.org/ns/1.0" xml:id="v23-1821-05"> <teiHeader> <fileDesc> <titleStmt> <title>Test Encoding: Copy-Pasted Contents from Three Editions</title> </titleStmt> </fileDesc> </teiHeader> <text> <body> <div type="entry" xml:id="jqadiaries-v23-1821-05-01"> <head>1 May 1821</head> <bibl> <author>JQA</author> <date type="creation" when="1821-05-01"/> <editor role="transcription">Neal Millikan</editor> </bibl> <div type="docbody"> <opener> <dateline>Washington May 22 1832</dateline> <salute>My Dear Sir</salute> </opener> <dateline><hi rend="italic">May 1821.</hi></dateline> <p><date>1 V:15.</date> Tuesday. <persName>W. A. Schoolfield </persName>at the Office. About Slaves. Weekly tea-party resumed. I send you a list of the stoc

In [16]:

xml_in = revised_tree



new_data = inherit_changes(label_dict, csv_df)

root = xml_in.getroot()
ns = get_namespace(root)

tree_as_string = etree.tostring(tree, pretty_print = True).decode('utf-8')
tree_as_string = re.sub('\s+', ' ', tree_as_string) # remove additional whitespace

#     Declare accepted encoding to be written.
#     For each entry in file...
for child in root.findall('.//ns:p', ns):

#         Store original encoding.
    original_encoding_as_string = get_encoding(child)
#         Removing namespace information embedded in <p> tags.
    original_encoding_as_string = re.sub('(<p)(.*1.0")(>)',
                                         '\\1\\3',
                                         original_encoding_as_string)

#         Get xpath of child and write full xpath with namespaces using dictionary
    abridged_xpath = get_abridged_xpath(child)
#     for key, value in ns.items():
#         full_xpath = re.sub('(.*)(xml:)(.*)', '\\1{http://www.w3.org/XML/1998/namespace}\\3', abridged_xpath)

    accepted_encoding_as_string = new_data.loc[new_data['abridged_xpath'] == abridged_xpath, 'new_encoding'][1]
    accepted_encoding_as_string = re.sub('(<p)(.*1.0")(>)',
                                         '\\1\\3',
                                         accepted_encoding_as_string)


    tree_as_string = re.sub(original_encoding_as_string,
                            accepted_encoding_as_string,
                            tree_as_string)

#     Check well-formedness (will fail if not well-formed)
doc = etree.fromstring(tree_as_string)

#     Write changed XML.
et = etree.ElementTree(doc)
et = etree.tostring(et, encoding='utf8', method='xml', pretty_print = True)

et

b'<?xml-model href="http://www.masshist.org/publications/pub/schema/codem-0.2-djqa.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>\n<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:mhs="http://www.masshist.org/ns/1.0" xmlns:tei="http://www.tei-c.org/ns/1.0" xml:id="v23-1821-05"> <teiHeader> <fileDesc> <titleStmt> <title>Test Encoding: Copy-Pasted Contents from Three Editions</title> </titleStmt> </fileDesc> </teiHeader> <text> <body> <div type="entry" xml:id="jqadiaries-v23-1821-05-01"> <head>1 May 1821</head> <bibl> <author>JQA</author> <date type="creation" when="1821-05-01"/> <editor role="transcription">Neal Millikan</editor> </bibl> <div type="docbody"> <opener> <dateline>Washington May 22 1832</dateline> <salute>My Dear Sir</salute> </opener> <dateline><hi rend="italic">May 1821.</hi></dateline> <p><date>1 V:15.</date> Tuesday. W. A. Schoolfield at the Office. About Slaves. Weekly tea-party resumed. I send you a list of the stockholders of the Bank 