# XML & NER Revision App

In [1]:
import warnings, re, glob, datetime, csv, sys, os, base64, io, spacy
import pandas as pd
import numpy as np
from lxml import etree
from urllib.parse import quote as urlquote
from flask import Flask, send_from_directory, send_file
import dash, dash_table
import dash_core_components as dcc
from dash.dependencies import Input, Output, State
import dash_html_components as html
from jupyter_dash import JupyterDash

# Load the spaCy 'en_core_web_sm' language model once at module level;
# get_spacy_entities() calls `nlp` to find named entities.
nlp = spacy.load('en_core_web_sm')

# Silence DeprecationWarning messages (only this category is filtered here).
warnings.simplefilter('ignore', DeprecationWarning)

# Declare directory location to shorten filepaths later.
# NOTE(review): hard-coded to one user's machine and not referenced in the
# visible part of this file -- confirm it is still needed.
abs_dir = "/Users/quinn.wi/Documents/SemanticData/"

## Functions

In [38]:
%%time

"""
XML Parsing Function: Get Namespaces
"""
def get_namespace(root):
    """Return the namespace of ``root`` as a prefix map for XPath calls.

    Parameters:
        root: an element whose ``tag`` is in Clark notation
            (``{namespace-uri}localname``).

    Returns:
        dict: ``{"ns": namespace_uri}``.

    Raises:
        ValueError: if the tag does not start with a ``{...}`` namespace part.
    """
    tag = str(root.tag)
    # Stop at the first closing brace so nothing after the namespace URI can
    # be swallowed into it.
    found = re.match(r"\{([^}]*)\}", tag)
    if found is None:
        # Previously this surfaced as "'NoneType' object has no attribute
        # 'group'", which hid the real cause.
        raise ValueError(
            f"Element '{tag}' has no namespace; expected '{{uri}}name'."
        )
    return {"ns": found.group(1)}


"""
XML Parsing Function: Retrieve XPaths
"""
def get_abridged_xpath(elem):
    """Build a short XPath-like key for the nearest ancestor with an xml:id.

    Walks up from ``elem`` (the element itself is not considered) and stops
    at the first ancestor that carries an ``xml:id`` attribute.

    Parameters:
        elem: an element exposing ``getparent()``; ancestors must expose
            ``get()`` and ``tag``.

    Returns:
        str | None: ``.//ns:body//<tag>[@xml:id="<id>"]`` for that ancestor,
        or ``None`` when no ancestor has an ``xml:id``.
    """
    xml_id_attribute = '{http://www.w3.org/XML/1998/namespace}id'

    # Start at the immediate parent: the previous loop skipped it, so an
    # element whose direct parent held the id produced no xpath at all.
    ancestor = elem.getparent()
    while ancestor is not None:
        xml_id = ancestor.get(xml_id_attribute)
        if xml_id is not None:
            # The tag is interpolated exactly as the element reports it, so
            # the key keeps the same format as before for string comparison.
            return f'.//ns:body//{ancestor.tag}[@xml:id="{xml_id}"]'
        ancestor = ancestor.getparent()

    # Reached the top of the tree without finding an identified ancestor.
    return None
        

"""
XML Parsing Function: Convert to String
"""
def get_text(elem):
    """Return the text content of ``elem`` with whitespace runs collapsed.

    The element is serialized with lxml's text method (tail text excluded)
    and every run of whitespace is replaced by a single space.
    """
    raw_text = etree.tostring(elem, encoding='unicode', method='text', with_tail=False)
    return re.sub(r'\s+', ' ', raw_text)

        
"""
XML Parsing Function: Get Encoded Content
"""    
def get_encoding(elem):
    """Serialize ``elem`` to an XML string with whitespace runs collapsed.

    Parameters:
        elem: an lxml element.

    Returns:
        str: the pretty-printed serialization of ``elem`` in which every run
        of whitespace has been replaced by a single space.
    """
    # NOTE(review): with_tail is not passed here although get_text() passes
    # with_tail=False -- confirm whether trailing text should be included.
    encoding = etree.tostring(elem, pretty_print=True).decode('utf-8')
    # Raw string: '\s' inside a plain literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    return re.sub(r'\s+', ' ', encoding)


"""
XML Parsing Function: Intersperse Entity with Likely TEI Information for Capacious Regex
"""
def intersperse(lst, item):
    """Return a list in which every element of ``lst`` is followed by ``item``.

    The result has twice as many entries as ``lst``; ``item`` also follows the
    last element.
    """
    interleaved = []
    for element in lst:
        interleaved.append(element)
        interleaved.append(item)
    return interleaved


"""
XML Parsing Function: Write New Encoding
"""
def make_ner_suggestions(previous_encoding, entities, label_dict):
    """Wrap an entity found in ``previous_encoding`` in its mapped element.

    Parameters:
        previous_encoding (str): XML markup to search.
        entities: pair ``(entity_text, ner_label)``.
        label_dict (dict): maps NER labels to element names.

    Returns:
        str | None: the whitespace-normalized encoding with every occurrence
        of the matched text wrapped as ``<label>...</label>``; ``None`` when
        the entity is not found; the string ``'Error Occurred with Regex.'``
        when the search pattern cannot be built or run.

    Raises:
        KeyError: if the NER label is not a key of ``label_dict``.

    Note:
        Tags and whitespace may sit between the entity's characters, so the
        wrapped span can cross element boundaries; the result is not checked
        for well-formedness.
    """
    # Collapse ALL whitespace runs. Flags must not be passed positionally to
    # re.sub: the fourth positional argument is `count`, so re.MULTILINE (8)
    # used to limit the clean-up to the first eight runs.
    previous_encoding = re.sub(r'\s+', ' ', previous_encoding)
    entity = entities[0]
    label = label_dict[entities[1]]

    try:
        # Markup or whitespace that may interrupt the entity between any two
        # of its characters. Written as an alternation: inside [...] the
        # pieces would only be a set of single characters.
        filler = r'(?:<[^>]*>|\s)*'
        # Escape each character so '.', '(', '?' ... are matched literally.
        # No filler is added after the last character, so the match never
        # swallows the markup or whitespace that follows the entity.
        pattern = filler.join(re.escape(char) for char in entity)
        match = re.search(pattern, previous_encoding)
    except (re.error, TypeError):
        return 'Error Occurred with Regex.'

    # No match, or an empty entity that "matches" the empty string.
    if match is None or not match.group(0):
        return None

    found = match.group(0)
    # Literal replacement: the matched text may contain regex metacharacters
    # and must not be reused as a pattern.
    return previous_encoding.replace(found, f'<{label}>{found}</{label}>')
        
        

"""
NER Function
"""
# spaCy
def get_spacy_entities(text, label_dict):
    """Return ``(entity_text, label)`` pairs for the entities spaCy finds.

    ``text`` is run through the module-level ``nlp`` model; only entities
    whose label is a key of ``label_dict`` are kept, in document order.
    """
    doc = nlp(text)
    return [
        (str(ent), ent.label_)
        for ent in doc.ents
        if ent.label_ in label_dict
    ]


"""
XML & NER: Retrieve Contents
"""
def get_contents(ancestor, xpath_as_string, namespace):
    """Return the named entities and the encoded markup of ``ancestor``.

    ``xpath_as_string`` and ``namespace`` are accepted but not used here.
    The entity labels are filtered with the module-level ``label_dict``.

    Returns:
        tuple: ``(entities, encoded_content)`` where ``entities`` is the list
        produced by get_spacy_entities() and ``encoded_content`` the string
        produced by get_encoding().
    """
    plain_text = get_text(ancestor)  # Plain text for the NER model.
    encoded_content = get_encoding(ancestor)  # Markup as a string.
    entities = get_spacy_entities(plain_text, label_dict)
    return entities, encoded_content


"""
XML Parsing Function: Suggest New Encoding with Hand Edits

Similar to make_ner_suggestions(), this function folds in revision using regular expressions.
The outcome is the previous encoding with additional encoded information determined by user input.

Expected Columns:
    previous_encoding
    entities
    accept_change
    make_hand_edits
    add_unique_identifier
"""
def _cell_text(value):
    """Return a table cell as text, mapping missing values (None/NaN) to ''."""
    if value is None:
        return ''
    # NaN is the only float that is not equal to itself.
    if isinstance(value, float) and value != value:
        return ''
    return str(value)


def _parse_entity_cell(raw):
    """Split the ``entities`` cell into ``(entity_text, ner_label)``."""
    import ast

    if isinstance(raw, (tuple, list)):
        return str(raw[0]), str(raw[1])
    if not isinstance(raw, str):
        raise ValueError(f'Cannot read an entity from {raw!r}.')

    # The cell is a tuple that was turned into text; parse it as a literal
    # so both "('a','b')" and "('a', 'b')" are understood.
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        parsed = None
    if isinstance(parsed, tuple) and len(parsed) == 2:
        return str(parsed[0]), str(parsed[1])

    # Fallback for text that is not a valid literal (e.g. an unescaped
    # apostrophe inside the entity).
    loose = re.fullmatch(r"\(\s*'(.*)'\s*,\s*'(.*)'\s*\)", raw.strip(), re.DOTALL)
    if loose is None:
        raise ValueError(f'Cannot read an entity from {raw!r}.')
    return loose.group(1), loose.group(2)


def _add_xml_id(markup, identifier):
    """Add ``xml:id`` to the first opening (or self-closing) tag in ``markup``."""
    return re.sub(
        r'(<[^\s<>/!?][^<>]*?)(\s*/?>)',
        # A space is required between the tag name/attributes and xml:id.
        lambda tag: f'{tag.group(1)} xml:id="{identifier}"{tag.group(2)}',
        markup,
        count=1,
    )


def revise_with_hand_edits(label_dict, row):
    """Apply a reviewer's hand edit and/or unique identifier to an encoding.

    Parameters:
        label_dict (dict): maps NER labels to element names.
        row: mapping (dict or table row) with the keys ``previous_encoding``,
            ``entities``, ``make_hand_edits`` and ``add_unique_identifier``.
            Missing values (None/NaN) in the last two count as empty.

    Returns:
        str | None: the whitespace-normalized previous encoding in which the
        first occurrence of the entity is replaced by

        * the hand edit, when one is given (with ``xml:id`` added to its
          first opening tag when an identifier is given too), or
        * ``<label xml:id="...">entity</label>`` when only an identifier is
          given;

        ``None`` when the entity is not found or neither input is given.

    Raises:
        KeyError: if the entity's NER label is not a key of ``label_dict``.
        ValueError: if the ``entities`` cell cannot be parsed, or an
            identifier is given but the hand edit contains no tag to carry it.
    """
    # Flags must not be passed positionally to re.sub (that slot is `count`).
    previous_encoding = re.sub(r'\s+', ' ', row['previous_encoding'])

    entity, ner_label = _parse_entity_cell(row['entities'])
    label = label_dict[ner_label]
    hand_edit = _cell_text(row['make_hand_edits'])
    identifier = _cell_text(row['add_unique_identifier'])

    # Allow tags and whitespace between the (escaped) entity characters.
    filler = r'(?:<[^>]*>|\s)*'
    pattern = filler.join(re.escape(char) for char in entity)
    match = re.search(pattern, previous_encoding) if pattern else None
    if match is None:
        return None

    if hand_edit:
        replacement = hand_edit
    elif identifier:
        # No hand edit: tag the entity with its mapped element so that there
        # is an element to carry the identifier.
        replacement = f'<{label}>{match.group(0)}</{label}>'
    else:
        return None

    if identifier:
        with_identifier = _add_xml_id(replacement, identifier)
        if with_identifier == replacement:
            raise ValueError(
                'make_hand_edits contains no element to attach the '
                f'identifier {identifier!r} to.'
            )
        replacement = with_identifier

    # Replace only the matched span: an xml:id must stay unique, and the
    # matched text must not be reused as a regular expression.
    revised_encoding = (
        previous_encoding[:match.start()]
        + replacement
        + previous_encoding[match.end():]
    )

    # Clean up any additional whitespace.
    return re.sub(r'\s+', ' ', revised_encoding)


"""
XML & NER: Update/Inherit Accepted Changes
Expects a dataframe (from a .csv) with these columns:
    file
    abridged_xpath
    previous_encoding
    entities
    new_encoding
    accept_change
    make_hand_edits
    add_unique_identifier
"""
def inherit_changes(label_dict, dataframe):
    """Carry accepted revisions forward row by row and keep one row per xpath.

    For every row, in order, the outcome is

    * the hand-revised encoding when the change is accepted (``'y'``) and a
      hand edit or unique identifier is given,
    * the row's ``new_encoding`` when the change is accepted as-is,
    * the row's ``previous_encoding`` when the change is rejected.

    The outcome is stored in the row's ``new_encoding`` and, when the next
    row has the same ``abridged_xpath``, becomes that row's
    ``previous_encoding``.

    Parameters:
        label_dict (dict): passed through to revise_with_hand_edits().
        dataframe (pandas.DataFrame): needs the columns ``abridged_xpath``,
            ``previous_encoding``, ``new_encoding``, ``accept_change``,
            ``make_hand_edits`` and ``add_unique_identifier``. It is updated
            in place.

    Returns:
        pandas.DataFrame: the last row of each ``abridged_xpath`` group.
    """
    # Work by position so the frame's index does not have to be 0..n-1.
    xpath_col = dataframe.columns.get_loc('abridged_xpath')
    previous_col = dataframe.columns.get_loc('previous_encoding')
    new_col = dataframe.columns.get_loc('new_encoding')
    row_count = len(dataframe)

    for position in range(row_count):
        # Read the row fresh each time: rows handed out by iterrows() do not
        # see the values written during earlier iterations.
        row = dataframe.iloc[position].copy()

        # Blank CSV cells may arrive as NaN; treat them as empty text.
        for column in ('make_hand_edits', 'add_unique_identifier'):
            if pd.isna(row[column]):
                row[column] = ''

        accepted = row['accept_change'] == 'y'
        edited_by_hand = (
            row['make_hand_edits'] != '' or row['add_unique_identifier'] != ''
        )

        if accepted and edited_by_hand:
            outcome = revise_with_hand_edits(label_dict, row)
            if outcome is None:
                # The revision could not be applied; keep the encoding as it
                # was instead of passing None on to later rows.
                warnings.warn(
                    f"Hand edits for {row['entities']!r} could not be applied; "
                    "keeping the previous encoding."
                )
                outcome = row['previous_encoding']
        elif accepted:
            # NOTE(review): new_encoding comes straight from the input table
            # and is not recomputed from an inherited previous_encoding --
            # confirm the table was generated cumulatively.
            outcome = row['new_encoding']
        else:
            outcome = row['previous_encoding']

        # Always record the outcome, so a rejected suggestion can never be
        # the value that survives the final tail(1).
        dataframe.iat[position, new_col] = outcome

        next_position = position + 1
        if (next_position < row_count
                and dataframe.iat[next_position, xpath_col] == row['abridged_xpath']):
            dataframe.iat[next_position, previous_col] = outcome

    return dataframe.groupby('abridged_xpath').tail(1)



"""
XML & NER: Write New XML File with Accepted Revisions
Expects:
    XML File with Original Encoding
    CSV File with Accepted Changes
    Label Dictionary
"""
def revise_xml(xml_in, csv_df, label_dict):
    """Return a new XML tree in which accepted revisions replace paragraphs.

    Parameters:
        xml_in: path to, or open file of, the original XML document. Raw XML
            content given as ``bytes`` is accepted as well.
        csv_df (pandas.DataFrame): table of reviewed changes in the format
            expected by inherit_changes().
        label_dict (dict): maps NER labels to element names.

    Returns:
        lxml.etree.ElementTree: the revised document. Parsing the revised
        string fails if the result is not well-formed.
    """
    # First, update data to reflect accepted changes.
    new_data = inherit_changes(label_dict, csv_df)

    # Raw XML bytes (optionally preceded by a BOM or whitespace) are wrapped
    # so lxml reads them as content; anything else is handed to lxml as a
    # path or file object.
    if (isinstance(xml_in, (bytes, bytearray))
            and bytes(xml_in).lstrip(b'\xef\xbb\xbf \t\r\n').startswith(b'<')):
        source = io.BytesIO(bytes(xml_in))
    else:
        source = xml_in

    tree = etree.parse(source)
    root = tree.getroot()
    ns = get_namespace(root)

    tree_as_string = etree.tostring(tree, pretty_print=True).decode('utf-8')
    tree_as_string = re.sub(r'\s+', ' ', tree_as_string)  # remove additional whitespace

    # Removes the namespace information embedded in a serialized <p> tag.
    embedded_namespace = re.compile(r'(<p)(.*1.0")(>)')

    # NOTE(review): paragraphs that share the same identified ancestor also
    # share one abridged xpath and therefore one accepted encoding -- confirm
    # the input table has one paragraph per xpath.
    for child in root.findall('.//ns:p', ns):
        # Store original encoding.
        original_encoding_as_string = embedded_namespace.sub(
            r'\1\3', get_encoding(child)
        )

        abridged_xpath = get_abridged_xpath(child)
        if abridged_xpath is None:
            continue

        accepted_rows = new_data.loc[
            new_data['abridged_xpath'] == abridged_xpath, 'new_encoding'
        ]
        # Paragraphs without a reviewed row stay as they are.
        if accepted_rows.empty:
            continue

        # Select by position: after grouping, the row keeps its original
        # index label, so a lookup by the label 1 is almost always wrong.
        accepted_encoding_as_string = accepted_rows.iloc[-1]
        if not isinstance(accepted_encoding_as_string, str):
            continue
        accepted_encoding_as_string = embedded_namespace.sub(
            r'\1\3', accepted_encoding_as_string
        )

        # Literal replacement: paragraph text is not a regular expression and
        # may contain characters such as '.', '?', '(' or a backslash.
        tree_as_string = tree_as_string.replace(
            original_encoding_as_string, accepted_encoding_as_string
        )

    # Check well-formedness (will fail if not well-formed)
    doc = etree.fromstring(tree_as_string)

    return etree.ElementTree(doc)
    

"""
App Function: Write Filename when Uploaded
"""
def parse_contents(contents, filename):
    """Build the confirmation shown after a file has been uploaded.

    Parameters:
        contents: the uploaded content (not used here).
        filename: name to show in the message.

    Returns:
        html.Div: a container holding one heading with the message.
    """
    return html.Div([
        # Spelling of the user-facing message corrected ("successfully").
        html.H4(f'{filename} successfully uploaded')
    ])


"""
App Function: Make XML Revisions and Provide Download Link
"""
def make_xml_revisions(xml_contents, csv_contents,
                       xml_filename='the XML file', csv_filename='the CSV file'):
    """Decode an uploaded XML/CSV pair and build the revised XML.

    Parameters:
        xml_contents (str): upload content of the form
            ``<content type>,<base64 data>``.
        csv_contents (str): upload content of the same form.
        xml_filename (str): name used in the XML error message.
        csv_filename (str): name used in the CSV error message.

    Returns:
        The result of revise_xml(), or an ``html.Div`` with an error message
        when one of the uploads cannot be decoded.
    """
    try:
        # XML Contents
        _, xml_content_string = xml_contents.split(',', 1)
        # Decoding and re-encoding validates that the upload is UTF-8.
        xml_decoded = base64.b64decode(xml_content_string).decode('utf-8')
        xml_file = xml_decoded.encode('utf-8')

    except Exception as e:
        return html.Div([f'There was an error processing {xml_filename}: {e}.'])

    try:
        # CSV Contents
        _, csv_content_string = csv_contents.split(',', 1)
        csv_decoded = base64.b64decode(csv_content_string).decode('utf-8')
        # Keep blank cells as '' (not NaN): the revision code tests the
        # hand-edit columns against the empty string.
        csv_df = pd.read_csv(io.StringIO(csv_decoded), keep_default_na=False)

    except Exception as e:
        return html.Div([f'There was an error processing {csv_filename}: {e}.'])

    # NOTE(review): xml_file holds the XML content as bytes, not a path --
    # confirm revise_xml() accepts content.
    revised_data = revise_xml(xml_file, csv_df, label_dict)

    return revised_data
# #     Return HTML with outputs.
#     return html.Div([
#         html.A(
#         html.Button('Download New XML File'),
#         id = 'download-link',
#         download = 'New Data...',
#         href = "",
#         target = "_blank"
#     ),
        
# #         Break & Horizontal line
#         html.Br(),
#         html.Hr(),
#     ])

CPU times: user 14 µs, sys: 0 ns, total: 14 µs
Wall time: 16 µs


## App

In [17]:
%%time

# Maps the entity labels kept from spaCy (keys) to the element name used when
# an entity is wrapped in markup (values). Entities whose label is not a key
# here are dropped by get_spacy_entities().
label_dict = {
    'PERSON':'persName',
    'LOC':'placeName', # Non-GPE locations, mountain ranges, bodies of water.
    'GPE':'placeName', # Countries, cities, states.
    'FAC':'placeName', # Buildings, airports, highways, bridges, etc.
    'ORG':'orgName', # Companies, agencies, institutions, etc.
    'NORP':'name', # Nationalities or religious or political groups.
    'EVENT':'name', # Named hurricanes, battles, wars, sports events, etc.
    'WORK_OF_ART':'name', # Titles of books, songs, etc.
    'LAW':'name', # Named documents made into laws.
}

# Flask server shared with the Dash app, so plain Flask routes (see
# download_file) can be registered next to the Dash callbacks.
server = Flask(__name__)
app = JupyterDash(__name__, server = server)


# Layout.
# Look shared by both drag-and-drop upload areas.
_UPLOAD_AREA_STYLE = {
    'width': '95%',
    'height': '60px',
    'lineHeight': '60px',
    'borderWidth': '1px',
    'borderStyle': 'dashed',
    'borderRadius': '5px',
    'textAlign': 'center',
    'margin': '10px',
}


def _upload_area(component_id):
    """Build one drag-and-drop upload area with the given component id."""
    return dcc.Upload(
        id=component_id,
        children=html.Div([
            'Drag and Drop or ', html.A('Select File')
        ]),
        style=dict(_UPLOAD_AREA_STYLE),
        multiple=True,  # Allow multiple files to be uploaded
    )


app.layout = html.Div([
    # Title
    html.H1('XML & NER Revision'),

    # Upload area for the original XML file.
    html.H2('Upload Original XML File'),
    _upload_area('upload-xml'),
    html.Div(id='xml-data-upload'),

    # Upload area for the CSV file with accepted changes.
    html.H2('Upload CSV File with Accepted Changes'),
    _upload_area('upload-csv'),
    html.Div(id='csv-data-upload'),

    html.Div(id='csv-table'),

    # Download link for the revised XML.
    html.A(
        html.Button('Download New XML File'),
        id='download-link',
        download='New Data...',
        href="revisions",
        target="_blank",
    ),
])


# App Callbacks
# Write XML Filename when Uploaded
@app.callback(
    Output('xml-data-upload', 'children'),
    Input('upload-xml', 'contents'),
    State('upload-xml', 'filename')
)
def update_xml_file(xml_contents, xml_filename):
    """Show one confirmation per uploaded XML file.

    Returns ``None`` while nothing has been uploaded; otherwise a list with
    the parse_contents() result for every (content, filename) pair.
    """
    if xml_contents is None:
        return None
    return [
        parse_contents(content, name)
        for content, name in zip(xml_contents, xml_filename)
    ]

# Write CSV Filename when Uploaded
@app.callback(
    Output('csv-data-upload', 'children'),
    Input('upload-csv', 'contents'),
    State('upload-csv', 'filename')
)
def update_csv_file(csv_contents, csv_filename):
    """Show one confirmation per uploaded CSV file.

    Returns ``None`` while nothing has been uploaded; otherwise a list with
    the parse_contents() result for every (content, filename) pair.
    """
    if csv_contents is None:
        return None
    return [
        parse_contents(content, name)
        for content, name in zip(csv_contents, csv_filename)
    ]

# # Create DATATABLE TO VERIFY CONTENTS
# @app.callback(
#     Output('csv-table', 'children'),
#     Input('upload-csv', 'contents')
# )
# def update_table(csv_contents):
#     if csv_contents is not None:
#         return dash_table.DataTable(
#             data = pd.read_csv(csv_contents, sep = ',').to_dict('records'),
#             columns = [{'name':i, 'id':i} for i in df.columns],
#             page_size=5,
#             export_format = 'csv',            
#         )
    
    
# # Create NEW XML File with Revisions.
# @app.callback(
#     Output('download-link', 'href'),
#     [Input('upload-xml', 'xml-contents'), Input('upload-csv', 'csv-contents')]
# )

# def provide_download_link(xml_contents, csv_contents):
#     if xml_contents is not None and csv_contents is not None:
#         revised_children = [
#             make_xml_revisions(xc, cc) for xc, cc in zip(xml_contents, csv_contents)
#         ]
        
#         download_string = "data:text/csv;charset=utf-8," + urllib.quote(revised_children)
        
#         return download_string

@server.route("/downloadable/<path>")
def download_file (path = None):
    """Send a file from the ``downloadable`` directory as an attachment.

    Parameters:
        path: file name taken from the URL.

    Returns:
        The Flask response produced by ``send_from_directory``.
    """
    # send_from_directory joins the requested name onto the directory safely,
    # instead of concatenating a URL-supplied string into a file path.
    return send_from_directory("downloadable", path, as_attachment=True)
    
# Start the Dash app only when this file is executed as the main module.
if __name__ == "__main__":
    app.run_server(mode = 'inline', debug = True) # mode = 'inline' for JupyterDash

CPU times: user 20.7 ms, sys: 3.98 ms, total: 24.7 ms
Wall time: 367 ms


# Test With Dynamically Generated BUTTON & TEXT