# Title

Current Issue

1. ~~def revise_with_hand_edits(label_dict, row): [called with inherit_changes()] breaks because row isn't defined~~
2. ~~Does revised_xml include all metadata as well?~~
3. ~~Generate download button after revising (with download capability, of course)~~
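4. Do I need to re-supply the document-level metadata shown below (the XML declaration and the `xml-model` processing instruction)? It is currently being erased from the revised file: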

\<?xml version="1.0" encoding="UTF-8"?\>
\<?xml-model href="http://www.masshist.org/publications/pub/schema/codem-0.2-djqa.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?\>

To Do
1. Ensure suggested encoding does not duplicate encoding already there (I believe up conversion handles this)
2. Work on KWIC View (this might be something that happens in the first application)
3. Expand NER search beyond \<p\>

In [None]:
import warnings, re, glob, datetime, csv, sys, os, base64, io, spacy
import pandas as pd
import numpy as np
from io import BytesIO
from lxml import etree
import dash, dash_table
import dash_core_components as dcc
from dash.dependencies import Input, Output, State
import dash_html_components as html
from jupyter_dash import JupyterDash
from urllib.parse import quote
from flask import Flask, send_from_directory, send_file

# Load spaCy's 'en_core_web_sm' English model once and keep it in `nlp`.
nlp = spacy.load('en_core_web_sm')

# Suppress DeprecationWarning messages.
warnings.simplefilter('ignore', DeprecationWarning)

# Base directory that data file paths are built from (hard-coded, machine-specific).
abs_dir = "/Users/quinn.wi/Documents/SemanticData/"

## Functions

In [None]:
%%time

"""
XML Parsing Function: Get Namespaces
"""
def get_namespace(root):
    """Return the namespace URI of ``root`` as a one-entry prefix mapping.

    Args:
        root: Element whose ``tag`` is written in Clark notation
            (``{namespace-uri}local-name``).

    Returns:
        dict: ``{"ns": namespace_uri}``.

    Raises:
        ValueError: If the tag does not start with a ``{namespace-uri}`` part.
    """
    namespace = re.match(r"\{(.*)\}", str(root.tag))

    # Without this guard a namespace-less root fails with an opaque
    # "'NoneType' object has no attribute 'group'".
    if namespace is None:
        raise ValueError(f"Root element {root.tag!r} does not declare a namespace.")

    return {"ns": namespace.group(1)}
        

"""
XML Parsing Function: Retrieve XPaths
"""
def get_abridged_xpath(child):
    """Build a short XPath for ``child`` anchored on its grandparent's xml:id.

    The path has the form
    ``.//ns:body//{grandparent.tag}[@xml:id="{id}"]/{child.tag}``.

    Args:
        child: Element exposing ``getparent()``, ``get()`` and ``tag``.

    Returns:
        str | None: The abridged XPath when the parent has no ``xml:id`` and
        the grandparent has one; ``None`` in every other case (the parent
        already carries an ``xml:id``, or there is no parent/grandparent, or
        the grandparent has no ``xml:id``).
    """
    xml_id_attribute = '{http://www.w3.org/XML/1998/namespace}id'

    parent = child.getparent()

    # A parent that already carries an xml:id has never produced a path here;
    # a missing parent used to raise AttributeError.
    if parent is None or parent.get(xml_id_attribute) is not None:
        return None

    grandparent = parent.getparent()
    if grandparent is None:
        return None

    xml_id = grandparent.get(xml_id_attribute)

    # The previous ``while`` loop never reassigned ``child``, so it spun
    # forever in this situation; give up explicitly instead.
    if xml_id is None:
        return None

    return f'.//ns:body//{grandparent.tag}[@xml:id="{xml_id}"]/{child.tag}'


        
"""
XML Parsing Function: Get Encoded Content
"""    
def get_encoding(elem):
    """Serialise ``elem`` to a single-line XML string.

    Args:
        elem: Element accepted by ``etree.tostring``.

    Returns:
        str: The pretty-printed serialisation of ``elem`` decoded as UTF-8,
        with every run of whitespace collapsed to one space.
    """
    encoding = etree.tostring(elem, pretty_print = True).decode('utf-8')

    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', encoding)



"""
XML & Regex: Up Conversion

Function replaces all spaces between beginning and end tags with underscores.
Then, function wraps each token (determined by whitespace) with word tags (<w>...</w>)
"""
def up_convert_encoding(column):
    """Wrap every space-separated token of an encoded string in ``<w>`` tags.

    Whitespace runs are collapsed to single spaces first. Whitespace inside a
    tag (``<...>``) is then replaced with underscores so that a tag is never
    split across tokens. Finally each token is wrapped as ``<w>token</w>``
    and the tokens are re-joined with single spaces.

    Args:
        column (str): Encoded text, e.g. the serialisation of one element.

    Returns:
        str: The up-converted string.
    """
    # Collapse ALL whitespace runs. The flag used to be passed positionally,
    # where re.sub expects ``count``, so only the first 8 runs were replaced.
    converted_encoding = re.sub(r'\s+', ' ', column)

    # Replace whitespace inside each tag with underscores. Working on the
    # matched tag itself (rather than re-using its text as a regex pattern)
    # keeps tags containing regex metacharacters intact and also covers tags
    # that spanned several lines in the input.
    converted_encoding = re.sub(
        r'<.*?>',
        lambda tag: re.sub(r'\s', '_', tag.group(0)),
        converted_encoding,
    )

    # Up-conversion: tokenize on single spaces and wrap each token.
    return ' '.join(f'<w>{token}</w>' for token in converted_encoding.split(' '))




"""
XML Parsing Function: Suggest New Encoding with Hand Edits

Similar to make_ner_suggestions(), this function folds in revision using regular expressions.
The outcome is the previous encoding with additional encoded information determined by user input.

Expected Columns:
    previous_encoding
    entities
    accept_changes
    make_hand_edits
"""
def revise_with_hand_edits(label_dict, make_hand_edits, 
                           label, entity, previous_encoding, new_encoding):
    """Fold a hand-written tag for ``entity`` into ``previous_encoding``.

    Args:
        label_dict (dict): Maps an NER label to an XML element name.
        make_hand_edits (str): Hand edit of the form ``<tag ...>text</tag>``;
            an empty string means "no hand edit".
        label (str): NER label; must be a key of ``label_dict``.
        entity (str): Entity text to wrap, tokens separated by single spaces.
        previous_encoding (str): Encoding to revise.
        new_encoding: Unused; kept so existing callers keep working.

    Returns:
        str | None: The revised encoding, or ``None`` when ``make_hand_edits``
        is empty.

    Raises:
        KeyError: If ``label`` is not in ``label_dict``.
        ValueError: If ``entity`` cannot be found in ``previous_encoding`` or
            ``make_hand_edits`` is not of the form ``<tag>text</tag>``.
    """
    label = label_dict[label]

    # Nothing to fold in.
    if make_hand_edits == '':
        return None

    # Up convert PREVIOUS ENCODING so that the entity can be located token by token.
    converted_encoding = up_convert_encoding(previous_encoding)

    # The entity as it appears between <w> tags, followed by whatever closes
    # its last token. The entity is escaped so punctuation such as "." or "("
    # is matched literally.
    entity_tokens = '</w> <w>'.join(entity.split(' '))
    entity_regex = '(' + re.escape(entity_tokens) + ')(.*?</w>)'
    entity_match = re.search(entity_regex, converted_encoding)
    if entity_match is None:
        raise ValueError(f'Entity {entity!r} was not found in the previous encoding.')

    # Opening-tag content (element name plus attributes) supplied by hand.
    identifier_regex = re.search(r'<(.+)>(.+)</.+>', make_hand_edits, re.DOTALL)
    if identifier_regex is None:
        raise ValueError(f'Hand edit {make_hand_edits!r} is not of the form <tag>text</tag>.')

    # Build the tagged entity in one step and substitute it literally.
    # Inserting <label> first and swapping in the attributes afterwards also
    # rewrote every pre-existing <label> tag in the encoding.
    tagged_entity = (f'<{identifier_regex.group(1)}>{entity_match.group(1)}'
                     f'</{label}>{entity_match.group(2)}')
    revised_encoding = converted_encoding.replace(entity_match.group(0), tagged_entity)

    # Clean up any additional whitespace and remove word tags.
    # (No positional flag here: re.sub's fourth positional argument is ``count``.)
    revised_encoding = re.sub(r'\s+', ' ', revised_encoding)
    revised_encoding = re.sub(r'<[/]?w>', '', revised_encoding)

    # Undo the underscores introduced by up-conversion.
    # NOTE: this replaces every underscore, including any in text or attribute values.
    revised_encoding = revised_encoding.replace('_', ' ')

    # Change curly quotation marks to straight ones.
    revised_encoding = revised_encoding.replace('“', '"').replace('”', '"')

    return revised_encoding



"""
XML & NER: Update/Inherit Accepted Changes
Expects a dataframe (from a .csv) with these columns:
    file
    abridged_xpath
    previous_encoding
    entities
    new_encoding
    accept_changes
    make_hand_edits
"""
def inherit_changes(label_dict, dataframe):
    """Resolve each row's decision and carry it forward within an element.

    Rows are processed in order. For every row the final encoding is:

    * the result of ``revise_with_hand_edits`` when ``make_hand_edits`` is
      not empty;
    * ``new_encoding`` when ``accept_changes`` is ``'y'``;
    * the row's current ``previous_encoding`` otherwise (change rejected).

    The final encoding is stored in the row's ``new_encoding``. When the next
    row has the same ``abridged_xpath`` and ``descendant_order``, it also
    becomes that row's ``previous_encoding``.

    Args:
        label_dict (dict): Passed through to ``revise_with_hand_edits``.
        dataframe (pandas.DataFrame): Needs the columns ``abridged_xpath``,
            ``descendant_order``, ``previous_encoding``, ``new_encoding``,
            ``accept_changes`` and ``make_hand_edits``; rows with hand edits
            also need ``label`` and ``entity``. The input is not modified.

    Returns:
        pandas.DataFrame: The last row of every
        (``abridged_xpath``, ``descendant_order``) group.
    """
    dataframe = dataframe.fillna('')
    group_columns = ['abridged_xpath', 'descendant_order']

    # Work with positions rather than ``index + 1`` labels so that frames
    # without a default RangeIndex are handled too.
    previous_col = dataframe.columns.get_loc('previous_encoding')
    new_col = dataframe.columns.get_loc('new_encoding')
    row_count = len(dataframe)

    for position in range(row_count):
        # Re-read the row so a previous_encoding inherited from the row
        # above is visible here.
        row = dataframe.iloc[position]

        if row['make_hand_edits'] != '':
            # Hand edits take precedence over accept/reject.
            final_encoding = revise_with_hand_edits(label_dict,
                                                    row['make_hand_edits'],
                                                    row['label'], row['entity'],
                                                    row['previous_encoding'], row['new_encoding'])
        elif row['accept_changes'] == 'y':
            # NER suggestion accepted as-is.
            final_encoding = row['new_encoding']
        else:
            # Rejected: keep what was there. This used to happen only for the
            # very last row of the frame, so a rejected suggestion at the end
            # of any other group survived.
            final_encoding = row['previous_encoding']

        dataframe.iat[position, new_col] = final_encoding

        # Hand the outcome to the next row when it targets the same element.
        # The hand-edit branch used to pass on the untouched suggestion
        # instead of the hand-revised encoding.
        following = position + 1
        if following < row_count:
            next_row = dataframe.iloc[following]
            if all(next_row[column] == row[column] for column in group_columns):
                dataframe.iat[following, previous_col] = final_encoding

    # Subset dataframe with finalized revisions.
    return dataframe.groupby(group_columns).tail(1)


"""
XML & NER: Write New XML File with Accepted Revisions
Expects:
    XML File with Original Encoding
    CSV File with Accepted Changes
    Label Dictionary
"""
def revise_xml(xml_file, csv_df, label_dict):
    """Apply the finalized encodings from ``csv_df`` to an XML document.

    The document is serialised to a single-line string, each row's
    ``previous_encoding`` is replaced by its ``new_encoding``, and the result
    is parsed again so that malformed output raises instead of being returned.

    Args:
        xml_file: XML source accepted by ``etree.fromstring``.
        csv_df (pandas.DataFrame): Review table; see ``inherit_changes``.
        label_dict (dict): Maps NER labels to XML element names.

    Returns:
        str: The revised document, pretty-printed.

    Raises:
        Whatever ``etree.fromstring`` raises when the input or the revised
        string is not well-formed.
    """
    # First, update data to reflect accepted changes.
    new_data = inherit_changes(label_dict, csv_df)

    # NOTE(review): only the root element is serialised here — presumably why
    # the XML declaration and xml-model instruction go missing; confirm.
    root = etree.fromstring(xml_file)
    tree_as_string = etree.tostring(root, pretty_print = True).decode('utf-8')
    tree_as_string = re.sub(r'\s+', ' ', tree_as_string) # remove additional whitespace

    # Write accepted code into XML tree.
    for _, row in new_data.iterrows():
        # Remove namespaces within tags so the strings line up with the
        # serialised tree.
        original_encoding_as_string = re.sub(r'^<(.*?)( xmlns.*?)>(.*)$',
                                             r'<\1>\3',
                                             row['previous_encoding'])
        accepted_encoding_as_string = re.sub(r'<(.*?)( xmlns.*?)>(.*)$',
                                             r'<\1>\3',
                                             row['new_encoding'])

        # An empty search string would insert the replacement between every
        # character of the document.
        if not original_encoding_as_string:
            continue

        # Literal replacement: the encodings are text, not regex patterns, so
        # characters such as "?", "(" or "\" must not be interpreted.
        tree_as_string = tree_as_string.replace(original_encoding_as_string,
                                                accepted_encoding_as_string)

    # Check well-formedness (will fail if not well-formed)
    doc = etree.fromstring(tree_as_string)

    # Convert to string.
    return etree.tostring(etree.ElementTree(doc), encoding='unicode',
                          method='xml', pretty_print = True)
    

"""
App Function: Write Filename when Uploaded
"""
def parse_contents(contents, filename):
    """Return the confirmation element for an uploaded file.

    Args:
        contents: Uploaded file contents (not used here).
        filename: Name shown in the confirmation heading.

    Returns:
        html.Div: A div holding one ``H4`` heading.
    """
    confirmation = html.H4(f'{filename} succesfully uploaded')
    return html.Div([confirmation])


"""
App Function: Make XML Revisions and Provide Download Link
"""
def commit_xml_revisions(xml_contents, csv_contents, xml_filename, csv_filename):
    """Decode one uploaded XML/CSV pair and return the revised XML.

    Both ``*_contents`` arguments are ``"<header>,<base64 payload>"`` strings.

    Returns:
        The string produced by ``revise_xml`` on success; otherwise an
        ``html.Div`` (decoding problem) or ``html.P`` (revision problem)
        carrying the error message.
    """
    # XML: base64 -> text -> UTF-8 bytes.
    try:
        _, xml_payload = xml_contents.split(',')
        xml_file = base64.b64decode(xml_payload).decode('utf-8').encode('utf-8')
    except Exception as e:
        return html.Div([f'There was an error processing {xml_filename}: {e}.'])

    # CSV: base64 -> text -> dataframe.
    try:
        _, csv_payload = csv_contents.split(',')
        csv_text = base64.b64decode(csv_payload).decode('utf-8')
        csv_df = pd.read_csv(io.StringIO(csv_text))
    except Exception as e:
        return html.Div([f'There was an error processing {csv_filename}: {e}.'])

    # Revision itself.
    try:
        return revise_xml(xml_file, csv_df, label_dict)
    except Exception as e:
        return html.P(f'Error with Revision Process: {e}')

## APP

In [None]:
%%time

# Maps each spaCy NER label to the XML element name used when encoding it.
label_dict = dict(
    PERSON = 'persName',
    LOC = 'placeName',       # Non-GPE locations, mountain ranges, bodies of water.
    GPE = 'placeName',       # Countries, cities, states.
    FAC = 'placeName',       # Buildings, airports, highways, bridges, etc.
    ORG = 'orgName',         # Companies, agencies, institutions, etc.
    NORP = 'name',           # Nationalities or religious or political groups.
    EVENT = 'name',          # Named hurricanes, battles, wars, sports events, etc.
    WORK_OF_ART = 'name',    # Titles of books, songs, etc.
    LAW = 'name',            # Named documents made into laws.
)

server = Flask(__name__)
app = JupyterDash(__name__, server = server)


def _upload_box(box_id, link_text):
    """Return a drag-and-drop upload area; both upload areas share this look."""
    return dcc.Upload(
        id = box_id,
        children = html.Div(['Drag and Drop or ', html.A(link_text)]),
        style = {
            'width': '95%',
            'height': '60px',
            'lineHeight': '60px',
            'borderWidth': '1px',
            'borderStyle': 'dashed',
            'borderRadius': '5px',
            'textAlign': 'center',
            'margin': '10px'
        },
        multiple = True # Allow multiple files to be uploaded
    )


# Layout: title, the two upload areas with their confirmation slots, then the
# output areas filled in by the callbacks.
app.layout = html.Div([
    html.H1('XML & NER Revision'),

    # Original XML.
    html.H2('Upload Original XML File'),
    _upload_box('upload-xml', 'Select XML File'),
    html.Div(id = 'xml-data-upload'),

    # CSV with the reviewed changes.
    html.H2('Upload CSV File with Accepted Changes'),
    _upload_box('upload-csv', 'Select CSV File'),
    html.Div(id = 'csv-data-upload'),

    # Output areas.
    html.Div(id = 'text-area', children = []),
    html.Div(id = 'button-area', n_clicks = 0, children = []),
    html.Div(id = 'download-area', children = [])
])


# App Callbacks
# Write XML Filename when Uploaded
@app.callback(
    Output('xml-data-upload', 'children'),
    Input('upload-xml', 'contents'),
    State('upload-xml', 'filename')
)
def update_xml_file(xml_contents, xml_filename):
    """Show one upload confirmation per XML file; nothing until a file arrives."""
    if xml_contents is None:
        return None

    return [
        parse_contents(content, name)
        for content, name in zip(xml_contents, xml_filename)
    ]

# Write CSV Filename when Uploaded
@app.callback(
    Output('csv-data-upload', 'children'),
    Input('upload-csv', 'contents'),
    State('upload-csv', 'filename')
)
def update_csv_file(csv_contents, csv_filename):
    """Show one upload confirmation per CSV file; nothing until a file arrives."""
    if csv_contents is None:
        return None

    return [
        parse_contents(content, name)
        for content, name in zip(csv_contents, csv_filename)
    ]


    
"""
Show Revisions
"""
# Print Revisions.   
@app.callback(
    Output('text-area', 'children'),
    [Input('upload-xml', 'contents'), Input('upload-csv', 'contents')],
    [State('upload-xml', 'filename'), State('upload-csv', 'filename')]
)
def print_revisions(xml_contents, csv_contents, xml_filename, csv_filename):
    """Return the revised XML for each uploaded XML/CSV pair.

    Until both uploads are present, a placeholder message is returned instead.
    """
    if not (xml_contents and csv_contents):
        return 'Waiting for Required Files'

    # Incorporate revisions, pairing the uploads in order.
    uploads = zip(xml_contents, csv_contents, xml_filename, csv_filename)
    return [
        commit_xml_revisions(xml_part, csv_part, xml_name, csv_name)
        for xml_part, csv_part, xml_name, csv_name in uploads
    ]


"""
Generate Download Option When Possible
"""
# Generates a download button for the resource
def build_download_button(uri):
    """Return a GET form with a single submit button that targets ``uri``."""
    submit = html.Button(
        className="button",
        type="submit",
        children=["Download Revised File"]
    )

    return html.Form(
        id = 'download-button',
        action = uri,
        method="get",
        children=[submit]
    )


# Downloads with Button Click
@app.callback(
    Output("button-area", "children"),
    [Input("text-area", "children"), Input('upload-xml', 'filename')]
)
def show_download_button(text, filename):
    """Offer the download button once the text area shows more than the placeholder."""
    if text == 'Waiting for Required Files':
        return None

    # The form targets the name of the first uploaded XML file.
    target = f"{filename[0]}"
    return [build_download_button(target)]


@app.callback(
    Output('download-area', 'children'),
    [Input('button-area', 'n_clicks'), Input('text-area', 'children'), Input('upload-xml', 'filename')]
)
def download_with_button_click(n_clicks, text, filename):
    """Write the revised XML to a file in the working directory.

    Args:
        n_clicks: Click count of the button area; nothing happens while it is 0.
        text: Children of the text area — a list of revised XML strings once
            revision has succeeded.
        filename: Names of the uploaded XML files; the first one names the output.

    Returns:
        str | None: A confirmation message after writing, otherwise ``None``.
    """
    if not n_clicks or not filename:
        return None

    # Only write real revisions: the text area may still hold the placeholder
    # string, or error components instead of XML strings.
    if not text or isinstance(text, str) or not all(isinstance(part, str) for part in text):
        return None

    # The name comes from the upload, so keep only its final component; it
    # must not be able to choose the directory that is written to.
    path = os.path.basename(f"{filename[0]}")

    # Explicit encoding: the platform default may not be able to represent the XML text.
    with open(path, "w", encoding="utf-8") as file:
        file.write(' '.join(text))

    return f'{filename[0]} downloaded!'
            

# Start the Dash server only when this module is the entry point.
if __name__ == "__main__":
    app.run_server(mode = 'inline', debug = True) # mode = 'inline' for JupyterDash

## Test Area

In [None]:
%%time

# Hand-built review table for trying out inherit_changes(): ten rows across
# four abridged_xpath values; only the last row carries a hand edit.
# NOTE(review): there is no 'descendant_order' column here, but
# inherit_changes() compares and groups on it — confirm this cell still runs.
test = pd.DataFrame({'file':[0,0,0,0,0,0,0,0,0,0],
                     'abridged_xpath':[1, 1, 1, 2, 2, 2, 3, 3, 4, 4],
                     'previous_encoding':['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'Going to Boston for the holiday'],
                     'new_encoding':['k','l','m','n', 'o','p','q <LOC>c</LOC>', 'r', 's', 't'],
                     'accept_changes':['y','n','y','n','y','y','y','y','y','y'],
                     'entities':["('a','LOC')", "('a','LOC')", "('a','LOC')", "('b','LOC')", "('b','LOC')", "('b','LOC')", "('c','LOC')", "('c','LOC')", "('d','LOC')", "('Boston','LOC')"],
                     'entity':['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'Boston'],
                     'label':['LOC','LOC','LOC','LOC','LOC','LOC','LOC','LOC','LOC','LOC'],
                     'make_hand_edits':['','','','','','','','','','<placeName>Boston</placeName>'],
                     'add_unique_identifier':['','','','','','','c-2','','','boston-1']})

# `index` and `row` are not used in the body: every pass runs inherit_changes()
# on the whole frame again and prints the result.
for index, row in test.iterrows():
    new = inherit_changes(label_dict, test)
    print (new)
    
# Display the input frame as the cell output.
test

In [None]:
%%time

# Load the review CSV from under abs_dir and resolve its decisions.
cf = abs_dir + 'Data/TestEncoding/EditingData/Data.csv'
csv_df = pd.read_csv(cf)
new_data = inherit_changes(label_dict, csv_df)

# filename = abs_dir + "Data/TestEncoding/EditingData/test_xml-before.xml"
# xml_file = open(filename).read()
# root = etree.fromstring(xml_file.encode('utf-8'))
# ns = get_namespace(root)

# tree_as_string = etree.tostring(r oot, pretty_print = True).decode('utf-8')
# tree_as_string = re.sub('\s+', ' ', tree_as_string) # remove additional whitespace

In [None]:
new_data

In [None]:
csv_df

In [None]:
# For every row, report whether the row labelled index + 1 targets the same
# element (same abridged_xpath and descendant_order). A KeyError from the
# lookup is reported as the last-row case.
for index, row in csv_df.iterrows():
    following = index + 1
    try:
        same_element = (
            csv_df.loc[following, 'abridged_xpath'] == row['abridged_xpath']
            and csv_df.loc[following, 'descendant_order'] == row['descendant_order']
        )
    except KeyError as e:
        print ('Last row behavior')
    else:
        if same_element:
            print ('Descendant orders equivalent')
        else:
            print ('Recognizes different elements')