# Test Area

In [1]:
import warnings, re, glob, datetime, csv, sys, os, base64, io, spacy, datetime
import pandas as pd
import numpy as np
# import xml.etree.ElementTree as ET
from lxml import etree

import dash, dash_table
import dash_core_components as dcc
from dash.dependencies import Input, Output, State
from dash.exceptions import PreventUpdate
import dash_html_components as html
from jupyter_dash import JupyterDash

# Load the spaCy pipeline 'en_core_web_sm' once; `nlp` is kept as a module-level
# object for later cells.
nlp = spacy.load('en_core_web_sm')

# Silence DeprecationWarning messages for every statement that runs after this line.
warnings.simplefilter('ignore', DeprecationWarning)

# Declare directory location to shorten filepaths later.
# The DSG_MHS_DIR environment variable, when set, overrides the default checkout
# location so the notebook is not tied to a single machine. os.path.join(..., "")
# guarantees the trailing separator that later `abs_dir + "..."` concatenations need.
abs_dir = os.path.join(
    os.environ.get("DSG_MHS_DIR", "/Users/quinn.wi/Documents/GitHub/dsg-mhs/"),
    ""
)

"""
XML Parsing Function: Get Namespaces
"""
def get_namespace(root):
    """
    Extract the namespace URI from an element's tag.

    The tag is expected in the ``{uri}localname`` form; the URI between the
    braces is returned under the prefix "ns" so it can be passed as the
    namespace map of find()/findall() calls.

    Parameters:
        root: object whose ``tag`` attribute is read (converted with str()).

    Returns:
        dict: ``{"ns": <namespace uri>}``.

    Raises:
        ValueError: if the tag does not start with a ``{uri}`` part.
    """
    # [^}]* stops at the first closing brace, so a later "}" in the tag can
    # never be swallowed into the URI.
    namespace = re.match(r"\{([^}]*)\}", str(root.tag))
    if namespace is None:
        raise ValueError(f"Tag {root.tag!r} carries no namespace.")
    return {"ns": namespace.group(1)}

In [2]:
%%time

# Read the file as bytes inside a context manager so the handle is always closed.
# Parsing from bytes matters: lxml refuses str input that carries an XML encoding
# declaration, which this file has.
with open(abs_dir + "TestEncoding/EditingData/ESR-EDA-1890-03-08 copy.xml", "rb") as xml_handle:
    xml_file = xml_handle.read()

root = etree.fromstring(xml_file)
ns = get_namespace(root)

# Create a change element for revisionDesc.
# If revisionDesc already exists, reuse it...
revision_desc = root.find('.//ns:teiHeader/ns:revisionDesc', ns)

# ...else, create revisionDesc with SubElement first, so the change element
# always has a parent to attach to.
if revision_desc is None:
    revision_desc = etree.SubElement(root.find('.//ns:teiHeader', ns), 'revisionDesc')

# The change records today's date and the spaCy version that produced the entities.
new_change = etree.SubElement(revision_desc, 'change',
                              when = datetime.datetime.now().strftime("%Y-%m-%d"),
                              who = str(spacy.__version__))
new_change.text = f"Entities added by NER ({spacy.__version__}) application."


# Create an application element.
app_info = root.find('.//ns:teiHeader//ns:appInfo', ns)

ner_app_info = etree.SubElement(app_info, 'application',
                                ident = 'ner-helper')

# Without saving a variable.
etree.SubElement(ner_app_info, 'label').text = 'Ner Helper App'

# ...possibly separated from code block above...
"""
For row in revisions_df (csv_df)

etree.SubElement(ner_app_info, 'ptr', target = f"ner-{xml-id-in-row}")
"""

print (ner_app_info)

<Element application at 0x7fe9c8335cc0>
CPU times: user 1.23 ms, sys: 807 µs, total: 2.04 ms
Wall time: 1.29 ms


In [3]:
%%time

# Serialize the modified tree to text so the inserted elements can be inspected.
tree_as_string = etree.tostring(root, pretty_print = True).decode('utf-8')

# Collapse every whitespace run to one space. The pattern is a raw string:
# '\s' in a normal string literal is an invalid escape sequence.
tree_as_string = re.sub(r'\s+', ' ', tree_as_string)

tree_as_string

CPU times: user 352 µs, sys: 53 µs, total: 405 µs
Wall time: 315 µs


'<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:xs="http://www.w3.org/2001/XMLSchema" xml:id="ESR-EDA-1890-03-08"> <teiHeader> <fileDesc> <titleStmt> <title/> <author/> </titleStmt> <editionStmt> <edition> <date/> </edition> </editionStmt> <publicationStmt> <p>unknown</p> </publicationStmt> <sourceDesc> <p>From a Word document</p> </sourceDesc> </fileDesc> <encodingDesc> <appInfo> <application xml:id="docx-to-tei-via-mhs-xslt" ident="TEI_fromDOCX_via_XSLT" version="0.2"> <label>DOCX to TEI</label> </application> <application ident="MHS-WETVAC" version="0.2b"><label>MHS-WETVAC</label></application> <!-- not actually from WETVAC, transcribed directly from Word to TEI, check with Bill how to mark --> <application ident="ner-helper"><label>Ner Helper App</label></application></appInfo> </encodingDesc> <revisionDesc> <change> <date/> <name/> </change> <change when="2021-03-05" who="2.3.2">Entities added by NER (2.3.2) application.</change></revisionDesc> </teiHeader> <text> <body> <div type

In [4]:
# Look up SubElement's signature: besides `attrib`, element attributes can be
# passed as keyword arguments (**_extra), which is how the cells above use it.
help(etree.SubElement)

Help on cython_function_or_method in module lxml.etree:

SubElement(_parent, _tag, attrib=None, nsmap=None, **_extra)
    SubElement(_parent, _tag, attrib=None, nsmap=None, **_extra)
    
    Subelement factory.  This function creates an element instance, and
    appends it to an existing element.



## Making revisions with and without IDs + Highlighting Text

In [5]:
# Lookup table: entity label -> name of the element used to encode that entity.
label_dict = {
    # People.
    'PERSON': 'persName',

    # Places.
    'LOC': 'placeName',  # Non-GPE locations, mountain ranges, bodies of water.
    'GPE': 'placeName',  # Countries, cities, states.
    'FAC': 'placeName',  # Buildings, airports, highways, bridges, etc.

    # Organizations.
    'ORG': 'orgName',  # Companies, agencies, institutions, etc.

    # Everything else that is a named thing.
    'NORP': 'name',  # Nationalities or religious or political groups.
    'EVENT': 'name',  # Named hurricanes, battles, wars, sports events, etc.
    'WORK_OF_ART': 'name',  # Titles of books, songs, etc.
    'LAW': 'name',  # Named documents made into laws.

    # Dates.
    'DATE': 'date',  # Absolute or relative dates or periods.
}

def up_convert_encoding(column):
    """
    Up-convert an encoded string so that every space-separated token is wrapped in <w> tags.

    Steps:
        1. Collapse every run of whitespace to a single space.
        2. Replace whitespace inside tags (``<...>``) with underscores so a tag with
           attributes is treated as part of a single token in step 3.
        3. Split on spaces, wrap each token in ``<w>...</w>`` and re-join with spaces.

    Parameters:
        column (str): encoded text to convert.

    Returns:
        str: the tokenized text, e.g. ``'a b'`` -> ``'<w>a</w> <w>b</w>'``.
    """
#     Regularize spacing & store data as new variable ('converted_encoding').
#     No flags are passed: the 4th positional argument of re.sub is `count`, so a
#     flag placed there would silently cap the number of replacements.
    converted_encoding = re.sub(r'\s+', ' ', column)

#     Replace whitespace with underscores inside every tag, in one pass over the
#     normalized text. A callable replacement is used so tag contents are never
#     interpreted as a regular expression or as a replacement template.
    converted_encoding = re.sub(r'<.*?>',
                                lambda tag: re.sub(r'\s', '_', tag.group(0)),
                                converted_encoding)

#     Up-Conversion
#     Tokenize encoding and text, appending <w> tags, and re-join.
    converted_encoding = ' '.join('<w>' + token + '</w>'
                                  for token in converted_encoding.split(' '))

    return converted_encoding


In [6]:

"""
XML: Remove word tags and clean up
"""
def xml_cleanup(encoding):
    """
    Remove word tags and clean up an encoding that was tokenized with <w> tags.

    Parameters:
        encoding (str): text containing ``<w>``/``</w>`` wrappers and tags whose
            internal spaces were replaced with underscores.

    Returns:
        str: the text with whitespace runs collapsed, word tags removed, underscores
        inside tags turned back into spaces, and curly double quotes replaced with
        straight ones.
    """
#     Clean up any additional whitespace and remove word tags.
#     No flags are passed: the 4th positional argument of re.sub is `count`, so a
#     flag placed there would silently cap the number of replacements.
    encoding = re.sub(r'\s+', ' ', encoding)
    encoding = re.sub(r'</?w>', '', encoding)

#     Remove any remaining underscores in tags. Only text between '<' and '>' is
#     touched, so underscores that belong to the document text are preserved.
    encoding = re.sub(r'<[^>]*>', lambda tag: tag.group(0).replace('_', ' '), encoding)

#     Change quotation marks to correct unicode.
    encoding = encoding.replace('“', '"').replace('”', '"')

    return encoding



"""
XML Parsing Function: Suggest New Encoding with Hand Edits

Similar to make_ner_suggestions(), this function folds in revision using regular expressions.
The outcome is the previous encoding with additional encoded information determined by user input.

Expected Columns:
    previous_encoding
    entities
    uniq_id
"""
def revise_with_uniq_id(label_dict, uniq_id,
                        label, entity, previous_encoding, new_encoding):
    """
    Fold a hand-edited entity WITH a unique id into the previous encoding.

    The entity is wrapped in the element that `label` maps to, carrying
    ``ref="<uniq_id>"`` and ``type="nerHelper-added"``.

    Parameters:
        label_dict: mapping from entity label to element name.
        uniq_id: identifier written to the ``ref`` attribute; '' means there is
            nothing for this function to do.
        label: key into `label_dict` (only looked up when a revision is made).
        entity: entity text to find; its words are separated by single spaces.
        previous_encoding: encoded XML string to revise.
        new_encoding: accepted for interface compatibility; not used.

    Returns:
        The revised, cleaned-up encoding, or None when `uniq_id` is ''.

    Raises:
        KeyError: if `label` is not a key of `label_dict`.
        ValueError: if `entity` cannot be found in `previous_encoding`.
    """
#     Only act when there is a unique id to add.
    if uniq_id == '':
        return None

#     NOTE: xml_cleanup() rewrites underscores inside tags as spaces, so a uniq_id
#     containing '_' will not survive unchanged.
    return _revise_entity(label_dict, label, entity, previous_encoding,
                          f' ref="{uniq_id}" type="nerHelper-added"')


def revise_without_uniq_id(label_dict, uniq_id,
                           label, entity, previous_encoding, new_encoding):
    """
    Fold a hand-edited entity WITHOUT a unique id into the previous encoding.

    The entity is wrapped in the element that `label` maps to, carrying only
    ``type="nerHelper-added"``.

    Parameters:
        label_dict: mapping from entity label to element name.
        uniq_id: must be '' for this function to act.
        label: key into `label_dict` (only looked up when a revision is made).
        entity: entity text to find; its words are separated by single spaces.
        previous_encoding: encoded XML string to revise.
        new_encoding: accepted for interface compatibility; not used.

    Returns:
        The revised, cleaned-up encoding, or None when `uniq_id` is not ''.

    Raises:
        KeyError: if `label` is not a key of `label_dict`.
        ValueError: if `entity` cannot be found in `previous_encoding`.
    """
#     Only act when there is no unique id.
    if uniq_id != '':
        return None

    return _revise_entity(label_dict, label, entity, previous_encoding,
                          ' type="nerHelper-added"')


def _revise_entity(label_dict, label, entity, previous_encoding, attributes):
    """Shared implementation: wrap `entity` in ``<tag{attributes}>...</tag>`` and clean up."""
    tag = label_dict[label]

#     Up convert PREVIOUS ENCODING so text and tags become <w>-wrapped tokens.
    converted_encoding = up_convert_encoding(previous_encoding)

#     Match the entity word by word across the <w> wrappers. Each word is escaped
#     so characters such as '.' or '(' are matched literally. The second group
#     captures whatever follows the entity inside its last token (e.g. trailing
#     punctuation) together with the closing </w>.
    entity_pattern = '</w> <w>'.join(re.escape(word) for word in entity.split(' '))
    entity_match = re.search(f'({entity_pattern})(.*?</w>)', converted_encoding)
    if entity_match is None:
        raise ValueError(f'Entity {entity!r} was not found in the previous encoding.')

    replacement = (f'<{tag}{attributes}>{entity_match.group(1)}</{tag}>'
                   f'{entity_match.group(2)}')

#     Literal replacement of every occurrence of the matched text: neither the
#     matched text nor the replacement is re-interpreted as a regular expression.
    revised_encoding = converted_encoding.replace(entity_match.group(0), replacement)

    return xml_cleanup(revised_encoding)
    
    

In [8]:
# Sample paragraph as currently encoded; the entity defined below is not yet tagged in it.
previous_encoding = """<p xmlns="http://www.tei-c.org/ns/1.0" xmlns:mhs="http://www.masshist.org/ns/1.0" xmlns:tei="http://www.tei-c.org/ns/1.0"><date>1 V:15.</date> Tuesday. W. A. Schoolfield at the Office. About Slaves. Weekly tea-party resumed. I send you a list of the stockholders of the Bank of the U. States &#8211; foreign and domestic and am very sorry that I <unclear cert="low">suffered</unclear> the promise I made to you in Baltimore to escape from my mind until it was brought back by your letter &#8211; my apology is the pressure of many engagements &#8211; and I hope the delay has not produced inconvenience. I am often accused by <persRef ref="richards-robert">my husband</persRef> of being very deep in my plans and of not always telling at once what they are. <persRef ref="abel-mary">Mrs Abel</persRef> is winning the confidence of people so that we shall have no difficulty in introducing our wares.</p>
"""

# The same paragraph with the entity wrapped in <persName>. It is passed to the
# revise_* functions, which accept it but do not read it.
new_encoding = """<p xmlns="http://www.tei-c.org/ns/1.0" xmlns:mhs="http://www.masshist.org/ns/1.0" xmlns:tei="http://www.tei-c.org/ns/1.0"><date>1 V:15.</date> Tuesday. <persName>W. A. Schoolfield</persName> at the Office. About Slaves. Weekly tea-party resumed. I send you a list of the stockholders of the Bank of the U. States &#8211; foreign and domestic and am very sorry that I <unclear cert="low">suffered</unclear> the promise I made to you in Baltimore to escape from my mind until it was brought back by your letter &#8211; my apology is the pressure of many engagements &#8211; and I hope the delay has not produced inconvenience. I am often accused by <persRef ref="richards-robert">my husband</persRef> of being very deep in my plans and of not always telling at once what they are. <persRef ref="abel-mary">Mrs Abel</persRef> is winning the confidence of people so that we shall have no difficulty in introducing our wares.</p>
"""

# Entity text to locate in previous_encoding.
entity = 'W. A. Schoolfield'

# Key into label_dict; 'PERSON' maps to the persName element.
label = 'PERSON'

# Identifier written to the ref attribute. Switch to the empty string (next line)
# to exercise revise_without_uniq_id instead.
# NOTE(review): the id is spelled "schoolfied01" (no "l" before the "d") — confirm this is intended.
xml_id = "schoolfied01"
# xml_id = ''

revised_encoding = revise_with_uniq_id(label_dict, xml_id, label, entity, previous_encoding, new_encoding)

# revised_encoding = revise_without_uniq_id(label_dict, xml_id, label, entity, previous_encoding, new_encoding)

# Display the result as the cell output.
revised_encoding

'<p xmlns="http://www.tei-c.org/ns/1.0" xmlns:mhs="http://www.masshist.org/ns/1.0" xmlns:tei="http://www.tei-c.org/ns/1.0"><date>1 V:15.</date> Tuesday. <persName ref="schoolfied01" type="nerHelper-added">W. A. Schoolfield</persName> at the Office. About Slaves. Weekly tea-party resumed. I send you a list of the stockholders of the Bank of the U. States &#8211; foreign and domestic and am very sorry that I <unclear cert="low">suffered</unclear> the promise I made to you in Baltimore to escape from my mind until it was brought back by your letter &#8211; my apology is the pressure of many engagements &#8211; and I hope the delay has not produced inconvenience. I am often accused by <persRef ref="richards-robert">my husband</persRef> of being very deep in my plans and of not always telling at once what they are. <persRef ref="abel-mary">Mrs Abel</persRef> is winning the confidence of people so that we shall have no difficulty in introducing our wares.</p>\n'

In [19]:

# html.P(['This is a text', html.Mark('with a keyword highlighted'), 'Does it work?']),

"""
Reading Pane: Highlight Found Entity
"""
def highlighter(previous_encoding, entity):
    """
    Split the plain text of an encoded fragment around the first occurrence of an entity.

    The markup is parsed and reduced to its text content; that text is then cut
    into the part before the entity, the entity itself, and the part after it, so
    the middle item can be rendered as highlighted.

    Parameters:
        previous_encoding: XML fragment (a single root element) to read.
        entity (str): text to highlight; matched literally.

    Returns:
        list: ``[before, entity, after]``. When `entity` is empty or does not occur
        in the text, the result is ``[full_text, '', '']`` (nothing to highlight).
    """
    parsed = etree.fromstring(previous_encoding)
    # tostring() returns bytes when an encoding is given, hence the decode.
    plain_text = etree.tostring(parsed, method = 'text', encoding = 'utf-8').decode('utf-8')

    # str.partition() rejects an empty separator, so handle that case up front.
    if not entity:
        return [plain_text, '', '']

    # A literal split keeps text on every line (a '.'-based regex cannot cross
    # newlines) and treats characters such as '.' in the entity as plain text.
    before, found, after = plain_text.partition(entity)

    return [before, found, after]

# Try the highlighter on the sample paragraph and entity defined in an earlier cell.
highlighted_text = highlighter(previous_encoding, entity)

# Display the three parts (text before, entity, text after) as the cell output.
highlighted_text

['1 V:15. Tuesday. ',
 'W. A. Schoolfield',
 ' at the Office. About Slaves. Weekly tea-party resumed. I send you a list of the stockholders of the Bank of the U. States – foreign and domestic and am very sorry that I suffered the promise I made to you in Baltimore to escape from my mind until it was brought back by your letter – my apology is the pressure of many engagements – and I hope the delay has not produced inconvenience. I am often accused by my husband of being very deep in my plans and of not always telling at once what they are. Mrs Abel is winning the confidence of people so that we shall have no difficulty in introducing our wares.']

In [10]:
# Check what etree.tostring() returns when an encoding is supplied.
# NOTE: `highlighted_text` is rebound twice here — first to the parsed element,
# then to the serialized text content.
highlighted_text = etree.fromstring(previous_encoding)
highlighted_text = etree.tostring(highlighted_text, method = 'text', encoding = 'utf-8')

# The output below shows `bytes`, which is why highlighter() decodes the result.
type(highlighted_text)

bytes

## Capturing XML Schema Info

In [38]:
# Read the file as text inside a context manager so the handle is always closed.
# The encoding is stated explicitly (the file declares UTF-8) instead of relying
# on the platform default.
with open(abs_dir + "TestEncoding/EditingData/ESR-EDA-1890-03-08 copy.xml", encoding = 'utf-8') as xml_handle:
    xml_file = xml_handle.read()

# Collapse whitespace so the whole document is a single line for the search below.
xml_file = re.sub(r'\s+', ' ', xml_file)

# Group 1: everything before the first <TEI (XML declaration, schema processing
# instructions). Group 2: the TEI element onwards. The first group is non-greedy
# so it stops at the first "<TEI" rather than the last one in the document.
schema_match = re.search(r'(.*?)(<TEI.*)', xml_file)
schema_match.group(1)

'<?xml version="1.0" encoding="UTF-8"?> <?xml-model href="http://www.primarysourcecoop.org/publications/pub/schema/primarysourcecoop.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>'

In [33]:
"""
XML: Write Schema Information before Root
Input: 
    - Revised XML document (return variable from revise_xml())
    - XML File with Original Encoding
"""
def write_schema_information(xml_contents, final_revision):
    """
    Write the schema information of the original file before the revised root.

    Everything that precedes the first ``<TEI`` in `xml_contents` (XML declaration,
    schema processing instructions) is extracted and placed in front of
    `final_revision`.

    Parameters:
        xml_contents (str): contents of the XML file with the original encoding.
        final_revision (str): revised XML document, starting at its root element.
            # assumes a str (not bytes) — TODO confirm against the caller

    Returns:
        str: the schema information followed directly by `final_revision`.

    Raises:
        ValueError: if `xml_contents` contains no ``<TEI`` start tag.
    """
    # Collapse whitespace so the prolog sits on one line for the search below.
    xml_contents = re.sub(r'\s+', ' ', xml_contents)

    # Search the argument, not a notebook-level variable, so the function does not
    # depend on which cells happened to run before it. The group is non-greedy so
    # it stops at the first "<TEI".
    schema_match = re.search(r'(.*?)<TEI', xml_contents)
    if schema_match is None:
        raise ValueError('No <TEI start tag found in xml_contents.')

    # Drop surrounding whitespace so the prolog joins the root without a stray space.
    schema_information = schema_match.group(1).strip()

    return schema_information + final_revision
    

'<?xml version="1.0" encoding="UTF-8"?><?xml-model href="http://www.primarysourcecoop.org/publications/pub/schema/primarysourcecoop.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?><TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:xs="http://www.w3.org/2001/XMLSchema" xml:id="ESR-EDA-1890-03-08"><teiHeader><fileDesc><titleStmt><title/><author/></titleStmt><editionStmt><edition><date/></edition></editionStmt><publicationStmt><p>unknown</p></publicationStmt><sourceDesc><p>From a Word document</p></sourceDesc></fileDesc><encodingDesc><appInfo><application xml:id="docx-to-tei-via-mhs-xslt" ident="TEI_fromDOCX_via_XSLT" version="0.2"><label>DOCX to TEI</label></application><application ident="MHS-WETVAC" version="0.2b"><label>MHS-WETVAC</label></application><!-- not actually from WETVAC, transcribed directly from Word to TEI, check with Bill how to mark --></appInfo></encodingDesc><revisionDesc><change><date/><name/></change></revisionDesc></teiHeader><text><body

In [39]:
# Scratch check: "+" joins two strings exactly as written, adding no separator.
'a' + ' b'

'a b'