# Test Area

In [1]:
import warnings, re, glob, datetime, csv, sys, os, base64, io, spacy, datetime
import pandas as pd
import numpy as np
# import xml.etree.ElementTree as ET
from lxml import etree, isoschematron

import dash, dash_table
import dash_core_components as dcc
from dash.dependencies import Input, Output, State
from dash.exceptions import PreventUpdate
import dash_html_components as html
from jupyter_dash import JupyterDash

# Import spaCy language model.
# The pipeline is loaded once, at module level; get_spacy_entities() reads `nlp` as a global.
nlp = spacy.load('en_core_web_sm')

# Ignore simple warnings.
# Only DeprecationWarning is silenced; every other warning category is still shown.
warnings.simplefilter('ignore', DeprecationWarning)

# Declare directory location to shorten filepaths later.
# The location can be overridden with the DSG_MHS_DIR environment variable so the notebook
# is not tied to a single user's machine; the original path remains the default.
# os.path.join(..., '') guarantees the trailing separator that the later
# `abs_dir + "TestEncoding/..."` concatenations rely on.
abs_dir = os.path.join(
    os.environ.get('DSG_MHS_DIR', "/Users/quinn.wi/Documents/GitHub/dsg-mhs/"),
    ''
)

## Making revisions with and without IDs + Highlighting Text

In [2]:
%%time

# Map each spaCy NER label onto the TEI element name used to encode that entity type.
# NOTE(review): parse_contents() and revise_xml() carry their own copies of this mapping
# in which 'PERSON' maps to 'persRef' rather than 'persName' -- confirm which is intended.
label_dict = dict(
    PERSON='persName',
    LOC='placeName',       # Non-GPE locations, mountain ranges, bodies of water.
    GPE='placeName',       # Countries, cities, states.
    FAC='placeName',       # Buildings, airports, highways, bridges, etc.
    ORG='orgName',         # Companies, agencies, institutions, etc.
    NORP='name',           # Nationalities or religious or political groups.
    EVENT='name',          # Named hurricanes, battles, wars, sports events, etc.
    WORK_OF_ART='name',    # Titles of books, songs, etc.
    LAW='name',            # Named documents made into laws.
    DATE='date',           # Absolute or relative dates or periods.
)

"""
XML Parsing Function: Get Namespaces
"""
def get_namespace(root):
    """Build the namespace map used for prefixed lookups such as './/ns:body'.

    Args:
        root: an element whose ``tag`` is written as ``{namespace-uri}localname``.

    Returns:
        dict: ``{"ns": namespace_uri}``.

    Raises:
        ValueError: if the tag does not start with a ``{namespace-uri}`` part.
            (Previously this surfaced as an opaque AttributeError on ``None.group``.)
    """
    tag = str(root.tag)
    # `[^}]*` stops at the first closing brace, so only the leading {uri} is captured.
    namespace = re.match(r"\{([^}]*)\}", tag)
    if namespace is None:
        raise ValueError(f"Root element '{tag}' has no namespace; expected a tag like '{{uri}}name'.")
    return {"ns": namespace.group(1)}


"""
XML Parsing Function: Convert to String
"""
def get_text(elem):
    """Return the text content of ``elem`` (markup dropped, tail excluded) as one line.

    The element is serialised with ``method='text'`` and every run of whitespace is
    collapsed to a single space.
    """
    plain_text = etree.tostring(elem, encoding='unicode', method='text', with_tail=False)
    return re.sub(r'\s+', ' ', plain_text)

        
"""
XML Parsing Function: Get Encoded Content
"""    
def get_encoding(elem):
    """Return ``elem`` serialised as XML markup on a single line.

    The element is serialised with ``etree.tostring`` and decoded as UTF-8, then every
    run of whitespace (including the newlines added by pretty-printing) is collapsed
    to a single space.
    """
    serialized = etree.tostring(elem, pretty_print = True).decode('utf-8')
    # Raw string: '\s' in a plain string literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', serialized)


  

"""
NER Function
"""
# spaCy
def get_spacy_entities(text, subset_ner):
    """Run the module-level spaCy pipeline over ``text`` and keep the wanted entities.

    Args:
        text: plain text to analyse.
        subset_ner: mapping whose keys are the spaCy labels to keep.

    Returns:
        list of ``(entity_text, spaCy_label)`` tuples, in document order.
    """
    return [
        (str(span), span.label_)
        for span in nlp(text).ents
        if span.label_ in subset_ner.keys()
    ]




"""
XML & NER: Retrieve Contents
"""
def get_contents(ancestor, xpath_as_string, namespace, subset_ner):
    """Collect the named entities and the serialised markup for one element.

    ``xpath_as_string`` and ``namespace`` are accepted but not used by this function.

    Returns:
        tuple: ``(entities, encoded_content)`` where ``entities`` comes from
        get_spacy_entities() applied to the element's plain text and
        ``encoded_content`` comes from get_encoding().
    """
    plain_text = get_text(ancestor)      # Plain text fed to the NER step.
    markup = get_encoding(ancestor)      # Single-line XML serialisation.
    return (get_spacy_entities(plain_text, subset_ner), markup)



"""
XML & Regex: Up Conversion

Function replaces all spaces between beginning and end tags with underscores.
Then, function wraps each token (determined by whitespace) with word tags (<w>...</w>)
"""
def up_convert_encoding(column):
    """Wrap every whitespace-delimited token of an encoded string in <w>...</w>.

    Steps:
      1. Collapse every run of whitespace to a single space.
      2. Inside each ``<...>`` tag, replace whitespace with underscores so that the
         whole tag stays glued to its neighbouring text as a single token.
      3. Split on single spaces, wrap each token in ``<w>`` tags, and re-join.

    Fixes relative to the previous version:
      * ``re.MULTILINE`` was passed positionally as ``count``, so only the first eight
        whitespace runs were normalised.
      * Tag text was used as an unescaped regex pattern, which failed or mis-matched
        on tags containing characters such as ``?`` or ``.``; the substitution is now
        done directly on each tag match.

    Args:
        column: encoded (XML) content as a string.

    Returns:
        str: the up-converted string. Leading or trailing whitespace in the input
        yields empty ``<w></w>`` tokens, as before.
    """
#     Regularize spacing.
    converted_encoding = re.sub(r'\s+', ' ', column)

#     Replace whitespace inside every tag with underscores (tags become single tokens).
    converted_encoding = re.sub(r'<.*?>',
                                lambda tag: re.sub(r'\s', '_', tag.group(0)),
                                converted_encoding)

#     Up-Conversion
#     Tokenize on single spaces, wrap each token in <w> tags, and re-join.
    return ' '.join('<w>' + token + '</w>' for token in converted_encoding.split(' '))


"""
XML Parsing Function: Intersperse Entity with Likely TEI Information for Capacious Regex
"""
def intersperse(lst, item):
    """Return a new list in which every element of ``lst`` is followed by ``item``.

    Note that ``item`` also follows the last element, e.g.
    ``intersperse(['a', 'b'], '-') == ['a', '-', 'b', '-']``.
    """
    woven = []
    for element in lst:
        woven.append(element)
        woven.append(item)
    return woven


"""
XML Function: Build KWIC of Found Entities in Up Converted Encoding
"""
def get_kwic_encoding(entity, encoding, banned_list, kwic_range):
    """Find ``entity`` in the up-converted ``encoding`` and return keyword-in-context snippets.

    Args:
        entity: entity text to look for.
        encoding: encoded (XML) content to search; it is up-converted first.
        banned_list: strings (e.g. element names); a match whose text contains any of
            them is skipped.
        kwic_range: number of characters of context to keep on each side of a match.
            The window is then widened to the nearest whitespace on both sides so it
            never cuts a token in half.

    Returns:
        list of str: one context per accepted match (empty when nothing is found).

    Fixes relative to the previous version:
      * A match closer than ``kwic_range`` to the start produced a negative slice
        start, which wraps around and yields an empty or wrong context.
      * The context was re-located with an unescaped regex, which failed or mis-matched
        on metacharacters; the window is now widened by position instead.
      * Entity characters are escaped, so entities containing ``(``, ``?``, ``+``...
        no longer raise ``re.error`` and ``.`` only matches a literal period.
      * The unused up-conversion of ``entity`` was removed.
    """
    if not entity:
        return []

#     Up convert the text to be searched.
    converted_encoding = up_convert_encoding(encoding)

#     Filler permitted between consecutive entity characters.
#     NOTE(review): this is a character class of the characters ( < . * ? > ) rather than
#     "zero or more tags"; kept unchanged so matches stay compatible with
#     revise_with_selections(), which builds the same expression.
    filler = '[' + "|".join(['(<.*?>)']) + ']*'

#     Whitespace in the entity corresponds to a word-tag boundary in the up-converted
#     text; every other character is matched literally.
    pieces = ['</w> <w>' if re.match(r'\s', char) else re.escape(char) for char in entity]
    expanded_entity = ''.join(intersperse(pieces, filler))

#     Extend the match to the whole surrounding token(s).
    entity_regex = re.compile(r'([^\s]*' + expanded_entity + r'[^\s]*)')

    text_length = len(converted_encoding)
    contexts = []
    for m in entity_regex.finditer(converted_encoding):

        if any(item in m.group() for item in banned_list):
            continue

#         Context window, clamped to the bounds of the text.
        start = max(m.start() - kwic_range, 0)
        end = min(m.end() + kwic_range, text_length)

#         Widen to the preceding and following whitespace so tokens stay complete.
        while start > 0 and not converted_encoding[start - 1].isspace():
            start -= 1
        while end < text_length and not converted_encoding[end].isspace():
            end += 1

        contexts.append(converted_encoding[start:end])

    return contexts


"""
XML: & NER: Create Dataframe of Entities
"""
def make_dataframe(descendant, df, ns, subset_ner, filename, descendant_order):
    """Append one row describing ``descendant`` to ``df`` and return the new frame.

    The row holds the file name (directory part stripped for ``*.xml`` paths), the
    descendant's position, its serialised encoding, and the list of entities found
    in it by get_contents().

    ``DataFrame.append`` was removed in pandas 2.0, so the row is added with
    ``pd.concat`` instead.

    Returns:
        pandas.DataFrame: a new frame; ``df`` itself is not modified.
    """
    entities, previous_encoding = get_contents(descendant, './/ns:.', ns, subset_ner)

    new_row = pd.DataFrame([{
        'file': re.sub('.*/(.*.xml)', '\\1', filename),
        'descendant_order': descendant_order,
        'previous_encoding': previous_encoding,
        'entities': entities,
    }])

    if len(df) == 0:
#         Nothing to concatenate yet: keep df's column order, then any new columns.
        columns = list(df.columns) + [c for c in new_row.columns if c not in df.columns]
        return new_row.reindex(columns = columns)

    return pd.concat([df, new_row], ignore_index = True)



"""
Parse Contents: XML Structure (output-data-upload)
"""
def parse_contents(contents, filename, date, ner_values, banned_list=None):
    """Decode an uploaded file and build the dataframe of entities found in it.

    Args:
        contents: upload payload of the form ``"<content type>,<base64 data>"``.
        filename: name of the uploaded file; a name containing 'csv' is read as CSV,
            one containing 'xml' is parsed as XML and run through NER.
        date: passed through unchanged in the return value.
        ner_values: comma-separated spaCy labels to keep (keys of the label dictionary).
        banned_list: optional list of strings; KWIC matches containing any of them are
            skipped. When omitted, the module-level ``banned_list`` is used if it
            exists, otherwise ``['persRef', 'date']``. (Previously the function
            silently depended on that global being defined by a later cell.)

    Returns:
        ``(filename, date, df)`` on success, or an ``html.Div`` holding an error
        message when the file cannot be processed or has an unsupported type.
        (An unsupported type previously crashed with an unbound ``df``.)
    """
    if banned_list is None:
        banned_list = globals().get('banned_list', ['persRef', 'date'])

    ner_values = ner_values.split(',')
    content_type, content_string = contents.split(',', 1)
    decoded = base64.b64decode(content_string).decode('utf-8')

    # Label dictionary.
    label_dict = {'PERSON':'persRef',
                  'LOC':'placeName', # Non-GPE locations, mountain ranges, bodies of water.
                  'GPE':'placeName', # Countries, cities, states.
                  'FAC':'placeName', # Buildings, airports, highways, bridges, etc.
                  'ORG':'orgName', # Companies, agencies, institutions, etc.
                  'NORP':'name', # Nationalities or religious or political groups.
                  'EVENT':'name', # Named hurricanes, battles, wars, sports events, etc.
                  'WORK_OF_ART':'name', # Titles of books, songs, etc.
                  'LAW':'name', # Named documents made into laws.
                  'DATE':'date' # Absolute or relative dates or periods.
                 }

    #### Subset label_dict with input values from Checklist *****
    subset_ner = {k: label_dict[k] for k in ner_values}

#     Run XML Parser + NER here.
    try:
#         Assume that the user uploaded a CSV file
        if 'csv' in filename:
            df = pd.read_csv(
                io.StringIO(decoded)
            )

#         Assume that the user uploaded an XML file
        elif 'xml' in filename:
            xml_file = decoded.encode('utf-8')

            df = pd.DataFrame(columns = ['file', 'previous_encoding', 'entities'])

            root = etree.fromstring(xml_file)
            ns = get_namespace(root)

#             Search through elements for entities.
            desc_order = 0
            for child in root.findall('.//ns:body', ns): # Change this line to specify where to look for entities.

                for descendant in child:
                    desc_order = desc_order + 1
                    df = make_dataframe(descendant, df, ns, subset_ner, filename, desc_order)

#             Join data
            df = df \
                .explode('entities') \
                .dropna()

            df[['entity', 'label']] = pd.DataFrame(df['entities'].tolist(), index = df.index)

            # Add additional columns for user input.
            df['uniq_id'] = ''

#             Replace 'previous_encoding' with a KWIC version containing entity.
            df['previous_encoding'] = df.apply(lambda row: get_kwic_encoding(row['entity'],
                                                                             row['previous_encoding'],
                                                                             banned_list,
                                                                             30),
                                               axis = 1)

#         Anything else cannot be processed; report it through the same error path.
        else:
            raise ValueError(f'unsupported file type for "{filename}" (expected a csv or xml file)')

    except Exception as e:
        return html.Div([
            f'There was an error processing this file: {e}.'
    ])

#     Explode lists within more than one item.
    df = df.explode('previous_encoding').dropna().drop_duplicates()

#     Return HTML with outputs.
    return filename, date, df




"""
XML: Remove word tags and clean up
"""
def xml_cleanup(encoding):
    """Undo the up-conversion: drop word tags and restore normal spacing.

    Steps: collapse whitespace runs to one space, remove ``<w>`` / ``</w>`` tags,
    turn underscores back into spaces, and replace curly double quotes with ``"``.

    Note: every underscore is replaced, including any that were part of the
    original text or attribute values.

    Fix relative to the previous version: ``re.MULTILINE`` was passed positionally
    as ``count``, so only the first eight whitespace runs were collapsed.
    """
#     Clean up any additional whitespace and remove word tags.
    encoding = re.sub(r'\s+', ' ', encoding)
    encoding = re.sub(r'</?w>', '', encoding)

    encoding = encoding.replace('_', ' ') # Remove any remaining underscores in tags.
    encoding = encoding.replace('\u201c', '"') # Left curly double quote.
    encoding = encoding.replace('\u201d', '"') # Right curly double quote.

    return encoding


"""
Reading Pane: Highlight Found Entity
"""
def highlighter(previous_encoding, entity):
    """Build a paragraph in which the last occurrence of ``entity`` is highlighted.

    All ``<...>`` tags are removed from ``previous_encoding`` first. The entity is
    located literally (it is no longer interpreted as a regular expression), and the
    last occurrence is highlighted, as the previous greedy pattern did.

    Returns:
        html.P: ``[text before, html.Mark(entity), text after]``; when the entity is
        empty or not present, the paragraph holds the tag-free text unhighlighted
        instead of raising.
    """
#     Remove all tags.
    plain_text = re.sub(r'(<.*?>)', '', previous_encoding)

    entity = str(entity)
    if not entity or entity not in plain_text:
        return html.P([plain_text])

    before, found, after = plain_text.rpartition(entity)
    return html.P([before, html.Mark(found), after])

"""
XML Parsing Function: Suggest New Encoding with Hand Edits

Similar to make_ner_suggestions(), this function folds in revision using regular expressions.
The outcome is the previous encoding with additional encoded information determined by user input.

Expected Columns:
    previous_encoding
    entities
    uniq_id
"""
def revise_with_selections(label_dict, label, uniq_id, entity, previous_encoding):
    """Wrap the entity inside ``previous_encoding`` in its TEI element and clean up.

    Args:
        label_dict: mapping from spaCy label to TEI element name.
        label: spaCy label of the entity (must be a key of ``label_dict``).
        uniq_id: accepted for interface compatibility.
            NOTE(review): the original code produced identical output whether or not
            an id was supplied; the id is not written into the markup -- confirm
            whether an attribute was meant to carry it.
        entity: entity text to locate.
        previous_encoding: up-converted KWIC snippet containing the entity.

    Returns:
        str: the snippet with the matched token(s) wrapped in
        ``<element type="nerHelper-added">...</element>`` and passed through
        xml_cleanup(). If the entity is empty or cannot be located, the cleaned-up
        snippet is returned without an added element (previously this raised).

    Fixes relative to the previous version:
      * Entity characters are escaped, so regex metacharacters no longer raise or
        mis-match.
      * The matched text is substituted literally rather than being re-used as a
        regex pattern and replacement template.
      * The duplicated ``if``/``elif`` branches and the unreachable ``else`` are merged.
    """
    label = label_dict[label]

    if not entity:
        return xml_cleanup(previous_encoding)

#     Filler permitted between consecutive entity characters (same expression as in
#     get_kwic_encoding(), so both functions agree on what counts as a match).
    filler = '[' + "|".join(['(<.*?>)']) + ']*'

#     Whitespace in the entity corresponds to a word-tag boundary in the up-converted
#     text; every other character is matched literally.
    pieces = ['</w> <w>' if re.match(r'\s', char) else re.escape(char) for char in entity]

#     [^\s]* on both sides: extend the match to the whole surrounding token(s).
    expanded_entity = r'([^\s]*' + ''.join(intersperse(pieces, filler)) + r'[^\s]*)'

    found = re.search(expanded_entity, previous_encoding)
    if found is None:
        return xml_cleanup(previous_encoding)

    matched_entity = found.group()

#     Literal replacement of every occurrence of the matched token(s).
    revised_encoding = previous_encoding.replace(
        matched_entity,
        f'<{label} type="nerHelper-added">{matched_entity}</{label}>')

    return xml_cleanup(revised_encoding)


"""
XML & NER: Update/Inherit Accepted Changes
Expects a dataframe (from a .csv) with these columns:
    file
    abridged_xpath
    descendant_order
    previous_encoding
    entities
    new_encoding
    uniq_id
"""
def commit_revisions(label_dict, dataframe):
    """Compute the revised encoding for every row and tidy the original encoding.

    Missing values are replaced with '' and the index is reset (the old index is kept
    as a column). For each row, revise_with_selections() is called with the row's
    ``label``, ``uniq_id``, ``entity`` and ``previous_encoding`` and its result is
    stored in ``new_encoding``. Finally ``previous_encoding`` is passed through
    xml_cleanup() to remove the word tags.

    Returns:
        pandas.DataFrame: the processed copy; the input frame is not modified.
    """
    frame = dataframe.fillna('').reset_index()

    for position, record in frame.iterrows():
        frame.loc[position, 'new_encoding'] = revise_with_selections(
            label_dict,
            record['label'],
            record['uniq_id'],
            record['entity'],
            record['previous_encoding'],
        )

#     Strip the word tags from the original snippet as well.
    frame['previous_encoding'] = frame.apply(
        lambda record: xml_cleanup(record['previous_encoding']),
        axis = 1,
    )

    return frame



"""
XML & NER: Write New XML File with Accepted Revisions
Expects:
    XML File with Original Encoding
    CSV File with Accepted Changes
    Label Dictionary
"""
def revise_xml(xml_contents, csv_df):
    """Write the accepted revisions into the uploaded XML document.

    Args:
        xml_contents: upload payload of the form ``"<content type>,<base64 data>"``
            holding the original XML.
        csv_df: dataframe of accepted changes; it is passed to commit_revisions(),
            whose result must provide ``previous_encoding`` and ``new_encoding``.

    Returns:
        str: the revised document, pretty-printed.

    Raises:
        lxml.etree.XMLSyntaxError: if the revised text is no longer well-formed.

    Fix relative to the previous version: each snippet is now located and replaced
    literally. It used to be interpolated into a regular expression (and the new
    encoding into a replacement template), so snippets containing characters such
    as ``?``, ``(`` or ``\\`` failed or matched the wrong text. As before, the last
    occurrence of a snippet is the one replaced, and snippets that are not found
    leave the document unchanged.
    """
#     Label dictionary.
    label_dict = {'PERSON':'persRef',
                  'LOC':'placeName', # Non-GPE locations, mountain ranges, bodies of water.
                  'GPE':'placeName', # Countries, cities, states.
                  'FAC':'placeName', # Buildings, airports, highways, bridges, etc.
                  'ORG':'orgName', # Companies, agencies, institutions, etc.
                  'NORP':'name', # Nationalities or religious or political groups.
                  'EVENT':'name', # Named hurricanes, battles, wars, sports events, etc.
                  'WORK_OF_ART':'name', # Titles of books, songs, etc.
                  'LAW':'name', # Named documents made into laws.
                  'DATE':'date' # Absolute or relative dates or periods.
                 }

    new_data = commit_revisions(label_dict, csv_df)

    xml_content_type, xml_content_string = xml_contents.split(',', 1)
    xml_decoded = base64.b64decode(xml_content_string).decode('utf-8')
    xml_file = xml_decoded.encode('utf-8')

    root = etree.fromstring(xml_file)
    ns = get_namespace(root)

#     Convert XML structure to string for regex processing.
    tree_as_string = etree.tostring(root, pretty_print = True).decode('utf-8')
    tree_as_string = re.sub(r'\s+', ' ', tree_as_string) # remove additional whitespace

#     Write accepted code into XML tree.
    for index, row in new_data.iterrows():
        old_snippet, new_snippet = row.previous_encoding, row.new_encoding

#         Nothing to locate, or nothing usable to insert.
        if not old_snippet or not isinstance(new_snippet, str):
            continue

#         rpartition targets the last occurrence; `found` is '' when absent.
        before, found, after = tree_as_string.rpartition(old_snippet)
        if found:
            tree_as_string = before + new_snippet + after

#     Check well-formedness (will fail if not well-formed)
    doc = etree.fromstring(tree_as_string)
    et = etree.ElementTree(doc)

#     Convert to string.
    et = etree.tostring(et, encoding='unicode', method='xml', pretty_print = True)
    return et


"""
XML: Write Schema Information before Root
Input: 
    - Revised XML document (return variable from revise_xml())
    - XML File with Original Encoding
"""
def write_schema_information(xml_contents, final_revisions):
    """Prefix the revised document with whatever preceded ``<TEI`` in the original.

    Args:
        xml_contents: upload payload of the form ``"<content type>,<base64 data>"``
            holding the original XML.
        final_revisions: the revised XML document as a string.

    Returns:
        str: the text that came before the first ``<TEI`` in the original (with
        whitespace runs collapsed to single spaces) followed by ``final_revisions``.
        If the original contains no ``<TEI``, ``final_revisions`` is returned
        unchanged instead of raising.

    Fix relative to the previous version: the old pattern ``(<?.*)(<TEI.*)`` treated
    ``<`` as optional and matched greedily, so it split at the *last* ``<TEI`` in the
    file and crashed when there was none.
    """
    xml_content_type, xml_content_string = xml_contents.split(',', 1)
    xml_file = base64.b64decode(xml_content_string).decode('utf-8')
    xml_file = re.sub(r'\s+', ' ', xml_file)

    root_start = xml_file.find('<TEI')
    if root_start == -1:
        return final_revisions

    return xml_file[:root_start] + final_revisions

CPU times: user 7 µs, sys: 0 ns, total: 7 µs
Wall time: 9.06 µs


In [3]:
%%time

previous_encoding = """<div xmlns="http://www.tei-c.org/ns/1.0" xmlns:mhs="http://www.masshist.org/ns/1.0" xmlns:tei="http://www.tei-c.org/ns/1.0" type="entry" xml:id="jqadiaries-v23-1821-05-01"> <head>1 May 1821</head> <bibl> <author>JQA</author> <date type="creation" when="1821-05-01"/> <editor role="transcription">Neal Millikan</editor> </bibl> <div type="docbody"> <opener> <dateline>Washington May 22 1832</dateline> <salute>My Dear Sir</salute> </opener> <dateline><hi rend="italic">May 1821.</hi></dateline> <p><date>1 V:15.</date> Tuesday. W. A. Schoolfield at the Office. About Slaves. Weekly tea-party resumed. I send you a list of the stockholders of the Bank of the U. States &#8211; foreign and domestic and am very sorry that I <unclear cert="low">suffered</unclear> the promise I made to you in Baltimore to escape from my mind until it was brought back by your letter &#8211; my apology is the pressure of many engagements &#8211; and I hope the delay has not produced inconvenience. I am often accused by <persRef ref="richards-robert">my husband</persRef> of being very deep in my plans and of not always telling at once what they are. <persRef ref="abel-mary">Mrs Abel</persRef> is winning the confidence of people so that we shall have no difficulty in introducing our wares.</p> <p>Another paragraph here that discusses the daily errands of W. A. Schoolfield. Schoolfield's work was in Baltimore.</p> <closer> <salute>I am dear sir / very truly your / friend </salute> <signed>R. B. Taney</signed> </closer> </div> </div>
"""

# Entity and label under test (the alternative entity is kept for quick switching).
# entity = 'R. B. Taney'
entity, label = 'W. A. Schoolfield', 'PERSON'

xml_id = "schoolfied01"

#### Subset label_dict with input values from Checklist *****
ner_values = ['PERSON']
subset_ner = dict((key, label_dict[key]) for key in ner_values)

# Banned List (list of elements that already encode entities)
banned_list = ['persRef', 'date']

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 6.91 µs


In [4]:

# Build the entity dataframe for the test document, following the XML branch of
# parse_contents() but reading the file from disk.
filename = 'fake.xml'
df = pd.DataFrame(columns = ['file', 'previous_encoding', 'entities'])

# Read the raw bytes inside a context manager so the file handle is always closed
# (the previous open(...).read() left it open).
# Alternative test file: abs_dir + "TestEncoding/EditingData/ESR-EDA-1890-03-08 copy.xml"
with open(abs_dir + "TestEncoding/EditingData/test_xml-before.xml", 'rb') as xml_handle:
    xml_file = xml_handle.read()

root = etree.fromstring(xml_file)
ns = get_namespace(root)


#             Search through elements for entities.
desc_order = 0
for child in root.findall('.//ns:body', ns): # Change this line to specify where to look for entities.

    for descendant in child:
        desc_order = desc_order + 1
        df = make_dataframe(descendant, df, ns, subset_ner, filename, desc_order)

#             Join data
df = df \
    .explode('entities') \
    .dropna()

df[['entity', 'label']] = pd.DataFrame(df['entities'].tolist(), index = df.index)

# Add additional columns for user input.
df['uniq_id'] = ''

# Replace 'previous_encoding' with a KWIC version containing the entity.
df['previous_encoding'] = df.apply(lambda row: get_kwic_encoding(row['entity'],
                                                                 row['previous_encoding'], banned_list,
                                                                 30),
                                   axis = 1)

#     Explode lists within more than one item.
df = df.explode('previous_encoding').dropna().drop_duplicates()


# print (up_convert_encoding(df['previous_encoding'].head(1)[0]))
df

Unnamed: 0,file,previous_encoding,entities,descendant_order,entity,label,uniq_id
0,fake.xml,<w>V:15.</date></w> <w>Tuesday.</w> <w>W.</w> ...,"(W. A. Schoolfield, PERSON)",1.0,W. A. Schoolfield,PERSON,
0,fake.xml,<w>daily</w> <w>errands</w> <w>of</w> <w>W.</w...,"(W. A. Schoolfield, PERSON)",1.0,W. A. Schoolfield,PERSON,
0,fake.xml,<w>friend</w> <w></salute></w> <w><signed>R.</...,"(R. B. Taney, PERSON)",1.0,R. B. Taney,PERSON,


In [5]:
# Apply the accepted revisions to the test document, following revise_xml() but
# reading the file from disk.
# Read the raw bytes inside a context manager so the file handle is always closed.
# Alternative test file: abs_dir + "TestEncoding/EditingData/ESR-EDA-1890-03-08 copy.xml"
with open(abs_dir + "TestEncoding/EditingData/test_xml-before.xml", 'rb') as xml_handle:
    xml_file = xml_handle.read()


root = etree.fromstring(xml_file)
ns = get_namespace(root)

#     Convert XML structure to string for regex processing.
tree_as_string = etree.tostring(root, pretty_print = True).decode('utf-8')

csv_df = df
new_data = commit_revisions(label_dict, csv_df)

# new_data


tree_as_string = re.sub(r'\s+', ' ', tree_as_string) # remove additional whitespace

# Replace each snippet literally. Interpolating the snippet into a regular expression
# (as before) fails or mis-matches when it contains characters such as ?, ( or \.
# rpartition targets the last occurrence, matching the old greedy pattern.
for index, row in new_data.iterrows():
    if not row.previous_encoding or not isinstance(row.new_encoding, str):
        continue
    before, found, after = tree_as_string.rpartition(row.previous_encoding)
    if found:
        tree_as_string = before + row.new_encoding + after

#     Check well-formedness (will fail if not well-formed)
doc = etree.fromstring(tree_as_string)
et = etree.ElementTree(doc)

#     Convert to string.
et = re.sub(r'\s+', ' ',
            etree.tostring(et, encoding='unicode', method='xml', pretty_print = True))

et

'<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:mhs="http://www.masshist.org/ns/1.0" xmlns:tei="http://www.tei-c.org/ns/1.0" xml:id="v23-1821-05"> <teiHeader> <fileDesc> <titleStmt> <title>Test Encoding: Copy-Pasted Contents from Three Editions</title> </titleStmt> <publicationStmt><p/></publicationStmt> <sourceDesc><p/></sourceDesc> </fileDesc> </teiHeader> <text> <body> <div type="entry" xml:id="jqadiaries-v23-1821-05-01"> <head>1 May 1821</head> <bibl> <author>JQA</author> <date type="creation" when="1821-05-01"/> <editor role="transcription">Neal Millikan</editor> </bibl> <div type="docbody"> <opener> <dateline>Washington May 22 1832</dateline> <salute>My Dear Sir</salute> </opener> <dateline><hi rend="italic">May 1821.</hi></dateline> <p><date>1 V:15.</date> Tuesday. <persName type="nerHelper-added">W. A. Schoolfield</persName> at the Office. About Slaves. Weekly tea-party resumed. I send you a list of the stockholders of the Bank of the U. States – foreign and domestic and am ver

In [61]:
# Validate the test document against the RELAX NG schema and show the error log.
# Files are read inside context managers so the handles are always closed; the
# redundant encode/decode round trip on the XML text was removed.
with open(abs_dir + "TestEncoding/EditingData/test_xml-before.xml", encoding = 'utf-8') as xml_handle:
    xml_file = xml_handle.read()
doc = etree.fromstring(xml_file.encode('utf-8'))

with open(abs_dir + "TestEncoding/EditingData/primarysourcecoop.rng", encoding = 'utf-8') as rng_handle:
    relaxng_doc = etree.XML(rng_handle.read().encode('utf-8'))
relaxng = etree.RelaxNG(relaxng_doc)

# validate() returns True/False; the details of any failure are in error_log.
relaxng.validate(doc)

log = relaxng.error_log

log

<string>:17:0:ERROR:RELAXNGV:RELAXNG_ERR_ELEMWRONG: Did not expect element head there
<string>:18:0:ERROR:RELAXNGV:RELAXNG_ERR_EXTRACONTENT: Element div has extra content: head
<string>:16:0:ERROR:RELAXNGV:RELAXNG_ERR_ELEMWRONG: Did not expect element div there
<string>:16:0:ERROR:RELAXNGV:RELAXNG_ERR_CONTENTVALID: Element body failed to validate content