# Names Data Harmonization

This script finds xml:id's in Primary Source Coop documents and attempts to match them with the Names Authorization spreadsheet. The script produces a spreadsheet with unmatched xml:id's. Editors can find the corresponding entity within the Names Authorization and write the desired xml:id in a designated column within the generated report.

Once editors have confirmed the new xml:id, a second script will find and replace the old, unmatched xml:id's.

#### Assumptions
1. Currently in this prototype, xml:id's are only compared to Taney's name authority.
    * Some xml:id's might not exist in Taney's Names Authority because the xml:id was pulled from a larger spreadsheet (e.g., JQA). Future versions of this script will need to compared id's to the larger list or mutliple lists.
2. This script assumes that Names Authority has the correct unique-identifier.
    * It works best (at the moment) with the expectation that work will be done to the names authority first. Can also be iterative, but that might require retracing steps.

In [9]:
import re, warnings, csv, sys, os, glob
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
from lxml import etree

warnings.simplefilter('ignore')

# Declare directory location to shorten filepaths later.
abs_dir = "/Users/quinn.wi/Documents/GitHub/dsg-mhs/"

## Find Unmatch xml:id's

### Variables for Directories + Files

In [None]:
%%time

# Collect Taney files
xml_directory = glob.glob("/Users/quinn.wi/Documents/SemanticData/Data/Taney/*/*.xml")

names_auth = pd.read_csv("/Users/quinn.wi/Documents/SemanticData/Data/Taney/Names-RBTaney-20200723 - Sheet1.csv")

names_auth.head()

### Generate Report of Unmatched Entities

In [51]:
%%time

# Read in file and get root of XML tree.
def get_root(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    return root


# Get namespace of individual file from root element.
def get_namespace(root):
    namespace = re.match(r"{(.*)}", str(root.tag))
    ns = {"ns":namespace.group(1)}
    return ns


# Get list of unique-id's in names authority & lowercase them.
names_auth_ids = [x.lower() for x in names_auth['Hyphenated-unique-string- of-characters'] \
                  .values.tolist()]

persData = []

for file in xml_directory:
    reFile = re.sub('.*/(.*.xml)', '\\1', file)
    root = get_root(file)
    ns = get_namespace(root)
    
    for persRef in root.findall('.//ns:p/ns:persRef/[@ref]', ns):
        xml_id = persRef.get('ref').lower() # Lowercase xml:id's.
        
        if xml_id not in names_auth_ids:
            persData.append({'file':reFile, 'xml_id':xml_id})
        else:
            continue
            
unmatched_persRef_df = pd.DataFrame(persData)

# Add empty column for user-input.
unmatched_persRef_df['correct_id'] = ''

unmatched_persRef_df.head()

CPU times: user 10.3 ms, sys: 2.9 ms, total: 13.2 ms
Wall time: 12.2 ms


Unnamed: 0,file,xml_id,correct_id
0,RBT00169-collation.xml,campbell-x,
1,RBT00087-collation.xml,mclane-louis,
2,RBT00011-collation.xml,gill-x,
3,RBT00011-collation.xml,williamson-x,
4,RBT00011-collation.xml,benton-thomas,


## Re-write xml:id's

### Variables for Directories + Files

In [None]:
%%time

# Collect Taney files
xml_directory = glob.glob("/Users/quinn.wi/Documents/SemanticData/Data/Taney/*/*.xml")

user_corrections = pd.read_csv("...")

### Write new xml:id's into XML docs

In [None]:
%%time

# Read in file and get root of XML tree.
def get_root(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    return root


# Get namespace of individual file from root element.
def get_namespace(root):
    namespace = re.match(r"{(.*)}", str(root.tag))
    ns = {"ns":namespace.group(1)}
    return ns


# Get list of unique-id's in names authority & lowercase them.
old_ids = [x.lower() for x in user_corrections['xml_id'].values.tolist()]

persData = []

for file in xml_directory:
    reFile = re.sub('.*/(.*.xml)', '\\1', file)
    root = get_root(file)
    ns = get_namespace(root)
    
    for persRef in root.findall('.//ns:p/ns:persRef/[@ref]', ns):
        xml_id = persRef.get('ref').lower() # Lowercase xml:id's.
        
#         Checks
#         If xml_id == old_ids

#         Replace (overwrite) xml_id with corrected_id.
        