# XML Parser for Extracting JQA Subject Headings



In [1]:
# Import necessary libraries.
import re, glob, csv, sys, os
import pandas as pd
import xml.etree.ElementTree as ET

# Base directory that holds the "Data/" tree. Every path below is built by
# appending to it, so os.path.join(..., "") guarantees a trailing separator.
# Defaults to the original local folder; set the JQA_BASE_DIR environment
# variable to run the notebook on another machine without editing this cell.
abs_dir = os.path.join(os.environ.get("JQA_BASE_DIR", "/Users/quinn.wi/Documents/"), "")

# Gather all .xml files one directory level below Data/JQA/.
# glob returns matches in arbitrary order, so sort for a reproducible row order.
list_of_files = sorted(glob.glob(abs_dir + "Data/JQA/*/*.xml"))

## Define Functions

In [2]:
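'''
Arguments of Functions:

    namespace:
        Dictionary mapping the prefix "ns" to the file's XML namespace URI,
        as returned by get_namespace(). It is passed to find()/findall() so
        that "ns:"-prefixed steps in an XPath resolve to namespaced tags.

    ancestor:
        The XML element a lookup starts from. Attribute lookups read from it
        directly; XPath searches are evaluated relative to it.
    
    xpath_as_string:
        XPath expression (as a string) that locates the target element(s)
        relative to ancestor.
    
    attrib_val_str:
        Name of the attribute whose value is returned.
    
'''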

# Parse an XML file and hand back the root element of its tree.
def get_root(xml_file):
    """Return the root element of the XML document ``xml_file``.

    ``xml_file`` is passed unchanged to ``ET.parse``.
    """
    return ET.parse(xml_file).getroot()


# Get namespace of individual file from root element.
def get_namespace(root):
    """Build the prefix map used for namespaced XPath lookups.

    ElementTree spells a namespaced tag as "{uri}localname". The URI between
    the braces of the root tag is extracted and bound to the prefix "ns".

    Returns:
        dict: ``{"ns": <namespace URI of the root element>}``.

    Raises:
        ValueError: if the root tag has no leading "{uri}" part, i.e. the
            document declares no namespace on its root element.
    """
    # [^}]* stops at the first closing brace, so only the URI is captured.
    match = re.match(r"\{([^}]*)\}", str(root.tag))
    if match is None:
        # Without this check the failure is an opaque
        # "'NoneType' object has no attribute 'group'".
        raise ValueError(
            "root element %r has no XML namespace; cannot build 'ns' prefix map"
            % (root.tag,)
        )
    return {"ns": match.group(1)}


# Look up the identifier attribute of a document element.
def get_document_id(ancestor, attrib_val_str):
    """Return the value of attribute ``attrib_val_str`` on ``ancestor``.

    The result is whatever ``ancestor.get`` returns for that attribute name
    (``None`` for an ElementTree element that lacks the attribute).
    """
    return ancestor.get(attrib_val_str)


# Get date of document.
def get_date_from_attrValue(ancestor, xpath_as_string, attrib_val_str, namespace):
    """Return an attribute value from the first element matching an XPath.

    Args:
        ancestor: element the XPath is evaluated against.
        xpath_as_string: XPath locating the date element.
        attrib_val_str: name of the attribute holding the date.
        namespace: prefix map passed to ``find``.

    Returns:
        The attribute value, or ``None`` when either the element or the
        attribute is absent.
    """
    date_elem = ancestor.find(xpath_as_string, namespace)
    # find() returns None when nothing matches; calling .get() on that would
    # raise AttributeError, so report "no date" the same way a missing
    # attribute is reported.
    if date_elem is None:
        return None
    return date_elem.get(attrib_val_str)

# Get subject heading from document.
def get_subject_from_attrValue(ancestor, xpath_as_string, namespace):
    """Collect the text of every matching element into one comma-joined string.

    Args:
        ancestor: element the XPath is evaluated against.
        xpath_as_string: XPath locating the subject elements.
        namespace: prefix map passed to ``findall``.

    Returns:
        str: the subjects joined with ",", each with runs of whitespace
        collapsed to one space and surrounding whitespace removed. Empty
        string when nothing matches. The string can be split on "," later.
    """
    subject_list = []
    for elem in ancestor.findall(xpath_as_string, namespace):
        # itertext() yields only the text inside the element.
        # ET.tostring(..., method='text') also appends elem.tail, which leaks
        # whatever text follows the closing tag into the heading.
        subject = ''.join(elem.itertext())
        subject_list.append(re.sub(r'\s+', ' ', subject).strip())
    return ','.join(subject_list)

    
# Get plain text of every element (designated by first argument).
def get_textContent(ancestor, xpath_as_string, namespace):
    """Return the plain text of all matching elements as one string.

    Args:
        ancestor: element the XPath is evaluated against.
        xpath_as_string: XPath locating the text-bearing elements.
        namespace: prefix map passed to ``findall``.

    Returns:
        str: the text of each match (nested markup flattened, runs of
        whitespace collapsed, surrounding whitespace removed) joined with a
        single space. Empty string when nothing matches.
    """
    text_list = []
    for elem in ancestor.findall(xpath_as_string, namespace):
        # itertext() covers the element and its descendants but not elem.tail.
        # ET.tostring(..., method='text') would also emit the tail, i.e. the
        # text sitting between this element and its next sibling.
        text = ''.join(elem.itertext())

        # Add text (cleaned of additional whitespace) to text_list.
        text_list.append(re.sub(r'\s+', ' ', text).strip())

    # Return concatenated text list.
    return ' '.join(text_list)

## Declare Variables

In [3]:
# Declare regex to simplify file paths below: captures the last path component
# of a ".../<dir>/<name>.xml" path. The dot is escaped so that only a literal
# "." before the extension matches.
regex = re.compile(r'.*/.*/(.*\.xml)')

# The XPath predicates below are attached directly to their step
# (step[@attr]) rather than written as "step/[@attr]". ElementTree accepts
# both spellings identically, but only the former is standard XPath.

# Declare document level of file. Requires root starting point ('.').
doc_as_xpath = './/ns:div[@type="entry"]'

# Declare date element of each document.
date_path = './ns:bibl/ns:date[@when]'

# Declare subject elements in each document.
subject_path = './/ns:bibl/ns:note[@type="subject"]'

# Declare text level within each document.
text_path = './ns:div[@type="docbody"]/ns:p'

## Parse Documents

In [4]:
%%time

# Open/Create file to write contents.
# encoding='utf-8' matches what pd.read_csv assumes when the file is read back
# (the diary text contains non-ASCII characters); newline='' leaves line
# endings entirely to the csv writer.
with open(abs_dir + 'Data/Output/ParsedXML/JQA_Subjects-dataframe.txt', 'w',
          encoding='utf-8', newline='') as outFile:

    # csv.writer quotes any field that contains a tab, a double quote or a
    # line break, so such a field cannot shift or swallow columns when the
    # table is parsed again. It also writes None as an empty field, so an
    # entry without an id or date no longer breaks the row.
    writer = csv.writer(outFile, delimiter='\t', lineterminator='\n')

    # Write headers for table.
    writer.writerow(['file', 'entry', 'date', 'subject', 'text'])

    # Loop through each file within a directory.
    for file in list_of_files:
        # Reduce the full path to the bare file name for the 'file' column.
        reFile = str(regex.search(file).group(1))

        # Call functions to create necessary variables and grab content.
        root = get_root(file)
        ns = get_namespace(root)

        # One output row per document (entry) found in the file.
        for eachDoc in root.findall(doc_as_xpath, ns):
            # Call functions.
            entry = get_document_id(eachDoc, '{http://www.w3.org/XML/1998/namespace}id')
            date = get_date_from_attrValue(eachDoc, date_path, 'when', ns)
            subject = get_subject_from_attrValue(eachDoc, subject_path, ns)
            text = get_textContent(eachDoc, text_path, ns)

            # Write results in tab-separated format.
            writer.writerow([reFile, entry, date, subject, text])

CPU times: user 1.85 s, sys: 63.3 ms, total: 1.92 s
Wall time: 2 s


## Import Dataframe

In [5]:
# Load the tab-separated table written by the parsing step.
dataframe = pd.read_csv(
    abs_dir + 'Data/Output/ParsedXML/JQA_Subjects-dataframe.txt',
    sep='\t',
)

# Bare last expression so the notebook renders the frame.
dataframe

Unnamed: 0,file,entry,date,subject,text
0,JQADiaries-v30-1817-10-p260.xml,jqadiaries-v30-1817-10-01,1817-10-01,"Adams Family Residences,Commerce",1. IV:30. Wednesday. Wrote a Letter to J. L. S...
1,JQADiaries-v30-1817-10-p260.xml,jqadiaries-v30-1817-10-02,1817-10-02,"Foreign Relations,Health and Illness,South Ame...",2. IV: Continued drafting instructions for Rus...
2,JQADiaries-v30-1817-10-p260.xml,jqadiaries-v30-1817-10-03,1817-10-03,"Adams Family Residences,Privateering,Recreatio...",3. IV: I had visits this morning from Mr Levet...
3,JQADiaries-v30-1817-10-p260.xml,jqadiaries-v30-1817-10-04,1817-10-04,"Commerce,Foreign Relations,Health and Illness",4. IV: I waked before three and had afterwards...
4,JQADiaries-v30-1817-10-p260.xml,jqadiaries-v30-1817-10-05,1817-10-05,Adams Family Residences,5. V: The Ladies went this morning to St. John...
...,...,...,...,...,...
9111,JQADiaries-v31-1820-07-p381.xml,jqadiaries-v31-1820-07-28,1820-07-28,Privateering,"28. IV:Mrs Cornell, with a Baptist Clergyman f..."
9112,JQADiaries-v31-1820-07-p381.xml,jqadiaries-v31-1820-07-29,1820-07-29,Immigration,29. IV: I took up this morning seriously the s...
9113,JQADiaries-v31-1820-07-p381.xml,jqadiaries-v31-1820-07-30,1820-07-30,Religion,30. IV:30. We all attended Church at Mr M’Corm...
9114,JQADiaries-v31-1820-07-p381.xml,jqadiaries-v31-1820-07-31,1820-07-31,Privateering,31. V: The morning was swallowed up by this Jo...
