# Exploring TEI Structure of JQA

Create a Network Graph of Parent-Child TEI relationships.

In [13]:
# Import necessary libraries.
import re, glob, csv, sys, os
import pandas as pd
import xml.etree.ElementTree as ET
from lxml import etree

# Declare directory location to shorten filepaths later.
abs_dir = "/Users/quinn.wi/Documents/"

# Gather all .xml files using glob.
list_of_files = glob.glob(abs_dir + "Data/JQA/*/*.xml")

In [5]:
%%time

# Read in file and get root of XML tree.
def get_root(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    return root


# Get namespace of individual file from root element.
def get_namespace(root):
    namespace = re.match(r"{(.*)}", str(root.tag))
    ns = {"ns":namespace.group(1)}
    return ns

#     Call functions to create necessary variables and grab content.
#     root = get_root(file)
#     ns = get_namespace(root)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.29 µs


In [36]:
%%time

tei = []

# Loop through each file within a directory.
for file in list_of_files:

    tree = etree.parse(file)
    
    for elem in tree.iter():
        try:
            child = re.sub('{.*}(.*)', '\\1', str(elem.tag))
            parent = re.sub('{.*}(.*)', '\\1', str(elem.getparent().tag))
            
            data = {
                'source': parent,
                'target': child
            }
        
            tei.append(data)
            
        except AttributeError:
            pass

tei = pd.DataFrame(tei)

tei = tei.groupby(['source', 'target']).size().reset_index()
tei.columns = ['source', 'target', 'weight']

tei.head()

CPU times: user 2.07 s, sys: 35.4 ms, total: 2.1 s
Wall time: 2.13 s


Unnamed: 0,source,target,weight
0,TEI,teiHeader,323
1,TEI,text,323
2,bibl,<cyfunction Comment at 0x7fb668476ee0>,4
3,bibl,author,9116
4,bibl,date,9439


In [37]:
%%time

tei.to_csv(abs_dir + 'GitHub/dsg-mhs/d3/Network/TEI-Structure/jqa_tei-structure.csv',
           sep = ',', index = False)

CPU times: user 1.82 ms, sys: 2.18 ms, total: 4 ms
Wall time: 4.56 ms
