# Metadata

```
Name: Maureen O'Shea (mo2cr@virginia.edu)
Course: DS5001
Date: 06 May 2022

Title:   A Client for the Folger API

Description: Import and parse the XML files, establish the OHCO, and register a TOKEN table of extracted token strings with their lemma and part-of-speech annotations.

Original Code Author:  R.C. Alvarado
```

# Set Up

In [1]:
# Locations used throughout the notebook.
src_dir = './XML'      # directory globbed for the *.xml source files
data_home = "./data"   # directory the CSV tables are read from / written to

In [2]:
from lxml import etree
import pandas as pd
import numpy as np
from glob import glob
import re

# Get Data

In [3]:
# Load the library table and key it by play code.
LIB = (
    pd.read_csv(f'{data_home}/folger-LIB.csv')
    .set_index(keys='play_code')
)

In [4]:
# Display the library table (indexed by play_code).
LIB

Unnamed: 0_level_0,play_id,play_title
play_code,Unnamed: 1_level_1,Unnamed: 2_level_1
AWW,0,All's Well That Ends Well
Ant,1,Antony and Cleopatra
AYL,2,As You Like It
Err,3,The Comedy of Errors
Cor,4,Coriolanus
Cym,5,Cymbeline
Ham,6,Hamlet
1H4,7,"Henry IV, Part 1"
2H4,8,"Henry IV, Part 2"
H5,9,Henry V


# Get Page Contents

In [5]:
# Namespace prefixes used when evaluating the XPath expressions below.
ns = {
    "xmlns": "http://www.tei-c.org/ns/1.0",
    "xmlns2": "http://www.w3.org/XML/1998/namespace",
}

# XPath expressions for the parts of each document that get extracted.
xpaths = dict(
    title="/xmlns:TEI/xmlns:teiHeader/xmlns:fileDesc/xmlns:titleStmt/xmlns:title/text()",
    tokens="//xmlns:sp/xmlns:l/xmlns:w",
)

In [6]:
def _split_line_ref(line_ref):
    """Split a line's ``n`` attribute into ``(act_num, scene_num, line_num)``.

    Parameters
    ----------
    line_ref : str or None
        Dot-separated reference taken from the ``<l>`` element, either
        ``act.scene.line`` or the two-part form ``act.line``.

    Returns
    -------
    tuple
        ``(act_num, scene_num, line_num)``. For the two-part form the scene
        is reported as ``0``. All three are ``None`` when ``line_ref`` is None.

    Raises
    ------
    ValueError
        If the reference has neither two nor three dot-separated parts.
    """
    if line_ref is None:
        return None, None, None
    parts = line_ref.split('.')
    if len(parts) == 3:
        act, scene, line_no = parts
        return act, scene, line_no
    if len(parts) == 2:
        # Two-part references carry no scene; the act must still come from
        # this reference rather than leaking over from the previous line.
        act, line_no = parts
        return act, 0, line_no
    raise ValueError(
        f"Unexpected line reference {line_ref!r}: "
        "expected 'act.scene.line' or 'act.line'"
    )


def _split_speaker(who):
    """Split a speech's ``who`` attribute into ``(play_code, speaker)``.

    The text after the last underscore is the play code and everything before
    it is the speaker. When there is no underscore, or no ``who`` attribute at
    all, the play code is ``"__NONE__"`` and the speaker is the raw value
    (``None`` when the attribute is missing).
    """
    if who is None:
        return "__NONE__", None
    speaker_part, sep, code = who.rpartition('_')
    if not sep:
        return "__NONE__", who
    return code, speaker_part


# Fully qualified name of the xml:id attribute, for Element.get().
xml_id_attr = f"{{{ns['xmlns2']}}}id"

token_data = []
parser = etree.XMLParser()

# glob() returns files in arbitrary order; sort so the row order of the
# resulting table is the same on every run and every machine.
for page in sorted(glob(f"{src_dir}/*.xml")):

    tree = etree.parse(page, parser)
    title = tree.xpath(xpaths['title'], namespaces=ns)[0]

    for token in tree.xpath(xpaths['tokens'], namespaces=ns):

        # Token element: text plus optional lemma / part-of-speech attributes.
        # Missing pieces are recorded as None instead of aborting the run.
        text_nodes = token.xpath("./text()")
        token_str = text_nodes[0] if text_nodes else None
        lemma = token.get("lemma")
        ana = token.get("ana")

        # Line element: the parent <l> of the token.
        line = token.getparent()
        act_num, scene_num, line_num = _split_line_ref(line.get("n"))

        # Speech element: the <sp> that contains the line.
        sp = line.getparent()
        title_code, speaker = _split_speaker(sp.get("who"))
        speech_id = sp.get(xml_id_attr)

        token_data.append((title_code, act_num, scene_num, line_num, speech_id, speaker, token_str, lemma, ana))

In [7]:
# Build the TOKEN table from the collected records; one row per tuple in
# token_data, with one column label per tuple field.
TOKEN = pd.DataFrame(
    data=token_data,
    columns=[
        'play_code',
        'act_num',
        'scene_num',
        'line_num',
        'speech_id',
        'speaker',
        'token_str',
        'lemma',
        'pos',
    ],
)

In [8]:
# Column names, in order, that form the OHCO index of the TOKEN table.
OHCO = [
    'play_code',
    'act_num',
    'scene_num',
    'speech_id',
    'speaker',
    'line_num',
]

In [9]:
# Move the OHCO columns into the index. The guard keeps this cell idempotent:
# once those columns are in the index, a second set_index(OHCO) would raise a
# KeyError when the cell is re-run.
if list(TOKEN.index.names) != OHCO:
    TOKEN = TOKEN.set_index(OHCO)

In [10]:
# Display the TOKEN table.
TOKEN

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,token_str,lemma,pos
play_code,act_num,scene_num,speech_id,speaker,line_num,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Lr,1,1,sp-0034,#Lear,34,Attend,attend,#vvb
Lr,1,1,sp-0034,#Lear,34,the,the,#d
Lr,1,1,sp-0034,#Lear,34,lords,lord,#n2
Lr,1,1,sp-0034,#Lear,34,of,of,#acp-p
Lr,1,1,sp-0034,#Lear,34,France,France,#n1-nn
...,...,...,...,...,...,...,...,...
Shr,5,2,sp-2731,#Lucentio,206,she,she,#pns
Shr,5,2,sp-2731,#Lucentio,206,will,will,#vmb
Shr,5,2,sp-2731,#Lucentio,206,be,be,#vvi
Shr,5,2,sp-2731,#Lucentio,206,tamed,tame,#vvn


## Save to CSV

In [11]:
# Write the TOKEN table to the data directory as CSV.
TOKEN.to_csv(path_or_buf=f'{data_home}/folger-TOKEN.csv')