# Metadata

```
Name: Maureen O'Shea (mo2cr@virginia.edu)
Course: DS5001
Date: 06 May 2022

Title:   Convert Folger XML to CSV

Description: Source documents and metadata for 37 Shakespearean plays.  Registers a LIB table of play codes and play titles, and an API table describing the Folger API functions.

Original Code Author:  R.C. Alvarado
```

# Set Up

In [1]:
# Root of the Folger Digital Texts site; play and API URLs are built from it.
base_url = "https://www.folgerdigitaltexts.org"

# Local directories: data_home receives the CSV tables written at the end of
# this notebook; outdir is not used in this section (presumably a scratch dir).
data_home = './data'
outdir = './dump'

# NOTE(review): meaning of this identifier is not shown here - confirm.
series_id = 22

In [2]:
import requests
from lxml import etree
import pandas as pd
import numpy as np
from io import StringIO
from glob import glob
import re

# Folger XML Files of Shakespearean Plays

In [3]:
# One "CODE: Title" entry per line; the newlines that pad the literal are
# trimmed before splitting so only the play entries remain.
plays = """
AWW: All's Well That Ends Well
Ant: Antony and Cleopatra
AYL: As You Like It
Err: The Comedy of Errors
Cor: Coriolanus
Cym: Cymbeline
Ham: Hamlet
1H4: Henry IV, Part 1
2H4: Henry IV, Part 2
H5: Henry V
1H6: Henry VI, Part 1
2H6: Henry VI, Part 2
3H6: Henry VI, Part 3
H8: Henry VIII
JC: Julius Caesar
Jn: King John
Lr: King Lear
LLL: Love's Labor's Lost
Mac: Macbeth
MM: Measure for Measure
MV: The Merchant of Venice
Wiv: The Merry Wives of Windsor
MND: A Midsummer Night's Dream
Ado: Much Ado About Nothing
Oth: Othello
Per: Pericles
R2: Richard II
R3: Richard III
Rom: Romeo and Juliet
Shr: The Taming of the Shrew
Tmp: The Tempest
Tim: Timon of Athens
Tit: Titus Andronicus
Tro: Troilus and Cressida
TN: Twelfth Night
TGV: Two Gentlemen of Verona
WT: The Winter's Tale
""".strip('\n').split('\n')

# TNK (The Two Noble Kinsmen) is omitted: it is thought to be a collaborative play written only in part by Shakespeare.

# Register a `LIB` 

In [4]:
# Library table: each "CODE: Title" entry becomes one row, with the play code
# and the play title in separate columns and the row index named play_id.
LIB = pd.DataFrame(
    [entry.split(': ') for entry in plays],
    columns=['play_code', 'play_title'],
).rename_axis('play_id')

In [5]:
# Preview the first five rows of the library table.
LIB.head(5)

Unnamed: 0_level_0,play_code,play_title
play_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,AWW,All's Well That Ends Well
1,Ant,Antony and Cleopatra
2,AYL,As You Like It
3,Err,The Comedy of Errors
4,Cor,Coriolanus


# Register an `API` 

In [6]:
# One "name (+ optional argument): description" entry per line; the newlines
# that pad the literal are trimmed before splitting so only entries remain.
api_funcs = """
synopsis: (+ act/scene, optionally) returns a synopsis of the play and its scenes
ftln (+ Folger through line number): returns the spoken text at that FTLN
word (+ word id) : returns information about that word
segment (+ object id) : returns the text of that xml:id
text: returns only the spoken text in that play
charText: returns a list of characters arranged according to amount of lines spoken, with a link to each character's entire spoken text
charTextMinus: returns a list of characters arranged according to amount of lines spoken, with a link to the play's spoken text, minus this character
concordance: lists the words used (in spoken text) and their frequency
monologue (+ optional line count): provides a list of speeches longer than the given line count (defaults to 30 lines)
onStage (+ ftln): returns a list of characters on stage at that line
charChart: provides a graphical representation of who is on stage across a timeline of the play
parts: provides parts or cue scripts for each character
witScript: provides "witScripts" for each character. "Witness" or "Witmore" scripts attempt to show what a character sees. They offer the play text only when that character is on stage.
sounds: returns a list of all stage directions that contain sounds (i.e., "music," "flourish," "thunder")
scenes: returns a list of all the scenes in the play
""".strip('\n').split('\n')

In [7]:
# API table: each entry is split once, at its first colon, into a function key
# and a description, with the row index named func_id.
# Splitting on the bare ':' (rather than ': ') and stripping both halves keeps
# keys clean for entries written as "name (+ arg) : text", which would
# otherwise keep a trailing space in func_key; limiting the split to one cut
# also protects any description that itself contains a colon.
API = pd.DataFrame(
    [[part.strip() for part in func.split(':', 1)] for func in api_funcs],
    columns=['func_key', 'func_desc'],
)
API.index.name = 'func_id'

In [8]:
# Display the API table (last expression in the cell is rendered as output).
API

Unnamed: 0_level_0,func_key,func_desc
func_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,synopsis,"(+ act/scene, optionally) returns a synopsis o..."
1,ftln (+ Folger through line number),returns the spoken text at that FTLN
2,word (+ word id),returns information about that word
3,segment (+ object id),returns the text of that xml:id
4,text,returns only the spoken text in that play
5,charText,returns a list of characters arranged accordin...
6,charTextMinus,returns a list of characters arranged accordin...
7,concordance,lists the words used (in spoken text) and thei...
8,monologue (+ optional line count),provides a list of speeches longer than the gi...
9,onStage (+ ftln),returns a list of characters on stage at that ...


In [9]:
# Subset of API function names used to build request URLs.
my_funcs = [
    "synopsis",
    "text",
    "charText",
    "charTextMinus",
    "concordance",
    "scenes",
]

In [10]:
# Function URLs: one "<base_url>/<play_code>/<function>" per (play, function)
# pair, ordered by play first and then by function within each play.

urls = [
    f"{base_url}/{code}/{func}"
    for code in LIB.play_code
    for func in my_funcs
]

In [11]:
# Source file URLs: one "<base_url>/<play_code>" per play in LIB order.

source_file_path = [f"{base_url}/{code}" for code in LIB.play_code]

# Save to CSV

In [12]:
# Write the play library table to CSV inside the data_home directory.
LIB.to_csv(path_or_buf=f'{data_home}/folger-LIB.csv')

In [13]:
# Write the API function table to CSV inside the data_home directory.
API.to_csv(path_or_buf=f'{data_home}/folger-API.csv')