In [4]:
tei_1 = """
<TEI version="3.3.0"
    xmlns="http://www.tei-c.org/ns/1.0">
    <teiHeader>
        <fileDesc>
            <titleStmt>
                <title>Compact valid TEI document</title>
            </titleStmt>
            <publicationStmt>
                <p>This file is published as part of the Digital Philology module.</p>
            </publicationStmt>
            <sourceDesc>
                <p>No source: this is an original work.</p>
            </sourceDesc>
        </fileDesc>
    </teiHeader>
    <text>
        <body>
            <p>Paragraph one is refreshingly short.</p>
            <p>Paragraph two is even shorter.</p>
        </body>
    </text>
</TEI>
"""

In [5]:
import xml.sax

In [6]:
class TEIHandler1( xml.sax.ContentHandler ):
    def startElement(self, tag, attributes):
        print("Starting " + tag + " element!")

# new parser object that we will use throughout the session
parser = xml.sax.make_parser()
parser.setFeature(xml.sax.handler.feature_namespaces, 0) # type: ignore

# assign a new instance of TEIHandler1 as the content handler
parser.setContentHandler(TEIHandler1())

# start the parsing
parser.parse("/home/mark/Documents/digital-philology/XML-TEI-Drama_Mining-Tutorial/tei_1.xml")

Starting TEI element!
Starting teiHeader element!
Starting fileDesc element!
Starting titleStmt element!
Starting title element!
Starting publicationStmt element!
Starting p element!
Starting sourceDesc element!
Starting p element!
Starting text element!
Starting body element!
Starting p element!
Starting p element!


In [8]:
class TEIHandler2( xml.sax.ContentHandler ):
    def __init__(self):
        self.body_seen = False
    def startElement(self, tag, attributes):
        if tag == "body":
            print("Starting body element!")
            self.body_seen = True
        if self.body_seen and tag == "p":
            print("Starting paragraph element in the body!")

# we re-use the parser object but change the content handler
parser.setContentHandler(TEIHandler2())
parser.parse("tei_1.xml")

Starting body element!
Starting paragraph element in the body!
Starting paragraph element in the body!


In [10]:
class TEIHandler2withEnd( TEIHandler2 ):
    def endElement(self, tag):
        if self.body_seen and tag == "p":
            print("Paragraph element is ending in the body!")

parser.setContentHandler(TEIHandler2withEnd())
parser.parse("tei_1.xml")

Starting body element!
Starting paragraph element in the body!
Paragraph element is ending in the body!
Starting paragraph element in the body!
Paragraph element is ending in the body!


In [11]:
class TEIHandler2withEndAndText( TEIHandler2withEnd ):
    def characters(self, content):
        if self.body_seen and content.strip() != "":
            print(">>> "+content)

parser.setContentHandler(TEIHandler2withEndAndText())
parser.parse("tei_1.xml")

Starting body element!
Starting paragraph element in the body!
>>> Paragraph one is refreshingly short.
Paragraph element is ending in the body!
Starting paragraph element in the body!
>>> Paragraph two is even shorter.
Paragraph element is ending in the body!


In [13]:
import codecs
import urllib.request
from bs4 import BeautifulSoup

In [14]:
BeautifulSoup("<ABC>Hello<def>world!")

<html><body><abc>Hello<def>world!</def></abc></body></html>

In [15]:
soup = BeautifulSoup(codecs.open("/home/mark/Documents/digital-philology/macbeth/macbeth_content/Mac.xml", "r", "utf-8"))



In [16]:
type(soup)

bs4.BeautifulSoup

In [17]:
type(soup.contents)

list

In [18]:
len(soup.contents)

2

In [19]:
soup.contents[0]

'xml-stylesheet type="text/xsl" href="fdt.xsl"?'

In [20]:
type(soup.contents[0])

bs4.element.ProcessingInstruction

In [21]:
soup.contents[1]

<html><body><tei xmlns="http://www.tei-c.org/ns/1.0">
<teiheader>
<filedesc>
<titlestmt>
<title>Macbeth</title>
<author>William Shakespeare</author>
<editor xml:id="BAM">Barbara A. Mowat</editor>
<editor xml:id="PW">Paul Werstine</editor>
<respstmt>
<resp>Edited for XML and encoded by</resp>
<persname xml:id="MSP">Michael Poston</persname>
<persname xml:id="RLN">Rebecca Niles</persname>
</respstmt>
</titlestmt>
<editionstmt>
<edition n="0.9.2.1">Text released without textual notes</edition>
</editionstmt>
<publicationstmt>
<publisher>Folger Shakespeare Library</publisher>
<idno>Mac</idno>
<address>
<addrline>201 East Capitol Street, SE</addrline>
<addrline>Washington, DC 20003</addrline>
<addrline>https://shakespeare.folger.edu</addrline>
<addrline>folgertexts@folger.edu</addrline>
</address>
<availability>
<licence target="http://creativecommons.org/licenses/by-nc/3.0/deed.en_US">Distributed
                        under a Creative Commons Attribution-NonCommercial 3.0 Unported Licens

In [22]:
type(soup.contents[1])

bs4.element.Tag

In [24]:
# this is the TEI element
type(soup.contents[2])

IndexError: list index out of range

In [25]:
soup.contents[3]

IndexError: list index out of range

In [26]:
type(soup.contents[3])

IndexError: list index out of range

In [27]:
len(soup.tei) # type: ignore

5

In [29]:
[type(x) for x in soup.tei] # type: ignore

[bs4.element.NavigableString,
 bs4.element.Tag,
 bs4.element.NavigableString,
 bs4.element.Tag,
 bs4.element.NavigableString]

In [31]:
[type(x) for x in soup.tei.contents] # type: ignore

[bs4.element.NavigableString,
 bs4.element.Tag,
 bs4.element.NavigableString,
 bs4.element.Tag,
 bs4.element.NavigableString]

In [32]:
len(soup.tei.body) # type: ignore

# error, contents is a list:
# len(soup.tei.contents.body)
# Error: special function called "text" - how to access text element?
#len(soup.tei.text.body)

TypeError: object of type 'NoneType' has no len()

In [34]:
len(soup.tei.find("text")) # type: ignore

45

In [35]:
[type(x) for x in soup.tei.body.contents]

AttributeError: 'NoneType' object has no attribute 'contents'

In [36]:
[x.name for x in soup.tei.body.contents]

AttributeError: 'NoneType' object has no attribute 'contents'

In [38]:
for c in soup.children:
    c.name # type: ignore

In [40]:
[(x["type"], int(x["n"])) for x in soup.tei.body.find_all("div", recursive=False)] # type: ignore

AttributeError: 'NoneType' object has no attribute 'find_all'

In [41]:
[(x["type"], int(x["n"])) for x in soup.tei.body.find_all("div")]

AttributeError: 'NoneType' object has no attribute 'find_all'

In [47]:
castlist = soup.tei.find("text").front.castlist

In [48]:
castlist.findAll("castitem")

AttributeError: 'NoneType' object has no attribute 'findAll'

In [49]:
for group in castlist.findAll("castgroup"):
    if group.find("head", recursive=False):
        print("H " + group.head.text.strip())
    for item in group.findAll("castitem", recursive=False):
        if not item.get("corresp"):
            print("> " + item.text.strip() + " (" + item["xml:id"] + ")")

AttributeError: 'NoneType' object has no attribute 'findAll'

In [50]:
for group in castlist.findAll("castgroup"):
    if group.find("head", recursive=False):
        print("- " + group.head.text.strip())
    for item in group.findAll("castitem", recursive=False):
        if not item.get("corresp"):
            role = item.role.text.strip() if item.role else None
            description = item.roledesc.text.strip() if item.roledesc else None
            text = "<unnamed>" if not role else role
            text = text + ", " + description if description else text
            print(" " + text + " (" + item["xml:id"] + ")")

AttributeError: 'NoneType' object has no attribute 'findAll'

In [52]:
import xml.etree.ElementTree as ElementTree
tei_doc = ElementTree.fromstring(codecs.open("/home/mark/Documents/digital-philology/macbeth/macbeth_content/Mac.xml", "r", "utf-8").read())
tei_header, tei_text = tei_doc
tei_header.tag

'{http://www.tei-c.org/ns/1.0}teiHeader'

In [53]:
tei_text.findall("./body")

[]

In [54]:
tei_text.findall("./{http://www.tei-c.org/ns/1.0}body")

[<Element '{http://www.tei-c.org/ns/1.0}body' at 0x7f121e201990>]

In [55]:
# setup namespace for writing "body" instead of "{http://www.tei-c.org/ns/1.0}body"
ns = {"": "http://www.tei-c.org/ns/1.0"}
len(tei_text.findall(".//castList//castGroup/castItem", ns))

0

In [69]:
act_2_scene_4 = tei_text.findall(
'.//div[@type="act"][@n="2"]//div[@type="scene"][@n="4"]', ns
)[:1]

In [70]:
len(act_2_scene_4.findall(".//sp", ns))

len(act_2_scene_4.findall(".//sp/speaker", ns))

len(act_2_scene_4.findall(".//sp//l", ns))

for speech in act_2_scene_4.findall(".//sp", ns):
    for line in speech.findall(".//l", ns):
        for word in line.findall(".//w", ns):
            if word.attrib["lemma"] == "do":
                print(word.text)

AttributeError: 'list' object has no attribute 'findall'

In [72]:
import codecs
import xml.etree.ElementTree as ElementTree
tei_header, tei_text = ElementTree.fromstring(
codecs.open("/home/mark/Documents/digital-philology/macbeth/macbeth_content/Mac.xml", "r", "utf-8").read()
)

In [73]:
def xp(node, path):
    return node.findall("." + path, {"": "http://www.tei-c.org/ns/1.0"})

xp(tei_text, '//div[@type="act"]')

[]

In [75]:
scene_speakers = []
for act in xp(tei_text, '//div[@type="act"]'):
    act_number = int(act.attrib["n"])
    for scene in xp(act, '//div[@type="scene"]'):
        scene_number = int(scene.attrib["n"])
        speakers = xp(scene, "//sp")
        speaker_ids = [speaker.attrib["who"] for speaker in speakers]
        scene_speakers.append([act_number, scene_number, speaker_ids])

In [76]:
print(scene_speakers)

[]


# NetowrkX

In [1]:
import networkx as nx
g = nx.Graph()
g.number_of_nodes()

ModuleNotFoundError: No module named 'networkx'