In [7]:
tei_1 = """
<TEI version="3.3.0"
    xmlns="http://www.tei-c.org/ns/1.0">
    <teiHeader>
        <fileDesc>
            <titleStmt>
                <title>Compact valid TEI document</title>
            </titleStmt>
            <publicationStmt>
                <p>This file is published as part of the Digital Philology module.</p>
            </publicationStmt>
            <sourceDesc>
                <p>No source: this is an original work.</p>
            </sourceDesc>
        </fileDesc>
    </teiHeader>
    <text>
        <body>
            <p>Paragraph one is refreshingly short.</p>
            <p>Paragraph two is even shorter.</p>
        </body>
    </text>
</TEI>
"""

In [8]:
tei_file_1 = open("tei_1.xml", "w")
tei_file_1.write(tei_1)

668

In [9]:
tei_file_1.close()

In [10]:
import xml.sax

In [15]:
class TEIHandler1(xml.sax.ContentHandler):
    def startElement(self, tag, attributes):
        print("Starting " + tag + " element!")

In [16]:
parser = xml.sax.make_parser()
parser.setFeature(xml.sax.handler.feature_namespaces, 0)

In [17]:
parser.setContentHandler(TEIHandler1())

In [18]:
parser.parse('tei_1.xml')

Starting TEI element!
Starting teiHeader element!
Starting fileDesc element!
Starting titleStmt element!
Starting title element!
Starting publicationStmt element!
Starting p element!
Starting sourceDesc element!
Starting p element!
Starting text element!
Starting body element!
Starting p element!
Starting p element!


In [19]:
class TEIHandler2(xml.sax.ContentHandler):
    def __init__(self):
        self.body_seen = False
    def startElement(self, tag, attributes):
        if tag == "body":
            print("Starting body element!")
            self.body_seen = True
        if self.body_seen and tag == "p":
            print("Starting paragraph element in the body!")

In [21]:
parser.setContentHandler(TEIHandler2())
parser.parse("tei_1.xml")

Starting body element!
Starting paragraph element in the body!
Starting paragraph element in the body!


In [22]:
class TEIHandler2withEnd( TEIHandler2 ):
    def endElement(self, tag):
        if self.body_seen and tag == "p":
            print("Paragraph element is ending in the body!")

parser.setContentHandler(TEIHandler2withEnd())
parser.parse("tei_1.xml")

Starting body element!
Starting paragraph element in the body!
Paragraph element is ending in the body!
Starting paragraph element in the body!
Paragraph element is ending in the body!


In [23]:
class TEIHandler2withEndAndText( TEIHandler2withEnd ):
    def characters(self, content):
        if self.body_seen and content.strip() != "":
            print(">>> "+content)

parser.setContentHandler(TEIHandler2withEndAndText())
parser.parse("tei_1.xml")

Starting body element!
Starting paragraph element in the body!
>>> Paragraph one is refreshingly short.
Paragraph element is ending in the body!
Starting paragraph element in the body!
>>> Paragraph two is even shorter.
Paragraph element is ending in the body!


In [24]:
import codecs
import urllib.request

In [42]:
url = "https://shakespeare.folger.edu/downloads/teisimple/" ## Download the file instead!
url = url + "macbeth_TEIsimple_FolgerShakespeare.xml"
remote_file = urllib.request.urlopen(url)

tei_file2 = codecs.open("macbeth.xml", "w", "utf-8")
tei_file2.write(remote_file.read().decode("utf-8"))
tei_file2.close()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc0 in position 10: invalid start byte

In [43]:
parser.parse("macbeth.xml")

SAXParseException: macbeth.xml:1:0: no element found