In [1]:
# Small sample TEI document kept as a string; it is written to tei_1.xml and
# used as the parser input in the cells that follow. Note that the literal
# begins with a newline, so the resulting file starts with an empty line.
tei_1 = """
<TEI version="3.3.0"
    xmlns="http://www.tei-c.org/ns/1.0">
    <teiHeader>
        <fileDesc>
            <titleStmt>
                <title>Compact valid TEI document</title>
            </titleStmt>
            <publicationStmt>
                <p>This file is published as part of the Digital Philology module.</p>
            </publicationStmt>
            <sourceDesc>
                <p>No source: this is an original work.</p>
            </sourceDesc>
        </fileDesc>
    </teiHeader>
    <text>
        <body>
            <p>Paragraph one is refreshingly short.</p>
            <p>Paragraph two is even shorter.</p>
        </body>
    </text>
</TEI>
"""

In [2]:
# Write the sample document to disk. The context manager closes (and thereby
# flushes) the file even if the write raises. UTF-8 is stated explicitly
# because that is the encoding an XML parser assumes when the document carries
# no encoding declaration. `tei_file_1` stays bound after the block, so a later
# `tei_file_1.close()` is a harmless no-op.
with open("tei_1.xml", "w", encoding="utf-8") as tei_file_1:
    tei_file_1.write(tei_1)

668

In [3]:
# Closing the handle flushes any buffered text to tei_1.xml before it is parsed.
tei_file_1.close()

In [4]:
import xml.sax

In [5]:
class TEIHandler1(xml.sax.ContentHandler):
    """SAX content handler that announces every opening tag it encounters."""

    def startElement(self, tag, attributes):
        """Print a one-line notice naming the element that has just opened.

        ``attributes`` is accepted to match the SAX callback signature but is
        not used.
        """
        notice = "Starting {} element!".format(tag)
        print(notice)

In [6]:
# Build a SAX parser and switch namespace processing off, so that content
# handlers are driven through startElement/endElement with plain tag names.
parser = xml.sax.make_parser()
parser.setFeature(xml.sax.handler.feature_namespaces, False)

In [7]:
# Attach a fresh TEIHandler1 so its callbacks fire during the next parse.
parser.setContentHandler(TEIHandler1())

In [8]:
# Stream the file through the parser; the attached handler prints as each element opens.
parser.parse('tei_1.xml')

Starting TEI element!
Starting teiHeader element!
Starting fileDesc element!
Starting titleStmt element!
Starting title element!
Starting publicationStmt element!
Starting p element!
Starting sourceDesc element!
Starting p element!
Starting text element!
Starting body element!
Starting p element!
Starting p element!


In [9]:
class TEIHandler2(xml.sax.ContentHandler):
    """SAX content handler that reports ``<body>`` and the ``<p>`` elements inside it.

    Attributes:
        body_seen: True while the parser is inside a ``<body>`` element.

    Subclasses that override ``endElement`` should call
    ``super().endElement(tag)`` so that ``body_seen`` is still cleared when
    the body closes.
    """

    def __init__(self):
        # Let the base class initialise its own state before adding ours.
        super().__init__()
        self.body_seen = False
        # Number of currently open <body> elements; a counter (rather than a
        # plain flag) keeps tracking correct if bodies are nested.
        self._body_depth = 0

    def startElement(self, tag, attributes):
        """Announce the opening of ``<body>`` and of each ``<p>`` within it.

        ``attributes`` is accepted to match the SAX callback signature but is
        not used.
        """
        if tag == "body":
            print("Starting body element!")
            self._body_depth += 1
            self.body_seen = True
        elif self.body_seen and tag == "p":
            print("Starting paragraph element in the body!")

    def endElement(self, tag):
        """Clear ``body_seen`` once the outermost ``<body>`` has closed.

        Without this, paragraphs that follow the body (for example inside
        ``<back>``) would be reported as if they were part of it.
        """
        if tag == "body" and self._body_depth > 0:
            self._body_depth -= 1
            self.body_seen = self._body_depth > 0

In [10]:
# Reuse the same parser with a TEIHandler2 instance and parse the file again.
parser.setContentHandler(TEIHandler2())
parser.parse("tei_1.xml")

Starting body element!
Starting paragraph element in the body!
Starting paragraph element in the body!


In [11]:
class TEIHandler2withEnd( TEIHandler2 ):
    """TEIHandler2 variant that also reports when a body paragraph closes."""

    def endElement(self, tag):
        """Print a notice when a ``<p>`` ends while ``body_seen`` is set."""
        # Give the parent classes a chance to run their own end-of-element
        # handling instead of silently replacing it.
        super().endElement(tag)
        if self.body_seen and tag == "p":
            print("Paragraph element is ending in the body!")

# Swap in the end-aware handler and parse the same file again.
parser.setContentHandler(TEIHandler2withEnd())
parser.parse("tei_1.xml")

Starting body element!
Starting paragraph element in the body!
Paragraph element is ending in the body!
Starting paragraph element in the body!
Paragraph element is ending in the body!


In [12]:
class TEIHandler2withEndAndText( TEIHandler2withEnd ):
    """Handler that additionally echoes text content once the body has been seen."""

    def characters(self, content):
        """Print each non-blank chunk of character data, prefixed with ``>>> ``.

        Nothing is printed until ``body_seen`` is set. Chunks consisting only
        of whitespace are skipped; other chunks are printed as received,
        without stripping.
        """
        if not self.body_seen:
            return
        if content.strip():
            print(">>> " + content)

# Swap in the text-echoing handler and parse the same file again.
parser.setContentHandler(TEIHandler2withEndAndText())
parser.parse("tei_1.xml")

Starting body element!
Starting paragraph element in the body!
>>> Paragraph one is refreshingly short.
Paragraph element is ending in the body!
Starting paragraph element in the body!
>>> Paragraph two is even shorter.
Paragraph element is ending in the body!


In [13]:
import codecs

In [17]:
# Open the Macbeth XML file as UTF-8 text with the built-in open(), which
# supersedes the legacy codecs.open(). newline="" leaves line endings
# untranslated, matching what codecs.open() did.
# NOTE(review): the handle stays open for later use -- call tei_2.close() when
# finished. The absolute path is machine-specific; consider deriving it from a
# configurable base directory.
tei_2 = open("/home/mark/Documents/digital-philology/macbeth/macbeth_xml/macbeth.xml", "r", encoding="utf-8", newline="")

In [1]:
from bs4 import BeautifulSoup as BS

In [2]:
# Parse Mac.xml into a BeautifulSoup tree. The with-block closes the file as
# soon as the constructor has consumed it, instead of leaking the handle.
# NOTE(review): no parser is named, so bs4 chooses one itself; this is left
# unchanged because naming a parser could alter the resulting tree -- confirm
# which parser is intended. The absolute path is machine-specific.
with open("/home/mark/Documents/digital-philology/macbeth/macbeth_content/Mac.xml", 'r') as mac_file:
    soup = BS(mac_file)



In [3]:
# Check what kind of object the parse produced.
type(soup)

bs4.BeautifulSoup

In [7]:
# Check the container type that holds the document's top-level nodes.
type(soup.contents)

list

In [5]:
# Count the top-level nodes of the parsed document.
len(soup.contents)

2

In [6]:
# Inspect the first top-level node; the recorded output shows the
# xml-stylesheet instruction from the top of the file.
soup.contents[0]

'xml-stylesheet type="text/xsl" href="fdt.xsl"?'