# In [4]:
import os
from tabulate import tabulate
from bs4 import BeautifulSoup

# In [2]:
class JEMHEphemeraDoc:
    """
    A single ephemera document from the JEMH corpus, loaded from an XML file.

    Data members:
        fileName            | str       name of the XML file
        fileDir             | str       directory that contains the file
        year                | int       first four characters of <date when="...">
        placeOfPublication  | str       text of the first <pubPlace> element
        text                | str list  text of every <p> element

    Iterating over a document yields the entries of ``text``.
    """

    # Inner class used to read the XML files where ephemera from the JEMH
    # corpus are stored.
    class JEMHEphemeraDocParser:
        """Stateless helpers that parse one ephemera XML file with BeautifulSoup."""

        @staticmethod
        def createSoup(fileName, fileDir):
            """
            Parse the XML file ``fileName`` located in ``fileDir``.

            Returns:
                A BeautifulSoup object built with the 'xml' parser.
            """
            # Read raw bytes so the parser can detect the document's own
            # encoding instead of relying on the platform default, and use a
            # context manager so the file handle is always closed.
            with open(os.path.join(fileDir, fileName), "rb") as infile:
                return BeautifulSoup(infile.read(), 'xml')

        @staticmethod
        def parseMetadata(fileName, fileDir):
            """
            Extract the publication year and place from the XML file.

            Returns:
                A ``(year, placeOfPublication)`` tuple: ``year`` is the int
                formed by the first four characters of the ``when`` attribute
                of the first <date> element; ``placeOfPublication`` is the
                text of the first <pubPlace> element.

            Raises:
                TypeError / KeyError: if there is no <date> element or it has
                    no ``when`` attribute.
                ValueError: if the first four characters of ``when`` are not
                    an integer.
                AttributeError: if there is no <pubPlace> element.
            """
            soup = JEMHEphemeraDoc.JEMHEphemeraDocParser.createSoup(fileName, fileDir)

            date = soup.find('date')
            # Only the leading "YYYY" part of the ``when`` value is used.
            year = int(date['when'][0:4])

            placeOfPublication = soup.find('pubPlace').get_text()

            return year, placeOfPublication

        @staticmethod
        def parseText(fileName, fileDir):
            """
            Return the text of every <p> element in the XML file as a list
            of strings.
            """
            soup = JEMHEphemeraDoc.JEMHEphemeraDocParser.createSoup(fileName, fileDir)
            return [paragraph.get_text() for paragraph in soup.find_all('p')]

    def __init__(self, fileName, fileDir):
        """
        Load the document stored in ``fileDir``/``fileName``.

        The file is parsed immediately: metadata and paragraph text are read
        and stored on the instance.
        """
        self.fileName = fileName
        self.fileDir = fileDir

        parser = JEMHEphemeraDoc.JEMHEphemeraDocParser
        self.year, self.placeOfPublication = parser.parseMetadata(fileName, fileDir)
        self.text = parser.parseText(fileName, fileDir)

    def __iter__(self):
        """Iterate over the paragraphs stored in ``text``."""
        # The previous implementation iterated over the characters of the
        # file name, which was never the document's content.
        return iter(self.text)

    def __str__(self):
        """
        Return the file name, then "place, year", then one paragraph per
        line; every line (including the last) ends with a newline.
        """
        header = self.fileName + '\n'
        year_and_place = self.placeOfPublication + ", " + str(self.year) + '\n'
        body = "".join(paragraph + '\n' for paragraph in self.text)

        return header + year_and_place + body

    # Accessors (kept for backward compatibility with existing callers)
    def getFileName(self):
        """Return the name of the XML file."""
        return self.fileName

    def getFileDir(self):
        """Return the directory that contains the XML file."""
        return self.fileDir

    def getYear(self):
        """Return the publication year as an int."""
        return self.year

    def getPlaceOfPublication(self):
        """Return the place of publication."""
        return self.placeOfPublication

    def getText(self):
        """Return the list of paragraph strings."""
        return self.text

    def getMetadata(self):
        """Return ``(fileName, year, placeOfPublication)``."""
        return self.fileName, self.year, self.placeOfPublication