In [1]:
import xml.etree.ElementTree as ET
import os
import torch
from tabulate import tabulate
from bs4 import BeautifulSoup

In [4]:
class JEMHBenYehudaDoc:
    """
    A single XML document (file name, year and paragraph texts), parsed
    on construction.

    Data members:
        fileName            | str       name of the XML file
        fileDir             | str       directory that contains the file
        year                | int       integer value of <date when="...">
        text                | str list  text of every <p> element

    Iterating over an instance yields the paragraph strings in ``text``.
    """

    class JEMHBenYehudaDocParser:
        """Inner helper that reads the XML file and extracts its fields."""

        @staticmethod
        def createSoup(fileName, fileDir):
            """
            Read ``fileDir/fileName`` and parse it with BeautifulSoup's
            'xml' parser.

            The file is opened in a ``with`` block so the handle is closed
            as soon as the content has been read.
            """
            with open(fileDir + "/" + fileName, "r") as infile:
                return BeautifulSoup(infile.read(), 'xml')

        @staticmethod
        def _yearFromSoup(soup):
            """Return the ``when`` attribute of the first <date> element as int."""
            date = soup.find('date')
            # No guard here on purpose: a missing <date> element, a missing
            # "when" attribute or a non-integer value raises, as before.
            return int(date["when"])

        @staticmethod
        def _textFromSoup(soup):
            """Return the text content of every <p> element as a list of str."""
            return [paragraph.get_text() for paragraph in soup.find_all('p')]

        @staticmethod
        def parseMetadata(fileName, fileDir):
            """
            Parse the file and return its year as an int.

            The year is read from the ``when`` attribute of the first
            <date> element found in the document.
            """
            parser = JEMHBenYehudaDoc.JEMHBenYehudaDocParser
            return parser._yearFromSoup(parser.createSoup(fileName, fileDir))

        @staticmethod
        def parseText(fileName, fileDir):
            """Parse the file and return the text of each <p> element (str list)."""
            parser = JEMHBenYehudaDoc.JEMHBenYehudaDocParser
            return parser._textFromSoup(parser.createSoup(fileName, fileDir))

    def __init__(self, fileName, fileDir):
        """
        Load and parse ``fileDir/fileName``.

        The file is read and parsed once; both ``year`` and ``text`` are
        extracted from the same parse tree.
        """
        self.fileName = fileName
        self.fileDir = fileDir

        parser = JEMHBenYehudaDoc.JEMHBenYehudaDocParser
        soup = parser.createSoup(fileName, fileDir)
        self.year = parser._yearFromSoup(soup)
        self.text = parser._textFromSoup(soup)

    def __iter__(self):
        """Iterate over the paragraph strings of the document."""
        # Previously this walked the characters of the file name, which is
        # not the content of the document.
        return iter(self.text)

    def __str__(self):
        """Return file name, year and every paragraph, one per line."""
        body = "".join(t + '\n' for t in self.text)
        return self.fileName + '\n' + str(self.year) + '\n' + body

    # Accessors (kept for callers that already use them; the attributes
    # themselves are public as well).
    def getFileName(self):
        """Return the file name."""
        return self.fileName

    def getFileDir(self):
        """Return the directory of the file."""
        return self.fileDir

    def getYear(self):
        """Return the year as an int."""
        return self.year

    def getText(self):
        """Return the list of paragraph strings."""
        return self.text

    def getMetadata(self):
        """Return the tuple ``(fileName, year)``."""
        return self.fileName, self.year