In [1]:
import os
import pandas as pd
import re
from lxml import etree
from datetime import datetime

# Prefix -> namespace-URI map handed to the find()/findall() calls in this file
# so their path expressions can use the "tei:" and "xml:" prefixes.
ns = {'tei': 'http://www.tei-c.org/ns/1.0', "xml": "http://www.w3.org/XML/1998/namespace"}


def readlist(pathtofile, filename):
    """Parse an XML file and return its root element.

    The file location is built as ``pathtofile + "/" + filename``.
    """
    return etree.parse(pathtofile + "/" + filename).getroot()

def readfiles(pathtofiles, listxml):
    """Collect every organisation reference from the files in a directory.

    Each entry of ``pathtofiles`` is parsed as XML. For every element carrying
    ``type="org"`` a record is built with ``createneworganisation`` and then
    extended with the text of the file's first ``tei:idno`` and the date taken
    from ``tei:creation/tei:date``. All records are finally handed to
    ``createcoloumns``; nothing is returned.
    """
    collected = []
    for entry in os.listdir(pathtofiles):
        root = etree.parse(pathtofiles + "/" + entry).getroot()
        regestid = root.find(".//tei:idno", ns).text
        regestdate = createdate(root.find(".//tei:creation/tei:date", ns))
        for rs in root.findall(".//*[@type='org']"):
            # Per-file values are appended after the per-reference ones.
            record = createneworganisation(rs, listxml) + [regestid, regestdate]
            collected.append(record)

    createcoloumns(collected)
    
# Column order of the exported table.
_ORGANISATION_COLUMNS = (
    'ref', 'regest', 'Regest Text', 'reg Name', 'type',
    'observance', 'fn role', 'Events ref', 'regest date',
)


def _blankifempty(value):
    """Return ``value``, or a single space when it is empty or None."""
    return value if value else " "


def _organisationrow(org):
    """Turn one organisation record into a tuple ordered like _ORGANISATION_COLUMNS."""
    # org[2] holds [type, observance, name]; pad it so that a missing or short
    # entry still yields one cell per column and the rows stay aligned.
    details = list(org[2] or [])
    details += [" "] * (3 - len(details))
    orgtype, observance, regname = (_blankifempty(item) for item in details[:3])
    return (
        _blankifempty(org[0]),   # ref
        _blankifempty(org[5]),   # regest
        _blankifempty(org[1]),   # Regest Text
        regname,
        orgtype,
        observance,
        _blankifempty(org[4]),   # fn role
        _blankifempty(org[3]),   # Events ref
        org[6],                  # regest date, stored as given
    )


def createcoloumns(listoforganisations):
    """Write the collected organisation records to an XLSX and a CSV file.

    Args:
        listoforganisations: iterable of records indexed as
            ``[ref, text, [type, observance, name], event ref, fn role,
            regest, date]``. Empty or None text fields are exported as a
            single space.

    Side effects:
        Prints every record, then writes
        ``../data/output/xlsx/extractedAllOrganisations.xlsx`` (sheet
        ``ExtractedAllOrganisations``, without the index) and
        ``../data/output/csv/extractedAllOrganisations.csv`` (tab separated,
        UTF-8, with the index).
    """
    rows = []
    for org in listoforganisations:
        print(org)
        rows.append(_organisationrow(org))

    organisationsFrame = pd.DataFrame(rows, columns=list(_ORGANISATION_COLUMNS))

    # The context manager saves and closes the workbook; ExcelWriter.save()
    # is not available in current pandas releases.
    with pd.ExcelWriter('../data/output/xlsx/extractedAllOrganisations.xlsx') as writer:
        organisationsFrame.to_excel(writer, sheet_name='ExtractedAllOrganisations', index=False)

    organisationsFrame.to_csv("../data/output/csv/extractedAllOrganisations.csv", sep="\t", encoding="utf-8")


def createneworganisation(rs, listxml):
    """Build the record for one organisation reference element.

    Returns ``[ref, text, org, events, rsrole]`` where ``ref`` is the
    element's ``ref`` attribute, ``text`` its cleaned text content, ``org``
    the result of looking ``ref`` up in ``listxml``, ``events`` the result of
    ``getorgevent`` and ``rsrole`` the result of ``getrstyperole``.
    """
    cleanedtext = cleanuptextnodestostring(rs.xpath(".//text()"))
    role = getrstyperole(rs)
    eventref = getorgevent(rs)
    reference = rs.attrib["ref"]
    listentry = getinformationsfromorglistxml(reference, listxml)
    return [reference, cleanedtext, listentry, eventref, role]

def getrstyperole(node):
    """Return the ``role`` attribute of the parent element if it has type="fn".

    When the direct parent does not carry ``type="fn"`` a single space is
    returned instead.
    """
    fnparent = node.find("./..[@type='fn']", ns)
    return " " if fnparent is None else fnparent.attrib['role']

def getorgevent(node):
    """Return the ``ref`` attribute of the event found by ``getparentevent``.

    An empty string is returned when ``getparentevent`` yields None.
    """
    enclosingevent = getparentevent(node)
    if enclosingevent is None:
        return ""
    return enclosingevent.attrib['ref']


def getparentevent(node):
    """Return an ancestor of ``node`` that carries type="event", or None.

    The first element of the ``ancestor::*[@type='event']`` XPath result is
    returned. When the node is not nested inside any such element the result
    is empty and None is returned, instead of raising IndexError.

    NOTE(review): with nested events the first entry of the result is
    presumably the outermost one (XPath results in document order) - confirm
    that this, rather than the nearest ancestor, is intended.
    """
    parents = node.xpath("ancestor::*[@type='event']")
    return parents[0] if parents else None


    

def createdate(node):
    """Convert the ``when`` attribute of ``node`` into a ``datetime.date``.

    The attribute value is parsed with the format ``%Y%m%d``.
    """
    parsed = datetime.strptime(node.attrib['when'], '%Y%m%d')
    return parsed.date()


def getinformationsfromorglistxml(ref, orglist):
    """Look up an organisation in the organisation list and describe it.

    Args:
        ref: reference to the organisation; every ``#`` is stripped before it
            is compared with the ``xml:id`` of the ``tei:org`` elements.
        orglist: element whose descendants are searched.

    Returns:
        A three-item list ``[type, description, name]``:
        ``type`` is the ``type`` attribute of the entry, ``description`` the
        text of its first ``tei:desc`` (only looked up when the type contains
        ``"Kloster_"``) and ``name`` the text of its first ``tei:reg``.
        Anything that is missing is replaced by a single space, so the result
        always has three items.
    """
    blank = " "
    orgid = ref.replace("#", '')
    org = orglist.find(".//tei:org[@xml:id='" + orgid + "']", ns)
    if org is None:
        return [blank, blank, blank]

    regname = org.find(".//tei:reg", ns)
    name = regname.text if regname is not None and regname.text else blank

    orgtype = org.attrib.get("type")
    if orgtype is None:
        # An entry without a type still yields a full triple; callers index
        # into the result and cannot cope with None.
        return [blank, blank, name]
    if "Kloster_" not in orgtype:
        return [orgtype, blank, name]

    desc = org.find(".//tei:desc", ns)
    description = desc.text if desc is not None and desc.text else blank
    return [orgtype, description, name]




def cleanuptextnodestostring(textnodes):
    """Join the given text nodes into one space-separated string.

    Every node is converted with ``str`` and split on single spaces. Pieces of
    length 0 or 1 are dropped; newlines are removed from the remaining pieces
    before all of them are joined with one space.
    """
    words = []
    for fragment in textnodes:
        words.extend(
            piece.replace("\n", "")
            for piece in str(fragment).split(" ")
            if len(piece) > 1
        )
    return " ".join(words)

# Script entry: load the organisation list. Despite its name, the variable
# holds the root of orgList.xml, i.e. organisations rather than persons.
personlistxml = readlist("../../data/lists", "orgList.xml")    
# Process every file in the regest directory; readfiles passes the collected
# records on to createcoloumns, which writes the XLSX and CSV exports.
readfiles("../../regests_QGW_PR_Stadt_ab_1400/done", personlistxml)


['#org__st_stephan', 'sand Jacob auf Unser Vraun altar', ['Kirche_Kapelle', ' ', 'St. Stephan'], '#ev__QGW_II_I_1484', 'recipient', '1484', datetime.date(1401, 2, 17)]
['#org__st_stephan', 'Stephan ze Wienne', ['Kirche_Kapelle', ' ', 'St. Stephan'], '#ev__QGW_II_I_1484', 'recipient', '1484', datetime.date(1401, 2, 17)]
['#org__dominikaner', 'den Predigern', ['Kloster_m', 'OP', 'Dominikaner'], '#ev__QGW_II_I_1484', 'recipient', '1484', datetime.date(1401, 2, 17)]
['#org__augustinereremiten', 'den Augustinern', ['Kloster_m', 'OESA', 'Augustinereremiten'], '#ev__QGW_II_I_1484', 'recipient', '1484', datetime.date(1401, 2, 17)]
['#org__karmeliter', 'den Weissenprüdern', ['Kloster_m', 'OCarm', 'Karmeliter-Weissenbrüder'], '#ev__QGW_II_I_1484', 'recipient', '1484', datetime.date(1401, 2, 17)]
['#org__st_johannes_in_der_siechenals', 'sand Johanns in der Siechen Alz/zz/z', ['Kirche_Kapelle', ' ', 'St. Johannes in der Siechenals'], '#ev__QGW_II_I_1484', 'recipient', '1484', datetime.date(1401, 2