In [2]:
import os
import pandas as pd
import re
from lxml import etree
from datetime import datetime

# Namespace prefix map handed to the find()/findall() calls in this file:
# "tei" qualifies TEI element names, "xml" is used for xml:id attribute lookups.
ns = {'tei': 'http://www.tei-c.org/ns/1.0', "xml": "http://www.w3.org/XML/1998/namespace"}


def readlist(pathtofile, filename):
    """Parse the XML file ``filename`` located in ``pathtofile``.

    Returns the root element of the parsed document.
    """
    location = pathtofile + "/" + filename
    return etree.parse(location).getroot()

def readfiles(pathtofiles, listxml):
    """Collect every organisation reference from the regest files and export them.

    Each ``*.xml`` entry of ``pathtofiles`` is parsed. For every element that
    carries ``type="org"`` one record is built by ``createneworganisation`` and
    extended with the regest identifier (text of the first ``tei:idno``) and
    the regest date (``createdate`` applied to ``tei:creation/tei:date``).
    The collected records are finally passed to ``createcoloumns``.

    Args:
        pathtofiles: Directory that contains the regest XML files.
        listxml: Root element of the organisation list; forwarded unchanged
            to ``createneworganisation``.
    """
    listoforganisations = []
    # os.listdir() yields entries in arbitrary order; sorting keeps the
    # exported rows reproducible between runs.
    for filename in sorted(os.listdir(pathtofiles)):
        # Skip anything that is not an XML file (sub-directories, editor
        # backups, ...) instead of letting the parser fail on it.
        if not filename.lower().endswith(".xml"):
            continue
        print(filename)
        regestroot = etree.parse(os.path.join(pathtofiles, filename)).getroot()

        # A regest without an <idno> (or with an empty one) is exported with
        # an empty identifier rather than aborting the whole run.
        idno = regestroot.find(".//tei:idno", ns)
        actualregest = idno.text if idno is not None and idno.text else ""

        date = createdate(regestroot.find(".//tei:creation/tei:date", ns))
        for rs in regestroot.findall(".//*[@type='org']"):
            organisation = createneworganisation(rs, listxml)
            organisation.append(actualregest)
            organisation.append(date)
            listoforganisations.append(organisation)

    createcoloumns(listoforganisations)

def createcoloumns(listoforganisations):
    """Turn the collected organisation records into a table and export it.

    Every record is a list laid out as
    ``[ref, text, orginfo, eventref, fnrole, regest, dates]`` where
    ``orginfo`` is a three-item list (type, observance, regularised name) and
    ``dates`` holds one date (single day) or two dates (from/to).

    Each record produces exactly one row, so the columns always have the same
    length. Missing pieces are filled with placeholders: ``" "`` for text
    fields, ``"no role given"`` for the role and ``""`` for date fields.

    The table is written to
    ``../data/output/xlsx/extractedAllOrganisations.xlsx`` (sheet
    ``ExtractedAllOrganisations``) and, tab-separated, to
    ``../data/output/csv/extractedAllOrganisations.csv``.

    Args:
        listoforganisations: List of records as described above.
    """
    columns = [
        'ref', 'regest', 'Regest Text', 'reg Name', 'type', 'observance',
        'fn role', 'Events ref', 'regest date', 'from', 'to', 'year',
        'fromyear', 'toyear',
    ]
    rows = [_organisation_row(org) for org in listoforganisations]
    organisationsFrame = pd.DataFrame(rows, columns=columns)

    # The context manager saves and closes the workbook; ExcelWriter.save()
    # no longer exists in pandas 2.x.
    with pd.ExcelWriter('../data/output/xlsx/extractedAllOrganisations.xlsx') as writer:
        organisationsFrame.to_excel(writer, sheet_name='ExtractedAllOrganisations', index=False)

    organisationsFrame.to_csv("../data/output/csv/extractedAllOrganisations.csv", sep="\t", encoding="utf-8")


def _value_or(value, default=" "):
    """Return ``default`` for ``None`` or empty values, else ``value`` itself."""
    return value if value else default


def _organisation_row(org):
    """Build the export row (column name -> cell value) for one record."""
    # The organisation lookup may have produced nothing usable; fall back to
    # blanks so this record still fills every column.
    orginfo = org[2] if org[2] else [" ", " ", " "]
    orgtype, observance, regname = (_value_or(item) for item in orginfo[:3])

    row = {
        'ref': _value_or(org[0]),
        'regest': _value_or(org[5]),
        'Regest Text': _value_or(org[1]),
        'reg Name': regname,
        'type': orgtype,
        'observance': observance,
        'fn role': _value_or(org[4], "no role given"),
        'Events ref': _value_or(org[3]),
        # Date cells default to empty and are filled in below.
        'regest date': "",
        'from': "",
        'to': "",
        'year': "",
        'fromyear': "",
        'toyear': "",
    }

    dates = org[6] if org[6] else []
    if len(dates) == 1:
        row['regest date'] = dates[0]
        row['year'] = dates[0].year
    elif len(dates) == 2:
        row['from'] = dates[0]
        row['to'] = dates[1]
        row['fromyear'] = dates[0].year
        row['toyear'] = dates[1].year
    # Any other length (no date, malformed date list) leaves the date cells
    # empty instead of shortening the date columns.
    return row


def createneworganisation(rs, listxml):
    """Build the base record for one organisation reference element.

    The record is ``[ref, text, orginfo, eventref, role]``: the ``ref``
    attribute without ``#``, the cleaned text content of ``rs``, the result
    of the organisation-list lookup, the reference of the enclosing event
    and the role reported by ``getrstyperole``.
    """
    cleanedtext = cleanuptextnodestostring(rs.xpath(".//text()"))
    role = getrstyperole(rs)
    eventref = getorgevent(rs)
    print(rs)
    orgid = rs.attrib["ref"].replace("#", "")
    orginfo = getinformationsfromorglistxml(orgid, listxml)
    return [orgid, cleanedtext, orginfo, eventref, role]

def getrstyperole(node):
    """Return the ``role`` attribute of the direct parent if it has ``type="fn"``.

    The string ``"None"`` is returned when no such parent is found.
    """
    fnparent = node.find("./..[@type='fn']", ns)
    if fnparent is None:
        return "None"
    return fnparent.attrib['role']

def getorgevent(node):
    """Return the ``ref`` attribute of the event element enclosing ``node``.

    The event element is obtained from ``getparentevent``; an empty string is
    returned when that lookup yields ``None``.
    """
    enclosing = getparentevent(node)
    return "" if enclosing is None else enclosing.attrib['ref']


def getparentevent(node):
    """Return the first ancestor of ``node`` that has ``type="event"``.

    Args:
        node: Element offering an ``xpath()`` method.

    Returns:
        The first element of the ``ancestor::*[@type='event']`` result, or
        ``None`` when ``node`` has no such ancestor.
    """
    parents = node.xpath("ancestor::*[@type='event']")
    # An element outside any event has no matching ancestor. Report that as
    # None (getorgevent already checks for it) instead of raising IndexError.
    # NOTE(review): XPath results are presumably in document order, which
    # makes index 0 the outermost matching ancestor - confirm that the
    # outermost rather than the nearest event is wanted.
    return parents[0] if parents else None




def createdate(node):
    """Read the date or date range stored on a date element.

    Args:
        node: Element whose ``attrib`` mapping may contain ``when`` or
            ``from``/``to``; ``None`` is accepted.

    Returns:
        ``[date]`` for a ``when`` attribute, for a range whose ends are
        equal, or for a ``from`` without ``to``; ``[fromdate, todate]`` for
        a real range; ``[]`` when ``node`` is ``None`` or carries neither
        ``when`` nor ``from``.

    Raises:
        ValueError: If an attribute value is in neither supported format.
    """
    if node is None:
        return []

    attributes = node.attrib
    if 'when' in attributes:
        return [_parsedatevalue(attributes['when'])]

    if 'from' in attributes:
        fromdate = _parsedatevalue(attributes['from'])
        # An open-ended range (no "to") is treated as a single day.
        todate = _parsedatevalue(attributes['to']) if 'to' in attributes else fromdate
        if fromdate == todate:
            return [fromdate]
        return [fromdate, todate]

    # No usable attribute: return an empty list so callers can still take
    # len() of the result.
    return []


def _parsedatevalue(value):
    """Parse ``YYYYMMDD`` (the form used so far) or ISO ``YYYY-MM-DD``."""
    for pattern in ('%Y%m%d', '%Y-%m-%d'):
        try:
            return datetime.strptime(value, pattern).date()
        except ValueError:
            continue
    raise ValueError("unsupported date value: %r" % (value,))


def getinformationsfromorglistxml(ref, orglist):
    """Look up an organisation in the organisation list.

    Args:
        ref: Identifier of the organisation; any ``#`` is removed before the
            lookup against ``xml:id``.
        orglist: Element to search for ``tei:org`` entries.

    Returns:
        A three-item list ``[type, description, name]``:

        * ``type`` is the ``type`` attribute of the entry.
        * ``description`` is the text of the entry's ``tei:desc``, filled
          only when ``type`` contains ``"Kloster_"``.
        * ``name`` is the text of the entry's ``tei:reg``.

        Pieces that are not available are returned as ``" "``. A list is
        always returned, including when the entry is missing or has no
        ``type`` attribute.
    """
    ref = ref.replace("#", '')
    org = orglist.find(".//tei:org[@xml:id='"+ref+"']", ns)
    if org is None:
        return [" ", " ", " "]

    name = " "
    regname = org.find(".//tei:reg", ns)
    if regname is not None:
        name = regname.text

    orgtype = org.attrib.get('type')
    if orgtype is None:
        # Without this branch the function fell off the end and returned
        # None for entries lacking a type attribute.
        return [" ", " ", name]

    if "Kloster_" in orgtype:
        # NOTE(review): the previous code also read tei:label into an unused
        # variable and returned the tei:desc text. The second slot is
        # exported as the "observance" column - confirm whether the label
        # text was meant to be returned here instead.
        descnode = org.find(".//tei:desc", ns)
        desc = descnode.text if descnode is not None else " "
        return [orgtype, desc, name]

    return [orgtype, " ", name]




def cleanuptextnodestostring(textnodes):
    """Join text nodes into one string with normalised whitespace.

    Every node is converted with ``str()`` and split on any run of
    whitespace (spaces, tabs, line breaks); the resulting words of all nodes
    are joined with single spaces.

    Splitting on whitespace instead of on a single space means that words
    separated only by a line break stay separate, whitespace-only nodes
    contribute nothing (no stray spaces), and single-character words are
    kept.

    Args:
        textnodes: Iterable of text nodes (anything ``str()`` accepts).

    Returns:
        The cleaned text; ``""`` when there is no non-whitespace content.
    """
    words = []
    for textnode in textnodes:
        words.extend(str(textnode).split())
    return " ".join(words)

# Run the extraction: load the organisation list, then process every regest
# file in the "done" directory, which ends with the xlsx/csv export.
# NOTE(review): despite its name, personlistxml holds the root of orgList.xml.
personlistxml = readlist("../../data/lists", "orgList.xml")
readfiles("../../regests_QGW_PR_Stadt_ab_1400/done", personlistxml)





1482.xml
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4d165b48>
1483.xml
1484.xml
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4d165f08>
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4d165bc8>
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4d1659c8>
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4d1651c8>
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4d165748>
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4d16c108>
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4d16c088>
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4d16c0c8>
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4d16c188>
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4d16c248>
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4d16c288>
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4d16c2c8>
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4d16c348>
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4d16c388>
1485.xml
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4d16ff48>
1486.xml
<Element {h

1565.xml
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4fc64548>
1566.xml
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4fc69588>
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4fc691c8>
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4fc69208>
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4fc69388>
1567.xml
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4fc69788>
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4fc696c8>
1568.xml
1569.xml
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4fc69708>
1570.xml
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4fc69e88>
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4fc69f08>
1571.xml
1572.xml
1573.xml
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4fc67088>
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4fc67108>
1574.xml
1575.xml
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4fc67348>
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21a4fc67288>
1576.xml
1577.xml
1578.xml
<Element {http://www.tei-c.org/ns/1.0}rs at 0x21