In [1]:

import os
import pandas as pd
import re
from lxml import etree
import datetime
from datetime import datetime
import time



ns = {'tei': 'http://www.tei-c.org/ns/1.0', "xml": "http://www.w3.org/XML/1998/namespace"}



def readlist(pathtofile, filename):
    """Parse an XML file and return its root element.

    Args:
        pathtofile: Directory that contains the file.
        filename: Name of the XML file inside ``pathtofile``.

    Returns:
        The root element of the parsed document.
    """
    location = "/".join((pathtofile, filename))
    return etree.parse(location).getroot()

def createdate(node):
    """Extract dating information from the date attributes of an element.

    The attributes are checked in the order ``when``, ``notAfter``, ``from``;
    the first one present decides the shape of the result.

    Args:
        node: Any object exposing a mapping-like ``attrib`` (e.g. an lxml
            element).

    Returns:
        A list whose first item is always the primary date:

        * ``when``: ``[date, year]``
        * ``notAfter``: ``[raw attribute string]`` (not parsed)
        * ``from``/``to``: ``[fromdate, year]`` if both dates are equal or
          ``to`` is missing, otherwise ``[fromdate, todate, year]``
        * none of the above: ``[" "]`` (a blank placeholder)

        ``year`` is the year part of the ISO representation of the date, as a
        string.

    Raises:
        ValueError: If a ``when``/``from``/``to`` value is not a full
            ``YYYY-MM-DD`` (or ``YYYYMMDD``) date.
    """
    attributes = node.attrib

    def parse(value):
        # Hyphens are stripped so both 'YYYY-MM-DD' and 'YYYYMMDD' are accepted.
        return datetime.strptime(value.replace('-', ''), '%Y%m%d').date()

    if 'when' in attributes:
        date = parse(attributes['when'])
        return [date, str(date).split("-")[0]]

    if 'notAfter' in attributes:
        return [attributes['notAfter']]

    if 'from' in attributes:
        fromdate = parse(attributes['from'])
        # An open-ended range (no 'to') used to raise KeyError; treat it as a
        # single-day date instead.
        todate = parse(attributes['to']) if 'to' in attributes else fromdate
        year = str(fromdate).split("-")[0]
        if fromdate == todate:
            return [fromdate, year]
        return [fromdate, todate, year]

    # Previously the function fell off the end and returned None, which made
    # callers that index the result with [0] fail. Return a blank placeholder
    # in the same list shape as the other branches.
    return [" "]


def getplaceName(ref, xml):
    """Look up the name of a place by its ``xml:id``.

    Args:
        ref: Value to match against the ``xml:id`` attribute of ``tei:place``.
        xml: Element to search in (descendants are searched).

    Returns:
        The whitespace-normalised string value of the first matching
        ``tei:place/tei:placeName``; an empty string if nothing matches.
    """
    # The id is passed as an XPath variable instead of being concatenated into
    # the expression, so values containing quotes cannot break or alter the
    # query.
    return xml.xpath(
        "normalize-space(.//tei:place[@xml:id=$ref]/tei:placeName)",
        namespaces=ns,
        ref=ref,
    )

def processparentplace(topo, subdict):
    """Record the place reference of the element that directly encloses ``topo``.

    If the parent of ``topo`` is a ``tei:rs`` with ``type='place'``, its
    ``ref`` attribute is stored in ``subdict['place']``; otherwise a single
    blank is stored. ``subdict`` is modified in place and nothing is returned.
    """
    enclosing = topo.xpath("parent::tei:rs[@type='place']", namespaces=ns)
    subdict['place'] = enclosing[0].attrib['ref'] if enclosing else " "

def readfiles(pathtofiles, relationtype, placelist):
    """Collect ``tei:roleName`` annotations of one type from a folder of XML files.

    For every XML file in ``pathtofiles`` the document date is determined
    (from ``tei:creation/tei:date``, else ``tei:history/tei:origin``), and one
    record is built per ``tei:roleName`` whose ``type`` equals
    ``relationtype``. The name of each file that contains at least one match
    is printed. All records are finally passed to ``writetable``.

    Args:
        pathtofiles: Directory containing the XML files.
        relationtype: Value of ``roleName/@type`` to extract. For ``"topo"``
            the name of the related place is resolved as well.
        placelist: Root element of the place list used to resolve names.
    """
    listofplaces = []
    # os.listdir returns entries in arbitrary order; sort for reproducible
    # output and skip anything that is not an XML file so a stray file does
    # not abort the whole run with a parse error.
    for filename in sorted(os.listdir(pathtofiles)):
        if not filename.lower().endswith(".xml"):
            continue
        regestroot = etree.parse(os.path.join(pathtofiles, filename)).getroot()

        creationdate = regestroot.find(".//tei:creation/tei:date", ns)
        origindate = regestroot.find(".//tei:history/tei:origin", ns)
        # `or " "` guards against createdate yielding nothing for an element
        # without usable date attributes; " " is the placeholder for "no date".
        if creationdate is not None:
            date = createdate(creationdate) or " "
        elif origindate is not None:
            date = createdate(origindate) or " "
        else:
            date = " "

        # The type is bound as an XPath variable rather than concatenated
        # into the expression.
        nodes = regestroot.xpath(
            ".//tei:roleName[@type=$reltype]",
            namespaces=ns,
            reltype=relationtype,
        )
        if not nodes:
            continue

        print("---------")
        print(filename)
        for node in nodes:
            subdict = {
                'topo': node.attrib.get('corresp', "NONE"),
                'date': date,
                'xml': filename,
            }
            processparentplace(node, subdict)
            subdict['placename'] = getplaceName(subdict['place'], placelist)
            # Compare by value: `is "topo"` tested object identity and only
            # worked by accident of string interning.
            if relationtype == "topo":
                subdict['relationplacename'] = getplaceName(subdict['topo'], placelist)
            # NOTE(review): in XPath 1.0, normalize-space() of a node-set uses
            # only the first node, so this captures the first descendant text
            # node only. If the full text content was intended,
            # "normalize-space(.)" would be needed - confirm before changing.
            subdict['text'] = node.xpath("normalize-space(.//text())")
            listofplaces.append(subdict)
    writetable(listofplaces, relationtype)


def writetable(list, relationtype, outpath="../data/PlaceRelations.csv"):
    """Write the collected place records to a tab-separated file.

    Args:
        list: Iterable of record dicts with the keys ``place``,
            ``placename``, ``text``, ``topo``, ``date`` and ``xml`` (plus
            ``relationplacename`` when ``relationtype`` is ``"topo"``).
            The parameter name shadows the builtin but is kept for
            compatibility with existing callers.
        relationtype: Relation type the records were collected for. For
            ``"topo"`` an extra ``relation name`` column is written.
        outpath: Destination file. Defaults to the path that was previously
            hard-coded.

    The columns are written in the order ``place``, ``name``, ``text``,
    ``relation``, (``relation name``,) ``xml``, ``date``; only the first item
    of each record's ``date`` is stored.
    """
    # Compare by value: `is "topo"` tested object identity and only worked
    # by accident of string interning.
    include_relation_name = relationtype == "topo"

    columns = ['place', 'name', 'text', 'relation']
    if include_relation_name:
        columns.append('relation name')
    columns += ['xml', 'date']

    rows = []
    for place in list:
        date = place['date']
        row = {
            'place': place['place'],
            'name': place['placename'],
            'text': place['text'],
            'relation': place['topo'],
            'xml': place['xml'],
            # A missing/empty date becomes the blank placeholder instead of
            # raising on the [0] lookup.
            'date': date[0] if date else " ",
        }
        if include_relation_name:
            # .get(): tolerate records that were built without the name.
            row['relation name'] = place.get('relationplacename', "")
        rows.append(row)

    # Passing `columns` fixes the column order and keeps the header even when
    # there are no rows.
    frame = pd.DataFrame(rows, columns=columns)
    frame.to_csv(outpath, sep="\t", encoding="utf-8")


# Load the place list once; its root element is handed to readfiles below.
placelistxml = readlist("../../indices/lists", "placeList.xml")


# Value of roleName/@type to extract from the source files.
relationtype="owner"
# Scan every file in the folder; readfiles passes the collected records on to
# writetable, which writes the table to disk.
readfiles("../../sources/Satzbuch_CD/1448/done", relationtype, placelistxml)
print("done")

---------
00054-eintrag_vom_1448-06-10.xml
---------
00055-eintrag_vom_1448-06-21.xml
---------
00056-eintrag_vom_1448-08-09.xml
---------
00156-eintrag_vom_1448-12-13.xml
---------
00226-eintrag_vom_1448-02-23.xml
---------
00227-eintrag_vom_1448-03-06.xml
---------
00228-eintrag_vom_1448-04-22.xml
---------
00229-eintrag_vom_1448-05-27.xml
---------
00230-eintrag_vom_1448-11-20.xml
---------
00231-eintrag_vom_1448-09-20.xml
---------
00299-eintrag_vom_1448-07-08.xml
---------
00300-eintrag_vom_1448-08-05.xml
---------
00349-eintrag_vom_1448-02-22.xml
---------
00351-eintrag_vom_1448-11-29.xml
---------
00406-eintrag_vom_1448-05-31.xml
---------
00604-eintrag_vom_1448-02-24.xml
---------
00605-eintrag_vom_1448-03-04.xml
---------
00606-eintrag_vom_1448-04-03.xml
---------
00607-eintrag_vom_1448-03-13.xml
---------
00608-eintrag_vom_1448-04-29.xml
---------
00609-eintrag_vom_1448-05-06.xml
---------
00610-eintrag_vom_1448-05-29.xml
---------
00611-eintrag_vom_1448-06-21.xml
---------
0