In [1]:
import os
import pandas as pd
import re
from lxml import etree
import datetime
from datetime import datetime
import time


# Prefix -> namespace URI map handed to every namespace-aware lxml
# find()/xpath() call in this cell.
ns = {
    "tei": "http://www.tei-c.org/ns/1.0",
    "xml": "http://www.w3.org/XML/1998/namespace",
}


def readlist(pathtofile, filename):
    """Parse an XML file and return its root element.

    Args:
        pathtofile: Directory containing the file (no trailing slash needed).
        filename: Name of the XML file inside ``pathtofile``.

    Returns:
        The root element of the parsed document.
    """
    return etree.parse(pathtofile + "/" + filename).getroot()


def processNodes(xpath, xml):
    """Evaluate ``xpath`` on ``xml`` and return the matched text as one string.

    Args:
        xpath: XPath expression, evaluated with the module-level ``ns`` map.
        xml: Element the expression is evaluated against.

    Returns:
        * ``" "`` (a single space) when nothing matches;
        * for exactly one match, the whitespace-normalised value of
          ``./text()`` of that match;
        * for several matches, the whitespace-normalised full text of each
          match, joined with ``"; "``.
    """
    matches = xml.xpath(xpath, namespaces=ns)
    count = len(matches)

    if count == 0:
        return " "

    if count == 1:
        # NOTE: normalize-space() on the node-set ./text() only uses the
        # first direct text node, so text inside child elements is not
        # included here -- unlike the multi-match branch below.
        return matches[0].xpath("normalize-space(./text())")

    return "; ".join(match.xpath("normalize-space(.)") for match in matches)

def readfiles(pathtofiles, placelistxml):
    """Collect every place reference found in a directory of TEI files.

    Each file in ``pathtofiles`` is parsed, and every element whose ``ref``
    or ``corresp`` attribute contains ``pl__`` yields one record.  The record
    combines information about the reference (linking attribute, enclosing
    ``rs[@type='fn']`` role, enclosing ``rs[@type='event']`` ref, file name,
    date) with details looked up in ``placelistxml`` via ``xml:id``.  All
    records are finally passed to ``writecolumns``.

    Args:
        pathtofiles: Directory whose entries are all parsed as XML.
        placelistxml: Root element of the place list used for the lookup.

    Returns:
        None.  The collected records are handed to ``writecolumns``.
    """
    # Place-list fields copied into every record: record key -> XPath that is
    # evaluated relative to the matching tei:place element.
    placefields = (
        ('name', ".//tei:placeName"),
        ('reg', "./tei:placeName//tei:reg"),
        ('orig', "./tei:placeName//tei:orig"),
        ('addName', "./tei:placeName//tei:addName"),
        ('note', ".//tei:note"),
        ('latlng', ".//tei:geo"),
        ('authority', ".//tei:idno"),
    )

    listofplaces = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order of the export reproducible between runs.
    for filename in sorted(os.listdir(pathtofiles)):
        regestroot = etree.parse(os.path.join(pathtofiles, filename)).getroot()

        # Prefer the creation date; fall back to the origin element.
        creationdate = regestroot.find(".//tei:creation/tei:date", ns)
        origindate = regestroot.find(".//tei:history/tei:origin", ns)
        if creationdate is not None:
            date = createdate(creationdate)
        elif origindate is not None:
            date = createdate(origindate)
        else:
            date = " "

        nodes = regestroot.xpath(".//*[contains(@ref, 'pl__' ) or contains(@corresp, 'pl__')]")
        for node in nodes:
            placedict = {}
            print("--------------------------")
            print(filename)

            # The XPath above guarantees at least one of the two attributes
            # exists; ``ref`` takes precedence when both are present.
            if 'ref' in node.attrib:
                placeid = node.attrib['ref']
                placedict['linking'] = 'ref'
            else:
                placeid = node.attrib['corresp']
                placedict['linking'] = 'corresp'
            placedict['id'] = placeid

            parentfn = node.xpath("parent::tei:rs[@type='fn']", namespaces=ns)
            if parentfn:
                placedict['fn'] = parentfn[0].attrib['role']
                event = parentfn[0].xpath("parent::tei:rs[@type='event']", namespaces=ns)
                placedict['event'] = event[0].attrib['ref'] if event else "None"
            else:
                placedict['fn'] = "None"
                placedict['event'] = "None"

            placedict['date'] = date
            placedict['xml'] = filename

            # Pass the id as an XPath variable instead of splicing it into the
            # expression, so quotes inside an id cannot break the query.
            placenode = placelistxml.xpath(
                ".//tei:place[@xml:id=$placeid]", namespaces=ns, placeid=placeid
            )

            if placenode:
                place = placenode[0]
                placedict['type'] = place.attrib.get('type', " ")
                for key, path in placefields:
                    placedict[key] = processNodes(path, place)
            else:
                # No entry in the place list: keep the reference in the
                # export but leave the place-derived fields blank, rather
                # than reusing the place resolved for a previous reference.
                print("no tei:place entry found for " + placeid)
                placedict['type'] = " "
                for key, _ in placefields:
                    placedict[key] = " "

            listofplaces.append(placedict)
    writecolumns(listofplaces)

def writecolumns(list, outpath="../data/AllPlaces.csv"):
    """Write the collected place records to a tab-separated UTF-8 file.

    Args:
        list: Iterable of dicts, one per place reference.  Every dict must
            provide the keys ``id``, ``name``, ``reg``, ``orig``, ``type``,
            ``linking``, ``fn``, ``event``, ``xml``, ``date``, ``note``,
            ``latlng`` and ``authority``.  (The parameter name shadows the
            builtin; it is kept for backward compatibility.)
        outpath: Destination file.  Defaults to the location that was
            previously hard-coded.

    Raises:
        KeyError: If a record lacks one of the required keys.
    """
    # Output column -> record key, in the column order of the export.
    # ``addName`` is collected by the caller but deliberately not exported.
    columns = (
        ('ID', 'id'),
        ('name', 'name'),
        ('reg', 'reg'),
        ('orig', 'orig'),
        ('type', 'type'),
        ('linking', 'linking'),
        ('function', 'fn'),
        ('event', 'event'),
        ('xml', 'xml'),
        ('date', 'date'),
        ('note', 'note'),
        ('latlng', 'latlng'),
        ('authority', 'authority'),
    )

    records = [place for place in list]
    frame = pd.DataFrame()
    for column, key in columns:
        frame[column] = [place[key] for place in records]

    frame.to_csv(outpath, sep="\t", encoding="utf-8")


def createdate(node):
    """Turn the dating attributes of an element into a list of date parts.

    The attributes are checked in the order ``when``, ``notAfter``, ``from``.

    Args:
        node: Element exposing a mapping-like ``attrib``.

    Returns:
        * ``[date, year]`` for ``when``;
        * ``[notAfter]`` (the raw attribute string) for ``notAfter``;
        * ``[fromdate, year]`` for ``from`` when ``to`` is missing or equal
          to ``from``;
        * ``[fromdate, todate, year]`` for a ``from``/``to`` range;
        * ``None`` when none of these attributes is present.

        Dates are ``datetime.date`` objects; ``year`` is the four-digit year
        string of the (start) date.

    Raises:
        ValueError: If ``when``, ``from`` or ``to`` is not a full
            ``YYYY-MM-DD`` date.
    """
    attrib = node.attrib

    if 'when' in attrib:
        day = datetime.strptime(attrib['when'].replace('-', ''), '%Y%m%d').date()
        return [day, str(day).split("-")[0]]

    if 'notAfter' in attrib:
        return [attrib['notAfter']]

    if 'from' in attrib:
        fromdate = datetime.strptime(attrib['from'].replace('-', ''), '%Y%m%d').date()
        year = str(fromdate).split("-")[0]

        # An open-ended range (``from`` without ``to``) is reported as its
        # start date instead of failing with a KeyError.
        to = attrib.get('to')
        if to is None:
            return [fromdate, year]

        todate = datetime.strptime(to.replace('-', ''), '%Y%m%d').date()
        if fromdate == todate:
            return [fromdate, year]
        return [fromdate, todate, year]

    # No supported dating attribute on this element.
    return None



# Driver: load the place list, then process the source directory.
placelistxml = readlist("../../indices/lists", "placeList.xml")

# Every descendant element of every tei:place entry.
# NOTE(review): `nodes` is not used by the statements below -- confirm
# whether anything outside this cell still relies on it.
nodes = placelistxml.xpath(".//tei:place//*", namespaces=ns)
# Collects the place references of all files in the directory; readfiles()
# passes the result on to writecolumns().
readfiles("../../sources/Satzbuch_CD/1448/done", placelistxml)
print("done")


--------------------------
00054-eintrag_vom_1448-06-10.xml
--------------------------
00054-eintrag_vom_1448-06-10.xml
--------------------------
00054-eintrag_vom_1448-06-10.xml
--------------------------
00054-eintrag_vom_1448-06-10.xml
--------------------------
00054-eintrag_vom_1448-06-10.xml
--------------------------
00055-eintrag_vom_1448-06-21.xml
--------------------------
00055-eintrag_vom_1448-06-21.xml
--------------------------
00055-eintrag_vom_1448-06-21.xml
--------------------------
00055-eintrag_vom_1448-06-21.xml
--------------------------
00055-eintrag_vom_1448-06-21.xml
--------------------------
00055-eintrag_vom_1448-06-21.xml
--------------------------
00056-eintrag_vom_1448-08-09.xml
--------------------------
00056-eintrag_vom_1448-08-09.xml
--------------------------
00056-eintrag_vom_1448-08-09.xml
--------------------------
00056-eintrag_vom_1448-08-09.xml
--------------------------
00056-eintrag_vom_1448-08-09.xml
--------------------------
00156-eintrag