In [1]:
import xml.etree.ElementTree as ET

tree = ET.parse("annotations.xmi")
root = tree.getroot()
sentences = root.findall("{http:///uima/cas.ecore}Sofa")
text = sentences[0].attrib["sofaString"]


In [2]:
motifs = []
motif_anno = root.findall("{http:///webanno/custom.ecore}Motif")
for annotation in motif_anno:
    entry = {
            "begin":int(annotation.attrib["begin"]),
            "end":int(annotation.attrib["end"]),        
            "surface":text[int(annotation.attrib["begin"]):int(annotation.attrib["end"])],
            "type":annotation.attrib["MOTIF"],
            "value":annotation.attrib["Identifier"]
         }
    motifs.append(entry)

In [3]:
import random

print(random.choice(motifs))

{'begin': 64673, 'end': 64678, 'surface': 'Peter', 'type': 'SUBJ', 'value': '11H(PETER)'}


In [4]:
entities = []
works = []
artifacts = []
entities_anno = root.findall("{http:///webanno/custom.ecore}Entity")
for annotation in entities_anno:
    if annotation.attrib["Value"] in {"PER", "ORG", "LOC", "OTH"}:
        try:
            entry = {
                    "begin":int(annotation.attrib["begin"]),
                    "end":int(annotation.attrib["end"]),        
                    "surface":text[int(annotation.attrib["begin"]):int(annotation.attrib["end"])],
                    "type":annotation.attrib["Value"],
                    "value":annotation.attrib["id"]
                 }
            entities.append(entry)
        except KeyError:
            entry = {
                    "begin":int(annotation.attrib["begin"]),
                    "end":int(annotation.attrib["end"]),        
                    "surface":text[int(annotation.attrib["begin"]):int(annotation.attrib["end"])],
                    "type":annotation.attrib["Value"],
                    "value":"NIL"
                 }
            entities.append(entry)
    elif annotation.attrib["Value"]=="WORK":
        try:
            entry = {
                    "begin":int(annotation.attrib["begin"]),
                    "end":int(annotation.attrib["end"]),        
                    "surface":text[int(annotation.attrib["begin"]):int(annotation.attrib["end"])],
                    "type":annotation.attrib["Value"],
                    "value":annotation.attrib["id"]
                 }
            works.append(entry)
        except KeyError:
            entry = {
                    "begin":int(annotation.attrib["begin"]),
                    "end":int(annotation.attrib["end"]),        
                    "surface":text[int(annotation.attrib["begin"]):int(annotation.attrib["end"])],
                    "type":annotation.attrib["Value"],
                    "value":"NIL"
                 }
            works.append(entry)
    else:
        entry = {
                    "begin":int(annotation.attrib["begin"]),
                    "end":int(annotation.attrib["end"]),        
                    "surface":text[int(annotation.attrib["begin"]):int(annotation.attrib["end"])],
                    "type":annotation.attrib["Value"],
                    "value":"NIL"
                 }
        artifacts.append(entry)

In [5]:
print(random.choice(entities))

{'begin': 1544, 'end': 1550, 'surface': 'Vasari', 'type': 'PER', 'value': 'Q128027'}


In [6]:
time_expressions = []
time_anno = root.findall("{http:///webanno/custom.ecore}Time")
for annotation in time_anno:
    values = []
    for value in annotation:
        values.append(value.text)
    values.reverse()
    if len(values)==1:
        entry = {
                "begin":int(annotation.attrib["begin"]),
                "end":int(annotation.attrib["end"]), 
                "type":"DATE",
                "surface":text[int(annotation.attrib["begin"]):int(annotation.attrib["end"])],
                "value":values[0],
             }
        time_expressions.append(entry)
    else:
        entry = {
                "begin":int(annotation.attrib["begin"]),
                "end":int(annotation.attrib["end"]), 
                "type":"INTERVAL",
                "surface":text[int(annotation.attrib["begin"]):int(annotation.attrib["end"])],
                "value":";".join(values),
             }
        time_expressions.append(entry)
        

In [7]:
print(random.choice(time_expressions))

{'begin': 19098, 'end': 19117, 'type': 'INTERVAL', 'surface': 'until the year 1529', 'value': 'w;x;1529;z'}


In [8]:
sentences = text.split("\n")

In [9]:
line_breaks = [pos for pos, char in enumerate(text) if char == "\n"]

In [10]:
import bisect

def normalize_lst(line_breaks, annotations):
    new_lst = []
    for element in annotations:
        index = bisect.bisect(line_breaks, element["begin"])
        if index == 0:
            entry = {
                "id":index,
                "begin":element["begin"],
                "end":element["end"],
                "surface":element["surface"],
                "type":element["type"],
                "value":element["value"]
            }
            new_lst.append(entry)
        else:
            entry = {
                "id":index,
                "begin":element["begin"]-line_breaks[index-1],
                "end":element["end"]-line_breaks[index-1],
                "surface":element["surface"],
                "type":element["type"],
                "value":element["value"]
            }
            new_lst.append(entry)
    return new_lst

motifs = normalize_lst(line_breaks, motifs)              
entities = normalize_lst(line_breaks, entities)  
works = normalize_lst(line_breaks, works)  
artifacts = normalize_lst(line_breaks, artifacts)
time_expressions = normalize_lst(line_breaks, time_expressions)

In [11]:
print(random.choice(motifs))

{'id': 41, 'begin': 943, 'end': 956, 'surface': 'S. Laurentino', 'type': 'SUBJ', 'value': '11H(LAURENTINUS)'}


In [12]:
sentence_lst = [{"id":pos, "sentence":sentence} for pos, sentence in enumerate(sentences)]

In [13]:
print(sentence_lst[0])

{'id': 0, 'sentence': 'Finally, Pope Clement, having determined that Buonarroti should return to Florence to finish the works of the sacristy and library of S. Lorenzo, gave him orders, since many statues were wanting there, as will be told in the Life of Michelagnolo himself, that he should avail himself of the most able men that could be found, and particularly of Fra Giovanni Agnolo, employing the same methods as had been adopted by Antonio da San Gallo in order to finish the works of the Madonna di Loreto. Having therefore made his way with the Frate to Florence, Michelagnolo, in executing the statues of Duke Lorenzo and Duke Giuliano, employed the Frate much in polishing them and in executing certain difficult undercuttings; with which occasion Fra Giovanni Agnolo learned many things from that truly divine man, standing with attention  to watch him at work, and observing every least thing. Now among other statues that were wanting to the completion of that work, there were lacking

In [14]:
import csv

keys = sentence_lst[0].keys()
with open('sentences.csv', 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(sentence_lst)
    
keys = motifs[0].keys()
with open('motifs.csv', 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(motifs)
    
keys = artifacts[0].keys()
with open('artifacts.csv', 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(artifacts)
    
keys = works[0].keys()
with open('works.csv', 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(works)
    
keys = entities[0].keys()
with open('entities.csv', 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(entities)
    
keys = time_expressions[0].keys()
with open('dates.csv', 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(time_expressions)

    
