In [69]:
import os

def load(d):
    for f in os.listdir(d):
        if f.endswith('.txt'):
            f = os.path.join(d, f)
            yield load_file(f)


def load_file(f):
    assert f.endswith('.txt')
    annfile = f[:-4] + '.ann'
    ents = []
    for ln in open(annfile):
        _, typ, start, end, text = ln.strip().split(None, 4)
        ents.append((int(start), int(end), typ, text))
    return f, open(f).read(), {"entities": ents}




data_dir = "/Users/marijnkoolen/Code/Huygens/brat/data/rsg1725/"


In [70]:
gen = load(data_dir)

training_data = [(f, text, ents["entities"]) for f, text, ents in gen]
len(training_data)

56

In [60]:
from collections import defaultdict
from itertools import combinations

def filter_overlapping(annotations):
    covered = defaultdict(set)
    for anno in annotations:
        for offset in range(anno[0], anno[1]+1):
            covered[offset].add(anno)
    overlapping = defaultdict(set)
    for offset in covered:
        if len(covered[offset]) > 1:
            for ent1, ent2 in combinations(covered[offset], 2):
                overlapping[ent1].add(ent2)
                overlapping[ent2].add(ent1)
                print(offset, ent1, ent2)
    non_overlapping = []
    filtered = []
    for anno in annotations:
        if anno not in overlapping:
            non_overlapping.append(anno)
        elif anno in filtered:
            print('skipping filtered', anno)
            continue
        else:
            print('overlapping first', anno)
            non_overlapping.append(anno)
            for overlapping_anno in overlapping[anno]:
                filtered.append(overlapping_anno)
    return non_overlapping

In [65]:
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("nl")
#training_data = [
#  ("Tokyo Tower is 333m tall.", [(0, 11, "BUILDING")]),
#]
# the DocBin will store the example documents
db = DocBin()
for fname, text, annotations in training_data:
    print(len(text), fname)
    doc = nlp(text)
    print(len(doc.text))
    ents = []
    annotations = filter_overlapping(annotations)
    for start, end, label in annotations:
        span = doc.char_span(start, end, label=label)
        if span:
            ents.append(span)
        if end == start + 1:
            print(text[start:end], start, end, label, span)
    try:
        doc.ents = ents
    except ValueError:
        for anno in annotations:
            print(anno, text[anno[0]:anno[1]])
        print(fname)
        raise
    except TypeError:
        print(annotations)
        print(ents)
        print(fname)
        raise
    except IndexError:
        print(text)
        print(len(text))
        print(annotations)
        print(len(doc.text))
        print(fname)
        raise
    db.add(doc)
    
    
version = 1

db.to_disk(f"./train_{version}.spacy")


1113 /Users/marijnkoolen/Code/Huygens/temp/nerdata/republic-brat/rsg1725/NL-HaNA_1.01.02_3780_0539.txt
1113
9722 /Users/marijnkoolen/Code/Huygens/temp/nerdata/republic-brat/rsg1725/NL-HaNA_1.01.02_3780_0505.txt
9722
8275 /Users/marijnkoolen/Code/Huygens/temp/nerdata/republic-brat/rsg1725/NL-HaNA_1.01.02_3780_0511.txt
8275
8252 /Users/marijnkoolen/Code/Huygens/temp/nerdata/republic-brat/rsg1725/NL-HaNA_1.01.02_3780_0510.txt
8252
7075 /Users/marijnkoolen/Code/Huygens/temp/nerdata/republic-brat/rsg1725/NL-HaNA_1.01.02_3780_0504.txt
7075
2655 (2655, 2661, 'PER') (2655, 2661, 'LOC')
2656 (2655, 2661, 'PER') (2655, 2661, 'LOC')
2657 (2655, 2661, 'PER') (2655, 2661, 'LOC')
2658 (2655, 2661, 'PER') (2655, 2661, 'LOC')
2659 (2655, 2661, 'PER') (2655, 2661, 'LOC')
2660 (2655, 2661, 'PER') (2655, 2661, 'LOC')
2661 (2655, 2661, 'PER') (2655, 2661, 'LOC')
overlapping first (2655, 2661, 'LOC')
skipping filtered (2655, 2661, 'PER')
7524 /Users/marijnkoolen/Code/Huygens/temp/nerdata/republic-brat/rsg1

In [72]:
import re

from nltk import sent_tokenize, word_tokenize


def get_text_sents(text, annotations):
    sents = []
    #nltk_sents = [sent for sent in sent_tokenize(text)]
    nltk_sents = text.split('\n')
    sent_start = 0
    for si, sent in enumerate(nltk_sents):
        sent_end = sent_start + len(sent)
        for anno in annotations:
            if anno[0] < sent_end and anno[1] > sent_end:
                print('MERGE SENTENCES')
                print(anno)
                print(len(text))
                print('\t', sent_start, sent_end, sent)
                print('\t', nltk_sents[si+1])
        sent_start = sent_end + 1
        #sent_start = text.index(sent)
    sent_start = 0
    for sent in nltk_sents:
        sent_end = sent_start + len(sent)
        sents.append({
            "start": sent_start, 
            "end": sent_end,
            "text": sent
        })
        sent_start = sent_end + 1
    return sents


def get_sentence_entities(sent, annotations):
    entities = []
    for anno in annotations:
        #print("ANNOTATION:", anno)
        if anno[1] <= sent["start"]:
            #print('\tbefore start')
            continue
        elif anno[0] > sent["end"]:
            #print('\tafter end')
            continue
        elif anno[0] < sent["start"] or anno[1] > sent["end"]:
            print(anno)
            print(sent)
            raise IndexError("annotation partially overlaps sentence")
        #print('\twithin sent')
        entities.append(anno)
    return entities


def map_sentence_entities(sent, annotations, suffix_size: int = 3):
    entities = []
    for anno in annotations:
        ent_start = anno[0] - sent["start"]
        ent_end = anno[1] - sent["start"]
        entity_text = sent["text"][ent_start:ent_end]
        if entity_text != anno[3]:
            print("EXTRACTED RANGE DOES NOT MATCH ANNOTATION")
            print(anno)
            print(ent_start, ent_end, entity_text)
        prefix = sent["text"][:ent_start].strip()
        postfix = sent["text"][ent_end:].strip()
        prefix_terms = re.split(r'\W+', prefix)
        prefix_terms = prefix_terms[-suffix_size:] if suffix_size < len(prefix_terms) else prefix_terms
        postfix_terms = re.split(r'\W+', postfix)
        postfix_terms = postfix_terms[:suffix_size] if suffix_size < len(postfix_terms) else postfix_terms
        entity = {
            "start": ent_start,
            "end": ent_end,
            "start_found": False,
            "end_found": False,
            "text": entity_text,
            "type": anno[2],
            "prefix": prefix_terms,
            "postfix": postfix_terms
        }
        entities.append(entity)
    return entities
    

def get_sents_entities(training_data):
    for fname, text, annotations in training_data[1:]:
        print(fname)
        #print(len(text), text)
        #print(annotations)
        for sent in get_text_sents(text, annotations):
            #print(sent)
            entities = get_sentence_entities(sent, annotations)
            entities = map_sentence_entities(sent, entities)
            yield sent, entities, annotations


def get_sent_words_tags(sent, entities, annotations):
    words = word_tokenize(sent["text"])
    offset = 0 #sent["start"]
    for word in words:
        tag = "O"
        for entity in entities:
            if entity["start"] <= offset < entity["end"]:
                tag = f"I-{entity['type']}"
                if offset == entity["start"] and offset + len(word) < entity["end"]:
                    tag = f"B-{entity['type']}"
            if offset == entity["start"]:
                entity["start_found"] = True
            if offset + len(word) == entity["end"]:
                entity["end_found"] = True
        yield word, tag
        offset += len(word)
        if offset < len(sent["text"]) and sent["text"][offset] == ' ':
            offset += 1

gen = load(data_dir)

training_data = [(f, text, ents["entities"]) for f, text, ents in gen]
len(training_data)


output_file = "republic_ner_train.txt"

with open(output_file, 'wt') as fh:
    for sent, entities, annotations in get_sents_entities(training_data):
        for word, tag in get_sent_words_tags(sent, entities, annotations):
            fh.write(f"{word} {tag}\n")
        fh.write("\n")
        for entity in entities:
            if not entity["start_found"] or not entity["end_found"]:
                print(sent)
                print(entity)
                print(entities)
                print(annotations)
                raise IndexError("Unmatched entity words")




/Users/marijnkoolen/Code/Huygens/brat/data/rsg1725/NL-HaNA_1.01.02_3780_0505.txt
/Users/marijnkoolen/Code/Huygens/brat/data/rsg1725/NL-HaNA_1.01.02_3780_0511.txt
/Users/marijnkoolen/Code/Huygens/brat/data/rsg1725/NL-HaNA_1.01.02_3780_0510.txt
/Users/marijnkoolen/Code/Huygens/brat/data/rsg1725/NL-HaNA_1.01.02_3780_0504.txt
/Users/marijnkoolen/Code/Huygens/brat/data/rsg1725/NL-HaNA_1.01.02_3780_0538.txt
/Users/marijnkoolen/Code/Huygens/brat/data/rsg1725/NL-HaNA_1.01.02_3780_0499.txt
/Users/marijnkoolen/Code/Huygens/brat/data/rsg1725/NL-HaNA_1.01.02_3780_0512.txt
/Users/marijnkoolen/Code/Huygens/brat/data/rsg1725/NL-HaNA_1.01.02_3780_0506.txt
/Users/marijnkoolen/Code/Huygens/brat/data/rsg1725/NL-HaNA_1.01.02_3780_0507.txt
/Users/marijnkoolen/Code/Huygens/brat/data/rsg1725/NL-HaNA_1.01.02_3780_0513.txt
/Users/marijnkoolen/Code/Huygens/brat/data/rsg1725/NL-HaNA_1.01.02_3780_0498.txt
/Users/marijnkoolen/Code/Huygens/brat/data/rsg1725/NL-HaNA_1.01.02_3780_0488.txt
/Users/marijnkoolen/Code/Huy

In [74]:
import random


corpus_dir = "../../ground_truth/entities/flair-training"
train_file = f"{corpus_dir}/train.txt"
test_file = f"{corpus_dir}/test.txt"
valid_file = f"{corpus_dir}/valid.txt"

fh_train = open(train_file, 'wt')
fh_test = open(test_file, 'wt')
fh_valid = open(valid_file, 'wt')

with open(output_file, 'wt') as fh:
    for sent, entities, annotations in get_sents_entities(training_data):
        draw = random.random()
        if draw < 0.1:
            fh = fh_test
        elif draw < 0.2:
            fh = fh_valid
        else:
            fh = fh_train
        for word, tag in get_sent_words_tags(sent, entities, annotations):
            fh.write(f"{word} {tag}\n")
        fh.write("\n")
        for entity in entities:
            if not entity["start_found"] or not entity["end_found"]:
                print(sent)
                print(entity)
                print(entities)
                print(annotations)
                raise IndexError("Unmatched entity words")


fh_train.close()
fh_test.close()
fh_valid.close()


/Users/marijnkoolen/Code/Huygens/brat/data/rsg1725/NL-HaNA_1.01.02_3780_0505.txt
/Users/marijnkoolen/Code/Huygens/brat/data/rsg1725/NL-HaNA_1.01.02_3780_0511.txt
/Users/marijnkoolen/Code/Huygens/brat/data/rsg1725/NL-HaNA_1.01.02_3780_0510.txt
/Users/marijnkoolen/Code/Huygens/brat/data/rsg1725/NL-HaNA_1.01.02_3780_0504.txt
/Users/marijnkoolen/Code/Huygens/brat/data/rsg1725/NL-HaNA_1.01.02_3780_0538.txt
/Users/marijnkoolen/Code/Huygens/brat/data/rsg1725/NL-HaNA_1.01.02_3780_0499.txt
/Users/marijnkoolen/Code/Huygens/brat/data/rsg1725/NL-HaNA_1.01.02_3780_0512.txt
/Users/marijnkoolen/Code/Huygens/brat/data/rsg1725/NL-HaNA_1.01.02_3780_0506.txt
/Users/marijnkoolen/Code/Huygens/brat/data/rsg1725/NL-HaNA_1.01.02_3780_0507.txt
/Users/marijnkoolen/Code/Huygens/brat/data/rsg1725/NL-HaNA_1.01.02_3780_0513.txt
/Users/marijnkoolen/Code/Huygens/brat/data/rsg1725/NL-HaNA_1.01.02_3780_0498.txt
/Users/marijnkoolen/Code/Huygens/brat/data/rsg1725/NL-HaNA_1.01.02_3780_0488.txt
/Users/marijnkoolen/Code/Huy