# Trouble with unaligned entities and tokens
(a.k.a. I don't really like spaCy's tokenizer)

In [None]:
# All objects are lists with aligned content,
# i.e. train_ids[i] is the document ID of the i-th element,
# train_texts[i] contains the text of that document,
# and train_entities[i] contains the target entities for that document.
# test_* variants are structured the same way; the first 70% of the samples
# (in iteration order) go to train, and the remaining 30% go to test.
train_ids = []
train_texts = []
train_entities = []
test_ids = []
test_texts = []
test_entities = []
# `doc_id` rather than `id`: a module-level loop variable named `id` would
# shadow the builtin for every cell that runs afterwards.
for count, (doc_id, (text, entity)) in enumerate(dataset.items()):
    # Switch the destination based on the percentage of elements already added.
    # dicts iterate in insertion order, so the split is deterministic and no RNG
    # needs seeding here.
    # NOTE(review): this also means the split is only as shuffled as the order in
    # which `dataset` was filled -- confirm that this is what is wanted.
    dst_ids, dst_texts, dst_entities = (
        (train_ids, train_texts, train_entities)
        if (100 * count / len(dataset)) < 70 else
        (test_ids, test_texts, test_entities)
    )
    dst_ids.append(doc_id)
    dst_texts.append(text)
    dst_entities.append(entity)
print(f"Produced a training set with {len(train_ids)} elements, and a test set with {len(test_ids)} elements.")

Produced a training set with 70 elements, and a test set with 30 elements.


In [None]:
# Peek at the first annotation of the first training document.
# Judging from the output, an annotation is a [start, end, label] triple.
train_entities[0][0]

[78, 83, 'LOC']

In [None]:
# Number of annotated entities per training document (not used further in this cell).
tt = [len(x) for x in train_entities]
# Sanity check: report every training document that has no annotated entity.
# `doc_id` rather than `id`, so the builtin is not shadowed at notebook scope.
for doc_id, ent_lst in zip(train_ids, train_entities):
    if not ent_lst:
        print(f"Error with document '{doc_id}'.")

In [None]:
import spacy
from spacy.tokens import DocBin

def prepare_save_dataset(ids, texts, entities, dst_path, alignment_mode="strict"):
    """Convert annotated texts to spaCy docs and serialize them as a DocBin.

    Args:
        ids: Document identifiers, aligned with ``texts`` and ``entities``.
        texts: Raw text of each document.
        entities: For each document, an iterable of ``(start, end, label)``
            annotations, where ``start``/``end`` are character offsets into
            the document text.
        dst_path: Path the DocBin is written to.
        alignment_mode: Forwarded to ``Doc.char_span``. The default
            ``"strict"`` requires the offsets to match token boundaries
            exactly; ``"contract"`` or ``"expand"`` snap the span to the
            tokens inside/around the offsets.

    Raises:
        RuntimeError: If a document cannot be converted, e.g. because an
            entity does not align with the tokenizer's token boundaries.
            The message names the document and the offending entity.
    """
    nlp = spacy.blank("fr")
    # the DocBin will store the example documents
    db = DocBin()
    for doc_id, text, annotations in zip(ids, texts, entities):
        ents = None
        try:
            doc = nlp(text)
            ents = []
            for start, end, label in annotations:
                span = doc.char_span(start, end, label=label, alignment_mode=alignment_mode)
                if span is None:
                    # char_span returns None when the offsets cannot be mapped to
                    # tokens. Fail here with the details, rather than letting
                    # `doc.ents = ents` choke on the None with an opaque
                    # "object of type 'NoneType' has no len()".
                    raise ValueError(
                        f"Entity {text[start:end]!r} ({label}) at characters {start}-{end} "
                        f"does not align with token boundaries (alignment_mode={alignment_mode!r})."
                    )
                ents.append(span)
            doc.ents = ents
            db.add(doc)
        except Exception as e:
            err_msg = f"Problem generating train/val data for document '{doc_id}'. "
            err_msg += getattr(e, "msg", str(e))
            print(err_msg)
            # Spans successfully built before the failure, to help locate it.
            print(ents)
            # Chain the original exception so its traceback is not lost.
            raise RuntimeError(err_msg) from e
    db.to_disk(dst_path)

In [None]:
# Convert both splits to spaCy docs and write each one to disk as a DocBin.
# This raises RuntimeError for a document whose entity offsets do not line up
# with the tokenizer's token boundaries.
prepare_save_dataset(train_ids, train_texts, train_entities, "./tmp/train.spacy")
prepare_save_dataset(test_ids, test_texts, test_entities, "./tmp/test.spacy")

Problem generating train/val data for document 'FRA01201_Feval'. object of type 'NoneType' has no len()
[Fortune, Fortune, Paris, Florence, Turin, Espagne, Espagne, None, Jules Alberoni, Charles-quint, Suède, Italie, Allemagne, Turquie, Russie, France, Angleterre, France, Angleterre, Alberoni, Europe, Angleterre, France, France, Louis XIV, Bretagne, Angleterre, Stuarts, Écosse, Irlande, de Saint-Georges, roi Jacques, des Ursins, Alcala, Hénarès, Madrid, Fortune]


RuntimeError: Problem generating train/val data for document 'FRA01201_Feval'. object of type 'NoneType' has no len()

In [None]:
# Locate the training document that failed to convert, then show its raw text
# and its list of annotations.
ii = train_ids.index("FRA01201_Feval")
print(train_texts[ii])
train_entities[ii]

PREMIÈRE PARTIE -- LA CONSPIRATION EN DENTELLES
Où Fortune établit qu'il a une étoile.
— Monseigneur, dit Fortune, nous autres Français nous n'avons point la vanterie des Espagnols. S'il y a chez nous un défaut, c'est que nous ne savons pas nous faire valoir suffisamment. Je suis brave, mes preuves sont faites, et quant à la prudence, j'en ai en vérité à revendre. À Paris, comme à Florence, à Turin et dans d'autres villes capitales, mon adresse passe en proverbe, et c'est justice, car aussitôt que j'entreprends une affaire elle est dans le sac. En me choisissant, Votre Éminence a eu la main heureuse : je lui en fais mon sincère compliment.
C'était un magnifique garçon, à la taille élégante et robuste à la fois. Il disait tout cela en souriant, debout qu'il était, dans une attitude noble mais respectueuse, incliné à demi devant un personnage aux traits sévères et fortement accentués qui portait le costume de prêtre.
Il avait, lui, notre beau jeune homme, l'accoutrement d'un cavalier d'E

[[51, 58, 'PER'],
 [106, 113, 'PER'],
 [369, 374, 'LOC'],
 [384, 392, 'LOC'],
 [396, 401, 'LOC'],
 [999, 1006, 'LOC'],
 [1757, 1764, 'LOC'],
 [1806, 1820, 'PER'],
 [1835, 1849, 'PER'],
 [1922, 1935, 'PER'],
 [1940, 1945, 'LOC'],
 [1964, 1970, 'LOC'],
 [1980, 1989, 'LOC'],
 [2001, 2008, 'LOC'],
 [2023, 2029, 'LOC'],
 [2166, 2172, 'LOC'],
 [2178, 2188, 'LOC'],
 [2194, 2200, 'LOC'],
 [2240, 2250, 'LOC'],
 [2374, 2382, 'PER'],
 [2488, 2494, 'LOC'],
 [2541, 2551, 'LOC'],
 [2558, 2564, 'LOC'],
 [2598, 2604, 'LOC'],
 [2715, 2724, 'PER'],
 [2759, 2767, 'LOC'],
 [2779, 2789, 'LOC'],
 [2804, 2811, 'PER'],
 [2838, 2844, 'LOC'],
 [2857, 2864, 'LOC'],
 [2901, 2917, 'PER'],
 [2927, 2938, 'PER'],
 [3098, 3108, 'PER'],
 [3130, 3136, 'LOC'],
 [3140, 3147, 'LOC'],
 [3157, 3163, 'LOC'],
 [3486, 3493, 'PER']]

In [None]:
from spacy import displacy
# Render the raw annotations with displaCy in "manual" mode: the entities are
# given as character offsets, so the tokenizer is not involved.
manual_content = dict(
    text=train_texts[ii],
    ents=[
        dict(start=ent[0], end=ent[1], label=ent[2])
        for ent in train_entities[ii]
    ],
    title=train_ids[ii],
)
displacy.render(manual_content, style="ent", manual=True, jupyter=True)

In [None]:
# Annotation [1806, 1820, 'PER'] is the one that came back as None;
# show it with 10 characters of context on each side.
train_texts[ii][1806-10:1820+10]

"d'État du roi Philippe V.\nIl avait"

In [None]:
# Rebuild the entities of the failing document. When the annotated offsets do
# not match token boundaries exactly, fall back to the largest token span that
# fits inside them ("contract" alignment).
nlp = spacy.blank("fr")
doc = nlp(train_texts[ii])
ents = []
for start, end, label in train_entities[ii]:
    span = doc.char_span(start, end, label=label)
    if span is None:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # No token lies fully inside the offsets: a None entry would make
            # `doc.ents = ents` fail, so report the entity and leave it out.
            print(f"Warning: could not align entity '{train_texts[ii][start:end]}' to computed tokens, even when contracting. Skipping it.")
            continue
        print(f"Warning: could not align entity '{train_texts[ii][start:end]}' to computed tokens. Contracting to span '{span}'.")
    ents.append(span)
doc.ents = ents



In [None]:
# Show how the tokenizer split the problematic entity: print the first token
# whose text is "roi" and the tokens after it, three tokens in total.
found = False
count = 3
for tok in doc:
    if not found and tok.text != "roi":
        # Still before the first "roi" token: nothing to print yet.
        continue
    found = True
    print(tok)
    count -= 1
    if count == 0:
        break

roi
Philippe
V.
