In [2]:
import csv
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

In [3]:
# Load every row of the sentence table as a dict keyed by the CSV header.
# newline="" lets the csv module handle line breaks inside quoted fields itself,
# and the explicit encoding keeps decoding independent of the platform locale.
# NOTE(review): assumes the file is UTF-8 encoded — confirm against the data source.
with open("../vasari-kg.github.io/data/sentences_it.csv", "r", newline="", encoding="utf-8") as f:
    sentences = list(csv.DictReader(f=f, delimiter=","))

In [4]:
# Tokenizer and token-classification model are loaded from the same checkpoint,
# so the checkpoint identifier is spelled out once.
MODEL_NAME = "Babelscape/wikineural-multilingual-ner"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tagger = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

# Wrap both into a single callable "ner" pipeline used by the cells below.
nlp = pipeline(
    "ner",
    model=tagger,
    tokenizer=tokenizer,
)

In [4]:
# Smoke test: tag only the first loaded row and print the raw pipeline output
# so the structure of the returned entity dicts can be inspected.
first_row = sentences[0]
text = first_row["sentence"]
ner = nlp(
    text,
    aggregation_strategy="simple",
)
print(ner)

[{'entity_group': 'PER', 'score': 0.9669188, 'word': 'Ser Piero', 'start': 74, 'end': 83}, {'entity_group': 'PER', 'score': 0.9996276, 'word': 'Andrea del Verrocchio', 'start': 103, 'end': 124}, {'entity_group': 'PER', 'score': 0.7879479, 'word': 'S. John', 'start': 160, 'end': 167}, {'entity_group': 'PER', 'score': 0.89505774, 'word': 'Christ', 'start': 178, 'end': 184}, {'entity_group': 'PER', 'score': 0.9987483, 'word': 'Leonardo', 'start': 191, 'end': 199}, {'entity_group': 'PER', 'score': 0.9989743, 'word': 'Leonardo', 'start': 279, 'end': 287}, {'entity_group': 'PER', 'score': 0.99748766, 'word': 'Andrea', 'start': 368, 'end': 374}, {'entity_group': 'PER', 'score': 0.9975649, 'word': 'Andrea', 'start': 402, 'end': 408}]


In [9]:
def _merge_adjacent_entities(sent_idx, entities):
    """Convert one sentence's pipeline entities into output rows.

    Consecutive entities whose character spans touch (the previous ``end``
    equals the next ``start``) are fused into a single row. The fused row keeps
    the first fragment's start offset and type, and extends the end offset and
    the surface string.

    Merging is scoped to a single sentence: offsets are relative to the
    sentence text, so a match against the last entity of a *different*
    sentence would be a coincidence, not a real continuation.

    Args:
        sent_idx: value of the sentence's "id" column, copied into every row.
        entities: list of dicts with "start", "end", "word" and
            "entity_group" keys, as returned by ``nlp(...)``.

    Returns:
        A list of dicts with keys "id", "start_pos", "end_pos", "surface"
        and "type", in the order the entities were given.
    """
    rows = []
    for ent in entities:
        if rows and rows[-1]["end_pos"] == ent["start"]:
            # Continuation of the previous span: extend it in place.
            previous = rows[-1]
            previous["end_pos"] = ent["end"]
            # "#" characters are dropped from the appended fragment
            # (presumably word-piece continuation markers — TODO confirm).
            previous["surface"] += ent["word"].replace("#", "")
        else:
            rows.append({
                "id": sent_idx,
                "start_pos": ent["start"],
                "end_pos": ent["end"],
                "surface": ent["word"],
                "type": ent["entity_group"],
            })
    return rows


# Tag every sentence and collect one flat list of entity rows.
output = []

# Iterating over the tqdm wrapper advances and closes the progress bar
# automatically, even if tagging raises part-way through.
for sample in tqdm(sentences):
    sent_idx = sample["id"]
    text = sample["sentence"]
    ner = nlp(text, aggregation_strategy="simple")
    output.extend(_merge_adjacent_entities(sent_idx, ner))

# Column order of the result file. Spelled out explicitly (instead of being
# read from output[0]) so that an empty result still yields a valid,
# header-only CSV rather than an IndexError.
keys = ["id", "start_pos", "end_pos", "surface", "type"]

# newline="" prevents the csv module's own line terminators from being
# translated again (blank rows on Windows); the explicit encoding avoids
# locale-dependent failures on non-ASCII surfaces. The context manager closes
# the file even if writing raises.
with open("results/wikineural_multi_it/output.csv", "w", newline="", encoding="utf-8") as a_file:
    dict_writer = csv.DictWriter(a_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(output)
    

100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:02<00:00,  9.59it/s]
