In [1]:
import csv
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the Italian sentence table into a list of dicts (one per CSV row).
# Later cells read the "id" and "sentence" columns of each row.
# newline="" hands newline handling to the csv module, as its documentation
# requires; otherwise line breaks embedded in quoted fields can be altered.
with open(
    "../vasari-kg.github.io/data/sentences_it.csv",
    "r",
    encoding="utf-8",
    newline="",
) as f:
    sentences = list(csv.DictReader(f, delimiter=","))

In [3]:
# Tokenizer and token-classification model are loaded from the same
# checkpoint, then wrapped in a single "ner" pipeline callable (`nlp`).
checkpoint = "Babelscape/wikineural-multilingual-ner"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tagger = AutoModelForTokenClassification.from_pretrained(checkpoint)
nlp = pipeline(
    "ner",
    model=tagger,
    tokenizer=tokenizer,
)

In [4]:
# Smoke test: tag only the first sentence and print the raw pipeline output
# so the entity dict layout (entity_group / score / word / start / end)
# can be inspected before the full run.
first_sentence = sentences[0]["sentence"]
first_entities = nlp(first_sentence, aggregation_strategy="simple")
print(first_entities)

[{'entity_group': 'LOC', 'score': 0.9345933, 'word': 'Mercatanzia di Fiorenza', 'start': 35, 'end': 58}, {'entity_group': 'LOC', 'score': 0.92038906, 'word': 'Fortezza', 'start': 64, 'end': 72}, {'entity_group': 'PER', 'score': 0.99630225, 'word': 'Antonio', 'start': 104, 'end': 111}, {'entity_group': 'PER', 'score': 0.99892634, 'word': 'Piero del Pollaiuolo', 'start': 114, 'end': 134}]


In [5]:
# Column order of the CSV written at the end of this cell. Declared up front
# so the header can still be written when no entity is found at all.
OUTPUT_FIELDS = ["id", "start_pos", "end_pos", "surface", "type"]


def _merge_adjacent_entities(sent_idx, entities):
    """Turn the pipeline entities of ONE sentence into output rows.

    Entities whose ``start`` equals the ``end`` of the previous entity are
    treated as pieces of the same mention and folded into the previous row:
    the row keeps its start offset and type, takes the new end offset, and
    the new piece is appended to its surface text.

    Args:
        sent_idx: identifier of the sentence, copied into each row's "id".
        entities: iterable of dicts with "start", "end", "word" and
            "entity_group" keys, as returned by the NER pipeline.

    Returns:
        A list of dicts keyed by ``OUTPUT_FIELDS``, in input order.
    """
    rows = []
    for ent in entities:
        if rows and rows[-1]["end_pos"] == ent["start"]:
            # Touching spans: extend the previous row instead of adding one.
            # NOTE(review): the replace presumably strips WordPiece "##"
            # continuation markers, but it removes every "#" in the piece.
            # Kept as in the original; confirm this is intended.
            previous = rows[-1]
            previous["end_pos"] = ent["end"]
            previous["surface"] += ent["word"].replace("#", "")
        else:
            rows.append({
                "id": sent_idx,
                "start_pos": ent["start"],
                "end_pos": ent["end"],
                "surface": ent["word"],
                "type": ent["entity_group"],
            })
    return rows


output = []

for sample in tqdm(sentences):
    sent_idx = sample["id"]
    text = sample["sentence"]
    ner = nlp(text, aggregation_strategy="simple")
    # Merging is done per sentence. Character offsets restart with every
    # sentence, so comparing against the last row of the previous sentence
    # could glue unrelated entities together.
    output.extend(_merge_adjacent_entities(sent_idx, ner))

# `with` closes the file even if writing fails; newline="" is what the csv
# module expects for files it writes to.
with open("results3/wikineural_it/output.csv", "w", encoding="utf-8", newline="") as a_file:
    dict_writer = csv.DictWriter(a_file, OUTPUT_FIELDS)
    dict_writer.writeheader()
    dict_writer.writerows(output)
    

100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:02<00:00, 10.53it/s]
