In [1]:
import csv
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the input sentences; csv.DictReader turns each row into a dict keyed by
# the header row (later cells read the "id" and "sentence" columns).
# newline="" lets the csv module handle line endings itself, so quoted fields
# containing newlines are parsed correctly. The explicit encoding keeps the
# decoded text independent of the platform's default encoding.
with open("../data/sentences_en.csv", "r", encoding="utf-8", newline="") as f:
    sentences = list(csv.DictReader(f, delimiter=","))

In [2]:
# Checkpoint identifier shared by the tokenizer and the model weights.
MODEL_NAME = "Babelscape/wikineural-multilingual-ner"

# Load the tokenizer and token-classification model, then wrap both in a
# "ner" pipeline that later cells call directly on raw text.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tagger = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
nlp = pipeline(
    "ner",
    model=tagger,
    tokenizer=tokenizer,
)

Downloading: 100%|██████████| 333/333 [00:00<00:00, 166kB/s]
Downloading: 100%|██████████| 972k/972k [00:00<00:00, 1.37MB/s]
Downloading: 100%|██████████| 1.87M/1.87M [00:01<00:00, 1.80MB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 28.0kB/s]
Downloading: 100%|██████████| 1.16k/1.16k [00:00<00:00, 395kB/s]
Downloading: 100%|██████████| 676M/676M [00:58<00:00, 12.2MB/s] 


In [4]:
# Smoke test: tag only the first sentence and print the grouped entities
# returned with the "simple" aggregation strategy.
first_row = sentences[0]
text = first_row["sentence"]
ner = nlp(text, aggregation_strategy="simple")
print(ner)

[{'entity_group': 'PER', 'score': 0.9669188, 'word': 'Ser Piero', 'start': 74, 'end': 83}, {'entity_group': 'PER', 'score': 0.9996276, 'word': 'Andrea del Verrocchio', 'start': 103, 'end': 124}, {'entity_group': 'PER', 'score': 0.7879479, 'word': 'S. John', 'start': 160, 'end': 167}, {'entity_group': 'PER', 'score': 0.89505774, 'word': 'Christ', 'start': 178, 'end': 184}, {'entity_group': 'PER', 'score': 0.9987483, 'word': 'Leonardo', 'start': 191, 'end': 199}, {'entity_group': 'PER', 'score': 0.9989743, 'word': 'Leonardo', 'start': 279, 'end': 287}, {'entity_group': 'PER', 'score': 0.99748766, 'word': 'Andrea', 'start': 368, 'end': 374}, {'entity_group': 'PER', 'score': 0.9975649, 'word': 'Andrea', 'start': 402, 'end': 408}]


In [5]:
# Column order of the output CSV. Declared up front (rather than read from the
# first record) so the header can still be written when no entity is found in
# any sentence.
fieldnames = ["id", "start_pos", "end_pos", "surface", "type", "score"]

# Run the NER pipeline over every sentence and flatten the results into one
# record per detected entity, tagged with the id of its source sentence.
output = []
for sample in tqdm(sentences):
    sent_idx = sample["id"]
    text = sample["sentence"]
    ner = nlp(text, aggregation_strategy="simple")
    for ent in ner:
        output.append({
            "id": sent_idx,
            "start_pos": ent["start"],   # offsets as reported by the pipeline
            "end_pos": ent["end"],
            "surface": ent["word"],
            "type": ent["entity_group"],
            "score": ent["score"],
        })

# Write all records in one go. The context manager closes the file even if
# writing fails; newline="" is what the csv module expects for files it writes
# (it prevents blank lines between rows on Windows), and the explicit encoding
# keeps non-ASCII surface forms independent of the platform default.
with open("results/wikineural_multi/output.csv", "w", encoding="utf-8", newline="") as a_file:
    dict_writer = csv.DictWriter(a_file, fieldnames=fieldnames)
    dict_writer.writeheader()
    dict_writer.writerows(output)
    

100%|██████████| 33/33 [00:06<00:00,  5.29it/s]
