In [1]:
## multilingual

import csv
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the input records as a list of dicts keyed by the CSV header.
# newline="" lets the csv module handle line endings itself, so newlines that
# are embedded in quoted fields are kept intact (the text of each field is
# later used to compute character offsets).
with open("../data/sentences.csv", "r", encoding="utf-8", newline="") as f:
    data = list(csv.DictReader(f, delimiter=","))

In [6]:
# Tokenizer and token-classification model are loaded from the same checkpoint
# and wrapped in a single NER pipeline.
checkpoint = "Babelscape/wikineural-multilingual-ner"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tagger = AutoModelForTokenClassification.from_pretrained(checkpoint)

# With aggregation_strategy="simple" the pipeline returns grouped entities
# (dicts with 'entity_group', 'score', 'word', 'start', 'end').
nlp = pipeline(
    task="ner",
    model=tagger,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

In [7]:
from nltk.tokenize import sent_tokenize

# Smoke test on the first record: split it into sentences and tag each one.
text = data[0]["sentence"]
sentences = sent_tokenize(text)

# Gather the entities of *every* sentence into one flat list (an empty list if
# there are no sentences). 'start'/'end' are offsets within the sentence the
# entity came from, not within the whole text.
entities = [ent for sentence in sentences for ent in nlp(sentence)]
entities

[{'entity_group': 'LOC',
  'score': 0.9993399,
  'word': 'Florence',
  'start': 3,
  'end': 11},
 {'entity_group': 'PER',
  'score': 0.59895015,
  'word': 'Frate',
  'start': 30,
  'end': 35},
 {'entity_group': 'PER',
  'score': 0.9934189,
  'word': 'Michelagnolo',
  'start': 224,
  'end': 236},
 {'entity_group': 'PER',
  'score': 0.9826374,
  'word': 'Fra Giovanni Agnolo',
  'start': 304,
  'end': 323}]

In [10]:
# Run NER over every record and flatten the result into one row per entity.
#
# Each row holds:
#   doc_id                       - the record's "id" field
#   doc_start_pos / doc_end_pos  - character offsets within the record's text
#   sentence_id                  - index of the sentence inside the record
#   sent_start_pos / sent_end_pos- character offsets within that sentence
#   surface                      - entity text as reported by the pipeline
#   type / score                 - entity group and confidence
output = []

for sample in tqdm(data, total=len(data)):
    doc_id = sample["id"]
    text = sample["sentence"]

    # Cursor into `text`: where to start looking for the next sentence.
    search_from = 0
    for sentence_id, sentence in enumerate(sent_tokenize(text)):
        # Locate the sentence in the original text instead of assuming that
        # consecutive sentences are separated by exactly one character.
        curr_pos = text.find(sentence, search_from)
        if curr_pos < 0:
            # Not expected (sentences should be substrings of `text`);
            # fall back to the end of the previous sentence.
            curr_pos = search_from
        search_from = curr_pos + len(sentence)

        # Last entity emitted for *this* sentence. Resetting it per sentence
        # keeps fragments from being glued onto an entity that belongs to a
        # previous sentence or document whose end offset happens to match.
        prev = None
        for ent in nlp(sentence):
            if prev is not None and prev["sent_end_pos"] == ent["start"]:
                # The entity starts exactly where the previous one ended:
                # treat it as a continuation and extend the previous row.
                fragment = ent["word"]
                if fragment.startswith("##"):
                    # Drop the word-piece continuation marker only when it is
                    # actually present.
                    fragment = fragment[2:]
                # Type and score of the merged row come from the last fragment.
                prev.update({
                    "doc_end_pos": curr_pos + ent["end"],
                    "sent_end_pos": ent["end"],
                    "surface": prev["surface"] + fragment,
                    "type": ent["entity_group"],
                    "score": ent["score"],
                })
            else:
                prev = {
                    "doc_id": doc_id,
                    "doc_start_pos": curr_pos + ent["start"],
                    "doc_end_pos": curr_pos + ent["end"],
                    "sentence_id": sentence_id,
                    "sent_start_pos": ent["start"],
                    "sent_end_pos": ent["end"],
                    "surface": ent["word"],
                    "type": ent["entity_group"],
                    "score": ent["score"],
                }
                output.append(prev)

# Persist the extracted entities, one CSV row per entity.
# The column list is spelled out so that a header is still written when no
# entity was found (indexing output[0] would fail on an empty list).
keys = [
    "doc_id",
    "doc_start_pos",
    "doc_end_pos",
    "sentence_id",
    "sent_start_pos",
    "sent_end_pos",
    "surface",
    "type",
    "score",
]
# `with` closes the file even if writing fails; newline="" stops the csv
# module's own line terminators from being translated again (which shows up
# as blank lines between rows on Windows).
with open("../results/wikineural/output.csv", "w", encoding="utf-8", newline="") as a_file:
    dict_writer = csv.DictWriter(a_file, fieldnames=keys)
    dict_writer.writeheader()
    dict_writer.writerows(output)
    

100%|██████████████████████████████████████████████████████████████████████████████████| 55/55 [01:04<00:00,  1.18s/it]
