In [1]:
import csv
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open("../data/sentences.csv", "r", encoding="utf-8") as f:
    data = list(csv.DictReader(f=f, delimiter=","))

In [3]:
tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
tagger = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")
nlp = pipeline("ner", model=tagger, tokenizer=tokenizer, grouped_entities=True)



In [4]:
from nltk.tokenize import sent_tokenize
text = data[0]["sentence"]
sentences = sent_tokenize(text)

for sentence in sentences:
    entities = nlp(sentence)
entities

[{'entity_group': 'LOC',
  'score': 0.9993399,
  'word': 'Florence',
  'start': 3,
  'end': 11},
 {'entity_group': 'PER',
  'score': 0.59895015,
  'word': 'Frate',
  'start': 30,
  'end': 35},
 {'entity_group': 'PER',
  'score': 0.9934189,
  'word': 'Michelagnolo',
  'start': 224,
  'end': 236},
 {'entity_group': 'PER',
  'score': 0.9826374,
  'word': 'Fra Giovanni Agnolo',
  'start': 304,
  'end': 323}]

In [5]:
output = []

pbar = tqdm(total=len(data))
for sample in data:
    text_idx = sample["id"]
    text = sample["sentence"]
    sentences = sent_tokenize(text)
    for idx, sentence in enumerate(sentences):
        if idx > 0:
            prev_sentence_len = len(sentences[idx-1])
        else: 
            prev_sentence_len = 0
        ner = nlp(text)
        for ent in ner:
            if len(ent["word"])>2:
                output.append({
                    "id":text_idx,
                    "start_pos":ent["start"]+prev_sentence_len,
                    "end_pos":ent["end"]+prev_sentence_len,
                    "surface":ent["word"],
                    "type":ent["entity_group"],
                    "score":ent["score"]
                })
    pbar.update(1)
pbar.close()

keys = output[0].keys()
a_file = open("../results/wikineural/output.csv", "w", encoding="utf-8")
dict_writer = csv.DictWriter(a_file, keys)
dict_writer.writeheader()
dict_writer.writerows(output)
a_file.close()
    

IndentationError: unexpected indent (2937744189.py, line 27)