In [1]:
import csv
from tqdm import tqdm

In [2]:
# Load every row of the sentences CSV into memory as a list of dicts
# (one dict per row, keyed by the CSV header names).
# newline="" hands line-ending handling to the csv module, as its documentation
# requires, so quoted fields containing line breaks are read back unchanged.
with open("../data/sentences.csv", "r", encoding="utf-8", newline="") as f:
    data = list(csv.DictReader(f, delimiter=","))

In [3]:
from flair.data import Sentence
from flair.models import SequenceTagger

# Smoke test: load the pretrained tagger once (later cells reuse `tagger`)
# and run it on the first row of `data` to inspect what a prediction looks like.
tagger = SequenceTagger.load("flair/ner-english-large")
sentence = Sentence(data[0]["sentence"])
tagger.predict(sentence)

# Show the tagged sentence, then one report per predicted entity span.
print(sentence)
print('The following NER tags are found:')

for entity in sentence.get_spans('ner'):
    # Look the "ner" label up once; both its value and its score are reported.
    ner_label = entity.get_label("ner")
    report = [
        f'entity.text is: "{entity.text}"',
        f'entity.start_position is: "{entity.start_position}"',
        f'entity.end_position is: "{entity.end_position}"',
        f'entity "ner"-label value is: "{ner_label.value}"',
        # trailing "\n" leaves a blank line between consecutive entities
        f'entity "ner"-label score is: "{ner_label.score}"\n',
    ]
    print("\n".join(report))

  from .autonotebook import tqdm as notebook_tqdm
Downloading: 100%|████████████████████████████████████████████████████████████████| 2.24G/2.24G [03:13<00:00, 11.5MB/s]

2023-09-07 15:23:49,190 loading file C:\Users\CSA\.flair\models\ner-english-large\07301f59bb8cb113803be316267f06ddf9243cdbba92a4c8067ef92442d2c574.554244d3476d97501a766a98078421817b14654496b86f2f7bd139dc502a4f29





2023-09-07 15:24:05,382 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>
Sentence: "Finally , Pope Clement , having determined that Buonarroti should return to Florence to finish the works of the sacristy and library of S . Lorenzo , gave him orders , since many statues were wanting there , as will be told in the Life of Michelagnolo himself , that he should avail himself of the most able men that could be found , and particularly of Fra Giovanni Agnolo , employing the same methods as had been adopted by Antonio da San Gallo in order to finish the works of the Madonna di Loreto . Having therefore made his way with the Frate to Florence , Michelagnolo , in executing the statues of Duke Lorenzo and Duke Giuliano , employed the Frate much in polishing them and in executing certain difficult undercuttings ; with which occasion Fra Giovanni Agnolo le

In [4]:
from nltk.tokenize import sent_tokenize

# Run NER over every document in `data`, sentence by sentence, and export one
# CSV row per predicted entity with both sentence-relative and
# document-relative character offsets.

# Column order of the exported CSV. Declared explicitly (instead of being read
# from output[0]) so the export still works when no entity was found at all.
keys = [
    "doc_id",
    "doc_start_pos",
    "doc_end_pos",
    "sentence_id",
    "sent_start_pos",
    "sent_end_pos",
    "surface",
    "type",
    "score",
]

output = []

# Iterating through tqdm directly keeps the progress bar in sync with the loop
# and closes it even if a prediction raises.
for sample in tqdm(data):
    doc_id = sample["id"]
    text = sample["sentence"]

    # Index in `text` just past the previously located sentence.
    search_from = 0
    for sentence_id, sentence_text in enumerate(sent_tokenize(text)):
        # Locate the sentence in the original text rather than assuming that
        # exactly one character separates consecutive sentences; any other gap
        # (double space, newline, leading whitespace) would shift every
        # following document-level offset.
        curr_pos = text.find(sentence_text, search_from)
        if curr_pos < 0:
            # Sentence not found verbatim: fall back to the previous
            # "one separator character" heuristic.
            curr_pos = search_from + 1 if sentence_id > 0 else 0
        search_from = curr_pos + len(sentence_text)

        sentence = Sentence(sentence_text)
        tagger.predict(sentence)
        for entity in sentence.get_spans("ner"):
            ner_label = entity.get_label("ner")
            output.append({
                "doc_id": doc_id,
                "doc_start_pos": curr_pos + entity.start_position,
                "doc_end_pos": curr_pos + entity.end_position,
                "sentence_id": sentence_id,
                "sent_start_pos": entity.start_position,
                "sent_end_pos": entity.end_position,
                "surface": entity.text,
                "type": ner_label.value,
                "score": ner_label.score,
            })

# newline="" is required by the csv module; without it every row is followed
# by a blank line on Windows. The with-block closes the file even on error.
with open("../results/flair_ner/output.csv", "w", encoding="utf-8", newline="") as a_file:
    dict_writer = csv.DictWriter(a_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(output)
    

100%|██████████████████████████████████████████████████████████████████████████████████| 55/55 [04:08<00:00,  4.52s/it]
