In [1]:
import csv
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the English sentences table; every CSV row becomes a dict keyed by the header row.
# The encoding is pinned to UTF-8 (as the surfaces CSV read in this notebook does) so the
# result does not depend on the platform default codec -- assumes the file is UTF-8 like
# its sibling; TODO confirm. newline="" lets the csv module handle line endings itself,
# as the csv documentation recommends.
with open("../vasari-kg.github.io/data/sentences_en.csv", "r", encoding="utf-8", newline="") as f:
    sentences = list(csv.DictReader(f, delimiter=","))

In [3]:
# Build a token-classification ("ner") pipeline from a single pretrained checkpoint;
# the tokenizer and the model are loaded from the same checkpoint name.
# NOTE: the name `tagger` is rebound to a flair SequenceTagger in a later cell.
NER_CHECKPOINT = "Babelscape/wikineural-multilingual-ner"

tokenizer = AutoTokenizer.from_pretrained(NER_CHECKPOINT)
tagger = AutoModelForTokenClassification.from_pretrained(NER_CHECKPOINT)
nlp = pipeline("ner", tokenizer=tokenizer, model=tagger)

In [2]:
# Load the surface-forms table; every CSV row becomes a dict keyed by the header row.
# newline="" lets the csv module handle line endings itself (including newlines embedded
# in quoted fields), as the csv documentation recommends.
with open("../vasari-kg.github.io/extra/surfaces.csv", "r", encoding="utf-8", newline="") as f2:
    surfaces = list(csv.DictReader(f2, delimiter=","))

In [3]:
# Distinct, lower-cased values of the "surface" column.
set_of_surfaces = {row["surface"].lower() for row in surfaces}

In [4]:
import marisa_trie

# Index every distinct lower-cased surface form in a trie so that prefix lookups
# (trie.keys(prefix)) can be used for matching.
surface_keys = list(set_of_surfaces)
trie = marisa_trie.Trie(surface_keys)

In [5]:
# Sanity check: list every indexed surface form that starts with "king".
king_keys = trie.keys("king")
print(king_keys)

['king', 'king of france', 'king of france francis i', 'king of france francisco i', 'king of france francesco i', 'king of france francois i', 'king of france franz i', 'king of france françois i', 'king of portugal', 'king of portugal and the algarves', 'king of the church', 'king francis', 'king francis i', 'king francis i of france', 'king francis&nbsp;i', 'king francois i', 'king françois i', 'king françois i of france', 'king cepheus', 'king neptune', 'kingdom animal', 'kingdom animalia', 'kingdom of france', 'kingdomanimalia', "king's", "king's colour", "king's colour and regimental colour", 'kings', 'kings and queens', 'kings’ day', 'king’s colour']


In [7]:
from flair.data import Sentence
from flair.models import SequenceTagger

In [21]:
# Tag a short example with the flair English POS model and report its token count.
text = "Lorenzo de Medici painted a S. Anna"
sentence = Sentence(text)

tagger = SequenceTagger.load("flair/pos-english")
tagger.predict(sentence)  # annotates the tokens of `sentence` in place

token_count = len(sentence)
print(token_count)

2022-06-08 15:35:47,442 loading file C:\Users\CSA\.flair\models\pos-english\a9a73f6cd878edce8a0fa518db76f441f1cc49c2525b2b4557af278ec2f0659e.121306ea62993d04cd1978398b68396931a39eb47754c8a06a87f325ea70ac63
2022-06-08 15:35:47,601 SequenceTagger predicts: Dictionary with 53 tags: <unk>, O, UH, ,, VBD, PRP, VB, PRP$, NN, RB, ., DT, JJ, VBP, VBG, IN, CD, NNS, NNP, WRB, VBZ, WDT, CC, TO, MD, VBN, WP, :, RP, EX, JJR, FW, XX, HYPH, POS, RBR, JJS, PDT, NNPS, RBS, AFX, WP$, -LRB-, -RRB-, ``, '', LS, $, SYM, ADD
8


In [27]:
def find_candidates(sentence, trie, label_type="pos", skip_tags=("DT",)):
    """Find, for each start token, the longest token sequence that is a key of `trie`.

    Starting at every token in turn, tokens are lower-cased and joined with single
    spaces into a growing prefix. The scan from a start token stops as soon as no
    trie key begins with the prefix; the longest prefix seen so far that was itself
    a key (if any) is recorded as a candidate.

    Args:
        sentence: indexable, sized sequence of tokens. Each token must expose
            `.text`, `.start_pos`, `.end_pos` and `.get_label(label_type).value`.
        trie: object whose `.keys(prefix)` returns the keys starting with `prefix`.
        label_type: label name passed to `token.get_label` to read the tag.
        skip_tags: tags that may not start a candidate (a start token carrying one
            of them is skipped entirely).

    Returns:
        A list of dicts with keys "text" (the matched lower-cased string),
        "start_pos" / "end_pos" (taken from the first / last matched token) and
        "tags" (one tag per matched token), in order of start token.
    """
    found = []
    n_tokens = len(sentence)

    for start_idx in range(n_tokens):
        # Only the first token of a candidate is checked against skip_tags.
        if sentence[start_idx].get_label(label_type).value in skip_tags:
            continue

        candidate = None
        prefix = None
        start_pos = None
        tags = []

        for idx in range(start_idx, n_tokens):
            token = sentence[idx]
            word = token.text.lower()
            if prefix is None:
                prefix = word
                start_pos = token.start_pos
            else:
                prefix = prefix + " " + word
            tags.append(token.get_label(label_type).value)

            matches = trie.keys(prefix)
            if not matches:
                # Nothing in the trie starts with this prefix: stop extending.
                break
            if prefix in matches:
                # Snapshot the tags: `tags` keeps growing while we look for a longer
                # match, and the candidate must describe only its own tokens.
                candidate = {
                    "text": prefix,
                    "start_pos": start_pos,
                    "end_pos": token.end_pos,
                    "tags": list(tags),
                }

        # Record the longest exact match whether the scan stopped on a dead end or
        # simply ran out of tokens.
        if candidate is not None:
            found.append(candidate)

    return found


candidates = find_candidates(sentence, trie)
            

In [28]:
# Show the raw matches; overlapping candidates are still present at this point.
print(candidates)

[{'text': 'lorenzo de medici', 'start_pos': 0, 'end_pos': 17, 'tags': ['NNP', 'NNP', 'NNP']}, {'text': 'de medici', 'start_pos': 8, 'end_pos': 17, 'tags': ['NNP', 'NNP']}, {'text': 'medici', 'start_pos': 11, 'end_pos': 17, 'tags': ['NNP']}, {'text': 'painted', 'start_pos': 18, 'end_pos': 25, 'tags': ['VBD']}, {'text': 'anna', 'start_pos': 31, 'end_pos': 35, 'tags': ['NNP']}]


In [29]:
def filter_candidates(candidates):
    """Drop overlapping candidates, keeping the one with the longest text.

    Each candidate is compared only with the most recently kept one, so the
    result is only meaningful when `candidates` is ordered by position
    (presumably by "start_pos"; verify against the caller).

    Args:
        candidates: iterable of dicts with integer "start_pos" and "end_pos"
            (end exclusive) and a "text" string.

    Returns:
        A new list containing the kept candidate dicts (the same objects, not
        copies). On overlap the earlier candidate wins unless the later one has
        strictly longer text.
    """
    output = []
    for item in candidates:
        if not output:
            output.append(item)
            continue

        last = output[-1]
        # Two half-open spans [start, end) share a position exactly when the larger
        # start lies before the smaller end; empty spans never overlap anything.
        overlaps = (
            max(last["start_pos"], item["start_pos"])
            < min(last["end_pos"], item["end_pos"])
        )
        if not overlaps:
            output.append(item)
        elif len(last["text"]) < len(item["text"]):
            output[-1] = item
    return output

# Show the candidates that remain after overlapping ones have been collapsed.
filtered_candidates = filter_candidates(candidates)
print(filtered_candidates)

[{'text': 'lorenzo de medici', 'start_pos': 0, 'end_pos': 17, 'tags': ['NNP', 'NNP', 'NNP']}, {'text': 'painted', 'start_pos': 18, 'end_pos': 25, 'tags': ['VBD']}, {'text': 'anna', 'start_pos': 31, 'end_pos': 35, 'tags': ['NNP']}]


In [92]:
# Tag a longer passage with the multilingual universal-POS model and print one tag
# per token, in token order.
text = """
Leonardo then made a picture of Our Lady, a most excellent work, which was in the possession of Pope Clement VII; and, among other things painted therein, he counterfeited a glass vase full of water, containing some flowers, in which, besides its marvellous naturalness, he had imitated the dew-drops on the flowers, so that it seemed more real than the reality.
"""
sentence = Sentence(text)

tagger = SequenceTagger.load("flair/upos-multi")
tagger.predict(sentence)  # annotates the tokens of `sentence` in place

for token in sentence:
    upos_label = token.get_label("upos")
    print(upos_label.value)

2022-06-08 18:54:03,880 loading file C:\Users\CSA\.flair\models\upos-multi\1a44f168663182024fd3ea6d7dcaeee47fe5bcb537cc737ad058b64ad4db9736.5f899f25846741510a6567b89027d988bd6f634b2776a7c3e834fea4629367cb
2022-06-08 18:54:04,319 SequenceTagger predicts: Dictionary with 21 tags: <unk>, O, PROPN, PUNCT, ADJ, NOUN, VERB, DET, ADP, AUX, PRON, PART, SCONJ, NUM, ADV, CCONJ, X, INTJ, SYM, <START>, <STOP>
PROPN
ADV
VERB
DET
NOUN
ADP
PRON
PROPN
PUNCT
DET
ADV
ADJ
NOUN
PUNCT
PRON
AUX
ADP
DET
NOUN
ADP
PROPN
PROPN
NUM
PUNCT
CCONJ
PUNCT
ADP
ADJ
NOUN
VERB
ADV
PUNCT
PRON
VERB
DET
NOUN
NOUN
ADJ
ADP
NOUN
PUNCT
VERB
DET
NOUN
PUNCT
ADP
PRON
PUNCT
ADP
PRON
ADJ
NOUN
PUNCT
PRON
AUX
VERB
DET
NOUN
ADP
DET
NOUN
PUNCT
SCONJ
SCONJ
PRON
VERB
ADV
ADJ
ADP
DET
NOUN
PUNCT


In [104]:
# Regex fragments over a space-separated string of UPOS tags. They are written as raw
# strings so that "\s" reaches the regex engine as-is instead of being an invalid
# string escape (which Python warns about).

# Optional ADJ or PRON, then one or more NOUN/PROPN tags.
jj_nn = r"((ADJ|PRON)\s)?((NOUN|PROPN)\s?)+"
# Optional trailing modifier: a NUM, or a DET followed by an ADJ or NOUN.
post_mod = r"(\s(NUM|DET\s(ADJ|NOUN)))?"
# Zero to two ADP-introduced phrases of the same shape as jj_nn, anchored at the end.
prop_phrase = r"(\sADP\s((ADJ|PRON)\s)?((NOUN|PROPN)\s?)+){0,2}$"

In [107]:
import re

# Tag a short phrase, build its space-separated UPOS string (punctuation dropped) and
# test it against the combined noun-phrase pattern.
text = "Church of S. Mary in Florence"
sentence = Sentence(text)
tagger.predict(sentence)

upos_values = [token.get_label("upos").value for token in sentence]
tags = " ".join(value for value in upos_values if value != "PUNCT")
print(tags)

phrase_match = re.match(jj_nn + post_mod + prop_phrase, tags)
print(phrase_match)

NOUN ADP PROPN PROPN ADP PROPN
<re.Match object; span=(0, 30), match='NOUN ADP PROPN PROPN ADP PROPN'>
