### Find Wikipedia Article

In [1]:
import os
from utils_wiki import get_wikipedia_article, save_wikipedia_page

# Person whose Wikipedia article is fetched and annotated by this notebook.
person_name = "Albrecht Duerer"

# Create the output folders up front; exist_ok=True makes this safe to re-run.
os.makedirs("data/wikipedia", exist_ok=True)
os.makedirs("data/json", exist_ok=True)

# Both output files share the same stem: the lower-cased name with underscores.
file_stem = person_name.replace(' ', '_').lower()
text_filename = f"data/wikipedia/{file_stem}.txt"
json_nlp_filename = f"data/json/{file_stem}.json"

if not os.path.exists(text_filename):
    # No local copy yet: query Wikipedia and save the page to text_filename.
    wiki_page = get_wikipedia_article(person_name)
    if wiki_page:
        print(f"Found a Page: {wiki_page.title}")
        text = wiki_page.content
        save_wikipedia_page(wiki_page, text_filename, include_metadata=True)
    else:
        # NOTE: in this case `text` stays undefined and text_filename is not
        # written here, so the following cells cannot read the article.
        print(f"Query Failed! Couldn't find {person_name}")
else:
    # Re-use the previously saved article; the context manager closes the file.
    with open(text_filename) as text_file:
        text = text_file.read()

### Clean & Pre-process Text (SpaCy)

In [3]:
from utils_nlp_common import create_nlp_template, add_morphosyntax, run_spacy
import spacy
from spacy import __version__ as spacy_version


# Only the first MAX_TEXT_CHARS characters of the saved article are processed.
MAX_TEXT_CHARS = 5000

with open(text_filename) as f:
    text = f.read()[:MAX_TEXT_CHARS]
# text = preprocess_and_clean_text(text)

# Returns the annotation dict plus a flag saying whether it came from
# json_nlp_filename (the file does not need to stay open for this call).
nlp_dict, is_from_file = create_nlp_template(text, filepath=json_nlp_filename)

# NLP Basic processing using SpaCy (Only if file did not exist already)
if not is_from_file:
    spacy_model = "en_core_web_sm"
    nlp = spacy.load(spacy_model)
    spacy_dict = run_spacy(text, nlp)
    # Annotation layers are keyed by tool, model and version.
    spacy_key = f'spacy_{spacy_model}_{spacy_version}'
    nlp_dict['tokenization'] = {spacy_key: spacy_dict['tokens']}
    nlp_dict['morphology'] = {spacy_key: add_morphosyntax(spacy_dict['token_objs'])}
else:
    # Keep working on exactly the text the stored annotations refer to.
    text = nlp_dict['text']

print(text[:100])
print(nlp_dict.keys())
# The entity layer may be missing or empty when no annotations were loaded,
# so guard the preview instead of indexing blindly.
existing_entities = nlp_dict.get("entities") or []
print(existing_entities[0] if existing_entities else "No entities annotated yet")
print(is_from_file)

Albrecht Dürer (; German: [ˈʔalbʁɛçt ˈdyːʁɐ]; Hungarian: Ajtósi Adalbert; 21 May 1471 – 6 April 1528
dict_keys(['text', 'morphology', 'entities', 'time_expressions', 'semantic_roles', 'coreference', 'tokenization', 'relations', 'linked_entities'])
{'ID': 'ent_0_0', 'sentenceID': 0, 'surfaceForm': 'Albrecht Dürer', 'category': 'PERSON', 'locationStart': 0, 'locationEnd': 14, 'tokenStart': 0, 'tokenEnd': 2, 'score': 0.9976517856121063, 'method': 'flair_ner-ontonotes-large_0.12.2'}
True


In [4]:
## Load Flair Libraries
from flair import __version__ as flair_version
from flair.splitter import SegtokSentenceSplitter
from utils_nlp_flair import run_flair, add_morphosyntax_flair

# Sentence splitter reused by every Flair step in this notebook.
splitter = SegtokSentenceSplitter()

# Flair model identifier for each task; this mapping is passed to run_flair.
flair_models = dict(
    chunker="chunk",
    ner="ner-ontonotes-large",
    # Per the original note: when "relations" is run, a separate NER pass is not needed.
    relations="relations",
    frames="frame",
    linker="linker",
)

# Tokenization and morphosyntax computed with Flair, stored under a versioned key.
flair_key = f"flair_{flair_version}"
morpho, tokenized_doc = add_morphosyntax_flair(text, splitter)

nlp_dict['tokenization'][flair_key] = tokenized_doc
nlp_dict['morphology'][flair_key] = morpho

### Find Named Entities & Relations (Flair)

In [5]:
# Split the text into sentences for the tagger.
sentences = splitter.split(text)

# Make sure both annotation layers exist before they are filled in.
for layer_name in ('entities', 'relations'):
    nlp_dict.setdefault(layer_name, [])

# A single "relations" run yields both the entities and the relations.
ent_rel_out = run_flair(sentences, "relations", flair_models)
nlp_dict['entities'] = ent_rel_out["tagged_entities"]
nlp_dict['relations'] = ent_rel_out["tagged_relations"]

2023-06-15 15:47:13,904 SequenceTagger predicts: Dictionary with 76 tags: <unk>, O, B-CARDINAL, E-CARDINAL, S-PERSON, S-CARDINAL, S-PRODUCT, B-PRODUCT, I-PRODUCT, E-PRODUCT, B-WORK_OF_ART, I-WORK_OF_ART, E-WORK_OF_ART, B-PERSON, E-PERSON, S-GPE, B-DATE, I-DATE, E-DATE, S-ORDINAL, S-LANGUAGE, I-PERSON, S-EVENT, S-DATE, B-QUANTITY, E-QUANTITY, S-TIME, B-TIME, I-TIME, E-TIME, B-GPE, E-GPE, S-ORG, I-GPE, S-NORP, B-FAC, I-FAC, E-FAC, B-NORP, E-NORP, S-PERCENT, B-ORG, E-ORG, B-LANGUAGE, E-LANGUAGE, I-CARDINAL, I-ORG, S-WORK_OF_ART, I-QUANTITY, B-MONEY


### Find Linked Entities (Flair)

In [6]:
# Split the text again so the sentences start without the tags added by the previous step.
sentences = splitter.split(text)
nlp_dict.setdefault('linked_entities', [])

# The linker receives the entity ids produced by the entities/relations run.
linker_metadata = {"entity_ids": ent_rel_out["entity_ids"]}
linker_out = run_flair(sentences, "linker", flair_models, metadata=linker_metadata)
nlp_dict['linked_entities'] = linker_out["tagged_entities"]

2023-06-15 15:49:08,980 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


### Find Predicate Senses & Merge with SRL

In [None]:
# NOTE(review): this step is commented out, so this cell does not (re)compute
# nlp_dict['semantic_roles']. TODO: re-enable it or remove this section.
# nlp_dict['semantic_roles'] = run_flair(sentences, "frames", flair_models)["tagged_entities"]

### Save File Appending the new Annotations

In [10]:
import json

# Wrap the annotations in the response envelope that is written to disk.
intavia_dict = {
            'status': '200',
            'data': nlp_dict
        }

# The context manager guarantees the file is flushed and closed. UTF-8 is set
# explicitly because ensure_ascii=False writes non-ASCII characters verbatim,
# which can fail under a non-UTF-8 platform default encoding.
with open(json_nlp_filename, "w", encoding="utf-8") as json_file:
    json.dump(intavia_dict, json_file, indent=2, ensure_ascii=False)