### Find Wikipedia Article

In [None]:
import os
from utils.utils_wiki import get_wikipedia_article, save_wikipedia_page
from utils.nlp_common import preprocess_and_clean_text

person_name = "Albrecht Duerer"

# person_name = "Ida Pfeiffer" # To test for any name on demand ...

# Start from the queried name; it is replaced by the canonical page title
# below when an article is found.
wikipedia_title = person_name

# Make sure the output folders exist. exist_ok=True replaces the separate
# "exists?" check and is safe if the folder appears in between.
os.makedirs("data/wikipedia", exist_ok=True)
os.makedirs("data/json", exist_ok=True)

# Bind `text` up front so it is defined even when the lookup fails
# (previously a failed query ended in a NameError on the cleaning step).
text = ""

wiki_page = get_wikipedia_article(person_name)
if wiki_page:
    print(f"Found a Page: {wiki_page.title}")
    wikipedia_title = wiki_page.title
    # Shared file stem: title with spaces replaced by underscores, lower-cased.
    file_stem = wikipedia_title.replace(' ', '_').lower()
    text_filename = f"data/wikipedia/{file_stem}.txt"
    json_nlp_filename = f"data/json/{file_stem}.json"
    save_wikipedia_page(wiki_page, text_filename, include_metadata=True, include_sections=True)
    # Clean the article body only when there actually is one.
    text = preprocess_and_clean_text(wiki_page.content)
else:
    print(f"Query Failed! Couldn't find {person_name}")

print(wikipedia_title)

In [None]:
import requests, re
from collections import OrderedDict
from typing import  Dict, Optional, Any
from utils.utils_wiki import get_wiki_linked_entities

# Output paths are derived from the article title: spaces become underscores
# and the result is lower-cased.
file_stem = wikipedia_title.replace(' ', '_').lower()
text_filename = f"data/wikipedia/{file_stem}.txt"
json_nlp_filename = f"data/json/{file_stem}.json"

# Defaults so both names are bound even when there is no title to fetch.
raw_wiki = ""
linked = None  # NOTE(review): return type of get_wiki_linked_entities is not visible here

if wikipedia_title:
    raw_file = f"data/wikipedia/{file_stem}.raw.txt"
    # Pass the title as a query parameter so characters such as '?', '&' or
    # '#' in a title are escaped instead of corrupting the URL.
    response = requests.get(
        "https://en.wikipedia.org/w/index.php",
        params={"title": wikipedia_title, "action": "raw"},
        timeout=30,  # do not hang the notebook on a stalled connection
    )
    # Fail loudly on HTTP errors rather than saving an error page as wikitext.
    response.raise_for_status()
    raw_wiki = response.text
    # Explicit UTF-8: the wikitext routinely contains non-ASCII characters.
    with open(raw_file, "w", encoding="utf-8") as f:
        f.write(raw_wiki)
    linked = get_wiki_linked_entities(raw_wiki)
else:
    print("No Wikipedia title available; skipping the raw wikitext download")

### Clean & Pre-process Text (SpaCy)

In [None]:
from utils.nlp_common import create_nlp_template, add_morphosyntax, run_spacy, preprocess_and_clean_text
import spacy
from spacy import __version__ as spacy_version


# Read the saved article, keep only its first 5000 characters and clean them.
with open(text_filename) as article_file:
    text = preprocess_and_clean_text(article_file.read()[:5000])

# Build the annotation dict; `is_from_file` reports whether it came from the
# JSON at `json_nlp_filename` (presumably an earlier run - verify in nlp_common).
nlp_dict, is_from_file = create_nlp_template(text, filepath=json_nlp_filename)

if is_from_file:
    # Reuse the text stored with the existing annotations.
    text = nlp_dict['text']
else:
    # NLP Basic processing using SpaCy (Only if file did not exist already)
    spacy_model = "en_core_web_sm"
    nlp = spacy.load(spacy_model)
    spacy_dict = run_spacy(text, nlp)
    spacy_key = f'spacy_{spacy_model}_{spacy_version}'
    nlp_dict['tokenization'] = {spacy_key: spacy_dict['tokens']}
    nlp_dict['morphology'] = {spacy_key: add_morphosyntax(spacy_dict['token_objs'])}

# Quick sanity output for the notebook.
print(text[:100])
print(nlp_dict.keys())
print(nlp_dict["entities"])
print(is_from_file)

In [None]:
## Load Flair Libraries
from flair import __version__ as flair_version
from flair.splitter import SegtokSentenceSplitter
from utils.nlp_flair import run_flair, add_morphosyntax_flair
from flair.nn import Classifier

splitter = SegtokSentenceSplitter()

# Load the pre-trained Flair classifiers; each name can be swapped for
# another model identifier.
ner_tagger = Classifier.load('ner-ontonotes-large')
rel_tagger = Classifier.load('relations')
frames_tagger = Classifier.load('frame')
linker_tagger = Classifier.load('linker')

# Task name -> model, as passed to run_flair. The chunker entry is a plain
# model name rather than a loaded classifier.
flair_models = {"chunker": "chunk"}
flair_models["ner"] = ner_tagger
flair_models["relations"] = rel_tagger
flair_models["frames"] = frames_tagger
flair_models["linker"] = linker_tagger

# Store Flair's tokenization and morphology under a key that records the
# Flair version.
morpho, tokenized_doc = add_morphosyntax_flair(text, splitter)
flair_key = f"flair_{flair_version}"
nlp_dict['tokenization'][flair_key] = tokenized_doc
nlp_dict['morphology'][flair_key] = morpho

### Find Named Entities & Relations (Flair)

In [None]:
sentences = splitter.split(text)

# Make sure both annotation layers exist before the tagger runs.
nlp_dict.setdefault('entities', [])
nlp_dict.setdefault('relations', [])

# One "relations" run yields both the entities and the relations between them.
ent_rel_out = run_flair(sentences, "relations", flair_models)
for layer, result_key in (('entities', "tagged_entities"), ('relations', "tagged_relations")):
    nlp_dict[layer] = ent_rel_out[result_key]

### Find Linked Entities (Flair)

In [None]:
# Split the text again so the linker starts from sentences that carry no
# tags from the previous run.
sentences = splitter.split(text)
nlp_dict.setdefault('linked_entities', [])

# Hand the entity ids from the relations run to the linker run.
linker_metadata = {"entity_ids": ent_rel_out["entity_ids"]}
linker_out = run_flair(sentences, "linker", flair_models, metadata=linker_metadata)
nlp_dict['linked_entities'] = linker_out["tagged_entities"]

### Find Predicate Senses & Merge with SRL

In [None]:
# Split the text again so the frame tagger starts from sentences without tags
# left by an earlier tagger run, and so this cell does not depend on which
# cell last assigned `sentences`.
sentences = splitter.split(text)
frames_flair = run_flair(sentences, "frames", flair_models)["tagged_entities"]

# Show each detected predicate with its character span and predicted sense.
for fr in frames_flair:
    print(fr["locationStart"], fr["locationEnd"], fr["predicateSense"])

### Save File Appending the new Annotations

In [None]:
import json

# Wrap the annotations in the response envelope that is written to disk.
intavia_dict = {
    'status': '200',
    'data': nlp_dict,
}
print(json_nlp_filename)
print(nlp_dict)

# Use a context manager so the file is flushed and closed deterministically,
# and write UTF-8 explicitly: ensure_ascii=False emits non-ASCII characters
# as-is, which fails or produces non-UTF-8 JSON under other default encodings.
with open(json_nlp_filename, "w", encoding="utf-8") as json_file:
    json.dump(intavia_dict, json_file, indent=2, ensure_ascii=False)