### Find Wikipedia Article

In [9]:
import os
from utils.utils_wiki import get_wikipedia_article, save_wikipedia_page
from utils.nlp_common import preprocess_and_clean_text

# Name used for the Wikipedia query. `wikipedia_title` starts as the same
# string and is replaced by the page's own title when a page is found.
person_name = "Albrecht Duerer"
wikipedia_title = person_name

# exist_ok=True makes directory creation idempotent and removes the
# check-then-create race of os.path.exists() followed by os.makedirs().
os.makedirs("data/wikipedia", exist_ok=True)
os.makedirs("data/json", exist_ok=True)

wiki_page = get_wikipedia_article(person_name)
if wiki_page:
    print(f"Found a Page: {wiki_page.title}")
    text = wiki_page.content
    wikipedia_title = wiki_page.title
    # Both output files share the same stem: the title lower-cased, with
    # spaces replaced by underscores.
    file_stem = wikipedia_title.replace(' ', '_').lower()
    text_filename = f"data/wikipedia/{file_stem}.txt"
    json_nlp_filename = f"data/json/{file_stem}.json"
    save_wikipedia_page(wiki_page, text_filename, include_metadata=True, include_sections=True)
    # Clean the text only when a page was actually retrieved.
    text = preprocess_and_clean_text(text)
else:
    print(f"Query Failed! Couldn't find {person_name}")
    # Bind `text` explicitly: previously this path hit a NameError on a fresh
    # kernel, or silently reused a stale `text` left over from an earlier run.
    text = ""

print(wikipedia_title)

Options: {'Deutsche Schule Sevilla', 'Albrecht Dürer', 'Goslar'}
Ordered Options Compund Metric: [RankedArticle(wikipage_title='Albrecht Dürer', queried_name='Albrecht Duerer', lev_similarity=0.896551724137931, token_overlap=0.5, dates_confidence=-1)]

Retrieving page for Albrecht Dürer
Wiki Life Data = (1471 - 1528)
Page Chosen! Confidence Score = 1
Found a Page: Albrecht Dürer
Albrecht Dürer


In [10]:
import requests, re
from collections import OrderedDict
from typing import  Dict, Optional, Any
from utils.utils_wiki import get_wiki_linked_entities

wikipedia_title = "Albrecht Dürer" # This title has to match EXACTLY the Wikipedia Article's name
text_filename = f"data/wikipedia/{wikipedia_title.replace(' ', '_').lower()}.txt"
json_nlp_filename = f"data/json/{wikipedia_title.replace(' ', '_').lower()}.json"

if wikipedia_title:
    raw_file = f"data/wikipedia/{wikipedia_title.replace(' ', '_').lower()}.raw.txt"
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(
        f'https://en.wikipedia.org/wiki/{wikipedia_title}?action=raw',
        timeout=30,
    )
    # Fail loudly on HTTP errors (e.g. a title that does not match an article)
    # instead of saving an error page as if it were wikitext.
    response.raise_for_status()
    raw_wiki = response.text
    # The article contains non-ASCII characters, so the encoding is pinned
    # rather than left to the platform default.
    with open(raw_file, "w", encoding="utf-8") as f:
        f.write(raw_wiki)

    # Kept inside the guard because `raw_wiki` only exists on this path.
    linked = get_wiki_linked_entities(raw_wiki)

### Clean & Pre-process Text (SpaCy)

In [11]:
from utils.nlp_common import create_nlp_template, add_morphosyntax, run_spacy, preprocess_and_clean_text
import spacy
from spacy import __version__ as spacy_version


# Read the saved article and keep only its first 5000 characters; the file is
# closed before any processing starts.
with open(text_filename) as f:
    article_head = f.read()[:5000]

text = preprocess_and_clean_text(article_head)
nlp_dict, is_from_file = create_nlp_template(text, filepath=json_nlp_filename)

if is_from_file:
    # The template came from an existing file, so reuse the text stored in it.
    text = nlp_dict['text']
else:
    # NLP basic processing using SpaCy (only if the file did not exist already).
    spacy_model = "en_core_web_sm"
    nlp = spacy.load(spacy_model)
    spacy_dict = run_spacy(text, nlp)
    # Both annotation layers are stored under the same model/version key.
    spacy_key = f'spacy_{spacy_model}_{spacy_version}'
    nlp_dict['tokenization'] = {spacy_key: spacy_dict['tokens']}
    nlp_dict['morphology'] = {spacy_key: add_morphosyntax(spacy_dict['token_objs'])}

# Quick sanity check of what the cell produced.
print(text[:100])
print(nlp_dict.keys())
print(nlp_dict["entities"])
print(is_from_file)

Albrecht Dürer (; German: [ˈʔalbʁɛçt ˈdyːʁɐ]; Hungarian: Ajtósi Adalbert; 21 May 1471 – 6 April 1528
dict_keys(['text', 'tokenization', 'morphology', 'entities', 'time_expressions', 'semantic_roles', 'coreference'])
[]
False


In [12]:
## Load Flair Libraries
from flair import __version__ as flair_version
from flair.splitter import SegtokSentenceSplitter
from utils.nlp_flair import run_flair, add_morphosyntax_flair
from flair.nn import Classifier

splitter = SegtokSentenceSplitter()

# Load the four pre-trained classifiers one after another, in the listed order.
# These are the specific pre-trained models; they can be switched for others.
ner_tagger, rel_tagger, frames_tagger, linker_tagger = (
    Classifier.load(model_name)
    for model_name in ('ner-ontonotes-large', 'relations', 'frame', 'linker')
)

# Task name -> model handed to run_flair. The chunker entry is the string
# "chunk" rather than a loaded classifier.
flair_models = dict(
    chunker="chunk",
    ner=ner_tagger,
    relations=rel_tagger,
    frames=frames_tagger,
    linker=linker_tagger,
)

morpho, tokenized_doc = add_morphosyntax_flair(text, splitter)

# Store the Flair layers under a key that records the Flair version.
flair_key = f"flair_{flair_version}"
nlp_dict['tokenization'][flair_key] = tokenized_doc
nlp_dict['morphology'][flair_key] = morpho

2023-06-26 17:02:50,991 SequenceTagger predicts: Dictionary with 76 tags: <unk>, O, B-CARDINAL, E-CARDINAL, S-PERSON, S-CARDINAL, S-PRODUCT, B-PRODUCT, I-PRODUCT, E-PRODUCT, B-WORK_OF_ART, I-WORK_OF_ART, E-WORK_OF_ART, B-PERSON, E-PERSON, S-GPE, B-DATE, I-DATE, E-DATE, S-ORDINAL, S-LANGUAGE, I-PERSON, S-EVENT, S-DATE, B-QUANTITY, E-QUANTITY, S-TIME, B-TIME, I-TIME, E-TIME, B-GPE, E-GPE, S-ORG, I-GPE, S-NORP, B-FAC, I-FAC, E-FAC, B-NORP, E-NORP, S-PERCENT, B-ORG, E-ORG, B-LANGUAGE, E-LANGUAGE, I-CARDINAL, I-ORG, S-WORK_OF_ART, I-QUANTITY, B-MONEY
2023-06-26 17:02:56,163 SequenceTagger predicts: Dictionary with 4852 tags: <unk>, be.01, be.03, have.01, say.01, do.01, have.03, do.02, be.02, know.01, think.01, come.01, see.01, want.01, go.02, tell.01, give.01, use.01, make.02, take.01, talk.01, get.01, go.04, live.01, need.01, believe.01, work.01, mean.01, have.02, look.01, become.01, die.01, help.01, find.01, try.01, hear.01, know.06, show.01, happen.01, let.01, sell.01, bring.01, make.01,

### Find Named Entities & Relations (Flair)

In [13]:
sentences = splitter.split(text)

# Make sure both annotation layers exist before tagging starts.
nlp_dict.setdefault('entities', [])
nlp_dict.setdefault('relations', [])

# One "relations" run yields both the tagged entities and the tagged relations.
ent_rel_out = run_flair(sentences, "relations", flair_models)
nlp_dict.update(
    entities=ent_rel_out["tagged_entities"],
    relations=ent_rel_out["tagged_relations"],
)

relations


100%|██████████| 34/34 [00:00<00:00, 7614.61it/s]


### Find Linked Entities (Flair)

In [14]:
# Split the text again so the linker works on sentences without previous tags.
sentences = splitter.split(text)
nlp_dict.setdefault('linked_entities', [])

# The entity ids from the relations run are passed along as metadata.
linker_out = run_flair(
    sentences,
    "linker",
    flair_models,
    metadata={"entity_ids": ent_rel_out["entity_ids"]},
)
nlp_dict['linked_entities'] = linker_out["tagged_entities"]

linker


100%|██████████| 34/34 [00:00<00:00, 25280.33it/s]


### Find Predicate Senses & Merge with SRL

In [9]:
# Re-split the text so frame tagging starts from sentences with no previous
# tags, the same way the linker cell does. Without this the cell depends on
# whatever `sentences` happens to hold from the last cell that was run.
sentences = splitter.split(text)
frames_flair = run_flair(sentences, "frames", flair_models)["tagged_entities"]

# Show each predicate's character span and its predicted sense.
# NOTE(review): frames_flair is only printed here; it is not stored in
# nlp_dict, so it is not part of the saved JSON — confirm this is intended.
for fr in frames_flair:
    print(fr["locationStart"], fr["locationEnd"], fr["predicateSense"])

frames


100%|██████████| 6/6 [00:00<00:00, 9660.59it/s]

113 120 spell.01
172 175 be.01
246 250 bear.02
271 282 establish.01
302 311 influence.01
385 388 be.01
392 399 contact.01
518 521 be.03
522 532 patronize.01
584 592 include.01
609 618 prefer.01
737 740 be.01
806 813 include.01
862 867 die.01
966 970 mark.01





### Save File Appending the new Annotations

In [15]:
import json

# Wrap the annotations in an envelope with a status string and a data payload.
intavia_dict = {
    'status': '200',
    'data': nlp_dict
}
print(json_nlp_filename)

# Use a context manager so the file is flushed and closed deterministically.
# ensure_ascii=False writes non-ASCII characters verbatim, so the encoding is
# pinned to UTF-8 instead of relying on the platform default.
with open(json_nlp_filename, "w", encoding="utf-8") as json_file:
    json.dump(intavia_dict, json_file, indent=2, ensure_ascii=False)

data/json/albrecht_dürer.json
