### Find Wikipedia Article

In [1]:
import os
from utils.utils_wiki import get_wikipedia_article, save_wikipedia_page, extract_sections

# Name to look up. The query does not have to match the article title exactly:
# wiki_page.title is used for everything downstream once a page is found.
person_name = "Albrecht Duerer"

# person_name = "Ida Pfeiffer"  # Swap in any other name to test on the fly

# Fall back to the queried name as title in case no page is found.
wikipedia_title = person_name

# exist_ok=True avoids the check-then-create race and is a no-op when the
# directories are already there.
os.makedirs("data/wikipedia", exist_ok=True)
os.makedirs("data/json", exist_ok=True)

wiki_page = get_wikipedia_article(person_name)
if wiki_page:
    print(f"Found a Page: {wiki_page.title}")
    wikipedia_title = wiki_page.title
    text = wiki_page.content
    # Single slug shared by the text and JSON filenames.
    title_slug = wikipedia_title.replace(' ', '_').lower()
    text_filename = f"data/wikipedia/{title_slug}.txt"
    json_nlp_filename = f"data/json/{title_slug}.json"
    save_wikipedia_page(wiki_page, text_filename, include_metadata=True, include_sections=True)
else:
    # Nothing is saved in this case, so the following cells have no text file to read.
    print(f"Query Failed! Couldn't find {person_name}")


Options: {'Erasmus', 'Albrecht Dürer', 'Deutsche Schule Sevilla'}
Ordered Options Compund Metric: [RankedArticle(wikipage_title='Albrecht Dürer', queried_name='Albrecht Duerer', lev_similarity=0.896551724137931, token_overlap=0.5, dates_confidence=-1)]

Retrieving page for Albrecht Dürer
Wiki Life Data = (1471 - 1528)
Page Chosen! Confidence Score = 1
Found a Page: Albrecht Dürer


### Clean & Pre-process Text

In [2]:
import utils.nlp_common as unlp
import spacy
from spacy import __version__ as spacy_version


# All file locations are derived from the resolved article title.
title_slug = wikipedia_title.replace(' ', '_').lower()
text_filename = f"data/wikipedia/{title_slug}.txt"
text_clean_filename = f"data/wikipedia/{title_slug}.clean.txt"
json_nlp_filename = f"data/json/{title_slug}.json"

# The article text contains non-ASCII characters (e.g. IPA transcriptions), so
# read it as UTF-8 explicitly instead of relying on the platform default.
with open(text_filename, encoding="utf-8") as f:
    text = f.read()

text = unlp.preprocess_and_clean_text(text)
# is_from_file tells whether nlp_dict came from an existing JSON at json_nlp_filename.
nlp_dict, is_from_file = unlp.create_nlp_template(text, filepath=json_nlp_filename)

# NLP Basic processing using SpaCy (Only if file did not exist already)
if not is_from_file:
    spacy_model = "en_core_web_lg"
    nlp = spacy.load(spacy_model)
    spacy_dict = unlp.run_spacy(text, nlp)
    sentences = spacy_dict['sentences']
    token_objs = spacy_dict['token_objs']
    # Layers are keyed by tool, model and version.
    spacy_key = f'spacy_{spacy_model}_{spacy_version}'
    nlp_dict['tokenization'] = {spacy_key: spacy_dict['tokens']}
    nlp_dict['morpho_syntax'] = {spacy_key: unlp.add_morphosyntax(spacy_dict['token_objs'])}
else:
    # Rebuild sentences and tokens from the morpho-syntactic layer already stored in the file.
    sentences, token_objs = [], []
    flair_versions = [v for v in nlp_dict['morpho_syntax'].keys() if 'flair_' in v]
    if len(flair_versions) > 0:
        print("Taking sentences as defined by FLAIR (for compatibility reasons)")
        for sent_ix, sent_obj in enumerate(nlp_dict['morpho_syntax'][flair_versions[0]]):
            if len(sent_obj['text']) > 0:
                sentences.append(sent_obj['text'])
                for tok in sent_obj['words']:
                    misc = tok['MISC']
                    # The lemma is filled with the surface form here.
                    token_objs.append({'sent_id': sent_ix, 'text': tok['FORM'], 'lemma': tok['FORM'],
                                       'start_char': misc['StartChar'], 'end_char': misc['EndChar'],
                                       'space_after': misc['SpaceAfter']})
    else:
        # Without this notice the following cells would silently annotate zero sentences.
        print("WARNING: no 'flair_' layer found in 'morpho_syntax'; sentences and token_objs are empty")

with open(text_clean_filename, "w", encoding="utf-8") as f:
    f.write(text)
print(len(text), text[:100])
print(nlp_dict.keys())
print(f"Loaded from Flair file: {is_from_file}")


Taking sentences as defined by FLAIR (for compatibility reasons)
33155 Albrecht Dürer (; German: [ˈʔalbʁɛçt ˈdyːʁɐ]; 21 May 1471 – 6 April 1528), sometimes spelled in Engl
dict_keys(['text', 'tokenization', 'morpho_syntax', 'entities', 'time_expressions', 'semantic_roles', 'coreference', 'relations', 'linked_entities', 'frames'])
True


### Find Semantic Roles

In [3]:
from utils.nlp_allen import add_json_srl_allennlp
from allennlp.predictors import Predictor

# Load the AllenNLP SRL predictor from its public model archive.
SRL_MODEL_URL = "https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz"
srl_predictor = Predictor.from_path(SRL_MODEL_URL)

# Annotate the sentences and add the result to the existing 'semantic_roles' entry.
# NOTE(review): `+=` adds to whatever is already stored, so re-running this cell
# presumably duplicates the annotations — confirm this is intended.
srl_annotations = add_json_srl_allennlp(sentences, srl_predictor, token_objs)
nlp_dict['semantic_roles'] += srl_annotations

2023-08-22 14:13:08,332 - INFO - allennlp.common.plugins - Plugin allennlp_models available
2023-08-22 14:13:09,163 - INFO - cached_path - cache of https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz is up-to-date
2023-08-22 14:13:09,164 - INFO - allennlp.models.archival - loading archive file https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz from cache at /Users/daza/.allennlp/cache/b5f1db011cc85691a5fa2bf29e055a712261a2e5d74a74edd7da2fffc98d4ab8.4c4ac7e06ec3d85631bd26b839f90b5a375d3ceeb43e3c74f1cf4758dcee2bb3
2023-08-22 14:13:09,165 - INFO - allennlp.models.archival - extracting archive file /Users/daza/.allennlp/cache/b5f1db011cc85691a5fa2bf29e055a712261a2e5d74a74edd7da2fffc98d4ab8.4c4ac7e06ec3d85631bd26b839f90b5a375d3ceeb43e3c74f1cf4758dcee2bb3 to temp dir /var/folders/dx/h319cjln6sz2cjsd0t05zgjr0000gn/T/tmplbe8fat0
2023-08-22 14:13:12,476 - INFO - allennlp.common.params - datase

### Find Named Entities (AllenNLP)

In [None]:
from utils.nlp_allen import add_json_ner_allennlp

# Load the AllenNLP NER predictor from its public model archive.
# `Predictor` is the class imported in the "Find Semantic Roles" cell.
NER_MODEL_URL = "https://storage.googleapis.com/allennlp-public-models/ner-elmo.2021-02-12.tar.gz"
ner_predictor = Predictor.from_path(NER_MODEL_URL)

# Annotate the sentences and add the result to the existing 'entities' entry.
# NOTE(review): `+=` adds to whatever is already stored, so re-running this cell
# presumably duplicates the annotations — confirm this is intended.
ner_annotations = add_json_ner_allennlp(sentences, ner_predictor, token_objs)
nlp_dict['entities'] += ner_annotations

### Find Coreferences (AllenNLP)

In [4]:
from utils.nlp_allen import add_json_coref_allennlp

# Load the AllenNLP coreference predictor from its public model archive.
# `Predictor` is the class imported in the "Find Semantic Roles" cell.
COREF_MODEL_URL = "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz"
coref_predictor = Predictor.from_path(COREF_MODEL_URL)

# Unlike the SRL/NER cells, this replaces the 'coreference' entry instead of adding to it.
coref_annotations = add_json_coref_allennlp(sentences, coref_predictor)
nlp_dict['coreference'] = coref_annotations

2023-08-22 14:17:50,741 - INFO - allennlp.common.plugins - Plugin allennlp_models available
2023-08-22 14:17:51,613 - INFO - cached_path - cache of https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz is up-to-date
2023-08-22 14:17:51,614 - INFO - allennlp.models.archival - loading archive file https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz from cache at /Users/daza/.allennlp/cache/038f918d294bd1a45e3709dfb22af5277b0be8677f750a85748c39979ce0e549.b897bfe76a04a5f70d6e88762a4d819b4b8b90e45b31b8314e0a6a9630d3f213
2023-08-22 14:17:51,615 - INFO - allennlp.models.archival - extracting archive file /Users/daza/.allennlp/cache/038f918d294bd1a45e3709dfb22af5277b0be8677f750a85748c39979ce0e549.b897bfe76a04a5f70d6e88762a4d819b4b8b90e45b31b8314e0a6a9630d3f213 to temp dir /var/folders/dx/h319cjln6sz2cjsd0t05zgjr0000gn/T/tmp385d2l24
2023-08-22 14:18:03,634 - INFO - allennlp.common.params - dataset_reader.type = core

### Merge Predicate Senses with SRL

In [5]:
from utils.nlp_common import merge_frames_srl

# Fold the 'frames' layer into the semantic roles, then drop the standalone layer.
# The membership check has to come BEFORE nlp_dict["frames"] is read: once the
# layer has been merged and deleted (cell re-run, or a JSON saved after the merge),
# an unconditional lookup would raise KeyError.
if 'frames' in nlp_dict:
    merged = merge_frames_srl(nlp_dict["semantic_roles"], nlp_dict["frames"])
    nlp_dict["semantic_roles"] = merged
    del nlp_dict['frames']
else:
    print("No 'frames' layer in nlp_dict; skipping merge")

[('match', 585), ('non_match', 32), ('fuzzy_match', 2)]
-----------------------------------


### Find Time Expressions (HeidelTime)

In [6]:
from python_heideltime import Heideltime
from utils.nlp_heideltime import add_json_heideltime
# HeidelTime settings used for this document.
HEIDELTIME_LANGUAGE = 'ENGLISH'
HEIDELTIME_DOC_TYPE = 'NARRATIVES'

heideltime_parser = Heideltime()
heideltime_parser.set_language(HEIDELTIME_LANGUAGE)
heideltime_parser.set_document_type(HEIDELTIME_DOC_TYPE)

# Runs on the full cleaned text (not sentence by sentence) and replaces the
# 'time_expressions' entry.
time_expressions = add_json_heideltime(text, heideltime_parser)
nlp_dict['time_expressions'] = time_expressions

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### Save File

In [7]:
import json

print(len(nlp_dict["coreference"]))

# Wrap the annotations in the response-style envelope that gets written to disk.
intavia_dict = {
    'status': '200',
    'data': nlp_dict,
}
print(json_nlp_filename)

# The context manager guarantees the file is flushed and closed.
# ensure_ascii=False writes non-ASCII characters verbatim, so the file is
# opened as UTF-8 explicitly rather than with the platform default encoding.
with open(json_nlp_filename, "w", encoding="utf-8") as json_file:
    json.dump(intavia_dict, json_file, indent=2, ensure_ascii=False)

111
data/json/albrecht_dürer.json
