### Find Wikipedia Article

In [None]:
import os
from utils.utils_wiki import get_wikipedia_article, save_wikipedia_page, extract_sections

# Name used both as the Wikipedia query and as the basis for output filenames.
person_name = "Albrecht Duerer"

# Filesystem-friendly stem shared by the text and JSON outputs
# (spaces -> underscores, lower-cased).
file_stem = person_name.replace(' ', '_').lower()

# exist_ok=True creates the directories only when missing, without the
# separate (race-prone) existence check.
os.makedirs("data/wikipedia", exist_ok=True)
os.makedirs("data/json", exist_ok=True)
text_filename = f"data/wikipedia/{file_stem}.txt"
json_nlp_filename = f"data/json/{file_stem}.json"

# A falsy result from the lookup is treated as "no article found".
wiki_page = get_wikipedia_article(person_name)
if wiki_page:
    print(f"Found a Page: {wiki_page.title}")
    text = wiki_page.content
    section_dict = extract_sections(text)
    # Persist the page at text_filename, passing along the extracted sections.
    save_wikipedia_page(wiki_page, text_filename, include_metadata=True, section_dict=section_dict)
else:
    print(f"Query Failed! Couldn't find {person_name}")


### Clean & Pre-process Text

In [None]:
import utils.nlp_common as unlp
import spacy
from spacy import __version__ as spacy_version

# Must be spelled exactly as in the download cell ("Albrecht Duerer"):
# the filenames below are derived from it, and the text file read here is the
# one written under that spelling.
person_name = "Albrecht Duerer"
file_stem = person_name.replace(' ', '_').lower()
text_filename = f"data/wikipedia/{file_stem}.txt"
json_nlp_filename = f"data/json/{file_stem}.json"

# Read the raw article and release the file handle before the heavy processing.
with open(text_filename) as f:
    text = f.read()

# NOTE(review): only the first 1000 characters are processed — this looks like
# a debugging limit; confirm before running on full articles.
text = text[:1000]
text = unlp.preprocess_and_clean_text(text)
# is_from_file tells whether nlp_dict came from an existing JSON at
# json_nlp_filename (see the branch below).
nlp_dict, is_from_file = unlp.create_nlp_template(text, filepath=json_nlp_filename)

# NLP Basic processing using SpaCy (Only if file did not exist already)
if not is_from_file:
    spacy_model = "en_core_web_lg"
    nlp = spacy.load(spacy_model)
    spacy_dict = unlp.run_spacy(text, nlp)
    sentences = spacy_dict['sentences']
    token_objs = spacy_dict['token_objs']
    # Results are keyed by the tool/model/version that produced them.
    spacy_key = f'spacy_{spacy_model}_{spacy_version}'
    nlp_dict['tokenization'] = {spacy_key: spacy_dict['tokens']}
    nlp_dict['morphology'] = {spacy_key: unlp.add_morphosyntax(token_objs)}
else:
    # Rebuild sentences and token objects from the stored morphology layer.
    # NOTE(review): only 'flair_*' entries are considered; if none exist,
    # `sentences` and `token_objs` stay empty and the later cells receive
    # no input — confirm this is intended.
    sentences, token_objs = [], []
    flair_versions = [v for v in nlp_dict['morphology'] if 'flair_' in v]
    if flair_versions:
        for sent_ix, sent_obj in enumerate(nlp_dict['morphology'][flair_versions[0]]):
            if len(sent_obj['text']) > 0:
                sentences.append(sent_obj['text'])
                # NOTE(review): sent_id is the index in the stored list, which
                # includes skipped empty sentences — verify that consumers do
                # not expect it to index into `sentences`.
                for tok in sent_obj['words']:
                    # The surface form doubles as the lemma here.
                    token_objs.append({
                        'sent_id': sent_ix,
                        'text': tok['FORM'],
                        'lemma': tok['FORM'],
                        'start_char': tok['MISC']['StartChar'],
                        'end_char': tok['MISC']['EndChar'],
                        'space_after': tok['MISC']['SpaceAfter'],
                    })

# Quick sanity output: text preview, available layers, and data origin.
print(text[:100])
print(nlp_dict.keys())
print(is_from_file)


### Find Semantic Roles

In [None]:
from utils.nlp_allen import add_json_srl_allennlp
from allennlp.predictors import Predictor

# Pretrained AllenNLP SRL model archive (fetched from this URL by Predictor.from_path).
SRL_MODEL_ARCHIVE = "https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz"
srl_predictor = Predictor.from_path(SRL_MODEL_ARCHIVE)

# Add the SRL output for the current sentences/tokens to whatever is already
# stored under 'semantic_roles' (in-place `+=`, so existing entries are kept).
nlp_dict['semantic_roles'] += add_json_srl_allennlp(
    sentences,
    srl_predictor,
    token_objs,
)

### Find Named Entities (AllenNLP)

In [None]:
from utils.nlp_allen import add_json_ner_allennlp

# Pretrained AllenNLP NER model archive (fetched from this URL by Predictor.from_path).
NER_MODEL_ARCHIVE = "https://storage.googleapis.com/allennlp-public-models/ner-elmo.2021-02-12.tar.gz"
ner_predictor = Predictor.from_path(NER_MODEL_ARCHIVE)

# Add the NER output for the current sentences/tokens to whatever is already
# stored under 'entities' (in-place `+=`, so existing entries are kept).
nlp_dict['entities'] += add_json_ner_allennlp(
    sentences,
    ner_predictor,
    token_objs,
)

### Find Coreferences (AllenNLP)

In [None]:
from utils.nlp_allen import add_json_coref_allennlp

# Pretrained AllenNLP coreference model archive (fetched from this URL by Predictor.from_path).
COREF_MODEL_ARCHIVE = "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz"
coref_predictor = Predictor.from_path(COREF_MODEL_ARCHIVE)

# Unlike the SRL/NER layers, this assignment replaces any value previously
# stored under 'coreference'.
nlp_dict['coreference'] = add_json_coref_allennlp(
    sentences,
    coref_predictor,
    token_objs,
)

### Find Time Expressions (HeidelTime)

In [None]:
from python_heideltime import Heideltime
from utils.nlp_heideltime import add_json_heideltime
# HeidelTime settings used for this run.
HEIDELTIME_LANGUAGE = 'ENGLISH'
HEIDELTIME_DOCUMENT_TYPE = 'NARRATIVES'

# Configure the parser: language first, then document type.
heideltime_parser = Heideltime()
heideltime_parser.set_language(HEIDELTIME_LANGUAGE)
heideltime_parser.set_document_type(HEIDELTIME_DOCUMENT_TYPE)

# Runs on the whole `text` string (not on the per-sentence list) and replaces
# any value previously stored under 'time_expressions'.
nlp_dict['time_expressions'] = add_json_heideltime(
    text,
    heideltime_parser,
)

### Save File

In [None]:
import json

# Wrap the annotation layers in the response-style envelope used for the
# saved file: a status string plus the data payload.
intavia_dict = {
    'status': '200',
    'data': nlp_dict,
}

# Write through a context manager so the handle is flushed and closed, and pin
# the encoding: ensure_ascii=False emits raw non-ASCII characters, which would
# otherwise depend on (and may fail under) the platform default encoding.
with open(json_nlp_filename, "w", encoding="utf-8") as json_file:
    json.dump(intavia_dict, json_file, indent=2, ensure_ascii=False)