### Find Wikipedia Article

In [24]:
import os
from utils_wiki import get_wikipedia_article, save_wikipedia_page, extract_sections

# Person whose Wikipedia article is retrieved and processed by this notebook.
person_name = "Octavio Paz"

# Set to True to re-download the article even when a local copy already exists.
# (The original condition was hard-wired with `or True`, which made the cached
# branch below unreachable and forced a network query on every run.)
FORCE_DOWNLOAD = False

# Create the output folders; exist_ok avoids the separate existence check.
os.makedirs("data/wikipedia", exist_ok=True)
os.makedirs("data/json", exist_ok=True)

# Both output files share the same stem: lower-cased name with underscores.
file_stem = person_name.replace(' ', '_').lower()
text_filename = f"data/wikipedia/{file_stem}.txt"
json_nlp_filename = f"data/json/{file_stem}.json"

if FORCE_DOWNLOAD or not os.path.exists(text_filename):
    wiki_page = get_wikipedia_article(person_name)
    if wiki_page:
        print(f"Found a Page: {wiki_page.title}")
        text = wiki_page.content
        section_dict = extract_sections(text)
        save_wikipedia_page(wiki_page, text_filename, include_metadata=True, section_dict=section_dict)
    else:
        # NOTE(review): nothing is written in this case, so later cells that
        # open `text_filename` will fail unless a previous run saved the file.
        print(f"Query Failed! Couldn't find {person_name}")
else:
    # Reuse the locally saved copy; the context manager closes the file handle.
    with open(text_filename) as cached_file:
        text = cached_file.read()

Options: {'For Octavio Paz', 'Juana Inés de la Cruz', 'Octavio Paz'}
Ordered Options Compund Metric: [RankedArticle(wikipage_title='Octavio Paz', queried_name='Octavio Paz', lev_similarity=1.0, token_overlap=1.0, dates_confidence=-1), RankedArticle(wikipage_title='For Octavio Paz', queried_name='Octavio Paz', lev_similarity=0.8461538461538461, token_overlap=0.6666666666666666, dates_confidence=-1)]

Retrieving page for Octavio Paz
Wiki Life Data = (1914 - 1998)
Page Chosen! Confidence Score = 1
Found a Page: Octavio Paz


### Clean & Pre-process Text

In [25]:
import utils_nlp_common as unlp
import spacy
from spacy import __version__ as spacy_version


# Read the saved article; only the file read needs the open handle.
with open(text_filename) as article_file:
    raw_text = article_file.read()

# Only the first 1000 characters of the saved file are cleaned and processed.
text = unlp.preprocess_and_clean_text(raw_text[:1000])
nlp_dict, is_from_file = unlp.create_nlp_template(text, filepath=json_nlp_filename)

# The spaCy pipeline is needed in both cases, so it is loaded once here.
spacy_model = "en_core_web_sm"
nlp = spacy.load(spacy_model)

if is_from_file:
    # An existing JSON file was found: process the text stored in it instead.
    text = nlp_dict['text']

spacy_dict = unlp.run_spacy(text, nlp)

if not is_from_file:
    # Fresh template: store the basic spaCy layers under a key that records
    # the model name and the installed spaCy version.
    spacy_key = f'spacy_{spacy_model}_{spacy_version}'
    nlp_dict['tokenization'] = {spacy_key: spacy_dict['tokens']}
    nlp_dict['morphology'] = {spacy_key: unlp.add_morphosyntax(spacy_dict['token_objs'])}

print(text[:100])
print(nlp_dict.keys())



Octavio Paz Lozano (March 31, 1914 – April 19, 1998) was a Mexican poet and diplomat. For his body o
dict_keys(['text', 'tokenization', 'morphology', 'entities', 'time_expressions', 'semantic_roles', 'coreference'])


### Find Semantic Roles

In [26]:
from utils_nlp_allen import add_json_srl_allennlp
from allennlp.predictors import Predictor

# Pretrained semantic-role-labeling model archive loaded through AllenNLP.
SRL_MODEL_URL = "https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz"
srl_predictor = Predictor.from_path(SRL_MODEL_URL)

# Add the SRL output for the spaCy sentences to the existing 'semantic_roles' value.
# NOTE(review): `+=` extends whatever is already stored, so re-running this cell
# (or starting from a JSON file that already holds roles) adds the results again.
nlp_dict['semantic_roles'] += add_json_srl_allennlp(
    spacy_dict['sentences'],
    srl_predictor,
    spacy_dict['token_objs'],
)

2023-06-16 15:18:20,559 - INFO - allennlp.common.plugins - Plugin allennlp_models available
2023-06-16 15:18:20,762 - INFO - cached_path - cache of https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz is up-to-date
2023-06-16 15:18:20,763 - INFO - allennlp.models.archival - loading archive file https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz from cache at /Users/daza/.allennlp/cache/b5f1db011cc85691a5fa2bf29e055a712261a2e5d74a74edd7da2fffc98d4ab8.4c4ac7e06ec3d85631bd26b839f90b5a375d3ceeb43e3c74f1cf4758dcee2bb3
2023-06-16 15:18:20,765 - INFO - allennlp.models.archival - extracting archive file /Users/daza/.allennlp/cache/b5f1db011cc85691a5fa2bf29e055a712261a2e5d74a74edd7da2fffc98d4ab8.4c4ac7e06ec3d85631bd26b839f90b5a375d3ceeb43e3c74f1cf4758dcee2bb3 to temp dir /var/folders/dx/h319cjln6sz2cjsd0t05zgjr0000gn/T/tmp4mcouxuc
2023-06-16 15:18:24,136 - INFO - allennlp.common.params - datase

### Find Named Entities (AllenNLP)

In [27]:
from utils_nlp_allen import add_json_ner_allennlp

# Pretrained named-entity-recognition model archive loaded through AllenNLP.
NER_MODEL_URL = "https://storage.googleapis.com/allennlp-public-models/ner-elmo.2021-02-12.tar.gz"
ner_predictor = Predictor.from_path(NER_MODEL_URL)

# Add the NER output for the spaCy sentences to the existing 'entities' value.
# NOTE(review): `+=` extends whatever is already stored, so re-running this cell
# (or starting from a JSON file that already holds entities) adds the results again.
nlp_dict['entities'] += add_json_ner_allennlp(
    spacy_dict['sentences'],
    ner_predictor,
    spacy_dict['token_objs'],
)

2023-06-16 15:23:50,467 - INFO - allennlp.common.plugins - Plugin allennlp_models available
2023-06-16 15:23:50,656 - INFO - cached_path - cache of https://storage.googleapis.com/allennlp-public-models/ner-elmo.2021-02-12.tar.gz is up-to-date
2023-06-16 15:23:50,657 - INFO - allennlp.models.archival - loading archive file https://storage.googleapis.com/allennlp-public-models/ner-elmo.2021-02-12.tar.gz from cache at /Users/daza/.allennlp/cache/898f78f54a9a02cabe98c9bc232c83f1728f7e69281bb8842bf3829e5c07bfff.7efc10a5d4b0bbe3c88362e2c230f7c72f6e463fddd5b37e55d9f3a44358c303
2023-06-16 15:23:50,658 - INFO - allennlp.models.archival - extracting archive file /Users/daza/.allennlp/cache/898f78f54a9a02cabe98c9bc232c83f1728f7e69281bb8842bf3829e5c07bfff.7efc10a5d4b0bbe3c88362e2c230f7c72f6e463fddd5b37e55d9f3a44358c303 to temp dir /var/folders/dx/h319cjln6sz2cjsd0t05zgjr0000gn/T/tmp9z9vkjig
2023-06-16 15:23:53,635 - INFO - allennlp.common.params - dataset_reader.type = conll2003
2023-06-16 15:23:5

### Find Coreferences (AllenNLP)

In [28]:
from utils_nlp_allen import add_json_coref_allennlp

# Pretrained coreference-resolution model archive loaded through AllenNLP.
COREF_MODEL_URL = "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz"
coref_predictor = Predictor.from_path(COREF_MODEL_URL)

# Run coreference resolution over the spaCy sentences and replace the
# 'coreference' entry with the result.
coref_annotations = add_json_coref_allennlp(
    spacy_dict['sentences'],
    coref_predictor,
    spacy_dict['token_objs'],
)
nlp_dict['coreference'] = coref_annotations

2023-06-16 15:24:29,501 - INFO - allennlp.common.plugins - Plugin allennlp_models available
2023-06-16 15:24:29,654 - INFO - cached_path - cache of https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz is up-to-date
2023-06-16 15:24:29,655 - INFO - allennlp.models.archival - loading archive file https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz from cache at /Users/daza/.allennlp/cache/038f918d294bd1a45e3709dfb22af5277b0be8677f750a85748c39979ce0e549.b897bfe76a04a5f70d6e88762a4d819b4b8b90e45b31b8314e0a6a9630d3f213
2023-06-16 15:24:29,657 - INFO - allennlp.models.archival - extracting archive file /Users/daza/.allennlp/cache/038f918d294bd1a45e3709dfb22af5277b0be8677f750a85748c39979ce0e549.b897bfe76a04a5f70d6e88762a4d819b4b8b90e45b31b8314e0a6a9630d3f213 to temp dir /var/folders/dx/h319cjln6sz2cjsd0t05zgjr0000gn/T/tmpjm82h9le
2023-06-16 15:24:40,601 - INFO - allennlp.common.params - dataset_reader.type = core

### Find Time Expressions (HeidelTime)

In [29]:
from python_heideltime import Heideltime
from utils_nlp_heideltime import add_json_heideltime
# Build the HeidelTime parser and configure it for English narrative text.
heideltime_parser = Heideltime()
heideltime_parser.set_document_type('NARRATIVES')
heideltime_parser.set_language('ENGLISH')

# Tag the time expressions in the processed text and replace the
# 'time_expressions' entry with the result.
time_expressions = add_json_heideltime(text, heideltime_parser)
nlp_dict['time_expressions'] = time_expressions

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### Save File

In [30]:
import json

# Wrap the collected annotations in the response envelope that gets saved:
# a status code string plus the NLP dictionary under 'data'.
intavia_dict = {
    'status': '200',
    'data': nlp_dict,
}

# Write through a context manager so the file is flushed and closed as soon as
# the dump finishes (the original passed a bare open() handle that was never closed).
# NOTE(review): ensure_ascii=False writes non-ASCII characters verbatim using the
# platform default encoding; confirm that matches how the file is read back.
with open(json_nlp_filename, "w") as json_file:
    json.dump(intavia_dict, json_file, indent=2, ensure_ascii=False)