In [1]:
import wikipediaapi
import nltk
import pickle 
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
import tensorflow 

# nltk.download('punkt') # at first run
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

wiki_wiki = wikipediaapi.Wikipedia(
        language='en',
        extract_format=wikipediaapi.ExtractFormat.WIKI
)

Using TensorFlow backend.


In [17]:
def construct_instance_reasons(statement, section_dict_path, vocab_w2v_path, max_len=-1):
    # Load the vocabulary
    vocab_w2v = pickle.load(open(vocab_w2v_path, 'rb'),encoding='latin1')

    # load the section dictionary.
    section_dict = pickle.load(open(section_dict_path, 'rb'))

    # construct the training data
    X = []
    sections = []
    y = []
    outstring=[]
    
    X_inst = []

    for word in statement:
        if max_len != -1 and len(X_inst) >= max_len:
            continue
        if word not in vocab_w2v:
            X_inst.append(vocab_w2v['UNK'])
        else:
            X_inst.append(vocab_w2v[word])

    # extract the section, and in case the section does not exist in the model, then assign UNK
    section = statement.strip().lower()
    sections.append(np.array([section_dict[section] if section in section_dict else 0]))

    X.append(X_inst)
    outstring.append(statement)
    #entity_id  revision_id timestamp   entity_title    section_id  section prg_idx sentence_idx    statement   citations

    X = pad_sequences(X, maxlen=max_len, value=vocab_w2v['UNK'], padding='pre')

    return X, np.array(sections), outstring

In [48]:
def get_title_text(section, title):
    if len(section.sections) == 0:
        title.append(section.title)
        if section.text != '':
            return [(title,phrase) for phrase in sent_detector.tokenize(section.text.strip())]
        else:
            return []
    
    result = []
    title.append(section.title)
    for subsection in section.sections:
        t = title.copy()
        response = get_title_text(subsection, t)
        if len(response) > 0:
            result += response
    
    return result
    

In [60]:
# p_wiki = wiki_wiki.page("RM (rapper)")
# f_wiki = wiki_wiki.page("Fungus")
# path_model = "models/fa_en_model_rnn_attention_section.h5"
# model = load_model(path_model)
h_wiki =  wiki_wiki.page("Hesse")
phrases_by_section = get_title_text(h_wiki,[])

In [61]:
predictions_text = []
max_seq_length = model.input[0].shape[1].value

for section, phrase in phrases_by_section:
    X, sections, outstring = construct_instance_reasons(phrase, "embeddings/word_dict_en.pck", "embeddings/section_dict_en.pck", max_seq_length)
    pred = model.predict([X, sections])
    predictions_text.append((pred, phrase))

In [55]:
construct_instance_reasons(phrases_by_section[98][1], "embeddings/word_dict_en.pck", "embeddings/section_dict_en.pck", max_seq_length)

(array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32),
 array([[0]]),
 ['Compatible haploid hyphae fuse to produce a dikaryotic mycelium.'])

In [58]:
construct_instance_reasons(phrases_by_section[60][1], "embeddings/word_dict_en.pck", "embeddings/section_dict_en.pck", max_seq_length)

(array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32),
 array([[0]]),
 ['The track "Seoul" was produced by British electropop duo Honne.'])

In [67]:
predictions_text.sort(key=lambda tup: tup[0][0][0]) 