In [1]:
import numpy as np
import pandas as pd
import os
import json
import spacy
from itertools import chain, count
from collections import Counter
import re
from pprint import pprint

In [2]:
dat = pd.read_pickle("../data/elmondo_es.pkl")

In [3]:
dat.head()

Unnamed: 0,headline,keyfacts,content,tags,time
http://www.elmundo.es/america/2014/01/02/52c4d39a22601d6f658b457c.html,Una jueza del Tribunal Supremo suspende parcia...,[ Decidió atender a los grupos conservadores y...,"El martes, antes de presidir la fiesta de fin ...",[],2014-01-02
http://www.elmundo.es/america/2014/01/02/52c4d99622601d6d658b458a.html,'La revolución cubana sigue sin compromisos co...,[ 'Jamás hemos cedido ni cederemos ante agresi...,El presidente Raúl Castro reveló que se está i...,[],2014-01-02
http://www.elmundo.es/america/2014/01/03/52c61ede268e3e3c528b456b.html,La NSA trabaja en un ordenador cuántico capaz ...,[ La información proviene de los documentos de...,La Agencia de Seguridad Nacional (NSA) trabaja...,[],2014-01-03
http://www.elmundo.es/america/2014/01/10/52cfbb62ca47415a218b456b.html,Último adiós a la ex Miss Venezuela Mónica Spe...,[ Mónica Spear y su marido fueron asesinados e...,Esta semana Venezuela ha recibido una noticia ...,[],2014-01-10
http://www.elmundo.es/america/2014/01/14/52d4b8ba268e3eb2318b456a.html,Michoacán pone en jaque al Gobierno de Peña Nieto,[ El Gobierno envía más policías y militares y...,La situación en el Estado mexicano de Michoacá...,[],2014-01-14


Check if there are any remaining articles without keyfacts:

In [4]:
(dat['keyfacts'].apply(len) == 0).sum()

0

Identify named entities:

In [5]:
nlp = spacy.load('es')
with open("../wordvecs/wiki.es/wiki.es.nospace.vec") as f:
    nlp.vocab.load_vectors(f)

In [6]:
nlp.pipeline

[<spacy.tagger.Tagger at 0x7f278b5d2090>,
 <spacy.pipeline.DependencyParser at 0x7f278382be08>,
 <spacy.matcher.Matcher at 0x7f2746e9e7b8>,
 <spacy.pipeline.EntityRecognizer at 0x7f2747a91cc8>]

In [7]:
fact_entlist = dat['keyfacts'].apply(lambda l: [nlp(s).ents for s in l])

(remove square bracket links)

In [8]:
re.sub(r'\[.*?]', '', 'y de nuestro amigo el gobernador del estado [Fausto Vallejo].')

'y de nuestro amigo el gobernador del estado .'

In [9]:
dat['content'] = dat['content'].apply(lambda x: re.sub(r'\[.*?]', '', x))

In [10]:
story_entlist = dat['content'].apply(lambda x: nlp(x).ents)

In [11]:
def random_id_generator(n):
    rand_list = np.random.choice(range(n), n, replace=False)
    i = 0
    while i < n:
        yield rand_list[i]
        i += 1

Maximum entity id is 584 in rc-data.

## Testing Question Generation

In [19]:
pairs = []
for i in range(5):
    doc_ents = np.array(story_entlist[i])
    for j in range(len(fact_entlist[i])):
        if len(fact_entlist[i][j]) == 0:
            pass
        else:
            for ent in fact_entlist[i][j]:
                if not re.search(r'\w+', ent.text):
                    continue
                ent_id = random_id_generator(100)
                num_id = random_id_generator(100)
                matches = ~pd.isnull([re.search(r'(?<!(?:\w|[-_]))' + ent.text + r'(?!(?:\w|[-_]))', x.text) 
                                      for x in doc_ents])
                if np.any(matches):
                    # remove all instances of ent (incl. partial matches) from the story entity list
                    nq_ents = doc_ents[~matches]
                    if ent.label_ in ['CARDINAL', 'ORDINAL']:
                        ent_type = 'number'
                        ent_iter = num_id
                        # skip numbers
                        continue
                    else:
                        ent_type = 'entity'
                        ent_iter = ent_id
                    ind = next(ent_iter)
                    # replace entity in question
#                     question_text = dat['keyfacts'][i][j].replace(ent.text, '@placeholder')
                    question_text = re.sub(r'(?<!(?:\w|[-_]))' + ent.text + r'(?!(?:\w|[-_]))',
                                           '@placeholder', dat['keyfacts'][i][j])
                    # replace entity in text
#                     content_text = dat['content'][i].replace(ent.text, '@{0}{1}'.format(ent_type, ind))
                    content_text = re.sub(r'(?<!(?:\w|[-_]))' + ent.text + r'(?!(?:\w|[-_]))', 
                                          '@{0}{1}'.format(ent_type, ind), dat['content'][i])
                    ans = '@{0}{1}'.format(ent_type, ind)
                    # replace other instances of entities
                    for other_ent in nq_ents:
                        if not re.search(r'\w+', other_ent.text):
                            continue
                        if other_ent.label_ in ['CARDINAL', 'ORDINAL']:
                            ent_type = 'number'
                            ent_iter = num_id
                            continue
                        else:
                            ent_type = 'entity'
                            ent_iter = ent_id
                        ind = next(ent_iter)
#                         content_text = content_text.replace(other_ent.text, '@{0}{1}'.format(ent_type, ind))
                        question_text = re.sub(r'(?<!(?:\w|[-_]))' + other_ent.text + r'(?!(?:\w|[-_]))', 
                                               '@{0}{1}'.format(ent_type, ind), question_text)
                        content_text = re.sub(r'(?<!(?:\w|[-_]))' + other_ent.text + r'(?!(?:\w|[-_]))', 
                                              '@{0}{1}'.format(ent_type, ind), content_text)
                    print(question_text + '?')
                    print('...')
                    print(ans)
                    print('...')
                    print(content_text)
                    print('--------------------')

 La información proviene de los documentos del ex analista @placeholder ?
...
@entity28
...
La @entity38 (@entity37) trabaja en la construcción de un ordenador cuántico que puede descifrar cualquier contraseña, incluso las de más alta seguridad, según reveló hoy en exclusiva el diario '@entity4', a partir de los documentos del ex analista de la @entity37 @entity28.
El desarrollo de la computación cuántica es un objetivo que persigue desde hace años la comunidad científica y en el que la @entity37, la @entity54 y @entity81 han hecho importantes avances en la última década.
Un ordenador cuántico es mucho más rápido que uno común, tanto que es capaz de descifrar todas las formas de codificación, incluso las de más alta seguridad que se emplean para proteger secretos de Estado, transacciones financieras, e información médica y de negocios.
@entity6 los documentos proporcionados por @entity27, los trabajos de la @entity37 para construir un ordenador cuántico forman parte de un programa de i

## Generate Questions

In [20]:
dat.shape

(15022, 5)

In [21]:
M = dat.shape[0]
pairs = []
# ent_id = count()
# num_id = count()
for i in range(M):
    if i % 500 == 0:
        print("generating question {0} to {1}...".format(i, i + 500))
    doc_ents = np.array(story_entlist[i])
    for j in range(len(fact_entlist[i])):
        if len(fact_entlist[i][j]) == 0:
            pass
        else:
            for ent in fact_entlist[i][j]:
                if not re.search(r'\w+', ent.text):
                    continue
                ent_id = random_id_generator(500)
                num_id = random_id_generator(500)
                try:
                    matches = ~pd.isnull([re.search(r'(?<!(?:\w|[-_]))' + ent.text + r'(?!(?:\w|[-_]))', x.text)
                                          for x in doc_ents])
                except:
                    continue
                if np.any(matches):
                    # remove all instances of ent (incl. partial matches) from the story entity list
                    nq_ents = doc_ents[~matches]
                    if ent.label_ in ['CARDINAL', 'ORDINAL']:
                        ent_type = 'number'
                        ent_iter = num_id
                        continue
                    else:
                        ent_type = 'entity'
                        ent_iter = ent_id
                    ind = next(ent_iter)
                    # replace entity in question
#                     question_text = dat['keyfacts'][i][j].replace(ent.text, '@placeholder')
                    question_text = re.sub(r'(?<!(?:\w|[-_]))' + ent.text + r'(?!(?:\w|[-_]))',
                                           '@placeholder', dat['keyfacts'][i][j])
                    # replace entity in text
#                     content_text = dat['content'][i].replace(ent.text, '@{0}{1}'.format(ent_type, ind))
                    content_text = re.sub(r'(?<!(?:\w|[-_]))' + ent.text + r'(?!(?:\w|[-_]))', 
                                          '@{0}{1}'.format(ent_type, ind), dat['content'][i])
                    ans = '@{0}{1}'.format(ent_type, ind)
                    # replace other instances of entities
                    for other_ent in nq_ents:
                        if not re.search(r'\w+', other_ent.text):
                            continue
                        if re.search(r"\(|\)", other_ent.text):
                            continue
                        if other_ent.label_ in ['CARDINAL', 'ORDINAL']:
                            ent_type = 'number'
                            ent_iter = num_id
                            continue
                        else:
                            ent_type = 'entity'
                            ent_iter = ent_id
                        ind = next(ent_iter)
#                         content_text = content_text.replace(other_ent.text, '@{0}{1}'.format(ent_type, ind))
                        try:
                            question_text = re.sub(r'(?<!(?:\w|[-_]))' + other_ent.text + r'(?!(?:\w|[-_]))', 
                                                   '@{0}{1}'.format(ent_type, ind), question_text)
                            content_text = re.sub(r'(?<!(?:\w|[-_]))' + other_ent.text + r'(?!(?:\w|[-_]))', 
                                                  '@{0}{1}'.format(ent_type, ind), content_text)
                        except:
                            print("skipped bad token {0}".format(other_ent.text))
                    pairs += [[question_text, ans, content_text]]

generating question 0 to 500...
generating question 500 to 1000...
generating question 1000 to 1500...
generating question 1500 to 2000...
generating question 2000 to 2500...
generating question 2500 to 3000...
generating question 3000 to 3500...
generating question 3500 to 4000...
generating question 4000 to 4500...
generating question 4500 to 5000...
generating question 5000 to 5500...
generating question 5500 to 6000...
generating question 6000 to 6500...
generating question 6500 to 7000...
generating question 7000 to 7500...
generating question 7500 to 8000...
generating question 8000 to 8500...
generating question 8500 to 9000...
generating question 9000 to 9500...
generating question 9500 to 10000...
generating question 10000 to 10500...
generating question 10500 to 11000...
generating question 11000 to 11500...
generating question 11500 to 12000...
generating question 12000 to 12500...
generating question 12500 to 13000...
generating question 13000 to 13500...
generating questio

In [22]:
df = pd.DataFrame(pairs, columns=["question", "answer", "story"])

In [23]:
df.head()

Unnamed: 0,question,answer,story
0,La información proviene de los documentos del...,@entity318,La @entity57 (@entity187) trabaja en la constr...
1,Han detenido a 7 personas intregrantes de la ...,@entity299,Esta semana @entity170 ha recibido una noticia...
2,@placeholder tiene una gran tasa de criminali...,@entity223,Esta semana @entity223 ha recibido una noticia...
3,Los civiles armados se niegan y se registran ...,@entity249,La situación en el @entity163 de @entity179 es...
4,"Su líder moral, el dr. @placeholder, dice en ...",@entity420,La situación en el @entity47 de @entity355 es ...


In [24]:
df.shape

(23786, 3)

In [25]:
# df.to_csv("es_sample.csv")

In [26]:
df.to_pickle("../data/elmondo_es_qa_no_numbers.pkl")