In [1]:
import numpy as np
import pandas as pd
import os
import json
import spacy
from itertools import chain, count
from collections import Counter
import re
from pprint import pprint

In [2]:
dat = pd.read_pickle("../data/elmondo_es_sp.pkl")

In [3]:
dat.head()

Unnamed: 0,headline,keyfacts,content,tags,time
http://www.elmundo.es//2016/08/31/57c6baad46163f610d8b45b3.html,"Rosberg: ""Estoy disfrutando mucho de la lucha ...",[El piloto germano alaba el gran ambiente de l...,El piloto alemán Nico Rosberg (Mercedes) esper...,"[fórmula 1, Nico Rosberg, Lewis Hamilton, depo...",2016-08-31 13:12:13
http://www.elmundo.es/america/2014/12/29/54a19293e2704e19708b458f.html,El estadio de David Beckham en Miami se esfuma,[\nEl futbolista quiere construirlo en una zon...,El sueño del astro David Beckham se ha aplazad...,[],2014-12-29
http://www.elmundo.es/andalucia/2014/01/03/52c69fe322601dd86c8b4576.html,Del Nido recogerá firmas en apoyo de su solici...,[ Voluntarios recorrerán los aledaños del Sánc...,"La mañana del domingo 5 de enero, unos 40 volu...",[],2014-01-03
http://www.elmundo.es/andalucia/2014/01/14/52d54c02268e3eb1318b4579.html,Garrido va pidiendo la cuenta,[ El entrenador del Betis critica a la directi...,"Nada más aterrizar Juan Carlos Garrido, en el ...",[],2014-01-14
http://www.elmundo.es/andalucia/2014/01/19/52dc119eca4741a5458b4570.html,El keniano Paul Tanui inscribe su nombre en el...,[ Triunfo para la etíope Hiwot Ayalew en la ca...,El keniano Paul Tanui ha inscrito este domingo...,[],2014-01-19


Check if there are any remaining articles without keyfacts:

In [4]:
(dat['keyfacts'].apply(len) == 0).sum()

0

Identify named entities:

In [5]:
nlp = spacy.load('es')
with open("../wordvecs/wiki.es/wiki.es.nospace.vec") as f:
    nlp.vocab.load_vectors(f)

In [6]:
nlp.pipeline

[<spacy.tagger.Tagger at 0x7fc9cded6ee8>,
 <spacy.pipeline.DependencyParser at 0x7fc9e6cda138>,
 <spacy.matcher.Matcher at 0x7fc9aa133128>,
 <spacy.pipeline.EntityRecognizer at 0x7fc9aad0af98>]

In [7]:
fact_entlist = dat['keyfacts'].apply(lambda l: [nlp(s).ents for s in l])

In [8]:
dat['content'] = dat['content'].apply(lambda x: re.sub(r'\[.*?]', '', x))

In [9]:
story_entlist = dat['content'].apply(lambda x: nlp(x).ents)

In [10]:
def random_id_generator(n):
    rand_list = np.random.choice(range(n), n, replace=False)
    i = 0
    while i < n:
        yield rand_list[i]
        i += 1

## Generate Questions

In [11]:
dat.shape

(11580, 5)

In [12]:
M = dat.shape[0]
pairs = []
# ent_id = count()
# num_id = count()
for i in range(M):
    if i % 500 == 0:
        print("generating question {0} to {1}...".format(i, i + 500))
    doc_ents = np.array(story_entlist[i])
    for j in range(len(fact_entlist[i])):
        if len(fact_entlist[i][j]) == 0:
            pass
        else:
            for ent in fact_entlist[i][j]:
                if not re.search(r'\w+', ent.text):
                    continue
                ent_id = random_id_generator(500)
                num_id = random_id_generator(500)
                try:
                    matches = ~pd.isnull([re.search(r'(?<!(?:\w|[-_]))' + ent.text + r'(?!(?:\w|[-_]))', x.text)
                                          for x in doc_ents])
                except:
                    continue
                if np.any(matches):
                    # remove all instances of ent (incl. partial matches) from the story entity list
                    nq_ents = doc_ents[~matches]
                    if ent.label_ in ['CARDINAL', 'ORDINAL']:
                        ent_type = 'number'
                        ent_iter = num_id
                        continue
                    else:
                        ent_type = 'entity'
                        ent_iter = ent_id
                    ind = next(ent_iter)
                    # replace entity in question
#                     question_text = dat['keyfacts'][i][j].replace(ent.text, '@placeholder')
                    question_text = re.sub(r'(?<!(?:\w|[-_]))' + ent.text + r'(?!(?:\w|[-_]))',
                                           '@placeholder', dat['keyfacts'][i][j])
                    # replace entity in text
#                     content_text = dat['content'][i].replace(ent.text, '@{0}{1}'.format(ent_type, ind))
                    content_text = re.sub(r'(?<!(?:\w|[-_]))' + ent.text + r'(?!(?:\w|[-_]))', 
                                          '@{0}{1}'.format(ent_type, ind), dat['content'][i])
                    ans = '@{0}{1}'.format(ent_type, ind)
                    # replace other instances of entities
                    for other_ent in nq_ents:
                        if not re.search(r'\w+', other_ent.text):
                            continue
                        if re.search(r"\(|\)", other_ent.text):
                            continue
                        if other_ent.label_ in ['CARDINAL', 'ORDINAL']:
                            ent_type = 'number'
                            ent_iter = num_id
                            continue
                        else:
                            ent_type = 'entity'
                            ent_iter = ent_id
                        ind = next(ent_iter)
#                         content_text = content_text.replace(other_ent.text, '@{0}{1}'.format(ent_type, ind))
                        try:
                            question_text = re.sub(r'(?<!(?:\w|[-_]))' + other_ent.text + r'(?!(?:\w|[-_]))', 
                                                   '@{0}{1}'.format(ent_type, ind), question_text)
                            content_text = re.sub(r'(?<!(?:\w|[-_]))' + other_ent.text + r'(?!(?:\w|[-_]))', 
                                                  '@{0}{1}'.format(ent_type, ind), content_text)
                        except:
                            print("skipped bad token {0}".format(other_ent.text))
                    pairs += [[question_text, ans, content_text]]

generating question 0 to 500...
generating question 500 to 1000...
generating question 1000 to 1500...
generating question 1500 to 2000...
generating question 2000 to 2500...
generating question 2500 to 3000...
generating question 3000 to 3500...
generating question 3500 to 4000...
generating question 4000 to 4500...
generating question 4500 to 5000...
generating question 5000 to 5500...
generating question 5500 to 6000...
generating question 6000 to 6500...
generating question 6500 to 7000...
skipped bad token Bruce [Fraser
skipped bad token Bruce [Fraser
generating question 7000 to 7500...
generating question 7500 to 8000...
generating question 8000 to 8500...
generating question 8500 to 9000...
generating question 9000 to 9500...
generating question 9500 to 10000...
generating question 10000 to 10500...
generating question 10500 to 11000...
generating question 11000 to 11500...
generating question 11500 to 12000...


In [13]:
df = pd.DataFrame(pairs, columns=["question", "answer", "story"])

In [14]:
df.head()

Unnamed: 0,question,answer,story
0,El piloto germano alaba el gran ambiente de lo...,@entity288,El piloto alemán @entity39 (@entity490) espera...
1,"@placeholder valora que sería ""increíble"" repe...",@entity414,El piloto alemán @entity459 (@entity266) esper...
2,Voluntarios recorrerán los aledaños del @plac...,@entity82,"La mañana del domingo 5 de enero, unos 40 volu..."
3,Ese día se disputará el partido @placeholder....,@entity145,"La mañana del domingo 5 de enero, unos 40 volu..."
4,Ese día se disputará el partido Sevilla F.C.-...,@entity163,"La mañana del domingo 5 de enero, unos 40 volu..."


In [15]:
df.shape

(23961, 3)

In [16]:
# df.to_csv("es_sample.csv")

In [17]:
df.to_pickle("../data/elmondo_es_qa_no_numbers_sp.pkl")