In [1]:
import numpy as np
import pandas as pd
import os
import json
import spacy
from itertools import chain, count
from collections import Counter
import re
from pprint import pprint

In [2]:
# Load the scraped articles. The preview below shows the columns
# headline, keyfacts, content, tags and time.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
dat = pd.read_pickle("../data/elmondo_es_eco.pkl")

In [3]:
# Preview the first rows to sanity-check the loaded columns.
dat.head()

Unnamed: 0,headline,keyfacts,content,tags,time
http://www.elmundo.es/america/2014/01/08/52cdabed268e3e892e8b458e.html,La Fed avisa de que no tiene marcada la senda ...,"[ Dependerá de la evolución de la economía, el...",La mayoría de los miembros del Comité Federal ...,[],2014-01-08
http://www.elmundo.es/america/2014/01/22/52dfe0eae2704ea74a8b4570.html,El Gobierno argentino sólo permitirá dos compr...,[ Cuando se supere el límite el comprador debe...,El Gobierno de Cristina Fernández de Kirchner ...,[],2014-01-22
http://www.elmundo.es/america/2014/01/26/52e52770e2704ecd598b4571.html,Marcha atrás en 48 horas del Gobierno argentin...,[ 'En la mentalidad argentina está insertado e...,"""Donde dije 'digo', digo 'Diego'"" parece procl...",[],2014-01-26
http://www.elmundo.es/america/2014/04/12/53492fd522601d410e8b456c.html,EEUU investiga a Herbalife por supuesta estafa...,[\nVende productos y suplementos dietéticos a ...,"La Comisión Federal del Comercio de EEUU (FTC,...",[],2014-04-12
http://www.elmundo.es/america/2014/04/21/535517cb268e3eb4218b457a.html,Argentina recurre al Supremo de EEUU contra lo...,[\nLos fondos compraron a precio de ganga bono...,Los ocho jueces del Tribunal Supremo de los Es...,[],2014-04-21


Check if there are any remaining articles without keyfacts:

In [4]:
# Count the articles whose keyfacts list is empty (expected to be 0 at this point).
dat['keyfacts'].apply(len).eq(0).sum()

0

Identify named entities:

In [5]:
# Load the 'es' spaCy model, then read word vectors from a text file into its vocabulary.
nlp = spacy.load('es')
# NOTE(review): the file is opened with the platform default text encoding — confirm the
# vectors file decodes correctly that way, or pass an explicit encoding.
with open("../wordvecs/wiki.es/wiki.es.nospace.vec") as f:
    nlp.vocab.load_vectors(f)

In [6]:
# Show the processing components of the loaded model (an EntityRecognizer must be present
# for the `.ents` lookups used below).
nlp.pipeline

[<spacy.tagger.Tagger at 0x7f4390515c60>,
 <spacy.pipeline.DependencyParser at 0x7f4388ec99f8>,
 <spacy.matcher.Matcher at 0x7f434b842898>,
 <spacy.pipeline.EntityRecognizer at 0x7f434c4bb8b8>]

In [7]:
def _entities_per_fact(facts):
    """Return, for each key-fact string in `facts`, the entities found by `nlp`."""
    return [nlp(fact).ents for fact in facts]


# One list per article; each item holds the entities of the key fact at the same position.
fact_entlist = dat['keyfacts'].apply(_entities_per_fact)

In [8]:
# Remove every "[...]" span (shortest match, not crossing newlines) from the article bodies.
_bracketed_span = re.compile(r'\[.*?]')
dat['content'] = dat['content'].apply(lambda body: _bracketed_span.sub('', body))

In [9]:
def _story_entities(body):
    """Return the entities `nlp` finds in one article body."""
    return nlp(body).ents


# One entity collection per article, aligned row-for-row with `dat`.
story_entlist = dat['content'].apply(_story_entities)

In [10]:
def random_id_generator(n):
    """Yield each integer in ``range(n)`` exactly once, in random order.

    The order is drawn from NumPy's global random state when the first value
    is requested (the function is a generator, so nothing runs before that).
    After ``n`` values the generator is exhausted.
    """
    shuffled_ids = np.random.choice(range(n), n, replace=False)
    for drawn_id in shuffled_ids:
        yield drawn_id

## Generate Questions

In [11]:
# Number of articles (rows) and columns available for question generation.
dat.shape

(8031, 5)

In [12]:
# Build cloze-style [question, answer, story] triples.
#
# For every entity found in a key fact that also occurs among the story's entities:
#   * the entity is replaced by '@placeholder' in the key fact (the question),
#   * the entity is replaced by a random '@entity<N>' marker in the story (the answer),
#   * every remaining story entity gets its own random '@entity<N>' marker in both texts.
# Entities labelled CARDINAL/ORDINAL are skipped entirely, so only '@entity' markers
# are ever produced.

ID_POOL_SIZE = 500       # ids per question are drawn without replacement from range(ID_POOL_SIZE)
PROGRESS_EVERY = 500     # print one progress line per this many articles
NUMERIC_LABELS = ('CARDINAL', 'ORDINAL')


def _standalone_pattern(text):
    """Compile a regex that matches `text` literally as a standalone token.

    `text` is escaped with re.escape, so regex metacharacters inside an entity
    (dots, parentheses, '+', ...) are matched verbatim instead of being
    interpreted or raising re.error. The lookarounds reject a match that is
    directly preceded or followed by a word character, '-' or '_'.
    """
    return re.compile(r'(?<!(?:\w|[-_]))' + re.escape(text) + r'(?!(?:\w|[-_]))')


M = dat.shape[0]
pairs = []
for i in range(M):
    if i % PROGRESS_EVERY == 0:
        # Clamp the upper bound so the last message does not overshoot the row count.
        print("generating question {0} to {1}...".format(i, min(i + PROGRESS_EVERY, M)))

    # Positional access: row i of every series, whatever labels the index carries.
    doc_ents = list(story_entlist.iloc[i])
    keyfacts = dat['keyfacts'].iloc[i]
    content = dat['content'].iloc[i]

    for j, fact_ents in enumerate(fact_entlist.iloc[i]):
        for ent in fact_ents:
            # Ignore entities without any word character, and numeric entities.
            if not re.search(r'\w+', ent.text):
                continue
            if ent.label_ in NUMERIC_LABELS:
                continue

            ent_pattern = _standalone_pattern(ent.text)
            # Story entities that contain the answer entity as a standalone token.
            matches = [ent_pattern.search(x.text) is not None for x in doc_ents]
            if not any(matches):
                continue  # the answer must occur among the story's entities

            # Remove all instances of ent (incl. partial matches) from the story entity list.
            nq_ents = [x for x, hit in zip(doc_ents, matches) if not hit]

            # Fresh id pool per question; ids are unique within one question/story pair.
            # next() raises StopIteration if more than ID_POOL_SIZE ids are requested.
            ent_id = random_id_generator(ID_POOL_SIZE)

            ans = '@entity{0}'.format(next(ent_id))
            question_text = ent_pattern.sub('@placeholder', keyfacts[j])
            content_text = ent_pattern.sub(ans, content)

            # Replace the other entities, in both the question and the story.
            for other_ent in nq_ents:
                if not re.search(r'\w+', other_ent.text):
                    continue
                # Entity texts containing parentheses are skipped (left unreplaced).
                if re.search(r"\(|\)", other_ent.text):
                    continue
                if other_ent.label_ in NUMERIC_LABELS:
                    continue
                marker = '@entity{0}'.format(next(ent_id))
                other_pattern = _standalone_pattern(other_ent.text)
                question_text = other_pattern.sub(marker, question_text)
                content_text = other_pattern.sub(marker, content_text)

            pairs.append([question_text, ans, content_text])

generating question 0 to 500...
generating question 500 to 1000...
generating question 1000 to 1500...
generating question 1500 to 2000...
generating question 2000 to 2500...
generating question 2500 to 3000...
generating question 3000 to 3500...
generating question 3500 to 4000...
generating question 4000 to 4500...
generating question 4500 to 5000...
generating question 5000 to 5500...
generating question 5500 to 6000...
generating question 6000 to 6500...
generating question 6500 to 7000...
generating question 7000 to 7500...
generating question 7500 to 8000...
generating question 8000 to 8500...


In [13]:
# Collect the generated [question, answer, story] triples into a DataFrame, one row per question.
df = pd.DataFrame(pairs, columns=["question", "answer", "story"])

In [14]:
# Inspect a few generated triples: the answer should appear as an @entity marker in the story.
df.head()

Unnamed: 0,question,answer,story
0,'En la mentalidad argentina está insertado el...,@entity115,"""Donde dije 'digo', digo '@entity249'"" parece ..."
1,"\nTras confirmar la empresa la investigación, ...",@entity271,"La @entity179 de @entity436 (@entity394, en su..."
2,\nLos fondos compraron a precio de ganga bonos...,@entity256,Los ocho jueces del @entity321 @entity299 reci...
3,\nRechazaron la posterior reestructuración de ...,@entity423,Los ocho jueces del @entity375 @entity395 reci...
4,\nLa sanción se debe a la complicidad de la en...,@entity36,"El @entity232, @entity327, el mayor banco de @..."


In [15]:
# Number of generated questions (rows).
df.shape

(9378, 3)

In [16]:
# df.to_csv("es_sample.csv")

In [17]:
# Persist the question/answer/story table; numeric (CARDINAL/ORDINAL) entities were
# excluded during generation, hence "no_numbers" in the file name.
df.to_pickle("../data/elmondo_es_qa_no_numbers_eco.pkl")