In [1]:
import numpy as np
import pandas as pd
import spacy
import re
from itertools import chain

In [2]:
# Read the three pickled question/answer frames in one pass.
# NOTE: unpickling can execute arbitrary code, so these files must come from a trusted source.
dat1, dat2, dat3 = (
    pd.read_pickle(pkl_path)
    for pkl_path in (
        "../data/elmondo_es_qa_no_numbers.pkl",
        "../data/elmondo_es_qa_no_numbers_eco.pkl",
        "../data/elmondo_es_qa_no_numbers_sp.pkl",
    )
)

In [3]:
# Stack the three frames row-wise and discard the old row labels in favour of
# a fresh 0..n-1 index.
dat = pd.concat([dat1, dat2, dat3], axis=0).reset_index(drop=True)

In [4]:
dat.head()

Unnamed: 0,question,answer,story
0,La información proviene de los documentos del...,@entity176,La @entity268 (@entity284) trabaja en la const...
1,Han detenido a 7 personas intregrantes de la ...,@entity239,Esta semana @entity404 ha recibido una noticia...
2,@placeholder tiene una gran tasa de criminali...,@entity159,Esta semana @entity159 ha recibido una noticia...
3,Los civiles armados se niegan y se registran ...,@entity90,La situación en el @entity287 de @entity341 es...
4,"Su líder moral, el dr. @placeholder, dice en ...",@entity263,La situación en el @entity280 de @entity349 es...


In [5]:
dat.shape

(57416, 3)

In [6]:
# Load the spaCy pipeline registered under the "es" shortcut name
# (presumably the Spanish model -- confirm it is installed/linked in this environment).
nlp = spacy.load("es")

In [7]:
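# One-off preprocessing, kept for reference only (not executed):
# read the raw word-vector file, discarding its first line.
# with open("../wordvecs/wiki.es/wiki.es.vec") as f:
#     f.readline()
#     lines = f.readlines()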

In [8]:
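# One-off preprocessing, kept for reference only (not executed):
# write out only the vector lines that contain none of the listed Unicode
# space/separator characters, producing the "nospace" file loaded below.
# with open("../wordvecs/wiki.es/wiki.es.nospace.vec", "w") as f:
#     for line in lines:
#         if not re.search(r'[\u00A0\u1680\u180e\u2000-\u2009\u200a\u200b\u202f\u205f\u3000\u2028]', line):
#             f.write(line)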

In [9]:
# Load the filtered word-vector file into the pipeline's vocabulary; the
# `.vector` lookups further down rely on these vectors being present.
with open("../wordvecs/wiki.es/wiki.es.nospace.vec") as f:
    nlp.vocab.load_vectors(f)

In [27]:
dat['question'][0]

' La información proviene de los documentos del ex analista @placeholder '

In [47]:
# Smoke test: run the pipeline on the first question so its token count and
# vector shape can be inspected in the next cells.
temp = nlp(dat['question'][0])

In [48]:
len(temp)

11

In [50]:
temp.vector.shape

(300,)

Find the maximum question and story length

In [64]:
def _token_count(text):
    """Return how many tokens the pipeline yields for ``text``.

    The pipeline is called with ``parse``, ``tag`` and ``entity`` all set to
    False, exactly as everywhere else in this notebook.
    """
    return len(nlp(text, parse=False, tag=False, entity=False))


# Token count of every question, aligned with the rows of `dat`.
q_lengths = dat['question'].apply(_token_count)

Cap question length at 50

In [65]:
(q_lengths > 50).sum()

68

In [75]:
# Keep only the rows whose question is at most 50 tokens long.
selected = dat.loc[q_lengths.le(50)]

In [76]:
selected.shape

(57348, 3)

In [78]:
# Token count of every remaining story, aligned with the rows of `selected`.
s_lengths = selected['story'].map(
    lambda story: len(nlp(story, parse=False, tag=False, entity=False))
)

In [79]:
s_lengths.max()

6438

In [80]:
s_lengths.mean()

656.17371137615953

In [81]:
s_lengths.shape

(57348,)

In [82]:
(s_lengths > 2000).sum()

260

Cap story length at 2000

In [83]:
# Keep only the rows whose story is at most 2000 tokens long.
selected = selected.loc[s_lengths.le(2000)]

In [84]:
selected.shape

(57088, 3)

Sort stories by length

In [86]:
# Reuse the per-story token counts already computed in `s_lengths` instead of
# tokenizing every story a second time (the expensive step). `.assign` returns
# a new frame, which also avoids the SettingWithCopyWarning that writing a
# column into a filtered slice triggers.
selected = selected.assign(story_length=s_lengths.loc[selected.index])

In [88]:
# Order the examples longest story first. Rebinding the name (rather than
# sorting with inplace=True) keeps the cell idempotent and avoids mutating a
# frame that was obtained by filtering another one.
selected = selected.sort_values("story_length", ascending=False)

In [89]:
selected.head()

Unnamed: 0,question,answer,story,story_length
6571,El estudiante de EEUU detenido en @placeholder...,@entity457,"En @entity457 se aprende a llorar en silencio,...",2000
19317,CAPÍTULO1: La segunda ciudad de @placeholder d...,@entity97,"Según su propio relato, @entity404 tenía 6 año...",2000
19316,La exhibición que recuerda la masacre de @enti...,@entity396,"Según su propio relato, @entity439 tenía 6 año...",2000
6570,VIAJE AL REINO HERMÉTICO: Una española en @pla...,@entity435,"En @entity344 se aprende a llorar en silencio,...",2000
8186,Volver a nacer dos veces en un mismo siglo en ...,@entity200,Si se callejea por el casco antiguo de @entity...,1998


In [90]:
# Persist the filtered, length-sorted frame; it is read back at the start of
# the batch-generation part below.
selected.to_pickle("../data/compiled_es_qa.pkl")

Batch generation

In [2]:
# Reload the compiled frame written by the `selected.to_pickle(...)` cell above.
# NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
selected = pd.read_pickle("../data/compiled_es_qa.pkl")

In [8]:
# Re-create the spaCy pipeline for this part of the notebook (the execution
# counts restart here, which suggests a fresh kernel -- confirm).
nlp = spacy.load("es")

In [4]:
# Load the filtered word vectors into the freshly created pipeline's vocabulary.
with open("../wordvecs/wiki.es/wiki.es.nospace.vec") as f:
    nlp.vocab.load_vectors(f)

In [5]:
# Hold out 10% of the examples for dev, then 10% of the remainder for test;
# whatever is left is the training set. The fixed random_state keeps the split
# reproducible, and both held-out splits are ordered longest story first.
dev = selected.sample(frac=0.1, random_state=7777, replace=False).sort_values("story_length", ascending=False)
# Index.isin replaces np.in1d, which NumPy 2.0 deprecates in favour of np.isin.
rest = selected[~selected.index.isin(dev.index)]
test = rest.sample(frac=0.1, random_state=7777, replace=False).sort_values("story_length", ascending=False)
train = rest[~rest.index.isin(test.index)]

# The three splits must partition `selected`: no example lost, none duplicated.
assert len(train) + len(dev) + len(test) == len(selected), "train/dev/test do not partition the data"

In [6]:
dev.shape

(5709, 4)

In [7]:
test.shape

(5138, 4)

In [8]:
train.shape

(46241, 4)

In [9]:
# Write each split to its own pickle under ../input_data/.
train.to_pickle("../input_data/train_es.pkl")
dev.to_pickle("../input_data/dev_es.pkl")
test.to_pickle("../input_data/test_es.pkl")

In [2]:
# Reload the training split written by `train.to_pickle(...)` above.
# NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
train = pd.read_pickle("../input_data/train_es.pkl")

In [3]:
import torch.utils.data as tud

In [4]:
class QADataset(tud.Dataset):
    """Cloze-style QA dataset that turns data-frame rows into fixed-size padded arrays.

    Every row of ``data_df`` must provide ``'story'``, ``'question'`` and
    ``'answer'`` columns. Texts are lower-cased and tokenized, and each token
    is replaced by its word vector. Sequences are zero-padded (and truncated
    if they are too long) to a fixed length, so the default ``DataLoader``
    collation can stack the items into batches.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame holding the ``'story'``, ``'question'`` and ``'answer'`` columns.
    nlp : callable, optional
        Pipeline invoked as ``nlp(text, parse=False, tag=False, entity=False)``;
        it must return an iterable of tokens exposing ``.text`` and ``.vector``.
        When omitted, the module-level ``nlp`` object is looked up on every
        access (the original behaviour), which raises ``NameError`` if no such
        object exists in the current kernel.
    max_story_len : int, optional
        Padded length of the story arrays (default 2000).
    max_question_len : int, optional
        Padded length of the question arrays (default 50).
    vector_dim : int, optional
        Dimensionality of the token vectors (default 300).
    """

    def __init__(self, data_df, nlp=None, max_story_len=2000, max_question_len=50, vector_dim=300):
        self.data_df = data_df
        self.nlp = nlp
        self.max_story_len = max_story_len
        self.max_question_len = max_question_len
        self.vector_dim = vector_dim

    def __len__(self):
        """Return the number of examples (rows of the frame)."""
        return self.data_df.shape[0]

    def _pipeline(self):
        """Return the pipeline given at construction, else the module-level ``nlp``."""
        if self.nlp is not None:
            return self.nlp
        # Backward-compatible fallback: NameError here means `nlp` was never
        # created in this kernel -- pass it to the constructor instead.
        return nlp

    def _encode(self, text, max_len):
        """Tokenize ``text`` and return padded ``(vectors, mask, entity, placeholder)`` arrays.

        ``vectors`` has shape ``(max_len, vector_dim)``; the other three have
        shape ``(max_len,)`` and hold 1.0 at, respectively, real-token
        positions, tokens starting with ``'@entity'`` and tokens equal to
        ``'@placeholder'``. Tokens beyond ``max_len`` are dropped.
        """
        doc = self._pipeline()(text.lower(), parse=False, tag=False, entity=False)
        # Truncate defensively: lower-casing may change the tokenization, so a
        # text can come out longer than the cap that was applied upstream.
        tokens = list(doc)[:max_len]
        n_tokens = len(tokens)

        vectors = np.zeros((max_len, self.vector_dim))
        mask = np.zeros(max_len)
        entity = np.zeros(max_len)
        placeholder = np.zeros(max_len)

        mask[:n_tokens] = 1
        if n_tokens:  # np.stack raises on an empty list
            vectors[:n_tokens, :] = np.stack([tok.vector for tok in tokens])
            entity[:n_tokens] = [tok.text[:7] == '@entity' for tok in tokens]
            placeholder[:n_tokens] = [tok.text == '@placeholder' for tok in tokens]
        return vectors, mask, entity, placeholder

    def __getitem__(self, i):
        """Return ``(s, q, s_mask, q_mask, s_var, q_var, q_ph, answer)`` for row ``i``.

        ``s``/``q`` are the padded story/question vector matrices, the
        ``*_mask`` arrays mark real tokens, the ``*_var`` arrays mark
        ``@entity...`` tokens, ``q_ph`` marks the ``@placeholder`` token and
        ``answer`` is the first integer found in the answer string.

        Raises
        ------
        ValueError
            If the answer string contains no digits.
        """
        s, s_mask, s_var, _ = self._encode(self.data_df['story'].iloc[i], self.max_story_len)
        q, q_mask, q_var, q_ph = self._encode(self.data_df['question'].iloc[i], self.max_question_len)

        raw_answer = self.data_df['answer'].iloc[i]
        match = re.search(r'\d+', raw_answer)
        if match is None:
            raise ValueError("answer {!r} at position {} contains no entity id".format(raw_answer, i))
        answer = int(match.group(0))

        return s, q, s_mask, q_mask, s_var, q_var, q_ph, answer

In [5]:
# Wrap the training frame in the dataset defined above.
ds = QADataset(train)

In [6]:
# Number of training examples, via the built-in that dispatches to QADataset.__len__.
len(ds)

46241

In [7]:
# Fetch a single encoded example (row position 2) through normal indexing.
s, q, sm, qm, sv, qv, qph, a = ds[2]

NameError: name 'nlp' is not defined

In [127]:
s.shape

(2000, 300)

In [128]:
q.shape

(50, 300)

In [129]:
a

416

In [130]:
sm

array([ 1.,  1.,  1., ...,  0.,  0.,  0.])

In [131]:
qm

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [132]:
sv

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [133]:
qv

array([ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [134]:
qph

array([ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [135]:
# Sanity check of BatchSampler: ten indices grouped in threes, with the
# incomplete final batch dropped (drop_last=True).
list(tud.sampler.BatchSampler(range(10), batch_size=3, drop_last=True))

[[0, 1, 2], [3, 4, 5], [6, 7, 8]]

In [136]:
# Batch the dataset 20 examples at a time.
qa_loader = tud.DataLoader(ds, batch_size=20)

In [137]:
# Pull the first batch and unpack its eight collated components
# (same order as the tuple returned by QADataset.__getitem__).
s, q, sm, qm, sv, qv, qph, a = next(iter(qa_loader))

In [138]:
s.shape

torch.Size([20, 2000, 300])

In [140]:
q.shape

torch.Size([20, 50, 300])

In [141]:
sm.shape

torch.Size([20, 2000])

In [142]:
qm.shape

torch.Size([20, 50])

In [143]:
sv.shape

torch.Size([20, 2000])

In [144]:
qv.shape

torch.Size([20, 50])

In [145]:
qph.shape

torch.Size([20, 50])

In [139]:
a


 435
 200
 416
 307
 261
 395
 426
 486
 394
 492
 316
  38
 290
  24
 483
  14
 214
   6
  49
 474
[torch.LongTensor of size 20]