In [1]:
import numpy as np
import pandas as pd
import spacy

In [2]:
import re
import torch.utils.data as tud


def get_word_ids(doc, rnn_encode=False, max_length=100, nr_unk=100, nr_var=600):
    """Encode the leading tokens of ``doc`` as a fixed-length array of ranks.

    Tokens are consumed in iteration order. Unless ``rnn_encode`` is true,
    tokens flagged ``is_punct`` or ``is_space`` are skipped and do not occupy
    a slot. Each kept token writes ``token.rank`` into its slot when
    ``token.has_vector`` is true; otherwise the slot keeps the fill value -1.

    Args:
        doc: Iterable of token-like objects exposing ``is_punct``,
            ``is_space``, ``has_vector`` and ``rank``.
        rnn_encode: When true, punctuation and whitespace tokens are kept.
        max_length: Length of the returned array; at most this many tokens
            are encoded. ``0`` yields an empty array.
        nr_unk: Unused; retained so existing call sites keep working.
        nr_var: Unused; retained so existing call sites keep working.

    Returns:
        ``numpy.ndarray`` of dtype ``int32`` and shape ``(max_length,)``.
        Slots for tokens without a vector, and unused trailing slots, are -1.
    """
    X = -np.ones(max_length, dtype='int32')
    slot = 0
    for token in doc:
        # Checking before the write (rather than after) is what keeps
        # max_length == 0 from indexing into an empty array.
        if slot >= max_length:
            break
        if not rnn_encode and (token.is_punct or token.is_space):
            continue
        if token.has_vector:
            X[slot] = token.rank
        # A kept token without a vector still consumes a slot (left at -1).
        slot += 1
    return X


class QADataset(tud.Dataset):
    """Dataset that encodes the story and question text of one DataFrame row.

    Each item is a pair ``(s, q)``: the ``'story'`` and ``'question'`` cells of
    row ``i`` (positional), run through ``nlp`` and then ``get_word_ids``.

    Args:
        data_df: DataFrame with ``'story'`` and ``'question'`` columns.
        nlp: Callable applied to each text; called with
            ``parse=False, tag=False, entity=False``.
        story_max_length: ``max_length`` passed to ``get_word_ids`` for
            stories (default 2000, the previously hard-coded value).
        question_max_length: ``max_length`` passed to ``get_word_ids`` for
            questions (default 50, the previously hard-coded value).
    """

    def __init__(self, data_df, nlp, story_max_length=2000, question_max_length=50):
        self.data_df = data_df
        self.nlp = nlp
        self.story_max_length = story_max_length
        self.question_max_length = question_max_length

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return self.data_df.shape[0]

    def _encode(self, column, i, max_length):
        """Tokenise the cell at positional row ``i`` of ``column`` and encode it."""
        # NOTE(review): the three keyword flags presumably switch off pipeline
        # stages that get_word_ids does not need - confirm against the
        # installed spaCy version.
        doc = self.nlp(self.data_df[column].iloc[i], parse=False, tag=False, entity=False)
        return get_word_ids(doc, max_length=max_length)

    def __getitem__(self, i):
        """Return ``(story_ids, question_ids)`` for positional row ``i``."""
        s = self._encode('story', i, self.story_max_length)
        q = self._encode('question', i, self.question_max_length)
        return s, q


In [3]:
# Load the three pickled splits in train / dev / test order.
# NOTE(review): read_pickle unpickles arbitrary objects - only point it at trusted files.
train, dev, test = (
    pd.read_pickle(split_path)
    for split_path in (
        "../input_data/train_es.pkl",
        "../input_data/dev_es.pkl",
        "../input_data/test_es.pkl",
    )
)

In [4]:
# Stack the three splits row-wise so the vocabulary scan covers all of them.
# Row labels are not reset here; QADataset indexes positionally via .iloc.
combined = pd.concat(
    [train, dev, test],
    axis=0,
)

In [6]:
# Load the Spanish spaCy model, then attach word vectors from the text file.
nlp = spacy.load('es')
# Explicit UTF-8: the vector file holds accented Spanish words, and the
# default text encoding of open() depends on the machine's locale.
with open("../wordvecs/wiki.es/wiki.es.nospace.vec", "r", encoding="utf-8") as f:
    nlp.vocab.load_vectors(f)

In [7]:
# Wrap the combined frame in the dataset and read it in batches of 32
# using three worker processes.
ds = QADataset(combined, nlp)
qa_loader = tud.DataLoader(
    ds,
    batch_size=32,
    num_workers=3,
    pin_memory=True,
)

In [8]:
# Collect every distinct id that appears in any story or question batch.
vocab_set = set()
for i, qa in enumerate(qa_loader):
    if i % 1000 == 0:
        print("current batch {0}/{1}".format(i, len(qa_loader)))
    s, q = qa
    # Update in place: rebuilding the set with .union() on every batch
    # copies the whole (growing) set each time.
    vocab_set.update(np.unique(s.numpy()), np.unique(q.numpy()))

current batch 0/1755
current batch 1000/1755


In [9]:
# Number of distinct ids collected; this can include the -1 fill value
# that get_word_ids leaves in unused slots.
len(vocab_set)

113092

In [10]:
# Size of the loaded spaCy vocabulary, for comparison with the ids actually seen.
nlp.vocab.length

1957564

In [11]:
# Drop negative entries (the -1 fill value from get_word_ids); keep real ids only.
valid = {word_id for word_id in vocab_set if word_id >= 0}

In [12]:
# Number of non-negative ids kept.
len(valid)

113091

In [13]:
# Map the kept ids back to strings: every vocabulary entry whose rank was seen.
valid_words = {lexeme.text for lexeme in nlp.vocab if lexeme.rank in valid}

In [14]:
# Peek at ten of the collected words (set iteration order, so effectively arbitrary).
[word for word, _ in zip(valid_words, range(10))]

['sonoras',
 'colocarlo',
 'sudafricano',
 'glosado',
 'sobreponiendo',
 'Aeronáutico',
 'implementaba',
 'Rosales',
 'lástima',
 'finalicen']

In [15]:
# Spot-check that one specific token made it into the collected words.
'united' in valid_words

True

In [16]:
# Write a reduced copy of the vector file that keeps only words in valid_words.
# Lines containing any of the Unicode space/separator characters below are dropped.
# NOTE(review): presumably because such characters break whitespace-based parsing
# of a vector line when the file is loaded - confirm.
unicode_space_re = re.compile(
    r'[\u00A0\u1680\u180e\u2000-\u2009\u200a\u200b\u202f\u205f\u3000\u2028\x85]'
)
# Explicit UTF-8 on both ends: the words are non-ASCII, and the default text
# encoding of open() depends on the machine's locale.
with open("../wordvecs/wiki.es/wiki.es.vec", encoding="utf-8") as f, \
        open("../wordvecs/wiki.es/wiki.es.small.vec", "w", encoding="utf-8") as g:
    # Skip the first line of the source file; it is not copied to the output.
    f.readline()
    for line in f:
        if unicode_space_re.search(line):
            continue
        # The word is everything before the first space on the line.
        word = line.split(" ", 1)[0]
        if word in valid_words:
            g.write(line)

## Verify: reload spaCy with the reduced vector file and repeat the vocabulary scan

In [17]:
# Start from a fresh Spanish model and attach the reduced vector file.
nlp = spacy.load('es')
# Explicit UTF-8: the vector file holds accented Spanish words, and the
# default text encoding of open() depends on the machine's locale.
with open("../wordvecs/wiki.es/wiki.es.small.vec", "r", encoding="utf-8") as f:
    nlp.vocab.load_vectors(f)

In [18]:
# Count the vocabulary entries that report having a vector after the reload.
np.sum([lexeme.has_vector for lexeme in nlp.vocab])

536327

In [19]:
# Vocabulary size after reloading with the reduced vector file.
nlp.vocab.length

1232563

In [20]:
# Dimensionality of the loaded vectors.
nlp.vocab.vectors_length

300

In [21]:
# train = pd.read_pickle("../input_data/train_es.pkl")

In [22]:
# Rebuild the dataset and loader on top of the reloaded nlp object
# (batches of 32, three worker processes).
ds = QADataset(combined, nlp)
qa_loader = tud.DataLoader(
    ds,
    batch_size=32,
    num_workers=3,
    pin_memory=True,
)

In [23]:
# Repeat the scan with the reduced vectors: collect every distinct id seen
# in any story or question batch.
vocab_set = set()
for i, qa in enumerate(qa_loader):
    if i % 1000 == 0:
        print("current batch {0}/{1}".format(i, len(qa_loader)))
    s, q = qa
    # Update in place: rebuilding the set with .union() on every batch
    # copies the whole (growing) set each time.
    vocab_set.update(np.unique(s.numpy()), np.unique(q.numpy()))

current batch 0/1755
current batch 1000/1755


In [24]:
# Distinct ids found with the reduced vectors; compare with the count from the first scan.
len(vocab_set)

113092