In [1]:
import numpy as np
import pandas as pd
import os
import json
import spacy
from itertools import chain, count
from collections import Counter
import re
from pprint import pprint

In [2]:
dat1 = pd.read_pickle("../data/elmondo_es.pkl")
dat2 = pd.read_pickle("../data/elmondo_es_eco.pkl")
dat3 = pd.read_pickle("../data/elmondo_es_sp.pkl")
dat = pd.concat([dat1, dat2, dat3], axis=0)

In [3]:
dat.head()

Unnamed: 0,headline,keyfacts,content,tags,time
http://www.elmundo.es/america/2014/01/02/52c4d39a22601d6f658b457c.html,Una jueza del Tribunal Supremo suspende parcia...,[ Decidió atender a los grupos conservadores y...,"El martes, antes de presidir la fiesta de fin ...",[],2014-01-02
http://www.elmundo.es/america/2014/01/02/52c4d99622601d6d658b458a.html,'La revolución cubana sigue sin compromisos co...,[ 'Jamás hemos cedido ni cederemos ante agresi...,El presidente Raúl Castro reveló que se está i...,[],2014-01-02
http://www.elmundo.es/america/2014/01/03/52c61ede268e3e3c528b456b.html,La NSA trabaja en un ordenador cuántico capaz ...,[ La información proviene de los documentos de...,La Agencia de Seguridad Nacional (NSA) trabaja...,[],2014-01-03
http://www.elmundo.es/america/2014/01/10/52cfbb62ca47415a218b456b.html,Último adiós a la ex Miss Venezuela Mónica Spe...,[ Mónica Spear y su marido fueron asesinados e...,Esta semana Venezuela ha recibido una noticia ...,[],2014-01-10
http://www.elmundo.es/america/2014/01/14/52d4b8ba268e3eb2318b456a.html,Michoacán pone en jaque al Gobierno de Peña Nieto,[ El Gobierno envía más policías y militares y...,La situación en el Estado mexicano de Michoacá...,[],2014-01-14


In [4]:
dat.shape

(34633, 5)

In [157]:
def get_embeddings(vocab, nr_unk=100):
    nr_vector = max(lex.rank for lex in vocab) + 1
    vectors = np.zeros((nr_vector+nr_unk+2, vocab.vectors_length), dtype='float32')
    for lex in vocab:
        if lex.has_vector:
            vectors[lex.rank+1] = lex.vector / lex.vector_norm
#             vectors[lex.rank+1] = lex.vector
    return vectors


def get_word_ids(docs, rnn_encode=False, tree_truncate=False, max_length=100, nr_unk=100):
    Xs = np.zeros((len(docs), max_length), dtype='int32')
    for i, doc in enumerate(docs):
        if tree_truncate:
            if isinstance(doc, Span):
                queue = [doc.root]
            else:
                queue = [sent.root for sent in doc.sents]
        else:
            queue = list(doc)
        words = []
        while len(words) <= max_length and queue:
            word = queue.pop(0)
            if rnn_encode or (not word.is_punct and not word.is_space):
                words.append(word)
            if tree_truncate:
                queue.extend(list(word.lefts))
                queue.extend(list(word.rights))
        words.sort()
        for j, token in enumerate(words):
            if token.has_vector:
                Xs[i, j] = token.rank+1
            else:
                Xs[i, j] = (token.shape % (nr_unk-1))+2
            j += 1
            if j >= max_length:
                break
        else:
            Xs[i, len(words)] = 1
    return Xs

In [158]:
nlp = spacy.load('es')

In [159]:
len(nlp.vocab)

1230023

In [160]:
emb = get_embeddings(nlp.vocab)

In [161]:
emb.shape

(1230125, 50)

In [223]:
res = nlp.pipe(dat['content'][0])

In [224]:
res

<generator object Language.pipe at 0x7f6b78adf8e0>

In [165]:
wl = get_word_ids(list(nlp.pipe(dat['content'][:5])), max_length=2000, rnn_encode=True)

In [213]:
wl.shape

(5, 2000)

In [193]:
wl[0]

array([  32, 1554,   57, ...,    0,    0,    0], dtype=int32)

In [211]:
N = 54
print(res[N])
print(res[N].vector)
print(wl[0][N])
print(emb[wl[0][N]])
print((res[N].vector / np.linalg.norm(res[N].vector)) / emb[wl[0][N]])

del
[-0.46074301 -0.77140599  0.661452   -0.70173502  1.71059406 -0.55488902
 -0.83409899 -0.78157902 -0.88442397  4.36215019  0.119592    0.210757
  0.202582   -0.087535    1.20798302  0.38466701  0.72116601  0.040439
 -0.25994101  0.401526    2.70532393  0.378241   -0.28162301 -0.86927801
 -0.028642   -0.109049   -1.12738705  2.71435308 -0.81711698 -0.320108
  2.7643199  -0.59532702 -3.16500711 -0.008701    0.033842    0.64077002
  0.772273   -0.202738   -0.038664    0.029468   -1.41614401  0.652206
 -0.37901801 -0.66175002  0.94164902 -0.401712   -1.337767   -3.16256809
  0.064858   -0.652147  ]
15230
[-0.05099941 -0.08538653  0.07321578 -0.07767469  0.18934478 -0.06142038
 -0.09232599 -0.08651258 -0.09789643  0.48284417  0.01323758  0.02332858
  0.0224237  -0.0096892   0.13371103  0.04257859  0.0798255   0.00447617
 -0.02877274  0.04444471  0.2994509   0.0418673  -0.03117271 -0.09621993
 -0.00317037 -0.01207058 -0.12478989  0.30045035 -0.09044626 -0.03543259
  0.30598116 -0.0658964

In [None]:
with open("../wordvecs/wiki.es/wiki.es.nospace.vec") as f:
    nlp.vocab.load_vectors(f)

In [8]:
fact_nlp = dat['keyfacts'].apply(lambda l: [nlp(s) for s in l])

In [None]:
dat['content'] = dat['content'].apply(lambda x: re.sub(r'\[.*?]', '', x))