In [112]:
import numpy as np
np.random.seed(13)

from keras.models import Sequential, Model
from keras.layers import Embedding, Reshape, Activation, Input
from keras.layers.merge import Dot
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import skipgrams
from keras.preprocessing import text
import gensim

In [84]:
path = get_file('alice.txt', origin='http://www.gutenberg.org/files/11/11-0.txt')
corpus = open(path).readlines()

In [85]:
corpus[2:4]

['This eBook is for the use of anyone anywhere at no cost and with\n',
 'almost no restrictions whatsoever.  You may copy it, give it away or\n']

In [86]:
corpus = corpus[2:4]

In [87]:
corpus = [sentence for sentence in corpus if sentence.count(' ') >= 2]
corpus

['This eBook is for the use of anyone anywhere at no cost and with\n',
 'almost no restrictions whatsoever.  You may copy it, give it away or\n']

In [88]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
V = len(tokenizer.word_index) + 1
V

25

In [89]:
dim_embedddings = 128

# inputs
w_inputs = Input(shape=(1, ), dtype='int32')
w = Embedding(V, dim_embedddings)(w_inputs)

# context
c_inputs = Input(shape=(1, ), dtype='int32')
c  = Embedding(V, dim_embedddings)(c_inputs)

In [90]:
c

<tf.Tensor 'embedding_7_1/Identity:0' shape=(None, 1, 128) dtype=float32>

In [91]:
w

<tf.Tensor 'embedding_6_1/Identity:0' shape=(None, 1, 128) dtype=float32>

In [92]:
output = Dot(axes=2)([w, c])
output

<tf.Tensor 'dot_5/Identity:0' shape=(None, 1, 1) dtype=float32>

In [93]:
output = Reshape((1,), input_shape=(1, 1))(output)
output

<tf.Tensor 'reshape_11/Identity:0' shape=(None, 1) dtype=float32>

In [94]:
output = Activation('sigmoid')(output)

SkipGram = Model(inputs=[w_inputs, c_inputs], outputs=output)
SkipGram.summary()
SkipGram.compile(loss='binary_crossentropy', optimizer='adam')

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_15 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_16 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 1, 128)       3200        input_15[0][0]                   
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 1, 128)       3200        input_16[0][0]                   
____________________________________________________________________________________________

In [95]:
tokenizer.word_index

{'no': 1,
 'it': 2,
 'this': 3,
 'ebook': 4,
 'is': 5,
 'for': 6,
 'the': 7,
 'use': 8,
 'of': 9,
 'anyone': 10,
 'anywhere': 11,
 'at': 12,
 'cost': 13,
 'and': 14,
 'with': 15,
 'almost': 16,
 'restrictions': 17,
 'whatsoever': 18,
 'you': 19,
 'may': 20,
 'copy': 21,
 'give': 22,
 'away': 23,
 'or': 24}

In [96]:
corpus

['This eBook is for the use of anyone anywhere at no cost and with\n',
 'almost no restrictions whatsoever.  You may copy it, give it away or\n']

In [97]:
for i, doc in enumerate(tokenizer.texts_to_sequences(corpus)):
    print(i, doc)

0 [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 13, 14, 15]
1 [16, 1, 17, 18, 19, 20, 21, 2, 22, 2, 23, 24]


In [103]:
for i, doc in enumerate(tokenizer.texts_to_sequences(corpus)):
    data, labels = skipgrams(sequence=doc, vocabulary_size=V, window_size=2, negative_samples=2.)
    x = [np.array(x) for x in zip(*data)]
    y = np.array(labels, dtype=np.int32)
    print('data: ', data,'\nX: ' , x, '\nY' ,y)
    break


data:  [[7, 7], [8, 9], [8, 13], [6, 9], [5, 6], [7, 5], [15, 14], [11, 21], [9, 22], [5, 4], [1, 14], [5, 3], [13, 1], [8, 9], [15, 14], [13, 15], [4, 6], [14, 13], [4, 6], [7, 14], [11, 12], [1, 12], [5, 18], [14, 12], [1, 6], [7, 20], [10, 20], [12, 16], [5, 12], [14, 12], [9, 20], [3, 22], [15, 21], [11, 10], [13, 22], [10, 12], [12, 23], [4, 7], [8, 22], [11, 1], [13, 21], [1, 12], [13, 14], [10, 11], [6, 7], [1, 7], [9, 14], [8, 1], [11, 12], [12, 21], [5, 23], [5, 12], [11, 10], [13, 14], [1, 2], [8, 7], [14, 9], [9, 11], [15, 13], [14, 15], [1, 13], [13, 6], [10, 17], [15, 12], [6, 5], [6, 8], [4, 3], [6, 5], [6, 20], [6, 4], [13, 5], [7, 9], [1, 21], [11, 8], [8, 10], [11, 11], [14, 23], [14, 14], [12, 10], [11, 8], [7, 22], [8, 13], [14, 13], [12, 17], [13, 8], [3, 5], [9, 10], [6, 13], [8, 6], [9, 8], [13, 15], [9, 7], [1, 11], [7, 5], [1, 10], [13, 12], [12, 9], [3, 14], [10, 22], [9, 12], [6, 12], [9, 14], [5, 6], [4, 8], [1, 13], [8, 10], [10, 21], [12, 5], [9, 3], [12, 1

In [111]:
id2word = {v:k for k, v in tokenizer.word_index.items()}
id2word

{1: 'no',
 2: 'it',
 3: 'this',
 4: 'ebook',
 5: 'is',
 6: 'for',
 7: 'the',
 8: 'use',
 9: 'of',
 10: 'anyone',
 11: 'anywhere',
 12: 'at',
 13: 'cost',
 14: 'and',
 15: 'with',
 16: 'almost',
 17: 'restrictions',
 18: 'whatsoever',
 19: 'you',
 20: 'may',
 21: 'copy',
 22: 'give',
 23: 'away',
 24: 'or'}

In [120]:
# generate skip-grams
word2id = tokenizer.word_index
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in corpus]
skip_grams = [skipgrams(wid, vocabulary_size=vocab_size, window_size=2) for wid in wids]

# view sample skip-grams
pairs, labels = skip_grams[0][0], skip_grams[0][1]
for i in range(2):
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
          id2word[pairs[i][0]], pairs[i][0], 
          id2word[pairs[i][1]], pairs[i][1], 
          labels[i]))

(of (9), the (7)) -> 1
(at (12), anywhere (11)) -> 1


In [104]:
for _ in range(10):
    loss = 0.
    for i, doc in enumerate(tokenizer.texts_to_sequences(corpus)):
        data, labels = skipgrams(sequence=doc, vocabulary_size=V, window_size=5, negative_samples=5.)
        x = [np.array(x) for x in zip(*data)]
        y = np.array(labels, dtype=np.int32)
        if x:
            loss += SkipGram.train_on_batch(x, y)

    print(loss)

1.3862591981887817
1.385212004184723
1.3839866518974304
1.383117437362671
1.3821192979812622
1.3805545568466187
1.379438877105713
1.3780673146247864
1.376146912574768
1.373725414276123


In [105]:
f = open('vectors.txt' ,'w')
f.write('{} {}\n'.format(V-1, dim_embedddings))
vectors = SkipGram.get_weights()[0]

In [106]:
vectors

array([[-0.01723747, -0.0421329 , -0.01856668, ...,  0.04394963,
        -0.00677763, -0.04928473],
       [ 0.00532074,  0.02658218,  0.03579596, ..., -0.0129492 ,
        -0.04443016,  0.03593078],
       [ 0.00578544, -0.02669989, -0.01933427, ...,  0.06094663,
         0.02118776,  0.01218737],
       ...,
       [ 0.04260482, -0.03379558,  0.00244163, ..., -0.00258035,
         0.01659034, -0.01577614],
       [ 0.02773357, -0.00603765, -0.00810921, ..., -0.00518424,
         0.02827771, -0.04466083],
       [-0.02400018,  0.04943145,  0.05313882, ...,  0.02259539,
         0.05670473,  0.04488812]], dtype=float32)

In [107]:

for word, i in tokenizer.word_index.items():
    f.write('{} {}\n'.format(word, ' '.join(map(str, list(vectors[i, :])))))
f.close()

In [109]:
w2v = gensim.models.KeyedVectors.load_word2vec_format('./vectors.txt', binary=False)
w2v.most_similar(positive=['use'])

[('this', 0.3456296920776367),
 ('at', 0.2521226108074188),
 ('with', 0.23469021916389465),
 ('of', 0.22455015778541565),
 ('away', 0.19185395538806915),
 ('anywhere', 0.18722069263458252),
 ('you', 0.15692223608493805),
 ('or', 0.15685653686523438),
 ('no', 0.1402404010295868),
 ('and', 0.13369552791118622)]