In [1]:
# http://adventuresinmachinelearning.com/word2vec-keras-tutorial/

In [2]:
from keras.models import Model
from keras.layers import Input, Dense, Reshape, Dot, Lambda
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import skipgrams
from keras.preprocessing import sequence
from keras import backend as K
from keras.layers import dot
import urllib
import collections
import os
import zipfile
import numpy as np
import tensorflow as tf

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
def download(filename, url, expected_bytes):
    """Fetch ``filename`` into ``./data`` unless already cached, then verify its size.

    Parameters
    ----------
    filename : str
        Bare file name. It is appended to ``url`` to form the remote address
        and used as the local file name inside the ``data`` directory.
    url : str
        Base URL; the file is retrieved from ``url + filename``.
    expected_bytes : int
        Exact size, in bytes, that the local file must have.

    Returns
    -------
    str
        Absolute path of the verified local file.

    Raises
    ------
    Exception
        If the size of the local file differs from ``expected_bytes``.
    """
    # A bare ``import urllib`` does not guarantee the ``request`` submodule is
    # loaded, so import it explicitly where it is used.
    import urllib.request

    # Build the target path once; ``filename`` itself is never overwritten.
    data_dir = os.path.join(os.path.abspath(''), "data")
    local_path = os.path.join(data_dir, filename)
    if not os.path.exists(local_path):
        # urlretrieve cannot write into a directory that does not exist yet.
        os.makedirs(data_dir, exist_ok=True)
        urllib.request.urlretrieve(url + filename, local_path)
    actual_bytes = os.stat(local_path).st_size
    if actual_bytes == expected_bytes:
        print('Found and verified', filename)
    else:
        print(actual_bytes)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return local_path

In [4]:
def read_data(filename):
    """Return the whitespace-separated tokens of the first file in a zip archive.

    Parameters
    ----------
    filename : str or file-like
        Anything ``zipfile.ZipFile`` accepts as an archive to open.

    Returns
    -------
    list of str
        The first archive member, decoded as UTF-8 and split on whitespace.

    Raises
    ------
    ValueError
        If the archive contains no members.
    """
    with zipfile.ZipFile(filename) as archive:
        members = archive.namelist()
        if not members:
            raise ValueError('Zip archive %r contains no files' % (filename,))
        raw = archive.read(members[0])
    # Plain UTF-8 decoding; no need to route through TensorFlow for this.
    return raw.decode('utf-8').split()

In [5]:
def build_dataset(words, n_words):
    """Integer-encode ``words`` using the most frequent tokens as the vocabulary.

    Parameters
    ----------
    words : list of str
        The corpus as a flat token sequence.
    n_words : int
        Vocabulary budget: the ``'UNK'`` entry plus the ``n_words - 1`` most
        common tokens.

    Returns
    -------
    data : list of int
        ``words`` mapped to ids; tokens outside the vocabulary map to 0.
    count : list
        ``['UNK', <number of out-of-vocabulary tokens>]`` followed by the
        ``(token, frequency)`` pairs from ``Counter.most_common``.
    dictionary : dict
        Token -> id, with ``'UNK'`` at id 0.
    reversed_dictionary : dict
        Id -> token, the inverse of ``dictionary``.
    """
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = {}
    for word, _ in count:
        # setdefault keeps an id that was already assigned. If the corpus
        # contains a literal 'UNK' token, it keeps id 0 instead of being
        # overwritten with an id that the next word would then also receive.
        dictionary.setdefault(word, len(dictionary))
    data = []
    unk_count = 0
    for word in words:
        index = dictionary.get(word)
        if index is None:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = {index: word for word, index in dictionary.items()}
    return data, count, dictionary, reversed_dictionary


In [6]:
def collect_data(vocabulary_size=10000):
    """Download text8, read its tokens, and integer-encode them.

    ``vocabulary_size`` is forwarded to ``build_dataset`` as the vocabulary
    budget. Returns the four values produced by ``build_dataset``:
    ``(data, count, dictionary, reverse_dictionary)``.
    """
    archive_path = download('text8.zip', 'http://mattmahoney.net/dc/', 31344016)
    tokens = read_data(archive_path)
    encoded, frequencies, word_to_id, id_to_word = build_dataset(tokens, vocabulary_size)
    # Release the raw token list before returning.
    del tokens
    return encoded, frequencies, word_to_id, id_to_word

In [7]:
# Vocabulary size: handed to collect_data here and reused by later cells
# (sampling table, skip-gram generation, embedding layer, similarity scan).
vocab_size = 10000
data, count, dictionary, reverse_dictionary = collect_data(vocabulary_size=vocab_size)

Found and verified text8.zip


In [8]:
# Passed as `window_size` to skipgrams() when the training pairs are generated.
window_size = 3
# Output dimension of the Embedding layer, i.e. the length of each word vector.
vector_dim = 300
# Upper bound of the training loop's range(); every iteration trains on a single
# randomly drawn pair, so this counts iterations rather than passes over the data.
epochs = 200000

In [9]:
# Validation-word sampling, disabled by wrapping it in a bare string literal
# (which is why the cell echoes the string's repr as its output).
"""
valid_size = 16
valid_window = 100
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
"""

'\nvalid_size = 16\nvalid_window = 100\nvalid_examples = np.random.choice(valid_window, valid_size, replace=False)\n'

In [10]:
# Sampling table sized to the vocabulary, handed to skipgrams() below.
sampling_table = sequence.make_sampling_table(vocab_size)
# skipgrams() returns the (target, context) pairs together with their labels.
couples, labels = skipgrams(
    data,
    vocab_size,
    window_size=window_size,
    sampling_table=sampling_table,
)
# Transpose the pair list into one int32 array per column.
word_target, word_context = (
    np.array(column, dtype="int32") for column in zip(*couples)
)

In [11]:
# One value per sample for each side of a (target, context) pair.
input_target = Input(shape=(1,))
input_context = Input(shape=(1,))

W0922 18:35:48.789585  7180 deprecation_wrapper.py:119] From c:\users\yoshi\appdata\local\programs\python\python37\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0922 18:35:48.967045  7180 deprecation_wrapper.py:119] From c:\users\yoshi\appdata\local\programs\python\python37\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.



In [12]:
# A single Embedding layer is applied to both inputs, so target and context
# words share one embedding matrix.
embedding = Embedding(
    input_dim=vocab_size,
    output_dim=vector_dim,
    input_length=1,
    name='embedding',
)
# Each looked-up vector is reshaped to (vector_dim, 1) before the dot products.
target = Reshape((vector_dim, 1))(embedding(input_target))
context = Reshape((vector_dim, 1))(embedding(input_context))

W0922 18:35:49.026143  7180 deprecation_wrapper.py:119] From c:\users\yoshi\appdata\local\programs\python\python37\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.



In [13]:
# Cosine similarity between the two embedded words (normalised dot product
# along axis 1); this tensor becomes the output of the secondary model.
similarity = Dot(axes=1, normalize=True)([target, context])

In [14]:
# Un-normalised dot product of the two embedded words: the raw similarity
# score that the classifier below is trained on.
dot_product = dot([target, context], normalize=False, axes=1)
dot_product = Reshape((1,))(dot_product)
# add the sigmoid output layer
output = Dense(1, activation='sigmoid')(dot_product)
# Create the primary training model. `inputs=` / `outputs=` are the Keras 2
# keyword names; the singular `input=` / `output=` spellings are deprecated.
model = Model(inputs=[input_target, input_context], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='rmsprop')

  import sys
W0922 18:35:49.172068  7180 deprecation_wrapper.py:119] From c:\users\yoshi\appdata\local\programs\python\python37\lib\site-packages\keras\optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0922 18:35:49.188053  7180 deprecation_wrapper.py:119] From c:\users\yoshi\appdata\local\programs\python\python37\lib\site-packages\keras\backend\tensorflow_backend.py:3376: The name tf.log is deprecated. Please use tf.math.log instead.

W0922 18:35:49.193012  7180 deprecation.py:323] From c:\users\yoshi\appdata\local\programs\python\python37\lib\site-packages\tensorflow\python\ops\nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [15]:
# Secondary model over the same two inputs whose output is the cosine
# similarity tensor; used for similarity checks. `inputs=` / `outputs=` are the
# Keras 2 keyword names; the singular `input=` / `output=` are deprecated.
validation_model = Model(inputs=[input_target, input_context], outputs=similarity)

  


In [16]:
class SimilarityCallback:
    """Nearest-neighbour lookup driven by ``validation_model``.

    Relies on the notebook globals ``dictionary``, ``reverse_dictionary``,
    ``vocab_size`` and ``validation_model``.
    """

    def run_sim(self, word, top_k=8):
        """Print the ``top_k`` vocabulary words most similar to ``word``.

        Parameters
        ----------
        word : str
            Query word; must be a key of ``dictionary`` (otherwise ``KeyError``).
        top_k : int, optional
            Number of nearest neighbours to list. Fewer are printed when the
            vocabulary does not hold that many other words.
        """
        word_idx = dictionary[word]
        valid_word = reverse_dictionary[word_idx]
        sim = self._get_sim(word_idx)
        # Rank by descending similarity and drop the query word by id instead
        # of assuming it always lands in position 0.
        ranked = (-sim).argsort()
        nearest = [idx for idx in ranked if idx != word_idx][:top_k]
        log_str = 'Nearest to %s:' % valid_word
        for idx in nearest:
            log_str = '%s %s,' % (log_str, reverse_dictionary[idx])
        print(log_str)

    @staticmethod
    def _get_sim(valid_word_idx):
        """Return the similarity of ``valid_word_idx`` to every vocabulary id.

        The result is a float64 array of length ``vocab_size`` whose entry
        ``i`` is the ``validation_model`` output for the pair
        ``(valid_word_idx, i)``.
        """
        # Score the query against the whole vocabulary in one batched predict
        # call rather than vocab_size separate batch-of-one calls.
        targets = np.full((vocab_size, 1), valid_word_idx, dtype="int32")
        contexts = np.arange(vocab_size, dtype="int32").reshape(-1, 1)
        out = validation_model.predict([targets, contexts], batch_size=1024)
        # Flatten whatever per-sample shape the model emits to one score per id.
        return np.asarray(out, dtype="float64").reshape(vocab_size)
sim_cb = SimilarityCallback()

In [18]:
# Reusable one-element buffers for the target id, context id and label, so each
# step feeds a batch of exactly one pair.
arr_1 = np.zeros((1,))
arr_2 = np.zeros((1,))
arr_3 = np.zeros((1,))
for cnt in range(epochs):
    # randint's upper bound is exclusive, so pass len(labels) (not
    # len(labels) - 1); otherwise the last pair could never be drawn.
    idx = np.random.randint(0, len(labels))
    arr_1[0,] = word_target[idx]
    arr_2[0,] = word_context[idx]
    arr_3[0,] = labels[idx]
    loss = model.train_on_batch([arr_1, arr_2], arr_3)
    # Report the single-sample loss every 1000 iterations.
    if cnt % 1000 == 0:
        print("Iteration {}, loss={}".format(cnt, loss))

Iteration 0, loss=0.3458108603954315
Iteration 1000, loss=1.051347255706787
Iteration 2000, loss=0.014564258977770805
Iteration 3000, loss=0.3665485680103302
Iteration 4000, loss=0.36206772923469543
Iteration 5000, loss=0.1662810742855072
Iteration 6000, loss=0.7548686861991882
Iteration 7000, loss=0.9571908712387085
Iteration 8000, loss=0.0072945524007081985
Iteration 9000, loss=0.4969686269760132
Iteration 10000, loss=0.09309297055006027
Iteration 11000, loss=0.7348788380622864
Iteration 12000, loss=0.5062342882156372
Iteration 13000, loss=0.9834762811660767
Iteration 14000, loss=0.4801466464996338
Iteration 15000, loss=1.0392485857009888
Iteration 16000, loss=2.5971810817718506
Iteration 17000, loss=0.4553026258945465
Iteration 18000, loss=0.17215649783611298
Iteration 19000, loss=0.3885313868522644
Iteration 20000, loss=2.79307222366333
Iteration 21000, loss=0.3432311713695526
Iteration 22000, loss=0.8328967094421387
Iteration 23000, loss=0.0007345391204580665
Iteration 24000, loss

Iteration 195000, loss=0.3932865858078003
Iteration 196000, loss=1.1050282716751099
Iteration 197000, loss=0.33030253648757935
Iteration 198000, loss=0.31553784012794495
Iteration 199000, loss=0.26304760575294495


In [19]:
# Persist validation_model (the cosine-similarity model), not the trained `model`.
# NOTE(review): both models are built on the same `embedding` layer object, so the
# learned embedding is presumably included in this file — confirm when reloading.
validation_model.save("word2vec-keras-model.h5")

In [24]:
# Print the nearest neighbours of "the" according to the similarity model.
sim_cb.run_sim("the")

Nearest to the: of, a, in, by, and, as, that, is,
