In [1]:
import tensorflow as tf
from tensorflow import keras
# import keras
import numpy as np
import util
print (keras.__version__)
print (tf.__version__)

2.1.6-tf
1.9.0


In [2]:
vocab_size = 5000
data, count, dictionary, reverse_dictionray = util.collect_data(vocabulary_size=vocab_size)
print('\ndata: \n\t{}'.format(data[:5]))
print('\ncount: \n\t{}'.format(count[:5]))


Found and verified text8.zip
['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse']

data: 
	[0, 3081, 12, 6, 195]

count: 
	[['UNK', 2735459], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]


In [3]:
i = 0
for x, y in dictionary.items():
    print (x, y)
    i += 1
    if i > 10: break

UNK 0
the 1
of 2
and 3
one 4
in 5
a 6
to 7
zero 8
nine 9
two 10


In [4]:
i = 0
for x, y in reverse_dictionray.items():
    print (x, y)
    i += 1
    if i > 10: break

0 UNK
1 the
2 of
3 and
4 one
5 in
6 a
7 to
8 zero
9 nine
10 two


In [5]:
window_size = 3
vector_dim = 300
epochs = 200000

valid_size = 16
valid_window = 100
valid_examples = np.random.choice(
    valid_window,
    valid_size,
    replace=False
)

In [6]:
sampling_table = keras.preprocessing.sequence.make_sampling_table(vocab_size)
print ('sample table created')
couples, labels = keras.preprocessing.sequence.skipgrams(
    data,
    vocab_size,
    window_size = window_size,
    sampling_table = sampling_table
)
print ('skipgrams created')
word_target, word_context = zip(*couples)
print ('word_target and word_context zipped')
word_target = np.array(word_target, dtype='int32')
word_context = np.array(word_context, dtype='int32')

print (couples[:10], labels[:10])

sample table created
skipgrams created
word_target and word_context zipped
[[235, 2347], [400, 972], [871, 2987], [3100, 1085], [1491, 5], [1763, 3995], [3083, 21], [4382, 3546], [1850, 130], [1013, 2837]] [0, 0, 0, 1, 1, 0, 1, 1, 1, 0]


In [7]:
print ('couples, labels')
for i in range(10):
    print ('{},\t{}'.format(couples[i], labels[i]))

couples, labels
[235, 2347],	0
[400, 972],	0
[871, 2987],	0
[3100, 1085],	1
[1491, 5],	1
[1763, 3995],	0
[3083, 21],	1
[4382, 3546],	1
[1850, 130],	1
[1013, 2837],	0


# Create Keras model using functional API

In [8]:
input_target = keras.layers.Input((1,))
input_context = keras.layers.Input((1,))

embedding = keras.layers.Embedding(
    vocab_size,
    vector_dim,
    input_length=1,
    name='embedding'
)

In [9]:
target = embedding(input_target)
target = keras.layers.Reshape((vector_dim, 1))(target)

context = embedding(input_context)
context = keras.layers.Reshape((vector_dim, 1))(context)

In [10]:
print (target)
print (context)

Tensor("reshape/Reshape:0", shape=(?, 300, 1), dtype=float32)
Tensor("reshape_1/Reshape:0", shape=(?, 300, 1), dtype=float32)


```python
from sklearn.metrics.pairwise import cosine_similarity

similarity = keras.layers.merge(
    [target, context],
    mode = 'cos',
    dot_axes = 0
)

K = keras.backend

def l2_norm(x, axis=None):
    """
    takes an input tensor and returns the l2 norm along specified axis
    """

    square_sum = K.sum(K.square(x), axis=axis, keepdims=True)
    norm = K.sqrt(K.maximum(square_sum, K.epsilon()))

    return norm

def pairwise_cosine_sim(A_B):
    """
    A [batch x n x d] tensor of n rows with d dimensions
    B [batch x m x d] tensor of n rows with d dimensions

    returns:
    D [batch x n x m] tensor of cosine similarity scores between each point i<n, j<m
    """

    A_tensor, B_tensor = A_B
    A_mag = l2_norm(A, axis=2)
    B_mag = l2_norm(B, axis=2)
    num = K.batch_dot(A_tensor, K.permute_dimensions(B_tensor, (0,2,1)))
    den = (A_mag * K.permute_dimensions(B_mag, (0,2,1)))
    dist_mat =  num / den

    return dist_mat



```

In [11]:
similarity = keras.layers.dot(
    inputs=[target, context],
    axes=0,
    normalize=True)

In [12]:
dot_product = keras.layers.dot(
    [target, context],
    axes=1,
    normalize=False
)

dot_product = keras.layers.Reshape((1,))(dot_product)
output = keras.layers.Dense(1, activation=tf.nn.sigmoid)(dot_product)

In [13]:
model = keras.Model(
    inputs = [input_target, input_context],
    outputs = output
)
model.compile(
    loss=keras.losses.binary_crossentropy,
    optimizer=keras.optimizers.RMSprop()
)

In [14]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 300)       1500000     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
reshape (Reshape)               (None, 300, 1)       0           embedding[0][0]                  
__________

In [15]:
validation_model = keras.Model(
    inputs = [input_target, input_context],
    outputs = similarity
)

In [16]:
class SimilarityCallback:
    def run_sim(self):
        for i in range(valid_size):
            valid_word = reverse_dictionray[valid_examples[i]]
            top_k = 6   # Number nearest neighbors
            sim = self._get_sim(valid_examples[i])
            nearest = (-sim).argsort()[1:top_k + 1]
            log_str = 'Nearest to %s:' % valid_word
            for k in range(top_k):
                close_word = reverse_dictionray[nearest[k]]
                log_str = '%s %s, ' % (log_str, close_word)
            print (log_str)
            
    @staticmethod
    def _get_sim(valid_word_index):
        sim = np.zeros((vocab_size,))
        in_arr1 = np.zeros((1,))
        in_arr2 = np.zeros((1,))
        in_arr1[0,] = valid_word_index
        for i in range(vocab_size):
            in_arr2[0,] = i
            out = validation_model.predict_on_batch([in_arr1, in_arr2])
            sim[i] = out
        return sim
    
sim_cb = SimilarityCallback()

In [17]:
arr_1 = np.zeros((1,))
arr_2 = np.zeros((1,))
arr_3 = np.zeros((1,))
for cnt in range(epochs):
    idx = np.random.randint(0, len(labels)-1)
    arr_1[0,] = word_target[idx]
    arr_2[0,] = word_context[idx]
    arr_3[0,] = labels[idx]
    loss = model.train_on_batch([arr_1, arr_2], arr_3)
    if cnt % 250 == 0:
        print("Iteration {}, loss={}".format(cnt, loss))
    if cnt % 10000 == 0:
        sim_cb.run_sim()

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Iteration 0, loss=1.0000001537946446e-07
Nearest to called: describing,  libraries,  specialized,  virus,  cleveland,  firm, 
Nearest to their: describing,  libraries,  specialized,  virus,  cleveland,  firm, 
Nearest to on: describing,  libraries,  specialized,  virus,  cleveland,  firm, 
Nearest to it: describing,  libraries,  specialized,  virus,  cleveland,  firm, 
Nearest to while: describing,  libraries,  specialized,  virus,  cleveland,  firm, 
Nearest to from: describing,  libraries,  specialized,  virus,  cleveland,  firm, 
Nearest to all: describing,  libraries,  specialized,  virus,  cleveland,  firm, 
Nearest to when: describing,  libraries,  specialized,  virus,  cleveland,  firm, 
Nearest to a: describing,  libraries,  specialized,  virus,  cleveland,  firm, 
Nearest to known: describing,  libraries,  specialized,  virus,  cleveland,  firm, 
Nearest to has: describing,  libraries,  specialized,  virus,  cleveland,  firm, 
Nearest to of: describing,  libraries,  specialize

KeyboardInterrupt: 