In [34]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [35]:
import collections
import math
import os
import random 
import zipfile

In [36]:
from six.moves import urllib
from six.moves import xrange

In [37]:
import numpy as np
import tensorflow as tf

In [38]:
print(np.__version__)
print(tf.__version__)

1.18.1
2.1.0


In [39]:
Downloaded_Filename = 'WordEmbeddingData.zip'

def maybe_download(url_path, expected_bytes):
    if not os.path.exists(Downloaded_Filename):
        filename, _ = urllib.request.urlretrieve(url_path, Downloaded_Filename)
        
        statinfo = os.stat(Downloaded_Filename)
        if statinfo.st_size == expected_bytes:
            print('Found and verified file from this path: ', url_path)
            print('Downloaded file: ', Downloaded_Filename)
        else:
            print(statinfo.st_size)
            raise Exception(
            'failed to verify file from: ' + url_path + '. Can you get to it with a browser?')
        

In [49]:
def read_words():
    with zipfile.ZipFile(Downloaded_Filename) as f:
        firstfile = f.namelist()[0]
        filestring = tf.compat.as_str(f.read(firstfile))
        words = filestring.split()
        
    return words

In [50]:
URL_PATH = 'http://mattmahoney.net/dc/text8.zip'
FILESIZE = 31344016

maybe_download(URL_PATH, FILESIZE)

In [51]:
vocabulary = read_words()

In [52]:
len(vocabulary)

17005207

In [54]:
vocabulary[:25]

['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against',
 'early',
 'working',
 'class',
 'radicals',
 'including',
 'the',
 'diggers',
 'of',
 'the',
 'english',
 'revolution',
 'and',
 'the',
 'sans',
 'culottes']

In [61]:
def build_dataset(words, n_words):
    word_counts = [['UNLNOWN', -1]]
    
    counter = collections.Counter(words)
    word_counts.extend(counter.most_common(n_words - 1))
    
    dictionary = dict()
    
    for word, _ in word_counts:
        dictionary[word] = len(dictionary)
        
    word_indexes = list()
    
    unknown_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0 #dictionary['UNKNOWN']
            unknown_count += 1;
            
        word_indexes.append(index)
        
    word_counts[0][1] = unknown_count
    
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))

    return word_counts, word_indexes, dictionary, reversed_dictionary

In [66]:
VOCABULARY_SIZE = 5000

word_counts, word_indexes, dictionary, reversed_dictionary = build_dataset(
    vocabulary, VOCABULARY_SIZE)

In [67]:
word_counts[:10]

[['UNLNOWN', 2735459],
 ('the', 1061396),
 ('of', 593677),
 ('and', 416629),
 ('one', 411764),
 ('in', 372201),
 ('a', 325873),
 ('to', 316376),
 ('zero', 264975),
 ('nine', 250430)]

In [68]:
word_indexes[:10]

[0, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]

In [73]:
import random

for key in random.sample(list(dictionary), 10):
    print(key, ":", dictionary[key])

greek : 321
institutions : 1775
annual : 1766
who : 57
claims : 904
alternate : 2656
largely : 824
belgian : 3893
managed : 2215
technological : 4831


In [77]:
import random

for key in random.sample(list(reversed_dictionary), 10):
    print(key, ":", reversed_dictionary[key])

476 : died
2346 : colonial
4310 : reject
3871 : extend
590 : our
4275 : silent
3561 : quantity
3125 : angle
2837 : heritage
3418 : custom


In [78]:
del vocabulary

In [79]:
#global index into words maintained across batches
global_index = 0

In [80]:
def generate_batch(word_indexes, batch_size, num_skips, skip_window):
    global global_index
    
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    
    batch = np.ndarray(shape = (batch_size), dtype=np.int32)
    labels = np.ndarray(shape = (batch_size, 1), dtype=np.int32)
    
    span = 2 * skip_window + 1 #[ skip_window input_word skip_window ]
    
    buffer = collections.deque(maxlen = span)
    
    for _ in range(span):
        buffer.append(word_indexes[global_index])
        global_index = (global_index + 1) % len(word_indexes)
        
    for i in range(batch_size // num_skips):
        target = skip_window #input word at the center of the buffer
        targets_to_avoid = [skip_window]
        
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
                
            targets_to_avoid.append(target)
            
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
            
        buffer.append(word_indexes[global_index])
        global_index = (global_index + 1) % len(word_indexes)
        
    global_index = (global_index + len(word_indexes) - span) % len(word_indexes)
    
    return batch, labels


In [81]:
batch, labels = generate_batch(word_indexes, 10, 2, 5)

In [82]:
batch

array([   2,    2, 3134, 3134,   46,   46,   59,   59,  156,  156],
      dtype=int32)

In [83]:
labels

array([[3134],
       [   6],
       [ 156],
       [   2],
       [ 128],
       [ 742],
       [3134],
       [ 477],
       [ 134],
       [ 742]], dtype=int32)

In [84]:
for i in range(9):
    print(reversed_dictionary[batch[i]], ":", reversed_dictionary[labels[i][0]])

of : abuse
of : a
abuse : against
abuse : of
first : early
first : working
used : abuse
used : class
against : including


In [86]:
global_index = 0

In [87]:
valid_size = 16
valid_window = 100

valid_examples = np.random.choice(valid_window, valid_size, replace = False)

In [88]:
batch_size = 128
embedding_size = 50
skip_window = 2
num_skips = 2

In [95]:
tf.compat.v1.reset_default_graph()
tf.compat.v1.disable_eager_execution()

train_inputs = tf.compat.v1.placeholder(tf.int32, shape = [batch_size])
train_labels = tf.compat.v1.placeholder(tf.int32, shape = [batch_size, 1])

In [96]:
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

In [99]:
embeddings = tf.Variable(
    tf.compat.v1.random_uniform([VOCABULARY_SIZE, embedding_size], -1.0, 1.0))

embed = tf.nn.embedding_lookup(embeddings, train_inputs)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [100]:
embeddings

<tf.Variable 'Variable:0' shape=(5000, 50) dtype=float32>

In [101]:
embed

<tf.Tensor 'embedding_lookup/Identity_1:0' shape=(128, 50) dtype=float32>

In [104]:
weights = tf.Variable(tf.compat.v1.truncated_normal([VOCABULARY_SIZE, embedding_size],
                                         stddev = 1.0 / math.sqrt(embedding_size)))

biases = tf.Variable(tf.zeros([VOCABULARY_SIZE]))

hidden_out = tf.matmul(embed, tf.transpose(weights)) + biases


In [105]:
hidden_out

<tf.Tensor 'add:0' shape=(128, 5000) dtype=float32>

In [106]:
train_one_hot =  tf.one_hot(train_labels, VOCABULARY_SIZE)

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=hidden_out,
                                                             labels = train_one_hot))

In [109]:
optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.1).minimize(loss)

In [113]:
l2_norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, True))

normalized_embeddings = embeddings / l2_norm

In [114]:
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)

In [115]:
valid_embeddings

<tf.Tensor 'embedding_lookup_1/Identity:0' shape=(16, 50) dtype=float32>

In [116]:
normalized_embeddings

<tf.Tensor 'truediv_1:0' shape=(5000, 50) dtype=float32>

In [118]:
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

In [122]:
init = tf.compat.v1.global_variables_initializer()

In [124]:
num_steps = 201

In [129]:
with tf.compat.v1.Session() as session:
    init.run()
    
    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(
        word_indexes, batch_size, num_skips, skip_window)
        
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        
        _, loss_val = session.run([optimizer, loss], feed_dict)
        average_loss += loss_val
        
        if step % 200 == 0:
            if step > 0:
                average_loss /= 2000
                
                print('Average_loss at step ', step, ':', average_loss)
                average_loss = 0
                    
        if step % 100 == 0:
            sim = similarity.eval()
        
            for i in xrange(valid_size):
                valid_word = reversed_dictionary[valid_examples[i]]
                top_k = 8
                
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest tp %s:' % valid_word
                
                for k in xrange(top_k):
                    close_word = reversed_dictionary[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)
                
            print("\n")
            

Nearest tp first: drive, various, parks, difference, recording, man, jersey, legend,
Nearest tp all: conventional, expelled, mm, proclaimed, worlds, hollywood, bce, tissue,
Nearest tp up: considerable, waste, principal, abilities, stations, nato, termed, f,
Nearest tp from: funeral, eating, vessels, organization, arguably, luther, psychology, vs,
Nearest tp if: will, could, arabs, official, aspects, result, contrast, m,
Nearest tp have: sales, hundreds, reaching, continued, dollars, brian, spend, worth,
Nearest tp can: phenomena, executive, model, tested, attorney, categories, identical, concepts,
Nearest tp as: comment, dating, implemented, est, meaning, approaches, updated, apply,
Nearest tp and: corner, individual, pro, actions, operator, mechanics, tales, directors,
Nearest tp there: output, guitarist, below, spring, creature, hard, emphasis, corresponds,
Nearest tp world: vowel, variation, percentage, ce, amounts, locally, administered, type,
Nearest tp over: board, und, simply, o

Average_loss at step  600 : 0.7302287192344665
Nearest tp first: drive, various, parks, difference, recording, man, jersey, engineers,
Nearest tp all: conventional, mm, expelled, proclaimed, worlds, hollywood, bce, tissue,
Nearest tp up: considerable, waste, principal, abilities, stations, nato, termed, f,
Nearest tp from: funeral, eating, vessels, luther, organization, arguably, vs, psychology,
Nearest tp if: will, could, arabs, official, aspects, result, contrast, m,
Nearest tp have: sales, hundreds, reaching, continued, dollars, brian, spend, scales,
Nearest tp can: phenomena, executive, model, tested, attorney, categories, identical, concepts,
Nearest tp as: comment, implemented, dating, est, meaning, approaches, updated, apply,
Nearest tp and: corner, individual, operator, actions, pro, mechanics, tales, bearing,
Nearest tp there: output, guitarist, below, spring, creature, hard, emphasis, corresponds,
Nearest tp world: vowel, variation, percentage, ce, amounts, locally, administe

Average_loss at step  1200 : 0.68186017537117
Nearest tp first: drive, various, parks, difference, recording, man, jersey, engineers,
Nearest tp all: conventional, mm, proclaimed, expelled, worlds, hollywood, bce, tissue,
Nearest tp up: considerable, waste, principal, abilities, stations, nato, termed, f,
Nearest tp from: funeral, eating, vessels, luther, organization, vs, arguably, psychology,
Nearest tp if: will, could, arabs, official, aspects, result, contrast, m,
Nearest tp have: sales, reaching, hundreds, continued, dollars, spend, brian, scales,
Nearest tp can: phenomena, executive, model, tested, attorney, categories, identical, concepts,
Nearest tp as: comment, implemented, dating, est, meaning, approaches, updated, apply,
Nearest tp and: corner, individual, operator, actions, pro, mechanics, scientific, taylor,
Nearest tp there: output, guitarist, below, spring, creature, hard, emphasis, corresponds,
Nearest tp world: vowel, variation, percentage, ce, amounts, locally, type, 

Average_loss at step  1800 : 0.6491164963245392
Nearest tp first: drive, various, parks, difference, recording, jersey, man, engineers,
Nearest tp all: conventional, mm, proclaimed, expelled, worlds, hollywood, bce, tissue,
Nearest tp up: considerable, waste, principal, abilities, stations, nato, termed, f,
Nearest tp from: funeral, eating, vessels, luther, vs, organization, arguably, psychology,
Nearest tp if: will, could, arabs, official, aspects, result, contrast, m,
Nearest tp have: sales, reaching, hundreds, continued, dollars, spend, scales, brian,
Nearest tp can: phenomena, executive, model, tested, attorney, categories, identical, concepts,
Nearest tp as: comment, implemented, dating, meaning, est, approaches, updated, increased,
Nearest tp and: corner, individual, operator, actions, mechanics, pro, scientific, taylor,
Nearest tp there: output, guitarist, below, spring, creature, hard, emphasis, corresponds,
Nearest tp world: vowel, variation, percentage, ce, amounts, locally, 