http://adventuresinmachinelearning.com/word2vec-tutorial-tensorflow/

In [27]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import os
import urllib
import zipfile
import collections
import math

In [2]:
def maybe_downlaod(filename, url, expected_bytes):
    # If file not already present, urllib request to download
    if not os.path.exists(filename): 
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print ('Found and verified')
    else:
        print (statinfo.st_size)
        raise Exception('Failed to verify ' + filename)
    return filename

In [3]:
url = 'http://mattmahoney.net/dc/'
filename = maybe_downlaod('text8.zip', url, 31344016)

Found and verified


In [4]:
def read_data(filename):
    # Extract first file as list of words...
    with zipfile.ZipFile(filename) as f:
        fileNames = f.namelist()
        fileContentAsString = f.read(fileNames[0])
        data = tf.compat.as_str(fileContentAsString).split()
    return data

In [5]:
vocabulary = read_data(filename)
print(vocabulary[:7])

['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse']


In [6]:
def build_dataset(words, n_words):
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = {}
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = []
    unk_count = 0
    for word in words:
        if word in dictionary:
            idx = dictionary[word]
        else:
            idx = 0 # This stands for 'UNK'
            unk_count += 1
        data.append(idx)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

In [9]:
data, count, dictionary, reversed_dictionary = build_dataset(vocabulary, 20)

In [12]:
data[:15]

[0, 0, 12, 6, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [13]:
count

[['UNK', 11668632],
 ('the', 1061396),
 ('of', 593677),
 ('and', 416629),
 ('one', 411764),
 ('in', 372201),
 ('a', 325873),
 ('to', 316376),
 ('zero', 264975),
 ('nine', 250430),
 ('two', 192644),
 ('is', 183153),
 ('as', 131815),
 ('eight', 125285),
 ('for', 118445),
 ('s', 116710),
 ('five', 115789),
 ('three', 114775),
 ('was', 112807),
 ('by', 111831)]

In [15]:
dictionary.items()

dict_items([('UNK', 0), ('the', 1), ('of', 2), ('and', 3), ('one', 4), ('in', 5), ('a', 6), ('to', 7), ('zero', 8), ('nine', 9), ('two', 10), ('is', 11), ('as', 12), ('eight', 13), ('for', 14), ('s', 15), ('five', 16), ('three', 17), ('was', 18), ('by', 19)])

In [16]:
reversed_dictionary.items()

dict_items([(0, 'UNK'), (1, 'the'), (2, 'of'), (3, 'and'), (4, 'one'), (5, 'in'), (6, 'a'), (7, 'to'), (8, 'zero'), (9, 'nine'), (10, 'two'), (11, 'is'), (12, 'as'), (13, 'eight'), (14, 'for'), (15, 's'), (16, 'five'), (17, 'three'), (18, 'was'), (19, 'by')])

In [17]:
data_index = 0
def generate_batch(data, batch_size, num_skips, skip_window):
    # Ensure proper setup
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2*skip_window
    
    batch = np.ndarray(
        shape=(batch_size),
        dtype=np.int32
    )
    context = np.ndarray(
        shape=(batch_size, 1), 
        dtype=np.int32
    )
    span = 2*skip_window + 1
    
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
        
    for i in range(batch_size // num_skips):
        target = skip_window   # Input word at center of buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span-1)
            targets_to_avoid.append(target)
            batch[i*num_skips+j] = buffer[skip_window]
            context[i * num_skips + j, 0] = buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    

In [20]:
valid_size = 16
valid_window = 100
valid_examples = np.random.choice(
    valid_window,
    valid_size,
    replace=False
)

# Make Tensorflow model
***

In [33]:
batch_size = 64
embedding_size = 128
skip_window = 1
num_skips = 2

vocabulary_size = 10000

In [39]:
train_inputs = tf.placeholder(
    tf.int32,
    shape = [batch_size]
)

train_context = tf.placeholder(
    tf.int32,
    shape = [batch_size, 1]
)

valid_dataset = tf.constant(
    value = valid_examples,
    dtype=tf.int32
)

In [40]:
embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)
)

embed = tf.nn.embedding_lookup(embeddings, train_inputs)

In [41]:
weights = tf.Variable(
    tf.truncated_normal(
        [vocabulary_size, embedding_size],
        stddev = 1.0/math.sqrt(embedding_size)
    )
)

biases = tf.Variable(
    tf.zeros([vocabulary_size])
)

hidden_out = tf.matmul(embed, tf.transpose(weights)) + biases

In [42]:
print ('embeddings: \t{}'.format(embeddings))
print ('embed: \t\t{}'.format(embed))
print ('weights: \t{}'.format(weights))
print ('biases: \t{}'.format(biases))
print ('hidden_out: \t{}'.format(hidden_out))


embeddings: 	<tf.Variable 'Variable_8:0' shape=(10000, 128) dtype=float32_ref>
embed: 		Tensor("embedding_lookup_2:0", shape=(64, 128), dtype=float32)
weights: 	<tf.Variable 'Variable_9:0' shape=(10000, 128) dtype=float32_ref>
biases: 	<tf.Variable 'Variable_10:0' shape=(10000,) dtype=float32_ref>
hidden_out: 	Tensor("add_2:0", shape=(64, 10000), dtype=float32)


In [43]:
train_one_hot = tf.one_hot(
    train_context,
    vocabulary_size
)

cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(
        logits = hidden_out,
        labels = train_one_hot
    )
)

optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(cross_entropy)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.



In [44]:
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
normalized_embeddings = embeddings / norm

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [45]:
valid_embeddings = tf.nn.embedding_lookup(
      normalized_embeddings, valid_dataset)

In [46]:
similarity = tf.matmul(
      valid_embeddings, normalized_embeddings, transpose_b=True)