In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
import collections
import math
import os
import random
import zipfile

In [3]:
from six.moves import urllib
from six.moves import xrange

In [4]:
import numpy as np
import tensorflow as tf

In [5]:
# Record the library versions this notebook was run with.
print(np.__version__)
print(tf.__version__)

1.15.4
1.10.0


In [6]:
DOWNLOAD_FILENAME = "SampleText.zip"
def download(url_path, expected_bytes):
    """Fetch the corpus archive if it is missing, then verify its size.

    The archive is stored in the working directory as ``DOWNLOAD_FILENAME``.
    The size check runs whether or not a download took place, so a truncated
    or stale file left over from an earlier run is reported instead of being
    silently reused.

    Args:
        url_path: URL the archive is downloaded from when no local copy exists.
        expected_bytes: Exact size, in bytes, the local archive must have.

    Raises:
        Exception: If the size of the local archive differs from
            ``expected_bytes``.
    """
    if not os.path.exists(DOWNLOAD_FILENAME):
        urllib.request.urlretrieve(url_path, DOWNLOAD_FILENAME)

    # Verify unconditionally: an interrupted earlier download leaves a file
    # behind that would otherwise be accepted without any check.
    statinfo = os.stat(DOWNLOAD_FILENAME)
    if statinfo.st_size == expected_bytes:
        print("Found and verified file from this path : ", url_path)
        print("Downloaded filed: ", DOWNLOAD_FILENAME)
    else:
        print(statinfo.st_size)
        raise Exception("Failed to verify file from " + url_path + " Can you get with a browser?")

In [7]:
def read_words():
    """Return the whitespace-separated tokens of the archive's first member.

    Opens ``DOWNLOAD_FILENAME`` as a zip archive, reads the first entry listed
    in it, converts the bytes with ``tf.compat.as_str`` and splits the result
    on whitespace.
    """
    with zipfile.ZipFile(DOWNLOAD_FILENAME) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)

    # The bytes are fully read at this point, so decoding can happen after
    # the archive has been closed.
    return tf.compat.as_str(raw_bytes).split()

In [8]:
# Location of the training corpus archive and the exact size, in bytes, that
# the archive is expected to have.
URL_PATH = 'http://mattmahoney.net/dc/text8.zip'
FILE_SIZE = 31344016
# Download the archive unless a local copy already exists.
download(URL_PATH, FILE_SIZE)

In [9]:
voc = read_words()

In [10]:
len(voc)

17005207

In [11]:
voc[:25]

['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against',
 'early',
 'working',
 'class',
 'radicals',
 'including',
 'the',
 'diggers',
 'of',
 'the',
 'english',
 'revolution',
 'and',
 'the',
 'sans',
 'culottes']

In [12]:
def build_dataset(words, n_words):
    """Map a token sequence onto integer ids of a frequency-capped vocabulary.

    The vocabulary holds the ``n_words - 1`` most common tokens, preceded by
    an 'UNKOWN' entry (id 0) that stands in for every other token.

    Args:
        words: Sequence of string tokens.
        n_words: Vocabulary size, including the 'UNKOWN' entry.

    Returns:
        A 4-tuple ``(word_counts, word_indexes, dictionary, reversed_dictionary)``:
        ``word_counts`` is ``[['UNKOWN', n_unknown], (word, count), ...]`` in
        descending frequency, ``word_indexes`` is ``words`` encoded as ids,
        ``dictionary`` maps word -> id and ``reversed_dictionary`` maps
        id -> word.
    """
    frequencies = collections.Counter(words)
    word_counts = [['UNKOWN', -1]] + frequencies.most_common(n_words - 1)

    # Ids are handed out in list order, so id 0 is the unknown bucket and
    # smaller ids mean more frequent words.
    vocabulary = {}
    for entry in word_counts:
        vocabulary[entry[0]] = len(vocabulary)

    encoded = []
    misses = 0
    for token in words:
        token_id = vocabulary.get(token)
        if token_id is None:
            # Out-of-vocabulary token: route it to the unknown bucket.
            token_id = 0
            misses += 1
        encoded.append(token_id)

    # Replace the -1 placeholder with the real out-of-vocabulary count.
    word_counts[0][1] = misses

    reverse_vocabulary = {token_id: token for token, token_id in vocabulary.items()}

    return word_counts, encoded, vocabulary, reverse_vocabulary

In [13]:
# Vocabulary size: the (VOC_SIZE - 1) most frequent words plus the 'UNKOWN' bucket.
VOC_SIZE = 10000
word_counts, word_indexes, dictonary , reversed_dictonary = build_dataset(voc, VOC_SIZE)

In [14]:
word_counts[:10]

[['UNKOWN', 1737307],
 ('the', 1061396),
 ('of', 593677),
 ('and', 416629),
 ('one', 411764),
 ('in', 372201),
 ('a', 325873),
 ('to', 316376),
 ('zero', 264975),
 ('nine', 250430)]

In [15]:
word_indexes[:10]

[5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]

In [16]:
# Spot-check ten random word -> id entries of the vocabulary.
# `random` is already imported with the other modules at the top of the
# notebook, so it is not re-imported here.
for key in random.sample(list(dictonary), 10):
    print(key, ":", dictonary[key])

moments : 7008
state : 94
confusion : 3383
policy : 739
absinthe : 9441
houses : 2037
winter : 1444
bengals : 7699
sk : 5505
cowboy : 8141


In [17]:
# Spot-check ten random id -> word entries of the reverse vocabulary.
# `random` is already imported with the other modules at the top of the
# notebook, so it is not re-imported here.
for key in random.sample(list(reversed_dictonary),10):
    print(key, ":", reversed_dictonary[key])

9515 : ethnicity
1070 : magazine
4681 : maxwell
6763 : kyoto
7174 : click
6927 : bsd
2613 : anglo
1617 : begin
2220 : rail
4221 : mechanisms


In [18]:
del voc

In [19]:
# Position in word_indexes that generate_batch reads from and advances on each call.
global_index = 0

In [20]:
def generate_batch(word_indexes, batch_size, num_skips, skip_window):
    """Produce one skip-gram training batch from the encoded corpus.

    A window of ``2 * skip_window + 1`` consecutive word ids slides over
    ``word_indexes`` (wrapping around at the end). For every window position
    the centre word is emitted ``num_skips`` times, each time paired with a
    different, randomly chosen other word of the window.

    Reads and updates the module-level cursor ``global_index``.

    Args:
        word_indexes: Corpus encoded as a sequence of integer word ids.
        batch_size: Number of (input, context) pairs to produce; must be a
            multiple of ``num_skips``.
        num_skips: Context words drawn per centre word; at most
            ``2 * skip_window``.
        skip_window: Number of words considered on each side of the centre.

    Returns:
        ``(batch, labels)``: int32 arrays of shape ``(batch_size,)`` and
        ``(batch_size, 1)`` holding the input word ids and the matching
        context word ids.
    """
    global global_index

    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    batch = np.empty(batch_size, dtype=np.int32)
    labels = np.empty((batch_size, 1), dtype=np.int32)

    span = 2 * skip_window + 1
    corpus_len = len(word_indexes)

    # Fill the sliding window with the first `span` words at the cursor.
    window = collections.deque(maxlen=span)
    for _ in range(span):
        window.append(word_indexes[global_index])
        global_index = (global_index + 1) % corpus_len

    row = 0
    for _center in range(batch_size // num_skips):
        pick = skip_window
        taken = [skip_window]  # the centre position is never its own context

        for _draw in range(num_skips):
            # Redraw until a window position not used for this centre comes up.
            while pick in taken:
                pick = random.randint(0, span - 1)
            taken.append(pick)

            batch[row] = window[skip_window]  # the input word
            labels[row, 0] = window[pick]  # the context word
            row += 1

        # Slide the window one word forward (maxlen drops the oldest word).
        window.append(word_indexes[global_index])
        global_index = (global_index + 1) % corpus_len

    # Step the cursor back by one window so the next batch starts inside the
    # words that were only read as look-ahead here.
    global_index = (global_index + corpus_len - span) % corpus_len

    return batch, labels

In [21]:
# Demo batch: 10 pairs, 2 context words per input word, window of 5 words on each side.
batch, labels = generate_batch(word_indexes, 10, 2, 5)

In [22]:
batch

array([   2,    2, 3134, 3134,   46,   46,   59,   59,  156,  156],
      dtype=int32)

In [23]:
labels

array([[3081],
       [  46],
       [ 156],
       [   6],
       [  12],
       [ 742],
       [ 477],
       [  46],
       [3134],
       [ 128]], dtype=int32)

In [24]:
# Decode the ten (input word, context word) id pairs of the demo batch back to text.
for i in range(10):
    print(reversed_dictonary[batch[i]], ":", reversed_dictonary[labels[i][0]])

of : originated
of : first
abuse : against
abuse : a
first : as
first : working
used : class
used : first
against : abuse
against : early


In [25]:
# Rewind the corpus cursor: the demo batch above advanced it, and training
# should start from the first word.
global_index = 0

In [26]:
# Words used to inspect nearest neighbours during training: valid_size
# distinct ids drawn at random from the first valid_window ids. Ids follow
# frequency rank (see build_dataset), so these are among the most frequent
# entries of the vocabulary.
valid_size = 16
valid_window = 100

valid_examples = np.random.choice(valid_window, valid_size, replace=False)

In [27]:
# Passed to tf.nn.nce_loss as num_sampled: negative classes sampled per batch.
num_samples = 64

In [28]:
# Training hyperparameters.
# batch_size: (input word, context word) pairs fed per training step.
batch_size = 128
embedding_size = 50 # dimensionality of each word's embedding vector
# skip_window: words considered on each side of the input word.
skip_window = 2
# num_skips: context words drawn per input word.
num_skips = 2

In [29]:
# Clear the default graph before the graph is (re)built by the cells below.
tf.reset_default_graph()

# Fed per step: a batch of input word ids and, for each, one context word id.
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

In [30]:
# Ids of the validation words as a graph constant.
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

In [31]:
# Embedding matrix: one embedding_size-dimensional row per vocabulary id,
# initialised uniformly at random between -1.0 and 1.0.
embeddings = tf.Variable(

    tf.random_uniform([VOC_SIZE, embedding_size], -1.0, 1.0))

# Rows of the embedding matrix for the input words of the current batch.
embed = tf.nn.embedding_lookup(embeddings, train_inputs)

In [32]:
embeddings

<tf.Variable 'Variable:0' shape=(10000, 50) dtype=float32_ref>

In [33]:
embed

<tf.Tensor 'embedding_lookup:0' shape=(128, 50) dtype=float32>

In [34]:
# Output-side parameters used by the NCE loss: one weight row and one bias per vocabulary id.
nce_weights = tf.Variable(tf.truncated_normal([VOC_SIZE, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))

nce_bias = tf.Variable(tf.zeros(VOC_SIZE))

In [35]:
# Mean noise-contrastive estimation (NCE) loss over the batch: each true
# (input, context) pair is scored against num_samples sampled negative
# classes instead of the full VOC_SIZE-way softmax.
loss = tf.reduce_mean(
    tf.nn.nce_loss(weights=nce_weights,
                   biases=nce_bias,
                   labels=train_labels,
                   inputs=embed,
                   num_sampled=num_samples,
                   num_classes=VOC_SIZE))


In [36]:
# Training op: one gradient-descent step (learning rate 0.1) on the NCE loss.
optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

In [37]:
# Scale every embedding row to unit L2 length, so that dot products between
# rows are cosine similarities.
l2_norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))

normalized_embeddings = embeddings / l2_norm

In [38]:
# Unit-length embeddings of the validation words.
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)

In [39]:
valid_embeddings

<tf.Tensor 'embedding_lookup_1:0' shape=(16, 50) dtype=float32>

In [40]:
# Cosine similarity of each validation word against every vocabulary entry:
# shape (valid_size, VOC_SIZE).
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

In [41]:
# Op that initialises all variables defined above; run once per session.
init = tf.global_variables_initializer()

In [42]:
# Number of training steps; the extra 1 makes the last step (200000) land on
# the periodic loss / nearest-neighbour reports.
num_steps = 200001

In [None]:
# Train the skip-gram model. Every 2000 steps the mean loss since the previous
# report is printed; every 10000 steps the nearest neighbours of the
# validation words are printed. The unit-length embeddings are kept in
# `final_embeddings` once training is done.
with tf.Session() as session:
    init.run()

    top_k = 8 # number of nearest neighbours reported per validation word

    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(word_indexes, batch_size, num_skips, skip_window)

        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        # One optimisation step; the optimizer op's own result is discarded.
        # (The throwaway target is a plain ASCII underscore.)
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)

        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                # Mean over the 2000 steps accumulated since the last report;
                # at step 0 the single raw loss value is printed as-is.
                average_loss /= 2000

            print("Average loss at step", step, ": ", average_loss)
            average_loss = 0

        if step % 10000 == 0:
            sim = similarity.eval()

            for i in xrange(valid_size):
                valid_word = reversed_dictonary[valid_examples[i]]

                # Sort by descending similarity and skip position 0, which is
                # normally the word itself (similarity 1 with itself).
                nearest = (-sim[i, :]).argsort()[1: top_k+1]
                log_str = 'Nearest to %s: ' % valid_word

                for k in xrange(top_k):
                    close_word = reversed_dictonary[nearest[k]]
                    log_str = '%s %s' % (log_str, close_word)
                print(log_str)
            print("\n")
    final_embeddings = normalized_embeddings.eval()

Average loss at step 0 :  249.76617431640625
Nearest to no:  helps poll neo pixel elector emphasis pace scripture
Nearest to states:  conviction certainly physicists parents spellings heard bees protests
Nearest to be:  zeppelin bachelor pace scotland bankruptcy answered pakistani zones
Nearest to these:  latex coup holiday neighbouring served confession euclid traits
Nearest to may:  coded sullivan underlying drafted alloys increases root we
Nearest to was:  hebrew soap adds gain principally missions end friedrich
Nearest to when:  codified farther especially editions trotsky gary included demonstrate
Nearest to such:  wine flanders updated burkina violations closure bag chomsky
Nearest to i:  wonder consent dakota jay patterns particular cannon drinking
Nearest to six:  questions experts stressed ribbentrop ca espionage official publications
Nearest to nine:  eighty producer registered langle database q library session
Nearest to there:  deposed intense injection onset faced particle

Average loss at step 60000 :  5.296778514623642
Nearest to no:  helps sea el live often poll step nearly
Nearest to states:  himself certainly electron heard disease parents wrote medicine
Nearest to be:  whose hellenistic companion selected scotland that massachusetts sons
Nearest to these:  served italian coup holiday best pascal john doctrine
Nearest to may:  title group central we physician population food apple
Nearest to was:  is hebrew and were channels missions richard friedrich
Nearest to when:  especially included einstein described maryland o uk anchor
Nearest to such:  wine civil rate burkina updated wwii story votes
Nearest to i:  consent influential particular wonder patterns drinking longer anti
Nearest to six:  net four eight oxford five three ca feet
Nearest to nine:  one zero include five six seven give count
Nearest to there:  actor agave rand cat particles injection intense prior
Nearest to called:  believe aikido suicide austrian deaths computer issued named
Neares

Average loss at step 112000 :  4.867471558213234
Average loss at step 114000 :  4.893861685633659
Average loss at step 116000 :  4.822347330570221
Average loss at step 118000 :  4.858448916554451
Average loss at step 120000 :  4.807747573018074
Nearest to no:  helps often el poll sea pixel they nearly
Nearest to states:  certainly heard himself electron conviction parents wrote disease
Nearest to be:  hellenistic companion selected scotland whose useful pakistani sons
Nearest to these:  coup italian holiday served pascal doctrine best napoleon
Nearest to may:  physician we performer animated sullivan soap apple fossil
Nearest to was:  is were hebrew missions labour channels friedrich richard
Nearest to when:  included especially einstein codified maryland farther discs loose
Nearest to such:  chomsky wine flanders wwii updated parties rate learned
Nearest to i:  consent influential patterns wonder dakota jay junta drinking
Nearest to six:  four eight three five net nine one two
Nearest

Average loss at step 172000 :  4.770021822094917
Average loss at step 174000 :  4.743802294373512
Average loss at step 176000 :  4.773569930553436
Average loss at step 178000 :  4.760605023145676
Average loss at step 180000 :  4.772156841158867
Nearest to no:  helps often el poll they pixel sea drama
Nearest to states:  certainly heard electron himself conviction parents spellings tech
Nearest to be:  hellenistic companion selected scotland pakistani useful whose zones
Nearest to these:  coup italian holiday served pascal doctrine multimedia napoleon
Nearest to may:  physician sullivan we performer fossil animated pretty apple
Nearest to was:  is were labour missions hebrew friedrich being abilities
Nearest to when:  included especially einstein maryland codified peers discs worshipped
Nearest to such:  chomsky flanders wwii wine learned updated parties utilize
Nearest to i:  consent influential patterns junta dakota jay wonder leap
Nearest to six:  four eight three five one nine seven