In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
import collections
import math
import os
import random
import zipfile

In [3]:
from six.moves import urllib
from six.moves import xrange

In [4]:
import numpy as np
import tensorflow as tf

In [5]:
# Record the NumPy and TensorFlow versions this notebook was run with.
print(np.__version__)
print(tf.__version__)

1.15.4
1.10.0


In [6]:
# Local file name (relative to the working directory) of the corpus archive.
DOWNLOAD_FILENAME = "SampleText.zip"
def download(url_path, expected_bytes):
    """Fetch the corpus archive if it is missing and verify its size.

    The archive is stored as ``DOWNLOAD_FILENAME``. It is downloaded only when
    that file does not exist yet, but the size check runs on every call, so a
    truncated or stale file left behind by an earlier run is reported instead
    of being silently reused.

    Args:
        url_path: URL the archive is downloaded from.
        expected_bytes: Expected size of the archive on disk, in bytes.

    Raises:
        Exception: If the size of the file on disk differs from
            ``expected_bytes``.
    """
    if not os.path.exists(DOWNLOAD_FILENAME):
        urllib.request.urlretrieve(url_path, DOWNLOAD_FILENAME)

    # Verify unconditionally: the file may come from an earlier, interrupted run.
    statinfo = os.stat(DOWNLOAD_FILENAME)
    if statinfo.st_size != expected_bytes:
        # Show the actual size to help diagnose a partial download.
        print(statinfo.st_size)
        raise Exception("Failed to verify file from " + url_path + " Can you get with a browser?")

    print("Found and verified file from this path : ", url_path)
    print("Downloaded filed: ", DOWNLOAD_FILENAME)

In [7]:
def read_words():
    """Return the whitespace-separated tokens of the first archive member.

    Opens the zip archive named by ``DOWNLOAD_FILENAME``, reads the first file
    listed in it, converts its bytes to ``str`` and splits on whitespace.

    Returns:
        list of str: The tokens in file order.
    """
    with zipfile.ZipFile(DOWNLOAD_FILENAME) as archive:
        member_name = archive.namelist()[0]
        raw_bytes = archive.read(member_name)
        return tf.compat.as_str(raw_bytes).split()

In [None]:
# Source URL and expected size (in bytes) of the corpus archive.
URL_PATH = 'http://mattmahoney.net/dc/text8.zip'
FILE_SIZE = 31344016
# The download itself is skipped when the archive file already exists locally.
download(URL_PATH, FILE_SIZE)

In [8]:
# Load the corpus as a list of whitespace-separated tokens.
voc = read_words()

In [9]:
# Total number of tokens in the corpus.
len(voc)

17005207

In [10]:
# Peek at the first 25 tokens.
voc[:25]

['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against',
 'early',
 'working',
 'class',
 'radicals',
 'including',
 'the',
 'diggers',
 'of',
 'the',
 'english',
 'revolution',
 'and',
 'the',
 'sans',
 'culottes']

In [12]:
def build_dataset(words, n_words):
    """Encode a token sequence as integer ids over a frequency-ranked vocabulary.

    Id 0 is reserved for the 'UNKOWN' placeholder; the ``n_words - 1`` most
    common words receive the following ids in order of decreasing frequency.
    Every token outside that vocabulary is encoded as 0.

    Args:
        words: Sequence of word tokens (iterated more than once).
        n_words: Vocabulary size, counting the placeholder entry.

    Returns:
        tuple: ``(word_counts, word_indexes, dictonary, reversed_dictonary)``
            where ``word_counts`` lists ``[word, count]`` entries starting with
            the placeholder and its out-of-vocabulary count, ``word_indexes``
            is the encoded token sequence, ``dictonary`` maps word -> id and
            ``reversed_dictonary`` maps id -> word.
    """
    word_counts = [['UNKOWN', -1]]
    word_counts += collections.Counter(words).most_common(n_words - 1)

    # Ids follow the order of word_counts, so lower ids are more frequent words.
    dictonary = {}
    for entry in word_counts:
        dictonary[entry[0]] = len(dictonary)

    word_indexes = []
    missing_total = 0
    for token in words:
        token_id = dictonary.get(token)
        if token_id is None:
            # Out-of-vocabulary token: fall back to the placeholder id.
            token_id = 0
            missing_total += 1
        word_indexes.append(token_id)

    # Replace the -1 sentinel with the real out-of-vocabulary count.
    word_counts[0][1] = missing_total

    reversed_dictonary = {token_id: token for token, token_id in dictonary.items()}

    return word_counts, word_indexes, dictonary, reversed_dictonary

In [13]:
# Vocabulary size: the placeholder entry plus the VOC_SIZE - 1 most common words.
VOC_SIZE = 5000
word_counts, word_indexes, dictonary , reversed_dictonary = build_dataset(voc, VOC_SIZE)

In [14]:
# Placeholder entry (with its out-of-vocabulary count) followed by the most common words.
word_counts[:10]

[['UNKOWN', 2735459],
 ('the', 1061396),
 ('of', 593677),
 ('and', 416629),
 ('one', 411764),
 ('in', 372201),
 ('a', 325873),
 ('to', 316376),
 ('zero', 264975),
 ('nine', 250430)]

In [15]:
# Ids of the first 10 corpus tokens; 0 is the id used for out-of-vocabulary words.
word_indexes[:10]

[0, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]

In [27]:
# Spot-check 10 random word -> id entries.
# (random is already imported near the top of the notebook; the repeat is harmless.)
import random
for key in random.sample(list(dictonary), 10):
    print(key, ":", dictonary[key])

men : 371
prisoners : 3536
missile : 2861
impossible : 2182
galaxy : 3445
trees : 2211
suggested : 1552
aka : 4061
confederation : 4614
revenue : 4150


In [29]:
# Spot-check 10 random id -> word entries.
# (random is already imported near the top of the notebook; the repeat is harmless.)
import random
for key in random.sample(list(reversed_dictonary),10):
    print(key, ":", reversed_dictonary[key])

1698 : ibm
991 : relationship
4666 : elvis
385 : good
228 : among
4986 : critique
3861 : nelson
3742 : turing
4421 : sweet
2506 : consciousness


In [30]:
# Drop the raw token list; word_indexes already holds the encoded corpus.
del voc

In [31]:
# Read position in word_indexes; generate_batch reads and updates it via `global`.
global_index = 0

In [32]:
def generate_batch(word_indexes, batch_size, num_skips, skip_window):
    """Build one batch of (center word, context word) id pairs.

    A window of ``2 * skip_window + 1`` consecutive ids slides over
    ``word_indexes`` starting at the module-level ``global_index`` and wrapping
    around at the end. For each window position, ``num_skips`` distinct context
    positions (never the center) are drawn at random and paired with the
    center word.

    Args:
        word_indexes: Sequence of integer word ids.
        batch_size: Number of pairs to produce; must be a multiple of
            ``num_skips``.
        num_skips: Context words drawn per center word; at most
            ``2 * skip_window``.
        skip_window: Number of words considered on each side of the center.

    Returns:
        tuple: ``(batch, labels)`` where ``batch`` is an int32 array of shape
            ``(batch_size,)`` with center word ids and ``labels`` is an int32
            array of shape ``(batch_size, 1)`` with the paired context ids.

    Side effects:
        Updates the module-level ``global_index``.
    """
    global global_index

    assert batch_size % num_skips == 0
    assert num_skips <=  2 * skip_window

    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)

    corpus_len = len(word_indexes)
    span = 2 * skip_window + 1

    # Fill the sliding window with the first `span` ids.
    window = collections.deque(maxlen=span)
    for _ in range(span):
        window.append(word_indexes[global_index])
        global_index = (global_index + 1) % corpus_len

    row = 0
    for _ in range(batch_size // num_skips):
        # The center position is excluded up front; drawn positions join the list.
        used_positions = [skip_window]
        pick = skip_window

        for _ in range(num_skips):
            while pick in used_positions:
                pick = random.randint(0, span - 1)
            used_positions.append(pick)

            batch[row] = window[skip_window]  # the input word
            labels[row, 0] = window[pick]  # the context word
            row += 1

        # Slide the window one token forward (the deque drops the oldest id).
        window.append(word_indexes[global_index])
        global_index = (global_index + 1) % corpus_len

    # Step back by one window so the next call re-reads the last `span` ids.
    global_index = (global_index + corpus_len - span) % corpus_len

    return batch, labels

In [33]:
# Demo batch: 10 pairs, 2 context words per center word, window of 5 words on each side.
batch, labels = generate_batch(word_indexes, 10, 2, 5)

In [34]:
# Center (input) word ids; each one is repeated num_skips times.
batch

array([   2,    2, 3134, 3134,   46,   46,   59,   59,  156,  156],
      dtype=int32)

In [35]:
# Context word id paired with each entry of batch.
labels

array([[ 128],
       [  46],
       [  12],
       [   2],
       [  12],
       [ 156],
       [ 195],
       [3134],
       [ 742],
       [  59]], dtype=int32)

In [36]:
# Decode the demo pairs back to text as "input word : context word".
for i in range(10):
    print(reversed_dictonary[batch[i]], ":", reversed_dictonary[labels[i][0]])

of : early
of : first
abuse : as
abuse : of
first : as
first : against
used : term
used : abuse
against : working
against : used


In [37]:
# Rewind the corpus position that the demo batch above moved.
global_index = 0

In [38]:
# Draw valid_size distinct ids from [0, valid_window). Ids are assigned in
# frequency order by build_dataset, so these come from the lowest (most
# frequent) ids. Presumably used later as validation words — confirm downstream.
valid_size = 16
valid_window = 100

valid_examples = np.random.choice(valid_window, valid_size, replace=False)

In [39]:
# Training hyperparameters.
batch_size = 128  # number of (input, context) pairs per batch
embedding_size = 50 # dimensionality of each word embedding (width of the hidden layer)
skip_window = 2  # words considered on each side of the center word
num_skips = 2  # context words drawn per center word

In [41]:
# Clear the default graph so that re-running the cells below starts from scratch.
tf.reset_default_graph()

# Inputs of one training step; the shapes match the (batch, labels) arrays
# returned by generate_batch for this batch_size.
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

In [42]:
# The sampled validation ids as a constant int32 tensor in the graph.
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

In [44]:
# Embedding matrix: one row of embedding_size values per vocabulary id,
# initialised uniformly at random between -1.0 and 1.0.
embeddings = tf.Variable(

    tf.random_uniform([VOC_SIZE, embedding_size], -1.0, 1.0))

# Rows of the embedding matrix selected by the input word ids of the batch.
embed = tf.nn.embedding_lookup(embeddings, train_inputs)

In [45]:
# Inspect the embedding variable (shape and dtype).
embeddings

<tf.Variable 'Variable:0' shape=(5000, 50) dtype=float32_ref>

In [46]:
# Inspect the looked-up embeddings for one batch (shape and dtype).
embed

<tf.Tensor 'embedding_lookup:0' shape=(128, 50) dtype=float32>

In [47]:
# Output-layer weights: one embedding_size-long row per vocabulary id, drawn from
# a truncated normal with standard deviation 1 / sqrt(embedding_size).
weights = tf.Variable(tf.truncated_normal([VOC_SIZE, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))

# Output-layer bias: one value per vocabulary id, starting at zero.
bias = tf.Variable(tf.zeros(VOC_SIZE))

In [48]:
# One score per vocabulary id for every input word; passed as the logits of the loss below.
hidden_out = tf.matmul(embed, tf.transpose(weights)) + bias

In [49]:
# Inspect the logits tensor (shape and dtype).
hidden_out

<tf.Tensor 'add:0' shape=(128, 5000) dtype=float32>

In [50]:
# One-hot encode the context word ids over the vocabulary.
# NOTE(review): train_labels is [batch_size, 1], so this should come out as
# [batch_size, 1, VOC_SIZE] while hidden_out is [batch_size, VOC_SIZE] — confirm
# the loss handles the extra axis as intended.
train_one_hot = tf.one_hot(train_labels, VOC_SIZE)

In [52]:
# Mean softmax cross-entropy between the logits and the one-hot context words.
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=hidden_out, labels=train_one_hot))

# One plain gradient-descent step (learning rate 0.1) that minimises the loss.
optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(loss)