In [24]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import os
import urllib
import zipfile
import collections

In [25]:
def maybe_downlaod(filename, url, expected_bytes):
    """Fetch ``filename`` from ``url`` if it is not on disk, then check its size.

    Args:
        filename: Local path of the file; it is also appended to ``url`` to
            form the download address.
        url: Base URL that ``filename`` is concatenated onto.
        expected_bytes: Exact size in bytes the file must have.

    Returns:
        The path of the verified file.

    Raises:
        Exception: If the size on disk differs from ``expected_bytes``. The
            actual size is printed before raising.
    """
    # A bare `import urllib` does not load the `request` submodule, so
    # `urllib.request` is only reachable if some other package imported it
    # first. Import it explicitly to make the download path self-sufficient.
    import urllib.request

    # Only hit the network when the file is not already present locally.
    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)

    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified')
    else:
        print(statinfo.st_size)
        raise Exception('Failed to verify ' + filename)
    return filename

In [26]:
# Base URL of the corpus host; the archive name is appended to it on download.
url = 'http://mattmahoney.net/dc/'
filename = maybe_downlaod(
    filename='text8.zip',
    url=url,
    expected_bytes=31344016,
)

Found and verified


In [27]:
def read_data(filename):
    """Read the first member of a zip archive and split it into words.

    Args:
        filename: Path to the zip archive.

    Returns:
        A list of the whitespace-separated tokens of the first archive
        member, decoded as UTF-8.

    Raises:
        ValueError: If the archive contains no members.
    """
    with zipfile.ZipFile(filename) as archive:
        members = archive.namelist()
        if not members:
            # Fail with a clear message instead of an IndexError on members[0].
            raise ValueError('Zip archive contains no files: ' + str(filename))
        raw_bytes = archive.read(members[0])
    # ZipFile.read returns bytes; a plain UTF-8 decode gives the same text as
    # tf.compat.as_str without needing TensorFlow for this helper.
    return raw_bytes.decode('utf-8').split()

In [28]:
# Read the archive named by `filename` into a flat list of whitespace-separated
# words, then preview the first seven of them.
vocabulary = read_data(filename)
print(vocabulary[:7])

['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse']


In [31]:
def build_dataset(words, n_words):
    """Map a word sequence onto integer ids using a capped vocabulary.

    The vocabulary consists of an 'UNK' entry followed by the
    ``n_words - 1`` most frequent words; each entry gets the next free
    integer id in that order. Words outside the vocabulary are encoded as 0.

    Args:
        words: Sequence of word tokens.
        n_words: Vocabulary size, counting the 'UNK' entry.

    Returns:
        A 4-tuple ``(data, count, dictionary, reversed_dictionary)``:
        ``data`` is the list of ids, one per input word; ``count`` is
        ``['UNK', <number of out-of-vocabulary words>]`` followed by
        ``(word, frequency)`` tuples; ``dictionary`` maps word -> id;
        ``reversed_dictionary`` maps id -> word.

    NOTE(review): if the literal token 'UNK' is itself among the most common
    words, its id is reassigned and id 0 ends up with no dictionary entry --
    confirm inputs never contain it.
    """
    # Entry 0 is the out-of-vocabulary bucket; its tally is patched in below.
    count = [['UNK', -1]] + collections.Counter(words).most_common(n_words - 1)

    dictionary = {}
    for token, _freq in count:
        # Next free id is the current dictionary size.
        dictionary[token] = len(dictionary)

    lookup = dictionary.get
    data = []
    misses = 0
    for token in words:
        token_id = lookup(token)
        if token_id is None:
            # Not in the vocabulary: encode as 0, which stands for 'UNK'.
            token_id = 0
            misses += 1
        data.append(token_id)

    count[0][1] = misses
    reversed_dictionary = {token_id: token for token, token_id in dictionary.items()}
    return data, count, dictionary, reversed_dictionary

In [32]:
# Encode the corpus with a 100-entry vocabulary: the 'UNK' slot plus the
# 99 most frequent words.
data, count, dictionary, reversed_dictionary = build_dataset(
    words=vocabulary,
    n_words=100,
)

In [34]:
# Show the frequency table: the 'UNK' tally first, then the most common words
# with their occurrence counts.
count

[['UNK', 9018576],
 ('the', 1061396),
 ('of', 593677),
 ('and', 416629),
 ('one', 411764),
 ('in', 372201),
 ('a', 325873),
 ('to', 316376),
 ('zero', 264975),
 ('nine', 250430),
 ('two', 192644),
 ('is', 183153),
 ('as', 131815),
 ('eight', 125285),
 ('for', 118445),
 ('s', 116710),
 ('five', 115789),
 ('three', 114775),
 ('was', 112807),
 ('by', 111831),
 ('that', 109510),
 ('four', 108182),
 ('six', 102145),
 ('seven', 99683),
 ('with', 95603),
 ('on', 91250),
 ('are', 76527),
 ('it', 73334),
 ('from', 72871),
 ('or', 68945),
 ('his', 62603),
 ('an', 61925),
 ('be', 61281),
 ('this', 58832),
 ('which', 54788),
 ('at', 54576),
 ('he', 53573),
 ('also', 44358),
 ('not', 44033),
 ('have', 39712),
 ('were', 39086),
 ('has', 37866),
 ('but', 35358),
 ('other', 32433),
 ('their', 31523),
 ('its', 29567),
 ('first', 28810),
 ('they', 28553),
 ('some', 28161),
 ('had', 28100),
 ('all', 26229),
 ('more', 26223),
 ('most', 25563),
 ('can', 25519),
 ('been', 25383),
 ('such', 24413),
 ('many',