### Unsupervised dimensionality reduction using a 1 Hidden-layer perceptron where label == ground truth
### For NLP, we can say somewhat say that word2vec and autoencoders are similiar.

> Dimensionality reduction works only if the inputs are correlated (like images from the same domain). It fails if we pass in completely random inputs each time we train an autoencoder. So in the end, an autoencoder can produce lower dimensional output (at the encoder) given an input much like Principal Component Analysis (PCA). And since we don’t have to use any labels during training, it’s an unsupervised model as well.

In [5]:
import os
from random import randint
from collections import Counter
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

import numpy as np
import tensorflow as tf

In [182]:
corpus = "the quick brown fox jumped over the lazy dog from the quick tall fox".split()
test_corpus = "the quick brown fox jumped over the lazy dog from the quick tall fox".split()
corpus[:10]

['the',
 'quick',
 'brown',
 'fox',
 'jumped',
 'over',
 'the',
 'lazy',
 'dog',
 'from']

In [183]:
def build_vocab(words, vocab_size):
    """ Build vocabulary of VOCAB_SIZE most frequent words """
    dictionary = dict()
    count = [('UNK', -1)]
    count.extend(Counter(words).most_common(vocab_size - 1))
    index = 0
    for word, _ in count:
        dictionary[word] = index
        index += 1
    index_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, index_dictionary

In [184]:
vocabulary, reverse_vocabulary = build_vocab(corpus, 100)

In [185]:
vocabulary

{'UNK': 0,
 'brown': 9,
 'dog': 7,
 'fox': 3,
 'from': 10,
 'jumped': 8,
 'lazy': 6,
 'over': 5,
 'quick': 2,
 'tall': 4,
 'the': 1}

In [186]:
def index_words_in_corpus(corpus):
    return [vocabulary[token] if token in vocabulary else 0 for token in corpus]

In [187]:
corpus = index_words_in_corpus(corpus)
test_corpus = index_words_in_corpus(test_corpus)

In [188]:
test_corpus

[1, 2, 9, 3, 8, 5, 1, 6, 7, 10, 1, 2, 4, 3]

In [189]:
vocabulary_size = len(vocabulary)
vocabulary_size

11

In [190]:
def one_hot_encode(index):
    row = np.zeros(vocabulary_size, dtype=np.int32)
    row[index] = 1
    return row

In [191]:
data = np.array([one_hot_encode(i) for i in corpus])
test_data = np.array([one_hot_encode(i) for i in test_corpus])

In [192]:
print("(TRAIN: Total number of words, Vocabulary size):", data.shape)
print("(TEST:  Total number of words, Vocabulary size):", test_data.shape)

(TRAIN: Total number of words, Vocabulary size): (14, 11)
(TEST:  Total number of words, Vocabulary size): (14, 11)


In [193]:
data[randint(1, data.shape[0])]

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0])

In [194]:
X = tf.placeholder(tf.float32, shape=(None, vocabulary_size))
Y = tf.placeholder(tf.float32, shape=(None, vocabulary_size))

In [196]:
w1 = tf.Variable(tf.random_normal(shape=(vocabulary_size, 1000), stddev=0.01), name='weights1')
b1 = tf.Variable(tf.zeros([1, 1000]), name="bias1")
layer1 = tf.nn.relu(tf.add(tf.matmul(X, w1), b1))

w2 = tf.Variable(tf.random_normal(shape=(1000, 250), stddev=0.01), name='weights2')
b2 = tf.Variable(tf.zeros([1, 250]), name="bias2")
layer2 = tf.nn.relu(tf.add(tf.matmul(layer1, w2), b2))

w = tf.Variable(tf.random_normal(shape=(250, 50), stddev=0.01), name='weights')
b = tf.Variable(tf.zeros([1, 50]), name="bias")
code = tf.nn.relu(tf.add(tf.matmul(layer2, w), b))

w3 = tf.Variable(tf.random_normal(shape=(50, 250), stddev=0.01), name='weights3')
b3 = tf.Variable(tf.zeros([1, 250]), name="bias3")
layer3 = tf.nn.relu(tf.add(tf.matmul(code, w3), b3))

w4 = tf.Variable(tf.random_normal(shape=(250, 1000), stddev=0.01), name='weights4')
b4 = tf.Variable(tf.zeros([1, 1000]), name="bias4")
layer4 = tf.nn.relu(tf.add(tf.matmul(layer3, w4), b4))

w5 = tf.Variable(tf.random_normal(shape=(1000, vocabulary_size), stddev=0.01), name='weights5')
b5 = tf.Variable(tf.zeros([1, vocabulary_size]), name="bias5")
decoder = tf.nn.sigmoid(tf.add(tf.matmul(layer4, w5), b5))

In [197]:
# entropy = tf.nn.softmax_cross_entropy_with_logits(logits=decoder, labels=Y)
loss = tf.reduce_mean(tf.pow(X - decoder, 2))

In [198]:
optimizer = tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE).minimize(loss)
init = tf.global_variables_initializer()

In [203]:
LEARNING_RATE = 0.01
NUM_TRAIN_STEPS = 1000
SKIP_STEP = 10 # how many steps to skip before reporting the loss

In [205]:
with tf.Session() as sess:
    sess.run(init)
    
    for i in range(NUM_TRAIN_STEPS):
        _, loss_val = sess.run([optimizer, loss], feed_dict={X: data})
            
        if i % SKIP_STEP == 0:
                print("EPOCH {}/{}, LOSS {}".format(i , NUM_TRAIN_STEPS, loss_val))
                
    test_data_compressed = sess.run(decoder, feed_dict={X: test_data})
#     np.save(outfile, test_data_compressed)
        

EPOCH 0/1000, LOSS 0.2499999850988388
EPOCH 10/1000, LOSS 0.24945718050003052
EPOCH 20/1000, LOSS 0.24854017794132233
EPOCH 30/1000, LOSS 0.24700087308883667
EPOCH 40/1000, LOSS 0.24443811178207397
EPOCH 50/1000, LOSS 0.24024417996406555
EPOCH 60/1000, LOSS 0.2336183488368988
EPOCH 70/1000, LOSS 0.22381845116615295
EPOCH 80/1000, LOSS 0.21059055626392365
EPOCH 90/1000, LOSS 0.19337916374206543
EPOCH 100/1000, LOSS 0.16469691693782806
EPOCH 110/1000, LOSS 0.10726149380207062
EPOCH 120/1000, LOSS 0.0821770429611206
EPOCH 130/1000, LOSS 0.08011706173419952
EPOCH 140/1000, LOSS 0.07989447563886642
EPOCH 150/1000, LOSS 0.07984154671430588
EPOCH 160/1000, LOSS 0.07981390506029129
EPOCH 170/1000, LOSS 0.07979799807071686
EPOCH 180/1000, LOSS 0.08027677237987518
EPOCH 190/1000, LOSS 0.08026178926229477
EPOCH 200/1000, LOSS 0.08014322817325592
EPOCH 210/1000, LOSS 0.08006715029478073
EPOCH 220/1000, LOSS 0.08003073930740356
EPOCH 230/1000, LOSS 0.08000756800174713
EPOCH 240/1000, LOSS 0.0799922

In [206]:
test_data_compressed.shape

(14, 11)

In [207]:
test_data_compressed

array([[ 0.00250547,  0.22422886,  0.15112008,  0.15111433,  0.07651036,
         0.07650194,  0.07649717,  0.07650229,  0.07651076,  0.07651436,
         0.07649098],
       [ 0.00250547,  0.22422886,  0.15112008,  0.15111433,  0.07651036,
         0.07650194,  0.07649717,  0.07650229,  0.07651076,  0.07651436,
         0.07649098],
       [ 0.00250547,  0.22422886,  0.15112008,  0.15111433,  0.07651036,
         0.07650194,  0.07649717,  0.07650229,  0.07651076,  0.07651436,
         0.07649098],
       [ 0.00250547,  0.22422886,  0.15112008,  0.15111433,  0.07651036,
         0.07650194,  0.07649717,  0.07650229,  0.07651076,  0.07651436,
         0.07649098],
       [ 0.00250547,  0.22422886,  0.15112008,  0.15111433,  0.07651036,
         0.07650194,  0.07649717,  0.07650229,  0.07651076,  0.07651436,
         0.07649098],
       [ 0.00250547,  0.22422886,  0.15112008,  0.15111433,  0.07651036,
         0.07650194,  0.07649717,  0.07650229,  0.07651076,  0.07651436,
         0.076

### Since our compressed data is in probabilities, we'll convert  to whole nums to look up words

In [208]:
test_data_compressed[test_data_compressed>0] = 1

In [209]:
test_data_compressed

array([[ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.]], dtype=float32)

In [210]:
test_data

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]])

### Tadaa!!! And here's our prediction
This show's how well our compression is able to recover data
> Remember that Autoencoders are lossy compression which means you will never be able to full reconstruct that data

In [212]:
sent = np.ndarray.tolist(test_data_compressed)[0]
print(' '.join([reverse_vocabulary[i] if sent[i] == 1. else "" for i in range(len(sent))]))

UNK the quick fox tall over lazy dog jumped brown from
