# HW 6 Group 7
- Before running the notebook, download the dataset (`bible.txt`) from Stud.IP and place it in the same directory as this notebook.

## General Setup

In [1]:
import tensorflow as tf
import numpy as np
from nltk.tokenize import RegexpTokenizer
from collections import Counter

In [2]:
# Session settings shared by every tf.Session created in this notebook.
# allow_growth=True selects incremental GPU usage; the CPU-only build ignores it.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
# Remember to hand it to each session: tf.Session() -> tf.Session(config=config)

## Data preparation and visualization

In [3]:
# Helper functions
def tokenize_text(text):
    r"""Lower-case ``text`` and tokenize it with NLTK's ``RegexpTokenizer``.

    The tokenizer is constructed with the pattern ``\w+``.

    Args:
        text: The raw text as a single string.

    Returns:
        Whatever ``RegexpTokenizer.tokenize`` returns for the lower-cased text.
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    return word_tokenizer.tokenize(text.lower())

def create_dicts_from_tokenized_text(tokenized_text, vocabulary_size):
    """Build word -> id and id -> word lookup tables from a token sequence.

    The ``vocabulary_size - 1`` most frequent tokens receive the ids
    1, 2, ... in order of decreasing frequency. Id 0 is reserved for the
    placeholder entry ``"_UNKNOWN_"``.

    Args:
        tokenized_text: Iterable of tokens to count.
        vocabulary_size: Upper bound on the number of ids, placeholder included.

    Returns:
        A ``(word2id, id2word)`` tuple of dictionaries that are inverse
        mappings of each other.
    """
    frequency_ranked = Counter(tokenized_text).most_common(vocabulary_size - 1)

    word2id = {}
    for rank, entry in enumerate(frequency_ranked, start=1):
        word2id[entry[0]] = rank
    word2id["_UNKNOWN_"] = 0

    id2word = {word_id: word for word, word_id in word2id.items()}
    return word2id, id2word

def find_and_print_nearest_neighbors(target_words, number_of_nearest_neighbors):
    """Print, for each target word, the vocabulary entries with the highest cosine similarity.

    Relies on module-level names instead of parameters: ``sess`` (used to
    evaluate ``embeddings``), ``embeddings``, ``word2id`` and ``id2word``.
    The ranking covers every embedding row, the target's own row included.

    Args:
        target_words: Iterable of words to look up. Each one is used as a key
            of ``word2id``, so a missing word raises ``KeyError``.
        number_of_nearest_neighbors: How many top-ranked entries to print per word.
    """
    embedding_matrix = sess.run(embeddings)

    # Scale every row to unit L2 norm so that a dot product is the cosine similarity.
    row_norms = np.sqrt(np.sum(embedding_matrix ** 2, axis=1, keepdims=True))
    unit_embeddings = embedding_matrix / row_norms

    for word in target_words:
        query_vector = unit_embeddings[word2id[word], :]
        similarities = unit_embeddings @ query_vector
        # argsort is ascending, so rank the negated scores to get best-first order.
        ranked_ids = np.argsort(-similarities)
        top_ids = ranked_ids[:number_of_nearest_neighbors]
        neighbor_words = [id2word[neighbor_id] for neighbor_id in top_ids]
        print("Nearest to " + word + ": " + ", ".join(neighbor_words))

In [4]:
# Hyperparameters: number of word ids (id 0, "_UNKNOWN_", included) and the
# length of each embedding vector.
vocab_size = 10000
embedding_size = 64

# Read the whole corpus into memory; the relative path is resolved against
# the current working directory.
with open('bible.txt', 'r') as f:
    text = f.read()

# Lower-cased word tokens of the full text
text_tokenized = tokenize_text(text)

# Lookup tables built from the (vocab_size - 1) most frequent tokens
word2id, id2word = create_dicts_from_tokenized_text(text_tokenized, vocab_size)

# Encode the corpus as ids; tokens missing from word2id fall back to 0 ("_UNKNOWN_")
text_ids = [word2id.get(word, 0) for word in text_tokenized]

In [5]:
# Clear the default graph before it is (re)built by the cells below,
# presumably so that re-running them does not add duplicate ops.
tf.reset_default_graph()

In [6]:
# Sanity check: corpus length in tokens and the first few word ids
print(len(text_ids))
print(text_ids[:5])

# Create (training word, context word) pairs using a window of two words on each side.
# The first two and the last two words are not used as training words,
# because they don't have a complete context window.
context_words = []
training_words = []
for i in range(2, len(text_ids)-2):
    for j in [-2,-1,1,2]:
        training_words.append(text_ids[i])
        context_words.append(text_ids[i+j])

# Inspect the first pairs and confirm that both lists have the same length
print(training_words[:10])
print(context_words[:10])
print(len(training_words), len(context_words))

# Each dataset element is one (training word, context word) pair.
# The shuffle buffer spans the whole dataset and is reshuffled on every iteration.
training_dataset = tf.data.Dataset.from_tensor_slices((training_words, context_words))
training_dataset = training_dataset.shuffle(buffer_size=len(training_words), reshuffle_each_iteration=True)

training_batch_size = 128
training_dataset = training_dataset.batch(training_batch_size)

# Create a re-initializable iterator from the dataset's element types and shapes
iterator = tf.data.Iterator.from_structure(training_dataset.output_types,
                                           training_dataset.output_shapes)

# Keep the get_next op under a short name; evaluating it yields the next batch
next_batch = iterator.get_next()

# Initializer op that binds the iterator to the training dataset
# (this notebook only has a training dataset)
training_init_op = iterator.make_initializer(training_dataset)

# Batch of training word ids, and the matching context word ids with an
# extra axis of size 1 appended (shape (?, 1), see the printed tensors below)
input_data = next_batch[0]
input_goal = tf.expand_dims(next_batch[1], 1)

854033
[1, 253, 447, 3, 161]
[447, 447, 447, 447, 3, 3, 3, 3, 161, 161]
[1, 253, 3, 161, 253, 447, 161, 193, 447, 3]
3416116 3416116


In [7]:
# Show the symbolic input tensors (names, static shapes and dtypes)
print(input_data, input_goal)

Tensor("IteratorGetNext:0", shape=(?,), dtype=int32) Tensor("ExpandDims:0", shape=(?, 1), dtype=int32)


## Model

In [8]:
with tf.variable_scope("embedding", reuse=tf.AUTO_REUSE) as scope:
    # Embedding table: one row of length embedding_size per word id
    uni_initializer = tf.random_uniform_initializer(minval=-1.0, maxval=1.0)
    embeddings = tf.get_variable(
        "embeddings",
        shape=[vocab_size, embedding_size],
        initializer=uni_initializer,
    )

    # Weights and biases handed to the NCE loss
    norm_initializer = tf.truncated_normal_initializer(stddev=1.0 / np.sqrt(embedding_size))
    weight_matrix = tf.get_variable(
        "weight_matrix",
        shape=[vocab_size, embedding_size],
        initializer=norm_initializer,
    )

    bias_initializer = tf.zeros_initializer()
    biases = tf.get_variable("bias", shape=[vocab_size], initializer=bias_initializer)

    print(weight_matrix)
    print(biases)

    # Look up the embeddings of the batch's training words and score them
    # against their context words, drawing 64 sampled classes per batch.
    input_emb = tf.nn.embedding_lookup(embeddings, input_data)
    batch_losses = tf.nn.nce_loss(
        weights=weight_matrix,
        biases=biases,
        labels=input_goal,
        inputs=input_emb,
        num_sampled=64,
        num_classes=vocab_size,
    )

    # Scalar training objective: mean loss over the batch
    loss = tf.reduce_mean(batch_losses)

    # Record the loss as a scalar summary
    tf.summary.scalar('loss', loss)

    # Merge all summaries defined so far into a single op
    merged_summaries = tf.summary.merge_all()

    # We also have to specify a summary file writer
    train_writer = tf.summary.FileWriter('./summaries/train/')

<tf.Variable 'embedding/weight_matrix:0' shape=(10000, 64) dtype=float32_ref>
<tf.Variable 'embedding/bias:0' shape=(10000,) dtype=float32_ref>


In [9]:
# Plain gradient descent on the loss; running training_step applies one update
learning_rate = 1
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
training_step = optimizer.minimize(loss)

In [10]:
# Number of full passes over the training pairs
epochs = 15

# Words whose nearest neighbours are printed as a qualitative check of the embeddings
probe_words = ["israel", "sin", "god", "5", "make", "jesus", "year"]

# Open the session with the shared config (for correct GPU usage, if a GPU is available)
with tf.Session(config=config) as sess:

    # Initialize all variables
    sess.run(tf.global_variables_initializer())

    # Step counter, used as the step value of the loss summaries
    global_steps = 0

    try:
        # Run the training as often as specified in epochs
        for ep in range(epochs):

            # (Re)load the training data into the iterator
            sess.run(training_init_op)

            # Inspect the embeddings as they are before this epoch's updates
            print("\nBefore epoch: {}".format(ep))
            find_and_print_nearest_neighbors(probe_words, 8)

            # Loop over all batches of this epoch
            while True:
                try:
                    # Train on one batch and fetch the merged summary for it
                    _, summary = sess.run((training_step, merged_summaries))

                    # Hand the summary to the writer at the current step
                    train_writer.add_summary(summary, global_steps)

                    # Update the step counter
                    global_steps += 1

                # The iterator signals the end of the epoch with OutOfRangeError
                except tf.errors.OutOfRangeError:
                    break

        # The per-epoch report only shows the state *before* each epoch, so
        # report once more to see the result of the final epoch.
        print("\nAfter training:")
        find_and_print_nearest_neighbors(probe_words, 8)
    finally:
        # Push buffered summaries to disk even when training is interrupted;
        # the writer stays open so the cell can be run again.
        train_writer.flush()


Before epoch: 0
Nearest to israel: israel, akan, taketh, ithrite, may, hewer, salah, avenge
Nearest to sin: sin, slaughter, mahershalalhashbaz, stamp, islands, embalm, penny, milcom
Nearest to god: god, counsellor, earthquake, india, wound, agreeth, smoking, nophah
Nearest to 5: 5, mehujael, nourished, vesture, ellasar, wedding, comforters, tilgathpilneser
Nearest to make: make, drove, sorely, dregs, amongst, valiant, weariness, danced
Nearest to jesus: jesus, custody, languages, husbandmen, pleased, slippery, haft, mealtime
Nearest to year: year, bedstead, andrew, jemuel, mithredath, mounts, phalti, diminish

Before epoch: 1
Nearest to israel: israel, pilled, ithrite, hewer, taketh, death, seeth, tables
Nearest to sin: sin, slaughter, 88, islands, mahershalalhashbaz, throne, sun, chaff
Nearest to god: god, filthiness, thank, lord, o, mocked, earthquake, psalms
Nearest to 5: 5, 8, 4, 15, 19, 14, 10, 6
Nearest to make: make, executeth, seekest, corinth, lilies, fetters, lend, sanctify


## Tensorboard loss screenshots

- with outliers
![](https://i.imgur.com/sH6N7kP.png)

- without outliers
![](https://i.imgur.com/oPXn11u.png)