# The CBOW model
Dataset: movie review 'http://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz' <br>
CBOW (continuous bag of words) is one of the word2vec training methods. It is very similar to the skip-gram method, except that it predicts a single target word from the surrounding window of context words.

Skip-gram worked with combinations of window and target as a group of paired inputs and outputs, but with CBOW we will add the surrounding window embeddings together to get one embedding to predict the target word embedding.

In [12]:

import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import random
import os
import pickle
import string
import requests
import collections
import io
import tarfile
import urllib.request
from nltk.corpus import stopwords
from tensorflow.python.framework import ops
ops.reset_default_graph()

import sys
sys.path.insert(0, './utils')
import text_helpers

In [13]:
# Directory that receives the vocabulary pickle and the embedding checkpoints.
data_folder_name = 'temp'
# exist_ok=True makes the cell safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs(data_folder_name, exist_ok=True)

In [14]:
# Session used by the later cells to initialise variables, run the training
# op and evaluate the loss / similarity tensors.
sess = tf.Session()

In [15]:
# --- Model / optimisation settings ---
batch_size = 200             # examples per training batch
embedding_size = 50          # dimensionality of each word embedding
vocabulary_size = 2000       # maximum number of words kept in the vocabulary
generations = 50000          # number of training iterations
model_learning_rate = 0.05   # step size for gradient descent

# --- Sampling settings ---
window_size = 3                  # words considered on the left and on the right
num_sampled = batch_size // 2    # negative examples drawn for the NCE loss

# --- Reporting / checkpoint cadence (in generations) ---
print_loss_every = 1000
print_valid_every = 5000
save_embeddings_every = 5000

### Load and normalize data

In [16]:
# English stop-word list handed to text_helpers.normalize_text below
stops = stopwords.words('english')

# Probe words for the validation printout: we expect synonyms to show up
# among their nearest neighbours. They are converted to indices later.
valid_words = ['love', 'hate', 'happy', 'sad', 'man', 'woman']

# Load the movie review data, then normalize the raw text
texts, target = text_helpers.load_movie_data()
texts = text_helpers.normalize_text(texts, stops)

# Keep only reviews with at least 3 words; the mask is computed once and
# applied to both lists so texts and target stay aligned.
long_enough = [len(review.split()) > 2 for review in texts]
target = [target[pos] for pos, keep in enumerate(long_enough) if keep]
texts = [review for review, keep in zip(texts, long_enough) if keep]

In [17]:
# Vocabulary mapping (word -> index) built from the texts, plus the inverse
# mapping (index -> word) used when printing nearest neighbours.
word_dictionary = text_helpers.build_dictionary(texts, vocabulary_size)
word_dictionary_rev = {index: word for word, index in word_dictionary.items()}

# Convert the texts to their numeric form using the vocabulary
text_data = text_helpers.text_to_numbers(texts, word_dictionary)

# Vocabulary keys of the validation words
valid_examples = [word_dictionary[word] for word in valid_words]

### Construction of model

In [18]:
# Shape shared by the embedding matrix and the NCE weight matrix
embedding_shape = [vocabulary_size, embedding_size]
# Context words per example: window_size on each side of the target
context_width = 2 * window_size

# Word embeddings, randomly initialised between -1 and 1
embeddings = tf.Variable(tf.random_uniform(embedding_shape, minval=-1.0, maxval=1.0))

# Parameters of the NCE loss
nce_weights = tf.Variable(
    tf.truncated_normal(embedding_shape, stddev=1.0 / np.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

# Placeholders for the context-word indices and the target-word index,
# and a constant holding the validation word indices
x_inputs = tf.placeholder(tf.int32, shape=[batch_size, context_width])
y_target = tf.placeholder(tf.int32, shape=[batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

# CBOW input: look up each context position and add the embeddings together
# so that every example is represented by a single summed vector.
embed = tf.zeros([batch_size, embedding_size])
for position in range(context_width):
    embed = embed + tf.nn.embedding_lookup(embeddings, x_inputs[:, position])

### Loss

In [19]:
# NCE loss of predicting the target from the summed context embedding,
# reduced to its mean
nce_losses = tf.nn.nce_loss(weights=nce_weights,
                            biases=nce_biases,
                            labels=y_target,
                            inputs=embed,
                            num_sampled=num_sampled,
                            num_classes=vocabulary_size)
loss = tf.reduce_mean(nce_losses)

# Training op: plain gradient descent on the loss
sgd = tf.train.GradientDescentOptimizer(learning_rate=model_learning_rate)
optimizer = sgd.minimize(loss)

# Cosine similarity between the validation words and every vocabulary word:
# scale each embedding row to unit length, then take dot products.
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), axis=1, keepdims=True))
normalized_embeddings = embeddings / norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

In [20]:
# Saver restricted to the embedding matrix, stored under the checkpoint key
# "embeddings"; the NCE weights and biases are not passed to it.
saver = tf.train.Saver({"embeddings": embeddings})

# Build the variable initializer and run it in the session so the variables
# hold their initial values before training starts.
init = tf.global_variables_initializer()
sess.run(init)

In [21]:
# Drop sentences that are too short to hold a full context window plus its
# target: at least 2 * window_size + 1 entries (7 when window_size is 3).
min_sentence_length = 2 * window_size + 1
text_data = [sentence for sentence in text_data
             if len(sentence) >= min_sentence_length]

### Training and testing

In [22]:
# Recorded loss values and the generation at which each one was taken
loss_vec = []
loss_x_vec = []

for step in range(1, generations + 1):
    # Draw a fresh CBOW batch and bind it to the placeholders
    batch_inputs, batch_labels = text_helpers.generate_batch_data(
        text_data, batch_size, window_size, method='cbow')
    feed_dict = {x_inputs: batch_inputs, y_target: batch_labels}

    # One optimisation step
    sess.run(optimizer, feed_dict=feed_dict)

    # Periodically evaluate the loss on the batch just trained on
    if step % print_loss_every == 0:
        loss_val = sess.run(loss, feed_dict=feed_dict)
        loss_vec.append(loss_val)
        loss_x_vec.append(step)
        print('Loss at step {} : {}'.format(step, loss_val))

    # Periodically print the top 5 nearest neighbours of each validation word
    if step % print_valid_every == 0:
        sim = sess.run(similarity, feed_dict=feed_dict)
        top_k = 5  # number of nearest neighbors
        for row, valid_id in enumerate(valid_examples):
            valid_word = word_dictionary_rev[valid_id]
            # Sort by descending similarity; position 0 is skipped
            # (expected to be the word itself).
            nearest = np.argsort(-sim[row, :])[1:top_k + 1]
            neighbours = ''.join(
                ' {},'.format(word_dictionary_rev[word_id]) for word_id in nearest)
            print("Nearest to {}:".format(valid_word) + neighbours)

    # Periodically persist the vocabulary and the embeddings
    if step % save_embeddings_every == 0:
        vocab_path = os.path.join(data_folder_name, 'movie_vocab.pkl')
        with open(vocab_path, 'wb') as vocab_file:
            pickle.dump(word_dictionary, vocab_file)

        model_checkpoint_path = os.path.join(
            os.getcwd(), data_folder_name, 'cbow_movie_embeddings.ckpt')
        save_path = saver.save(sess, model_checkpoint_path)

Loss at step 1000 : 2.382382869720459
Loss at step 2000 : 2.1709465980529785
Loss at step 3000 : 2.15669322013855
Loss at step 4000 : 1.9244437217712402
Loss at step 5000 : 1.8567508459091187
Nearest to love: disappointing, songs, shallow, gone, pass,
Nearest to hate: diverting, certain, particularly, promising, austin,
Nearest to happy: heard, overthetop, better, respect, dramatic,
Nearest to sad: mark, finds, storys, storyline, horror,
Nearest to man: dark, evil, animation, turns, brown,
Nearest to woman: success, minor, genres, psychological, animal,
Loss at step 6000 : 2.621933937072754
Loss at step 7000 : 1.6907410621643066
Loss at step 8000 : 1.7902206182479858
Loss at step 9000 : 1.620408296585083
Loss at step 10000 : 1.654314398765564
Nearest to love: disappointing, songs, shallow, gone, pass,
Nearest to hate: diverting, certain, particularly, promising, austin,
Nearest to happy: heard, overthetop, better, respect, dramatic,
Nearest to sad: mark, finds, storys, storyline, horro

In [None]:
# Training loss against the generation at which it was recorded
fig, ax = plt.subplots()
ax.plot(loss_x_vec, loss_vec, 'k-')
ax.set_title('Training Loss per Generation')
ax.set_xlabel('Generation')
ax.set_ylabel('Loss')
plt.show()