In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np 
import random
import os
import string
import requests
import collections 
import os
import tarfile 
import urllib.request
from nltk.corpus import stopwords
sess = tf.Session()
%matplotlib inline

In [2]:
#We declare some model parameters. We will look at 50 pairs of word embeddings at a time (batch size). The embedding size
#of each word will be a vector of length 200, and we will only consider the 10,000 most frequent words (every other word
#will be classified as unknown). We will train for 50,000 generations and print out the loss function, and we also declare
#our skip-gram window size. Here we set our window size to two, so we will look at the surrounding two words on each side
#of the target. We set our stopwords from the Python package nltk. We also want a way to check how our word embeddings
#are performing, so we choose some common movie review words and we will print out the nearest neighbor words from these
#every 2,000 iterations 
batch_size = 50
embedding_size = 200
vocabulary_size = 10000
generations = 50000
print_loss_every = 500
num_sampled = int(batch_size/2)
window_size = 2
stops = stopwords.words('english')
print_valid_every = 2000
valid_words = ['cliche', 'love', 'hate', 'silly', 'sad']

In [3]:
#Next we declare our data loading function, which checks to make sure we have not downloaded the data before it downloads,
#or it will load the data from the disk if we have saved it before
def load_movie_data():
    save_folder_name = 'rt-polaritydata'
    pos_file = os.path.join(save_folder_name, 'rt-polaritydata', 'rt-polarity.pos')
    neg_file = os.path.join(save_folder_name, 'rt-polaritydata', 'rt-polarity.neg')

    # Check if files are already downloaded
    if not os.path.exists(os.path.join(save_folder_name, 'rt-polaritydata')):
        movie_data_url = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz'

        # Save tar.gz file
        req = requests.get(movie_data_url, stream=True)
        with open(os.path.join(save_folder_name,'temp_movie_review_temp.tar.gz'), 'wb') as f:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    f.flush()
        # Extract tar.gz file into temp folder
        tar = tarfile.open(os.path.join(save_folder_name,'temp_movie_review_temp.tar.gz'), "r:gz")
        tar.extractall(path='temp')
        tar.close()

    pos_data = []
    with open(pos_file, 'r', encoding='latin-1') as f:
        for line in f:
            pos_data.append(line.encode('ascii',errors='ignore').decode())
    f.close()
    pos_data = [x.rstrip() for x in pos_data]

    neg_data = []
    with open(neg_file, 'r', encoding='latin-1') as f:
        for line in f:
            neg_data.append(line.encode('ascii',errors='ignore').decode())
    f.close()
    neg_data = [x.rstrip() for x in neg_data]
    
    texts = pos_data + neg_data
    target = [1]*len(pos_data) + [0]*len(neg_data)
    
    return(texts, target)


texts, target = load_movie_data()

In [4]:
#Now we crate a normalization function for text. This function will input a list of strings and apply lowecasing, remove
#punctiation, remove numbers, trim extra whitespace, and remove stop words. 
def normalize_text(texts, stops):
    #Lower case
    texts = [x.lower() for x in texts]
    #Remove punctuation
    texts = [''.join(c for c in x if c not in string.punctuation) for x in texts]
    #Remove numbers
    texts = [''.join(c for c in x if c not in '0123456789') for x in texts]
    #Remove stopwords
    texts = [' '.join(word for word in x.split() if word not in (stops)) for x in texts]
    #Trim extra whitespace
    texts = [' '.join(x.split()) for x in texts]
    
    return(texts)
texts = normalize_text(texts, stops)

In [5]:
#To make sure that all our movie reviews are informative, we should make sure they are long enough to contain important
#word relationships. We arbitrarily set this to three or more words
target = [target[ix] for ix, x in enumerate(texts) if len(x.split()) > 2]
texts = [x for x in texts if len(x.split()) > 2]

In [6]:
#To build our vocabulary, we create a function that creates a dictionary of words with their count, and any word that is
#uncommon enough to not make our vocabulary size cut-off, we label as 'RARE'
def build_dictionary(sentences, vocabulary_size):
    #Turn sentences (list of strings) into lists of words
    split_sentences = [s.split() for s in sentences]
    words = [x for sublist in split_sentences for x in sublist]
    #Initialize list of [word, word_count] for each word, starting with unknown
    count = [['RARE', -1]]
    #Now add most frequent words, limited to the N-most frequent(N=vocabulary size)
    count.extend(collections.Counter(words).most_common(vocabulary_size-1))
    #Now create the dictionary
    word_dict = {}
    #For each word, that we want in the dictionary, add it, then make it the value of the prior dictionary length
    for word, word_count in count:
        word_dict[word] = len(word_dict)
    return(word_dict)

In [7]:
#We need a function that will convert a list of sentences into lists of word indices that we can pass into our embedding 
#lookup function
def text_to_numbers(sentences, word_dict):
    #Initialize the returned data 
    data = []
    for sentence in sentences:
        sentence_data = []
        #For each word, either use selected index or rare word index
        for word in sentence:
            if word in word_dict:
                word_ix = word_dict[word]
            else:
                word_ix = 0
            sentence_data.append(word_ix)
        data.append(sentence_data)
    return(data)

In [8]:
#Now we can actually create our dictionary and transform our list of sentences into list of word indices
word_dictionary = build_dictionary(texts, vocabulary_size)
word_dictionary_rev = dict(zip(word_dictionary.values(), word_dictionary.keys()))
text_data = text_to_numbers(texts, word_dictionary)

In [9]:
#From the preceding word dictionary, we can look up the index for the validation words we choose before
valid_examples = [word_dictionary[x] for x in valid_words]

In [10]:
#We now create a function that will return our skip-gram batches. We want to train on pairs of words where one word
#is the training input (from the target word at the center of our window) and the other word is selected from the 
#window. For example, the sentence "the cat in the hat" may result in (input,output) pairs such as the following:
#(the,in), (cat,in), (the,in), (hat,in), if in was the target word, and we had a window size of two in each direction
def generate_batch_data(sentences, batch_size, window_size, method='skip_gram'):
    # Fill up data batch
    batch_data = []
    label_data = []
    while len(batch_data) < batch_size:
        # select random sentence to start
        rand_sentence = np.random.choice(sentences)
        # Generate consecutive windows to look at
        window_sequences = [rand_sentence[max((ix-window_size),0):(ix+window_size+1)] for ix, x in enumerate(rand_sentence)]
        # Denote which element of each window is the center word of interest
        label_indices = [ix if ix<window_size else window_size for ix,x in enumerate(window_sequences)]
        
        # Pull out center word of interest for each window and create a tuple for each window
        if method=='skip_gram':
            batch_and_labels = [(x[y], x[:y] + x[(y+1):]) for x,y in zip(window_sequences, label_indices)]
            # Make it in to a big list of tuples (target word, surrounding word)
            tuple_data = [(x, y_) for x,y in batch_and_labels for y_ in y]
        elif method=='cbow':
            batch_and_labels = [(x[:y] + x[(y+1):], x[y]) for x,y in zip(window_sequences, label_indices)]
            # Make it in to a big list of tuples (target word, surrounding word)
            tuple_data = [(x_, y) for x,y in batch_and_labels for x_ in x]
        else:
            raise ValueError('Method {} not implemented yet.'.format(method))
            
        # extract batch and labels
        batch, labels = [list(x) for x in zip(*tuple_data)]
        batch_data.extend(batch[:batch_size])
        label_data.extend(labels[:batch_size])
    # Trim batch and label at the end
    batch_data = batch_data[:batch_size]
    label_data = label_data[:batch_size]
    
    # Convert to numpy array
    batch_data = np.array(batch_data)
    label_data = np.transpose(np.array([label_data]))
    
    return(batch_data, label_data)

In [11]:
#Now we can initialize our embedding matrix, and declare our placeholders and our embedding lookup function.
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
#Create data/target placeholders
x_inputs = tf.placeholder(tf.int32, shape=[batch_size])
y_target = tf.placeholder(tf.int32, shape=[batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

#Lookup the word embedding
embed = tf.nn.embedding_lookup(embeddings, x_inputs)

In [12]:
#The loss function should be something such as a softmax, which calculates the loss on predicting the wrong word category.
#But since our target has 10,000 different categories, it is very sparse. This sparsity causes problems fitting or 
#converging for a model. To tackle this, we use a loss function called noise-contrastive error(NCE). This NCE loss
#function turns our problem into a binary prediction, by predicting the word class versus random noise predictions. 
#The num_sampled parameter is how much of the batch to turn into random noise 
nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],
                                               stddev=1.0 / np.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                     biases=nce_biases,
                                     labels=y_target,
                                     inputs=embed,
                                     num_sampled=num_sampled,
                                     num_classes=vocabulary_size))

In [13]:
#Now we need to create a way to find nearby words to our validation words. We will do this by computing the cosine 
#similarity between the validatoin set and all of our word embeddings, then we can print out the closest set of words for
#each validation word.
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
normalized_embeddings = embeddings/norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings,transpose_b=True)

In [14]:
#We now declare our optimizer function, and initialize our model variables. 
optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
init = tf.global_variables_initializer()
sess.run(init)

In [15]:
#Now we can print our embeddings and print off the loss and closest words to our validations set during the training.
loss_vec = []
loss_x_vec = []
for i in range(generations):
    batch_inputs, batch_labels = generate_batch_data(text_data, batch_size, window_size)
    feed_dict = {x_inputs : batch_inputs, y_target : batch_labels}

    # Run the train step
    sess.run(optimizer, feed_dict=feed_dict)

    # Return the loss
    if (i+1) % print_loss_every == 0:
        loss_val = sess.run(loss, feed_dict=feed_dict)
        loss_vec.append(loss_val)
        loss_x_vec.append(i+1)
        print("Loss at step {} : {}".format(i+1, loss_val))
      
    # Validation: Print some random words and top 5 related words
    if (i+1) % print_valid_every == 0:
        sim = sess.run(similarity)
        for j in range(len(valid_words)):
            valid_word = word_dictionary_rev[valid_examples[j]]
            top_k = 5 # number of nearest neighbors
            nearest = (-sim[j, :]).argsort()[1:top_k+1]
            log_str = "Nearest to {}:".format(valid_word)
            for k in range(top_k):
                close_word = word_dictionary_rev[nearest[k]]
                score = sim[j,nearest[k]]
                log_str = "%s %s," % (log_str, close_word)
            print(log_str)

Loss at step 500 : 28.444011688232422
Loss at step 1000 : 19.690587997436523
Loss at step 1500 : 28.480619430541992
Loss at step 2000 : 1.0775953531265259
Nearest to cliche: hermetic, huppert, satirical, assuredly, relocation,
Nearest to love: assassination, cuban, hanks, friend, rapturous,
Nearest to hate: male, art, dense, unbearable, mismo,
Nearest to silly: pokes, shots, nuances, accent, theatres,
Nearest to sad: died, guitar, hawn, brilliant, kane,
Loss at step 2500 : 4.748654842376709
Loss at step 3000 : 2.215075969696045
Loss at step 3500 : 4.729773044586182
Loss at step 4000 : 2.888399600982666
Nearest to cliche: hermetic, huppert, satirical, assuredly, relocation,
Nearest to love: assassination, cuban, hanks, friend, rapturous,
Nearest to hate: male, art, dense, unbearable, mismo,
Nearest to silly: pokes, shots, nuances, accent, theatres,
Nearest to sad: died, guitar, hawn, brilliant, kane,
Loss at step 4500 : 2.9385247230529785
Loss at step 5000 : 18.377548217773438
Loss at s

Loss at step 36500 : 0.8065575957298279
Loss at step 37000 : 2.987954616546631
Loss at step 37500 : 1.5680650472640991
Loss at step 38000 : 2.871208906173706
Nearest to cliche: hermetic, huppert, satirical, assuredly, relocation,
Nearest to love: assassination, cuban, hanks, friend, rapturous,
Nearest to hate: male, art, dense, unbearable, mismo,
Nearest to silly: pokes, shots, nuances, accent, theatres,
Nearest to sad: died, guitar, hawn, brilliant, kane,
Loss at step 38500 : 1.2031164169311523
Loss at step 39000 : 4.00250244140625
Loss at step 39500 : 1.1819109916687012
Loss at step 40000 : 1.5188648700714111
Nearest to cliche: hermetic, huppert, satirical, assuredly, relocation,
Nearest to love: assassination, cuban, hanks, friend, rapturous,
Nearest to hate: male, art, dense, unbearable, mismo,
Nearest to silly: pokes, shots, nuances, accent, theatres,
Nearest to sad: died, guitar, hawn, brilliant, kane,
Loss at step 40500 : 1.5759927034378052
Loss at step 41000 : 0.576181888580322