# A model for predicting word meaning from characters

See my blog post at http://ijdykeman.github.io/ml/2017/01/03/chars-to-vec.html for a description of the model.

### This code sets up the data and some functions we'll need later

In [None]:
%pylab inline
import random
import seaborn as sns
import string
import csv
import string
import pickle
from __future__ import absolute_import, division, print_function
import os
from six import moves
import ssl
import tensorflow as tf
import tflearn
import collections
import functools
from scipy.spatial import KDTree

# Load the pretrained word embeddings.  `model` is used below as a mapping from
# a word to its vector (membership tests, indexing and .keys()).
# You'll need to replace this path with the path to the word vector pickle file.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so only
# point this at a pickle file that comes from a source you trust.
with open("../training_data/D_cbow_pdw_8B.pkl", 'rb') as f:
    model = pickle.load(f)


def get_vec(word):
    '''
    Look up the embedding for word.

    Words that are missing from the embedding map to an all-zero vector
    shaped like a real embedding.
    '''
    if word not in model:
        # 'dog' only serves as a template for the shape of the zero vector
        return np.zeros_like(model['dog'])
    return model[word]
    
# Shuffle the vocabulary and split it 75% / 25% into training and validation words.
# list(...) gives a real sequence that can be shuffled and sliced; on Python 3
# dict.keys() is a view that supports neither.
words = list(model.keys())
random.shuffle(words)

split_index = int(len(words) * .75)  # computed once so both slices share the boundary
train_words = words[:split_index]
val_words = words[split_index:]

seq_max_len = 17 # maximum length of input the model can consider

# Create processor for converting strings to one-hot vectors
processor = tflearn.data_utils.VocabularyProcessor (seq_max_len, min_frequency=0, vocabulary=None, tokenizer_fn=None)
# every word is spelled out with spaces between its characters, e.g. "d o g"
data = [" ".join(list(word)) for word in model.keys()]
# number of distinct characters in the joined vocabulary string (the separating
# space included), plus one
# NOTE(review): the extra 1 presumably leaves room for an id the processor reserves - confirm
num_characters = len(set(" ".join(data))) + 1
processor.fit ([" ".join(data)+":;,.'/"], unused_y=None)


def cosine_similarity(a, b):
    '''
    Cosine of the angle between vectors a and b: their dot product
    divided by the product of their euclidean lengths.
    '''
    length_a = np.sqrt(np.sum(np.power(a, 2)))
    length_b = np.sqrt(np.sum(np.power(b, 2)))
    return np.dot(a, b) / (length_a * length_b)

def sort_topn(l, n):
    '''
    Order the (_, value) tuples in l from the largest value to the
    smallest and return the first n of them as a new list.
    '''
    ranked = sorted(l, key = lambda pair: pair[1])
    # reversing an ascending stable sort (rather than sorting descending)
    # keeps the order in which tuples with equal values come out
    ranked.reverse()
    return ranked[:n]

# Materialise the vocabulary as a list so that words can be looked up by row
# index (get_most_similar maps KDTree result indices back to words); on
# Python 3 dict.keys() is a view that cannot be indexed.
all_words = list(model.keys())
# one row per word
# NOTE(review): the width 500 is hard-coded and has to match the embedding size
all_word_vectors = np.zeros([len(all_words), 500])
for i, word in enumerate(all_words):
    all_word_vectors[i,:] = model[word]

norms = np.linalg.norm(all_word_vectors, ord=2, axis=1) # the length of each vector in the vocabulary
# project all word vectors onto a unit sphere
# this way, euclidean nearest neighbor lookup 
# is the same as finding the vector with the 
# lowest cosine distance to some vector
all_word_vectors = all_word_vectors / norms[:,None] 

# spatial index over the normalised vectors; row i belongs to all_words[i]
word_tree = KDTree(all_word_vectors)

def get_most_similar(vec, exclude = None, n = 15):
    '''
    Get the words whose vectors are closest to vec.

    vec     -- query vector with the same number of dimensions as the
               vectors stored in word_tree.
    exclude -- optional word, or collection of words, to leave out of
               the result.
    n       -- maximum number of results to return (default 15).

    Returns a list of at most n (word, cosine similarity) tuples, most
    similar first.  If nothing is left after exclusion, the placeholder
    list [("none", -2)] is returned so that callers which read the first
    entry keep working.
    '''
    if exclude is None:
        excluded = set()
    elif isinstance(exclude, (str, bytes, type(u''))):
        # a single word: wrap it so set() does not split it into characters
        excluded = set([exclude])
    else:
        excluded = set(exclude)

    # Ask for extra neighbours so that dropping excluded words can still leave
    # n results, but never for more points than the tree holds.
    k = min(n + len(excluded), word_tree.n)
    if k < 1:
        return [("none", -2)]
    # query() drops the neighbour axis when k == 1; atleast_1d restores it
    neighbour_ids = np.atleast_1d(word_tree.query([vec], k=k)[1][0])

    scored = []
    for i in neighbour_ids:
        word = all_words[i]
        if word not in excluded:
            scored.append((word, cosine_similarity(vec, model[word])))
    if not scored:
        return [("none", -2)]
    # The tree ranks by euclidean distance to the normalised vectors; re-rank by
    # cosine similarity to the original vectors, once, after collecting.
    return sort_topn(scored, n)

def get_characters_vector(word):
    '''
    Converts string word into a sequence of one-hot vectors.

    The characters of word are joined with spaces before being handed to
    processor, and the sequence it produces is one-hot encoded with
    num_characters classes.
    '''
    spaced = " ".join(list(word))
    # Only one string is passed in, so only the first transformed item is needed.
    # The builtin next() works on Python 2 and 3; the .next() method exists on
    # Python 2 only.
    item = next(iter(processor.transform([spaced])))
    result = tflearn.data_utils.to_categorical (item, num_characters)
    return result
    
def get_batch_from_words_list(n, words):
    '''
    Sample n words from the list words (with replacement) and build a batch.

    Returns (x, y): x is an array holding the one-hot character sequence of
    every sampled word with its second axis reversed, y is a list with the
    matching word vectors.
    '''
    picks = [words[random.randrange(len(words))] for _ in range(n)]
    inputs = np.array([get_characters_vector(word) for word in picks])
    targets = [get_vec(word) for word in picks]
    return inputs[:, ::-1], targets

def get_batch(n):
    '''
    Draw n training examples and n validation examples.

    Returns the 4-tuple (training inputs, training targets,
    validation inputs, validation targets).
    '''
    train_part = get_batch_from_words_list(n, train_words)
    val_part = get_batch_from_words_list(n, val_words)
    # each part is an (inputs, targets) pair, so concatenating gives the 4-tuple
    return train_part + val_part


### Define the model

In [None]:
# Clear the default graph before the model is built.
tf.reset_default_graph()

# Model input, one-hot encoded characters: [batch, seq_max_len, num_characters]
input_chars = tf.placeholder(tf.float32, [None, seq_max_len, num_characters], name = 'inpt_ph')
# Training target, one word vector per example: [batch, length of a word vector]
word_vectors = tf.placeholder(tf.float32, [None, len(get_vec('dog'))], name = 'yn_ph')
y = word_vectors

# Three stacked LSTM layers of 250 units.  The first two return their whole output
# sequence so the next LSTM can consume it; the last returns a single output.
g = tflearn.lstm(input_chars, 250, return_seq = True)
g = tflearn.lstm(g, 250, return_seq = True)
g = tflearn.lstm(g, 250, return_seq = False)
g = tflearn.fully_connected(g, 400, activation='relu')
# NOTE(review): .7 is presumably tflearn's keep probability, not the drop rate - confirm
g = tflearn.layers.dropout(g, .7)
# Predicted word vector.
# NOTE(review): the width 500 is hard-coded; it has to equal the width of word_vectors.
pred = tflearn.fully_connected(g, 500, activation='tanh')

# This tensor is the length (2 norm) of each vector predicted by the model.  It's needed for the cosine
# similarity calculation.
pred_len = tf.sqrt(tf.reduce_sum(tf.square(pred), 1, keep_dims = True))
# The length (2 norm) of each target vector
y_len =    tf.sqrt(tf.reduce_sum(tf.square(y),    1, keep_dims = True))
# Cosine similarity between each correct vector and predicted vector: the elementwise product
# divided by the product of the two lengths, summed over the vector dimension.
cosine_similarity_ten = tf.reduce_sum(tf.div(tf.mul(pred, y),   tf.mul(pred_len, y_len)), 1, keep_dims = False)
# 1 - cosine similarity = cosine distance.  The loss is the mean over the batch of the
# squared cosine distance.
cost = tf.reduce_mean(tf.square(tf.sub(tf.constant(1.0), cosine_similarity_ten)))
optimizer = tf.train.AdamOptimizer(learning_rate=.002).minimize(cost)

saver = tf.train.Saver()
config = tf.ConfigProto(
#         device_count = {'GPU': 0} # this disables the GPU for this session
    )
sess = tf.Session(config=config)
init = tf.initialize_all_variables()
sess.run(init)
# Training state shared with the training loop: the number of steps run so far and
# the losses recorded along the way.
step = 0
batch_costs = []
val_costs = []

### Naive Loss

Let's get an idea of what kind of loss a naive model that always guesses the average of the vectors in the vocabulary would achieve.  I'll take the average of 2000 word vectors sampled from the training words, then measure the mean squared cosine distance between those vectors and their average (the same quantity the model is trained to minimize).  We'd like to beat this model.

In [None]:
# Baseline: score a "model" that always answers with the mean of the sampled vectors.
x, y = get_batch_from_words_list(2000, train_words)
avg = np.sum(y, axis = 0) / len(y)
# squared cosine distance of every sampled vector to that mean
sims = [(1 - cosine_similarity(vec, avg))**2 for vec in y]
print ("naive loss:", np.mean(sims))

### The training loop

In [None]:
from ipywidgets import FloatProgress
from IPython.display import display


total_steps = 8000   # train until the global step counter reaches this value
batch_size = 250     # number of examples in each training / validation batch
val_every = 10       # measure the validation loss once every this many steps
display_step = 1000  # redraw the loss curves once every this many steps
# a bar to show progress toward the next graph printout
f = FloatProgress(min=0, max=display_step, readout = True)
display(f)


# `step` is a module-level counter (set to 0 where the model is defined), so
# re-running this cell continues from the step reached by the previous run.
while step < total_steps:
    x,y = get_batch_from_words_list(batch_size, train_words)
    _, batch_cost = sess.run([optimizer, cost], feed_dict={input_chars:x, word_vectors: y })
    batch_costs.append(batch_cost)
    f.value = step%display_step

    # to save time, only calculate the validation loss every val_every batches.
    if step % val_every == 0:
        x,y = get_batch_from_words_list(batch_size, val_words)
        # fetch the tensor itself rather than [cost] so that a scalar is stored,
        # like in batch_costs, instead of a one-element list
        val_cost = sess.run(cost, feed_dict={input_chars:x, word_vectors: y })
        val_costs.append(val_cost)
    if  step % display_step == 0 and step > 0:
        plt.plot(batch_costs)
        plt.ylabel('train loss')
        plt.show()
        plt.plot(val_costs)
        plt.ylabel('validation loss')
        plt.show()

    step += 1


### Functions for seeing the model's responses

In [None]:

def predict_vec(letters):
    '''
    Run the network on the string letters (lower-cased first) and return
    the word vector it predicts as a float32 array.
    '''
    chars = get_characters_vector(letters.lower())
    # a batch of one, with its second axis reversed as in get_batch_from_words_list
    batch = np.array([chars])[:,::-1]
    prediction = sess.run(pred, feed_dict = {input_chars: batch})[0]
    return np.array(prediction).astype(np.float32)


def report(letters):
    '''
    Print out the model's guess at words similar to the string letters.  Also show the words
    closest to the vector for letters if one already exists in the embedding.

    Returns the vector predicted for letters, or None (after printing
    "word too long") when letters has more than seq_max_len characters.
    '''
    print ("\nreport for "+letters)
    letters = letters.lower()
    if len(letters) > seq_max_len:
        print ("word too long")
        return
    guess = predict_vec(letters)
    print ("The most similar words according to the model:")
    # Build real lists below: on Python 3 map() is lazy and would print as
    # "<map object ...>" instead of the words.
    guessed_words = str([pair[0] for pair in get_most_similar(guess)])
    if hasattr(guessed_words, 'decode'):
        # Python 2 byte string: turn escape sequences back into characters.
        # Python 3 strings have no decode() and already print readably.
        guessed_words = guessed_words.decode('unicode-escape')
    print (guessed_words)
    if letters in model:
        print ("The most similar words according to the original embedding space:")
        print ([pair[0] for pair in get_most_similar(model[letters])])
    else:
        print ("This isn't in the original embedding space.")
    return guess

In [None]:
# Words to probe the model with, reported in this order.
sample_words = [
    "fawkes",
    "dobby",
    "arnold",
    "dumbledore",
    "Throbfernoma",
    "Glazioxx",
    "Faeranduil",
    "dromgooles",
    "beowulf",
    "gawain",
    "megathron",
    "hagrid",
    "impairor",
    "Cecetzin",
    "iteron",
    "eddard",
    "dursley",
    "lothlorien",
    "zorgon",
    "galadriel",
    "goron",
    "danolfini",
    "ashimmu",
]
for sample in sample_words:
    report(sample)

print ("reports complete")