## Introduction to Word2Vec
#### The model Word2Vec is a simple word embedding neural network with a single hidden layer, based on the study of Le & Mikolov (2014).

In [None]:
import seaborn as sb
import numpy as np
import pandas as pd
%matplotlib inline

Word embeddings from plain text.

The model assumes the *Distributional Hypothesis* that words are characterized by words they hang out with. this idea is used to estimate the probability of two words occurring near each other.

#### Softmax Regression

Word2Vec is a very simple neural network with a single hidden layer.

In [1]:
sentences = ['the king loves the queen', 'the queen loves the king',
             'the dwarf hates the king', 'the queen hates the dwarf',
             'the dwarf poisons the king', 'the dwarf poisons the queen']

Bag-of-Words

In [None]:
def text2sentences(text):
    return text.lower().split('.')

In [29]:
import numpy as np
from collections import defaultdict

class Word2Vec(object):
    def __init__(self, sentences):
        self.sentences = sentences
        self.N = 3 # number of nodes in Hidden Layer
        self.V = None # Vocabulary size
        self.WI = None
        self.WO = None
        self.vocabulary = None
        self.learning_rate = 1.0
    
    def Vocabulary(self):
        dictionary = defaultdict()
        # len of dictionary gives a unique integer to each new word
        dictionary.default_factory = lambda: len(dictionary) 
        return dictionary

    def docs2bow(self, docs, dictionary):
        """Transforms a list of strings into a list of lists where 
        each unique item is converted into a unique integer."""
        for doc in docs:
            yield [dictionary[word] for word in doc.split()] # returns a generator
    
    def sentences2bow(self):
        self.vocabulary = self.Vocabulary()
        bow = list(self.docs2bow(self.sentences, self.vocabulary))
        return bow
    
    def random_init(self):
        self.V = len(self.vocabulary)
        
        # random initialization of weights between [-0.5 , 0.5] normalized by number of nodes mapping to.
        self.WI =(np.random.random((self.V, self.N)) - 0.5) / self.N # input weights
        self.WO =(np.random.random((self.N, self.V)) - 0.5) / self.V # output weights
        
    def context_prob(self, context):
        return sum(np.exp(self.WI[self.vocabulary[context]].dot(self.WO.T[self.vocabulary[word]]))
                   for word in self.vocabulary)
    
    def backprop(self, context, target):
        denominator = self.context_prob(context)

        for word in self.vocabulary:

            numerator = np.exp(self.WI[self.vocabulary[context]].dot(self.WO.T[self.vocabulary[word]]))
            P_word_context = numerator / denominator # posterior probability P(word | context)

            if word == target:
                t = 1
                print "P(target|context)", P_word_context
            else:
                t = 0

            err = t - P_word_context # error
            #print "Error: ", err

            # weight update using stochastic gradient descent
            self.WO.T[self.vocabulary[word]] -= self.learning_rate * err * self.WI[self.vocabulary[context]]
            # update brings word vector closer in the feature space if word = target, and push them apart otherwise.

        self.WI[self.vocabulary[context]] -= self.WO.sum(axis=1) # update only weights for input word
    
    def train(self):

        bow = self.sentences2bow()
        # visualize bag-of-word sentence conversion
        # print bow
        
        self.random_init()
        
        for sentence in self.sentences:
            prev_word = None
            for word in sentence.split():
                if prev_word != None:
                    target = word
                    context = prev_word
                    self.backprop(context, target)
                prev_word = word

In [30]:
model = Word2Vec(sentences)
model.train()

P(target|context) 0.143473967631
P(target|context) 0.143059248075
P(target|context) 0.142646438945
P(target|context) 0.14364795132
P(target|context) 0.135354000989
P(target|context) 0.142306570948
P(target|context) 0.135610380013
P(target|context) 0.162206459026
P(target|context) 0.162699267825
P(target|context) 0.142914135961
P(target|context) 0.148657993322
P(target|context) 0.106086118779
P(target|context) 0.0912081448296
P(target|context) 0.181075248948
P(target|context) 0.14602407654
P(target|context) 0.0454139659483
P(target|context) 0.00154787392611
P(target|context) 0.186976530737
P(target|context) 0.129804793258
P(target|context) 0.000872148893805
P(target|context) 1.33377106512e-07
P(target|context) 0.126644109941
P(target|context) 0.140128027403
P(target|context) 2.18473559499e-08


In [None]:
dict.get

In [22]:
vocabulary = Vocabulary()
sentences_bow = list(docs2bow(sentences, vocabulary))
sentences_bow

NameError: name 'Vocabulary' is not defined

In [23]:
vocabulary

NameError: name 'vocabulary' is not defined

In [None]:
V = len(vocabulary)
N = 3 # number of nodes in Hidden Layer

# random initialization of weights between [-0.5 , 0.5] normalized by number of nodes mapping to.
WI =(np.random.random((V, N)) - 0.5) / N # input weights
WO =(np.random.random((N, V)) - 0.5) / V # output weights

In [None]:
print WI

In [None]:
print WO

In [None]:
# input weights associated with dwarf
WI[vocabulary['dwarf']]

In [None]:
# output weights associated with hates
WO.T[vocabulary['hates']]

Using the dot product $W_I \cdot W'^T_O$ we compute the distance between the input word *dwarf* and the output word *hates*:

In [None]:
WI[vocabulary['dwarf']].dot(WO.T[vocabulary['hates']])

Now using softmax regression, we can compute the posterior probability $P(w_O|w_I)$:

$$ P(w_O|w_I) = y_i = \frac{exp(W_I \cdot W'^T_O)}{\sum^V_{j=1} exp(W_I \cdot W'^T_j)} $$

In [None]:
numerator = np.exp(WI[vocabulary['dwarf']].dot(WO.T[vocabulary['hates']]))
denominator = sum(np.exp(WI[vocabulary['dwarf']].dot(WO.T[vocabulary[word]])) for word in vocabulary)

P_hates_dwarf = numerator / denominator
P_hates_dwarf

### Updating the hidden-to-output layer weights

loss function to minimize is: $E = -\log P(w_O|w_I)$

The error is computed with $t_j - P(w_O|w_I) = e_j$, where $t_j$ is 1 if $w_j$ is the actual output word, otherwise $t_j$ is 0.

To obtain the gradient on the hidden-to-output weights, we compute $e_j \cdot h_i$, where $h_i$ is a copy of the vector corresponding to the input word (only holds with a context of a single word). Finally, using stochastic gradient descent, with a learning rate $\nu$ we obtain the weight update equation for the hidden to output layer weights:

$$W'^{T (t)}_j = W'^{T (t-1)}_j - \nu \cdot e_j \cdot h_j$$.


In [None]:
target_word = 'king' 
input_word = 'queen' # context word
learning_rate = 1.0

denominator = sum(np.exp(WI[vocabulary[input_word]].dot(WO.T[vocabulary[word]])) for word in vocabulary)

for word in vocabulary:
    
    numerator = np.exp(WI[vocabulary[input_word]].dot(WO.T[vocabulary[word]]))
    P_word_queen = numerator / denominator # posterior probability P(word | queen)
    
    if word == target_word:
        t = 1
    else:
        t = 0
    
    err = t - P_word_queen # error
    
    # weight update using stochastic gradient descent
    WO.T[vocabulary[word]] -= learning_rate * err * WI[vocabulary[input_word]]
    # update brings word vector closer in the feature space if word = target, and push them apart otherwise.
    
print WO

### Updating the input-to-hidden layer weights

backpropagate the prediction errors to the input-to-hidden weights

In [None]:
WI[vocabulary[input_word]] -= WO.sum(axis=1) # update only weights for input word
WI

In [None]:
denominator = sum(np.exp(WI[vocabulary[input_word]].dot(WO.T[vocabulary[word]])) for word in vocabulary)

for word in vocabulary:
    numerator = np.exp(WI[vocabulary[input_word]].dot(WO.T[vocabulary[word]]))
    P_word_queen = numerator / denominator # posterior probability P(word | queen)
    
    print word, P_word_queen

### Multi-word context

In [None]:
target_word = 'king'
context = ['queen', 'loves']

In [None]:
h = (WI[vocabulary[context[0]]] + WI[vocabulary[context[1]]]) / 2.0

In [None]:
denominator = sum(np.exp(h.dot(WO.T[vocabulary[word]])) for word in vocabulary)

for word in vocabulary:
    
    numerator = np.exp(h.dot(WO.T[vocabulary[word]]))
    P_word_context = numerator / denominator # posterior probability P(word | context)
    
    if word == target_word:
        t = 1
    else:
        t = 0
    
    err = t - P_word_context # error
    
    # weight update using stochastic gradient descent
    WO.T[vocabulary[word]] -= learning_rate * err * h
    # update brings word vector closer in the feature space if word = target, and push them apart otherwise.
    
print WO

In [None]:
for input_word in context:
    WI[vocabulary[input_word]] -= (1. / len(context)) * learning_rate * WO.sum(axis=1)   

In [None]:
h = (WI[vocabulary[context[0]]] + WI[vocabulary[context[1]]]) / 2.0

In [None]:
denominator = sum(np.exp(h.dot(WO.T[vocabulary[word]])) for word in vocabulary)

for word in vocabulary:
    numerator = np.exp(h.dot(WO.T[vocabulary[word]]))
    P_word_context = numerator / denominator # posterior probability P(word | context)
    
    print word, P_word_context

### Paragraph Vector

In [None]:
V = len(vocabulary)
N = 3 # number of hiden nodes
P = 5 # number of sentences

WI = (np.random.random((V, N)) - 0.5) / N
WO = (np.random.random((N, V)) - 0.5) / V
D =  (np.random.random((P, N)) - 0.5) / N 

In [None]:
sentences = ['snowboarding is dangerous', 'skydiving is dangerous',
             'escargots are tasty to some people', 'everyone loves tasty food',
             'the minister has some dangerous ideas']

In [None]:
vocabulary = Vocabulary()
sentences_bow = list(docs2bow(sentences, vocabulary))
sentences_bow

In [None]:
vocabulary

In [None]:
D

In [None]:
target_word = 'dangerous'
h = (D[0] + WI[vocabulary['snowboarding']]) /2.0
learning_rate = 1.0

denominator = sum(np.exp(h.dot(WO.T[vocabulary[word]])) for word in vocabulary)

for word in vocabulary:
    
    numerator = np.exp(h.dot(WO.T[vocabulary[word]]))
    P = numerator / denominator # posterior probability P(word | context, phrase)
    
    if word == target_word:
        t = 1
    else:
        t = 0
    
    err = t - P # error
    
    # weight update using stochastic gradient descent
    WO.T[vocabulary[word]] -= learning_rate * err * h
    # update brings word vector closer in the feature space if word = target, and push them apart otherwise.
    
print WO

Backpropagation

In [None]:
EH = WO.sum(axis=1)
len_context = 2
WI[vocabulary['snowboarding']] -= (1. / len_context) * learning_rate * EH
D[0] -= (1. / len_context) * learning_rate * EH