In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

CONTEXT_SIZE = 4 ### the number of size that we need to input for prediction 
EMBEDDING_DIM = 10 ### how many dimensions of matrix to represent a word

test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()

We use previous 4 words as input to predict the fifth word

### Step1：Create the four-grams with target word

In [2]:
gram = [((test_sentence[i], test_sentence[i+1],test_sentence[i+2], test_sentence[i+3]), test_sentence[i+4]) 
            for i in range(len(test_sentence)-CONTEXT_SIZE)]

In [3]:
## encode words to be ints
vocb = set(test_sentence)
word_to_idx = {word: i for i, word in enumerate(vocb)}
idx_to_word = {word_to_idx[word]: word for word in word_to_idx}

In [4]:
for word, label in gram[:100]:
    word = [word_to_idx[i] for i in word] ### data sctructure :  word: ('When', 'forty','winters', 'shall') to extract the index
    label = [word_to_idx[label]] ## extract the index with corresponding lables

In [5]:
word

[74, 62, 89, 59]

In [6]:
label

[3]

In [7]:
import tensorflow as tf
import tensorflow.contrib.slim as slim

In [8]:
len(word_to_idx)  ## the totoal number of different word

97

In [14]:
### inputs  = [50, 55, 58, 3], vocab_size: how many word that I need to create a matrix
### n_dim : how many dim that you need to vectorize a word = 10 
### vocab_size: how totoal word that I need to genearte a matrix

def n_gram(inputs, vocab_size, context_size=CONTEXT_SIZE, n_dim=EMBEDDING_DIM, scope='n-gram', reuse=tf.AUTO_REUSE):
    with tf.variable_scope(scope, reuse=reuse):
        with tf.device('/cpu:0'):
            
            ### create a embeddings matrix with 97words with 10 dimension
            embeddings = tf.get_variable('embeddings', shape=[vocab_size, n_dim], initializer=tf.random_uniform_initializer)
        # size of embedding = [97,10]
        

        # to extract the two vectors that contain input(two word) information
        embed = tf.nn.embedding_lookup(embeddings, inputs)
        
        
        # combine four vectors to be one vector
        net = tf.reshape(embed, (1, -1))
        
        
        # fully connection to (vocab_size = 97) a vector that has 97 dims to present four words
        net = slim.fully_connected(net, vocab_size, activation_fn=None, scope='classification')
        
        return net,embeddings

In [17]:
input_ph = tf.placeholder(dtype=tf.int64, shape=[4 ], name='input') ## [53,43,54,21]
label_ph = tf.placeholder(dtype=tf.int64, shape=[1,], name='label')#[2]

In [18]:
net,embeddings = n_gram(input_ph,len(word_to_idx)) ### forword-prediction completed

In [19]:
### define the loss fuction
loss = tf.losses.sparse_softmax_cross_entropy(label_ph, net, scope='loss')

In [20]:
opt = tf.train.MomentumOptimizer(1e-2, 0.9)
train_op = opt.minimize(loss)

In [23]:
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())



In [24]:
#### train 200 times
for e in range(200):
    train_loss = 0
    for word, label in gram:
        word = [word_to_idx[i] for i in word] ### data sctructure :  word: ('When', 'forty') to extract the index
        label = [word_to_idx[label]] ## extract the index with corresponding lables
        
        ## word : [65,32,43,21], lables = [5]
        _, curr_loss = sess.run([train_op, loss], feed_dict={input_ph: word, label_ph: label})
        train_loss += curr_loss
    
    if (e + 1) % 20 == 0:
        print('Epoch: {}, Loss: {:.6f}'.format(e + 1, train_loss / 100))

Epoch: 20, Loss: 0.032060
Epoch: 40, Loss: 0.012355
Epoch: 60, Loss: 0.007572
Epoch: 80, Loss: 0.005407
Epoch: 100, Loss: 0.004180
Epoch: 120, Loss: 0.003393
Epoch: 140, Loss: 0.002848
Epoch: 160, Loss: 0.002448
Epoch: 180, Loss: 0.002143
Epoch: 200, Loss: 0.001904


In [26]:
## test the result
word, label = gram[19]
print('input: {}'.format(word))
print('label: {}'.format(label))
print()
word = [word_to_idx[i] for i in word]
out = sess.run(net, feed_dict={input_ph: word})
pred_label_idx = out[0].argmax()
predict_word = idx_to_word[pred_label_idx]
print('real word is {}, predicted word is {}'.format(label, predict_word))

input: ('so', 'gazed', 'on', 'now,')
label: Will

real word is Will, predicted word is Will


In [27]:
word, label = gram[75]
print('input: {}'.format(word))
print('label: {}'.format(label))
print()
word = [word_to_idx[i] for i in word]
out = sess.run(net, feed_dict={input_ph: word})
pred_label_idx = out[0].argmax()
predict_word = idx_to_word[pred_label_idx]
print('real word is {}, predicted word is {}'.format(label, predict_word))

input: ("'This", 'fair', 'child', 'of')
label: mine

real word is mine, predicted word is mine


In [28]:
### to see a trained word-embed
embeddings_matrix = embeddings.eval()

In [29]:
embeddings_matrix.shape

(97, 10)

In [30]:
embeddings_matrix[:3]

array([[ 0.04376271,  1.1123128 ,  0.4999244 , -0.84200466, -0.8110467 ,
         0.2883291 ,  0.4943249 , -0.80180794, -0.7253908 , -0.24113272],
       [ 1.5748116 , -0.10264023,  1.508522  , -0.88337845,  0.21163328,
         0.10263743, -0.55512154, -0.05898177, -0.05966415,  1.1262923 ],
       [ 1.1398656 ,  1.0337118 ,  0.27517706,  0.8770043 ,  0.64252543,
         0.93555075, -0.5430839 ,  0.00470811, -0.5112552 ,  1.2213902 ]],
      dtype=float32)