In [12]:
# Reference: https://github.com/seyedsaeidmasoumzadeh/Predict-next-word/blob/master/main.py

In [None]:
import collections
import tensorflow as tf
import numpy as np
from tensorflow.contrib import rnn
import nltk

In [9]:
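# One-time setup: nltk.tokenize.word_tokenize needs NLTK's tokenizer data.
# Uncomment on first run ('all' fetches every NLTK package, which is large).
#nltk.download('all')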

In [10]:
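# Vocabulary lookup tables: map each distinct word to an integer id and back
# (a frequency-ranked index mapping, not a learned embedding vector).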
def word_embedding(words):
    """Build frequency-ranked lookup tables for a token sequence.

    Args:
        words: iterable of hashable tokens (repeats expected).

    Returns:
        A pair ``(word_to_id, id_to_word)``. ``word_to_id`` maps each distinct
        token to its rank in ``Counter.most_common()`` order (0 = most
        frequent); ``id_to_word`` is the inverse mapping.
    """
    ranked_tokens = collections.Counter(words).most_common()
    # most_common() yields each distinct token exactly once, so its position
    # in that list is a unique id for the token.
    word_to_id = {token: rank for rank, (token, _count) in enumerate(ranked_tokens)}
    id_to_word = {rank: token for token, rank in word_to_id.items()}
    return word_to_id, id_to_word


In [13]:
# Build Training data. For example if X = ['long', 'ago', ','] then Y = ['the']
def sampling(words, vocabulary_dictionary, window):
    """Turn a token sequence into (context, next-token) training pairs.

    Args:
        words: indexable sequence of tokens.
        vocabulary_dictionary: mapping from token to integer id; every token
            in ``words`` must be a key (a missing one raises ``KeyError``).
        window: number of consecutive tokens that form one context.

    Returns:
        A tuple ``(contexts, targets)``: ``contexts[k]`` is the list of ids of
        ``words[k : k + window]`` and ``targets[k]`` is the id of
        ``words[k + window]``. Both lists are empty when ``window`` is not
        positive or the sequence is not longer than ``window``.
    """
    contexts, targets = [], []
    if window <= 0:
        # No context can be formed, so no sample is produced.
        return contexts, targets
    for start in range(len(words) - window):
        contexts.append(
            [vocabulary_dictionary[words[start + offset]] for offset in range(window)]
        )
        # The label is the token immediately after the context window.
        targets.append(vocabulary_dictionary[words[start + window]])
    return contexts, targets

In [16]:
#Load text data

# Read the whole training corpus into memory as a single string.
with open("train.txt") as f:
    content = f.read()
    
# Split the raw text into tokens, then build the token -> id and
# id -> token lookup tables from them.
words = nltk.tokenize.word_tokenize(content)
vocabulary_dictionary, reverse_vocabulary_dictionary = word_embedding(words)

window = 3  # number of consecutive tokens used as context for one prediction
num_classes = len(vocabulary_dictionary)  # one output class per distinct token
timesteps = window  # the RNN is unrolled once per context token
num_hidden = 512  # passed to BasicLSTMCell; also the row count of the output weights
num_input = 1  # each time step feeds a single number (the token id)
batch_size = 20  # samples per training mini-batch
iteration = 200  # number of passes the training loop makes over the data


# Slide the window over the corpus: training_data[k] holds `window` token ids
# and label[k] is the id of the token that follows them.
training_data, label = sampling(words, vocabulary_dictionary, window)

In [17]:
# Output projection applied to the last LSTM output inside RNN():
# logits = last_output @ weights['out'] + biases['out'].
# Both variables start from random-normal values.
weights = {
    # (num_hidden, num_classes): maps the LSTM output to one score per token
    'out': tf.Variable(tf.random_normal([num_hidden, num_classes]))
}
biases = {
    # (num_classes,): per-token additive offset
    'out': tf.Variable(tf.random_normal([num_classes]))
}

In [18]:
# tf graph input
# X: a batch of context windows, shape (batch, timesteps, num_input); the
#    training loop feeds integer token ids here, one per time step.
# Y: the matching one-hot next-token labels, shape (batch, num_classes).
# The leading None lets the batch dimension vary between runs.
X = tf.placeholder("float", [None, timesteps, num_input])
Y = tf.placeholder("float", [None, num_classes])

In [19]:
def RNN(x, weights, biases):
    """Run an LSTM over the context window and project its last output to logits.

    Args:
        x: input tensor that is split along axis 1 into ``timesteps`` slices
            (the ``X`` placeholder is declared as (batch, timesteps, num_input)).
        weights: dict whose ``'out'`` entry is the output projection matrix.
        biases: dict whose ``'out'`` entry is the output bias vector.

    Returns:
        The un-normalised scores ``last_output @ weights['out'] + biases['out']``.
    """
    # static_rnn consumes a Python list holding one tensor per time step.
    step_inputs = tf.unstack(x, timesteps, 1)

    # A single LSTM layer with num_hidden units.
    cell = rnn.BasicLSTMCell(num_hidden, forget_bias=1.0)

    # Unroll the cell over the window; the final state is not needed here.
    step_outputs, _final_state = rnn.static_rnn(cell, step_inputs, dtype=tf.float32)

    # Only the output of the last time step feeds the linear output layer.
    last_output = step_outputs[-1]
    return tf.matmul(last_output, weights['out']) + biases['out']

In [20]:
# Raw per-token scores produced by the network for the X placeholder.
logits = RNN(X, weights, biases)
# Softmax of the scores; used below only to pick the arg-max for accuracy.
prediction = tf.nn.softmax(logits)

In [21]:
# Loss and optimizer
# Loss: softmax cross-entropy between the raw logits and the one-hot labels,
# averaged over the batch. It takes `logits`, not `prediction`, because the
# op applies the softmax itself.
loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y))
# One RMSProp update step (learning rate 0.001) that minimises the loss.
train_op = tf.train.RMSPropOptimizer(learning_rate=0.001).minimize(loss_op)
# Accuracy: fraction of samples whose highest-probability token equals the
# token marked in the one-hot label.
correct_pred = tf.equal(tf.argmax(prediction,1), tf.argmax(Y,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [22]:
# Op that assigns every global variable its initial value. It does nothing
# until it is executed with sess.run(init) at the start of training.
init = tf.global_variables_initializer()

In [None]:
# Mini-batches per epoch, computed once because it does not change between
# epochs. Ceiling division is used so that a sample count that is an exact
# multiple of batch_size does not schedule a trailing empty batch (feeding an
# empty batch makes the mean loss undefined, i.e. NaN).
num_samples = len(training_data)
training_steps = (num_samples + batch_size - 1) // batch_size

with tf.Session() as sess:
    # Run the initializer
    sess.run(init)
    for i in range(iteration):  # i counts full passes (epochs) over the data
        for step in range(training_steps):
            start = step * batch_size
            stop = start + batch_size

            # Slicing clips at the end of the list, so the last batch may hold
            # fewer than batch_size samples; -1 lets reshape infer its size.
            X_batch = np.array(training_data[start:stop]).reshape(-1, timesteps, num_input)

            # One-hot encode the labels in a single vectorised assignment:
            # row k gets a 1.0 in column Y_batch[k].
            Y_batch = label[start:stop]
            Y_batch_encoded = np.zeros((len(Y_batch), num_classes), dtype=float)
            Y_batch_encoded[np.arange(len(Y_batch)), Y_batch] = 1.0

            # One optimisation step; accuracy and loss are fetched for logging.
            _, acc, loss = sess.run([train_op, accuracy, loss_op], feed_dict={X: X_batch, Y: Y_batch_encoded})
            # Report both counters: the original line labelled the epoch index "Step".
            print("Epoch " + str(i) + ", Step " + str(step) + ", Minibatch Loss= " + "{:.4f}".format(loss) + ", Training Accuracy= " + "{:.2f}".format(acc * 100))

Step 0, Minibatch Loss= 41.3661, Training Accuracy= 0.00
Step 0, Minibatch Loss= 37.4009, Training Accuracy= 0.00
Step 0, Minibatch Loss= 38.8068, Training Accuracy= 0.00
Step 0, Minibatch Loss= 41.3389, Training Accuracy= 0.00
Step 0, Minibatch Loss= 40.3996, Training Accuracy= 0.00
Step 0, Minibatch Loss= 43.0225, Training Accuracy= 0.00
Step 0, Minibatch Loss= 34.1131, Training Accuracy= 0.00
Step 0, Minibatch Loss= 33.8994, Training Accuracy= 0.00
Step 0, Minibatch Loss= 35.4247, Training Accuracy= 0.00
Step 0, Minibatch Loss= 29.2236, Training Accuracy= 0.00
Step 0, Minibatch Loss= 28.1835, Training Accuracy= 0.00
Step 0, Minibatch Loss= 26.4462, Training Accuracy= 0.00
Step 0, Minibatch Loss= 28.3528, Training Accuracy= 0.00
Step 0, Minibatch Loss= 30.8743, Training Accuracy= 0.00
Step 0, Minibatch Loss= 30.2847, Training Accuracy= 0.00
Step 0, Minibatch Loss= 27.5918, Training Accuracy= 0.00
Step 0, Minibatch Loss= 23.5280, Training Accuracy= 0.00
Step 0, Minibatch Loss= 28.2995

Step 0, Minibatch Loss= 14.4016, Training Accuracy= 5.00
Step 0, Minibatch Loss= 11.0187, Training Accuracy= 5.00
Step 0, Minibatch Loss= 14.9105, Training Accuracy= 0.00
Step 0, Minibatch Loss= 14.4272, Training Accuracy= 0.00
Step 0, Minibatch Loss= 12.4794, Training Accuracy= 0.00
Step 0, Minibatch Loss= 12.7518, Training Accuracy= 5.00
Step 0, Minibatch Loss= 13.6831, Training Accuracy= 5.00
Step 0, Minibatch Loss= 10.9753, Training Accuracy= 0.00
Step 0, Minibatch Loss= 13.5230, Training Accuracy= 0.00
Step 0, Minibatch Loss= 13.5248, Training Accuracy= 10.00
Step 0, Minibatch Loss= 15.0145, Training Accuracy= 5.00
Step 0, Minibatch Loss= 14.1835, Training Accuracy= 0.00
Step 0, Minibatch Loss= 14.0893, Training Accuracy= 0.00
Step 0, Minibatch Loss= 13.5815, Training Accuracy= 5.00
Step 0, Minibatch Loss= 11.9862, Training Accuracy= 0.00
Step 0, Minibatch Loss= 13.2237, Training Accuracy= 0.00
Step 0, Minibatch Loss= 13.1807, Training Accuracy= 5.00
Step 0, Minibatch Loss= 13.543

Step 0, Minibatch Loss= 13.0617, Training Accuracy= 0.00
Step 0, Minibatch Loss= 11.7825, Training Accuracy= 0.00
Step 0, Minibatch Loss= 13.0713, Training Accuracy= 0.00
Step 0, Minibatch Loss= 11.0475, Training Accuracy= 5.00
Step 0, Minibatch Loss= 11.4501, Training Accuracy= 10.00
Step 0, Minibatch Loss= 12.6998, Training Accuracy= 0.00
Step 0, Minibatch Loss= 11.4804, Training Accuracy= 0.00
Step 0, Minibatch Loss= 12.6493, Training Accuracy= 5.00
Step 0, Minibatch Loss= 12.4992, Training Accuracy= 0.00
Step 0, Minibatch Loss= 12.1552, Training Accuracy= 0.00
Step 0, Minibatch Loss= 11.9798, Training Accuracy= 0.00
Step 0, Minibatch Loss= 13.2130, Training Accuracy= 5.00
Step 0, Minibatch Loss= 12.1982, Training Accuracy= 0.00
Step 0, Minibatch Loss= 10.5193, Training Accuracy= 0.00
Step 0, Minibatch Loss= 11.3501, Training Accuracy= 10.00
Step 0, Minibatch Loss= 11.9886, Training Accuracy= 10.00
Step 0, Minibatch Loss= 9.6905, Training Accuracy= 10.00
Step 0, Minibatch Loss= 12.9

Step 0, Minibatch Loss= 10.6784, Training Accuracy= 5.00
Step 0, Minibatch Loss= 11.8416, Training Accuracy= 0.00
Step 0, Minibatch Loss= 10.4630, Training Accuracy= 10.00
Step 0, Minibatch Loss= 9.7488, Training Accuracy= 10.00
Step 0, Minibatch Loss= 10.4498, Training Accuracy= 20.00
Step 0, Minibatch Loss= 10.7889, Training Accuracy= 10.00
Step 0, Minibatch Loss= 10.9509, Training Accuracy= 15.00
Step 0, Minibatch Loss= 11.5291, Training Accuracy= 0.00
Step 0, Minibatch Loss= 9.1196, Training Accuracy= 10.00
Step 0, Minibatch Loss= 12.6060, Training Accuracy= 0.00
Step 0, Minibatch Loss= 10.3224, Training Accuracy= 10.00
Step 0, Minibatch Loss= 11.4777, Training Accuracy= 5.00
Step 0, Minibatch Loss= 9.9107, Training Accuracy= 0.00
Step 0, Minibatch Loss= 11.4654, Training Accuracy= 5.00
Step 0, Minibatch Loss= 11.1039, Training Accuracy= 10.00
Step 0, Minibatch Loss= 14.1092, Training Accuracy= 5.00
Step 0, Minibatch Loss= 11.2144, Training Accuracy= 0.00
Step 0, Minibatch Loss= 11

Step 0, Minibatch Loss= 11.3681, Training Accuracy= 10.00
Step 0, Minibatch Loss= 10.2434, Training Accuracy= 0.00
Step 0, Minibatch Loss= 11.9179, Training Accuracy= 0.00
Step 0, Minibatch Loss= 10.2030, Training Accuracy= 10.00
Step 0, Minibatch Loss= 10.2367, Training Accuracy= 5.00
Step 0, Minibatch Loss= 10.7321, Training Accuracy= 0.00
Step 0, Minibatch Loss= 11.0370, Training Accuracy= 10.00
Step 0, Minibatch Loss= 13.5627, Training Accuracy= 0.00
Step 0, Minibatch Loss= 12.8890, Training Accuracy= 5.00
Step 0, Minibatch Loss= 12.0231, Training Accuracy= 0.00
Step 0, Minibatch Loss= 12.1170, Training Accuracy= 10.00
Step 0, Minibatch Loss= 10.9350, Training Accuracy= 5.00
Step 0, Minibatch Loss= 13.1942, Training Accuracy= 0.00
Step 0, Minibatch Loss= 10.5355, Training Accuracy= 15.00
Step 0, Minibatch Loss= 9.4298, Training Accuracy= 5.00
Step 0, Minibatch Loss= 9.8823, Training Accuracy= 5.00
Step 0, Minibatch Loss= 11.6542, Training Accuracy= 0.00
Step 0, Minibatch Loss= 10.1

Step 0, Minibatch Loss= 11.4279, Training Accuracy= 5.00
Step 0, Minibatch Loss= 12.3179, Training Accuracy= 0.00
Step 0, Minibatch Loss= 10.1125, Training Accuracy= 0.00
Step 0, Minibatch Loss= 9.8040, Training Accuracy= 0.00
Step 0, Minibatch Loss= 9.1504, Training Accuracy= 10.00
Step 0, Minibatch Loss= 9.3141, Training Accuracy= 10.00
Step 0, Minibatch Loss= 8.2091, Training Accuracy= 0.00
Step 0, Minibatch Loss= 10.6945, Training Accuracy= 0.00
Step 0, Minibatch Loss= 7.0840, Training Accuracy= 0.00
Step 0, Minibatch Loss= 12.5119, Training Accuracy= 0.00
Step 0, Minibatch Loss= 9.5524, Training Accuracy= 10.00
Step 0, Minibatch Loss= 9.3944, Training Accuracy= 5.00
Step 0, Minibatch Loss= 11.4356, Training Accuracy= 0.00
Step 0, Minibatch Loss= 12.8088, Training Accuracy= 0.00
Step 0, Minibatch Loss= 10.2483, Training Accuracy= 5.00
Step 0, Minibatch Loss= 10.5153, Training Accuracy= 0.00
Step 0, Minibatch Loss= 9.6133, Training Accuracy= 5.00
Step 0, Minibatch Loss= 9.8224, Trai

Step 0, Minibatch Loss= 8.5650, Training Accuracy= 5.00
Step 0, Minibatch Loss= 10.7150, Training Accuracy= 5.00
Step 0, Minibatch Loss= 10.4929, Training Accuracy= 5.00
Step 0, Minibatch Loss= 10.0024, Training Accuracy= 0.00
Step 0, Minibatch Loss= 9.4435, Training Accuracy= 0.00
Step 0, Minibatch Loss= 8.1304, Training Accuracy= 5.00
Step 0, Minibatch Loss= 10.7587, Training Accuracy= 0.00
Step 0, Minibatch Loss= 9.8607, Training Accuracy= 0.00
Step 0, Minibatch Loss= 8.4395, Training Accuracy= 10.00
Step 0, Minibatch Loss= 10.3839, Training Accuracy= 5.00
Step 0, Minibatch Loss= 8.9022, Training Accuracy= 0.00
Step 0, Minibatch Loss= 10.7030, Training Accuracy= 5.00
Step 0, Minibatch Loss= 10.1824, Training Accuracy= 15.00
Step 0, Minibatch Loss= 8.9806, Training Accuracy= 10.00
Step 0, Minibatch Loss= 8.1876, Training Accuracy= 5.00
Step 0, Minibatch Loss= 10.5257, Training Accuracy= 0.00
Step 0, Minibatch Loss= 9.2803, Training Accuracy= 10.00
Step 0, Minibatch Loss= 9.3660, Trai

Step 0, Minibatch Loss= 11.0302, Training Accuracy= 5.00
Step 0, Minibatch Loss= 10.0695, Training Accuracy= 0.00
Step 0, Minibatch Loss= 9.3952, Training Accuracy= 15.00
Step 0, Minibatch Loss= 8.8092, Training Accuracy= 0.00
Step 0, Minibatch Loss= 9.0083, Training Accuracy= 0.00
Step 0, Minibatch Loss= 10.5713, Training Accuracy= 0.00
Step 0, Minibatch Loss= 7.8341, Training Accuracy= 5.00
Step 0, Minibatch Loss= 9.1521, Training Accuracy= 5.00
Step 0, Minibatch Loss= 9.6785, Training Accuracy= 5.00
Step 0, Minibatch Loss= 10.0259, Training Accuracy= 10.00
Step 0, Minibatch Loss= 12.5257, Training Accuracy= 0.00
