In [None]:
import os
import tensorflow as tf
import collections
import json
import os
import numpy as np


# Hyper-parameters shared by the training cell and the prediction cell.

# Length of every token window: passed as `num_steps` (and as `skip_step`) to
# the batch generators and the model, and used as the minimum number of words
# the prediction prompt accepts.
CONFIG_number_of_words = 3
# Number of windows per batch produced by the batch generators.
CONFIG_batch_size = 200
# Passed to create_model as the embedding width and the LSTM unit count.
CONFIG_hidden_size = 1500
# Number of epochs passed to model.fit.
CONFIG_num_epochs = 50
# Learning rate handed to the Adam optimizer.
CONFIG_learning_rate = 0.0001
# NOTE(review): not referenced by any code visible in this notebook.
CONFIG_learning_rate_decay = 0


# Directory (relative to the current working directory) that load_data reads
# the PTB text files from.
data_path = os.path.join(os.getcwd(), 'data')


def load_dictionary(path):
    """Read the JSON file at ``path`` and return the decoded object.

    Args:
        path: Location of a JSON file (for example one written by
            ``save_json``).

    Returns:
        The decoded JSON value. Note that JSON object keys are always
        strings, so integer keys written earlier come back as ``str``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # The context manager guarantees the handle is closed; the previous
    # `open(path).read()` left that to the garbage collector.
    with open(path) as handle:
        return json.load(handle)


def read_words(filename):
    """Read a text file and return its whitespace-separated tokens.

    Every newline is turned into an ``'<eos>'`` token before splitting.

    Args:
        filename: Path handed to ``tf.io.gfile.GFile``.

    Returns:
        A list of token strings, with ``'<eos>'`` marking each line end.
    """
    with tf.io.gfile.GFile(filename, 'r') as f:
        # Surround the marker with spaces so it is always its own token.
        # Replacing '\n' with a bare '<eos>' fuses it with neighbouring words
        # whenever a line lacks leading/trailing whitespace ("a\nb" would
        # become the single token "a<eos>b"). For text whose lines are
        # already space-padted on both sides the resulting tokens are the same.
        return f.read().replace('\n', ' <eos> ').split()


def build_vocab(filename):
    """Build a word -> integer id mapping from the tokens of ``filename``.

    Ids are assigned in order of decreasing frequency; words with equal
    frequency are ordered alphabetically, so the most frequent word gets
    id 0.

    Args:
        filename: Path of the text file, read through ``read_words``.

    Returns:
        A dict mapping each distinct token to its id. An input without any
        tokens yields an empty dict.
    """
    data = read_words(filename)

    counter = collections.Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

    # Enumerating the sorted pairs directly also works for an empty corpus,
    # where unpacking `zip(*count_pairs)` used to raise ValueError.
    return {word: idx for idx, (word, _) in enumerate(count_pairs)}


def file_to_word_ids(filename, word_to_id):
    """Convert the tokens of ``filename`` into their vocabulary ids.

    Tokens that are not keys of ``word_to_id`` are skipped.

    Args:
        filename: Path of the text file, read through ``read_words``.
        word_to_id: Mapping from token to integer id.

    Returns:
        A list with the id of every known token, in file order.
    """
    ids = []
    for token in read_words(filename):
        if token not in word_to_id:
            continue
        ids.append(word_to_id[token])
    return ids


def load_data():
    """Load the PTB train/validation files from ``data_path`` as id lists.

    The vocabulary is built from the training file only and is then used to
    encode both files.

    Returns:
        A tuple ``(train_data, valid_data, total_words, reversed_dictionary,
        dictionary)`` where the first two are lists of word ids,
        ``total_words`` is the vocabulary size, ``reversed_dictionary`` maps
        id -> word and ``dictionary`` maps word -> id.
    """
    train_path, valid_path = (
        os.path.join(data_path, name)
        for name in ('ptb.train.txt', 'ptb.valid.txt')
    )

    word_to_id = build_vocab(train_path)
    total_words = len(word_to_id)

    train_data = file_to_word_ids(train_path, word_to_id)
    valid_data = file_to_word_ids(valid_path, word_to_id)

    # id -> word, then flipped back again to obtain word -> id.
    reversed_dictionary = {idx: word for word, idx in word_to_id.items()}
    dictionary = dict(
        (word, idx) for idx, word in reversed_dictionary.items())

    print('\ntotalwords : ', total_words, '\n')
    return train_data, valid_data, total_words, reversed_dictionary, dictionary


def save_json(dictionary, filename):
    """Serialise ``dictionary`` as JSON into the file ``filename``.

    Args:
        dictionary: JSON-serialisable object to store.
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, mode='w') as out_file:
        json.dump(dictionary, out_file)


class BatchGenerator(object):
    """Endless producer of (input, one-hot target) batches from a token list.

    Each sample is a window of ``num_steps`` consecutive ids taken from
    ``data``; its target is the same window shifted one position to the
    right, one-hot encoded over ``total_words`` classes. After every sample
    the read position advances by ``skip_step`` and wraps to 0 once a full
    window plus its shifted target no longer fits.

    Args:
        data: Sequence of integer word ids.
        num_steps: Number of ids per window.
        batch_size: Number of windows per yielded batch.
        total_words: Number of classes of the one-hot targets.
        skip_step: How far the read position moves after each window.
    """

    def __init__(self, data, num_steps, batch_size, total_words, skip_step=5):
        self.data = data
        self.num_steps = num_steps
        self.batch_size = batch_size
        self.total_words = total_words
        # Read position inside `data`; it persists across batches.
        self.current_idx = 0
        self.skip_step = skip_step

    def generate(self):
        """Yield ``(x, y)`` batches forever.

        Yields:
            ``x``: float array of shape ``(batch_size, num_steps)`` holding
            word ids. ``y``: float array of shape
            ``(batch_size, num_steps, total_words)`` where ``y[b, t]`` is the
            one-hot encoding of the id that follows ``x[b, t]`` in ``data``.

        Raises:
            ValueError: If ``data`` is too short to hold one window together
                with its shifted target (``len(data) <= num_steps``).
        """
        if len(self.data) <= self.num_steps:
            # Previously this surfaced as an opaque broadcasting error, or
            # silently broadcast a single target across the whole window.
            raise ValueError(
                'data must contain more than num_steps (%d) items, got %d'
                % (self.num_steps, len(self.data)))

        step_positions = np.arange(self.num_steps)
        while True:
            # Fresh arrays per batch: reusing one buffer would overwrite
            # batches that a consumer is still holding on to.
            x = np.zeros((self.batch_size, self.num_steps))
            y = np.zeros((self.batch_size, self.num_steps, self.total_words))
            for i in range(self.batch_size):
                if self.current_idx + self.num_steps >= len(self.data):
                    # Not enough items left for a window and its target.
                    self.current_idx = 0
                start = self.current_idx
                stop = start + self.num_steps
                x[i, :] = self.data[start:stop]
                targets = np.asarray(self.data[start + 1:stop + 1], dtype=int)
                # One-hot encode: set a single 1 per time step.
                y[i, step_positions, targets] = 1.0
                self.current_idx += self.skip_step
            yield x, y


def create_model(total_words, hidden_size, num_steps, optimizer='adam'):
    """Assemble and compile the sequential next-word model.

    Layer order: embedding, four sequence-returning LSTMs, two
    time-distributed dense layers (1024 and 512 units, each followed by a
    ReLU activation, with dropout after the first), and a time-distributed
    dense output of ``total_words`` units followed by softmax.

    Args:
        total_words: Vocabulary size; used as embedding input dimension and
            as the width of the output layer.
        hidden_size: Embedding width and number of units of every LSTM.
        num_steps: Input sequence length given to the embedding layer.
        optimizer: Optimizer name or instance passed to ``compile``.

    Returns:
        The compiled ``tf.keras`` model (categorical cross-entropy loss,
        categorical accuracy metric).
    """
    layers = tf.keras.layers
    model = tf.keras.models.Sequential()

    # Embedding layer / Input layer
    stack = [layers.Embedding(total_words, hidden_size, input_length=num_steps)]

    # 4 LSTM layers, each emitting the full sequence
    stack.extend(
        layers.LSTM(units=hidden_size, return_sequences=True)
        for _ in range(4))

    # Fully Connected layers
    stack.append(layers.TimeDistributed(layers.Dense(1024)))
    stack.append(layers.Activation('relu'))
    stack.append(layers.Dropout(0.3, seed=0))
    stack.append(layers.TimeDistributed(layers.Dense(512)))
    stack.append(layers.Activation('relu'))

    # Output Layer
    stack.append(layers.TimeDistributed(layers.Dense(total_words)))
    stack.append(layers.Activation('softmax'))

    for layer in stack:
        model.add(layer)

    model.compile(
        loss='categorical_crossentropy',
        optimizer=optimizer,
        metrics=[tf.keras.metrics.categorical_accuracy],
    )
    return model

# --- Training cell: load the data, build the model, train, and save. ---

train_data, valid_data, total_words, indexToString, stringToIndex = load_data()

# Windows are CONFIG_number_of_words long and the generators advance by the
# same amount, so consecutive windows do not overlap.
train_data_generator = BatchGenerator(
    train_data, CONFIG_number_of_words, CONFIG_batch_size, total_words, skip_step=CONFIG_number_of_words)
valid_data_generator = BatchGenerator(
    valid_data, CONFIG_number_of_words, CONFIG_batch_size, total_words, skip_step=CONFIG_number_of_words)
optimizer = tf.keras.optimizers.Adam(learning_rate=CONFIG_learning_rate)
model = create_model(total_words=total_words, hidden_size=CONFIG_hidden_size,
                     num_steps=CONFIG_number_of_words, optimizer=optimizer)
# summary() prints by itself and returns None; wrapping it in print() only
# added a stray "None" line to the output.
model.summary()

# Write one checkpoint file per epoch.
checkpointer = tf.keras.callbacks.ModelCheckpoint(
    filepath=os.path.join('/content/model/checkpoint', 'model-{epoch:02d}.keras'), verbose=1)

# Persist both vocabulary mappings so the prediction cell can reload them.
# `data_path` is the same <cwd>/data directory the corpus was read from.
save_json(stringToIndex, os.path.join(data_path, 'stringToIndex.json'))

save_json(indexToString, os.path.join(data_path, 'indexToString.json'))

# Each step consumes batch_size windows of CONFIG_number_of_words tokens, so
# one epoch is sized to make roughly a single pass over the data.
tokens_per_step = CONFIG_batch_size * CONFIG_number_of_words
model.fit(
    train_data_generator.generate(),
    steps_per_epoch=len(train_data) // tokens_per_step,
    epochs=CONFIG_num_epochs,
    validation_data=valid_data_generator.generate(),
    validation_steps=len(valid_data) // tokens_per_step,
    callbacks=[checkpointer],
)


model.save(os.path.join('/content/model/', 'model.keras'))



totalwords :  9948 

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 1500)           14922000  
                                                                 
 lstm (LSTM)                 (None, 3, 1500)           18006000  
                                                                 
 lstm_1 (LSTM)               (None, 3, 1500)           18006000  
                                                                 
 lstm_2 (LSTM)               (None, 3, 1500)           18006000  
                                                                 
 lstm_3 (LSTM)               (None, 3, 1500)           18006000  
                                                                 
 time_distributed (TimeDist  (None, 3, 1024)           1537024   
 ributed)                                                        
                                  

In [None]:
from numpy import argmax
import os
import tensorflow as tf

# Reload the vocabulary mappings written by the training cell. This rebinds
# the `indexToString` / `stringToIndex` names; because the mappings went
# through JSON, the keys of `indexToString` are strings (see the
# `str(index)` lookup in indexes_to_string).
indexToString = load_dictionary(os.path.join('/content/data/', 'indexToString.json'))
stringToIndex = load_dictionary(os.path.join('/content/data/', 'stringToIndex.json'))

# Load the model file saved at the end of the training cell.
model = tf.keras.models.load_model(os.path.join('/content/', 'model', 'model.keras'))

def predict_next_word(string, verbose=True, NUMBER_OF_PREDICTIONS=1):
    """Suggest continuations for ``string`` using the loaded model.

    The last ``CONFIG_number_of_words`` words of ``string`` are fed to the
    model and the most probable next words at the final time step are
    appended to the original text.

    Args:
        string: Input text; words are separated by whitespace.
        verbose: When true, show the Keras prediction progress and print the
            probability and index of every selected word. When false, both
            are suppressed.
        NUMBER_OF_PREDICTIONS: How many of the most probable next words to
            return.

    Returns:
        A list of ``NUMBER_OF_PREDICTIONS`` strings, each being ``string``
        followed by a space and one predicted word, ordered from most to
        least probable. Returns ``None`` (after printing a hint) when fewer
        than ``CONFIG_number_of_words`` words were supplied.
    """
    ques_bool = False
    idx, ques_bool = string_to_indexes(string.split(), ques_bool)

    if len(idx) < CONFIG_number_of_words:
        print('\n\nPlease enter at least', CONFIG_number_of_words, ' words.\n')
        return None

    # Shape (1, num_steps): a single-sample batch with the trailing words.
    window = np.array([idx[-CONFIG_number_of_words:]])
    # The two former `if verbose` branches were identical; forward the flag
    # so that verbose=False really silences Keras' progress output.
    prediction = model.predict(window, verbose=1 if verbose else 0)

    # Distribution over the vocabulary at the last time step (a view, so the
    # zeroing below works on `prediction` in place, as before).
    last_step = prediction[:, CONFIG_number_of_words - 1, :]
    best_predictions = []

    for _ in range(NUMBER_OF_PREDICTIONS):
        argmax_idx = argmax(last_step)
        if verbose:
            print(last_step[:, argmax_idx])
        best_predictions.append(argmax_idx)
        # Knock out the winner so the next round picks the runner-up.
        last_step[:, argmax_idx] = 0.0

    if verbose:
        print('\nprediction indexes\t:', best_predictions)
    converted_string = indexes_to_string(best_predictions, ques_bool)

    return [string + ' ' + word for word in converted_string]


def string_to_indexes(array_of_string, ques_bool):
    """Translate words into vocabulary ids via the global ``stringToIndex``.

    ``'<rare word>'`` is mapped to ``'<unk>'`` and ``'.'`` / ``'?'`` to
    ``'<eos>'`` before the lookup. Words missing from the vocabulary are
    reported on stdout and replaced by ``'<unk>'``.

    Args:
        array_of_string: Iterable of word strings.
        ques_bool: Incoming "looks like a question" flag.

    Returns:
        A tuple ``(array_of_indexes, ques_bool)``: the list of ids, and the
        flag, which is switched to ``True`` if any word is one of the
        lower-case question words listed below.

    Raises:
        KeyError: If a fallback is needed but ``'<unk>'`` itself is not in
            the vocabulary.
    """
    question_words = ('what', 'why', 'who', 'how', 'whose', 'when', 'which', 'where')
    array_of_indexes = []
    for word in array_of_string:
        # The three cases are mutually exclusive, hence the elif chain.
        if word == '<rare word>':
            word = '<unk>'
        elif word == '.' or word == '?':
            word = '<eos>'
        elif word in question_words:
            ques_bool = True

        try:
            array_of_indexes.append(stringToIndex[word])
        except KeyError:
            # Only a missing vocabulary entry is expected here; a bare
            # `except:` would also hide unrelated errors and interrupts.
            print("Word ", word, " does not exist in the vocabulary!\nReplacing it with '<unk>'")
            word = '<unk>'
            array_of_indexes.append(stringToIndex[word])
    return array_of_indexes, ques_bool

def indexes_to_string(array_of_indexes, ques_bool):
    """Translate vocabulary ids back into words via ``indexToString``.

    Each index is converted with ``str()`` before the lookup. The
    ``'<eos>'`` token is rendered as ``'?'`` when ``ques_bool`` is true and
    as ``'.'`` otherwise; every other word is returned unchanged.

    Args:
        array_of_indexes: Iterable of ids.
        ques_bool: Whether the sentence is treated as a question.

    Returns:
        A list with one word per input index, in the same order.
    """
    sentence_end = '?' if ques_bool else '.'
    looked_up = (indexToString[str(index)] for index in array_of_indexes)
    return [sentence_end if word == '<eos>' else word for word in looked_up]

# Interactive prompt: keep asking for text and print the numbered
# suggestions returned by predict_next_word. The loop has no exit condition
# of its own.
while True:
    prompt = '\n\nEnter atleast ' + str(CONFIG_number_of_words) + ' words: \n'
    sentences = predict_next_word(string=input(prompt), NUMBER_OF_PREDICTIONS=1)
    print('\n')
    if not sentences:
        # Nothing to show (e.g. too few words were entered); ask again.
        continue
    for count, sentence in enumerate(sentences, start=1):
        print(count, '\t-', sentence)


Word  hello  does not exist in the vocabulary!
Replacing it with '<unk>'
[0.06052542]

prediction indexes	: [1]


1 	- hello this is <unk>
