***
# Next Word Generation Project
***

In [8]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import json

In [3]:
# Reading corpus the text file
path = 'trim.txt'

def read_text_data(path):
    with open(path, 'r', encoding='utf-8') as myfile:
        text = myfile.read()
    return text

text = read_text_data(path)

In [4]:
text



In [5]:
mytokenizer = Tokenizer()
mytokenizer.fit_on_texts([text])
word_length = len(mytokenizer.word_index) + 1

In [10]:
# Save the tokenizer configuration to a file
tokenizer_path = 'tokenizer_config.json'
with open(tokenizer_path, 'w') as json_file:
    json.dump(mytokenizer.get_config(), json_file)

In [5]:
mytokenizer.word_index

{'i': 1,
 'feel': 2,
 'and': 3,
 'to': 4,
 'the': 5,
 'a': 6,
 'joy': 7,
 'feeling': 8,
 'that': 9,
 'of': 10,
 'sadness': 11,
 'my': 12,
 'in': 13,
 'it': 14,
 'like': 15,
 'so': 16,
 'for': 17,
 'im': 18,
 'me': 19,
 'was': 20,
 'have': 21,
 'anger': 22,
 'but': 23,
 'is': 24,
 'am': 25,
 'this': 26,
 'with': 27,
 'fear': 28,
 'not': 29,
 'about': 30,
 'love': 31,
 'be': 32,
 'you': 33,
 'on': 34,
 'as': 35,
 'just': 36,
 'at': 37,
 'or': 38,
 'when': 39,
 'more': 40,
 'all': 41,
 'because': 42,
 'do': 43,
 'can': 44,
 'really': 45,
 'up': 46,
 'by': 47,
 't': 48,
 'are': 49,
 'very': 50,
 'if': 51,
 'know': 52,
 'been': 53,
 'out': 54,
 'little': 55,
 'time': 56,
 'myself': 57,
 'how': 58,
 'what': 59,
 'will': 60,
 'get': 61,
 'now': 62,
 'from': 63,
 'had': 64,
 'people': 65,
 'want': 66,
 'being': 67,
 'they': 68,
 'he': 69,
 'surprise': 70,
 'would': 71,
 'them': 72,
 'one': 73,
 'her': 74,
 'who': 75,
 'still': 76,
 'even': 77,
 'ive': 78,
 'think': 79,
 'some': 80,
 'an': 81,


****
># Sequence Creation
****
Here the model is splitted into lines and then tokenised. The sequences are created from the corpus

In [6]:
def create_sequence(data):
    "function create sequences from the line of text in the corpus"
    text = data
    sequences = []
    
    for line in text.split('\n'):
        #do word tokenisation on each line in the whole corpus(text)
        token_list = mytokenizer.texts_to_sequences([line])[0]
        
        for i in range(1, len(token_list)):
            #sequence creation
            sequence = token_list[:i+1]
            sequences.append(sequence)
            
    return sequences

my_input_sequences = create_sequence(text)

***
># Padding
****
Here the sequences are padded so as to achieve equal pad length for all sequences to be created

In [7]:
def pad_sequences_here(sequence):
    max_len = max([len(seq) for seq in sequence])
    input_sequences = np.array(pad_sequences(sequence,
                                             maxlen=max_len,
                                             padding='pre')
                              )
    return input_sequences, max_len
input_sequences = pad_sequences_here(my_input_sequences)[0]

In [8]:
input_sequences[1]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  18,
         8, 208])

In [9]:
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

In [10]:
X[1]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 18,  8])

In [11]:
y

array([   8,  208,  650, ..., 1861, 2912,    7])

In [12]:
# The output is one-hot encoded

y = np.array(tf.keras.utils.to_categorical(y, num_classes=word_length))

In [13]:
y[1]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

***
># Sequential Model Building
***

In [13]:
max_sequence_len = pad_sequences_here(my_input_sequences)[1]

model = Sequential()
model.add(Embedding(word_length, 100, input_length=max_sequence_len-1))
model.add(LSTM(150))
model.add(Dense(word_length, activation='softmax'))
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 66, 100)           1192000   
                                                                 
 lstm (LSTM)                 (None, 150)               150600    
                                                                 
 dense (Dense)               (None, 11920)             1799920   
                                                                 
Total params: 3,142,520
Trainable params: 3,142,520
Non-trainable params: 0
_________________________________________________________________
None


In [14]:
print(max_sequence_len)

67


In [16]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x277cfdf92b0>

In [17]:
def predict_words(input_text, predict_next_words=8):
    
    for _ in range(predict_next_words):
        token_list = mytokenizer.texts_to_sequences([input_text])[0]
        print(token_list)
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = np.argmax(model.predict(token_list), axis=-1)
        output_word = ""
        for word, index in mytokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        input_text += " " + output_word
    return print(input_text)

predicted_word = predict_words("i have been with")

[1, 21, 53, 27]
[1, 21, 53, 27, 4494]
[1, 21, 53, 27, 4494, 17]
[1, 21, 53, 27, 4494, 17, 228]
[1, 21, 53, 27, 4494, 17, 228, 3]
[1, 21, 53, 27, 4494, 17, 228, 3, 1]
[1, 21, 53, 27, 4494, 17, 228, 3, 1, 2]
[1, 21, 53, 27, 4494, 17, 228, 3, 1, 2, 15]
i have been with petronas for years and i feel like i


In [18]:
# Save the model
model.save('model2.h5')