## Import libraries

In [1]:
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense




In [2]:
# sample text data
text_data = [
    "hello how are you",
    "hello how have you been",
    "hi there",
    "good morning",
    "good night",
    "have a nice day",
    "how is it going",
    "how have you been",
    "nice to meet you",
    "thank you"]

In [3]:
# prepare the tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)
total_words = len(tokenizer.word_index) + 1

In [4]:
total_words 

21

In [5]:
# create sequences of words
input_sequence = []
for line in text_data:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1,len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequence.append(n_gram_sequence)

In [6]:
input_sequence

[[4, 2],
 [4, 2, 8],
 [4, 2, 8, 1],
 [4, 2],
 [4, 2, 3],
 [4, 2, 3, 1],
 [4, 2, 3, 1, 5],
 [9, 10],
 [6, 11],
 [6, 12],
 [3, 13],
 [3, 13, 7],
 [3, 13, 7, 14],
 [2, 15],
 [2, 15, 16],
 [2, 15, 16, 17],
 [2, 3],
 [2, 3, 1],
 [2, 3, 1, 5],
 [7, 18],
 [7, 18, 19],
 [7, 18, 19, 1],
 [20, 1]]

### Padding of sequences

In [7]:
max_sequence_len = max([len(x) for x in input_sequence])
input_sequence = pad_sequences(input_sequence,maxlen=max_sequence_len,padding='pre')

In [8]:
input_sequence

array([[ 0,  0,  0,  4,  2],
       [ 0,  0,  4,  2,  8],
       [ 0,  4,  2,  8,  1],
       [ 0,  0,  0,  4,  2],
       [ 0,  0,  4,  2,  3],
       [ 0,  4,  2,  3,  1],
       [ 4,  2,  3,  1,  5],
       [ 0,  0,  0,  9, 10],
       [ 0,  0,  0,  6, 11],
       [ 0,  0,  0,  6, 12],
       [ 0,  0,  0,  3, 13],
       [ 0,  0,  3, 13,  7],
       [ 0,  3, 13,  7, 14],
       [ 0,  0,  0,  2, 15],
       [ 0,  0,  2, 15, 16],
       [ 0,  2, 15, 16, 17],
       [ 0,  0,  0,  2,  3],
       [ 0,  0,  2,  3,  1],
       [ 0,  2,  3,  1,  5],
       [ 0,  0,  0,  7, 18],
       [ 0,  0,  7, 18, 19],
       [ 0,  7, 18, 19,  1],
       [ 0,  0,  0, 20,  1]])

In [9]:
input_sequence.shape

(23, 5)

## Split into features and labels

In [10]:
input_sequence = np.array(input_sequence)
X,y = input_sequence[:,:-1],input_sequence[:,-1]
y = keras.utils.to_categorical(y,num_classes=total_words)

In [11]:
y

array([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0.,

## Build the model

In [12]:
model  = Sequential()
model.add(Embedding(total_words,10,input_length=max_sequence_len-1))
model.add(LSTM(150,return_sequences=True))
model.add(LSTM(100))
model.add(Dense(total_words,activation='softmax'))




In [13]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 10)             210       
                                                                 
 lstm (LSTM)                 (None, 4, 150)            96600     
                                                                 
 lstm_1 (LSTM)               (None, 100)               100400    
                                                                 
 dense (Dense)               (None, 21)                2121      
                                                                 
Total params: 199331 (778.64 KB)
Trainable params: 199331 (778.64 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Compile the Model

In [14]:
model.compile(loss = 'binary_crossentropy',optimizer='adam',metrics=['accuracy'])




## Train the model


In [15]:
model.fit(X,y,epochs=400,batch_size=1)

Epoch 1/400


Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 25/400
Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400
Epoch 30/400
Epoch 31/400
Epoch 32/400
Epoch 33/400
Epoch 34/400
Epoch 35/400
Epoch 36/400
Epoch 37/400
Epoch 38/400
Epoch 39/400
Epoch 40/400
Epoch 41/400
Epoch 42/400
Epoch 43/400
Epoch 44/400
Epoch 45/400
Epoch 46/400
Epoch 47/400
Epoch 48/400
Epoch 49/400
Epoch 50/400
Epoch 51/400
Epoch 52/400
Epoch 53/400
Epoch 54/400
Epoch 55/400
Epoch 56/400
Epoch 57/400
Epoch 58/400
Epoch 59/400
Epoch 60/400
Epoch 61/400
Epoch 62/400
Epoch 63/400
Epoch 64/400
Epoch 65/400
Epoch 66/400
Epoch 67/400
Epoch 68/400
Epoch 69/400
Epoch 70/400
Epoch 71/400
Epoch 72/400
Epoch 73/400
Epoch 74/400
Epoch 75/400
Epoch 76/400
Epoch 77/400
Epoch 

<keras.src.callbacks.History at 0x1eb96340210>

## Prediction on new data

In [23]:
import tensorflow as tf

In [26]:
def predict_next_word(model,tokenizer,new,max_sequence_len):
    input_seq = tokenizer.texts_to_sequences([new])[0]
    input_seq = pad_sequences([input_seq], maxlen=max_sequence_len-1)
    predictions = model.predict(input_seq)
    predicted_word_idx = tf.argmax(predictions[0]).numpy()
    predicted_word = tokenizer.index_word[predicted_word_idx]
    return predicted_word

In [30]:
new = 'hello'

In [31]:
next_word = predict_next_word(model,tokenizer,new,max_sequence_len)
print(f'Next Word : {next_word}')

Next Word : how
