In [16]:
import numpy as np 
from numpy import array
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN

In [32]:
# Toy corpus: one short "my fav <thing> <value>" sentence per line.
data = "\n".join([
    "My fav color black",
    "my fav pet dog",
    "my fav food upma",
    "my fav flower rose",
    "my fav place bengaluru",
    "my fav outfit formals",
])

In [33]:
# Lowercase every character of the corpus, then echo it as the cell output.

data = str.lower(data)
data

'my fav color black\nmy fav pet dog\nmy fav food upma\nmy fav flower rose\nmy fav place bengaluru\nmy fav outfit formals'

In [34]:
# Build the word -> integer id vocabulary from the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# The corpus is passed as a single text, so exactly one id sequence comes back;
# unpack it into a flat list of word ids.
(encoded_data,) = tokenizer.texts_to_sequences([data])

print("Encoded data:", encoded_data)
print("Word_index:",tokenizer.word_index)

Encoded data: [1, 2, 3, 4, 1, 2, 5, 6, 1, 2, 7, 8, 1, 2, 9, 10, 1, 2, 11, 12, 1, 2, 13, 14]
Word_index: {'my': 1, 'fav': 2, 'color': 3, 'black': 4, 'pet': 5, 'dog': 6, 'food': 7, 'upma': 8, 'flower': 9, 'rose': 10, 'place': 11, 'bengaluru': 12, 'outfit': 13, 'formals': 14}


In [6]:
# Vocabulary size = number of distinct words plus one, because id 0 is reserved
# for padding and never assigned to a word.
vocab_size = 1 + len(tokenizer.word_index)

print('Vocabulary Size: %d' % (vocab_size,))

Vocabulary Size: 15


In [22]:
# Slide a 4-id window over the encoded corpus, one position at a time:
# the first 3 ids of each window are the context, the 4th is the word to predict.
sequences = [
    encoded_data[start:start + 4]
    for start in range(len(encoded_data) - 3)
]
print('Total Sequences: %d' % len(sequences))

Total Sequences: 21


In [23]:
# Inspect the sliding windows: each row holds 3 context word ids followed by the target word id.
sequences

[[1, 2, 3, 4],
 [2, 3, 4, 1],
 [3, 4, 1, 2],
 [4, 1, 2, 5],
 [1, 2, 5, 6],
 [2, 5, 6, 1],
 [5, 6, 1, 2],
 [6, 1, 2, 7],
 [1, 2, 7, 8],
 [2, 7, 8, 1],
 [7, 8, 1, 2],
 [8, 1, 2, 9],
 [1, 2, 9, 10],
 [2, 9, 10, 1],
 [9, 10, 1, 2],
 [10, 1, 2, 11],
 [1, 2, 11, 12],
 [2, 11, 12, 1],
 [11, 12, 1, 2],
 [12, 1, 2, 13],
 [1, 2, 13, 14]]

In [9]:
# Split every window into its context (all but the last column) and its label (the last column).
sequences = np.asarray(sequences)
X = sequences[:, :-1]
y = sequences[:, -1]

# Turn the integer labels into one-hot rows of width vocab_size.
y = to_categorical(y, num_classes=vocab_size)

In [10]:
# Sanity-check the input matrix: one row per training window, one column per context word.
X.shape

(21, 3)

In [11]:
# One-hot encode the targets, but only while y still holds integer labels (1-D).
# If y is already one-hot (2-D), encoding it again would append another
# vocab_size-wide axis to every row, and model.fit would then reject the shape.
# The guard also makes this cell safe to re-run.
if y.ndim == 1:
    y = to_categorical(y, num_classes=vocab_size)

# Preview the first five one-hot target rows.
y[:5]

array([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]],
      dtype=float32)

In [12]:
# Model building: Embedding -> SimpleRNN -> softmax over the vocabulary.

# Seed TensorFlow's global random generator before any layer is created so that
# weight initialisation does not vary from one fresh run to the next.
tf.random.set_seed(42)

model = Sequential()
# 64-dimensional embedding per word id; the context length is read from X
# instead of being hard-coded, so it follows the window size used to build X.
model.add(Embedding(vocab_size, 64, input_length=X.shape[1]))
# Recurrent layer with 20 units that summarises the context into one vector.
model.add(SimpleRNN(20))
# One softmax score per word id (including the reserved padding id 0).
model.add(Dense(vocab_size, activation='softmax'))

# summary() prints the table itself and returns None, so wrapping it in print()
# would emit a stray "None" line after the table.
model.summary()

# Compile network: targets are one-hot rows, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit network
model.fit(X, y, epochs=100)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 64)             960       
                                                                 
 simple_rnn (SimpleRNN)      (None, 20)                1700      
                                                                 
 dense (Dense)               (None, 15)                315       
                                                                 
Total params: 2,975
Trainable params: 2,975
Non-trainable params: 0
_________________________________________________________________
None


In [14]:
# generate a sequence from the model
def generate_seq(model, tokenizer, enter_text, n_pred, seq_len=3):
    """Extend ``enter_text`` with up to ``n_pred`` words predicted by ``model``.

    Each step feeds the model the most recent ``seq_len`` word ids (seed words
    plus words generated so far), takes the highest-scoring word id and appends
    the corresponding word to the result.

    Args:
        model: Object whose ``predict(batch)`` returns one row of per-word-id
            scores for each row of ``batch``.
        tokenizer: Object exposing ``texts_to_sequences(list_of_texts)`` and a
            ``word_index`` mapping of word -> integer id.
        enter_text: Seed text the generated words are appended to.
        n_pred: Number of words to generate.
        seq_len: Number of context ids passed to the model per prediction.
            Defaults to 3, the context length used elsewhere in this notebook.

    Returns:
        ``enter_text`` followed by the generated words, separated by single
        spaces. Generation stops early if the predicted id has no word in
        ``word_index`` (for example the padding id 0).

    Raises:
        ValueError: If ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        raise ValueError("seq_len must be at least 1")

    # Reverse lookup built once, instead of scanning word_index on every step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    # Rolling list of word ids: starts as the encoded seed text and grows with
    # every predicted word, so later steps still see the earlier context.
    context = list(tokenizer.texts_to_sequences([enter_text])[0])
    result = enter_text

    # generate a fixed number of words
    for _ in range(n_pred):
        # Keep only the newest seq_len ids; left-pad with 0 when fewer are
        # available so the model always receives a (1, seq_len) batch.
        window = context[-seq_len:]
        window = [0] * (seq_len - len(window)) + window
        encoded = np.array(window).reshape(1, seq_len)

        # predict a word in the vocabulary (single row -> single scalar id)
        yhat = int(np.argmax(model.predict(encoded), axis=-1)[0])

        # map predicted word index to word
        out_word = index_to_word.get(yhat, '')
        if not out_word:
            # No word for this id: nothing meaningful to append.
            break

        # append to the rolling context and to the output text
        context.append(yhat)
        result = result + ' ' + out_word
    return result

In [15]:
# Evaluation: seed the generator with the text 'my fav place'
# and ask for exactly one predicted word to be appended.

print(generate_seq(model, tokenizer, enter_text='my fav place', n_pred=1))

my fav place bengaluru
