In [1]:
import numpy as np 
from numpy import array
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN

In [2]:
# Toy training corpus: six "my favourite X is Y" sentences, joined with
# newlines so that the text can later be split back into one sentence per line.
data = '\n'.join([
    'My favourite color is black',
    'my favourite pet is dog',
    'my favourite food is upma',
    'my favourite flower is rose',
    'my favourite place is bengaluru',
    'my favourite outfit is formals',
])

In [3]:
# Lower-case the corpus and split it into a list with one sentence per entry.
#
# The isinstance guard keeps this cell re-runnable: after the first run `data`
# is already a list, and calling .lower() on a list raises AttributeError.
if isinstance(data, str):
    data = data.lower().split('\n')
data

['my favourite color is black',
 'my favourite pet is dog',
 'my favourite food is upma',
 'my favourite flower is rose',
 'my favourite place is bengaluru',
 'my favourite outfit is formals']

In [4]:
# Integer-encode the text: fit a tokenizer on the sentences so that every
# distinct word gets its own integer id.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)

# Word ids start at 1 (see the printed WordIndex), so one extra slot is
# added to make room for id 0.
vocab_size = 1 + len(tokenizer.word_index)

print("WordIndex:", tokenizer.word_index)
print("Vocabulary size:", vocab_size)

WordIndex: {'my': 1, 'favourite': 2, 'is': 3, 'color': 4, 'black': 5, 'pet': 6, 'dog': 7, 'food': 8, 'upma': 9, 'flower': 10, 'rose': 11, 'place': 12, 'bengaluru': 13, 'outfit': 14, 'formals': 15}
Vocabulary size: 16


In [5]:
# Expand every tokenised sentence into its growing prefixes of length >= 2:
# [w1, w2], [w1, w2, w3], ... up to the whole sentence. The last token of
# each prefix is later used as the prediction target.
input_sequences = [
    tokens[:end]
    for tokens in tokenizer.texts_to_sequences(data)
    for end in range(2, len(tokens) + 1)
]

In [6]:
# Inspect the variable-length prefix sequences generated from the sentences
input_sequences

[[1, 2],
 [1, 2, 4],
 [1, 2, 4, 3],
 [1, 2, 4, 3, 5],
 [1, 2],
 [1, 2, 6],
 [1, 2, 6, 3],
 [1, 2, 6, 3, 7],
 [1, 2],
 [1, 2, 8],
 [1, 2, 8, 3],
 [1, 2, 8, 3, 9],
 [1, 2],
 [1, 2, 10],
 [1, 2, 10, 3],
 [1, 2, 10, 3, 11],
 [1, 2],
 [1, 2, 12],
 [1, 2, 12, 3],
 [1, 2, 12, 3, 13],
 [1, 2],
 [1, 2, 14],
 [1, 2, 14, 3],
 [1, 2, 14, 3, 15]]

In [9]:
# Length of the longest prefix sequence; every sequence is padded up to it.
maxlen_sequence = max(len(seq) for seq in input_sequences)

# Left-pad ('pre') with zeros so all rows have the same length and the target
# word always sits in the last column. pad_sequences already returns a NumPy
# array, so no extra np.array() wrapper is needed.
input_sequences = pad_sequences(
    input_sequences,
    maxlen=maxlen_sequence,
    padding='pre',
)

In [10]:
# Inspect the padded sequences: rows now share one length, zero-filled on the left
input_sequences

array([[ 0,  0,  0,  1,  2],
       [ 0,  0,  1,  2,  4],
       [ 0,  1,  2,  4,  3],
       [ 1,  2,  4,  3,  5],
       [ 0,  0,  0,  1,  2],
       [ 0,  0,  1,  2,  6],
       [ 0,  1,  2,  6,  3],
       [ 1,  2,  6,  3,  7],
       [ 0,  0,  0,  1,  2],
       [ 0,  0,  1,  2,  8],
       [ 0,  1,  2,  8,  3],
       [ 1,  2,  8,  3,  9],
       [ 0,  0,  0,  1,  2],
       [ 0,  0,  1,  2, 10],
       [ 0,  1,  2, 10,  3],
       [ 1,  2, 10,  3, 11],
       [ 0,  0,  0,  1,  2],
       [ 0,  0,  1,  2, 12],
       [ 0,  1,  2, 12,  3],
       [ 1,  2, 12,  3, 13],
       [ 0,  0,  0,  1,  2],
       [ 0,  0,  1,  2, 14],
       [ 0,  1,  2, 14,  3],
       [ 1,  2, 14,  3, 15]])

In [11]:
# Split each padded row into features and label:
# everything but the last column is the context, the last column is the
# word to predict.
X = input_sequences[:, :-1]

# One-hot encode the target word ids over the full vocabulary
y = to_categorical(input_sequences[:, -1], num_classes=vocab_size)

In [23]:
# Inspect the one-hot targets: one row per training sequence, vocab_size columns
y

array([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,

In [12]:
# (number of training sequences, context length fed to the model)
X.shape

(24, 4)

In [13]:
# Model building: embedding -> SimpleRNN -> softmax over the vocabulary
model = Sequential()
# The context length comes from the training data rather than a hard-coded 4,
# so the model stays in sync if the corpus (and hence the padding) changes.
model.add(Embedding(vocab_size, 64, input_length=X.shape[1]))
model.add(SimpleRNN(20))  # recurrent layer with 20 units
model.add(Dense(vocab_size, activation='softmax'))  # one probability per word id

# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
model.summary()

# Compile network: targets are one-hot vectors, hence categorical cross-entropy
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit network; keep the History object so the loss/accuracy curves can be
# inspected later instead of leaking its repr as the cell output.
history = model.fit(X, y, epochs=100)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 64)             1024      
                                                                 
 simple_rnn (SimpleRNN)      (None, 20)                1700      
                                                                 
 dense (Dense)               (None, 16)                336       
                                                                 
Total params: 3,060
Trainable params: 3,060
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/1

Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x200c0f34a00>

In [20]:
# generate a sequence from the model
def generate_seq(model, tokenizer, enter_text, n_pred):
    """Extend ``enter_text`` with ``n_pred`` words predicted by ``model``.

    Parameters
    ----------
    model : trained Keras model; ``model.predict`` is called on a padded
        batch of one encoded sequence and must return one score per word id.
    tokenizer : fitted Keras ``Tokenizer`` used to encode the text and to map
        predicted ids back to words.
    enter_text : str, seed text to continue.
    n_pred : int, number of words to append.

    Returns
    -------
    str
        The seed text followed by the predicted words, space-separated.

    Notes
    -----
    Reads the notebook-level ``maxlen_sequence`` to determine the context
    length, so the padding cell must have been executed first.
    """
    context_len = maxlen_sequence - 1
    result = enter_text

    # generate a fixed number of words
    for _ in range(n_pred):
        # Encode the *whole* text generated so far. The previous version fed
        # only the last predicted word back in, which threw the context away
        # for every prediction after the first. pad_sequences keeps the most
        # recent `context_len` tokens (default truncating='pre') and left-pads
        # shorter inputs with zeros.
        encoded = tokenizer.texts_to_sequences([result])[0]
        encoded = pad_sequences([encoded], maxlen=context_len, padding='pre')

        # predict() returns one row of scores; take the best id as a plain int
        yhat = int(np.argmax(model.predict(encoded), axis=-1)[0])

        # Map the predicted id back to its word. Ids with no word (e.g. 0)
        # yield an empty string, as before.
        out_word = tokenizer.index_word.get(yhat, '')

        # append to the running text, which is also the next input
        result = result + ' ' + out_word
    return result

In [22]:
# Evaluate the model on a seed phrase:
# 'my favourite place is' --> input text to continue
# 1 --> number of words to generate

print(generate_seq(model, tokenizer, 'my favourite place is', 1))

my favourite place is bengaluru
