# Language Modelling using RNN 

# i) Probability Prediction

## Importing necessary libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import SimpleRNN, Dense, Embedding

from keras.utils import to_categorical


## Preprocessing Data

In [None]:
# Toy corpus: one sentence per '\n'-separated line. Every sentence ends with
# a space-separated '.' so the full stop can be tokenized as its own word.
data = (
    " Jack and Jill went up the hill .\n"
    " To fetch a pail of water .\n"
    " Jack fell down and broke his crown .\n"
    " And Jill came tumbling after . "
)
print(data)

 Jack and Jill went up the hill .
 To fetch a pail of water .
 Jack fell down and broke his crown .
 And Jill came tumbling after . 


In [None]:
# One string per sentence: the corpus separates sentences with '\n'.
data_splitted = data.split('\n')
data_splitted

[' Jack and Jill went up the hill .',
 ' To fetch a pail of water .',
 ' Jack fell down and broke his crown .',
 ' And Jill came tumbling after . ']

Filter out punctuation except '.', because we don't want '.' to be removed by the tokenizer; it should be kept and treated as a token of its own.

In [None]:
# Keras' default filter set with only '.' taken out, so that '.' survives
# tokenization as a token. The tab and newline characters belong to the
# default set and are kept here so stray whitespace never ends up in a token.
tokenizer = Tokenizer(filters='!"#$%&()*+,-/:;<=>?@[\\]^_`{|}~\t\n')

# Initializing vocabulary
tokenizer.fit_on_texts(data_splitted)
print(tokenizer.word_index)

# Word indices start at 1; the extra slot is index 0, which this notebook
# uses both for padding and as the start token x<0>.
vocab_length = len(tokenizer.word_index) + 1
vocab_length

{'.': 1, 'and': 2, 'jack': 3, 'jill': 4, 'went': 5, 'up': 6, 'the': 7, 'hill': 8, 'to': 9, 'fetch': 10, 'a': 11, 'pail': 12, 'of': 13, 'water': 14, 'fell': 15, 'down': 16, 'broke': 17, 'his': 18, 'crown': 19, 'came': 20, 'tumbling': 21, 'after': 22}


23

**Converting text to numerical sequences based on word index**

In [None]:
# Replace every word by its integer index from tokenizer.word_index
# (one list of indices per sentence).
sequences = tokenizer.texts_to_sequences(data_splitted)
sequences

[[3, 2, 4, 5, 6, 7, 8, 1],
 [9, 10, 11, 12, 13, 14, 1],
 [3, 15, 16, 2, 17, 18, 19, 1],
 [2, 4, 20, 21, 22, 1]]

**Dropping the last word index from each input: nothing follows it, so there is no next word to predict from it (the targets keep the full sequence)**

In [None]:
# Inputs: every sentence without its final token (slicing already copies).
X = [seq[:-1] for seq in sequences]

# Targets: the complete sentences. Each list is copied so that later in-place
# edits of `y` cannot silently modify `sequences`.
y = [list(seq) for seq in sequences]

print(X)
print(y)

[[3, 2, 4, 5, 6, 7, 8], [9, 10, 11, 12, 13, 14], [3, 15, 16, 2, 17, 18, 19], [2, 4, 20, 21, 22]]
[[3, 2, 4, 5, 6, 7, 8, 1], [9, 10, 11, 12, 13, 14, 1], [3, 15, 16, 2, 17, 18, 19, 1], [2, 4, 20, 21, 22, 1]]


**Prepending 0 as the start token x<0> (and y<0>) to every input and output sequence**

In [None]:
# Prepend the start token 0 to every input and target sequence. New lists are
# built instead of inserting in place, so lists shared with other variables
# (e.g. `sequences`) are left untouched.
# NOTE: X and y are rebound here, so running this cell twice prepends twice.
X = [[0] + list(x) for x in X]
y = [[0] + list(op) for op in y]

# Only the last bare expression of a cell is displayed, so print both.
print(X)
print(y)

[[0, 3, 2, 4, 5, 6, 7, 8, 1],
 [0, 9, 10, 11, 12, 13, 14, 1],
 [0, 3, 15, 16, 2, 17, 18, 19, 1],
 [0, 2, 4, 20, 21, 22, 1]]

**Making every input of the same length**

In [None]:
# finding the max length of input sequences
max_len = 0; 
for x in X:
  max_len = max(max_len,len(x))

max_len

8

**Padding X with 0 to make length of all input same**

In [None]:
# Left-pad every input with zeros up to the common length max_len.
X = pad_sequences(X, maxlen=max_len, padding='pre')
X

array([[ 0,  3,  2,  4,  5,  6,  7,  8],
       [ 0,  0,  9, 10, 11, 12, 13, 14],
       [ 0,  3, 15, 16,  2, 17, 18, 19],
       [ 0,  0,  0,  2,  4, 20, 21, 22]], dtype=int32)

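**Padding y to the same length as X. Each target sequence is one element longer than its input, so the longest targets lose their leading 0 (`pad_sequences` truncates from the front by default); as a result y is X shifted one step ahead**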

In [None]:
# Bring the targets to the same length as the inputs. Sequences longer than
# max_len are cut from the front ('pre' is also the library default), which
# removes the leading 0 of the longest targets.
y = pad_sequences(y, maxlen=max_len, padding='pre', truncating='pre')
y

array([[ 3,  2,  4,  5,  6,  7,  8,  1],
       [ 0,  9, 10, 11, 12, 13, 14,  1],
       [ 3, 15, 16,  2, 17, 18, 19,  1],
       [ 0,  0,  2,  4, 20, 21, 22,  1]], dtype=int32)

**Converting to One-Hot Encoding**

In [None]:
# One-hot encode every target index over the vocabulary, giving one
# probability-style vector per timestep to train the softmax output against.
y = to_categorical(y, num_classes=vocab_length)

# Show only the first sentence; the full 3-D array is too large to read.
y[0]

array([[[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.]],

       [[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],

In [None]:
# (number of sentences, timesteps, vocabulary size)
y.shape

(4, 8, 23)

## Building the Model

In [None]:
# Word indices -> 10-d embeddings -> 50-unit recurrent layer that emits a
# state at every timestep -> softmax over the vocabulary at every timestep.
model = Sequential([
    Embedding(input_dim=vocab_length, output_dim=10),
    SimpleRNN(50, return_sequences=True),
    Dense(units=vocab_length, activation='softmax'),
])

model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 10)          230       
_________________________________________________________________
simple_rnn_4 (SimpleRNN)     (None, None, 50)          3050      
_________________________________________________________________
dense_4 (Dense)              (None, None, 23)          1173      
Total params: 4,453
Trainable params: 4,453
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Categorical cross-entropy matches the one-hot targets built above.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Keep the History object instead of letting its repr become the cell output,
# and train silently so 200 progress lines do not bury the notebook.
history = model.fit(X, y, epochs=200, verbose=0)

# Final-epoch value of every metric Keras tracked during training.
{name: values[-1] for name, values in history.history.items()}

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7fd825bed9d0>

## Function for predicting probability of a Sentence

In [None]:
def prob_of_sentence(model,tokenizer,sentence):
  """Print and return the probability the model assigns to `sentence`.

  The sentence is converted to word indices, prefixed with the start token 0
  (x<0>) and run through the model once. The output at timestep t is read as
  the distribution of the token at position t+1, so the result is the product
  of the probabilities of every token given the tokens before it.

  Args:
    model: object whose `predict(batch, verbose=0)` returns an array of shape
      (1, timesteps, vocabulary size) holding next-token probabilities.
    tokenizer: object providing `texts_to_sequences`; only the indices it
      returns for `sentence` are scored.
    sentence: the text to score.

  Returns:
    The sentence probability as a Python float (1.0 when the tokenizer
    returns no indices for the sentence).
  """

  # converting sentence into numerical form
  encoded_sentence = tokenizer.texts_to_sequences([sentence])[0]
  print(encoded_sentence)

  # adding 0 as X<0>
  encoded_sentence = [0] + list(encoded_sentence)
  print(encoded_sentence)

  encoded_sentence = np.array(encoded_sentence).reshape((1,-1))
  print(encoded_sentence)

  # `predict` already returns the softmax probabilities; `predict_proba` no
  # longer exists on recent Keras models.
  prob = model.predict(encoded_sentence, verbose=0)
  print(prob.shape)

  # The output at the last timestep predicts a token beyond the sentence, so
  # it is not part of the product.
  probability = 1.0
  for i in range(prob.shape[1] - 1):
    probability *= float(prob[0, i, encoded_sentence[0, i+1]])
  print(probability)

  return probability

In [None]:
# Score a short sentence with the trained model.
prob_of_sentence(model,tokenizer,"Jack and Jill .")

[3, 2, 4, 1]
[0, 3, 2, 4, 1]
[[0 3 2 4 1]]
(1, 5, 23)
1.8183255523435641e-06




# ii) Sentence Generation

## Random sentence generation without seed

In [None]:
class color:
  """ANSI escape sequences for styling text printed to a terminal or cell output."""

  # Foreground colours
  PURPLE = '\033[95m'
  CYAN = '\033[96m'
  DARKCYAN = '\033[36m'
  BLUE = '\033[94m'
  GREEN = '\033[92m'
  YELLOW = '\033[93m'
  RED = '\033[91m'

  # Text attributes
  BOLD = '\033[1m'
  UNDERLINE = '\033[4m'

  # Resets every colour and attribute set before it
  END = '\033[0m'

In [None]:
def sample_all_wo_seed(model,tokenizer,n_words,vocab_length):
  """Generate `n_words` words by sampling from the model, starting from nothing.

  At every step the text generated so far is re-encoded, prefixed with the
  start token 0 (x<0>) and fed to the model; the next word is drawn at random
  from the distribution the model outputs at the last timestep. Index 0 is
  never produced: its probability is set to zero and the rest renormalised,
  which yields the same distribution as redrawing until a non-zero index
  comes up, without the risk of looping for a long time.

  Args:
    model: object whose `predict(batch, verbose=0)` returns an array of shape
      (1, timesteps, vocab_length) holding next-token probabilities.
    tokenizer: object providing `texts_to_sequences` and `word_index`.
    n_words: number of words to generate.
    vocab_length: number of classes the model predicts (including index 0).

  Returns:
    The generated words joined by spaces, with a trailing space ('' when
    `n_words` is 0).

  Raises:
    ValueError: if the model puts all probability mass on index 0, so no
      word can be sampled.
  """
  # Reverse lookup built once instead of scanning word_index at every step.
  index_to_word = {index: word for word, index in tokenizer.word_index.items()}
  sampled = []
  inp_text = ''

  for i in range(n_words):
    print('-'*50)
    print('Input text : ', inp_text)

    # converting sentence into numerical form, with 0 prepended as X<0>
    encoded_sentence = [0] + list(tokenizer.texts_to_sequences([inp_text])[0])
    encoded_sentence = np.array(encoded_sentence).reshape((1,-1))
    print("For i : {} Encoded is : {}".format(i, encoded_sentence))

    # `predict` already returns the softmax probabilities; `predict_proba` no
    # longer exists on recent Keras models.
    prob = model.predict(encoded_sentence, verbose= 0)
    if i > 0:
      print(prob.shape)

    # The last timestep holds the distribution of the next word. Work on a
    # float64 copy: float32 outputs can miss np.random.choice's sum-to-1 check.
    next_word_prob = np.array(prob[0][-1], dtype=np.float64).ravel()
    next_word_prob[0] = 0.0
    total = next_word_prob.sum()
    if total <= 0:
      raise ValueError("model assigns no probability to any word other than index 0")
    next_word_prob /= total

    sampled.append(int(np.random.choice(vocab_length, p=next_word_prob)))

    # Same trace as before: the first step was labelled 'if', later ones 'else'.
    y_hat = np.array(sampled).reshape((1,-1))
    branch = 'if' if i == 0 else 'else'
    print("For i : {} yhat in {} is : {}".format(i, branch, y_hat))

    inp_text += index_to_word.get(sampled[-1], "") + ' '

    print('-'*50)

  return inp_text

In [None]:
# Sample a 3-word sentence and print it in bold.
print('\n\n' + color.BOLD + sample_all_wo_seed(model,tokenizer,3,vocab_length) + color.END)

--------------------------------------------------
Input text :  
For i : 0 Encoded is : [[0]]
For i : 0 yhat in if is : [[3]]
--------------------------------------------------
--------------------------------------------------
Input text :  jack 
For i : 1 Encoded is : [[0 3]]
(1, 2, 23)
For i : 1 yhat in else is : [[3 2]]
--------------------------------------------------
--------------------------------------------------
Input text :  jack and 
For i : 2 Encoded is : [[0 3 2]]
(1, 3, 23)
For i : 2 yhat in else is : [[ 3  2 16]]
--------------------------------------------------


[1mjack and down [0m




## Sentence generation with highest probability

In [None]:
def sample_all_wo_seed_with_hp(model,tokenizer,n_words,vocab_length):
  """Generate `n_words` words, picking the most probable word after the first.

  The first word is drawn at random from the model's distribution for the
  start token 0 (x<0>), so repeated calls can produce different sentences.
  Every following word is the one with the highest probability at the last
  timestep. Index 0 is never produced in either case.

  Args:
    model: object whose `predict(batch, verbose=0)` returns an array of shape
      (1, timesteps, vocab_length) holding next-token probabilities.
    tokenizer: object providing `texts_to_sequences` and `word_index`.
    n_words: number of words to generate.
    vocab_length: number of classes the model predicts (including index 0).

  Returns:
    The generated words joined by spaces, with a trailing space ('' when
    `n_words` is 0).

  Raises:
    ValueError: if the first word cannot be sampled because the model puts
      all probability mass on index 0.
  """
  # Reverse lookup built once instead of scanning word_index at every step.
  index_to_word = {index: word for word, index in tokenizer.word_index.items()}
  chosen = []
  inp_text = ''

  for i in range(n_words):
    print('-'*50)
    print('Input text : ', inp_text)

    # converting sentence into numerical form, with 0 prepended as X<0>
    encoded_sentence = [0] + list(tokenizer.texts_to_sequences([inp_text])[0])
    encoded_sentence = np.array(encoded_sentence).reshape((1,-1))
    print("For i : {} Encoded is : {}".format(i, encoded_sentence))

    # `predict` already returns the softmax probabilities; `predict_proba` no
    # longer exists on recent Keras models.
    prob = model.predict(encoded_sentence, verbose= 0)

    # The last timestep holds the distribution of the next word (float64 copy).
    next_word_prob = np.array(prob[0][-1], dtype=np.float64).ravel()

    if i == 0:
      # Random first word: drop index 0 and renormalise before sampling.
      next_word_prob[0] = 0.0
      total = next_word_prob.sum()
      if total <= 0:
        raise ValueError("model assigns no probability to any word other than index 0")
      next_index = int(np.random.choice(vocab_length, p=next_word_prob / total))
      chosen.append(next_index)
      y_hat = np.array(chosen).reshape((1,-1))
      print("For i : {} yhat in if is : {}".format(i, y_hat))

    else:
      print(prob.shape)
      # Index 0 is excluded by slicing it off, which shifts every position
      # down by one; the +1 maps the argmax back to the real word index.
      next_index = int(np.argmax(next_word_prob[1:])) + 1
      chosen.append(next_index)
      y_hat = np.array(chosen).reshape((1,-1))
      print("For i : {} yhat in else is : {}".format(i, y_hat))

    inp_text += index_to_word.get(next_index, "") + ' '

    print('-'*50)

  return inp_text

In [None]:
# Generate a 5-word sentence and print it in bold.
print('\n\n' + color.BOLD + sample_all_wo_seed_with_hp(model,tokenizer,5,vocab_length) + color.END)

--------------------------------------------------
Input text :  
For i : 0 Encoded is : [[0]]
For i : 0 yhat in if is : [[3]]
--------------------------------------------------
--------------------------------------------------
Input text :  jack 
For i : 1 Encoded is : [[0 3]]
(1, 2, 23)
For i : 1 yhat in else is : [[ 3 14]]
--------------------------------------------------
--------------------------------------------------
Input text :  jack water 
For i : 2 Encoded is : [[ 0  3 14]]
(1, 3, 23)
For i : 2 yhat in else is : [[ 3 14  3]]
--------------------------------------------------
--------------------------------------------------
Input text :  jack water jack 
For i : 3 Encoded is : [[ 0  3 14  3]]




(1, 4, 23)
For i : 3 yhat in else is : [[ 3 14  3  4]]
--------------------------------------------------
--------------------------------------------------
Input text :  jack water jack jill 
For i : 4 Encoded is : [[ 0  3 14  3  4]]
(1, 5, 23)
For i : 4 yhat in else is : [[ 3 14  3  4  5]]
--------------------------------------------------


[1mjack water jack jill went [0m
