<a href="https://colab.research.google.com/github/Justabhi96/NLP/blob/master/12_Q%26A_Bot_with_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Link to the paper --> https://arxiv.org/pdf/1503.08895.pdf

In [0]:
import pickle
import numpy as np

In [0]:
with open("train_qa.txt", "rb") as f:
  train_data = pickle.load(f)
with open("test_qa.txt", "rb") as f:
  test_data = pickle.load(f)

len(train_data), len(test_data)

(10000, 1000)

In [0]:
train_data[0]

(['Mary',
  'moved',
  'to',
  'the',
  'bathroom',
  '.',
  'Sandra',
  'journeyed',
  'to',
  'the',
  'bedroom',
  '.'],
 ['Is', 'Sandra', 'in', 'the', 'hallway', '?'],
 'no')

###Create a Vocabulary

In [0]:
all_data = train_data + test_data

In [0]:
vocab = set()

for story, question, ans in all_data:
  vocab = vocab.union(set(story))
  vocab = vocab.union(set(question))

In [0]:
vocab.add("no")
vocab.add("yes")

vocab_len = len(vocab)+1 # for keras pad sequence to have a placeholder
vocab_len

38

In [0]:
# longest story
all_story_lens = [len(data[0]) for data in all_data]
max_story_len = max(all_story_lens)
max_story_len

156

In [0]:
# longest question
all_que_lens = [len(data[1]) for data in all_data]
max_que_len = max(all_que_lens)
max_que_len

6

###Vectorize the data

In [0]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [0]:
tokenizer = Tokenizer(filters = [])
tokenizer.fit_on_texts(vocab)

tokenizer.word_index

{'.': 36,
 '?': 37,
 'apple': 32,
 'back': 22,
 'bathroom': 10,
 'bedroom': 15,
 'daniel': 26,
 'discarded': 4,
 'down': 23,
 'dropped': 19,
 'football': 18,
 'garden': 31,
 'got': 8,
 'grabbed': 29,
 'hallway': 5,
 'in': 9,
 'is': 25,
 'john': 2,
 'journeyed': 11,
 'kitchen': 17,
 'left': 24,
 'mary': 33,
 'milk': 28,
 'moved': 3,
 'no': 12,
 'office': 13,
 'picked': 30,
 'put': 7,
 'sandra': 14,
 'the': 35,
 'there': 27,
 'to': 20,
 'took': 21,
 'travelled': 1,
 'up': 6,
 'went': 34,
 'yes': 16}

In [0]:
def vectorize_stories(data, word_index = tokenizer.word_index, max_story_len = max_story_len, 
                      max_que_len = max_que_len):
  # stories
  X = []
  # questions
  Xq = []
  # answers
  Y = []

  for story, que, ans in data:
    x = [word_index[word.lower()] for word in story]
    xq = [word_index[word.lower()] for word in que]

    y = np.zeros(len(word_index)+1)
    y[word_index[ans]] = 1

    X.append(x)
    Xq.append(xq)
    Y.append(y)

  return (pad_sequences(X, maxlen = max_story_len),
          pad_sequences(Xq, maxlen = max_que_len),
          np.array(Y))

In [0]:
inputs_story_train, inputs_que_train, inputs_ans_train = vectorize_stories(train_data)
inputs_story_test, inputs_que_test, inputs_ans_test = vectorize_stories(test_data)

In [0]:
inputs_story_train

array([[ 0,  0,  0, ..., 35, 15, 36],
       [ 0,  0,  0, ..., 35,  5, 36],
       [ 0,  0,  0, ..., 35, 10, 36],
       ...,
       [ 0,  0,  0, ..., 35, 15, 36],
       [ 0,  0,  0, ..., 28, 27, 36],
       [ 0,  0,  0, ..., 32, 27, 36]], dtype=int32)

###Build Network
  1. Input Encoder M
  2. Input Encoder C
  3. Question Encoder

####Complete the Network

In [0]:
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate, LSTM

In [0]:
# placeholder shape = (max_story_len, batch_size)

input_seq = Input((max_story_len,))
question = Input((max_que_len,))

In [0]:
vocab_size = vocab_len

In [0]:
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim = vocab_size, output_dim = 64))
input_encoder_m.add(Dropout(0.3))

# OUTPUT --> (samples, story_max_len, embedding_dim)

In [0]:
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim = vocab_size, output_dim = max_que_len))
input_encoder_c.add(Dropout(0.3))

# OUTPUT --> (samples, story_max_len, max_que_len)

In [0]:
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim = vocab_size, output_dim = 64, input_length = max_que_len))
question_encoder.add(Dropout(0.3))

# OUTPUT --> (samples, max_que_len, embedding_dim)

In [0]:
# encoded --> encoder(input)
input_encoded_m = input_encoder_m(input_seq)
input_encoded_c = input_encoder_c(input_seq)

question_encoded = question_encoder(question)

In [0]:
match = dot([input_encoded_m, question_encoded], axes = (2,2))
match = Activation("softmax")(match)

In [0]:
response = add([match, input_encoded_c])
response = Permute((2,1))(response)

In [34]:
answer = concatenate([response, question_encoded])
answer

<tf.Tensor 'concatenate_1/concat:0' shape=(?, 6, 220) dtype=float32>

In [0]:
answer = LSTM(32)(answer)
answer = Dropout(0.5)(answer)
answer = Dense(vocab_size)(answer)
answer = Activation("softmax")(answer)

In [38]:
model = Model([input_seq, question], answer)

model.compile(loss = "categorical_crossentropy", optimizer= "rmsprop", metrics = ["accuracy"])
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 156)          0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 6)            0                                            
__________________________________________________________________________________________________
sequential_3 (Sequential)       multiple             2432        input_3[0][0]                    
__________________________________________________________________________________________________
sequential_6 (Sequential)       (None, 6, 64)        2432        input_4[0][0]                    
____________________________________________________________________________________________

###Train Model

In [44]:
# I ran this code 3 times that's why accuracy is good on first epoch itself
# accuracy is as good as running with 30 epochs
r = model.fit([inputs_story_train, inputs_que_train], inputs_ans_train, batch_size = 32, epochs = 10,
              validation_data = ([inputs_story_test, inputs_que_test], inputs_ans_test))

Train on 10000 samples, validate on 1000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
model.save("my_chatbot_model.h5")

In [0]:
import random

def predict_on_random_test_data():
  random_index = random.randint(0, len(inputs_story_test))

  story = inputs_story_test[random_index]
  que = inputs_que_test[random_index]
  print("====================== Story ============================\n")
  print(" ".join([tokenizer.index_word.get(val) for val in list(story) if val > 0]))
  print("\n====================== Question ============================\n")
  print(" ".join([tokenizer.index_word.get(val) for val in list(que) if val > 0]))
  print("\n==================== Answer ==============================\n")
  sample_data = [[story], [que]]

  pred_data = model.predict((sample_data))
  pred_index = np.argmax(pred_data[0])
  for key, val in tokenizer.word_index.items():
    if val == pred_index:
      print(key)
      print("conf: ", pred_data[0][pred_index])
      break

In [104]:
predict_on_random_test_data()


mary picked up the apple there . mary moved to the kitchen . sandra went to the office . mary travelled to the garden .


is mary in the garden ?


yes
conf:  0.8302403


### Predict on manually created data

Remember we will be bound to only use those words which are already available in tokenizer vocab. i.e. On which words model is trained

In [0]:
def predict_on_created_data(my_story, my_que):
  my_story = my_story.split()
  my_que = my_que.split()
  
  print("====================== Story ============================\n")
  print(" ".join(my_story))
  print("\n====================== Question ============================\n")
  print(" ".join(my_que))
  print("\n==================== Answer ==============================\n")
  
  mydata = [(my_story, my_que, "yes")]
  my_story, my_que, my_ans = vectorize_stories(mydata)

  pred_data = model.predict(([my_story, my_que]))
  pred_index = np.argmax(pred_data[0])
  for key, val in tokenizer.word_index.items():
    if val == pred_index:
      print(key)
  print("conf: ", pred_data[0][pred_index])

In [97]:
my_story = "John left the kitchen . Sandra dropped the football in the garden ."

my_que = "Is the football in the garden ?"

predict_on_created_data(my_story, my_que)


John left the kitchen . Sandra dropped the football in the garden .


Is the football in the garden ?


yes
conf:  0.94340765


###We can load the better weights from already trained model

In [0]:
model.load_weights("chatbot_10.h5") 