In [1]:
import pickle
import numpy as np


In [3]:
with open('datasets/train_qa.txt','rb') as f:
    train_data = pickle.load(f)

In [4]:
with open('datasets/test_qa.txt','rb') as f:
    test_data = pickle.load(f)

In [5]:
type(train_data)

list

In [6]:
len(train_data)

10000

In [8]:
# story, question and answer
train_data[0]

(['Mary',
  'moved',
  'to',
  'the',
  'bathroom',
  '.',
  'Sandra',
  'journeyed',
  'to',
  'the',
  'bedroom',
  '.'],
 ['Is', 'Sandra', 'in', 'the', 'hallway', '?'],
 'no')

In [9]:
' '.join(train_data[0][0])

'Mary moved to the bathroom . Sandra journeyed to the bedroom .'

In [10]:
' '.join(train_data[0][1])

'Is Sandra in the hallway ?'

In [12]:
train_data[0][2]

'no'

In [13]:
#Create vocabulary with train and test data

In [14]:
all_data = test_data + train_data

In [15]:
len(all_data)

11000

In [16]:
vocab = set()

for story,question,answer in all_data:
    #story to unique words
    # adding unique word in the vocab
    vocab = vocab.union(set(story))
    vocab = vocab.union(set(question))

In [17]:
vocab.add('no')

In [18]:
vocab.add('yes')

In [20]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [21]:
vocab_len = len(vocab) + 1 # for keras pad sequece

In [22]:
vocab_len

38

In [23]:
#find the longest story and longest question

In [24]:
all_story_len =[len(data[0]) for data in all_data]

In [25]:
max_story_len = max(all_story_len)

In [26]:
max_story_len

156

In [27]:
max_question_len = max([len(data[1]) for data in all_data])

In [28]:
max_question_len

6

In [29]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [30]:
tokenizer = Tokenizer(filters=[])

In [32]:
tokenizer.fit_on_texts(vocab)

In [33]:
tokenizer.word_index

{'is': 1,
 'bedroom': 2,
 'put': 3,
 'mary': 4,
 'there': 5,
 'football': 6,
 'yes': 7,
 'took': 8,
 'office': 9,
 'back': 10,
 'sandra': 11,
 'down': 12,
 'travelled': 13,
 'grabbed': 14,
 'kitchen': 15,
 'no': 16,
 '?': 17,
 'journeyed': 18,
 'picked': 19,
 'left': 20,
 'milk': 21,
 'in': 22,
 'discarded': 23,
 'hallway': 24,
 'went': 25,
 'bathroom': 26,
 'garden': 27,
 'up': 28,
 'daniel': 29,
 '.': 30,
 'moved': 31,
 'to': 32,
 'dropped': 33,
 'the': 34,
 'apple': 35,
 'got': 36,
 'john': 37}

In [34]:
#vectorize the  qustion

train_story_text = []
train_question_text = []
train_answers = []

In [37]:
for story, question, answer in train_data:
    train_story_text.append(story)
    train_question_text.append(question)
    train_answers.append(answer)

In [38]:
train_story_seq = tokenizer.texts_to_sequences(train_story_text)

In [39]:
train_story_seq

[[4, 31, 32, 34, 26, 30, 11, 18, 32, 34, 2, 30],
 [4, 31, 32, 34, 26, 30, 11, 18, 32, 34, 2, 30],
 [4, 31, 32, 34, 26, 30, 11, 18, 32, 34, 2, 30],
 [4,
  31,
  32,
  34,
  26,
  30,
  11,
  18,
  32,
  34,
  2,
  30,
  4,
  25,
  10,
  32,
  34,
  2,
  30,
  29,
  25,
  10,
  32,
  34,
  24,
  30],
 [4,
  31,
  32,
  34,
  26,
  30,
  11,
  18,
  32,
  34,
  2,
  30,
  4,
  25,
  10,
  32,
  34,
  2,
  30,
  29,
  25,
  10,
  32,
  34,
  24,
  30,
  11,
  25,
  32,
  34,
  15,
  30,
  29,
  25,
  10,
  32,
  34,
  26,
  30],
 [4,
  31,
  32,
  34,
  26,
  30,
  11,
  18,
  32,
  34,
  2,
  30,
  4,
  25,
  10,
  32,
  34,
  2,
  30,
  29,
  25,
  10,
  32,
  34,
  24,
  30,
  11,
  25,
  32,
  34,
  15,
  30,
  29,
  25,
  10,
  32,
  34,
  26,
  30,
  29,
  19,
  28,
  34,
  6,
  5,
  30,
  29,
  25,
  32,
  34,
  2,
  30],
 [4,
  31,
  32,
  34,
  26,
  30,
  11,
  18,
  32,
  34,
  2,
  30,
  4,
  25,
  10,
  32,
  34,
  2,
  30,
  29,
  25,
  10,
  32,
  34,
  24,
  30,
  11,
  25,

In [45]:
def vectorize_stories(data, word_index=tokenizer.word_index, max_story_len=max_story_len, max_question_len=max_question_len):
    
    #Story  = X
    X = []
    
    #Questions = Xq
    Xq = []
    
    #Y Correct answer *yes/no
    Y = []
    
    for story,query,asnwer in data:
        
        # for each story
        x  = [word_index[word.lower()] for word in story]
        xq = [word_index[word.lower()] for word in query]
        
        
        y = np.zeros(len(word_index)+1)
        y[word_index[answer]] =1 
        
        X.append(x)
        Xq.append(xq)
        Y.append(y)  
        
        
    return (pad_sequences(X, maxlen=max_story_len), pad_sequences(Xq, maxlen=max_question_len), np.array(Y))   
        
    
    
    
    

In [46]:
inputs_train, queires_train, answers_train = vectorize_stories(train_data)

In [47]:
inputs_test, queires_test, answers_test = vectorize_stories(test_data)

In [50]:
from keras.models import Sequential, Model


In [51]:
from keras.layers.embeddings import Embedding

In [54]:
from keras.layers import Input,Activation,Dense,Permute,Dropout,add,dot,concatenate,LSTM

In [55]:
#Stories and Questions as input

In [56]:
input_sequence = Input((max_story_len,))
question = Input((max_question_len,))

In [57]:
vocab_size = len(vocab)+1

In [58]:
#INPUT ENCODER M
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_size, output_dim=64))
input_encoder_m.add(Dropout(0.5))




Instructions for updating:
Colocations handled automatically by placer.


In [59]:
#INPUT ENCODER C
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_size, output_dim=max_question_len))
input_encoder_c.add(Dropout(0.5))

In [61]:
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size, output_dim=64, input_length=max_question_len))
question_encoder.add(Dropout(0.5))

In [62]:
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

In [63]:
match = dot([input_encoded_m, question_encoded], axes=(2,2))
match = Activation('softmax')(match)

            

In [64]:
response = add([match,input_encoded_c])
response = Permute((2,1))(response)

In [65]:
answer = concatenate([response, question_encoded])

In [66]:
answer

<tf.Tensor 'concatenate_1/concat:0' shape=(?, 6, 220) dtype=float32>

In [67]:
answer = LSTM(32)(answer)

In [68]:
answer = Dropout(0.5)(answer)

In [69]:
answer = Dense(vocab_size)(answer)

In [70]:
answer = Activation('softmax')(answer)

In [73]:
model = Model([input_sequence, question], answer)

In [74]:
model.compile(optimizer='rmsprop', loss="categorical_crossentropy", metrics=['accuracy'])

In [75]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 156)          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 6)            0                                            
__________________________________________________________________________________________________
sequential_1 (Sequential)       multiple             2432        input_1[0][0]                    
__________________________________________________________________________________________________
sequential_4 (Sequential)       (None, 6, 64)        2432        input_2[0][0]                    
____________________________________________________________________________________________

In [78]:
history = model.fit([inputs_train, queires_train], answers_train, batch_size=32, epochs=20, validation_data=([inputs_test, queires_test], answers_test) )

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 10000 samples, validate on 1000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
