In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import re

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Concatenate, Activation, dot, Lambda, Reshape, Add
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import RMSprop
import tensorflow.keras.backend as K

In [52]:
data_dir = "C:\\IMP\\datasets\\bAbI_datasets\\tasks_1-20_v1-2\\en-10k"

def load_data(given_dir):
    with open("{}\\{}".format(data_dir, given_dir), encoding="utf8") as f:
        story = []
        question = []
        answer = []
        data = []
        for line in f:
                number, sentence = line.split(" ", 1)

                # New story
                if int(number) == 1:
                    story = []

                tokenized_story = re.findall(r"[A-Za-z]+|[,.?]", sentence.strip())

                # Answer and the supporting number is in the line both seperated by a tab.
                if "\t" in sentence:
                    question, answer, supporting_number = sentence.split("\t")
                    tokenized_question = re.findall(r"[A-Za-z]+|[,.?]", question.strip())
                    tokenized_answer = re.findall(r"[A-Za-z]+|[,.?]", answer.strip())
                    story_so_far = [[str(i)] + s for i, s in enumerate(story)]
                    data.append((story_so_far, tokenized_question, tokenized_answer))
                else:
                    story.append(tokenized_story)
        
        return data

df_train = load_data("qa2_two-supporting-facts_train.txt")
df_test = load_data("qa2_two-supporting-facts_test.txt")

In [53]:
def get_mappings(data):
    # Get the mappings
    word2idx = {"<PAD>": 0}

    count = 1
    for stories, question, answer in data:
        for story in stories:
            for word in story:
                if word not in word2idx:
                    word2idx[word] = count 
                    count += 1
        for word in question:
            if word not in word2idx:
                word2idx[word] = count 
                count += 1
        for word in answer:
            if word not in word2idx:
                word2idx[word] = count 
                count += 1

    idx2word = {v: k for k, v in word2idx.items()}
    
    return word2idx, idx2word

In [67]:
word2idx

{'<PAD>': 0,
 '0': 1,
 'Mary': 2,
 'moved': 3,
 'to': 4,
 'the': 5,
 'bathroom': 6,
 '.': 7,
 '1': 8,
 'Sandra': 9,
 'journeyed': 10,
 'bedroom': 11,
 '2': 12,
 'got': 13,
 'football': 14,
 'there': 15,
 '3': 16,
 'John': 17,
 'went': 18,
 'kitchen': 19,
 '4': 20,
 'back': 21,
 '5': 22,
 'garden': 23,
 'Where': 24,
 'is': 25,
 '?': 26,
 '6': 27,
 'office': 28,
 '7': 29,
 '8': 30,
 'hallway': 31,
 '9': 32,
 'Daniel': 33,
 '10': 34,
 'dropped': 35,
 '11': 36,
 'milk': 37,
 '12': 38,
 'took': 39,
 '13': 40,
 'picked': 41,
 'up': 42,
 'apple': 43,
 '14': 44,
 'travelled': 45,
 '15': 46,
 '16': 47,
 '17': 48,
 'left': 49,
 '18': 50,
 '19': 51,
 '20': 52,
 '21': 53,
 '22': 54,
 '23': 55,
 '24': 56,
 '25': 57,
 'grabbed': 58,
 'discarded': 59,
 'put': 60,
 'down': 61,
 '26': 62,
 '27': 63,
 '28': 64,
 '29': 65,
 '30': 66,
 '31': 67,
 '32': 68,
 '33': 69,
 '34': 70,
 '35': 71,
 '36': 72,
 '37': 73,
 '38': 74,
 '39': 75,
 '40': 76,
 '41': 77,
 '42': 78,
 '43': 79,
 '44': 80,
 '45': 81,
 '46': 8

In [54]:
all_data = df_train + df_test
word2idx, idx2word = get_mappings(all_data)

max_input_len = max([len(story) for s, q, a in all_data for story in s])
max_query_len = max([len(story) for s, q, a in all_data for story in s])
max_no_of_sentences_in_story = max([len(s) for s, q, a in all_data])

In [55]:
def encode_mappings(data, max_input_len, max_query_len):
    # Encode the mappings into the data
    inputs, queries, outputs = [], [], []
    for stories, question, answer in data:
        inputs.append([[word2idx[word] for word in story] for story in stories])
        queries.append([word2idx[word] for word in question])
        outputs.append([word2idx[word] for word in answer])

    # Pad sequences
    inputs = [pad_sequences(x, maxlen=max_input_len) for x in inputs]
    queries = pad_sequences(queries, maxlen=max_query_len)
    
    return inputs, queries, np.array(outputs)

inputs_train, queries_train, outputs_train = encode_mappings(df_train, max_input_len, max_query_len)
inputs_test, queries_test, outputs_test = encode_mappings(df_test, max_input_len, max_query_len)

In [56]:
def stack_inputs(inputs, max_input_len, max_no_of_sentences_in_story):
    """
    this is like 'pad_sequences' but for entire stories
    we are padding each story with zeros so every story
    has the same number of sentences
    append an array of zeros of size:
    (max_sentences - num sentences in story, max words in sentence)
    """
    for i, story in enumerate(inputs):
        inputs[i] = np.concatenate(
          [
            story, 
            np.zeros((max_no_of_sentences_in_story - story.shape[0], max_input_len), 'int')
          ]
        )
    return np.stack(inputs)

inputs_train = stack_inputs(inputs_train, max_input_len, max_no_of_sentences_in_story)
inputs_test = stack_inputs(inputs_test, max_input_len, max_no_of_sentences_in_story)

In [71]:
# Model parameters
EMBEDDING_DIM = 30
EPOCHS = 20
BATCH_SIZE = 32

vocab_size = len(word2idx) + 1

In [72]:
# Shapes
# embedded_story -> (vocab_size, max_no_of_sentences_in_story, EMBEDDING_DIM)
# embedded_question -> (vocab_size, 1, EMBEDDING_DIM)

# Model inputs
input_story = Input((max_no_of_sentences_in_story, max_input_len))
input_question = Input((max_query_len, ))

# Model
embedded_story = Embedding(vocab_size, EMBEDDING_DIM)(input_story)
embedded_story = Lambda(lambda x: K.sum(x, axis=2))(embedded_story)

embedded_question = Embedding(vocab_size, EMBEDDING_DIM)(input_question)
embedded_question = Lambda(lambda x: K.sum(x, axis=1))(embedded_question)

# Hop 1
embedded_question = Reshape((1, EMBEDDING_DIM))(embedded_question)
x = dot([embedded_story, embedded_question], 2)
x = Reshape((max_no_of_sentences_in_story, ))(x)  
x = Activation("softmax")(x)
story_weights1 = Reshape((max_no_of_sentences_in_story, 1))(x)
embedded_story = Embedding(vocab_size, EMBEDDING_DIM)(input_story)
embedded_story = Lambda(lambda x: K.sum(x, axis=2))(embedded_story)
x = dot([story_weights1, embedded_story], 1)
x = Reshape((EMBEDDING_DIM, ))(x)
x = Dense(EMBEDDING_DIM, activation="elu")(x)

# Hop 2
x = Reshape((1, EMBEDDING_DIM))(x)
x = dot([embedded_story, x], 2)
x = Reshape((max_no_of_sentences_in_story, ))(x)  
x = Activation("softmax")(x)
story_weights2 = Reshape((max_no_of_sentences_in_story, 1))(x)
embedded_story = Embedding(vocab_size, EMBEDDING_DIM)(input_story)
embedded_story = Lambda(lambda x: K.sum(x, axis=2))(embedded_story)
x = dot([story_weights2, embedded_story], 1)
x = Reshape((EMBEDDING_DIM, ))(x)
x = Dense(EMBEDDING_DIM, activation="elu")(x)

output = Dense(vocab_size, activation="softmax")(x)

model = Model([input_story, input_question], output)

model.compile(optimizer=RMSprop(lr=5e-3), loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [73]:
model.summary()

Model: "model_14"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_22 (InputLayer)           [(None, 8)]          0                                            
__________________________________________________________________________________________________
input_21 (InputLayer)           [(None, 88, 8)]      0                                            
__________________________________________________________________________________________________
embedding_41 (Embedding)        (None, 8, 30)        3750        input_22[0][0]                   
__________________________________________________________________________________________________
embedding_40 (Embedding)        (None, 88, 8, 30)    3750        input_21[0][0]                   
___________________________________________________________________________________________

In [74]:
result = model.fit([inputs_train, queries_train], outputs_train, epochs=EPOCHS, batch_size=BATCH_SIZE, 
                   validation_data=([inputs_test, queries_test], outputs_test))

Train on 10000 samples, validate on 1000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [78]:
# Check how we weight each input sentence given a story and question
weights_model = Model([input_story, input_question], [story_weights1, story_weights2])

# choose a random story
story_idx = np.random.choice(len(inputs_train))

# get weights from debug model
i = inputs_train[story_idx:story_idx+1]
q = queries_train[story_idx:story_idx+1]
weights1, weights2 = weights_model.predict([i, q])
weights1 = weights1.flatten()
weights2 = weights2.flatten()
idx = model.predict([i, q])

story, question, ans = df_train[story_idx]

print("Story:\n")
for i, line in enumerate(story):
    print("{:1.5f}".format(weights1[i]), "\t", "{:1.5f}".format(weights2[i]), "\t", " ".join(line))

print("Question:", " ".join(question))
print("Answer:", ans[0])
print("Prediction: ", idx2word[np.argmax(idx)])

Story:

0.00000 	 0.00000 	 0 Mary picked up the apple there .
0.00000 	 0.00000 	 1 John went back to the garden .
0.00000 	 0.00000 	 2 Mary discarded the apple .
0.00000 	 0.36606 	 3 Daniel travelled to the bathroom .
0.00000 	 0.00000 	 4 Daniel grabbed the apple there .
0.00000 	 0.00002 	 5 John went back to the bedroom .
0.00000 	 0.00002 	 6 John travelled to the office .
0.00000 	 0.00000 	 7 Sandra journeyed to the office .
0.00000 	 0.00000 	 8 John went back to the bedroom .
0.00000 	 0.00000 	 9 John journeyed to the bathroom .
0.00000 	 0.02029 	 10 Daniel journeyed to the office .
0.00000 	 0.00000 	 11 Mary journeyed to the bedroom .
0.00000 	 0.00000 	 12 Sandra went to the hallway .
0.00000 	 0.06675 	 13 Daniel moved to the garden .
0.00000 	 0.00000 	 14 Sandra journeyed to the garden .
0.00000 	 0.00000 	 15 Sandra went back to the kitchen .
0.00000 	 0.00001 	 16 John moved to the garden .
0.00000 	 0.00009 	 17 Mary travelled to the kitchen .
0.00000 	 0.00001 	