# Building the Question Answering System

In [1]:
import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from keras.models import Model
from keras.layers import Input, Dense, GRU, Masking, Lambda, Bidirectional, Dropout
from keras.preprocessing.sequence import pad_sequences
from keras import regularizers
from keras.optimizers import Adam
import keras.backend as K
import tensorflow as tf

from EpisodicMemoryModule import EpisodicMemoryModule

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Preprocessing

### Reading in Training and Dev Datasets
It should be noted that the most recent version of the Stanford Question Answering dataset (SQuAD 2.0) includes 'impossible questions' as denoted in the dataset's '*is_impossible*' column that was not part of the dataset in earlier versions. The reason for this (according to the SQuAD website) is to encourage research into Question Answering systems that can cope with un-answerable questions by not even attempting to answer them (by returning empty string outputs or else). Given the added complexity this brings to the project, you will see in the code below that I simply exclude 'impossible questions' in this project.

In [2]:
train_df_all=pd.read_json("C:/Users/Lukas/Desktop/training_data.json").reset_index(drop=True)
dev_df_all=pd.read_json("C:/Users/Lukas/Desktop/validation_data.json").reset_index(drop=True)

# Dataframes consisting only of 'possible' to answer questions.
train_df = train_df_all[train_df_all.is_impossible==False][["context", "question", "answer_text",
                                                    "answer_start", "title"]].reset_index(drop=True)

dev_df = dev_df_all[dev_df_all.is_impossible==False][["context", "question", "answer_text",
                                                    "answer_start", "title"]].reset_index(drop=True)

# Simple Processing/Adding answer_end
train_df.answer_start = train_df.answer_start.astype(int)
dev_df.answer_start = dev_df.answer_start.astype(int)

contexts, questions, answers, answer_start = (train_df.context.values,
                                              train_df.question.values, 
                                              train_df.answer_text.values,
                                              train_df.answer_start.values)
answer_end = np.array([answer_start[idx] + len(answers[idx]) for idx in range(len(answer_start))])

## Word Vectorization
To be able to use words, phrases, questions or other natural language constructs in our model we require a to provide our neural network a numerical representation of our words (as these are the elemental NLP 'particles'). The simplest implementation would be to use 'one hot encoding' and define each word as a vector the size of our dictionary (the number of unique words found in our collection of documents, our corpus). However, this approach will most likely be insufficient for the purposes of a question answering system. word2vec and GloVe are 2 popular choices sophisticated options for word embeddings that also capture word similarities. I will not go into the details of either architecture other than to say that we will not be re-training the word vectors due to the insufficient size of the dataset, and we will begin with the GloVe word embeddings due to it's superior performance in most 'downstream' modelling tasks. Having said that, given the simplicity of swapping word vector representations we will also test out performance with word2vec (providing we can do so in a time-efficient manner).

Info and download links for GloVe can be found at: https://nlp.stanford.edu/projects/glove/

### Word Vector Custom Function

In [3]:
def get_word_vector_dict(url_or_path):
    """ Input: url_or_path, a URL or a local path.
        Output: word_vector_dict, a dictionary of GloVe word vectors where words are the keys
        and the values are the word vectors with the dimension specified in the input file."""
    with open(url_or_path, encoding="utf8") as glove_text:
        word_embeddings = [line.split(" ") for line in glove_text.readlines()]
    word_vector_dict = {element[0]:list(map(float, element[1:])) for element in word_embeddings}
    
    return word_vector_dict

### Setting up the Word Vectors


In [4]:
word_vector_50_dict = get_word_vector_dict("C:/Users/Lukas/Desktop/glove.6B.50d.txt")
vocab = np.array(word_vector_50_dict.keys()) #400k words as per the documentation.
word_vector_100_dict = get_word_vector_dict("C:/Users/Lukas/Desktop/glove.6B.100d.txt")

### Model Hyperparameters

In [5]:
# Model Architecture Hypermarameters
word_vector_size = 50
hidden_units = 50
input_h_units = int(hidden_units/2)
num_memory_passes = 3
regularization_val = 1e-4
dropout = 0.1

### Tokenization, Embedding, Padding Custom Functions

In [6]:
def tokenize(string):
    """Takes a string (either context, question or answer) and returns the string as a list of tokens.
    Input: string, a str object.
    Output: a list of tokens, where each token is a substring."""
    tokens = [token.replace("``", '"').replace("''", '"').lower() for token in word_tokenize(string)]
    split_tokens = []
    
    for token in tokens:
        split_tokens.extend(re.split('(\W+)', token)) # split further
    
    return [token for token in split_tokens if token!=" " and token!=""]


def get_embedding(tokens):
    """Takes a tokenized context, question or answer and returns its vectorized/embedded form.
    Input: tokens, list of tokens of a string.
    Output: embedding, a numpy array of the vectorized/embedded representation of the string."""
    assert word_vector_size==50 or word_vector_size==100
    tokens = np.array(tokens)
    embedding=[]
    
    if word_vector_size==50:
        word_vector_dict=word_vector_50_dict
    else:
        word_vector_dict=word_vector_100_dict
    
    for token in tokens:
        if token in word_vector_dict.keys(): 
            embedding.extend(word_vector_dict[token])
        else:
            # Words with no embedding are assigned the 'unk' token vectorization (already in GloVe)
            embedding.extend(word_vector_dict["unk"])
    
    return np.array(embedding)


def get_sent_end_idx(context_tokenizations):
    """ Get indices of tokens that are '.' (sentence end tokens). For one or many contexts.
    Input: context_tokenizations, a list or numpy array of 1 or more tokenized contexts.
    Output: a numpy array of indices of sentence end tokens."""
    return np.array([np.where(np.array(context)==".") for context in context_tokenizations])


def get_padded_inputs(tokenized_inputs, string_type="context"):
    """ Takes a numpy array of tokenized inputs and returns embedding and padding.
    Input: input_array, a numpy array of question or answer tokenizations.
    Output: (embedded_input, padded_input) a tuple of numpy arrays."""
    assert isinstance(tokenized_inputs[0], list)==True # Assert multiple samples
    embedded_inputs = [get_embedding(tokenized_input) for tokenized_input in tokenized_inputs]
    
    if string_type=="context":
        padded_input = pad_sequences(embedded_inputs, max_context_len*word_vector_size, padding="post",
                                dtype="float32").reshape(len(tokenized_inputs), -1, word_vector_size)
    elif string_type=="question":
        padded_input = pad_sequences(embedded_inputs, max_question_len*word_vector_size, padding="post",
                                dtype="float32").reshape(len(tokenized_inputs), -1, word_vector_size)
    else:
        print("Incorrect string_type parameter value.")
    
    return padded_input


def get_answer_span(answer_start, answer_end):
    """ Returns one hot numpy matrices for the answer_start and answer_end indices.
       Input: answer_start, numpy array containing the 'answer start' index in the context
              answer_end, numpy array containing the 'answer end' index in the context
       Output: tuple of size 2, containing the one hot embeddings of the indices for each context"""
    y_answer_start, y_answer_end= ([] , [])
    start_arr, end_arr = (np.zeros(shape=(output_dim,), dtype=float),
                          np.zeros(shape=(output_dim,), dtype=float)) # Set the 0.0 arrays
  
    
    if isinstance(answer_start, np.int32) and isinstance(answer_end, np.int32): # Single sample case
        start_arr[answer_start]=1.0
        end_arr[answer_end]=1.0
        y_answer_start, y_answer_end = start_arr, end_arr 
   
    else:
        assert len(answer_start)==len(answer_end)
        for sample_idx in range(len(answer_start)):  # Multi sample case
            start_arr[answer_start[sample_idx]]=1.0
            end_arr[answer_end[sample_idx]]=1.0
        
            y_answer_start.append(start_arr)
            y_answer_end.append(end_arr)
        
            start_arr, end_arr = (np.zeros(shape=(output_dim,), dtype=float),
                          np.zeros(shape=(output_dim,), dtype=float)) # Reset the 0.0 arrays
        

    return (np.array(y_answer_start), np.array(y_answer_end))   

The nltk tokenizer generated around 110,00 unique tokens from our contexts, questions and answers in our dataset. ~31,000 of those tokens did not have pre-trained word vectorizations in the GloVe model. Some of these tokens were numbers, expressed as strings in an unfamiliar format, some of these tokens were misspelled words, some of these tokens were works in other languages, or symbols from other alphibets and so on. 
With the 'regex inspired' split in the tokenized function, we were able to reduce the number of words with no embeddings to around 16,000. To deal with the remaining words with no embeddings we assigned to them the embedding for the token *'unk'*, which by definition is the embedding for unknown words provided by GloVe. Thus any word/token that did not have an embedding got an *'unk'* embedding.

### Setting up 'Trans-Batch' Variables
All the computations needed for each batch training step will be computed later, the variables defined below have to take on values now as they are required in downstream calculations.

In [7]:
""" Calculating the 'trans-batch' variables. EXECUTE THESE BEFORE ANY CHANGES TO ANYTHING"""
tokenized_contexts = np.array([tokenize(context) for context in contexts])
tokenized_questions = np.array([tokenize(question) for question in questions])
tokenized_answers = np.array([tokenize(answer) for answer in answers])

# Calculating variables used within the model architecture
max_context_len = np.max([len(context) for context in tokenized_contexts])
max_question_len = np.max([len(question) for question in tokenized_questions])
max_answer_len = np.max([len(answer) for answer in tokenized_answers])

sent_end_indeces = get_sent_end_idx(tokenized_contexts)
output_dim = np.max([len(context) for context in contexts]) 

batch_iteration = 0

# Building the Dynamic Memory Network

## Defining Custom Functions

In [9]:
def get_facts(facts_output):
    """ Extracts the timesteps (facts) for each sample, then pads each tensor. Returns a 3D tensor.
        Input: facts_output, 3D tensor of all the timesteps/samples. Shape=(num samples, max # words, 50)
        Output: facts_tensor_list, list of facts for each sample. Each 2D tensor has different shape. """
    # The difficulty in this methods' definition is the fact that it is envoked within the
    # Dynamic Memory Module and represented as a layer via a 'Lambda Layer' in keras.
    # This has multiple complications with regards to how it has to be defined in a
    # supervised batch-training infrastrucure. Thus perhaps the awkward iterations and dtypes.
    
    facts_tensor_list = []
    start_idx = batch_iteration*batch_size
    
    if start_idx + batch_size <= num_of_samples:
        iter_values = batch_size
    else: 
        iter_values = num_of_samples - start_idx
    
    for sample_index in range(iter_values): #iterate over each sample in the batch
        facts = tf.nn.embedding_lookup(facts_output[sample_index],
                                       ids=sent_end_indeces[start_idx+sample_index][0])
        facts_tensor_list.append(facts)
    
    max_num_facts = np.max([facts_tensor.shape[0] for facts_tensor in facts_tensor_list])
    
    # Padding the tensors
    for idx in range(len(facts_tensor_list)):
        numpy_pad = np.zeros(shape=(max_num_facts-facts_tensor_list[idx].shape[0], hidden_units))
        pad = tf.Variable(numpy_pad, dtype=tf.float32)
        facts_tensor_list[idx] = K.concatenate([facts_tensor_list[idx], pad], axis=0)
    
    return tf.stack(facts_tensor_list) # shape (num samples, max_num_facts, hidden_units)


def get_batch_data(epoch, batch_size=batch_size, num_of_batches=num_of_batches): 
    """ A generators function that returns all necessary data for each batch iteration.
        Input: epoch, (int) integer representing the current epoch.
               batch_size and num_of_batches are self explanatory.
        Output: a list of 4 2-tuples representing the required data for batch training """
    for batch_idx in range(num_of_batches): # Do computation for each batch
        print("\nEpoch: %d/%d \tBatch iteration: %d/%d"
                                  % (epoch+1, num_of_epochs, batch_idx+1, num_of_batches) )
       
        start = batch_idx*batch_size  # Computing start and end indices of the slice.
        if (batch_idx+1)*batch_size < num_of_samples: 
            end = (batch_idx+1)*batch_size
        else: 
            end = num_of_samples-1
    
        yield [get_answer_span(answer_start[start:end], answer_end[start:end]), # This is the y.
               get_padded_inputs(tokenized_contexts[start:end], string_type="context"), 
               get_padded_inputs(tokenized_questions[start:end], string_type="question")]

## Input Module

- **What it does:** The Input Module iterates over each context (paragraph of a wikipedia article) and returns embedded representations (facts) of each sentence in the context (for each sample).
- **How it does this:** The Input Module uses a Bidirectional GRU that iterates over each word and returns the hidden state after each iteration. The GRU requires the input to be a 3D Tensor of shape (samples, timesteps, columns/features), and each sample must have the same (timesteps, columns/features) shape. However, given that our contexts are of different length (different amounts of words/timesteps), the embedded representations are padded (have 0.0s appended to each context embedding) as to meet the input requirements. As a technical side note, all embedded contexts are padded such that their length is equal to the length of the longest context in the whole sample (again, such that each individual sample has the same shape). To ensure that the GRU interprets the 0.0s as paddings, we mask the input (via the Masking layer). A Dropout Layer is added as a form of regularization. The *get_facts* method then returns the facts for each sample/context by extracting the hidden states/outputs of the GRU corresponding to 'end of sentence token' timesteps (exactly as described in the *'Ask Me Anything...'* paper.

In [10]:
context_input = Input(shape=(max_context_len, word_vector_size),
                         batch_shape=(batch_size, max_context_len, word_vector_size ),
                         name="ContextInput")

context_mask = Masking(mask_value=0.0, name="ContextMask")(context_input)

facts_output = Bidirectional(GRU(units=input_h_units, return_sequences=True, dropout=dropout,
                        kernel_regularizer=regularizers.l2(regularization_val),
                        recurrent_regularizer=regularizers.l2(regularization_val)),
                        merge_mode="concat", name="ContextBiGRU")(context_mask)

facts_tensors = Lambda(get_facts, name="FactTensorLambda")(facts_output)

facts_mask = Masking(mask_value=0.0, name="FactTensorMask")(facts_tensors)

## Question Module

- **What it does:** The Question Module (similarly to the input module) returns an embedded representation for each question. However, unlike the Input Module, the only output of the Question Module is the last hidden state of the GRU (the embedded representation). This again, is exactly the implementation defined in the beforementioned paper, but it makes sense, considering that all questions are 1 sentence long.
- **How it does it:** Similarly to the input module (spotting a trend here) the input has to be padded and masked since not all questions are of equal length. This time a 'vanilla' unidirectional GRU is used as there is only 1 sentence/sequence, and so we wouldn't benefit from a bibirectional architecture like in the Input Module. Given that the outputs of the GRU are exactly what we want, no further processing is required.

In [11]:
question_input = Input(shape=(max_question_len, word_vector_size),
                       batch_shape=(batch_size, max_question_len, word_vector_size), 
                       name="QuestionInput")

question_mask = Masking(mask_value=0.0, name="QuestionMask")(question_input)

question_output = GRU(units=hidden_units, dropout=dropout,
                      kernel_regularizer=regularizers.l2(regularization_val),
                      recurrent_regularizer=regularizers.l2(regularization_val),
                      name="QuestionGRU")(question_mask)


## Semantic Memory Module

- **What it does:** The task of the Episodic Memory Module (in simplified terms) is to take the Context and Question embeddings, returned by the Input and Question modules respectively, and a return the final memory representation to the Answer Module. There are many Attention Mechanisms and various implementation details so I will cover the details of the Episodic Memory Module used in this project in the final report.
- **How it does it:** Again, in simplified terms, this Module can be thought of to contain 2 'parts'. An *'Attention Mechanism'* and a *'Memory Update Mechanism'*. Given the 'facts' returned by the Input Module (embedded representation of each sentence in the context) and the Question Module output (which is also an embedded representation, but this time of a single sentence, the question), the Episodic Memory Module chooses which parts of the inputs (facts) to focus on through the attention mechanism. It then produces a ”memory” vector representation taking into account the question as well as the previous memory. Each iteration provides the module with newly relevant information about the input. This acts as a form of transitive reasoning, by effectively allowing the module to re-pay attention to parts of the context that it found to be relevant to answering the question. 

In [12]:
epm_output = EpisodicMemoryModule(units=hidden_units, batch_size=batch_size, emb_dim=word_vector_size,
            memory_steps=num_memory_passes)([facts_tensors, question_output])

epm_output = Dropout(dropout, name="FinalMemoryDropout")(epm_output) 

# Answer Module/Model Definition

- **What it does:** The final module of the Dynamic Memory Network is the Answer Module. The task of the Answer Module is to produce an answer given the output of the Semantic Memory Module. Given the nature of the DMN (being modular and thus generalisable to many different types of tasks) once again there are many implementations of the Answer Module, and once again I will spare the details for the report and only give a brief outline.
- **How it does it:** In this project we trained the network in a supervised setting. In the SQuAD dataset each answer is a span of the context meaning that the answer can be retrieved directly from the context as a continuous string. Thus the task of predicting the answer can be simplified to training the model to predict the start and end indices of the context, as this would allow us to directly extract the answer from the context. The objective/loss function is the sum of the categorical cross-entropy errors over the start/end index probability vectors (*start_idx_probs* and *end_idx_probs*.) 

In [13]:
start_idx_probs = Dense(units=output_dim, activation="softmax", name="StartIdxProbs",
                        kernel_regularizer=regularizers.l2(regularization_val))(epm_output) 

end_idx_probs = Dense(units=output_dim, activation="softmax", name="EndIdxProbs",
                      kernel_regularizer=regularizers.l2(regularization_val))(epm_output)

DMN_model = Model(inputs=[context_input, question_input], outputs=[start_idx_probs, end_idx_probs])

DMN_model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=learning_rate),
                      metrics=['accuracy', "categorical_accuracy"])

#print(DMN_model.summary())

# Training the Dynamic Memory Network
### Training Hyperparameters

In [None]:
num_of_epochs = 1
batch_size = 40
num_of_samples = len(list(dev_df.context.values))
num_of_batches = int(num_of_samples/batch_size)
learning_rate = 0.001
print(num_of_samples%40)

#num_of_batches = int(num_of_samples/batch_size) + 1 ONLY USE IF batch_size DOES NOT DIVIDE num_of_samples

In [14]:
history_objects = []
for epoch in range(num_of_epochs):
    all_batch_data = get_batch_data(epoch) 

    for batch_idx in range(num_of_batches):
        batch_data = all_batch_data.__next__() # Getting next value in the generator (next batch)
    
        y_answer_start, y_answer_end = batch_data[0]
        embedded_contexts, padded_contexts = batch_data[1]
        embedded_questions, padded_questions = batch_data[2]
    
        history = DMN_model.fit(x=[padded_contexts, padded_questions],
                                y=[y_answer_start, y_answer_end], batch_size=batch_size, validation_split=0.1)
        history_objects.append(history)
    
        batch_iteration+=1

    batch_iteration = 0 # Reset batch iteration counter for each new epoch


Epoch: 1/1 	Batch iteration: 1/148


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 36 samples, validate on 4 samples
Epoch 1/1


InvalidArgumentError: Incompatible shapes: [40] vs. [36]
	 [[Node: training/Adam/gradients/loss/EndIdxProbs_loss/mul_1_grad/BroadcastGradientArgs = BroadcastGradientArgs[T=DT_INT32, _class=["loc:@training/Adam/gradients/loss/EndIdxProbs_loss/mul_1_grad/Reshape"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](training/Adam/gradients/loss/EndIdxProbs_loss/mul_1_grad/Shape, training/Adam/gradients/loss/EndIdxProbs_loss/mul_1_grad/Shape_1)]]

In [None]:
print(len(history_objects))