# Building the Question Answering System

In [1]:
from collections import defaultdict
import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from keras.models import Model
from keras.layers import Input, Dense, GRU, Masking, Lambda, Bidirectional, Dropout
from keras.preprocessing.sequence import pad_sequences
import keras.backend as K
import tensorflow as tf

from EpisodicMemoryModule import EpisodicMemoryModule

Using TensorFlow backend.


# Preprocessing

#### Hyperparameter/TO DO Notes
- Word vector size (50,100,200,300)
- Still need to find a way to feed all of the sentences into the input module

### Reading in Training and Dev Datasets

In [2]:
# NOTE(review): absolute, machine-specific folder -- point DATA_DIR at your own copy of the data.
DATA_DIR = "C:/Users/Lukas Buteliauskas/Desktop"

# Columns kept for the 'possible to answer' subsets.
ANSWERABLE_COLUMNS = ["context", "question", "answer_text", "answer_start", "title"]


def _load_split(file_name):
    """Reads one dataset split from DATA_DIR.
    Input: file_name, name of the JSON file inside DATA_DIR.
    Output: (all_rows, answerable_rows), the full dataframe and the dataframe restricted to rows whose
    is_impossible flag is False and to ANSWERABLE_COLUMNS. Both have a fresh 0..n-1 index."""
    all_rows = pd.read_json(DATA_DIR + "/" + file_name).reset_index(drop=True)
    # A single .loc call selects rows and columns together (no chained indexing).
    answerable_rows = all_rows.loc[all_rows.is_impossible == False, ANSWERABLE_COLUMNS]
    return all_rows, answerable_rows.reset_index(drop=True)


# Dataframes consisting only of 'possible' to answer questions sit next to the full ones.
train_df_all, train_df = _load_split("training_data.json")
dev_df_all, dev_df = _load_split("validation_data.json")

## Word Vectorization
To be able to use words, phrases, questions or other natural language constructs in our model, we need to provide our neural network with a numerical representation of our words (as these are the elemental NLP 'particles'). The simplest implementation would be to use 'one hot encoding' and define each word as a vector the size of our dictionary (the number of unique words found in our collection of documents, our corpus). However, this approach will most likely be insufficient for the purposes of a question answering system. word2vec and GloVe are two popular, more sophisticated options for word embeddings that also capture word similarities. I will not go into the details of either architecture other than to say that we will not be re-training the word vectors due to the insufficient size of the dataset, and we will begin with the GloVe word embeddings due to their superior performance in most 'downstream' modelling tasks. Having said that, given the simplicity of swapping word vector representations, we will also test out performance with word2vec (provided we can do so in a time-efficient manner).

Info and download links for GloVe can be found at: https://nlp.stanford.edu/projects/glove/

### Word Vector Custom Function

In [3]:
def get_word_vector_dict(url_or_path):
    """Loads GloVe word vectors from a text file into a dictionary.
    Each non-empty line is expected to hold a word followed by its vector components, separated by single spaces.
    Input: url_or_path, path of the GloVe text file. It is handed straight to open(), so despite the
           parameter name only local paths work.
    Output: word_vector_dict, a dict where the key is the word and the value is its vector (a list of floats)
            with the dimension found in the input file."""
    word_vector_dict = {}

    with open(url_or_path, encoding="utf8") as glove_text:
        # Stream the file line by line: the 400k-line GloVe files are large, and reading them with
        # readlines() plus an intermediate list keeps several copies in memory at once.
        for line in glove_text:
            fields = line.rstrip("\n").split(" ")
            if fields == [""]:
                # Blank line (e.g. at the end of the file): nothing to store.
                continue
            word_vector_dict[fields[0]] = list(map(float, fields[1:]))

    return word_vector_dict

### Setting up the Word Vectors
As mentioned above with regards to which model we use for the word vectors, it's important to note that the dimension of the word vectors is a hyperparameter of the Neural Networks to come, so to keep our options open we imported a few different word vector representations, and the custom functions defined above make this a 'one line of code' affair (dictionary or dataframe).


In [4]:
# GloVe vectors in two sizes; word_vector_50_dict is also the default lookup table of get_embedding.
word_vector_50_dict = get_word_vector_dict("C:/Users/Lukas Buteliauskas/Desktop/glove.6B.50d.txt")
# dict.keys() is a view object: np.array() on it yields a 0-d object array wrapping the view,
# so the keys are materialised as a list first to get a 1-D array of words.
vocab = np.array(list(word_vector_50_dict)) #400k words as per the documentation.
word_vector_100_dict = get_word_vector_dict("C:/Users/Lukas Buteliauskas/Desktop/glove.6B.100d.txt")

# Larger vector sizes, currently disabled:
#word_vector_200_dict=get_word_vector_dict("C:/Users/Lukas Buteliauskas/Desktop/glove.6B.200d.txt")
#word_vector_300_dict=get_word_vector_dict("C:/Users/Lukas Buteliauskas/Desktop/glove.6B.300d.txt")

### Defining Hyperparameters

In [5]:
# Sizes of the embeddings and of the recurrent layers.
# NOTE(review): get_facts pads the BiGRU outputs with blocks that are hidden_units wide, so hidden_units
# presumably has to stay equal to 2 * input_h_units (the two directions are concatenated) -- confirm before tuning.
word_vector_size, hidden_units, input_h_units = 50, 10, 5

# Episodic memory passes and the number of samples processed by get_facts.
num_memory_passes, num_of_samples = 3, 10

# Rate shared by every Dropout layer.
dropout = 0.2

# Training schedule.
num_of_epochs, batch_size = 100, 100
# add Regularization, batch size arguments (perhaps)

### Tokenization, Embedding, Padding Custom Functions

In [6]:
def tokenize(string):
    """Takes a string (either context, question or answer) and returns a list of lower-cased tokens.
    The string is first split by nltk's word_tokenize; every token is then split again around runs of
    non-word characters (the separators are kept as tokens of their own).
    Input: string, a str object.
    Output: a list of tokens, where each token is a substring. Empty strings and single spaces are dropped."""
    split_tokens = []

    for token in word_tokenize(string):
        # Map the `` and '' quote forms to a plain double quote before lower-casing.
        token = token.replace("``", '"').replace("''", '"').lower()
        # Raw string: '\W' in a normal string literal is an invalid escape sequence and triggers a warning
        # on current Python versions. The capturing group keeps the separators in the result.
        split_tokens.extend(re.split(r'(\W+)', token))

    return [token for token in split_tokens if token!=" " and token!=""]


def get_embedding(tokens, word_vector_dict=word_vector_50_dict):
    """Turns a tokenized context, question or answer into one flat vector.
    Input: tokens, list of tokens of a string. word_vector_dict, dict mapping a token to its word vector.
    Output: a 1-D numpy array holding the word vectors of all tokens, one after the other."""
    flat_vector = []

    for word in np.array(tokens):
        # Words with no embedding fall back to the vector stored under 'unk' (already in GloVe).
        lookup_key = word if word in word_vector_dict else "unk"
        flat_vector.extend(word_vector_dict[lookup_key])

    return np.array(flat_vector)


def get_sent_end_idx(context_tokenizations):
    """ Get indices of tokens that are '.' (sentence end tokens). For one or many contexts.
    Input: context_tokenizations, a list or numpy array of 1 or more tokenized contexts.
    Output: a 1-D numpy object array with one entry per context. Each entry is the tuple returned by
            np.where, so result[i][0] is the integer array of '.' positions in context i."""
    # Contexts rarely contain the same number of sentences. Wrapping the ragged np.where results in
    # np.array(...) raises a ValueError on recent NumPy versions, so the object array is filled by hand.
    sent_end_idx = np.empty(len(context_tokenizations), dtype=object)

    for position, context in enumerate(context_tokenizations):
        # dtype=object keeps the comparison element-wise even for an empty context.
        sent_end_idx[position] = np.where(np.asarray(context, dtype=object) == ".")

    return sent_end_idx


def compute_everything(input_array):
    """ Tokenizes, embeds and pads a collection of strings in one go.
    Input: input_array, a numpy array of context, question or answer strings.
    Output: (tokenized_input, embedded_input, padded_input, max_input_len), i.e. the token lists, the flat
            embedding of each string, the zero-padded embeddings reshaped to (len(input_array), -1, word_vector_size)
            and the token count of the longest string."""
    token_lists = [tokenize(text) for text in input_array]
    longest = np.max([len(token_list) for token_list in token_lists])
    embeddings = [get_embedding(token_list) for token_list in token_lists]

    # Each embedding is one flat vector, so padding happens at length longest*word_vector_size and the
    # result is folded back afterwards into one row per token.
    flat_padded = pad_sequences(embeddings, longest*word_vector_size, padding="post", dtype="float32")
    padded = flat_padded.reshape(len(input_array), -1, word_vector_size)

    return (token_lists, embeddings, padded, longest)

The nltk tokenizer generated around 110,000 unique tokens from the contexts, questions and answers in our dataset. ~31,000 of those tokens did not have pre-trained word vectorizations in the GloVe model. Some of these tokens were numbers expressed as strings in an unfamiliar format, some were misspelled words, some were words in other languages or symbols from other alphabets, and so on. 
With the 'regex inspired' split in the *tokenize* function, we were able to reduce the number of words with no embeddings to around 16,000. To deal with the remaining words with no embeddings we assigned to them the embedding for the token *'unk'*, which by definition is the embedding for unknown words provided by GloVe. Thus any word/token that did not have an embedding got an *'unk'* embedding.

### Tokenizing, Embedding and Padding Contexts, Questions and Answers
In this section we separate out the contexts, questions and answers, we embed all our words into 
vector representations and pad the sequences to fulfil Keras' input requirements.

#### Training Set

In [7]:
"""contexts, questions, answers = (train_df.context.values, train_df.question.values, train_df.answer_text.values)
unique_contexts = train_df.context.unique()

# Context Stuff (we embed and pad unique contexts and not all, due to memory limits)
contexts_tokenized = np.array([tokenize(context) for context in contexts])
unique_contexts_tokenized = np.array([tokenize(context) for context in unique_contexts])
max_context_len = np.max([len(context) for context in unique_contexts_tokenized])
unique_embedded_contexts = [get_embedding(tokenized_context) for tokenized_context in unique_contexts_tokenized] 
unique_padded_contexts = pad_sequences(unique_embedded_contexts, max_context_len*word_vector_size,
                            padding="post", dtype="f4").reshape(len(unique_contexts), -1, word_vector_size)

# Questions Stuff
max_question_len, padded_questions = get_questions_stuff(questions)

# Answers Stuff
tokenized_answers, embedded_answers, max_question_len = get_answer_stuff(answers)

# Other useful variables
sent_end_indeces = get_sent_end_idx(contexts_tokenized)
"""#padded_contexts_full=get_padded_contexts(train_df.index) Still can't fix the memory issue""" 

'contexts, questions, answers = (train_df.context.values, train_df.question.values, train_df.answer_text.values)\nunique_contexts = train_df.context.unique()\n\n# Context Stuff (we embed and pad unique contexts and not all, due to memory limits)\ncontexts_tokenized = np.array([tokenize(context) for context in contexts])\nunique_contexts_tokenized = np.array([tokenize(context) for context in unique_contexts])\nmax_context_len = np.max([len(context) for context in unique_contexts_tokenized])\nunique_embedded_contexts = [get_embedding(tokenized_context) for tokenized_context in unique_contexts_tokenized] \nunique_padded_contexts = pad_sequences(unique_embedded_contexts, max_context_len*word_vector_size,\n                            padding="post", dtype="f4").reshape(len(unique_contexts), -1, word_vector_size)\n\n# Questions Stuff\nmax_question_len, padded_questions = get_questions_stuff(questions)\n\n# Answers Stuff\ntokenized_answers, embedded_answers, max_question_len = get_answer_stuf

#### Dev Set

In [8]:
# Raw dev-set columns as numpy arrays.
contexts = dev_df.context.values
questions = dev_df.question.values
answers = dev_df.answer_text.values

# Contexts: tokens, flat embeddings, padded 3-D array and longest context length.
(tokenized_contexts, embedded_contexts,
 padded_contexts, max_context_len) = compute_everything(contexts)

# Questions: same four results.
(tokenized_questions, embedded_questions,
 padded_questions, max_question_len) = compute_everything(questions)

# Answers: the padded array is not kept under a name of its own.
tokenized_answers, embedded_answers, _, max_answer_len = compute_everything(answers)

# Positions of the '.' tokens in every context (read later by get_facts).
sent_end_indeces = get_sent_end_idx(tokenized_contexts)

### Batching Data

In [9]:
# NOTE(review): the sizes come from train_df, while the arrays embedded in the previous cell come from
# dev_df -- confirm which split the batches are meant to cover.
indices = train_df.index.values
sample_num = len(train_df.question.values)

# Ceiling division: int(sample_num/batch_size) + 1 produced one batch too many whenever sample_num was an
# exact multiple of batch_size (and one batch for an empty dataset).
num_of_batches = (sample_num + batch_size - 1) // batch_size

# Building the Dynamic Memory Network

### Defining Custom Functions

In [10]:
def get_facts(facts_output):
    """ Gathers the sentence-end timesteps (facts) of every sample and zero-pads them to a common length.
    Input: facts_output, a tensor indexed as facts_output[sample_index]; the original note describes it as a
           3D tensor of all the timesteps/samples -- TODO confirm its feature width against the upstream layer.
    Output: one tensor made by stacking the padded per-sample fact tensors with tf.stack.
    Reads the module-level names num_of_samples, sent_end_indeces and hidden_units. """
    facts_tensor_list = []
    
    for sample_index in range(num_of_samples): #iterate over each sample
        # Pick the rows of this sample at the positions stored in sent_end_indeces[sample_index][0].
        facts = tf.nn.embedding_lookup(facts_output[sample_index], ids=sent_end_indeces[sample_index][0])
        facts_tensor_list.append(facts)
    
    # Largest number of gathered rows over all samples; every sample is padded up to it.
    max_num_facts = np.max([facts_tensor.shape[0] for facts_tensor in facts_tensor_list])
    
    # Padding the tensors
    for idx in range(len(facts_tensor_list)):
        # Block of zeros with the missing number of rows and hidden_units columns.
        # NOTE(review): concatenating along axis 0 presumably needs facts_output's last dimension to equal
        # hidden_units -- confirm against the layer that produces facts_output.
        numpy_pad = np.zeros(shape=(max_num_facts-facts_tensor_list[idx].shape[0], hidden_units))
        # NOTE(review): the padding is created as a tf.Variable; a constant zero tensor looks sufficient -- confirm.
        pad = tf.Variable(numpy_pad, dtype=tf.float32)
        facts_tensor_list[idx] = K.concatenate([facts_tensor_list[idx], pad], axis=0)
    
    return tf.stack(facts_tensor_list) # list of 2D tensors -> 3D tensor of shape (num samples, max_num_facts, hidden_units)

### Input Module

- **What it does:** The Input Module iterates over each context (paragraph of a wikipedia article) and returns embedded representations (facts) of each sentence in the context (for each sample).
- **How it does this:** The Input Module uses a Bidirectional GRU that iterates over each word and returns the hidden state after each iteration. The GRU requires the input to be a 3D Tensor of shape (samples, timesteps, columns/features), and each sample must have the same (timesteps, columns/features) shape. However, given that our contexts are of different lengths (different numbers of words/timesteps), the embedded representations are padded (have 0.0s appended to each context embedding) so as to meet the input requirements. As a technical side note, all embedded contexts are padded such that their length is equal to the length of the longest context in the whole sample (again, such that each individual sample has the same shape). To ensure that the GRU interprets the 0.0s as paddings, we mask the input (via the Masking layer). A Dropout Layer is added as a form of regularization. The *get_facts* method then returns the facts for each sample/context by extracting the hidden states/outputs of the GRU corresponding to 'end of sentence token' timesteps (exactly as described in the *'Ask Me Anything...'* paper).

**Input Module input shape:** *[rows, timesteps, columns/features]* or *[num of samples, max # words in context (*max_context_len*), word vector size]*.

In [11]:
# Padded context embeddings: max_context_len timesteps with word_vector_size features each.
context_input = Input(name="ContextInput", shape=(max_context_len, word_vector_size))

# Mark the 0.0 padding so the recurrent layer can skip it.
context_mask = Masking(name="ContextMask", mask_value=0.0)(context_input)

# Bidirectional GRU that returns its output at every timestep; the two directions are concatenated.
context_bigru = Bidirectional(
    GRU(return_sequences=True, units=input_h_units),
    name="ContextBiGRU",
    merge_mode="concat",
)
facts_output = context_bigru(context_mask)
facts_output = Dropout(dropout, name="ContextBiGRU_Dropout")(facts_output)

# Keep only the sentence-end timesteps (see get_facts), then mask the zero rows added as padding.
facts_tensors = Lambda(get_facts, name="FactTensorLambda")(facts_output)
facts_mask = Masking(name="FactTensorMask", mask_value=0.0)(facts_tensors)

#input_model = Model(inputs=context_input, outputs=facts_mask)
#print(input_model.summary())
#input_module_outputs = input_model.predict(padded_contexts[0:num_of_samples])

### Question Module

- **What it does:** The Question Module (similarly to the input module) returns an embedded representation for each question. However, unlike the Input Module, the only output of the Question Module is the last hidden state of the GRU (the embedded representation). This, again, is exactly the implementation defined in the aforementioned paper, and it makes sense, considering that all questions are 1 sentence long.
- **How it does it:** Similarly to the input module (spotting a trend here), the input has to be padded and masked since not all questions are of equal length. This time a 'vanilla' unidirectional GRU is used as there is only 1 sentence/sequence, and so we wouldn't benefit from a bidirectional architecture like in the Input Module. Given that the outputs of the GRU are exactly what we want, no further processing is required.

**Question Module input shape:** *[rows, timesteps, columns/features]* or *[num of samples, max # words in question (max_question_len), word vector size]*.

In [12]:
# Padded question embeddings: max_question_len timesteps with word_vector_size features each.
question_input = Input(name="QuestionInput", shape=(max_question_len, word_vector_size))

# Mark the 0.0 padding so the GRU can skip it.
question_mask = Masking(name="QuestionMask", mask_value=0.0)(question_input)

# return_sequences is left at its default, so only the output of the last timestep is returned.
question_gru = GRU(name="QuestionGRU", units=hidden_units)
question_output = question_gru(question_mask)
question_output = Dropout(dropout, name="QuestionOutputDropout")(question_output)

#question_model = Model(inputs=question_input, outputs=question_output)
#question_model_outputs = question_model.predict(padded_questions[0:num_of_samples])

# Episodic Memory Module

In [13]:
# Custom layer imported from EpisodicMemoryModule.py; it is called with the facts and the question encoding.
# NOTE(review): the layer receives facts_tensors rather than the masked facts_mask, and batch_size here is
# the training batch size while get_facts iterates over num_of_samples -- confirm both are intended.
episodic_memory = EpisodicMemoryModule(
    units=hidden_units,
    batch_size=batch_size,
    emb_dim=word_vector_size,
    memory_steps=num_memory_passes,
)
epm_output = episodic_memory([facts_tensors, question_output])
epm_output = Dropout(dropout, name="FinalMemoryDropout")(epm_output) # 1 m^Tm for 1 sample.

#Now Implement Answer Module

#DMN_model = Model(inputs=[context_input, question_input], outputs=epm_output)
#print(DMN_model.summary())
    

In [14]:
# Disabled cell: the attention / answer-module prototype below is wrapped in a string literal, so none of
# it runs; the cell only evaluates to that string.
# NOTE(review): the text refers to facts_tensor_list (a local of get_facts) and to AttentionGRU, neither of
# which is available at module level in the visible part of this notebook.
"""
def get_attention(facts, question, memory):
    Input: facts, 2D Tensor of the facts for each sample. question, 1D tensor of the question. memory, 1D tensor of memory.
       Output: attentions, 1D tensor of attention scores (scalars). Implimentation as in https://arxiv.org/pdf/1603.01417.pdf
    def compute_z(fact):
        z = [tf.multiply(fact ,question), tf.multiply(fact, memory), K.abs(fact-question), K.abs(fact-memory)]
        return K.concatenate(z, axis=0) # get an array of length 4*hidden_units.
    
    Zs = K.map_fn(fn=compute_z, elems=facts) # for each fact, compute z(c_t, m, q).
    
    g_t_i = Dense(units=word_vector_size, activation='tanh')(Zs)
    g_t_i = Dense(units=1, activation="sigmoid")(g_t_i)
    
    return g_t_i

def semantic_module_print(sample_index, memory_iteration, facts, attentions, episode, memory):
     Some test prints 
    print("Sample iter %d Memory iter %d" % (sample_index, memory_iteration))
    print("Shape of facts tensor:", facts.shape)
    print("Shape of attentions tensor:", attentions.shape)
    print("Shape of Attention GRU input tensor:", attention_gru_input.shape)
    print("Shape of episode tensor:", episode.shape)
    print("Shape of memory tensor:", memory.shape, "\n")




for sample_index in range(num_of_samples): # generally this should iterate over batches
    facts, question= (facts_tensor_list[sample_index], question_output[sample_index])
    memory=question # the initial memory is set to be the question
    answer_length = len(tokenized_answers[sample_index]) # number of tokens we are trying to predict
    sample_predictions=[]
    
    for memory_iteration in range(num_memory_passes): # Episodic Memory Module
        attentions = get_attention(facts, question, memory)
        attention_gru_input = tf.reshape(K.concatenate([facts, attentions], axis=1), shape=(1, -1, hidden_units+1))
        episode = AttentionGRU(units=hidden_units)(attention_gru_input)
        episode = Dropout(dropout)(episode)
        
        memory_input=tf.expand_dims(K.concatenate([memory, tf.squeeze(episode), question], axis=0), 0) # returns 2D Tensor.
        memory = Dense(units=hidden_units, activation='relu')(memory_input) # returns 2D tensor of shape (1, hidden_units)
        memory = tf.squeeze(memory) # reshape from (1, hidden_units) -> (hidden_units, )
        _ = semantic_module_print(sample_index, memory_iteration, facts, attentions, episode, memory)
    
    for answer_index in range(answer_length): # predict answer_length word embeddings for each sample
        a_0=tf.expand_dims(memory, axis=0) # shape (hidden units, ) -> (1, hidden_units)
        y_last=0
        a_t=0
        
        if answer_index==0:
            y_last=Dense(units=word_vector_size, activation="softmax")(a_0) # prediction 
            sample_predictions.append(y_last)
        else:
            a_t=GRU(units=hidden_units, initial_states=a_0)(K.concatenate([y_last, question], axis=1))
            y_last=Dense(units=word_vector_size, activation="softmax")(a_t) # prediction 
            sample_predictions.append(y_last)

"""

'\ndef get_attention(facts, question, memory):\n    Input: facts, 2D Tensor of the facts for each sample. question, 1D tensor of the question. memory, 1D tensor of memory.\n       Output: attentions, 1D tensor of attention scores (scalars). Implimentation as in https://arxiv.org/pdf/1603.01417.pdf\n    def compute_z(fact):\n        z = [tf.multiply(fact ,question), tf.multiply(fact, memory), K.abs(fact-question), K.abs(fact-memory)]\n        return K.concatenate(z, axis=0) # get an array of length 4*hidden_units.\n    \n    Zs = K.map_fn(fn=compute_z, elems=facts) # for each fact, compute z(c_t, m, q).\n    \n    g_t_i = Dense(units=word_vector_size, activation=\'tanh\')(Zs)\n    g_t_i = Dense(units=1, activation="sigmoid")(g_t_i)\n    \n    return g_t_i\n\ndef semantic_module_print(sample_index, memory_iteration, facts, attentions, episode, memory):\n     Some test prints \n    print("Sample iter %d Memory iter %d" % (sample_index, memory_iteration))\n    print("Shape of facts tensor: