In [None]:
import numpy as np
import tensorflow as tf
import json
import warnings
import pickle
warnings.simplefilter(action='ignore', category=FutureWarning)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm

In [None]:
# Load the raw corpus and the train/dev question sets.
# `doc` is later indexed by a sample's "docid", and each entry's "text" field
# holds that document's paragraphs.
# JSON is UTF-8 by specification, so the encoding is stated explicitly instead
# of relying on the platform default (which is not UTF-8 on every system).
with open("documents.json", encoding="utf-8") as f:
    doc = json.load(f)
with open("training.json", encoding="utf-8") as f:
    train = json.load(f)
with open("devel.json", encoding="utf-8") as f:
    dev = json.load(f)

Build the word dictionary and the character dictionary, and extract the pre-trained word embeddings for the words that occur in the data. The embeddings (`glove.6B.50d.txt`) are downloaded from https://nlp.stanford.edu/projects/glove/.

In [None]:
# Collect the lower-cased vocabulary of every document paragraph and every
# training question. A set is filled directly so the full token stream is never
# held in memory and never re-hashed just to count the distinct tokens.
tokens = set()
for d in doc:
    for paragraph in d["text"]:
        tokens.update(word_tokenize(paragraph.lower()))
# Number of distinct tokens found in the documents alone.
print(len(tokens))
for sample in train:
    tokens.update(word_tokenize(sample["question"].lower()))
# Number of distinct tokens after adding the training questions.
print(len(tokens))

In [None]:
# Keep only the GloVe vectors of words that occur in `tokens`; the row index of
# each kept vector is the word's id in `word_dict`.
embedding = []
word_dict = dict()
with open("glove.6B.50d.txt", "r", encoding="utf-8", errors="ignore") as f:
    for line in f:
        data = line.split()
        if not data:
            # Blank line: nothing to parse (data[0] would raise IndexError).
            continue
        word = data[0]
        # The `not in word_dict` guard keeps rows and ids aligned even if the
        # embedding file lists a word more than once.
        if word in tokens and word not in word_dict:
            embedding.append(np.array([float(i) for i in data[1:]]))
            word_dict[word] = len(word_dict)

# Take the vector width from the loaded rows instead of repeating the magic
# number; fall back to 50 (the width of glove.6B.50d) if nothing matched.
embedding_dim = len(embedding[0]) if embedding else 50

# A dedicated, seeded generator makes the <UNK> vector reproducible across
# runs without touching NumPy's global random state.
rng = np.random.RandomState(0)
embedding.append(rng.uniform(-0.5, 0.5, embedding_dim))
word_dict["<UNK>"] = len(word_dict)
# Padding is represented by the all-zero vector and gets the last id.
embedding.append(np.zeros(embedding_dim))
word_dict["<PAD>"] = len(word_dict)
embedding = np.array(embedding)

In [None]:
# Collect every character that appears in a key of `word_dict`.
# NOTE(review): if the special entries "<UNK>" / "<PAD>" are already in
# `word_dict` when this cell runs, their characters are included as well.
char_set = set()
for token in word_dict.keys():
    char_set.update(token)
print(len(char_set))

# Assign ids in sorted order. Iterating the set directly would follow string
# hash order, which changes between interpreter runs, so the ids written to
# char_dict.json would not be reproducible.
char_dict = dict()
for char in sorted(char_set):
    char_dict[char] = len(char_dict)
# Special symbols take the last two ids.
char_dict["<unk>"] = len(char_dict)
char_dict["<pad>"] = len(char_dict)
print(len(char_dict))

In [None]:
# Sanity check: report the embedding matrix shape and both dictionary sizes.
print("Embedding matrix shape:",embedding.shape)
print("word dictionary length:", len(word_dict))
print("character dictionary length:", len(char_dict))
# Ids assigned to the word-level and character-level padding symbols.
print(word_dict["<PAD>"])
print(char_dict["<pad>"])

In [None]:
# Persist the embedding matrix and both lookup tables so they can be reloaded
# without rebuilding them.
np.save("embedding", embedding)
with open("word_dict.json", "w") as word_file:
    word_file.write(json.dumps(word_dict))
with open("char_dict.json", "w") as char_file:
    char_file.write(json.dumps(char_dict))

Define functions for tokenization and padding.

In [None]:
def unknown_detection(token_list):
    """Return a copy of ``token_list`` with out-of-vocabulary tokens masked.

    A token is kept when it is a key of the module-level ``word_dict``;
    every other token is replaced by the string ``"<UNK>"``. The input list
    is not modified.
    """
    return [tok if tok in word_dict else "<UNK>" for tok in token_list]

def generate_char(token_list, word_len=16):
    """Expand tokens into a flat list of character symbols.

    Each token contributes exactly ``word_len`` symbols: its first
    ``word_len`` characters, right-padded with ``"<pad>"``. The word-level
    padding token ``"<PAD>"`` contributes ``word_len`` ``"<pad>"`` symbols.
    Any symbol that is not a key of the module-level ``char_dict`` is
    replaced by ``"<unk>"``.

    Args:
        token_list: list of token strings.
        word_len: number of character slots per token (default 16, the
            value that used to be hard-coded).

    Returns:
        A list of ``len(token_list) * word_len`` character symbols.
    """
    new_list = []
    for token in token_list:
        # The padding token has no characters of its own.
        char_list = [] if token == "<PAD>" else list(token[:word_len])
        char_list.extend(["<pad>"] * (word_len - len(char_list)))
        new_list.extend(
            char if char in char_dict else "<unk>" for char in char_list
        )
    assert len(new_list) == len(token_list) * word_len
    return new_list

In [None]:
def _find_answer_span(para, answer):
    """Return ``[start, end]`` (end exclusive) of the first full occurrence of
    the token list ``answer`` inside ``para``, or ``None`` if there is none."""
    width = len(answer)
    if width == 0:
        # An empty answer cannot be located.
        return None
    # Only start positions that leave room for the whole answer are tried, so
    # a prefix match cut off by the end of the paragraph is never accepted.
    for start in range(len(para) - width + 1):
        if para[start:start + width] == answer:
            return [start, start + width]
    return None


def tokenize_and_pad(data, train=True):
    """Tokenize samples and pad their questions to a fixed length.

    For every sample the question, the answer text and the answer paragraph
    (looked up in the module-level ``doc``) are lower-cased and tokenized.
    The paragraph is truncated to 240 tokens; the question is truncated or
    padded with ``"<PAD>"`` to exactly 30 tokens.

    Args:
        data: iterable of dicts with the keys ``"docid"``, ``"question"``,
            ``"text"`` (the answer) and ``"answer_paragraph"``.
        train: when true, the answer's token span is searched in the
            truncated paragraph and samples without a complete match are
            dropped. When false, every sample is kept and ``"answer_idx"``
            is the placeholder ``0``.

    Returns:
        A list of dicts with the keys ``"question"``, ``"q_char"``,
        ``"content"``, ``"c_char"``, ``"answer_idx"`` and ``"answer"``.
    """
    padded = []
    for sample in tqdm(data):
        docid = sample["docid"]

        question = word_tokenize(sample["question"].lower())

        answer = word_tokenize(sample["text"].lower())
        answer_para = sample["answer_paragraph"]
        # Tokenized once and reused for both span search and encoding.
        para = word_tokenize(doc[docid]["text"][answer_para].lower())[:240]

        answer_idx = 0
        if train:
            answer_idx = _find_answer_span(para, answer)
            # Ignore samples whose answer cannot be found in the paragraph.
            if answer_idx is None:
                continue

        padded_question = question[:30]
        padded_question += ["<PAD>"] * (30 - len(padded_question))
        # Characters must be generated before unknown words are masked,
        # otherwise the spelling of out-of-vocabulary words would be lost.
        question_char = generate_char(padded_question)
        padded_question = unknown_detection(padded_question)

        assert len(padded_question) == 30
        assert len(question_char) == 480

        content_char = generate_char(para)
        content = unknown_detection(para)

        assert len(content) <= 240
        assert len(content_char) <= 3840
        assert len(content_char) == len(content) * 16

        padded.append({
            "question": padded_question,
            "q_char": question_char,
            "content": content,
            "c_char": content_char,
            "answer_idx": answer_idx,
            "answer": answer,
        })
    return padded

In [None]:
def generate_training_data(padded):
    """Convert tokenized samples into id arrays.

    Words are mapped through the module-level ``word_dict`` and characters
    through ``char_dict``. Each context is right-padded to 240 word ids and
    3840 character ids; questions are mapped as they are, without padding.

    Args:
        padded: list of sample dicts with the keys ``"question"``,
            ``"content"``, ``"q_char"``, ``"c_char"``, ``"answer_idx"`` and
            ``"answer"``.

    Returns:
        The tuple ``(c, c_chars, q, q_chars, a_idx, answer)``. The first five
        items are NumPy arrays; ``answer`` is the plain list of answer token
        lists.
    """
    contexts, context_chars = [], []
    questions, question_chars = [], []
    spans, answers = [], []

    for sample in tqdm(padded):
        question_ids = list(map(word_dict.__getitem__, sample["question"]))
        context_ids = list(map(word_dict.__getitem__, sample["content"]))
        question_char_ids = list(map(char_dict.__getitem__, sample["q_char"]))
        context_char_ids = list(map(char_dict.__getitem__, sample["c_char"]))

        # Contexts vary in length, so they are padded at the end up to the
        # fixed sizes.
        pad = tf.keras.preprocessing.sequence.pad_sequences
        context_row = pad(
            [context_ids], maxlen=240, padding="post", value=word_dict["<PAD>"]
        )[0]
        context_char_row = pad(
            [context_char_ids], maxlen=3840, padding="post", value=char_dict["<pad>"]
        )[0]

        contexts.append(context_row)
        questions.append(question_ids)
        context_chars.append(context_char_row)
        question_chars.append(question_char_ids)
        spans.append(sample["answer_idx"])
        answers.append(sample["answer"])

    return (
        np.array(contexts),
        np.array(context_chars),
        np.array(questions),
        np.array(question_chars),
        np.array(spans),
        answers,
    )

Generate the TFRecords file (`train.tfrecords`) used for training.

In [None]:
# Tokenize and pad the training samples. With the default train=True, samples
# whose answer span cannot be located in the paragraph are dropped.
padded_train = tokenize_and_pad(train)

In [None]:
# Map words and characters to ids. The answer token lists (last return value)
# are not used for the training records, hence `_`.
c, c_char, q, q_char, a_idx, _ = generate_training_data(padded_train)

In [None]:
# Shape check of the training arrays before they are serialized.
print(c.shape, c_char.shape)
print(q.shape, q_char.shape)
print(a_idx.shape)

In [None]:
def _int64_feature(value):
    """Wrap an iterable of integers in a ``tf.train.Feature``."""
    int_list = tf.train.Int64List(value=value)
    return tf.train.Feature(int64_list=int_list)

filename = "train.tfrecords"

# `tf.python_io` exists only in TensorFlow 1.x. Newer releases expose the same
# writer as `tf.io.TFRecordWriter`, so prefer that name when it is available
# and fall back to the 1.x location otherwise.
record_writer_cls = getattr(getattr(tf, "io", None), "TFRecordWriter", None)
if record_writer_cls is None:
    record_writer_cls = tf.python_io.TFRecordWriter

# Serialize one tf.train.Example per training sample. Row i of every array
# belongs to the same sample.
with record_writer_cls(filename) as writer:
    for i in range(len(c)):
        example = tf.train.Example(
              features=tf.train.Features(
                  feature={
                      'c': _int64_feature(c[i]),
                      'c_char': _int64_feature(c_char[i]),
                      'q': _int64_feature(q[i]),
                      'q_char': _int64_feature(q_char[i]),
                      'a_idx': _int64_feature(a_idx[i])
                  }))
        writer.write(example.SerializeToString())

Process the dev set and store the results on disk. These files are read back later to monitor performance on the dev set in real time during training.

In [None]:
# Tokenize and pad the dev samples. With train=False no answer span is
# searched, so every sample is kept.
padded_dev = tokenize_and_pad(dev, train=False)

In [None]:
# NOTE: this rebinds c, c_char, q and q_char to the dev-set arrays, replacing
# any arrays previously stored under these names; run the cells in order.
# The answer-index array is discarded (`_`); the answer token lists are kept.
c, c_char, q, q_char, _, answer = generate_training_data(padded_dev)

In [None]:
# Shape check of the dev arrays and the number of reference answers.
print(c.shape, c_char.shape)
print(q.shape, q_char.shape)
print(len(answer))

In [None]:
# Store the dev-set arrays (np.save appends the ".npy" extension) and the
# reference answers for evaluation during training.
np.save("c", c)
np.save("c_char", c_char)
np.save("q", q)
np.save("q_char", q_char)
# The answers are lists of token lists, so they are pickled rather than saved
# as an array. Only unpickle this file from a trusted source.
with open("d_ans.pickle","wb") as f:
    pickle.dump(answer, f)