This notebook implements the shared-normalization method from [Simple and Effective Multi-Paragraph Reading Comprehension](https://arxiv.org/abs/1710.10723). In our experiments, it did not outperform our single-paragraph model.

In [None]:
import numpy as np
import tensorflow as tf
import json
import os
import pickle
import warnings
import random
warnings.simplefilter(action='ignore', category=FutureWarning)
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [None]:
# Pretrained word-embedding matrix, cast to float32. It is used further down as
# the constant initializer of the frozen "w_embedding" variable, so its shape
# has to match the [72497, 300] shape declared there.
embedding = np.load("embedding.npy").astype("float32")

In [None]:
# Raw corpus: `doc` is the list of documents (each entry is read through its
# "docid" and "text" keys below), `train` is the list of question samples.
with open("documents.json") as doc_file:
    doc = json.load(doc_file)
with open("training.json") as train_file:
    train = json.load(train_file)

In [None]:
# Vocabularies mapping a token / a character to its integer id.
with open("word_dict.json","r") as word_file:
    word_dict = json.load(word_file)
with open("char_dict.json","r") as char_file:
    char_dict = json.load(char_file)

In [None]:
# Number of top-ranked TF-IDF paragraphs considered for each question.
topk = 4

# Question tokens found in either set are ignored when scoring paragraphs.
stopword = set(stopwords.words('english'))
punc = {'"','\'',"?",".",",","/","<",">",":",";"}

In [None]:
def unknown_detection(token_list):
    """Return a copy of ``token_list`` with out-of-vocabulary tokens masked.

    Every token that is not a key of the module-level ``word_dict`` is replaced
    by the literal ``"<UNK>"``; known tokens are kept unchanged and in order.
    """
    return [tok if tok in word_dict else "<UNK>" for tok in token_list]

def generate_char(token_list):
    """Expand tokens into one flat list of character symbols, 16 per token.

    Each token contributes exactly 16 entries: its first 16 characters,
    right-padded with ``"<pad>"``. The ``"<PAD>"`` token contributes 16
    ``"<pad>"`` entries. Any symbol that is not a key of the module-level
    ``char_dict`` is replaced by ``"<unk>"``.
    """
    width = 16
    flat = []
    for token in token_list:
        # the padding token has no real characters of its own
        symbols = [] if token == "<PAD>" else list(token[:width])
        symbols.extend(["<pad>"] * (width - len(symbols)))
        flat.extend(sym if sym in char_dict else "<unk>" for sym in symbols)
    return flat

# Per-document TF-IDF statistics keyed by docid: [dense weight matrix with one
# row per paragraph, term -> column mapping]. Computed once and cached on disk.
# NOTE(review): pickle.load can execute arbitrary code; only reuse
# "tfidfs.pickle" when it was produced locally by this notebook.
if not os.path.exists("tfidfs.pickle"):
    tfidfs = {}
    for document in doc:
        vectorizer = TfidfVectorizer(tokenizer=word_tokenize,
                                     stop_words='english',
                                     max_df=0.5,
                                     smooth_idf=False,
                                     sublinear_tf=True)
        lowered = [paragraph.lower() for paragraph in document["text"]]
        weights = vectorizer.fit_transform(lowered).toarray()
        tfidfs[document["docid"]] = [weights, vectorizer.vocabulary_]
    with open("tfidfs.pickle","wb") as cache_file:
        pickle.dump(tfidfs, cache_file)
else:
    with open("tfidfs.pickle","rb") as cache_file:
        tfidfs = pickle.load(cache_file)

# Build fixed-length training samples. Every kept sample holds the padded
# question, the gold paragraph with the answer span, and three distractor
# ("noisy") paragraphs of the same document ranked by TF-IDF against the question.
padded_train = []
for sample in tqdm(train):
    new_sample = dict()
    docid = sample["docid"]
    # documents are addressed by position, so the list index must equal the docid
    assert(doc[docid]["docid"] == docid)

    question = word_tokenize(sample["question"].lower())

    answer = word_tokenize(sample["text"].lower())
    answer_para = sample["answer_paragraph"]

    para = word_tokenize(doc[docid]["text"][answer_para].lower())[:240]

    # Locate the answer as a contiguous token span [start, end) of the
    # (truncated) paragraph. Only complete matches count: a prefix of the
    # answer that runs off the end of the paragraph is not accepted, and an
    # empty answer never matches.
    answer_idx = None
    n_answer = len(answer)
    if n_answer > 0:
        for start in range(len(para) - n_answer + 1):
            if para[start:start+n_answer] == answer:
                answer_idx = (start, start + n_answer)
                break
    # ignore samples that no answer can be found
    if answer_idx is None:
        continue

    # Score every paragraph of the document by summing the TF-IDF weights of
    # the informative question tokens.
    rmed = [token for token in question
            if token not in stopword and token not in punc]

    res, mapping = tfidfs[docid]
    scores = np.zeros(res.shape[0])
    for token in rmed:
        # tokens outside the document vocabulary contribute nothing
        if token in mapping:
            scores += res[:, mapping[token]]

    # Keep the topk best paragraphs (best first), then drop the gold paragraph
    # or, when it is not among them, the weakest candidate.
    k = min(topk, res.shape[0])
    ranked = np.argsort(scores)[::-1][:k]
    pred = [int(p) for p in ranked if p != answer_para][:max(k - 1, 0)]
    # the record layout needs exactly three distractors; documents too short
    # to provide them are skipped instead of raising IndexError
    if len(pred) < 3:
        continue

    para += ["<PAD>"] * (240 - len(para))
    content_char = generate_char(para)
    content = unknown_detection(para)

    noisy = []
    noisy_char = []
    for noisy_para in pred[:3]:
        noise = word_tokenize(doc[docid]["text"][noisy_para].lower())[:240]
        noise += ["<PAD>"] * (240 - len(noise))
        noise_char = generate_char(noise)
        noise = unknown_detection(noise)
        assert len(noise) == 240
        assert len(noise_char) == 3840
        noisy.append(noise)
        noisy_char.append(noise_char)

    # the answer is padded / truncated to 7 tokens only after the span search
    answer += ["<PAD>"] * (7 - len(answer))
    answer = answer[:7]
    answer_char = generate_char(answer)
    answer = unknown_detection(answer)

    padded_question = question[:30]
    padded_question += ["<PAD>"] * (30 - len(padded_question))
    question_char = generate_char(padded_question)
    padded_question = unknown_detection(padded_question)

    new_sample["question"] = padded_question
    new_sample["q_char"] = question_char
    new_sample["content"] = content
    new_sample["c_char"] = content_char
    new_sample["answer"] = answer
    new_sample["answer_char"] = answer_char
    new_sample["answer_idx"] = answer_idx
    for i in range(3):
        new_sample["noisy"+str(i+1)] = noisy[i]
        new_sample["noisy_char"+str(i+1)] = noisy_char[i]

    assert len(padded_question) == 30
    assert len(question_char) == 480
    assert len(content) == 240
    assert len(content_char) == 3840
    assert len(answer) == 7
    assert len(answer_char) == 112
    assert len(answer_idx) == 2

    padded_train.append(new_sample)

In [None]:
# Sanity check: show the three distractor paragraphs of the first sample.
for noisy_key in ("noisy1", "noisy2", "noisy3"):
    print(padded_train[0][noisy_key])

In [None]:
def generate_training_data(padded):
    """Convert padded samples into integer-id arrays for the network.

    Word-level fields are mapped through ``word_dict`` and character-level
    fields through ``char_dict``; the answer span is copied unchanged.

    Args:
        padded: list of sample dicts with the keys "content", "c_char",
            "question", "q_char", "answer_idx", "noisy1".."noisy3" and
            "noisy_char1".."noisy_char3". Entries are replaced by ``None``
            in place while they are consumed, to release memory.

    Returns:
        Tuple of eleven numpy arrays, one row per sample, in this order:
        c, c_chars, q, q_chars, a_idx, n1, n_char1, n2, n_char2, n3, n_char3.
    """
    # (sample key, vocabulary) pairs listed in the order the arrays are returned;
    # a vocabulary of None means the value is stored as is
    fields = (
        ("content", word_dict), ("c_char", char_dict),
        ("question", word_dict), ("q_char", char_dict),
        ("answer_idx", None),
        ("noisy1", word_dict), ("noisy_char1", char_dict),
        ("noisy2", word_dict), ("noisy_char2", char_dict),
        ("noisy3", word_dict), ("noisy_char3", char_dict),
    )
    columns = [[] for _ in fields]

    for i in tqdm(range(len(padded))):
        sample = padded[i]
        for column, (key, vocab) in zip(columns, fields):
            values = sample[key]
            column.append(values if vocab is None else [vocab[v] for v in values])

        padded[i] = None # clear memory usage

    return tuple(np.array(column) for column in columns)

In [None]:
# Convert every padded sample to integer-id arrays. Note that the call empties
# `padded_train` in place (each entry is replaced by None) to release memory,
# so this cell cannot be re-run without rebuilding `padded_train` first.
c, c_char, q, q_char, a_idx, n1, n_char1, n2, n_char2, n3, n_char3 = generate_training_data(padded_train)

In [None]:
# Sanity check: word-level and character-level array shapes for a distractor
# paragraph, the gold paragraph and the question.
for word_ids, char_ids in ((n3, n_char3), (c, c_char), (q, q_char)):
    print(word_ids.shape, char_ids.shape)

In [None]:
def _int64_feature(value):
    """Wrap an iterable of integers in a ``tf.train.Feature`` backed by an ``Int64List``."""
    int_list = tf.train.Int64List(value=value)
    return tf.train.Feature(int64_list=int_list)

# Serialize the training arrays to a TFRecord file, one tf.train.Example per
# sample with one int64 feature per array.
filename = "train2.tfrecords"
feature_arrays = {
    'c': c,
    'c_char': c_char,
    'q': q,
    'q_char': q_char,
    'a_idx': a_idx,
    'n1': n1,
    'n2': n2,
    'n3': n3,
    'n_char1': n_char1,
    'n_char2': n_char2,
    'n_char3': n_char3,
}
with tf.python_io.TFRecordWriter(filename) as writer:
    for row in range(len(c)):
        feature = {key: _int64_feature(values[row])
                   for key, values in feature_arrays.items()}
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        writer.write(example.SerializeToString())

In [None]:
def f_score(pred_s, pred_e, pred_sn, pred_en, true_s, true_e, context):
    """Average token-overlap F1 of the predicted answer spans over a batch.

    For each sample the predicted span is the argmax start / end of the gold
    paragraph scores. The sample contributes 0 when the noisy paragraph gets a
    higher best score than the gold paragraph, or when the predicted end lies
    before the predicted start. Otherwise precision and recall are computed on
    the multiset overlap between predicted and true tokens, so a token repeated
    in the guess is credited at most as often as it occurs in the truth.

    Args:
        pred_s, pred_e: per-sample start / end scores over the gold paragraph.
        pred_sn, pred_en: per-sample start / end scores over the noisy paragraph.
        true_s, true_e: per-sample true start / end positions (both inclusive).
        context: per-sample token ids of the gold paragraph.

    Returns:
        Mean F1 over the batch as a float; 0.0 for an empty batch.
    """
    from collections import Counter

    batch_len = len(pred_s)
    if batch_len == 0:
        return 0.0

    f_sum = 0.0
    for i in range(batch_len):
        pred_pair, score1 = calcu_score(pred_s[i], pred_e[i])
        _, score2 = calcu_score(pred_sn[i], pred_en[i])
        # wrong paragraph preferred, or inverted span: counts as a miss
        if score1 < score2 or pred_pair[1] < pred_pair[0]:
            continue
        guess = list(context[i][pred_pair[0]:pred_pair[1]+1])
        true = list(context[i][true_s[i]:true_e[i]+1])
        if not guess or not true:
            continue
        overlap = sum((Counter(guess) & Counter(true)).values())
        precision = overlap/len(guess)
        recall = overlap/len(true)
        # the epsilon keeps the ratio defined when there is no overlap at all
        f_sum += 2*precision*recall/(precision+recall+1e-8)
    return f_sum/batch_len

def calcu_score(slist, elist):
    """Pick the best start and end positions independently.

    Returns ``([start_index, end_index], score)`` where each index is the
    argmax of its score sequence and ``score`` is the sum of the two maxima.
    """
    best = [np.argmax(slist), np.argmax(elist)]
    return best, slist[best[0]] + elist[best[1]]

In [None]:
# Start from an empty default graph; everything below (input pipeline and
# model) is built into this graph.
tf.reset_default_graph()

# Source of the input pipeline: the TFRecord file written by the export cell.
filename = "train2.tfrecords"
dataset = tf.data.TFRecordDataset(filename)

def parser(record):
    """Decode one serialized ``tf.train.Example`` into int32 tensors.

    Every feature is read as a variable-length int64 sequence and cast to
    int32. The tensors are returned as a tuple in the order
    c, c_char, q, q_char, a_idx, n1, n_char1, n2, n_char2, n3, n_char3.
    """
    # feature keys, listed in the order they are handed back to the caller
    ordered_keys = ("c", "c_char", "q", "q_char", "a_idx",
                    "n1", "n_char1", "n2", "n_char2", "n3", "n_char3")
    keys_to_features = {
        key: tf.FixedLenSequenceFeature((), tf.int64, allow_missing=True)
        for key in ordered_keys
    }
    parsed = tf.parse_single_example(record, keys_to_features)
    return tuple(tf.cast(parsed[key], tf.int32) for key in ordered_keys)

# Number of passes over the data (passed to shuffle_and_repeat) and number of
# examples per batch (also used as the prefetch buffer size).
epoch = 1000
batch = 12

def make_dataset(dataset):
    """Turn a raw TFRecord dataset into an initializable iterator of batches.

    Records are decoded with ``parser``, shuffled with a 10000-element buffer
    and repeated ``epoch`` times, then grouped into batches of ``batch``
    examples with a prefetch buffer of ``batch`` batches.
    """
    pipeline = (
        dataset
        .map(parser)
        .apply(tf.contrib.data.shuffle_and_repeat(10000, epoch))
        .batch(batch)
        .prefetch(batch)
    )
    return pipeline.make_initializable_iterator()

# The iterator must be initialized in the session (train_iter.initializer)
# before `next_batch` can be evaluated; each evaluation yields one batch tuple.
train_iter = make_dataset(dataset)
next_batch = train_iter.get_next()

In [None]:
def embedding_encoder_block(scope, inputs):
    """Encoder block applied to the embedded question / paragraphs.

    Layout: position encoding -> 1x1 convolution to 128 channels -> four
    residual convolution sub-layers (kernel 7) -> residual self-attention ->
    residual feed-forward. Every sub-layer normalizes its own input before
    transforming it. Dropout uses the module-level ``dp`` drop probability.

    Two wiring mistakes of the earlier version are corrected here: the fourth
    convolution now consumes its own normalized input (it used to re-read the
    third sub-layer's), and the feed-forward sub-layer now normalizes the
    output of the attention sub-layer (it used to skip it), matching
    ``model_encoder_block``. The layers are created in the same order as before.

    Args:
        scope: variable scope name; weights are shared across calls that use
            the same scope (``tf.AUTO_REUSE``).
        inputs: float tensor with static sequence length and depth, as
            required by ``position_encoding``.

    Returns:
        Tensor with 128 channels on the last axis.
    """
    with tf.variable_scope(scope,reuse=tf.AUTO_REUSE):
        # project the position-encoded input to the 128-channel working width
        pos_encoded = position_encoding(inputs)
        outputs = tf.layers.conv1d(pos_encoded, 128, 1, padding="same",activation=tf.nn.relu)

        # convolution block: four pre-norm residual sub-layers
        for _ in range(4):
            normed = tf.contrib.layers.layer_norm(outputs)
            normed = tf.nn.dropout(normed, 1-dp)
            conv = tf.layers.conv1d(normed, 128, 7, padding="same",activation=tf.nn.relu)
            conv = tf.nn.dropout(conv, 1-dp)
            outputs = tf.add(outputs, conv)

        # self-attention block: scaled dot-product attention of the normalized
        # sequence with itself (no learned projections)
        normed = tf.contrib.layers.layer_norm(outputs)
        attention = tf.matmul(normed, normed, transpose_b=True)
        dk = tf.cast(tf.shape(normed)[-1], dtype=tf.float32)
        scaled = tf.divide(attention, tf.sqrt(dk))
        attention = tf.nn.softmax(scaled, axis=-1)
        attention_out = tf.matmul(attention, normed)
        outputs = tf.add(outputs, attention_out)

        # feed-forward layer, fed with the attention sub-layer's output
        normed = tf.contrib.layers.layer_norm(outputs)
        normed = tf.nn.dropout(normed, 1-dp)
        ffn1 = tf.layers.conv1d(normed, 128, 1, activation=tf.nn.relu)
        ffn1 = tf.nn.dropout(ffn1, 1-dp)
        ffn2 = tf.layers.conv1d(ffn1, 128, 1)
        ffn2 = tf.nn.dropout(ffn2, 1-dp)
        outputs = tf.add(outputs, ffn2)
    return outputs

def model_encoder_block(scope, inputs, projection=False):
    """Stack of seven encoder layers used after the attention layer.

    Each layer consists of two residual convolution sub-layers (kernel 5), a
    residual self-attention sub-layer and a residual feed-forward sub-layer,
    all with layer normalization applied to the sub-layer input. Dropout uses
    the module-level ``dp`` drop probability.

    Args:
        scope: variable scope name; weights are shared across calls that use
            the same scope (``tf.AUTO_REUSE``).
        inputs: input tensor.
        projection: when True, a 1x1 convolution first maps the input to 128
            channels; otherwise the input is used as is.

    Returns:
        Output tensor of the last layer.
    """
    with tf.variable_scope(scope,reuse=tf.AUTO_REUSE):
        hidden = inputs
        if projection:
            hidden = tf.layers.conv1d(inputs, 128, 1, padding="same", activation=tf.nn.relu)

        for layer in range(7):
            with tf.variable_scope("conv_block{}".format(layer),reuse=tf.AUTO_REUSE):
                # two identical pre-norm residual convolutions
                for _ in range(2):
                    normed = tf.contrib.layers.layer_norm(hidden)
                    normed = tf.nn.dropout(normed, 1-dp)
                    conv_out = tf.layers.conv1d(normed, 128, 5, padding="same", activation=tf.nn.relu)
                    conv_out = tf.nn.dropout(conv_out, 1-dp)
                    hidden = tf.add(hidden, conv_out)

            with tf.variable_scope("self_attention{}".format(layer),reuse=tf.AUTO_REUSE):
                # scaled dot-product attention of the normalized sequence with itself
                normed = tf.contrib.layers.layer_norm(hidden)
                logits = tf.matmul(normed, normed, transpose_b=True)
                depth = tf.cast(tf.shape(normed)[-1], dtype=tf.float32)
                weights = tf.nn.softmax(tf.divide(logits, tf.sqrt(depth)), axis=-1)
                attended = tf.matmul(weights, normed)
                hidden = tf.add(hidden, attended)

            with tf.variable_scope("feedforward{}".format(layer),reuse=tf.AUTO_REUSE):
                normed = tf.contrib.layers.layer_norm(hidden)
                normed = tf.nn.dropout(normed, 1-dp)
                inner = tf.layers.conv1d(normed, 128, 1, activation=tf.nn.relu)
                inner = tf.nn.dropout(inner, 1-dp)
                outer = tf.layers.conv1d(inner, 128, 1)
                outer = tf.nn.dropout(outer, 1-dp)
                hidden = tf.add(hidden, outer)
    return hidden

def highway(scope, inputs):
    """Two-layer highway network that keeps the input depth.

    In each layer a sigmoid gate (bias initialized to -1) blends a linear
    transform of the layer input, after dropout, with the untouched layer
    input: ``gate * transform + input * (1 - gate)``. Weights are shared
    across calls that use the same ``scope``.
    """
    size = inputs.shape.as_list()[-1]
    carried = inputs
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        for _ in range(2):
            gate = tf.layers.conv1d(carried, size, 1, activation=tf.nn.sigmoid, bias_initializer=tf.constant_initializer(-1))
            transformed = tf.layers.conv1d(carried, size, 1)
            transformed = tf.nn.dropout(transformed, 1-dp)
            carried = gate * transformed + carried * (1.0 - gate)
    return carried

def position_encoding(inputs):
    """
    sinusoids position encoding
    from Attention Is All You Need
    -- input: [None, sequence_length, depth]
    -- output: [None, sequence_length, depth]

    Column 2j holds sin(pos / 10000^(2j/depth)) and column 2j+1 holds
    cos(pos / 10000^(2j/depth)), so each sin/cos pair shares one frequency.
    The sequence length and depth must be statically known.
    """
    _, seq_length, depth = inputs.get_shape().as_list()
    positions = np.arange(seq_length, dtype=np.float64)[:, np.newaxis]
    # both columns of a sin/cos pair use the exponent of their pair index
    pair_index = np.arange(depth) // 2
    pos_encoding = positions / np.power(10000.0, 2.0 * pair_index / depth)
    pos_encoding[:, 0::2] = np.sin(pos_encoding[:, 0::2])
    pos_encoding[:, 1::2] = np.cos(pos_encoding[:, 1::2])
    pos_encoding = tf.convert_to_tensor(pos_encoding, tf.float32)
    return inputs+pos_encoding

def query_context_co_attention(w, inputs, context, query):
    """
    input:
        w: similarity function weight
        inputs: per (context position, query position) feature vectors; the
                caller builds them as the concatenation [c, q, c*q]
        context: encoded context
        query: encoded query
    output:
        A: context-to-query attention
        B: query-to-context attention
    """
    # similarity logits: apply w to every feature vector, then drop the two
    # singleton axes left over from the contraction
    similarity = tf.einsum("abcde,ef->abcdf", tf.expand_dims(inputs,3),w)
    similarity = tf.squeeze(similarity,[-2,-1])
    # normalize over the last axis (default) and, separately, over axis 1;
    # the latter is transposed so the two can be multiplied together
    row_normalized = tf.nn.softmax(similarity)
    col_normalized_t = tf.transpose(tf.nn.softmax(similarity, axis=1),[0,2,1])
    # context-to-query attention
    context_to_query = tf.matmul(row_normalized, query)
    # query-to-context attention
    query_to_context = tf.matmul(tf.matmul(row_normalized, col_normalized_t), context)
    return context_to_query, query_to_context

In [None]:
# The default graph is deliberately NOT reset here: it was reset in the input
# pipeline cell, and resetting again would discard train_iter / next_batch.

# Placeholders fed at every training step.
with tf.name_scope("inputs"):
    # word ids: 30 question tokens, 240 paragraph tokens;
    # character ids: 16 per token (30*16 = 480, 240*16 = 3840)
    q_input = tf.placeholder(tf.int32, [None, 30], name="q")
    q_char_input = tf.placeholder(tf.int32, [None, 480], name="q_char")
    c_input = tf.placeholder(tf.int32, [None, 240], name="c")
    c_char_input = tf.placeholder(tf.int32, [None, 3840], name="c_char")
    # n / n_char: the distractor ("noisy") paragraph paired with the gold one
    n_input = tf.placeholder(tf.int32, [None, 240], name="n")
    n_char_input = tf.placeholder(tf.int32, [None, 3840], name="n_char")
    
    # per-sample index of the true start / end position; used below to gather
    # the probability assigned to the true positions
    start_mask = tf.placeholder(tf.int32, [None], name="start_mask")
    end_mask = tf.placeholder(tf.int32, [None], name="end_mask")
    
    batch_size = tf.placeholder(tf.int32, (), name="batch_size")
    # drop probability; dropout layers receive 1-dp as keep probability
    dp = tf.placeholder(tf.float32, (), name="drop_prob")

# Inputs registered for inference: only question, gold paragraph and dropout.
tf.add_to_collection("infer_input", q_input)
tf.add_to_collection("infer_input", q_char_input)
tf.add_to_collection("infer_input", c_input)
tf.add_to_collection("infer_input", c_char_input)
tf.add_to_collection("infer_input", dp)

# Input embedding layer: word embedding (300) concatenated with a max-pooled
# character embedding (150), passed through a shared highway network. The same
# tables and highway weights are used for question, gold and noisy paragraph.
with tf.variable_scope("Input_Embedding_Layer"):
    # input embedding layer
    with tf.variable_scope("W_Embedding"):
        # frozen pretrained vectors, initialized from the loaded `embedding` array
        pretrained_embedding = tf.get_variable("w_embedding",
                                               shape=[72497,300],
                                               initializer=tf.constant_initializer(embedding),
                                               trainable=False)
        # one trainable row and one frozen all-zero row are appended after the
        # pretrained rows (ids 72497 and 72498)
        # NOTE(review): presumably these are the ids of "<UNK>" and "<PAD>" in
        # word_dict -- confirm against word_dict.json
        unknown_embedding = tf.get_variable("unknown",
                                            shape=[1, 300],
                                            initializer=tf.random_uniform_initializer(-0.5,0.5),
                                            trainable=True)
        tf.summary.histogram("unknown_word_embedding", unknown_embedding)
        padding_embedding = tf.get_variable("padding",
                                            shape=[1, 300],
                                            initializer=tf.zeros_initializer(),
                                            trainable=False)
        word_embedding = tf.concat([pretrained_embedding, unknown_embedding, padding_embedding], 0)
        
        q_embed = tf.nn.embedding_lookup(word_embedding, q_input)
        q_embed = tf.nn.dropout(q_embed, 1-dp)
        c_embed = tf.nn.embedding_lookup(word_embedding, c_input)
        c_embed = tf.nn.dropout(c_embed, 1-dp)
        n_embed = tf.nn.embedding_lookup(word_embedding, n_input)
        n_embed = tf.nn.dropout(n_embed, 1-dp)

    with tf.variable_scope("C_Embedding"):
        # 209 trainable character rows followed by one frozen all-zero row (id 209)
        # NOTE(review): presumably id 209 is "<pad>" in char_dict -- confirm
        char_embedding = tf.get_variable("c_embedding",
                                         shape=[209, 150],
                                         initializer=tf.random_uniform_initializer(-0.5,0.5),
                                         trainable=True)
        padding = tf.get_variable("padding",
                                  shape=[1, 150],
                                  initializer=tf.zeros_initializer(),
                                  trainable=False)
        char_combined = tf.concat([char_embedding, padding], 0)
        tf.summary.histogram("character_embedding", char_combined)
        q_char_embed = tf.nn.embedding_lookup(char_combined, q_char_input)
        c_char_embed = tf.nn.embedding_lookup(char_combined, c_char_input)
        n_char_embed = tf.nn.embedding_lookup(char_combined, n_char_input)
        
        # max-pool every group of 16 consecutive characters (one token) into a
        # single vector; character features use half the drop probability
        squeeze_to_word_q = tf.layers.max_pooling1d(q_char_embed, 16, 16)
        squeeze_to_word_q = tf.nn.dropout(squeeze_to_word_q, 1-dp*0.5)
        squeeze_to_word_c = tf.layers.max_pooling1d(c_char_embed, 16, 16)
        squeeze_to_word_c = tf.nn.dropout(squeeze_to_word_c, 1-dp*0.5)
        squeeze_to_word_n = tf.layers.max_pooling1d(n_char_embed, 16, 16)
        squeeze_to_word_n = tf.nn.dropout(squeeze_to_word_n, 1-dp*0.5)
        
    with tf.variable_scope("embedding_output"):
        # concatenate word and character features along the last axis
        q_embed_out = tf.concat([q_embed, squeeze_to_word_q], 2)
        c_embed_out = tf.concat([c_embed, squeeze_to_word_c], 2)
        n_embed_out = tf.concat([n_embed, squeeze_to_word_n], 2)
        # the same scope name makes the three calls share the highway weights
        q_embed_out = highway("highway", q_embed_out)
        c_embed_out = highway("highway", c_embed_out)
        n_embed_out = highway("highway", n_embed_out)

# Embedding encoder layer: one encoder block whose weights are shared by the
# question, the gold paragraph and the noisy paragraph (same scope name).
with tf.variable_scope("Embedding_Encoder_Layer"):
    # embedding encoder layer
    q_encoded = embedding_encoder_block("encoder_block", q_embed_out)
    c_encoded = embedding_encoder_block("encoder_block", c_embed_out)
    n_encoded = embedding_encoder_block("encoder_block", n_embed_out)
    print(q_encoded.shape, c_encoded.shape, n_encoded.shape)
    
# Context-query attention, computed once for the gold paragraph and once for
# the noisy paragraph with the same similarity weight `w`.
with tf.variable_scope("Context_Query_Attention_Layer"):
    # context_query attention layer
    # first compute similarity matrix between context and query
    # S_tj = w * [C_t; Q_j; C_t*Q_j]
    # paragraph encodings are repeated along a new query axis (30 positions) ...
    c_expand = tf.expand_dims(c_encoded, 2)
    c_expand = tf.tile(c_expand, [1,1,30,1])
    
    n_expand = tf.expand_dims(n_encoded, 2)
    n_expand = tf.tile(n_expand, [1,1,30,1])
    
    # ... and the question encoding along a new paragraph axis (240 positions),
    # so every (paragraph position, question position) pair is materialized
    q_expand = tf.expand_dims(q_encoded, 1)
    q_expand = tf.tile(q_expand, [1,240,1,1])
    
    qc_mul = tf.multiply(c_expand, q_expand)
    qn_mul = tf.multiply(n_expand, q_expand)
    
    qc_concat = tf.concat([c_expand,q_expand,qc_mul], 3)
    qn_concat = tf.concat([n_expand,q_expand,qn_mul], 3)
    # 384 = 3 * 128: paragraph, question and element-wise product features
    w = tf.get_variable("s_w", [384,1])
    tf.summary.histogram("S_matrix_weight", w)
    
    A1, B1 = query_context_co_attention(w, qc_concat, c_encoded, q_encoded)
    A2, B2 = query_context_co_attention(w, qn_concat, n_encoded, q_encoded)
    
    # layer output
    G_c = tf.concat([c_encoded, A1, tf.multiply(c_encoded,A1), tf.multiply(c_encoded,B1)],2)
    G_n = tf.concat([n_encoded, A2, tf.multiply(n_encoded,A2), tf.multiply(n_encoded,B2)],2)
    print(G_c.shape, G_n.shape)

# Model encoder layer: the same encoder stack (scope "model_encoder", shared
# weights) is applied three times in a row, for the gold and for the noisy
# paragraph. Only the first application projects its input with a 1x1 convolution.
with tf.variable_scope("Model_Encoder_Layer"):
    # model encoder layer
    model_encoder_c1 = model_encoder_block("model_encoder", G_c, projection=True)
    model_encoder_c2 = model_encoder_block("model_encoder", model_encoder_c1, projection=False)
    model_encoder_c3 = model_encoder_block("model_encoder", model_encoder_c2, projection=False)
    
    model_encoder_n1 = model_encoder_block("model_encoder", G_n, projection=True)
    model_encoder_n2 = model_encoder_block("model_encoder", model_encoder_n1, projection=False)
    model_encoder_n3 = model_encoder_block("model_encoder", model_encoder_n2, projection=False)
    
    print(model_encoder_c1.shape,model_encoder_c2.shape,model_encoder_c3.shape)
    print(model_encoder_n1.shape,model_encoder_n2.shape,model_encoder_n3.shape)

# Counter incremented by the optimizer on every update.
global_step = tf.Variable(0,dtype=tf.int32,trainable=False,name='global_step')

# Output layer with shared normalization: start / end logits of the gold and of
# the noisy paragraph are concatenated and normalized by a single softmax.
with tf.variable_scope("Output_Layer"):
    # output layer
    # p1: start probability sequence
    # p2: end probability sequence
    p1_input_c = tf.concat([model_encoder_c1, model_encoder_c2],2)
    p2_input_c = tf.concat([model_encoder_c2, model_encoder_c3],2)
    p1_input_n = tf.concat([model_encoder_n1, model_encoder_n2],2)
    p2_input_n = tf.concat([model_encoder_n2, model_encoder_n3],2)

    # Both paragraphs must be scored by the SAME projection. With separate
    # weights the model can push every noisy logit down independently of its
    # content, which makes the shared softmax meaningless and teaches the
    # gold-paragraph scorer nothing about rejecting distractors.
    # NOTE: the variables are now named "start_logits" / "end_logits", so
    # checkpoints saved with the previous unnamed layers no longer restore.
    p1_logits_c = tf.squeeze(tf.layers.conv1d(p1_input_c, 1, 1, name="start_logits"),-1)
    p2_logits_c = tf.squeeze(tf.layers.conv1d(p2_input_c, 1, 1, name="end_logits"),-1)
    p1_logits_n = tf.squeeze(tf.layers.conv1d(p1_input_n, 1, 1, name="start_logits", reuse=True),-1)
    p2_logits_n = tf.squeeze(tf.layers.conv1d(p2_input_n, 1, 1, name="end_logits", reuse=True),-1)

    # shared normalization over gold positions followed by noisy positions
    p1_prob = tf.nn.softmax(tf.concat([p1_logits_c, p1_logits_n], 1))
    p2_prob = tf.nn.softmax(tf.concat([p2_logits_c, p2_logits_n], 1))

    # gather, for every sample, the probability of its true start / end
    # position; the small constant keeps the later logarithm finite
    s_pairs = tf.concat([tf.expand_dims(tf.range(batch_size),1), tf.expand_dims(start_mask,1)],1)
    e_pairs = tf.concat([tf.expand_dims(tf.range(batch_size),1), tf.expand_dims(end_mask,1)],1)
    yhat_p1 = tf.add(tf.gather_nd(p1_prob, s_pairs), 1e-15)
    yhat_p2 = tf.add(tf.gather_nd(p2_prob, e_pairs), 1e-15)

# Inference only needs the logits of the paragraph under consideration.
tf.add_to_collection("predictions", p1_logits_c)
tf.add_to_collection("predictions", p2_logits_c)
    
# Loss, optimizer and summaries.
with tf.variable_scope("Optimizer"):
    # add l2 weight decay to every trainable variable whose name does not
    # contain 'bias'
    trainables = tf.trainable_variables()
    loss_l2 = tf.add_n([ tf.nn.l2_loss(v) for v in trainables if 'bias' not in v.name ]) * 3e-7
    # NOTE(review): loss_l2 is a scalar but is logged as a histogram summary
    tf.summary.histogram("l2_loss", loss_l2)
    # negative log-likelihood of the true start and end positions, batch mean
    loss = -tf.reduce_mean(tf.log(yhat_p1) + tf.log(yhat_p2)) + loss_l2
    
    # learning-rate warm-up: grows with log(global_step + 1) and is capped at
    # 0.001, which it reaches once global_step + 1 == 999;
    # gradients are clipped to a global norm of 5.0
    lr = tf.minimum(0.001, 0.001 / tf.log(999.) * tf.log(tf.cast(global_step, tf.float32) + 1))
    optimizer = tf.train.AdamOptimizer(lr, beta1=0.8,epsilon=1e-7)
    gradients, variables = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
    opt_op = optimizer.apply_gradients(zip(gradients, variables), global_step=global_step)
    
    # apply exponential moving average
    # used in inference
    ema = tf.train.ExponentialMovingAverage(decay=0.9999)
    # the averages are refreshed only after the optimizer step has been applied
    with tf.control_dependencies([opt_op]):
        train_step = ema.apply(trainables)

tf.add_to_collection("train_step", train_step)
        
tf.summary.scalar("loss", loss)
# f_measure is a non-trainable scalar variable that exists only to be logged;
# the training loop overrides its value through feed_dict at every step
f_measure = tf.get_variable("f_measure", (), trainable=False)
tf.summary.scalar("f_measure", f_measure)
print(p1_prob.shape, p2_prob.shape)
print(yhat_p1.shape, yhat_p2.shape)

In [None]:
# Training loop: runs until the input pipeline is exhausted, logging summaries
# at every step and saving a checkpoint every 1500 steps and once at the end.
with tf.device("/gpu:0"):

    config = tf.ConfigProto(allow_soft_placement = True)
    with tf.Session(config=config) as sess:

        sess.run(tf.global_variables_initializer())
        sess.run(train_iter.initializer)
        merged = tf.summary.merge_all()
        writer = tf.summary.FileWriter("logs/", sess.graph)
        saver = tf.train.Saver(max_to_keep=3)

        cnt = 0
        f = 0
        step = None  # last global step seen; None until one update has run
        try:
            while True:
                try:
                    next_c, next_c_char, next_q, next_q_char, next_mask, next_n1, next_n_char1, next_n2, next_n_char2, next_n3, next_n_char3 = sess.run(next_batch)
                except tf.errors.OutOfRangeError:
                    # the dataset is exhausted: save the final state and leave
                    # the loop (without the break this handler ran forever)
                    if step is not None:
                        saver.save(sess, "model/strong", global_step=step)
                    break
                cnt += 1

                # answer_idx is stored as [start, end); the model is trained on
                # the inclusive end position
                next_smask = next_mask[:,0]
                next_emask = next_mask[:,1]-1
                # randomly sample a noisy paragraph
                seed = random.randint(1,3)
                if seed == 1:
                    next_n, next_n_char = next_n1, next_n_char1
                elif seed == 2:
                    next_n, next_n_char = next_n2, next_n_char2
                else:
                    next_n, next_n_char = next_n3, next_n_char3

                feed_dict = {q_input: next_q,
                             q_char_input: next_q_char,
                             c_input: next_c,
                             c_char_input: next_c_char,
                             n_input: next_n,
                             n_char_input: next_n_char,
                             start_mask: next_smask,
                             end_mask: next_emask,
                             f_measure:f,
                             batch_size: len(next_c),
                             dp:0.1}

                run_ops = [train_step, p1_logits_c, p2_logits_c, p1_logits_n, p2_logits_n, global_step, merged]

                if cnt % 100 == 99:
                    # every 100th step is additionally traced for the profiler
                    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    _, ps, pe, psn, pen, step, s = sess.run(run_ops, feed_dict=feed_dict, options=run_options, run_metadata=run_metadata)
                    writer.add_run_metadata(run_metadata, "steps{}".format(step), global_step=step)
                    writer.add_summary(s, step)
                else:
                    _, ps, pe, psn, pen, step, s = sess.run(run_ops, feed_dict=feed_dict)
                    writer.add_summary(s, step)

                # refresh the logged F-measure from the current batch
                if cnt % 50 == 0:
                    f = f_score(ps, pe, psn, pen, next_smask, next_emask, next_c)

                if cnt % 1500 == 0:
                    print(cnt)
                    saver.save(sess, "model/strong", global_step=step)
        finally:
            # flush pending summaries even when training is interrupted
            writer.close()
        print("done!")