This notebook evaluates the IR model and the deep learning model together, as a single end-to-end pipeline.

In [None]:
import numpy as np
import tensorflow as tf
import json
import os
import warnings
import random
import pickle
import math
warnings.simplefilter(action='ignore', category=FutureWarning)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [None]:
def _read_json(path):
    """Return the decoded contents of the JSON file at `path`."""
    with open(path) as handle:
        return json.load(handle)


# corpus, development questions and the two vocabularies
doc = _read_json("documents.json")
dev = _read_json("devel.json")
word_dict = _read_json("word_dict.json")
char_dict = _read_json("char_dict.json")

# word-embedding matrix, cast to float32
embedding = np.load("embedding.npy").astype("float32")

First, use two rounds of TF-IDF ranking to choose the _topk-s_ best sentences from the _topk-p_ best paragraphs. Then merge these sentences to form the input paragraph for the deep learning model.

In [None]:
# how many paragraphs, then how many sentences, survive each TF-IDF ranking round
topk_p, topk_s = 3, 5

# tokens dropped from the question before it is used as a retrieval query
stopword = set(stopwords.words('english'))
punc = {'"', '\'', "?", ".", ",", "/", "<", ">", ":", ";"}

In [None]:
def unknown_detection(token_list):
    """Return a copy of `token_list` where every token missing from `word_dict` is replaced by "<UNK>"."""
    return [tok if tok in word_dict else "<UNK>" for tok in token_list]

def generate_char(token_list):
    """Expand each token into exactly 16 character symbols and return them as one flat list.

    A "<PAD>" token becomes sixteen "<pad>" symbols. Any other token is cut to its
    first 16 characters and right-filled with "<pad>". Every resulting symbol that is
    not a key of `char_dict` is replaced by "<unk>".
    """
    flat = []
    for tok in token_list:
        symbols = [] if tok == "<PAD>" else list(tok[:16])
        # right-fill so that every token contributes the same number of symbols
        symbols.extend(["<pad>"] * (16 - len(symbols)))
        flat.extend(sym if sym in char_dict else "<unk>" for sym in symbols)
    assert len(flat) == len(token_list) * 16
    return flat

# Paragraph-level TF-IDF matrices, one per document, cached on disk in tfidfs.pickle.
# NOTE(review): pickle.load can execute arbitrary code stored in the file; only load a
# cache file that this notebook wrote itself.
if os.path.exists("tfidfs.pickle"):
    with open("tfidfs.pickle","rb") as f:
        tfidfs = pickle.load(f)
    tqdm.write("matrices loaded")
else:
    tfidfs = dict()
    for d in doc:
        # a fresh vectorizer per document: every paragraph of d["text"] becomes one row
        tfidf = TfidfVectorizer(tokenizer=word_tokenize,
                                stop_words='english',
                                max_df=0.5,
                                smooth_idf=False,
                                sublinear_tf=True)
        paragraphs = [p.lower() for p in d["text"]]
        # dense (n_paragraphs, vocabulary_size) array
        res = tfidf.fit_transform(paragraphs).toarray()
        # term -> column index of `res`
        mapping = tfidf.vocabulary_
        tfidfs[d["docid"]] = [res, mapping]
    with open("tfidfs.pickle","wb") as f:
        pickle.dump(tfidfs, f)
    tqdm.write("matrices building complete")

# Build `padded_dev`: for every dev sample, retrieve a context of at most 240 tokens with
# two rounds of TF-IDF ranking and attach the padded question and the tokenized answer.
# NOTE(review): topk_p / topk_s are assigned again here with the same values as in the
# configuration cell above; the two definitions have to be kept in sync by hand.
topk_p = 3
topk_s = 5

padded_dev = []
for sample in tqdm(dev):
    new_sample = dict()
    
    docid = sample["docid"]
    # NOTE(review): the answer is not lower-cased, unlike the question and the context —
    # confirm this matches the casing of the keys in word_dict.
    answer = word_tokenize(sample["text"])

    # retrieval query: lower-cased question without stopwords and punctuation
    question = word_tokenize(sample["question"].lower().strip())
    rmed = []
    for token in question:
        if token not in stopword and token not in punc:
            rmed.append(token)
    question = rmed
    
    res, mapping = tfidfs[docid]
    # set accumulator for each paragraph
    a_d = [0 for _ in range(res.shape[0])]
    # score of a paragraph = sum of the TF-IDF weights of the query tokens it knows
    for token in question:
        for i in range(len(a_d)):
            if token in mapping:
                a_d[i] += res[i, mapping[token]]

    # indices of the k best-scoring paragraphs (argpartition leaves them unordered)
    k = topk_p if res.shape[0] > topk_p else res.shape[0]
    pred = np.argpartition(a_d, -k)[-k:]
    pred = set(pred)
    combined = []
    for idx in pred:
        # NOTE(review): `doc` is indexed by position here while `tfidfs` is keyed by
        # d["docid"]; this assumes docid equals the document's position in `doc` — confirm.
        sents = sent_tokenize(doc[docid]["text"][idx])
        for s in sents:
            combined.append(s.lower())

    # rank sentences in combined sents
    tfidf = TfidfVectorizer(smooth_idf=False,
                            sublinear_tf=True,
                            tokenizer=word_tokenize)
    array = tfidf.fit_transform(combined).toarray()
    mapping = tfidf.vocabulary_

    a_d = np.zeros(len(combined))
    for token in question:
        for i in range(len(a_d)):
            if token in mapping:
                a_d[i] += array[i, mapping[token]]
    # return top k results
    k = topk_s if len(combined) > topk_s else len(combined)
    pred = np.argpartition(a_d, -k)[-k:]
    # sort ascending by score so that pop() below yields the best sentence first
    pred = pred[np.argsort(a_d[pred])].tolist()
    
    # concatenate sentences, best first; each sentence is cut to 80 tokens and is
    # skipped entirely if adding it would push the context past 240 tokens
    para = []
    while len(para) < 240 and len(pred) > 0:
        idx = pred.pop()
        sent = word_tokenize(combined[idx])[:80]
        l = len(sent)
        if len(para) + l <= 240:
            para += sent
    
    # characters are taken from the raw tokens, before unknown words become "<UNK>"
    content_char = generate_char(para)
    content = unknown_detection(para)
        
    # the model input keeps the full question (stopwords included), cut/padded to 30 tokens
    padded_question = word_tokenize(sample["question"].lower())[:30]
    while len(padded_question) < 30:
        padded_question.append("<PAD>")
    question_char = generate_char(padded_question)
    padded_question = unknown_detection(padded_question)
    
    new_sample["question"] = padded_question
    new_sample["q_char"] = question_char
    new_sample["content"] = content
    new_sample["c_char"] = content_char
    new_sample["answer"] = answer
    
    # invariants relied on by the fixed-size model inputs (16 characters per token)
    assert len(padded_question) == 30
    assert len(question_char) == 480
    assert len(content) <= 240
    assert len(content_char) <= 3840
    assert len(content_char) == len(content) * 16
    
    padded_dev.append(new_sample)

In [None]:
def generate_input_data(padded):
    """Convert preprocessed samples into id arrays ready to be fed to the model.

    Args:
        padded: iterable of dicts with the keys "question", "content", "q_char",
            "c_char" (lists of symbols) and "answer" (list of tokens).

    Returns:
        A tuple (c, c_chars, q, q_chars, answer):
        c, c_chars -- context word ids / character ids, post-padded to length 240 / 3840
            with the "<PAD>" / "<pad>" id;
        q, q_chars -- question word ids / character ids, converted as they are (the
            question is expected to be padded already);
        answer -- the untouched answer token lists, in sample order.

    Raises:
        KeyError: if a symbol is missing from `word_dict` / `char_dict`.
    """
    c, c_chars, q, q_chars, answer = [], [], [], [], []

    for sample in tqdm(padded):
        q_mapped = [word_dict[t] for t in sample["question"]]
        c_mapped = [word_dict[t] for t in sample["content"]]
        q_char_mapped = [char_dict[ch] for ch in sample["q_char"]]
        c_char_mapped = [char_dict[ch] for ch in sample["c_char"]]

        # contexts have variable length; bring them to the fixed placeholder sizes
        c_mapped = tf.keras.preprocessing.sequence.pad_sequences(
            [c_mapped], maxlen=240, padding="post", value=word_dict["<PAD>"])[0]
        c_char_mapped = tf.keras.preprocessing.sequence.pad_sequences(
            [c_char_mapped], maxlen=3840, padding="post", value=char_dict["<pad>"])[0]

        c.append(c_mapped)
        q.append(q_mapped)
        c_chars.append(c_char_mapped)
        q_chars.append(q_char_mapped)
        answer.append(sample["answer"])

    return np.array(c), np.array(c_chars), np.array(q), np.array(q_chars), answer

In [None]:
# id arrays for the whole dev set; `answer` keeps the raw answer tokens for scoring
c, c_char, q, q_char, answer = generate_input_data(padded_dev)

In [None]:
# sanity check: the second dimensions should match the model placeholders
# (240 / 3840 for the context, 30 / 480 for the question)
print(c.shape, c_char.shape)
print(q.shape, q_char.shape)
print(len(answer))

Define a generator that feeds batches into the DL model.

In [None]:
# sample order shared by all dev_batch() generators; reshuffled in place on each run
train_indices = np.arange(len(c))


def dev_batch(batch=16):
    """Yield the dev set once, in random order, as batches of at most `batch` samples.

    Each item is (context ids, context char ids, question ids, question char ids,
    answers); the last batch may be smaller.
    """
    np.random.shuffle(train_indices)
    n_batches = int(math.ceil(len(c)/batch))
    for step in range(n_batches):
        lo = (step*batch)%len(c)
        picked = train_indices[lo:lo+batch]
        answers = [answer[j] for j in picked]
        yield c[picked], c_char[picked], q[picked], q_char[picked], answers

Define the functions used to calculate the F1 score.

In [None]:
# NOTE(review): this shadows the identical unknown_detection defined in the
# preprocessing cell; keep a single definition if the two are ever changed.
def unknown_detection(token_list):
    """Return a copy of `token_list` where every token missing from `word_dict` is replaced by "<UNK>"."""
    return [tok if tok in word_dict else "<UNK>" for tok in token_list]

def f_dev(pred_s, pred_e, a, context):
    """Compute the average token-level F-measure of a batch.

    Args:
        pred_s: per-sample start-position probability sequences.
        pred_e: per-sample end-position probability sequences.
        a: per-sample reference answers as token lists.
        context: per-sample context word ids; the predicted span is cut from here.

    Returns:
        The mean F-measure over the batch. Samples whose predicted span is empty or
        whose reference answer has no tokens contribute 0. An empty batch yields 0.0
        instead of raising ZeroDivisionError.
    """
    l = len(pred_s)
    if l == 0:
        return 0.0
    f_sum = 0
    for i in range(l):
        pair, _ = prob_dp(pred_s[i], pred_e[i])
        s_i, e_i = pair
        if e_i < s_i:
            continue
        guess = list(context[i][s_i:e_i+1])
        # reference answer in id space; out-of-vocabulary tokens map to the <UNK> id
        true = [word_dict[t] for t in unknown_detection(a[i])]
        if not guess or not true:
            # precision or recall would divide by zero; the sample scores 0
            continue
        guess_ids, true_ids = set(guess), set(true)
        # counts are taken per token occurrence, membership is tested on the sets
        TP = sum(1 for token in guess if token in true_ids)
        FP = len(guess) - TP
        FN = sum(1 for token in true if token not in guess_ids)
        precision = TP/(TP+FP)
        recall = TP/(TP+FN)
        # the epsilon keeps the score at 0 when there is no overlap at all
        f = 2*precision*recall/(precision+recall+1e-8)
        f_sum += f
    return f_sum/l

def prob_dp(set1,set2):
    """Find the span (start, end) with start <= end that maximises set1[start] * set2[end].

    One left-to-right pass keeps the best start probability seen so far and combines it
    with every end probability. Returns ([start, end], product). Ties keep the earliest
    pair. If no product is greater than 0 the pair stays None and the final assertion
    fails with a TypeError.
    """
    assert len(set1) == len(set2)
    best_start_p, best_start = 0, 0
    best_pair, best_joint = None, 0
    for end, (p_start, p_end) in enumerate(zip(set1, set2)):
        if p_start > best_start_p:
            best_start_p, best_start = p_start, end
        joint = best_start_p * p_end
        if joint > best_joint:
            best_joint, best_pair = joint, [best_start, end]
    assert best_pair[0] <= best_pair[1]
    return best_pair, best_joint

Then we build the same computation graph so that we can load weights in.

In [None]:
# start from an empty default graph before the model is (re)built in this cell
tf.reset_default_graph()

def embedding_encoder_block(scope, inputs):
    """Build one embedding-encoder block: position encoding, a projection to 128
    channels, four convolution sub-layers (kernel size 7), one self-attention sub-layer
    and one feed-forward sub-layer. Every sub-layer is layer-normalised on its input and
    wrapped in a residual connection.

    Args:
        scope: name of the variable scope; it is opened with reuse=tf.AUTO_REUSE.
        inputs: rank-3 tensor whose static shape position_encoding unpacks as
            (batch, sequence_length, depth).

    Returns:
        Tensor with 128 channels in the last dimension.

    Reads the module-level placeholder `dp`; tf.nn.dropout is given 1-dp as its
    second argument.
    """
    with tf.variable_scope(scope,reuse=tf.AUTO_REUSE):
        # first encode input with position info
        pos_encoded = position_encoding(inputs)
        # project input to dimension 128
        residual1 = tf.layers.separable_conv1d(pos_encoded, 128, 1, padding="same",activation=tf.nn.relu)
        
        #convolution block
        norm1 = tf.contrib.layers.layer_norm(residual1)
        norm1 = tf.nn.dropout(norm1, 1-dp)
        conv1 = tf.layers.separable_conv1d(norm1, 128, 7, padding="same",activation=tf.nn.relu)
        conv1 = tf.nn.dropout(conv1, 1-dp)
        residual2 = tf.add(residual1, conv1)
        
        norm2 = tf.contrib.layers.layer_norm(residual2)
        norm2 = tf.nn.dropout(norm2, 1-dp)
        conv2 = tf.layers.separable_conv1d(norm2, 128, 7, padding="same",activation=tf.nn.relu)
        conv2 = tf.nn.dropout(conv2, 1-dp)
        residual3 = tf.add(residual2, conv2)
        
        norm3 = tf.contrib.layers.layer_norm(residual3)
        norm3 = tf.nn.dropout(norm3, 1-dp)
        conv3 = tf.layers.separable_conv1d(norm3, 128, 7, padding="same",activation=tf.nn.relu)
        conv3 = tf.nn.dropout(conv3, 1-dp)
        residual4 = tf.add(residual3, conv3)
        
        norm4 = tf.contrib.layers.layer_norm(residual4)
        norm4 = tf.nn.dropout(norm4, 1-dp)
        conv4 = tf.layers.separable_conv1d(norm4, 128, 7, padding="same",activation=tf.nn.relu)
        conv4 = tf.nn.dropout(conv4, 1-dp)
        residual5 = tf.add(residual4, conv4)
        
        # self-attention block
        norm5 = tf.contrib.layers.layer_norm(residual5)
        norm5 = tf.nn.dropout(norm5, 1-dp)
        attention_out = multihead_self_attention(norm5, "self_attention")
        attention_out = tf.nn.dropout(attention_out, 1-dp)
        residual6 = tf.add(residual5, attention_out)
        
        # feed-forward layer: two kernel-size-1 convolutions, so the sequence length
        # is preserved even without padding="same"
        norm6 = tf.contrib.layers.layer_norm(residual6)
        norm6 = tf.nn.dropout(norm6, 1-dp)
        ffn1 = tf.layers.separable_conv1d(norm6, 128, 1, activation=tf.nn.relu)
        ffn1 = tf.nn.dropout(ffn1, 1-dp)
        ffn2 = tf.layers.separable_conv1d(ffn1, 128, 1)
        ffn2 = tf.nn.dropout(ffn2, 1-dp)
        residual7 = tf.add(residual6, ffn2)
    return residual7

def model_encoder_block(scope, inputs, projection=False):
    """Build a stack of 7 encoder blocks. Each block has two convolution sub-layers
    (kernel size 5), one self-attention sub-layer and one feed-forward sub-layer, all
    with layer normalisation on the input and a residual connection.

    Args:
        scope: name of the variable scope; it is opened with reuse=tf.AUTO_REUSE.
        inputs: rank-3 tensor (batch, sequence_length, depth) with static
            sequence_length and depth.
        projection: if True, first project `inputs` to 128 channels with a
            kernel-size-1 convolution. If False, `inputs` is used as is and therefore
            has to be compatible with the 128-channel residual additions.

    Returns:
        Tensor with 128 channels in the last dimension.

    Reads the module-level placeholder `dp`; tf.nn.dropout is given 1-dp as its
    second argument.
    """
    with tf.variable_scope(scope,reuse=tf.AUTO_REUSE):
        inputs = position_encoding(inputs)
        if projection:
            outputs = tf.layers.separable_conv1d(inputs, 128, 1, padding="same", activation=tf.nn.relu)
        else:
            outputs = inputs
        for i in range(7):
            # each of the 7 blocks gets its own, index-suffixed variable scopes
            with tf.variable_scope("conv_block{}".format(i),reuse=tf.AUTO_REUSE):
                norm0 = tf.contrib.layers.layer_norm(outputs)
                norm0 = tf.nn.dropout(norm0, 1-dp)
                conv0 = tf.layers.separable_conv1d(norm0, 128, 5, padding="same", activation=tf.nn.relu)
                conv0 = tf.nn.dropout(conv0, 1-dp)
                residual0 = tf.add(outputs, conv0)
                
                norm1 = tf.contrib.layers.layer_norm(residual0)
                norm1 = tf.nn.dropout(norm1, 1-dp)
                conv1 = tf.layers.separable_conv1d(norm1, 128, 5, padding="same", activation=tf.nn.relu)
                conv1 = tf.nn.dropout(conv1, 1-dp)
                residual1 = tf.add(residual0, conv1)
            
            with tf.variable_scope("self_attention{}".format(i),reuse=tf.AUTO_REUSE):
                norm2 = tf.contrib.layers.layer_norm(residual1)
                norm2 = tf.nn.dropout(norm2, 1-dp)
                attention_out = multihead_self_attention(norm2, "self_attention")
                attention_out = tf.nn.dropout(attention_out, 1-dp)
                residual2 = tf.add(residual1, attention_out)
            
            with tf.variable_scope("feedforward{}".format(i),reuse=tf.AUTO_REUSE):
                norm3 = tf.contrib.layers.layer_norm(residual2)
                norm3 = tf.nn.dropout(norm3, 1-dp)
                ffn1 = tf.layers.separable_conv1d(norm3, 128, 1, activation=tf.nn.relu)
                ffn1 = tf.nn.dropout(ffn1, 1-dp)
                ffn2 = tf.layers.separable_conv1d(ffn1, 128, 1)
                ffn2 = tf.nn.dropout(ffn2, 1-dp)
                # the block output becomes the input of the next iteration
                outputs = tf.add(residual2, ffn2)
    return outputs

def highway(scope, inputs):
    """Apply a two-layer highway network that keeps the channel count of `inputs`.

    Each layer computes T * H + x * (1 - T), where T is a sigmoid gate and H a ReLU
    transform (with dropout) of the layer input x, both produced by kernel-size-1
    separable convolutions.

    Args:
        scope: name of the variable scope; it is opened with reuse=tf.AUTO_REUSE.
        inputs: tensor whose last dimension is statically known.

    Returns:
        Tensor with the same number of channels as `inputs`.
    """
    # two layer highway network
    size = inputs.shape.as_list()[-1]
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # the gate bias starts at -1, so T starts below 0.5 and the layer initially
        # favours passing its input through
        T1 = tf.layers.separable_conv1d(inputs, size, 1, activation=tf.nn.sigmoid, bias_initializer=tf.constant_initializer(-1))
        H1 = tf.layers.separable_conv1d(inputs, size, 1, activation=tf.nn.relu)
        H1 = tf.nn.dropout(H1, 1-dp)
        highway1 = T1 * H1 + inputs * (1.0 - T1)
        
        T2 = tf.layers.separable_conv1d(highway1, size, 1, activation=tf.nn.sigmoid, bias_initializer=tf.constant_initializer(-1))
        H2 = tf.layers.separable_conv1d(highway1, size, 1, activation=tf.nn.relu)
        H2 = tf.nn.dropout(H2, 1-dp)
        highway2 = T2 * H2 + highway1 * (1.0 - T2)
    return highway2

def multihead_self_attention(inputs, scope, heads=8):
    """Scaled dot-product self-attention with `heads` heads.

    Args:
        inputs: tensor of shape [batch, sequence_length, depth]; depth must be
            statically known and divisible by `heads` (tf.split requires it).
        scope: name of the variable scope holding WQ, WK and WV; it is opened with
            reuse=tf.AUTO_REUSE.
        heads: number of attention heads.

    Returns:
        Tensor of shape [batch, sequence_length, depth]: the per-head outputs
        concatenated along the last axis. No attention mask and no output projection
        are applied.
    """
    # restricted multi-head self-attention
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        depth = inputs.get_shape().as_list()[-1]
        WQ = tf.get_variable("WQ", [depth, depth])
        WK = tf.get_variable("WK", [depth, depth])
        WV = tf.get_variable("WV", [depth, depth])
        
        Q = tf.einsum("bsd,dw->bsw", inputs, WQ)
        K = tf.einsum("bsd,dw->bsw", inputs, WK)
        V = tf.einsum("bsd,dw->bsw", inputs, WV) # [batch, sequence_length, depth]
        
        # split shape for vectorization
        Q_ = tf.concat(tf.split(Q, heads, axis=2), axis=0)
        K_ = tf.concat(tf.split(K, heads, axis=2), axis=0)
        V_ = tf.concat(tf.split(V, heads, axis=2), axis=0) # [8 * batch, sequence_length, depth / 8]
        
        attention_logits = tf.matmul(Q_, K_, transpose_b=True)
        # per-head dimension, used to scale the logits by 1/sqrt(dk)
        dk = depth / heads
        scaled = tf.divide(attention_logits, tf.sqrt(dk))
        attention = tf.nn.softmax(scaled, axis=-1)
        attention_out = tf.matmul(attention, V_)
        
        # retrieve shape
        attention_out = tf.concat(tf.split(attention_out, heads, axis=0), axis=2)
    return attention_out

def position_encoding(inputs):
    """
    Add a fixed sinusoidal position encoding to `inputs`
    (scheme attributed to "One Model To Learn Them All").
    -- input: [None, sequence_length, depth]
    -- output: [None, sequence_length, depth]

    The table is computed with NumPy while the graph is built, so sequence_length and
    depth have to be statically known. Even channels use sin, odd channels use cos.

    NOTE(review): the angle is pos * (1e-4) ** (-(i//2)*2/depth), which equals
    pos * 10000 ** (2*(i//2)/depth); the usual formulation divides pos by that factor
    instead. Left unchanged here because the restored weights were presumably trained
    with this exact encoding — confirm before changing.
    """
    _, seq_length, depth = inputs.get_shape().as_list()
    pos_encoding = np.array([
        [pos * np.power(1e-4, -(i//2)*2/depth) for i in range(depth)]
        for pos in range(seq_length)])
    pos_encoding[:, 0::2] = np.sin(pos_encoding[:, 0::2])
    pos_encoding[:, 1::2] = np.cos(pos_encoding[:, 1::2])
    pos_encoding = tf.convert_to_tensor(pos_encoding, tf.float32)
    # the [sequence_length, depth] table is broadcast over the batch dimension
    return inputs+pos_encoding

def query_context_co_attention(w, inputs, context, query):
    """
    Compute context-to-query and query-to-context attention.

    input:
        w: similarity function weight, applied to the last axis of `inputs`
        inputs: concatenated [c, q, c*q] features for every (context, query)
            position pair, rank 4: [batch, context_len, query_len, features]
        context: encoded context, [batch, context_len, depth]
        query: encoded query, [batch, query_len, depth]
    output:
        A: context-to-query attention
        B: query-to-context attention
    """
    # similarity matrix S (logits)
    S = tf.einsum("abcde,ef->abcdf", tf.expand_dims(inputs,3),w)
    # drop the two trailing axes -> [batch, context_len, query_len]
    # (requires the last axis of w to be 1)
    S = tf.squeeze(S,[-2,-1])
    # S_: softmax over rows
    S_ = tf.nn.softmax(S)
    # S__T: transpose of softmax over columns
    S__T = tf.transpose(tf.nn.softmax(S, axis=1),[0,2,1])
    # context_query attention
    A = tf.matmul(S_, query)
    # query_context attention
    B = tf.matmul(tf.matmul(S_, S__T), context)
    return A, B

# Model inputs. Word inputs hold one id per token, character inputs hold 16 ids per
# token (30 * 16 = 480 for the question, 240 * 16 = 3840 for the context).
with tf.name_scope("inputs"):
    q_input = tf.placeholder(tf.int32, [None, 30], name="q")
    q_char_input = tf.placeholder(tf.int32, [None, 480], name="q_char")
    c_input = tf.placeholder(tf.int32, [None, 240], name="c")
    c_char_input = tf.placeholder(tf.int32, [None, 3840], name="c_char")
    
    # positions used to gather the probabilities that enter the loss
    start_mask = tf.placeholder(tf.int32, [None], name="start_mask")
    end_mask = tf.placeholder(tf.int32, [None], name="end_mask")
    
    batch_size = tf.placeholder(tf.int32, (), name="batch_size")
    # drop probability; every tf.nn.dropout call in this graph receives 1-dp (or 1-dp*0.5)
    dp = tf.placeholder(tf.float32, (), name="drop_prob")

# the placeholders needed to run inference are exposed through a collection
tf.add_to_collection("infer_input", q_input)
tf.add_to_collection("infer_input", q_char_input)
tf.add_to_collection("infer_input", c_input)
tf.add_to_collection("infer_input", c_char_input)
tf.add_to_collection("infer_input", dp)

with tf.variable_scope("Input_Embedding_Layer"):
    # input embedding layer
    with tf.variable_scope("W_Embedding"):
        # frozen pretrained vectors, initialised from the `embedding` array
        pretrained_embedding = tf.get_variable("w_embedding",
                                               shape=[72497,300],
                                               initializer=tf.constant_initializer(embedding),
                                               trainable=False)
        unknown_embedding = tf.get_variable("unknown",
                                            shape=[1, 300],
                                            initializer=tf.random_uniform_initializer(-0.5,0.5),
                                            trainable=True)
        tf.summary.histogram("unknown_word_embedding", unknown_embedding)
        padding_embedding = tf.get_variable("padding",
                                            shape=[1, 300],
                                            initializer=tf.zeros_initializer(),
                                            trainable=False)
        # rows 0..72496 pretrained, row 72497 unknown, row 72498 padding
        # NOTE(review): assumes word_dict assigns these ids to "<UNK>" / "<PAD>" — confirm
        word_embedding = tf.concat([pretrained_embedding, unknown_embedding, padding_embedding], 0)
        
        q_embed = tf.nn.embedding_lookup(word_embedding, q_input)
        q_embed = tf.nn.dropout(q_embed, 1-dp)
        c_embed = tf.nn.embedding_lookup(word_embedding, c_input)
        c_embed = tf.nn.dropout(c_embed, 1-dp)

    with tf.variable_scope("C_Embedding"):
        char_embedding = tf.get_variable("c_embedding",
                                         shape=[209, 200],
                                         initializer=tf.random_uniform_initializer(-0.5,0.5),
                                         trainable=True)
        padding = tf.get_variable("padding",
                                  shape=[1, 200],
                                  initializer=tf.zeros_initializer(),
                                  trainable=False)
        # rows 0..208 trainable characters, row 209 the fixed all-zero padding vector
        char_combined = tf.concat([char_embedding, padding], 0)
        tf.summary.histogram("character_embedding", char_combined)
        q_char_embed = tf.nn.embedding_lookup(char_combined, q_char_input)
        c_char_embed = tf.nn.embedding_lookup(char_combined, c_char_input)
        
        # pool size 16 with stride 16: one max-pooled 200-d vector per 16-character token
        squeeze_to_word_q = tf.layers.max_pooling1d(q_char_embed, 16, 16)
        squeeze_to_word_q = tf.nn.dropout(squeeze_to_word_q, 1-dp*0.5)
        squeeze_to_word_c = tf.layers.max_pooling1d(c_char_embed, 16, 16)
        squeeze_to_word_c = tf.nn.dropout(squeeze_to_word_c, 1-dp*0.5)
        
    with tf.variable_scope("embedding_output"):
        # word (300) and character (200) features are concatenated per token
        q_embed_out = tf.concat([q_embed, squeeze_to_word_q], 2)
        c_embed_out = tf.concat([c_embed, squeeze_to_word_c], 2)
        # same scope name for both calls, presumably so question and context share
        # the highway weights through AUTO_REUSE
        q_embed_out = highway("highway", q_embed_out)
        c_embed_out = highway("highway", c_embed_out)

with tf.variable_scope("Embedding_Encoder_Layer"):
    # embedding encoder layer
    # both calls use the scope "encoder_block", presumably so question and context
    # are encoded with shared weights through AUTO_REUSE
    q_encoded = embedding_encoder_block("encoder_block", q_embed_out)
    c_encoded = embedding_encoder_block("encoder_block", c_embed_out)
    print("Embedding Encoder Layer output shape:", q_encoded.shape, c_encoded.shape)
    
with tf.variable_scope("Context_Query_Attention_Layer"):
    # context_query attention layer
    # first compute similarity matrix between context and query
    # S_tj = w * [C_t; Q_j; C_t*Q_j]
    # tile both encodings to [batch, 240, 30, depth] so every (context, query)
    # position pair can be combined element-wise
    c_expand = tf.expand_dims(c_encoded, 2)
    c_expand = tf.tile(c_expand, [1,1,30,1])
    
    q_expand = tf.expand_dims(q_encoded, 1)
    q_expand = tf.tile(q_expand, [1,240,1,1])
    
    qc_mul = tf.multiply(c_expand, q_expand)
    
    # 3 * 128 = 384 features per pair, matching the first dimension of the weight below
    qc_concat = tf.concat([c_expand,q_expand,qc_mul], 3)
    w = tf.get_variable("s_w", [384,1])
    tf.summary.histogram("S_matrix_weight", w)
    
    A, B = query_context_co_attention(w, qc_concat, c_encoded, q_encoded)
    
    # layer output
    G = tf.concat([c_encoded, A, tf.multiply(c_encoded,A), tf.multiply(c_encoded,B)],2)
    print("Co-Attention Layer output shape:", G.shape)

with tf.variable_scope("Model_Encoder_Layer"):
    # model encoder layer
    # the same scope name is passed three times, presumably so the stack's weights are
    # shared through AUTO_REUSE; only the first pass projects its input to 128 channels
    model_encoder1 = model_encoder_block("model_encoder", G, projection=True)
    model_encoder2 = model_encoder_block("model_encoder", model_encoder1, projection=False)
    model_encoder3 = model_encoder_block("model_encoder", model_encoder2, projection=False)
    
    print("Model Encoder Layer output shape:",model_encoder1.shape,model_encoder2.shape,model_encoder3.shape)

with tf.variable_scope("Output_Layer"):
    # output layer
    # p1: start probability sequence
    # p2: end probability sequence
    p1_input = tf.concat([model_encoder1, model_encoder2],2)
    p2_input = tf.concat([model_encoder2, model_encoder3],2)
    
    # one logit per context position
    p1_logits = tf.squeeze(tf.layers.separable_conv1d(p1_input, 1, 1),-1)
    p2_logits = tf.squeeze(tf.layers.separable_conv1d(p2_input, 1, 1),-1)
    
    p1_prob = tf.nn.softmax(p1_logits)
    p2_prob = tf.nn.softmax(p2_logits)
    
    # (row, position) index pairs that pick, for every sample in the batch, the
    # probability at the position given by start_mask / end_mask
    s_pairs = tf.concat([tf.expand_dims(tf.range(batch_size),1), tf.expand_dims(start_mask,1)],1)
    e_pairs = tf.concat([tf.expand_dims(tf.range(batch_size),1), tf.expand_dims(end_mask,1)],1)
    # the small constant keeps the logarithm in the loss finite
    yhat_p1 = tf.add(tf.gather_nd(p1_prob, s_pairs), 1e-15)
    yhat_p2 = tf.add(tf.gather_nd(p2_prob, e_pairs), 1e-15)

# the evaluation cell fetches the two probability tensors from this collection
tf.add_to_collection("predictions", p1_prob)
tf.add_to_collection("predictions", p2_prob)

# The training ops are rebuilt here only so that the graph, including the EMA shadow
# variables restored in the evaluation cell, matches the saved checkpoint.
global_step = tf.Variable(0,dtype=tf.int32,trainable=False,name='global_step')
    
with tf.variable_scope("Optimizer"):
    
    # add l2 weight decay to all variables
    # (variables with 'bias' in their name are excluded)
    trainables = tf.trainable_variables()
    loss_l2 = tf.add_n([ tf.nn.l2_loss(v) for v in trainables if 'bias' not in v.name ]) * 3e-7
    tf.summary.histogram("l2_loss", loss_l2)
    # negative log-likelihood of the gathered start and end probabilities plus weight decay
    loss = -tf.reduce_mean(tf.log(yhat_p1) + tf.log(yhat_p2)) + loss_l2
    
    # perform cold warm up and gradient clipping
    # the rate grows with log(step + 1) and is capped at 0.001, reached at step 998
    lr = tf.minimum(0.001, 0.001 / tf.log(999.) * tf.log(tf.cast(global_step, tf.float32) + 1))
    optimizer = tf.train.AdamOptimizer(lr, beta1=0.8,epsilon=1e-7)
    gradients, variables = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
    opt_op = optimizer.apply_gradients(zip(gradients, variables), global_step=global_step)
    
    # apply exponential moving average
    # used in inference
    ema = tf.train.ExponentialMovingAverage(decay=0.9999)
    # the averages are updated only after the optimizer step has run
    with tf.control_dependencies([opt_op]):
        train_step = ema.apply(trainables)

tf.add_to_collection("train_step", train_step)
        
tf.summary.scalar("loss", loss)
# non-trainable scalars that exist only to be written to the summaries
f_measure_train = tf.get_variable("f_train", (), trainable=False)
f_measure_dev = tf.get_variable("f_dev", (), trainable=False)
tf.summary.scalar("f_train", f_measure_train)
tf.summary.scalar("f_dev", f_measure_dev)
print("Prob distribution shape:", p1_prob.shape, p2_prob.shape)

During inference, we load the shadow variables (the exponential moving averages of the trained weights).

In [None]:
# Restore the EMA weights from ./model and score the whole dev set.
with tf.device("/gpu:0"):
    # soft placement lets ops fall back to another device when the GPU cannot run them
    config = tf.ConfigProto(allow_soft_placement = True)

    with tf.Session(config=config) as sess:
        # maps every model variable to the name of its moving average in the
        # checkpoint, so the averaged values are what gets loaded
        variables_to_restore = ema.variables_to_restore()
        ckpt = tf.train.get_checkpoint_state(os.path.dirname('./model/checkpoint'))
        saver = tf.train.Saver(variables_to_restore)
        if not (ckpt and ckpt.model_checkpoint_path):
            # without restored weights the first sess.run would fail on uninitialised
            # variables; stop here with a clear message instead
            raise RuntimeError("no checkpoint found in ./model")
        saver.restore(sess, ckpt.model_checkpoint_path)

        s_idx, e_idx = tf.get_collection("predictions")
        f_list = []          # per-batch mean F-measure, in evaluation order
        f_weighted_sum = 0.0  # sum of per-sample F-measures over all batches
        n_samples = 0

        for c_d, c_char_d, q_d, q_char_d, ans_d in dev_batch():
            # dp = 0 switches dropout off for evaluation
            feed_dict={q_input:q_d,
                         q_char_input: q_char_d,
                         c_input: c_d,
                         c_char_input: c_char_d,
                         dp: 0}

            pred_s, pred_e = sess.run([s_idx, e_idx], feed_dict=feed_dict)
            f = f_dev(pred_s, pred_e, ans_d, c_d)

            print(f)
            f_list.append(f)
            # weight by batch size: the last batch may be smaller, and averaging the
            # batch means directly would over-weight its samples
            f_weighted_sum += f * len(ans_d)
            n_samples += len(ans_d)

print("Done!")
print(f_weighted_sum/n_samples if n_samples else 0.0)