In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import json
import pickle
import tensorflow as tf
import os
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import OrderedDict
%cd "/home/linrong/QA/v100-run2"

/home/linrong/QA/v100-run2


In [2]:
with open("/home/linrong/floyd/qa/documents.json") as f:
    doc = json.load(f)
with open("/home/linrong/Downloads/project_files/devel.json") as f:
    dev = json.load(f)
with open("/home/linrong/floyd/qa/mapping.json") as f:
    word_dict = json.load(f)
with open("/home/linrong/floyd/qa/char_dict.json") as f:
    char_dict = json.load(f)
embedding = np.load("/home/linrong/floyd/qa/embedding.npy")

In [16]:
with open("/home/linrong/floyd/qa/training.json") as f:
    train = json.load(f)

In [3]:
topk_p = 3
topk_s = 3
stopword = set(stopwords.words('english'))
punc = set(['"','\'',"?",".",",","/","<",">",":",";"])

In [100]:
def unknown_detection(token_list):
    new_list = []
    for token in token_list:
        if token in word_dict:
            new_list.append(token)
        else:
            new_list.append("<UNK>")
    return new_list

def unknown_detection_char(char_list):
    for i in range(len(char_list)):
        if char_list[i] not in char_dict:
            char_list[i] = "<unk>"
    return char_list

def generate_char(token_list):
    new_list = []
    for token in token_list:
        if token == "<PAD>":
            char_list = ["<pad>"]*16
        else:
            char_list = [c for c in token[:16]]
        while len(char_list) < 16:
            char_list.append("<pad>")
        for char in char_list:
            if char in char_dict:
                new_list.append(char)
            else:
                new_list.append("<unk>")
    return new_list

if os.path.exists("tfidfs.pickle"):
    with open("tfidfs.pickle","rb") as f:
        tfidfs = pickle.load(f)
    tqdm.write("matrices loaded")
else:
    tfidfs = dict()
    for d in doc:
        tfidf = TfidfVectorizer(tokenizer=word_tokenize,
                                stop_words='english',
                                max_df=0.5,
                                smooth_idf=False,
                                sublinear_tf=True)
        paragraphs = [p.lower() for p in d["text"]]
        res = tfidf.fit_transform(paragraphs).toarray()
        mapping = tfidf.vocabulary_
        tfidfs[d["docid"]] = [res, mapping]
    with open("tfidfs.pickle","wb") as f:
        pickle.dump(tfidfs, f)
    tqdm.write("matrices building complete")

reproduce = False
c = 0
padded = []
for sample in tqdm(dev):
    new_sample = dict()
    
    docid = sample["docid"]
    answer = word_tokenize(sample["text"])[:7]

    question = word_tokenize(sample["question"].lower().strip())
    rmed = []
    for token in question:
        if token not in stopword and token not in punc:
            rmed.append(token)
    question = rmed
    
    if not reproduce:
        res, mapping = tfidfs[docid]
        # set accumulator for each paragraph
        a_d = [0 for _ in range(res.shape[0])]
        for token in question:
            for i in range(len(a_d)):
                if token in mapping:
                    a_d[i] += res[i, mapping[token]]

        k = topk_p if res.shape[0] > topk_p else res.shape[0]
        pred = np.argpartition(a_d, -k)[-k:]
        pred = set(pred)
        combined = []
        for idx in pred:
            sents = sent_tokenize(doc[docid]["text"][idx].lower())
            for s in sents:
                combined.append(s)

        # rank sentences in combined sents
        tfidf = TfidfVectorizer(smooth_idf=False,
                                sublinear_tf=True,
                                tokenizer=word_tokenize)
        array = tfidf.fit_transform(combined).toarray()
        mapping = tfidf.vocabulary_

        a_d = np.zeros(len(combined))
        for token in question:
            for i in range(len(a_d)):
                if token in mapping:
                    a_d[i] += array[i, mapping[token]]
        # return top k results
        k = topk_s if len(combined) > topk_s else len(combined)
        pred = np.argpartition(a_d, -k)[-k:]
        pred = list(OrderedDict.fromkeys(pred))

        para = []
        for idx in pred:
            sent = word_tokenize(combined[idx])[:80]
            para += sent
    else:
        para = word_tokenize(doc[docid]["text"][sample["answer_paragraph"]].lower())[:240]
        # extract indices of answer from paragraph
        answer_idx = None
        for i, j in enumerate(para):
            if j == answer[0]:
                k = 1
                while k < len(answer) and i+k<len(para):
                    if para[i+k] != answer[k]:
                        break
                    k += 1
                else:
                    answer_idx = (i, i+k)
                    break
        # ignore samples that no answer can be found
        if answer_idx is None:
            continue
    
    while len(para) < 240:
        para.append("<PAD>")
    content_char = generate_char(para)
    content = unknown_detection(para)
        
    padded_question = word_tokenize(sample["question"].lower())[:30]
    while len(padded_question) < 30:
        padded_question.append("<PAD>")
    question_char = generate_char(padded_question)
    padded_question = unknown_detection(padded_question)

    
    new_sample["question"] = padded_question
    new_sample["q_char"] = question_char
    new_sample["content"] = content
    new_sample["c_char"] = content_char
    new_sample["answer"] = answer
    
    assert len(padded_question) == 30
    assert len(question_char) == 480
    assert len(content) == 240
    assert len(content_char) == 3840
    assert len(answer) <= 7
    
    padded.append(new_sample)

  0%|          | 15/3097 [00:00<00:21, 141.59it/s]

matrices loaded


100%|██████████| 3097/3097 [00:20<00:00, 153.58it/s]


In [18]:
def generate_input_data(padded):

    c, c_chars, q, q_chars, answer = [], [], [], [], []

    for sample in tqdm(padded):
        question = sample["question"]
        content = sample["content"]
        q_char = sample["q_char"]
        c_char = sample["c_char"]
        a = sample["answer"]

        q_mapped = [word_dict[t] for t in question]
        c_mapped = [word_dict[t] for t in content]
        q_char_mapped = [char_dict[ch] for ch in q_char]
        c_char_mapped = [char_dict[ch] for ch in c_char]
        
        c.append(c_mapped)
        q.append(q_mapped)
        c_chars.append(c_char_mapped)
        q_chars.append(q_char_mapped)
        answer.append(a)
        
    return np.array(c), np.array(c_chars), np.array(q), np.array(q_chars), answer


In [101]:
c, c_char, q, q_char, answer = generate_input_data(padded)

100%|██████████| 3097/3097 [00:00<00:00, 3908.44it/s]


In [8]:
curr = padded[2550]
print(curr["answer"])

['$', '40', 'billion']


In [103]:
def f_score(pred_s, pred_e, a, context):
    # computes average f_measure for a batch
    if pred_e < pred_s:
        return 0
    TP, FN, FP = 0, 0, 0
    a = unknown_detection(a)
    guess = context[0][pred_s:pred_e+1]
    true = [word_dict[x] for x in a]
    for token in guess:
        if token in true:
            TP += 1
        else:
            FP += 1
    for token in true:
        if token not in guess:
            FN += 1
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    f = 2*precision*recall/(precision+recall+1e-8)
    return f

In [8]:
def prob_dp(set1,set2):
    assert len(set1) == len(set2)
    max1 = 0
    maxi1 = 0
    maxpair = None
    maxp = 0
    for i in range(len(set1)):
        if set1[i]>max1:
            max1 = set1[i]
            maxi1 = i
        if max1 * set2[i] > maxp:
            maxp = max1 * set2[i]
            maxpair = [maxi1,i]
    assert maxpair[0] <= maxpair[1]
    return maxpair,maxp

In [108]:
#tf.reset_default_graph()
shadow_var = True

with tf.device("/cpu:0"):
    config = tf.ConfigProto(allow_soft_placement = True)
    
    with tf.Session(config=config) as sess:
        ckpt = tf.train.get_checkpoint_state(os.path.dirname('./model/checkpoint'))
        #saver = tf.train.import_meta_graph("naive-6268.meta", clear_devices=True)
        if shadow_var:
            var_to_restore = ema.variables_to_restore()
            saver = tf.train.Saver(var_to_restore)
        else:
            saver = tf.train.Saver()
        saver.restore(sess, ckpt.model_checkpoint_path)
        #sess.run(tf.global_variables_initializer())
        c_ph,c_char_ph, q_ph, q_char_ph, dp = tf.get_collection("infer_input")
        s_idx, e_idx = tf.get_collection("predictions")
        f_dp = []
        f_max = []
        
        for i in tqdm(range(len(c))):
            c_in = c[i:i+1,:]
            c_char_in = c_char[i:i+1, :]
            q_in = q[i:i+1, :]
            q_char_in = q_char[i:i+1, :]
            a_in = answer[i]
            
            feed_dict={"inputs/c:0": c_in,
                       "inputs/c_char:0": c_char_in,
                       "inputs/q:0": q_in,
                       "inputs/q_char:0": q_char_in,
                       "inputs/drop_prob:0": 0}
            
            pred_s, pred_e = sess.run([s_idx, e_idx], feed_dict=feed_dict)
            ls = pred_s.tolist()[0]
            le = pred_e.tolist()[0]
            pair, _ = prob_dp(ls, le)
            f1 = f_score(*pair, a_in, c_in)
            f2 = f_score(np.argmax(ls), np.argmax(le), a_in, c_in)
            f_dp.append(f1)
            f_max.append(f2)

INFO:tensorflow:Restoring parameters from ./model/naive-39000


100%|██████████| 3097/3097 [04:35<00:00, 11.23it/s]


In [109]:
print(sum(f_dp)/len(f_dp))
print(sum(f_max)/len(f_max))

0.5118520292505654
0.5016366903479255


In [None]:
import numpy as np
x = np.zeros((3,3))

In [69]:
prob_dp([1,1,6,1,1],[7,2,1,4,5])

([2, 4], 30)

In [54]:
tf.reset_default_graph()

sess = tf

'1.8.0'

In [9]:
# todo: depthwise separable convolutions
# todo: position encoding
# todo: multihead attention(maybe)
# todo: regularization(dropout)

tf.reset_default_graph()

def embedding_encoder_block(scope, inputs):
    with tf.variable_scope(scope,reuse=tf.AUTO_REUSE):
        #convolution block
        residual1 = tf.layers.separable_conv1d(inputs, 128, 1,activation=tf.nn.relu)
        
        norm1 = tf.contrib.layers.layer_norm(residual1)
        norm1 = tf.nn.dropout(norm1, 1-dp)
        conv1 = tf.layers.separable_conv1d(norm1, 128, 7, padding="same",activation=tf.nn.relu)
        conv1 = tf.nn.dropout(conv1, 1-dp)
        residual2 = tf.add(residual1, conv1)
        
        norm2 = tf.contrib.layers.layer_norm(residual2)
        norm2 = tf.nn.dropout(norm2, 1-dp)
        conv2 = tf.layers.separable_conv1d(norm2, 128, 7, padding="same",activation=tf.nn.relu)
        conv2 = tf.nn.dropout(conv2, 1-dp)
        residual3 = tf.add(residual2, conv2)
        
        norm3 = tf.contrib.layers.layer_norm(residual3)
        norm3 = tf.nn.dropout(norm3, 1-dp)
        conv3 = tf.layers.separable_conv1d(norm3, 128, 7, padding="same",activation=tf.nn.relu)
        conv3 = tf.nn.dropout(conv3, 1-dp)
        residual4 = tf.add(residual3, conv3)
        
        norm4 = tf.contrib.layers.layer_norm(residual4)
        norm4 = tf.nn.dropout(norm4, 1-dp)
        conv4 = tf.layers.separable_conv1d(norm3, 128, 7, padding="same",activation=tf.nn.relu)
        conv4 = tf.nn.dropout(conv4, 1-dp)
        residual5 = tf.add(residual4, conv4)
        
        # self-attention block
        norm4 = tf.contrib.layers.layer_norm(residual5)
        attention = tf.matmul(norm4, norm4, transpose_b=True)
        dk = tf.cast(tf.shape(norm4)[-1], dtype=tf.float32)
        scaled = tf.divide(attention, tf.sqrt(dk))
        attention = tf.nn.softmax(scaled, axis=-1)
        attention_out = tf.matmul(attention, norm4)
        residual6 = tf.add(residual5, attention_out)
        
        # feedforwoad layer
        norm5 = tf.contrib.layers.layer_norm(residual5)
        norm5 = tf.nn.dropout(norm5, 1-dp)
        ffn1 = tf.layers.separable_conv1d(norm5, 128, 1, activation=tf.nn.relu)
        ffn1 = tf.nn.dropout(ffn1, 1-dp)
        ffn2 = tf.layers.separable_conv1d(ffn1, 128, 1)
        ffn2 = tf.nn.dropout(ffn2, 1-dp)
        residual7 = tf.add(residual6, ffn2)
    return residual7

def model_encoder_block(scope, inputs):
    with tf.variable_scope(scope,reuse=tf.AUTO_REUSE):
        outputs = inputs
        for i in range(7):
            with tf.variable_scope("conv_block{}".format(i),reuse=tf.AUTO_REUSE):
                norm0 = tf.contrib.layers.layer_norm(outputs)
                norm0 = tf.nn.dropout(norm0, 1-dp)
                conv0 = tf.layers.separable_conv1d(norm0, 128, 5, padding="same", activation=tf.nn.relu)
                conv0 = tf.nn.dropout(conv0, 1-dp)
                residual0 = tf.add(outputs, conv0)
                
                norm1 = tf.contrib.layers.layer_norm(residual0)
                norm1 = tf.nn.dropout(norm1, 1-dp)
                conv1 = tf.layers.separable_conv1d(norm1, 128, 5, padding="same", activation=tf.nn.relu)
                conv1 = tf.nn.dropout(conv1, 1-dp)
                residual1 = tf.add(residual0, conv1)
            
            with tf.variable_scope("self_attention{}".format(i),reuse=tf.AUTO_REUSE):
                norm2 = tf.contrib.layers.layer_norm(residual1)
                attention = tf.matmul(norm2, norm2, transpose_b=True)
                dk = tf.cast(tf.shape(norm2)[-1], dtype=tf.float32)
                scaled = tf.divide(attention, tf.sqrt(dk))
                attention = tf.nn.softmax(scaled, axis=-1)
                attention_out = tf.matmul(attention, norm2)
                residual2 = tf.add(residual1, attention_out)
            
            with tf.variable_scope("feedforward{}".format(i),reuse=tf.AUTO_REUSE):
                norm3 = tf.contrib.layers.layer_norm(residual2)
                norm3 = tf.nn.dropout(norm3, 1-dp)
                ffn1 = tf.layers.separable_conv1d(norm3, 128, 1, activation=tf.nn.relu)
                ffn1 = tf.nn.dropout(ffn1, 1-dp)
                ffn2 = tf.layers.separable_conv1d(ffn1, 128, 1)
                ffn2 = tf.nn.dropout(ffn2, 1-dp)
                outputs = tf.add(residual2, ffn2)
    return outputs

def highway(scope, inputs):
    # two layer highway network
    size = inputs.shape.as_list()[-1]
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        T1 = tf.layers.separable_conv1d(inputs, size, 1, activation=tf.nn.sigmoid, bias_initializer=tf.constant_initializer(-1))
        H1 = tf.layers.separable_conv1d(inputs, size, 1)
        H1 = tf.nn.dropout(H1, 1-dp)
        highway1 = T1 * H1 + inputs * (1.0 - T1)
        
        T2 = tf.layers.separable_conv1d(highway1, size, 1, activation=tf.nn.sigmoid, bias_initializer=tf.constant_initializer(-1))
        H2 = tf.layers.separable_conv1d(highway1, size, 1)
        H2 = tf.nn.dropout(H2, 1-dp)
        highway2 = T2 * H2 + highway1 * (1.0 - T2)
    return highway2

#tf.reset_default_graph()

with tf.name_scope("inputs"):
    q_input = tf.placeholder(tf.int32, [None, 30], name="q")
    q_char_input = tf.placeholder(tf.int32, [None, 480], name="q_char")
    c_input = tf.placeholder(tf.int32, [None, 240], name="c")
    c_char_input = tf.placeholder(tf.int32, [None, 3840], name="c_char")
    start_mask = tf.placeholder(tf.int32, [None], name="start_mask")
    end_mask = tf.placeholder(tf.int32, [None], name="end_mask")
    batch_size = tf.placeholder(tf.int32, (), name="batch_size")
    dp = tf.placeholder(tf.float32, (), name="drop_prob")

tf.add_to_collection("infer_input", q_input)
tf.add_to_collection("infer_input", q_char_input)
tf.add_to_collection("infer_input", c_input)
tf.add_to_collection("infer_input", c_char_input)
tf.add_to_collection("infer_input", dp)

with tf.variable_scope("Input_Embedding_Layer"):
    # input embedding layer
    with tf.variable_scope("W_Embedding"):
        pretrained_embedding = tf.get_variable("w_embedding",
                                               shape=[72497, 50],
                                               initializer=tf.constant_initializer(embedding[:-2,:]),
                                               trainable=False)
        unknown_embedding = tf.get_variable("unknown",
                                            shape=[1, 50],
                                            initializer=tf.random_uniform_initializer(-0.5,0.5),
                                            trainable=True)
        padding_embedding = tf.get_variable("padding",
                                            shape=[1, 50],
                                            initializer=tf.zeros_initializer(),
                                            trainable=False)
        word_embedding = tf.concat([pretrained_embedding, unknown_embedding, padding_embedding], 0)
        q_embed = tf.nn.embedding_lookup(word_embedding, q_input)
        c_embed = tf.nn.embedding_lookup(word_embedding, c_input)

    with tf.variable_scope("C_Embedding"):
        char_embedding = tf.get_variable("c_embedding",
                                         shape=[215, 200],
                                         initializer=tf.random_uniform_initializer(-0.5,0.5),
                                         trainable=True)
        padding = tf.get_variable("padding",
                                  shape=[1, 200],
                                  initializer=tf.zeros_initializer(),
                                  trainable=False)
        char_combined = tf.concat([char_embedding, padding], 0, name="char_embedding")
        q_char_embed = tf.nn.embedding_lookup(char_combined, q_char_input)
        c_char_embed = tf.nn.embedding_lookup(char_combined, c_char_input)
        squeeze_to_word_q = tf.layers.max_pooling1d(q_char_embed, 16, 16)
        squeeze_to_word_c = tf.layers.max_pooling1d(c_char_embed, 16, 16)
        
    with tf.variable_scope("embedding_output"):
        q_embed_out = tf.concat([q_embed, squeeze_to_word_q], 2)
        q_embed_out = tf.nn.dropout(q_embed_out, 1-dp)
        c_embed_out = tf.concat([c_embed, squeeze_to_word_c], 2)
        c_embed_out = tf.nn.dropout(c_embed_out, 1-dp*0.5)
        q_embed_out = highway("highway", q_embed_out)
        c_embed_out = highway("highway", c_embed_out)

with tf.variable_scope("Embedding_Encoder_Layer"):
    # embedding encoder layer
    q_encoded = embedding_encoder_block("encoder_block", q_embed_out)
    c_encoded = embedding_encoder_block("encoder_block", c_embed_out)
    print(q_encoded.shape, c_encoded.shape)
    
with tf.variable_scope("Context_Query_Attention_Layer"):
    # context_query attention layer
    # first compute similarity matrix between context and query
    # S_tj = w * [C_t; Q_j; C_t*Q_j]
    c_expand = tf.expand_dims(c_encoded, 2)
    c_expand = tf.tile(c_expand, [1,1,30,1])
    q_expand = tf.expand_dims(q_encoded, 1)
    q_expand = tf.tile(q_expand, [1,240,1,1])
    qc_mul = tf.multiply(c_expand, q_expand)
    concat = tf.concat([c_expand,q_expand,qc_mul], 3)
    w = tf.get_variable("s_w", [384,1])
    
    # similarity matrix S (logits)
    S = tf.einsum("abcde,ef->abcdf", tf.expand_dims(concat,3),w)
    S = tf.squeeze(S,[-2,-1])
    # S_: softmax over rows
    S_ = tf.nn.softmax(S)
    # S__T: transpose of softmax over coloum
    S__T = tf.transpose(tf.nn.softmax(S, axis=1),[0,2,1])
    # context_query attention
    A = tf.matmul(S_, q_encoded)
    # query_context attention
    B = tf.matmul(tf.matmul(S_, S__T), c_encoded)
    
    # layer output
    G = tf.concat([c_encoded, A, tf.multiply(c_encoded,A), tf.multiply(c_encoded,B)],2)
    print(G.shape)

with tf.variable_scope("Model_Encoder_Layer"):
    # model encoder layer
    G_conv = tf.layers.separable_conv1d(G, 128, 1, padding="same", activation=tf.nn.relu)
    model_encoder1 = model_encoder_block("model_encoder", G_conv)
    model_encoder2 = model_encoder_block("model_encoder", model_encoder1)
    model_encoder3 = model_encoder_block("model_encoder", model_encoder2)
    print(model_encoder1.shape,model_encoder2.shape,model_encoder3.shape)

global_step = tf.Variable(0,dtype=tf.int32,trainable=False,name='global_step')

with tf.variable_scope("Output_Layer"):
    # output layer
    p1_input = tf.concat([model_encoder1, model_encoder2],2)
    p2_input = tf.concat([model_encoder2, model_encoder3],2)
    p1_prob = tf.nn.softmax(tf.squeeze(tf.layers.conv1d(p1_input, 1, 1),-1))
    p2_prob = tf.nn.softmax(tf.squeeze(tf.layers.conv1d(p2_input, 1, 1),-1))
    pred_s = tf.argmax(p1_prob, axis=1)
    pred_e = tf.argmax(p2_prob, axis=1)
    s_pairs = tf.concat([tf.expand_dims(tf.range(batch_size),1), tf.expand_dims(start_mask,1)],1)
    e_pairs = tf.concat([tf.expand_dims(tf.range(batch_size),1), tf.expand_dims(end_mask,1)],1)
    yhat_p1 = tf.add(tf.gather_nd(p1_prob, s_pairs), 1e-15)
    yhat_p2 = tf.add(tf.gather_nd(p2_prob, e_pairs), 1e-15)

tf.add_to_collection("predictions", p1_prob)
tf.add_to_collection("predictions", p2_prob)
    
with tf.variable_scope("Optimizer"):
    # add l2 weight decay to all variables
    trainables = tf.trainable_variables()
    loss_l2 = tf.add_n([ tf.nn.l2_loss(v) for v in trainables if 'bias' not in v.name ]) * 3e-7
    loss = -tf.reduce_mean(tf.log(yhat_p1) + tf.log(yhat_p2)) + loss_l2
    
    # perform cold warm up and gradient clipping
    lr = tf.minimum(0.001, 0.001 / tf.log(999.) * tf.log(tf.cast(global_step, tf.float32) + 1))
    optimizer = tf.train.AdamOptimizer(lr, beta1=0.8,epsilon=1e-7)
    gradients, variables = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
    opt_op = optimizer.apply_gradients(zip(gradients, variables), global_step=global_step)
    
    # apply exponential moving average
    ema = tf.train.ExponentialMovingAverage(decay=0.9999)
    with tf.control_dependencies([opt_op]):
        train_step = ema.apply(trainables)

tf.add_to_collection("train_step", train_step)
        
tf.summary.scalar("loss", loss)
f_measure = tf.get_variable("f_measure", (), trainable=False)
tf.summary.scalar("f_measure", f_measure)
print(yhat_p1.shape, yhat_p2.shape)

(?, 30, 128) (?, 240, 128)
(?, 240, 512)
(?, 240, 128) (?, 240, 128) (?, 240, 128)
(?,) (?,)
