In [1]:
import numpy as np
import tensorflow as tf
import json
import os
import warnings
import random
import pickle
import math
import spacy
warnings.simplefilter(action='ignore', category=FutureWarning)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
%cd "E:\NLP\QA\output"

E:\NLP\QA\output


In [2]:
def _read_json(path):
    """Parse the JSON file at `path` and return the decoded object."""
    with open(path) as handle:
        return json.load(handle)


# Corpus, dev questions and the word / character vocabularies.
doc = _read_json("documents.json")
dev = _read_json("devel.json")
word_dict = _read_json("word_dict2.json")
char_dict = _read_json("char_dict2.json")

In [3]:
# Embedding matrix stored next to the vocabularies, cast to 32-bit floats.
embedding = np.load("embedding2.npy").astype(np.float32)

In [4]:
# spaCy pipeline plus the token filters applied to question terms.
nlp = spacy.load("en")

stopword = set(stopwords.words('english'))
punc = {'"', '\'', "?", ".", ",", "/", "<", ">", ":", ";"}

In [6]:
def unknown_detection(token_list):
    """Return a copy of `token_list` with every token missing from `word_dict` replaced by "<UNK>"."""
    return [tok if tok in word_dict else "<UNK>" for tok in token_list]

def generate_char(token_list):
    """Expand tokens into a flat list holding exactly 16 character symbols per token.

    Each token is cut to its first 16 characters and right-padded with "<pad>";
    the "<PAD>" token becomes 16 "<pad>" symbols. Any symbol that is not a key
    of `char_dict` is emitted as "<unk>".
    """
    flat = []
    for tok in token_list:
        symbols = [] if tok == "<PAD>" else list(tok[:16])
        # Multiplying by a non-positive count adds nothing, so full-width tokens stay as is.
        symbols.extend(["<pad>"] * (16 - len(symbols)))
        flat.extend(sym if sym in char_dict else "<unk>" for sym in symbols)
    assert len(flat) == len(token_list) * 16
    return flat

# Per-document paragraph tf-idf matrices, cached on disk after the first build.
TFIDF_CACHE = "tfidfs.pickle"

if not os.path.exists(TFIDF_CACHE):
    vectorizer_options = dict(tokenizer=word_tokenize,
                              stop_words='english',
                              max_df=0.5,
                              smooth_idf=False,
                              sublinear_tf=True)
    tfidfs = {}
    for document in doc:
        # One vectorizer per document: its paragraphs form the "corpus".
        vectorizer = TfidfVectorizer(**vectorizer_options)
        lowered = [paragraph.lower() for paragraph in document["text"]]
        weights = vectorizer.fit_transform(lowered).toarray()
        tfidfs[document["docid"]] = [weights, vectorizer.vocabulary_]
    with open(TFIDF_CACHE, "wb") as f:
        pickle.dump(tfidfs, f)
    tqdm.write("matrices building complete")
else:
    # pickle.load executes arbitrary code: only load a cache this notebook wrote itself.
    with open(TFIDF_CACHE, "rb") as f:
        tfidfs = pickle.load(f)
    tqdm.write("matrices loaded")

topk_p = 4  # paragraphs kept by the first (paragraph-level) tf-idf ranking
topk_s = 6  # sentences kept by the second (sentence-level) tf-idf ranking

MAX_CONTEXT_TOKENS = 240   # the context is truncated to this many tokens
MAX_QUESTION_TOKENS = 30   # questions are truncated / padded to exactly this length
MAX_SENTENCE_TOKENS = 80   # per-sentence cap used by the retrieval path
CHARS_PER_TOKEN = 16       # number of symbols generate_char() emits per token

# True  -> the context is the paragraph named by sample["answer_paragraph"].
# False -> the context is assembled by retrieve_context() (two-stage tf-idf ranking).
USE_GOLD_PARAGRAPH = True


def _question_term_scores(weights, vocab, terms):
    """Sum, for every row of `weights`, the tf-idf weight of each term present in `vocab`."""
    scores = np.zeros(weights.shape[0])
    for term in terms:
        if term in vocab:
            scores += weights[:, vocab[term]]
    return scores


def _top_k(scores, k):
    """Indices (in no particular order) of the `k` largest scores, or of all scores if fewer than `k`."""
    k = min(k, len(scores))
    return np.argpartition(scores, -k)[-k:]


def retrieve_context(question_terms, docid):
    """Build a context of at most MAX_CONTEXT_TOKENS tokens for a question via tf-idf retrieval.

    Stage 1 ranks the paragraphs of document `docid` with the cached `tfidfs`
    matrix and keeps the best `topk_p`. Stage 2 splits those paragraphs into
    sentences with `nlp`, fits a fresh TfidfVectorizer on them, keeps the best
    `topk_s` sentences, and concatenates them best-first; a sentence that would
    overflow the token budget is skipped.

    Parameters
    ----------
    question_terms : list of str
        Lower-cased question tokens (the caller removes stopwords and punctuation).
    docid
        Key into `tfidfs` and index into `doc`.

    Returns
    -------
    list of str
        Lower-cased context tokens.
    """
    para_weights, para_vocab = tfidfs[docid]
    para_scores = _question_term_scores(para_weights, para_vocab, question_terms)
    best_paragraphs = set(_top_k(para_scores, topk_p))

    sentences = []
    for p_idx in best_paragraphs:
        for sent in nlp(doc[docid]["text"][p_idx]).sents:
            sentences.append(sent.text.lower())

    # Re-rank at sentence level with a vectorizer fitted on the candidate sentences only.
    sent_tfidf = TfidfVectorizer(smooth_idf=False,
                                 sublinear_tf=True,
                                 tokenizer=word_tokenize)
    sent_weights = sent_tfidf.fit_transform(sentences).toarray()
    sent_scores = _question_term_scores(sent_weights, sent_tfidf.vocabulary_, question_terms)

    best = _top_k(sent_scores, topk_s)
    ranked = best[np.argsort(sent_scores[best])].tolist()  # ascending, so pop() yields the best first

    para = []
    while len(para) < MAX_CONTEXT_TOKENS and ranked:
        tokens = word_tokenize(sentences[ranked.pop()])[:MAX_SENTENCE_TOKENS]
        if len(para) + len(tokens) <= MAX_CONTEXT_TOKENS:
            para += tokens
    return para


# Turn every dev sample into fixed-length question tokens/chars plus (unpadded) context tokens/chars.
padded_dev = []
for sample in tqdm(dev):
    docid = sample["docid"]
    answer = word_tokenize(sample["text"])

    if USE_GOLD_PARAGRAPH:
        gold_paragraph = doc[docid]["text"][sample["answer_paragraph"]]
        para = word_tokenize(gold_paragraph.lower())[:MAX_CONTEXT_TOKENS]
    else:
        query = word_tokenize(sample["question"].lower().strip())
        query = [t for t in query if t not in stopword and t not in punc]
        para = retrieve_context(query, docid)

    # Characters are generated from the raw tokens, before unknown words are masked.
    content_char = generate_char(para)
    content = unknown_detection(para)

    padded_question = word_tokenize(sample["question"].lower())[:MAX_QUESTION_TOKENS]
    padded_question += ["<PAD>"] * (MAX_QUESTION_TOKENS - len(padded_question))
    question_char = generate_char(padded_question)
    padded_question = unknown_detection(padded_question)

    assert len(padded_question) == MAX_QUESTION_TOKENS
    assert len(question_char) == MAX_QUESTION_TOKENS * CHARS_PER_TOKEN
    assert len(content) <= MAX_CONTEXT_TOKENS
    assert len(content_char) <= MAX_CONTEXT_TOKENS * CHARS_PER_TOKEN
    assert len(content_char) == len(content) * CHARS_PER_TOKEN

    padded_dev.append({
        "question": padded_question,
        "q_char": question_char,
        "content": content,
        "c_char": content_char,
        "answer": answer,
        "answer_idx": 0,  # placeholder; the dev evaluation scores against the answer tokens instead
    })

matrices loaded


100%|█████████████████████████████████████████████████████████████████████████████| 3097/3097 [00:07<00:00, 433.31it/s]


In [7]:
def generate_input_data(padded):
    """Map padded samples to id arrays.

    Every sample is a dict with the keys "question", "content", "q_char",
    "c_char", "answer" and "answer_idx". Word tokens are looked up in
    `word_dict`, character symbols in `char_dict`. Contexts are post-padded to
    240 word ids and 3840 character ids; questions are used as given.

    Returns (contexts, context_chars, questions, question_chars, answer_indices)
    as numpy arrays, followed by the answers as a plain list.
    """
    pad = tf.keras.preprocessing.sequence.pad_sequences
    contexts, context_chars = [], []
    questions, question_chars = [], []
    answer_indices, answers = [], []

    for sample in tqdm(padded):
        question_ids = [word_dict[tok] for tok in sample["question"]]
        context_ids = [word_dict[tok] for tok in sample["content"]]
        question_char_ids = [char_dict[sym] for sym in sample["q_char"]]
        context_char_ids = [char_dict[sym] for sym in sample["c_char"]]

        # pad_sequences works on a batch, so wrap the single sequence and unwrap the result.
        context_ids = pad([context_ids], maxlen=240, padding="post",
                          value=word_dict["<PAD>"])[0]
        context_char_ids = pad([context_char_ids], maxlen=3840, padding="post",
                               value=char_dict["<pad>"])[0]

        contexts.append(context_ids)
        questions.append(question_ids)
        context_chars.append(context_char_ids)
        question_chars.append(question_char_ids)
        answers.append(sample["answer"])
        answer_indices.append(sample["answer_idx"])

    return (np.array(contexts), np.array(context_chars), np.array(questions),
            np.array(question_chars), np.array(answer_indices), answers)

In [8]:
# Convert the padded dev samples to id arrays; the fifth return value (answer indices) is discarded.
c, c_char, q, q_char, _, answer = generate_input_data(padded_dev)

100%|████████████████████████████████████████████████████████████████████████████| 3097/3097 [00:01<00:00, 1900.10it/s]


In [8]:
# Sanity check: shapes of the context / question arrays and the number of answers.
print(c.shape, c_char.shape)
print(q.shape, q_char.shape)
print(len(answer))

(3097, 240) (3097, 3840)
(3097, 30) (3097, 480)
3097


In [9]:
train_indices = np.arange(len(c))
def dev_batch(batch=16):
    """Yield (c, c_char, q, q_char, answers) mini-batches covering the dev arrays once.

    `train_indices` is shuffled in place when iteration starts, then consumed
    in consecutive slices of `batch` indices; the final slice may be shorter.
    """
    np.random.shuffle(train_indices)
    total = len(c)
    n_batches = int(math.ceil(total / batch))
    for step in range(n_batches):
        lo = (step * batch) % total
        picked = train_indices[lo:lo + batch]
        answers_b = [answer[j] for j in picked]
        yield c[picked], c_char[picked], q[picked], q_char[picked], answers_b

In [10]:
def f_dev(pred_s, pred_e, a, context):
    """Average token-level F score of predicted spans against gold answer tokens.

    Parameters
    ----------
    pred_s, pred_e : sequence of score vectors
        Per-sample scores over context positions; the argmax of each gives the
        predicted start / end position (end inclusive).
    a : sequence of token lists
        Gold answer tokens per sample; they are mapped to ids through
        `unknown_detection` and `word_dict` before comparison.
    context : sequence of id sequences
        Context word ids per sample.

    Returns
    -------
    float
        Sum of per-sample F scores divided by the batch size. A sample scores 0
        when the predicted end precedes the start, or when the predicted span
        and the gold answer share no token (which includes an empty gold answer
        or an empty predicted slice). An empty batch returns 0.0.
    """
    l = len(pred_s)
    if l == 0:
        return 0.0
    f_sum = 0
    for i in range(l):
        s_i = np.argmax(pred_s[i])
        e_i = np.argmax(pred_e[i])
        if e_i < s_i:
            continue
        guess = context[i][s_i:e_i+1]
        true = [word_dict[t] for t in unknown_detection(a[i])]
        TP = sum(1 for token in guess if token in true)
        if TP == 0:
            # No overlap means F = 0; bailing out here also avoids dividing by
            # zero when either the guess or the gold answer is empty.
            continue
        FP = len(guess) - TP
        FN = sum(1 for token in true if token not in guess)
        precision = TP/(TP+FP)
        recall = TP/(TP+FN)
        f_sum += 2*precision*recall/(precision+recall+1e-8)
    return f_sum/l

def f_train(pred_s, pred_e, true_s, true_e, context):
    """Average token-level F score of predicted spans against gold spans.

    Parameters
    ----------
    pred_s, pred_e : sequence of score vectors
        Per-sample scores over context positions; the argmax of each gives the
        predicted start / end position (end inclusive).
    true_s, true_e : sequence of int
        Gold start / end positions per sample (end inclusive).
    context : sequence of id sequences
        Context word ids per sample; both spans are sliced out of it.

    Returns
    -------
    float
        Sum of per-sample F scores divided by the batch size. A sample scores 0
        when the predicted end precedes the start, or when the predicted and
        gold spans share no token (which includes an empty gold span, i.e.
        true_e < true_s). An empty batch returns 0.0.
    """
    l = len(pred_s)
    if l == 0:
        return 0.0
    f_sum = 0
    for i in range(l):
        s_i = np.argmax(pred_s[i])
        e_i = np.argmax(pred_e[i])
        if e_i < s_i:
            continue
        guess = context[i][s_i:e_i+1]
        true = context[i][true_s[i]:true_e[i]+1]
        TP = sum(1 for token in guess if token in true)
        if TP == 0:
            # No overlap means F = 0; bailing out here also avoids dividing by
            # zero when either span is empty.
            continue
        FP = len(guess) - TP
        FN = sum(1 for token in true if token not in guess)
        precision = TP/(TP+FP)
        recall = TP/(TP+FN)
        f_sum += 2*precision*recall/(precision+recall+1e-8)
    return f_sum/l

In [None]:
def prob_dp(set1, set2):
    """Find the pair (start, end) with start <= end that maximises set1[start] * set2[end].

    A single left-to-right pass tracks the largest start probability seen so
    far and combines it with each end probability. Ties keep the earliest pair.

    Parameters
    ----------
    set1, set2 : sequences of non-negative numbers of equal length
        Start and end probabilities per position.

    Returns
    -------
    (list, number)
        `[start, end]` and the corresponding product. When every product is 0
        the result is `[0, 0]` with product 0 (previously this crashed on
        `None`).

    Raises
    ------
    ValueError
        If the inputs are empty.
    """
    assert len(set1) == len(set2)
    if len(set1) == 0:
        raise ValueError("prob_dp needs at least one position")

    # Seed with position 0 so a valid pair exists even if nothing beats it.
    best_start = 0
    best_start_prob = set1[0]
    maxpair = [0, 0]
    maxp = set1[0] * set2[0]
    for end in range(1, len(set1)):
        if set1[end] > best_start_prob:
            best_start_prob = set1[end]
            best_start = end
        joint = best_start_prob * set2[end]
        if joint > maxp:
            maxp = joint
            maxpair = [best_start, end]
    assert maxpair[0] <= maxpair[1]
    return maxpair, maxp

In [15]:
#tf.reset_default_graph()
# Evaluation driver: restores a checkpoint from ./model (if one is found), runs the
# model batch by batch, prints each batch's F score and finally the mean over batches.
# NOTE(review): `ema`, `train_iter`, `next_batch`, `q_input`, `q_char_input`, `c_input`,
# `c_char_input` and `dp` are not defined in the visible part of this notebook; this cell
# presumably relies on graph-building code run earlier in the same kernel — confirm.
train = False  # False: score the dev batches from dev_batch(); True: score batches from `next_batch`

with tf.device("/gpu:0"):
    config = tf.ConfigProto(allow_soft_placement = True)
    
    with tf.Session(config=config) as sess:
        # Restore using the name -> variable mapping supplied by `ema`
        # (presumably an ExponentialMovingAverage object — TODO confirm).
        variables_to_restore = ema.variables_to_restore()
        ckpt = tf.train.get_checkpoint_state(os.path.dirname('./model/checkpoint'))
        saver = tf.train.Saver(variables_to_restore)
        gen = dev_batch()
        #saver = tf.train.Saver()
        # NOTE(review): when no checkpoint is found nothing is restored and the loop below
        # still runs; there is no variable initialisation or error in that case.
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        if train:
            sess.run(train_iter.initializer)

        # NOTE(review): the five "infer_input" tensors are unpacked here but never used;
        # the feed dicts below are keyed by q_input / c_input / ... / dp instead.
        c_ph,c_char_ph, q_ph, q_char_ph, dp_in = tf.get_collection("infer_input")
        s_idx, e_idx = tf.get_collection("predictions")
        f_list = []  # one average F score per evaluated batch
        
        i = 0
        while True:
            i += 1
            if train:
                # NOTE(review): this branch has no exit condition of its own; the loop
                # only ends here if sess.run raises.
                next_c, next_c_char, next_q, next_q_char, next_mask = sess.run(next_batch)
                # Column 0 is used as the gold start, column 1 minus one as the gold end
                # (presumably an exclusive end converted to inclusive — TODO confirm).
                next_smask = next_mask[:,0]
                next_emask = next_mask[:,1]-1

                feed_dict = {q_input: next_q,
                             q_char_input: next_q_char,
                             c_input: next_c,
                             c_char_input: next_c_char,
                             dp:0}
            else:
                # The string literal below is disabled single-sample feeding code; as a bare
                # string expression it has no effect at run time.
                """
                c_in = c[i:i+1,:]
                c_char_in = c_char[i:i+1, :]
                q_in = q[i:i+1, :]
                q_char_in = q_char[i:i+1, :]
                a_in = answer[i]
                feed_dict={"inputs/c:0": c_in,
                           "inputs/c_char:0": c_char_in,
                           "inputs/q:0": q_in,
                           "inputs/q_char:0": q_char_in,
                           "inputs/drop_prob:0": 0.}
                """
                # Pull the next dev batch; the generator running dry ends the evaluation.
                try:
                    c_d, c_char_d, q_d, q_char_d, ans_d = next(gen)
                except StopIteration:
                    break
                feed_dict={q_input:q_d,
                             q_char_input: q_char_d,
                             c_input: c_d,
                             c_char_input: c_char_d,
                             dp: 0}
            
            pred_s, pred_e = sess.run([s_idx, e_idx], feed_dict=feed_dict)
            if train:
                f = f_train(pred_s, pred_e, next_smask, next_emask, next_c)
            else:
                f = f_dev(pred_s, pred_e, ans_d, c_d)
            print(f)
            f_list.append(f)
            if i % 200 == 0:
                print(i)

print("Done!")
# Unweighted mean of the per-batch averages. dev_batch's final batch can be shorter than
# the others, so this is not exactly the per-sample mean.
print(sum(f_list)/len(f_list))

INFO:tensorflow:Restoring parameters from ./model\strong-21000
0.28124999851562504
0.2571022714102854
0.22083333179583337
0.29062499842890627
0.44999999721249995
0.1532738083149093
0.3124999984375
0.48055555300478403
0.3820312476345704
0.32193396061443125
0.2916666649479167
0.38541666459201385
0.2708333318402778
0.3283730139228238
0.33749999792916663
0.3374999981375
0.2537202366603334
0.5510416635042534
0.22894736714072023
0.149305554607446
0.3124999984375
0.36458333122395836
0.48124999729999995
0.45833333090277784
0.3762310578206389
0.3574999981117222
0.34166666483888886
0.32291666500868055
0.3729166646045139
0.2782738079210955
0.44791666420138887
0.522115381605991
0.3589962100070256
0.4166666644791666
0.33286561033679246
0.12711864318759428
0.4145833309934027
0.24999999854166666
0.48749999723472226
0.29013364592557456
0.3104166649170139
0.48749999751249995
0.32384259049991426
0.17708333223090278
0.30160018385488374
0.2708333318402778
0.38503787639330356
0.27499999845
0.45231481231709

In [None]:
# Mean of the per-batch F scores accumulated in f_list by the evaluation loop.
print(sum(f_list)/len(f_list))