In [1]:
import numpy as np
import tensorflow as tf
import json
import os
import warnings
import random
import pickle
import math
warnings.simplefilter(action='ignore', category=FutureWarning)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

%cd "E:\NLP\QA"

E:\NLP\QA


In [2]:
embedding = np.load("embedding2.npy").astype("float32")
with open("documents.json") as f:
    doc = json.load(f)
with open("training.json") as f:
    train = json.load(f)
with open("devel.json") as f:
    dev = json.load(f)
with open("word_dict2.json","r") as f:
    word_dict = json.load(f)
with open("char_dict2.json","r") as f:
    char_dict = json.load(f)

In [3]:
print(embedding.shape)
print(len(word_dict))
print(len(char_dict))
print(word_dict["<PAD>"])
print(char_dict["<pad>"])

(72497, 300)
72499
210
72498
209


In [4]:
topk = 2

stopword = set(stopwords.words('english'))
punc = set(['"','\'',"?",".",",","/","<",">",":",";"])

In [5]:
def unknown_detection(token_list):
    new_list = []
    for token in token_list:
        if token in word_dict:
            new_list.append(token)
        else:
            new_list.append("<UNK>")
    return new_list

def generate_char(token_list):
    new_list = []
    for token in token_list:
        if token == "<PAD>":
            char_list = ["<pad>"]*16
        else:
            char_list = [c for c in token[:16]]
        while len(char_list) < 16:
            char_list.append("<pad>")
        for char in char_list:
            if char in char_dict:
                new_list.append(char)
            else:
                new_list.append("<unk>")
    assert len(new_list) == len(token_list) * 16
    return new_list

if os.path.exists("tfidfs.pickle"):
    with open("tfidfs.pickle","rb") as f:
        tfidfs = pickle.load(f)
else:
    tfidfs = dict()
    for d in doc:
        tfidf = TfidfVectorizer(tokenizer=word_tokenize,
                                stop_words='english',
                                max_df=0.5,
                                smooth_idf=False,
                                sublinear_tf=True)
        paragraphs = [p.lower() for p in d["text"]]
        res = tfidf.fit_transform(paragraphs).toarray()
        mapping = tfidf.vocabulary_
        tfidfs[d["docid"]] = [res, mapping]
    with open("tfidfs.pickle","wb") as f:
        pickle.dump(tfidfs, f)

In [7]:
padded_train = []
for sample in tqdm(train):
    new_sample = dict()
    docid = sample["docid"]

    question = word_tokenize(sample["question"].lower())

    answer = word_tokenize(sample["text"].lower())[:7]
    answer_para = sample["answer_paragraph"]
    para = word_tokenize(doc[docid]["text"][answer_para].lower())[:240]
    
    padded_question = question[:30]
    while len(padded_question) < 30:
        padded_question.append("<PAD>")
    question_char = generate_char(padded_question)
    padded_question = unknown_detection(padded_question)
    
    # select topk noisy sample by tfidf
    rmed = []
    for token in question:
        if token not in stopword and token not in punc:
            rmed.append(token)
    
    res, mapping = tfidfs[docid]
    # set accumulator for each paragraph
    a_d = [0 for _ in range(res.shape[0])]
    for token in rmed:
        for i in range(len(a_d)):
            if token in mapping:
                a_d[i] += res[i, mapping[token]]
            else:
                pass
    k = topk if res.shape[0] > topk else res.shape[0]
    pred = np.argpartition(a_d, -k)[-k:]
    pred = set(pred)
    # give 3 noisy samples excluding the correct one
    if answer_para in pred:
        pred.remove(answer_para)
    else:
        pred.pop()
    pred = list(pred)
    
    new_sample["question"] = padded_question
    new_sample["q_char"] = question_char
    
    assert len(padded_question) == 30
    assert len(question_char) == 480
    
    for idx in pred:
        if idx == answer_para:
            para = word_tokenize(doc[docid]["text"][answer_para].lower())[:240]
            content_char = generate_char(para)
            content = unknown_detection(para)
            new_sample["content"] = content
            new_sample["c_char"] = content_char
            new_sample["label"] = 1
            padded_train.append(new_sample)
            padded_train.append(new_sample)
            padded_train.append(new_sample)
        else:
            para = word_tokenize(doc[docid]["text"][idx].lower())[:240]
            content_char = generate_char(para)
            content = unknown_detection(para)
            new_sample["content"] = content
            new_sample["c_char"] = content_char
            new_sample["label"] = 0
            padded_train.append(new_sample)
        assert len(content) <= 240
        assert len(content_char) <= 3840
        assert len(content_char) == len(content) * 16

100%|███████████████████████████████████████████████████████████████████████████| 43379/43379 [02:34<00:00, 280.12it/s]


In [8]:
def generate_training_data(padded):

    c, c_chars, q, q_chars, labels = [], [], [], [], []
    
    cnt = 0
    for i in tqdm(range(len(padded))):
        cnt += 1
        sample = padded[i]
        question = sample["question"]
        content = sample["content"]
        q_char = sample["q_char"]
        c_char = sample["c_char"]
        label = sample["label"]
        
        """
        # ignore answer only contains <UNK>
        idx = answer.index("<PAD>") if "<PAD>" in answer else 7
        if all(t == "<UNK>" for t in answer[:idx]):
            continue
        """
        
        q_mapped = [word_dict[t] for t in question]
        c_mapped = [word_dict[t] for t in content]
        q_char_mapped = [char_dict[ch] for ch in q_char]
        c_char_mapped = [char_dict[ch] for ch in c_char]
        
        c_mapped = tf.keras.preprocessing.sequence.pad_sequences([c_mapped], maxlen=240, padding="post",value=word_dict["<PAD>"])[0]
        c_char_mapped = tf.keras.preprocessing.sequence.pad_sequences([c_char_mapped], maxlen=3840, padding="post",value=char_dict["<pad>"])[0]
        
        c.append(c_mapped)
        q.append(q_mapped)
        c_chars.append(c_char_mapped)
        q_chars.append(q_char_mapped)
        labels.append(label)
        
    return np.array(c), np.array(c_chars), np.array(q), np.array(q_chars), np.array(labels)

In [9]:
c, c_char, q, q_char, label = generate_training_data(padded_train)

100%|██████████████████████████████████████████████████████████████████████████| 43379/43379 [00:23<00:00, 1818.75it/s]


In [10]:
def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

def _int64_feature2(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

filename = "train.tfrecords"
with tf.python_io.TFRecordWriter(filename) as writer:
    for i in range(len(c)):
        example = tf.train.Example(
              features=tf.train.Features(
                  feature={
                      'c': _int64_feature(c[i]),
                      'c_char': _int64_feature(c_char[i]),
                      'q': _int64_feature(q[i]),
                      'q_char': _int64_feature(q_char[i]),
                      'label': _int64_feature2(label[i])
                  }))
        writer.write(example.SerializeToString())

In [5]:
train_indices = np.arange(len(c))
def dev_batch(batch=16):
    while True:
        np.random.shuffle(train_indices)
        for i in range(int(math.ceil(len(c)/batch))):
            start_index = (i*batch)%len(c)
            idx = train_indices[start_index:start_index+batch]
            c_b = c[idx]
            c_char_b = c_char[idx]
            q_b = q[idx]
            q_char_b = q_char[idx]
            a_b = []
            for j in idx:
                a_b.append(answer[j])
            yield c_b, c_char_b, q_b, q_char_b, a_b

In [6]:
def f_train(pred_s, pred_e, true_s, true_e, context):
    # computes average f_measure for a batch
    f_sum = 0
    l = len(pred_s)
    for i in range(l):
        s_i = np.argmax(pred_s[i])
        e_i = np.argmax(pred_e[i])
        if e_i < s_i:
            continue
        TP, FN, FP = 0, 0, 0
        guess = context[i][s_i:e_i+1]
        true = context[i][true_s[i]:true_e[i]+1]
        for token in guess:
            if token in true:
                TP += 1
            else:
                FP += 1
        for token in true:
            if token not in guess:
                FN += 1
        precision = TP/(TP+FP)
        recall = TP/(TP+FN)
        f = 2*precision*recall/(precision+recall+1e-8)
        f_sum += f
    return f_sum/l

def f_dev(pred_s, pred_e, a, context):
    # computes average f_measure for a batch
    f_sum = 0
    l = len(pred_s)
    for i in range(l):
        s_i = np.argmax(pred_s[i])
        e_i = np.argmax(pred_e[i])
        if e_i < s_i:
            continue
        TP, FN, FP = 0, 0, 0
        guess = context[i][s_i:e_i+1]
        true = [word_dict[t] for t in unknown_detection(a[i])]
        for token in guess:
            if token in true:
                TP += 1
            else:
                FP += 1
        for token in true:
            if token not in guess:
                FN += 1
        precision = TP/(TP+FP)
        recall = TP/(TP+FN)
        f = 2*precision*recall/(precision+recall+1e-8)
        f_sum += f
    return f_sum/l

In [None]:
def parser(record):
    keys_to_features = {
        "c": tf.FixedLenSequenceFeature((), tf.int64, allow_missing=True),
        "c_char": tf.FixedLenSequenceFeature((), tf.int64, allow_missing=True),
        "q": tf.FixedLenSequenceFeature((), tf.int64, allow_missing=True),
        "q_char": tf.FixedLenSequenceFeature((), tf.int64, allow_missing=True),
        "lable":tf.FixedLenFeature(shape=[], dtype=tf.int64)
    }
    parsed = tf.parse_single_example(record, keys_to_features)
    c = tf.cast(parsed["c"], tf.int32)
    c_char = tf.cast(parsed["c_char"], tf.int32)
    q = tf.cast(parsed["q"], tf.int32)
    q_char = tf.cast(parsed["q_char"], tf.int32)
    label = tf.cast(parsed["label"], tf.int32)

    return c, c_char, q, q_char, label