In [None]:
import pickle
import torch

In [None]:
# Load the pickled WikiQA splits. Each file is opened inside a `with` block so
# its handle is closed as soon as the split has been read, instead of being
# left for the garbage collector.
# NOTE: pickle.load can execute arbitrary code while unpickling; only point
# this at dataset files you created yourself or otherwise trust.
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


train = _load_pickle('wiki_qa_train.pkl')
test = _load_pickle('wiki_qa_test.pkl')
val = _load_pickle('wiki_qa_validation.pkl')

In [None]:
# tokenize_data is NOT defined in this notebook: it has to be present in the
# kernel namespace already (per the original note it is defined in
# embeds_and_cosinet.ipynb). The three splits are tokenized in the order
# train, test, validation.
wiki_qa_train, wiki_qa_test, wiki_qa_validation = map(
    tokenize_data, (train, test, val)
)

In [None]:
# Load the pre-computed WikiQA embedding object from disk.
# NOTE(review): later cells index it as wiki_qa_embeddings[index], so it is
# presumably a lookup from word index to embedding vector — confirm its
# type, shape and dtype.
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
wiki_qa_embeddings = torch.load('wiki_qa_embedding.pt')

In [None]:
# data in wiki_qa_train is of the form 
# (question, answer, label) where for question and answer we have the w2i representation

def get_embedded_forms(data, embeddings=None):
    """Replace word indices with embedding vectors for every example.

    Args:
        data: iterable of dict-like examples. ``datum["question"]`` is a
            sequence of word indices and ``datum["answer"]`` is a sequence of
            candidate answers, each itself a sequence of word indices.
        embeddings: object indexable by word index. Defaults to the
            notebook-level ``wiki_qa_embeddings``.

    Returns:
        A new list of dicts. Each dict is a shallow copy of the input example
        whose ``"question"`` is a list of embedding vectors and whose
        ``"answer"`` is a list (one entry per candidate answer) of lists of
        embedding vectors. The input ``data`` is left untouched, so the cell
        can be re-run safely.
    """
    if embeddings is None:
        embeddings = wiki_qa_embeddings

    embedded = []
    for datum in data:
        embedded.append({
            **datum,
            "question": [embeddings[index] for index in datum["question"]],
            # Look up the indices of *each* answer; iterating over
            # datum["answer"] in the inner loop would embed the whole list of
            # answers once per answer instead.
            "answer": [
                [embeddings[index] for index in answer]
                for answer in datum["answer"]
            ],
        })
    return embedded

# Swap word indices for embedding vectors in each split
# (order: train, test, validation).
train_embedded, test_embedded, validation_embedded = (
    get_embedded_forms(split)
    for split in (wiki_qa_train, wiki_qa_test, wiki_qa_validation)
)

In [None]:
def max_num_of_pairs(data):
    """Return the largest number of candidate answers attached to one example.

    Args:
        data: iterable of dict-like examples; ``datum["answer"]`` must support
            ``len()``.

    Returns:
        The maximum of ``len(datum["answer"])`` over ``data``, or 0 when
        ``data`` is empty.
    """
    return max((len(datum["answer"]) for datum in data), default=0)

# Largest candidate-answer count found in any of the three splits.
max_pairs = max(
    max_num_of_pairs(split)
    for split in (train_embedded, test_embedded, validation_embedded)
)

In [None]:
def max_sent_length(data):
    """Return the length of the longest candidate answer in ``data``.

    Only the entries of ``datum["answer"]`` are measured; questions are not
    inspected. Returns 0 when there are no answers at all.
    """
    answer_lengths = [
        len(candidate)
        for example in data
        for candidate in example["answer"]
    ]
    return max(answer_lengths, default=0)

# Padding target, in tokens, shared by all splits: the longest answer plus one.
# extended_embeddings pads every question and answer up to this many vectors.
# NOTE(review): max_sent_length only measures datum["answer"], so a question
# longer than this value is neither padded nor truncated — confirm that
# questions can never exceed the longest answer.
# NOTE(review): the cosine feature is concatenated onto each word vector, which
# widens the vector rather than adding a token, so the "+ 1" below results in
# one extra padding position per sentence — verify this is what the model expects.
max_length = max(max_sent_length(train_embedded), max_sent_length(test_embedded), max_sent_length(validation_embedded)) + 1 # +1 for the cosine similarity that'll be added

In [None]:
def get_pair_cosine(question, answer):
    """Compute, for every word, its best cosine match in the other sentence.

    Args:
        question: sequence of 1-D word vectors (all of the same width).
        answer: sequence of 1-D word vectors of that same width.

    Returns:
        ``(question_r, answer_r)``: two 1-D tensors. ``question_r[i]`` is the
        highest cosine similarity between question word ``i`` and any answer
        word; ``answer_r[j]`` is the same for answer word ``j`` against the
        question. Scores are floored at 0, so a word whose best match is
        negative gets 0. If either sentence is empty, both results are all
        zeros.
    """
    if len(question) == 0 or len(answer) == 0:
        # Nothing to compare against: every word keeps the floor value.
        return torch.zeros(len(question)), torch.zeros(len(answer))

    # The scores are static features, so no autograd graph is needed.
    with torch.no_grad():
        question_matrix = torch.stack(list(question))  # (Q, D)
        answer_matrix = torch.stack(list(answer))      # (A, D)

        # Broadcast to a (Q, A) table of pairwise similarities. dim=2 is the
        # embedding axis here; the function's default dim=1 does not exist on
        # a single 1-D word vector.
        similarities = torch.cosine_similarity(
            question_matrix.unsqueeze(1), answer_matrix.unsqueeze(0), dim=2
        )

        # Best match per word, never below 0.
        question_r = similarities.max(dim=1).values.clamp(min=0)
        answer_r = similarities.max(dim=0).values.clamp(min=0)

    return question_r, answer_r

In [None]:
def extended_embeddings(question, answer, pad_to=None):
    """Append the cosine feature to every word vector and pad both sentences.

    Args:
        question: list of 1-D word vectors for the question.
        answer: list of 1-D word vectors for one candidate answer.
        pad_to: number of vectors each returned sentence should contain at
            minimum. Defaults to the notebook-level ``max_length``.

    Returns:
        ``(question, answer)`` as two NEW lists. Every word vector has its
        score from ``get_pair_cosine`` concatenated as one extra trailing
        element, and each list is right-padded with zero vectors of that
        widened size until it holds ``pad_to`` entries. Sentences already at
        least ``pad_to`` long are returned unpadded (and are not truncated).
        The input lists are not modified, so the same question can be paired
        with several answers.
    """
    if pad_to is None:
        pad_to = max_length

    question_r, answer_r = get_pair_cosine(question, answer)

    extended_question = [
        torch.cat((word, question_r[i].unsqueeze(0)), 0)
        for i, word in enumerate(question)
    ]
    extended_answer = [
        torch.cat((word, answer_r[i].unsqueeze(0)), 0)
        for i, word in enumerate(answer)
    ]

    # Padding vectors must be as wide as the extended word vectors. Take the
    # width from the data; fall back to 301 (the padding width this notebook
    # has always used) only when both sentences are empty.
    if extended_question:
        width = extended_question[0].shape[0]
    elif extended_answer:
        width = extended_answer[0].shape[0]
    else:
        width = 301

    for sentence in (extended_question, extended_answer):
        # range() of a non-positive count is empty, so long sentences are
        # left as they are.
        sentence.extend(torch.zeros(width) for _ in range(pad_to - len(sentence)))

    return extended_question, extended_answer

In [None]:
# Placeholder "sentence" made only of zero vectors: max_length entries, each of
# width 301.
# TODO: look for a cleaner padding scheme, both for sentence length and for the
# number of sentence pairs.
filler = [torch.zeros(301) for _ in range(max_length)]

In [None]:
def get_question_answer_pairs(data):
    """Build the extended (question, answer) pairs for every example.

    Args:
        data: iterable of dict-like examples whose ``"question"`` is a list of
            word vectors and whose ``"answer"`` is a list of candidate
            answers, each a list of word vectors.

    Returns:
        A list with one entry per example. Each entry is a list holding one
        ``(question, answer)`` tuple per candidate answer, as returned by
        ``extended_embeddings``.
    """
    pairs = []
    for datum in data:
        question = datum["question"]
        # Start a fresh list for every example; sharing one list across
        # examples would make each entry of `pairs` contain every pair.
        question_pairs = []
        for answer in datum["answer"]:
            # Always start from the example's own question (never a previously
            # extended one) and hand over shallow copies, in case
            # extended_embeddings modifies the lists it is given.
            extended_question, extended_answer = extended_embeddings(
                list(question), list(answer)
            )
            question_pairs.append((extended_question, extended_answer))
        pairs.append(question_pairs)

    return pairs

In [None]:
# Build the cosine-augmented embeddings once, up front: per the paper they are
# meant to stay static, so they are generated here rather than during training.
train_pairs, test_pairs, validation_pairs = map(
    get_question_answer_pairs,
    (train_embedded, test_embedded, validation_embedded),
)