# Retrieve Using PACRR

### This script uses our pre-trained PACRR model to re-rank the data that was pre-retrieved by our simple classifier, and returns the best 5 results for each question.

In [1]:
import dataset_loader as dl
import pacrr
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import random
import gc
import time
import json

  from ._conv import register_converters as _register_converters


In [2]:
# Hyper-parameters. Only `lq` and `ld` are read by the cells of this notebook;
# NOTE(review): lf, lg, k and lr are presumably the PACRR settings the saved
# network was trained with -- confirm against pacrr.py before changing them.
lq = 10  # fixed question length: used by dl.fix_length_single(q, lq) and as the similarity-matrix row count
ld = 20  # fixed answer length: passed to dl.get_fixed_w2v_answers and used as the similarity-matrix column count
lf = 32  # not used in this notebook
lg = 3  # not used in this notebook
k = 3  # not used in this notebook
lr = 0.01  # not used in this notebook

# raw dataset (JSON) and the tokenized dataset stored next to the notebook
raw_data_f = r"C:\Users\matan\Desktop\IR_Proj\data\anfL6.json"
token_data_f = "./tokenized_dataset.pickle"

# checkpoint of the trained network, restored through pacrr.load
netfile = r"D:\IRfiles\saves\250\e250.cpt"

# qfile: evaluation questions, one per line, starting with the question id
# outfile: where the re-ranked result is written as JSON
qfile = "finalEval.txt"
outfile = "res.json"

# file as output from our Lucene classifier (the candidates that get re-ranked)
from_answers_f = r"D:\IRfiles\irproj\data\res.json"


In [3]:
'''
Our train data is stored in a few ways:
all the raw_* arrays are just the dataset as is: no tokenization, and no answers/questions removed.
answers = an array of tokenized and w2v answers
qinds = an array of size len(answers). qinds[i] = index (in questions) of the question that answers[i] answers
questions = an array of tokenized and w2v questions.
'''

# Raw dataset; the fourth returned value is not needed, so it is dropped
# right away.
raw_ids, raw_questions, raw_answers, _ = dl.get_dataset(raw_data_f)
del _

# Tokenized dataset. NOTE(review): `ids` is later searched with str(...) keys,
# so it presumably holds the question ids as strings -- confirm in dataset_loader.
ids, questions, answers, _ = dl.get_dataset_tokens_loaded(raw_data_f, token_data_f)
# Rebinds `answers` and creates `qinds`; the helper functions below rely on
# qinds[i] being the index (in `questions`) of the question answers[i] belongs to.
answers, qinds = dl.get_fixed_w2v_answers(answers, ld)

print("done")

done


In [7]:
# Run an explicit garbage-collection pass.
# NOTE(review): presumably meant to reclaim temporaries left over from
# loading the dataset -- confirm it is still needed.
gc.collect()

'''
We store the data differently from our Lucene IR system: raw_answers is an array of size |raw_questions|,
holding for each question the array of its answers, while the Lucene system identifies an answer by
question_id and ans_ind. We therefore need to be able to convert from one representation to the other.

<b>THIS IS FOR TRAINING AND FOR FINDING TEXT OF *ALREADY* RETRIEVED DOCUMENTS. RETRIEVAL IS DONE VIA THE NEURAL NET ONLY! </b>
'''

def get_ans_ind(qind, aind):
    """Translate (question index, answer number) into an index in `answers`.

    qind -- index of the question in the `questions` array.
    aind -- index of the answer among that question's answers in the
            original data.

    Returns the position of the first entry of `qinds` equal to `qind`,
    plus `aind`; returns None when no entry of `qinds` equals `qind`.
    `aind` is not range-checked.
    """
    for position, owner in enumerate(qinds):
        if owner == qind:
            return position + aind
    return None

def get_ans_ind_by_id(qid, aind):
    """Translate (question id, answer number) into an index in `answers`.

    qid  -- id of the question, compared against the entries of `ids`.
    aind -- index of the answer among that question's answers in the
            original data.

    Returns the first position whose owning question has id `qid`, plus
    `aind`; returns None when no answer belongs to such a question.
    `aind` is not range-checked.
    """
    for position, owner in enumerate(qinds):
        if ids[owner] == qid:
            return position + aind
    return None

def ans_ind_to_q(ansind):
    """Translate an index in `answers` into (question index, answer number).

    ansind -- index of the answer in the `answers` array.

    Returns a pair: the index of the owning question in `questions`, and
    the offset of `ansind` from the start of the contiguous run of entries
    in `qinds` that share that question index.
    """
    owner = qinds[ansind]
    # Walk left for as long as the preceding answer has the same owner.
    run_start = ansind
    while run_start > 0 and qinds[run_start - 1] == owner:
        run_start -= 1
    return owner, ansind - run_start

def ans_ind_to_text(ansind):
    """Return the original (non-tokenized) text of an answer.

    ansind -- index of the answer in the `answers` array.

    The answer is mapped back to its question and answer number, the
    question's id is looked up in `raw_ids`, and the text is read from
    `raw_answers`. `raw_ids.index` raises ValueError if the id is not
    present in the raw dataset.
    """
    question_pos, answer_offset = ans_ind_to_q(ansind)
    question_id = ids[question_pos]
    raw_pos = raw_ids.index(question_id)
    return raw_answers[raw_pos][answer_offset]

In [8]:
# load our trained network from file
# Of the values returned by pacrr.load, this notebook only uses `yp` (fed with
# batches of similarity matrices) and `prel` (fetched as the per-answer score);
# NOTE(review): loss, yn, opt and saver look like training-time handles -- confirm in pacrr.py.
# The session stays open because later cells run the network through `sess`.
sess = tf.Session()
loss, yp, yn, prel, opt, saver = pacrr.load(sess, netfile)

INFO:tensorflow:Restoring parameters from D:\IRfiles\saves\250\e250.cpt


In [9]:
# load questions file (finalEval.txt)
# Every non-blank line is expected to start with an integer question id; any
# text after the first tab is ignored.
testids = []
with open(qfile, 'r') as f:
    for line in f:
        # Split on the first tab instead of slicing up to find("\t"): find()
        # returns -1 when a line has no tab, which used to chop off the last
        # character of the line (a digit, if the line had no trailing newline).
        id_field = line.split("\t", 1)[0].strip()
        if not id_field:
            # blank line (e.g. at the end of the file): nothing to parse
            continue
        testids.append(int(id_field))

In [10]:
# Replace every evaluation question id by its position in `ids`
# (ids are looked up in their string form; an unknown id raises ValueError).
testids = [ids.index(str(question_id)) for question_id in testids]

In [11]:
def split_batches(a, batch_size, drop_remainder=False):
    """Split `a` along its first axis into consecutive batches for the network.

    a              -- array-like exposing `.shape[0]` and slicing (e.g. a
                      NumPy array of similarity matrices).
    batch_size     -- maximum number of rows per batch; must be >= 1.
    drop_remainder -- when True, a trailing batch with fewer than
                      `batch_size` rows is discarded. By default it is kept,
                      so every row of `a` ends up in exactly one batch.

    Returns a list of slices of `a`, in order. Returns an empty list when
    `a` has no rows.

    Raises ValueError if `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    total = a.shape[0]
    # Discarding the incomplete last batch left trailing rows unscored, so it
    # is only done on request.
    stop = total - total % batch_size if drop_remainder else total
    return [a[start:start + batch_size] for start in range(0, stop, batch_size)]

In [12]:
def eval_q(answer_batches, batch_size=128):
    """Score every similarity matrix of every batch with the loaded network.

    answer_batches -- iterable of batches; each batch is fed to the network
                      input `yp`.
    batch_size     -- accepted for call compatibility; not used by the body.

    Returns one flat list with the network output (`prel`) of all batches,
    in the order the batches were given.
    """
    scores = []
    for batch in answer_batches:
        batch_scores = sess.run(prel, feed_dict={yp: batch})
        scores.extend(batch_scores.tolist())
    return scores

In [13]:
# Read the candidates retrieved by our simple Lucene system; these are the
# answers that get re-ranked below.
with open(from_answers_f, "r") as f:
    from_answers = json.loads(f.read())

In [14]:
# for each question we now want to rank it with initial retrieval answers and return best ranked ones.
# `result` collects one {"id": ..., "answers": [...]} entry per evaluation question.

result = []

# how many should we return?
num_to_rank = 5

batch_size = 128
ans_counter = 0


for testqind in testids:
    st = time.time()

    # load question and change it to w2v
    q = questions[testqind][:]
    dl.to_w2v(q)
    q = dl.fix_length_single(q, lq)

    # find corresponding answers from retrieved data
    # (this is similar format to our json result. but instead of answer text we have original question id and ansind)
    # like previously explained in this code. (NOT USED FOR RETRIEVAL)
    for jsonquestion in from_answers:
        if jsonquestion['id'] == ids[testqind]:
            my_answers = jsonquestion['answers']
            break
    else:
        # Nothing was retrieved for this question. Without this branch
        # `my_answers` would still hold the previous question's candidates
        # (or be undefined for the first question), so fall back to an empty
        # candidate list; the question is then reported with no answers.
        print("no retrieved answers for question id %s" % ids[testqind])
        my_answers = []

    # Pre-allocate space for similarity matrices
    sim_matrices = np.empty((len(my_answers), lq, ld))

    # Calculate matrices
    print("loading matrices")
    # Resolve every candidate to its index in `answers` once; the lookup is a
    # linear scan, and the same indexes are needed again for the final texts.
    answer_inds = [get_ans_ind_by_id(ans['originId'], int(ans['index'])) for ans in my_answers]
    for idx, ans_ind in enumerate(answer_inds):
        sim_matrices[idx] = dl.get_cosine_mat(q, answers[ans_ind])

    batches = split_batches(sim_matrices, batch_size)

    # run through network
    print("evaling")
    output_ranking_vals = eval_q(batches)

    print("ranking")
    # find top ranked indexes (in net output), best score first
    ranking = np.array(output_ranking_vals).argsort()[-num_to_rank:][::-1]
    curr_ranking = { "id" : str(ids[testqind])}

    # foreach index of top ranked results: map it from my_answers index to answers index and get the original text.
    curr_answers = [{"answer" : ans_ind_to_text(answer_inds[i]),
                     "score" : output_ranking_vals[i]} for i in ranking]

    curr_ranking["answers"] = curr_answers
    result.append(curr_ranking)
    print("question %d, ind %d, secs %d" % (ans_counter, testqind, time.time()-st))
    ans_counter += 1
    # release the per-question buffers before the next iteration allocates new ones
    del q
    del sim_matrices


loading matrices
evaling
ranking
question 0, ind 85295, secs 81
loading matrices
evaling
ranking
question 1, ind 13158, secs 81
loading matrices
evaling
ranking
question 2, ind 64616, secs 78
loading matrices
evaling
ranking
question 3, ind 30066, secs 83
loading matrices
evaling
ranking
question 4, ind 6135, secs 81
loading matrices
evaling
ranking
question 5, ind 10664, secs 78
loading matrices
evaling
ranking
question 6, ind 11702, secs 81
loading matrices
evaling
ranking
question 7, ind 42131, secs 82
loading matrices
evaling
ranking
question 8, ind 51489, secs 2
loading matrices
evaling
ranking
question 9, ind 70683, secs 81
loading matrices
evaling
ranking
question 10, ind 28407, secs 78
loading matrices
evaling
ranking
question 11, ind 63554, secs 81
loading matrices
evaling
ranking
question 12, ind 24619, secs 81
loading matrices
evaling
ranking
question 13, ind 41639, secs 84
loading matrices
evaling
ranking
question 14, ind 56697, secs 79
loading matrices
evaling
ranking
ques

ranking
question 126, ind 27977, secs 78
loading matrices
evaling
ranking
question 127, ind 66548, secs 82
loading matrices
evaling
ranking
question 128, ind 12880, secs 80
loading matrices
evaling
ranking
question 129, ind 79408, secs 81
loading matrices
evaling
ranking
question 130, ind 85323, secs 82
loading matrices
evaling
ranking
question 131, ind 75850, secs 84
loading matrices
evaling
ranking
question 132, ind 71626, secs 0
loading matrices
evaling
ranking
question 133, ind 3831, secs 79
loading matrices
evaling
ranking
question 134, ind 73306, secs 78
loading matrices
evaling
ranking
question 135, ind 54020, secs 77
loading matrices
evaling
ranking
question 136, ind 78297, secs 84
loading matrices
evaling
ranking
question 137, ind 35739, secs 0
loading matrices
evaling
ranking
question 138, ind 51620, secs 81
loading matrices
evaling
ranking
question 139, ind 4654, secs 77
loading matrices
evaling
ranking
question 140, ind 86446, secs 82
loading matrices
evaling
ranking
questi

evaling
ranking
question 251, ind 54019, secs 88
loading matrices
evaling
ranking
question 252, ind 24966, secs 0
loading matrices
evaling
ranking
question 253, ind 13245, secs 82
loading matrices
evaling
ranking
question 254, ind 5846, secs 84
loading matrices
evaling
ranking
question 255, ind 50631, secs 89
loading matrices
evaling
ranking
question 256, ind 845, secs 87
loading matrices
evaling
ranking
question 257, ind 4685, secs 3
loading matrices
evaling
ranking
question 258, ind 8677, secs 85
loading matrices
evaling
ranking
question 259, ind 30798, secs 85
loading matrices
evaling
ranking
question 260, ind 31078, secs 87
loading matrices
evaling
ranking
question 261, ind 39820, secs 87
loading matrices
evaling
ranking
question 262, ind 79523, secs 86
loading matrices
evaling
ranking
question 263, ind 30219, secs 57
loading matrices
evaling
ranking
question 264, ind 34939, secs 86
loading matrices
evaling
ranking
question 265, ind 66436, secs 84
loading matrices
evaling
ranking
q

In [15]:

# Serialize the re-ranked answers and store them in `outfile`
# (final result before submitting).
with open(outfile, mode='w', encoding="ascii") as f:
    f.write(json.dumps(result, indent=4))