In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from numba import jit
from scipy.sparse import coo_matrix
import torch
# Prefer the first CUDA GPU, but fall back to the CPU so the notebook still
# runs on machines where CUDA is not available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Read input files

In [2]:
# Load the collection: one entry per line of Collection.txt, with every run
# of whitespace collapsed to a single space.
# The `with` block guarantees the file is closed even if reading fails.
with open('./Collection.txt', "r") as text_file:
    clcs = text_file.read().splitlines()

clc_list = [' '.join(clc.split()) for clc in clcs]

print('size of collection:',len(clc_list))

size of collection: 18461


In [3]:
# Load the documents named in doc_list.txt (one file name per line).
with open('./doc_list.txt', "r") as text_file:
    docs = text_file.read().splitlines()

doc_list = []
for doc in docs:
    # Close each document as soon as it is read; the original left one file
    # handle open per document.
    with open('./Document/' + doc) as f:
        # The first five whitespace-separated tokens are skipped
        # (presumably a file header -- confirm against the data format).
        content = f.read().split()[5:]
    # '-1' tokens are dropped before the text is re-joined.
    content = [x for x in content if x != '-1']
    doc_list.append(' '.join(content))

print('size of document:', len(doc_list))

size of document: 2265


In [4]:
# Load the queries named in query_list.txt (one file name per line).
# Unlike doc_list, each query is kept as a list of tokens, not a joined string.
with open('./query_list.txt', "r") as text_file:
    queries = text_file.read().splitlines()

qry_list = []
for qry in queries:
    # Close each query file as soon as it is read; the original left one file
    # handle open per query.
    with open('./Query/' + qry) as f:
        content = f.read().split()
    # '-1' tokens are dropped.
    qry_list.append([x for x in content if x != '-1'])

print('size of query:', len(qry_list))

size of query: 800


In [5]:
# Load BGLM.txt: each line is "<word> <value>".
# The stored value is exponentiated, so the file presumably holds log-scale
# weights (e.g. log-probabilities of a background model) -- confirm.
with open('./BGLM.txt', "r") as text_file:
    BGLM = text_file.read().splitlines()

idf = {}
for line in BGLM:
    if not line.strip():
        # Tolerate blank lines (e.g. a trailing newline) instead of crashing
        # on the tuple unpack below.
        continue
    word, value = line.split()
    idf[word] = np.exp(float(value))

In [6]:
# Load the feedback sets: line q of Rq100.txt lists indices into doc_list.
# Rq_list[q] is the list of those documents, each split into its tokens.
with open('./Rq100.txt', "r") as text_file:
    Rqs = text_file.read().splitlines()

Rq_list = []
for Rq in Rqs:
    content = [doc_list[int(j)].split() for j in Rq.split()]
    Rq_list.append(content)

# Guard the second dimension so an empty file reports (0, 0) instead of
# raising IndexError.
print('size of Rq_list:',(len(Rq_list),len(Rq_list[0]) if Rq_list else 0))

size of Rq_list: (800, 50)


# Term frequency (TF)

In [7]:
# Fit the vectorizer on the collection followed by the documents.
# A token is any run of digits, so the vocabulary covers both sources.
vectorizer = CountVectorizer(min_df=1, token_pattern='[0-9]+')
clc_tf = vectorizer.fit_transform(clc_list + doc_list).tocoo()

# vocabulary maps each token to its column index in the count matrices.
vocabulary = vectorizer.vocabulary_
print('size of vocabulary:', len(vocabulary))

# Count matrix for the documents alone, using the fitted vocabulary.
doc_tf = vectorizer.transform(doc_list).tocoo()

size of vocabulary: 35028


# Pseudo-relevance feedback (PRF) similarity

In [8]:
def PRF_similarity(Pwd, PLSA, Rq_list, rev_num, q_ratio, Rq_ratio):
    """Score every document against every query with pseudo-relevance feedback.

    For each word, a per-document log score is computed as
    ``log(alpha * Pwd[:, i] + beta * PLSA[:, i] + (1 - alpha - beta) * idf[word])``
    where ``i = vocabulary[word]``. The score of query ``q`` is a weighted sum
    of two length-normalised parts: the words of the query itself, and the
    words of its first ``rev_num`` feedback documents.

    Reads the module-level ``qry_list``, ``vocabulary`` and ``idf``.

    Not decorated with numba's ``@jit``: the body works on Python dicts and
    lists of strings, which numba can only run through its object-mode
    fallback (no speed-up, and the fallback is deprecated).

    Parameters
    ----------
    Pwd, PLSA : 2-D numpy arrays
        One row per document, one column per vocabulary index.
    Rq_list : list
        ``Rq_list[q]`` is the list of feedback documents of query ``q``,
        each document being a list of word strings.
    rev_num : int
        Number of feedback documents to use per query. Values larger than the
        number available use all of them.
    q_ratio, Rq_ratio : float
        Weights of the query part and the feedback part.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(qry_list), Pwd.shape[0])`` with one score per
        (query, document) pair.

    Notes
    -----
    Words missing from ``vocabulary`` contribute nothing to the sum but are
    still counted in the normalising length; this now applies to the feedback
    part as well as the query part. A part with zero length contributes zero
    instead of raising ``ZeroDivisionError``.
    """
    alpha = 0.1
    beta = 0.52
    gamma = 1 - alpha - beta

    # Take the sizes from the inputs rather than from the globals Q and D,
    # which are only defined in a later cell.
    n_queries = len(qry_list)
    n_docs = Pwd.shape[0]

    def _log_score_sum(words):
        """Sum the per-document log scores of all in-vocabulary words."""
        total = np.zeros(n_docs)
        for word in words:
            if word in vocabulary:
                i = vocabulary[word]
                A = alpha * Pwd[:, i]
                B = beta * PLSA[:, i]
                C = gamma * idf[word]
                total += np.log(A + B + C)
        return total

    PRF_sim = np.zeros((n_queries, n_docs))
    for q in range(n_queries):
        # original query part
        query_words = qry_list[q]
        q_len = len(query_words)
        qry_sim = _log_score_sum(query_words)

        # pseudo-relevance feedback part
        # Slicing (not indexing) tolerates rev_num > number of feedback docs.
        feedback_docs = Rq_list[q][:max(rev_num, 0)]
        Rq_len = sum(len(doc) for doc in feedback_docs)
        # One flat pass keeps the accumulation order word by word.
        Rq_sim = _log_score_sum(word for doc in feedback_docs for word in doc)

        # Skip a part whose length is zero instead of dividing by zero.
        if q_len:
            PRF_sim[q, :] += q_ratio / q_len * qry_sim
        if Rq_len:
            PRF_sim[q, :] += Rq_ratio / Rq_len * Rq_sim

        if q % 100 == 99:
            print("query:", q + 1)
    return PRF_sim

# PRF result

In [9]:
# T selects which saved Pwd/PLSA arrays are loaded (Pwd<T>.npy, PLSA<T>.npy).
T = 100
D, Q = len(doc_list), len(qry_list)
Pwd = np.load("Pwd{}.npy".format(T))
PLSA = np.load("PLSA{}.npy".format(T))

# Feedback settings: number of feedback documents per query, then the
# weights of the query part and the feedback part of the score.
rev_num = 20
q_ratio = 0.2
Rq_ratio = 0.8

PRF_sim = PRF_similarity(Pwd, PLSA, Rq_list, rev_num, q_ratio, Rq_ratio)

query: 300
query: 400
query: 500
query: 600
query: 700
query: 800


In [10]:
# Write the ranking file, named after the run parameters.
# Format: a header row, then one row per query as "<query>,<doc> <doc> ... "
# (each document name is followed by a space, including the last one).
fname = "./"+str(T)+'_'+str(rev_num)+'_'+str(q_ratio)+'_'+str(Rq_ratio)+".txt"

# The `with` block closes (and flushes) the file even if a row fails to write.
with open(fname, 'w') as f:
    f.write("Query,RetrievedDocuments\n")
    for q in range(len(qry_list)):
        # Negate the scores so argsort yields document indices from the
        # highest score to the lowest; keep the top 50.
        rank = np.argsort(-PRF_sim[q])
        retrieved = "".join(docs[j] + " " for j in rank[:50])
        f.write(queries[q] + "," + retrieved + "\n")