In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from numba import jit
from scipy.sparse import coo_matrix
# import torch
# device = torch.device("cuda:0")

# Read input files

In [2]:
# Load the background collection: one document per line of Collection.txt.
# `with` guarantees the handle is closed even if reading raises.
with open('./Collection.txt', "r") as text_file:
    clcs = text_file.read().splitlines()

# Normalise every line to single-space-separated tokens (split() with no
# argument collapses any run of whitespace).
clc_list = [' '.join(clc.split()) for clc in clcs]

print('size of collection:', len(clc_list))

size of collection: 18461


In [3]:
# doc_list.txt names one document file per line. `docs` keeps those names
# because later cells write them into the result files.
with open('./doc_list.txt', "r") as text_file:
    docs = text_file.read().splitlines()

doc_list = []
for doc in docs:
    # Close each document file as soon as it is read; the previous version
    # left one open handle per document.
    with open('./Document/' + doc) as f:
        # The first five whitespace-separated tokens are skipped (presumably a
        # header -- confirm against the data format) and '-1' tokens are
        # dropped (presumably separators).
        content = [x for x in f.read().split()[5:] if x != '-1']
    doc_list.append(' '.join(content))

print('size of document:', len(doc_list))

size of document: 2265


In [4]:
# query_list.txt names one query file per line. `queries` keeps those names
# because later cells write them into the result files.
with open('./query_list.txt', "r") as text_file:
    queries = text_file.read().splitlines()

# Unlike doc_list, each query is kept as a list of tokens (not a joined string).
qry_list = []
for qry in queries:
    # Close each query file as soon as it is read; the previous version left
    # one open handle per query.
    with open('./Query/' + qry) as f:
        # '-1' tokens are dropped (presumably separators -- confirm).
        content = [x for x in f.read().split() if x != '-1']
    qry_list.append(content)

print('size of query:', len(qry_list))

size of query: 800


In [5]:
# BGLM.txt: every line is "<word> <value>". The value is exponentiated on
# load, so `idf[word]` holds exp(value).
# NOTE(review): the file name suggests a background language model with
# log-probabilities, which would make `idf` a misnomer -- confirm.
with open('./BGLM.txt', "r") as text_file:
    BGLM = text_file.read().splitlines()

idf = {}
for line in BGLM:
    # Raises ValueError if a line does not contain exactly two fields.
    word, value = line.split()
    idf[word] = np.exp(float(value))

# TF

In [6]:
# Term-frequency matrices over one shared vocabulary.
# A token is any run of digits; min_df=1 keeps every token that occurs at all.
vectorizer = CountVectorizer(min_df=1, token_pattern='[0-9]+')

# The vocabulary is fitted on the collection followed by the target documents,
# so rows [0, len(clc_list)) of clc_tf are collection lines and the rest are
# the documents of doc_list.
training_corpus = clc_list + doc_list
clc_tf = vectorizer.fit_transform(training_corpus).tocoo()

# Mapping {word -> column index of the term-frequency matrices}.
vocabulary = vectorizer.vocabulary_
print('size of vocabulary:', len(vocabulary))

# Counts for the target documents only, using the same columns as clc_tf.
doc_tf = vectorizer.transform(doc_list).tocoo()

size of vocabulary: 35028


# Initialize Pwt = P(w|t) and Ptd = P(t|d)

In [7]:
def init_Pwt(T, V):
    """Return a random (T, V) array in which every row sums to 1.

    Entries are drawn with ``np.random.rand`` (global NumPy random state) and
    each row is divided by its own total.
    """
    weights = np.random.rand(T, V)
    row_totals = weights.sum(axis=1, keepdims=True)
    return weights / row_totals

def init_Ptd(D, T):
    """Return a random (D, T) array in which every row sums to 1.

    Entries are drawn with ``np.random.rand`` (global NumPy random state) and
    each row is divided by its own total.
    """
    mixture = np.random.rand(D, T)
    return mixture / mixture.sum(axis=1, keepdims=True)

# E step

In [8]:
@jit(nopython=True)
def E_step(tf_data,tf_row,tf_col, Pwt, Ptd, T):
    """Compute the topic posterior for every stored term-frequency entry.

    The term-frequency matrix is given in COO form: entry ``e`` belongs to row
    ``tf_row[e]`` and column ``tf_col[e]`` (``tf_data`` is only used for its
    length). For each entry the result column holds

        Ptwd[t, e] = Pwt[t, col] * Ptd[row, t] / sum_t'(Pwt[t', col] * Ptd[row, t'])

    Columns whose normaliser is exactly zero are left as zeros.

    Returns an array of shape (T, number of entries).
    """
    n_entries = len(tf_data)
    Ptwd = np.zeros((T, n_entries))
    # Scratch buffer for the unnormalised products; fully overwritten per entry.
    joint = np.zeros(T)
    for e in range(n_entries):
        row = tf_row[e]
        col = tf_col[e]
        total = 0.0
        for t in range(T):
            product = Pwt[t, col] * Ptd[row, t]
            joint[t] = product
            total += product
        if total != 0:
            for t in range(T):
                Ptwd[t, e] = joint[t] / total
    return Ptwd

# M step

In [9]:
@jit(nopython=True)
def M_step(tf_data,tf_row,tf_col, Ptwd, clc_len, V, C, T):
    """Re-estimate Pwt (T x V) and Ptd (C x T) from the posteriors Ptwd.

    The term-frequency matrix is given in COO form (``tf_data[e]`` at row
    ``tf_row[e]``, column ``tf_col[e]``); ``Ptwd[t, e]`` is the posterior of
    topic ``t`` for entry ``e`` as produced by ``E_step``.

    * ``Pwt[t, w]`` is the count-weighted posterior mass of word ``w`` under
      topic ``t``, normalised by the total mass of topic ``t``.
    * ``Ptd[d, t]`` is the count-weighted posterior mass of topic ``t`` in row
      ``d``, normalised by ``clc_len[d]``.

    A topic with zero total mass, or a row with ``clc_len[d] == 0``, keeps an
    all-zero distribution instead of dividing by zero (mirrors the guard that
    ``E_step`` applies to its own normaliser).

    Returns the tuple ``(Pwt, Ptd)``.
    """
    Ptd = np.zeros((C, T))
    Pwt = np.zeros((T, V))
    Pwt_denominator = np.zeros(T)
    for ij in range(len(tf_data)):
        j, i = tf_row[ij], tf_col[ij]
        for k in range(T):
            # Same contribution feeds all three accumulators; compute it once.
            weighted = tf_data[ij] * Ptwd[k, ij]
            Pwt[k, i] += weighted
            Pwt_denominator[k] += weighted
            Ptd[j, k] += weighted
    for k in range(T):
        # Skip empty topics: their numerators are all zero as well.
        if Pwt_denominator[k] != 0:
            for i in range(V):
                Pwt[k, i] = Pwt[k, i] / Pwt_denominator[k]
    for j in range(C):
        # Skip rows without any counted token.
        if clc_len[j] != 0:
            for k in range(T):
                Ptd[j, k] = Ptd[j, k] / clc_len[j]
    return Pwt, Ptd

# M step (fold-in)

In [10]:
@jit(nopython=True)
def M_step_fold_in(tf_data,tf_row,tf_col, Ptwd, doc_len, D, T):
    """Re-estimate only the per-row topic mixture (D x T); Pwt is not touched.

    The term-frequency matrix is given in COO form (``tf_data[e]`` at row
    ``tf_row[e]``); ``Ptwd[t, e]`` is the posterior of topic ``t`` for entry
    ``e``. ``Ptd_fdn[d, t]`` is the count-weighted posterior mass of topic
    ``t`` in row ``d`` divided by ``doc_len[d]``.

    Rows with ``doc_len[d] == 0`` keep an all-zero mixture instead of dividing
    by zero.
    """
    Ptd_fdn = np.zeros((D, T))
    for ij in range(len(tf_data)):
        j = tf_row[ij]
        for k in range(T):
            Ptd_fdn[j, k] += tf_data[ij] * Ptwd[k, ij]

    for j in range(D):
        # Skip rows without any counted token.
        if doc_len[j] != 0:
            for k in range(T):
                Ptd_fdn[j, k] = Ptd_fdn[j, k] / doc_len[j]
    return Ptd_fdn

# Likelihood

In [11]:
@jit(nopython=True)
def likelihood(tf_data, tf_row, tf_col, Pwt, Ptd, T):
    """Return the log-likelihood of the counts under the current model.

    For every COO entry ``e`` (row ``tf_row[e]``, column ``tf_col[e]``) this
    adds ``tf_data[e] * log(sum_t Pwt[t, col] * Ptd[row, t])``.
    """
    log_lik = 0.0
    n_entries = len(tf_data)
    for e in range(n_entries):
        row = tf_row[e]
        col = tf_col[e]
        # Mixture probability of this word in this row.
        mixture = 0.0
        for t in range(T):
            mixture += Pwt[t, col] * Ptd[row, t]
        log_lik += tf_data[e] * np.log(mixture)
    return log_lik

# Similarity

In [12]:
def similarity(alpha, beta, Pwd, PLSA, qry_tokens=None, vocab=None, bg_prob=None):
    """Score every document against every query with a three-way mixture.

    For each query word found in the vocabulary (column ``i``) the score of
    all documents is increased by

        log(alpha * Pwd[:, i] + beta * PLSA[:, i] + (1 - alpha - beta) * bg_prob[word])

    Words that are not in the vocabulary are ignored.

    Parameters
    ----------
    alpha, beta : float
        Mixture weights of ``Pwd`` and ``PLSA``; the remainder goes to the
        per-word background value.
    Pwd, PLSA : ndarray, one row per document, one column per vocabulary word.
    qry_tokens : list of token lists, optional (defaults to global ``qry_list``).
    vocab : dict word -> column, optional (defaults to global ``vocabulary``).
    bg_prob : dict word -> float, optional (defaults to global ``idf``).

    Returns
    -------
    ndarray of shape (number of queries, number of documents).

    Notes
    -----
    The former bare ``@jit`` was removed: the body works on Python dicts and
    lists, which Numba cannot compile in nopython mode, and the per-word work
    is already vectorised NumPy. The sizes are taken from the inputs instead of
    the globals ``Q`` and ``D``, which are only assigned in a later cell.
    Progress is printed every 50 queries.
    """
    if qry_tokens is None:
        qry_tokens = qry_list
    if vocab is None:
        vocab = vocabulary
    if bg_prob is None:
        bg_prob = idf

    n_queries = len(qry_tokens)
    n_docs = Pwd.shape[0]
    sim = np.zeros((n_queries, n_docs))
    for q in range(n_queries):
        for word in qry_tokens[q]:
            if word in vocab:
                i = vocab[word]
                A = alpha * Pwd[:, i]
                B = beta * PLSA[:, i]
                # Scalar background term, broadcast over all documents.
                bg = (1 - alpha - beta) * bg_prob[word]
                sim[q, :] += np.log(A + B + bg)
        if q % 50 == 49:
            print("query:", q + 1)
    return sim

# Train

In [13]:
# Training-set dimensions: C rows (collection lines + documents), V words,
# T latent topics.
C = len(clc_list) + len(doc_list)  # avoids building a throw-away concatenated list
V = len(vocabulary)
T = 256
Pwt = init_Pwt(T, V)
Ptd = init_Ptd(C, T)
# To resume from a checkpoint written by the training loop instead:
# Pwt = np.load("Pwt.npy")
# Ptd = np.load("Ptd.npy")

# COO triplets of the training term-frequency matrix.
clc_data = clc_tf.data
clc_row = clc_tf.row
clc_col = clc_tf.col
# Row sums (token count per row) taken on the sparse matrix directly; the
# previous `.toarray()` materialised the full dense C x V matrix just for this.
clc_len = np.asarray(clc_tf.sum(axis=1)).reshape((C,))

In [14]:
# EM training: 200 iterations, with a checkpoint and a likelihood report after
# every 50th iteration.
print("iter:", 0, "\tlikelihood:", likelihood(clc_data, clc_row, clc_col, Pwt, Ptd, T))
for count in range(200):
    Ptwd = E_step(clc_data, clc_row, clc_col, Pwt, Ptd, T)
    Pwt, Ptd = M_step(clc_data, clc_row, clc_col, Ptwd, clc_len, V, C, T)

    completed = count + 1
    if completed % 50 != 0:
        continue
    # Checkpoint both parameter matrices (np.save appends ".npy").
    np.save('Pwt', Pwt)
    np.save('Ptd', Ptd)
    print("iter:", completed, "\tlikelihood:", likelihood(clc_data, clc_row, clc_col, Pwt, Ptd, T))

iter: 0 	likelihood: -62569364.47501534
iter: 50 	likelihood: -38364480.48140072


KeyboardInterrupt: 

# Fold-in

In [None]:
# Fold-in setup: Pwt stays as trained; only the per-document topic mixture
# Ptd_fdn is (re-)estimated for the D target documents.
D = len(doc_list)
# Pwt = np.load("Pwt.npy")
Ptd_fdn = init_Ptd(D, T)
# Ptd_fdn = np.load("Ptd_fdn.npy")

# COO triplets of the document term-frequency matrix.
doc_data = doc_tf.data
doc_row = doc_tf.row
doc_col = doc_tf.col
# Token count per document, summed on the sparse matrix directly instead of
# materialising the dense D x V array.
doc_len = np.asarray(doc_tf.sum(axis=1)).reshape((D,))

In [None]:
# Fold-in: 200 EM iterations that update only Ptd_fdn, with a checkpoint and a
# likelihood report after every 50th iteration.
print("iter:", 0, "\tlikelihood:", likelihood(doc_data, doc_row, doc_col, Pwt, Ptd_fdn, T))
for count in range(200):
    Ptwd = E_step(doc_data, doc_row, doc_col, Pwt, Ptd_fdn, T)
    Ptd_fdn = M_step_fold_in(doc_data, doc_row, doc_col, Ptwd, doc_len, D, T)

    completed = count + 1
    if completed % 50 != 0:
        continue
    np.save('Ptd_fdn', Ptd_fdn)
    print("iter:", completed, "\tlikelihood:", likelihood(doc_data, doc_row, doc_col, Pwt, Ptd_fdn, T))

# Relevance degree

In [None]:
# Sizes used by similarity(): D documents, Q queries.
D = len(doc_list)
Q = len(qry_list)
# Ptd_fdn = np.load("Ptd_fdn.npy")
# Pwt = np.load("Pwt.npy")

# Per-document relative term frequencies: counts divided by document length.
Pwd = doc_tf.toarray() / doc_len.reshape(D, 1)
# Per-document word distribution reconstructed from the topic model: (D x T) . (T x V).
PLSA = Ptd_fdn.dot(Pwt)
# Weights: 0.1 for Pwd, 0.52 for PLSA, the remainder for the background term.
sim = similarity(0.1, 0.52, Pwd, PLSA)

In [None]:
# Write the first-pass ranking twice:
#   * fname   -- "query,doc doc ..." with document names (result file)
#   * Rq.txt  -- the same top-50 as row indices into doc_list, one query per
#                line, read back later for pseudo-relevance feedback.
fname = "./256_0.1_0.52.txt"
# `with` closes both files even if an exception interrupts the loop.
with open("./Rq.txt", 'w') as Rq_file, open(fname, 'w') as f:
    f.write("Query,RetrievedDocuments\n")

    for q in range(len(qry_list)):
        f.write(queries[q] + ",")
        # Negate so that argsort yields the highest scores first.
        rank = np.argsort(-sim[q])
        for j in rank[:50]:
            Rq_file.write(str(j)+" ")
            f.write(docs[j]+" ")
        f.write("\n")
        Rq_file.write("\n")

# Read file Rq

In [1]:
# Read back the first-pass ranking: one query per line, each line a
# space-separated list of document row indices.
with open('./Rq.txt', "r") as text_file:
    Rqs = text_file.read().splitlines()

Rq_list = [[int(j) for j in Rq.split()] for Rq in Rqs]

# Guard the width lookup so an empty file reports (0, 0) instead of raising.
print('size of Rq:',(len(Rq_list),len(Rq_list[0]) if Rq_list else 0))

size of Rq: (800, 50)


In [5]:
# Sanity check: first ten parsed document indices for the first query.
Rq_list[0][:10]

[1701, 2240, 2254, 2260, 63, 1713, 1638, 1628, 971, 644]

In [6]:
# Sanity check: the raw (unparsed) Rq.txt line for the first query.
Rqs[0]

'1701 2240 2254 2260 63 1713 1638 1628 971 644 47 1508 355 1443 817 1700 271 1500 1712 1720 1702 259 816 332 331 506 233 151 330 2005 475 1398 1807 508 1335 970 264 1000 243 2166 751 1821 753 1338 477 1328 2239 692 1200 1520 '

# TF-IDF

In [69]:
from sklearn.feature_extraction.text import TfidfTransformer

# Queries are stored as token lists; join them so the fitted CountVectorizer
# can map them onto the same columns as the documents.
qry_str_list = [' '.join(qry) for qry in qry_list]
qry_tf = vectorizer.transform(qry_str_list).tocoo()

transformer = TfidfTransformer()
# The transformer accepts sparse count matrices, so the inputs are passed as-is
# (the previous `.toarray()` calls built dense copies first). Only the results
# are densified, because the Rocchio and cosine cells index dense rows.
# NOTE(review): IDF is fitted separately on the queries and on the documents,
# so the two sets of vectors use different term weights. Fitting on doc_tf and
# calling transformer.transform(qry_tf) would put both in one space -- confirm
# which is intended before changing, as it alters the ranking.
qry_tfidf = transformer.fit_transform(qry_tf).toarray()
doc_tfidf = transformer.fit_transform(doc_tf).toarray()
print("qry_tfidf.shape:",qry_tfidf.shape)
print("doc_tfidf.shape:",doc_tfidf.shape)

qry_tfidf.shape: (800, 35028)
doc_tfidf.shape: (2265, 35028)


# The Rocchio Algorithm

In [92]:
def feedback_qry_tfidf(qry_tfidf,doc_tfidf,Rq_list,rev_num,alpha,beta,Q,V):
    """Rocchio query expansion with positive feedback only.

    For each query ``q`` the new vector is

        alpha * qry_tfidf[q] + beta * mean(doc_tfidf[j] for j in top)

    where ``top`` is the first ``rev_num`` entries of ``Rq_list[q]``.

    Parameters
    ----------
    qry_tfidf : ndarray (Q, V) -- original query vectors.
    doc_tfidf : ndarray (number of documents, V) -- document vectors.
    Rq_list : sequence of sequences of int -- ranked document row indices per query.
    rev_num : int -- number of top-ranked documents treated as relevant.
    alpha, beta : float -- weights of the original query and of the centroid.
    Q, V : int -- number of queries and vocabulary size (shape of the result).

    Returns
    -------
    ndarray (Q, V).

    If a query has fewer than ``rev_num`` ranked documents, the centroid is
    taken over those available; with none (or ``rev_num <= 0``) the result is
    just ``alpha * qry_tfidf[q]``. Previously these cases raised IndexError or
    divided by zero.
    """
    expanded = np.zeros((Q, V))
    for q in range(Q):
        top = list(Rq_list[q][:max(rev_num, 0)])
        if top:
            # Fancy indexing gathers the feedback rows; sum them in one call.
            rev_doc_tfidf_sum = doc_tfidf[top].sum(axis=0)
            expanded[q] = alpha*qry_tfidf[q] + beta*rev_doc_tfidf_sum/len(top)
        else:
            expanded[q] = alpha*qry_tfidf[q]
    return expanded

In [100]:
# Rocchio settings: use the top 10 first-pass documents as pseudo-relevant,
# weight the original query 0.8 and the feedback centroid 0.2.
rev_num = 10
alpha, beta = 0.8, 0.2
new_qry_tfidf = feedback_qry_tfidf(
    qry_tfidf=qry_tfidf,
    doc_tfidf=doc_tfidf,
    Rq_list=Rq_list,
    rev_num=rev_num,
    alpha=alpha,
    beta=beta,
    Q=Q,
    V=V,
)

# Cosine Similarity

In [101]:
def cos_sim(v1,v2):
    """Return the cosine similarity of two 1-D vectors.

    If either vector has zero norm the similarity is defined as 0.0 here
    (previously this divided by zero and produced NaN, which then poisons any
    ranking built on the scores).
    """
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        return 0.0
    return np.dot(v1, v2) / denominator

# PRF result

In [102]:
# Second-pass retrieval: rank ALL documents by cosine similarity to the
# Rocchio-expanded query and write the top 50 per query.
# The previous loop `for j in range(50)` compared each query only against the
# first 50 rows of doc_tfidf, so documents 50..D-1 could never be retrieved.
# NOTE(review): if the intent was to re-rank only the first-pass results, score
# the rows in Rq_list[q] instead -- confirm.
fname = "./"+str(rev_num)+"_"+str(alpha)+"_"+str(beta)+".txt"

# Document norms do not depend on the query; compute them once.
doc_norms = np.linalg.norm(doc_tfidf, axis=1)

with open(fname, 'w') as f:
    f.write("Query,RetrievedDocuments\n")

    for q in range(len(qry_list)):
        f.write(queries[q] + ",")

        # Cosine similarity against every document in one matrix-vector product.
        dots = np.dot(doc_tfidf, new_qry_tfidf[q])
        denominators = doc_norms * np.linalg.norm(new_qry_tfidf[q])
        # Zero-norm documents/queries score 0 instead of NaN.
        prf_sim = np.divide(dots, denominators,
                            out=np.zeros_like(dots), where=denominators != 0)

        # Negate so that argsort yields the highest scores first.
        rank = np.argsort(-prf_sim)
        for j in rank[:50]:
            f.write(docs[j]+" ")
        f.write("\n")