In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from numba import jit
from scipy.sparse import coo_matrix
import torch
# Prefer the first CUDA GPU, but fall back to the CPU so the notebook can still
# be run top-to-bottom on a machine without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Read input files

In [2]:
# Load the collection: one entry per line of Collection.txt, with runs of
# whitespace collapsed to single spaces.
# The context manager closes the file even if reading fails.
with open('./Collection.txt', "r") as text_file:
    clcs = text_file.read().splitlines()

clc_list = [' '.join(clc.split()) for clc in clcs]

print('size of collection:',len(clc_list))

size of collection: 18461


In [3]:
# Load the documents: doc_list.txt names one file per line, each found under
# ./Document/. `docs` keeps the file names (used later when writing rankings).
with open('./doc_list.txt', "r") as text_file:
    docs = text_file.read().splitlines()

doc_list = []
for doc in docs:
    # Open every document in its own context manager so the handle is released
    # immediately instead of leaking one open file per document.
    with open('./Document/' + doc) as f:
        # The first 5 whitespace-separated tokens are skipped
        # (presumably a file header -- TODO confirm against the data format).
        content = f.read().split()[5:]
    # '-1' tokens are dropped (presumably a delimiter -- TODO confirm).
    content = [x for x in content if x != '-1']
    doc_list.append(' '.join(content))

print('size of document:', len(doc_list))

size of document: 2265


In [4]:
# Load the queries: query_list.txt names one file per line, each found under
# ./Query/. `queries` keeps the file names (used later when writing rankings);
# `qry_list` keeps each query as a list of tokens.
with open('./query_list.txt', "r") as text_file:
    queries = text_file.read().splitlines()

qry_list = []
for qry in queries:
    # Close each query file as soon as it has been read instead of leaking it.
    with open('./Query/' + qry) as f:
        content = f.read().split()
    # '-1' tokens are dropped (presumably a delimiter -- TODO confirm).
    qry_list.append([x for x in content if x != '-1'])

print('size of query:', len(qry_list))

size of query: 32


In [5]:
# Load BGLM.txt: every line is "<word> <value>"; `idf[word]` stores exp(value).
# NOTE(review): the values look like log-probabilities of a background language
# model rather than IDF weights, despite the variable name -- confirm.
# The file is closed before parsing so a malformed line cannot leave it open.
with open('./BGLM.txt', "r") as text_file:
    BGLM = text_file.read().splitlines()

idf = {}
for line in BGLM:
    (word, value) = line.split()
    idf[word] = np.exp(float(value))

# TF

In [6]:
# Term-count matrix over the collection followed by the documents.
# Tokens are runs of digits; min_df=1 keeps every token that occurs at all.
vectorizer = CountVectorizer(min_df=1, token_pattern='[0-9]+')
clc_tf = vectorizer.fit_transform([*clc_list, *doc_list]).tocoo()
# vocabulary maps each word to its column index in the count matrices.
vocabulary = vectorizer.vocabulary_
print('size of vocabulary:', len(vocabulary))

# Term-count matrix of the documents alone, using the same columns as clc_tf.
doc_tf = vectorizer.transform(doc_list).tocoo()

size of vocabulary: 35028


# Initialize Pwt and Ptd

In [7]:
def init_Pwt(T, V):
    """Return a random (T, V) array whose rows each sum to 1.

    Values are drawn with ``np.random.rand`` (global NumPy random state) and
    every row is then divided by its own sum.
    """
    weights = np.random.rand(T, V)
    row_totals = weights.sum(axis=1, keepdims=True)
    weights /= row_totals
    return weights

def init_Ptd(D, T):
    """Return a random (D, T) array whose rows each sum to 1.

    Values are drawn with ``np.random.rand`` (global NumPy random state) and
    every row is then divided by its own sum.
    """
    weights = np.random.rand(D, T)
    row_totals = weights.sum(axis=1, keepdims=True)
    weights /= row_totals
    return weights

# E step

In [8]:
@jit(nopython=True)
def E_step(tf_data,tf_row,tf_col, Pwt, Ptd, T):
    """E step: per-entry topic weights for every stored (row, col) count.

    For entry ``e`` with row ``j = tf_row[e]`` and column ``i = tf_col[e]``,
    column ``e`` of the result holds ``Pwt[k, i] * Ptd[j, k]`` for each topic
    ``k``, divided by the sum of those products over all ``T`` topics.
    Entries whose sum is zero are left as zeros.

    ``tf_data`` is only used for its length (the number of stored entries).
    Returns an array of shape ``(T, len(tf_data))``.
    """
    n_entries = len(tf_data)
    Ptwd = np.zeros((T, n_entries))
    for e in range(n_entries):
        doc = tf_row[e]
        word = tf_col[e]
        joint = np.zeros(T)
        total = 0
        for t in range(T):
            prod = Pwt[t, word] * Ptd[doc, t]
            joint[t] = prod
            total += prod
        # Nothing to normalise: keep the zero column for this entry.
        if total == 0:
            continue
        for t in range(T):
            Ptwd[t, e] = joint[t] / total
    return Ptwd

# M step

In [9]:
@jit(nopython=True)
def M_step(tf_data,tf_row,tf_col, Ptwd, clc_len, V, C, T):
    """M step: re-estimate ``Pwt`` (T x V) and ``Ptd`` (C x T).

    Every stored entry ``ij`` (row ``j``, column ``i``) contributes
    ``tf_data[ij] * Ptwd[k, ij]`` to ``Pwt[k, i]`` and to ``Ptd[j, k]``.
    Each row of ``Pwt`` is then divided by the total contribution received by
    that topic, and each row of ``Ptd`` by ``clc_len[j]``.

    A topic that received no mass, or a row with ``clc_len[j] == 0``, keeps an
    all-zero row instead of being divided by zero. This matches how ``E_step``
    treats a zero denominator and avoids NaNs (or a ZeroDivisionError under
    numba's default error model).

    Returns ``(Pwt, Ptd)``.
    """
    Ptd = np.zeros((C, T))
    Pwt = np.zeros((T, V))
    Pwt_denominator = np.zeros(T)
    for ij in range(len(tf_data)):
        j, i = tf_row[ij], tf_col[ij]
        for k in range(T):
            weighted = tf_data[ij] * Ptwd[k, ij]
            Pwt[k, i] += weighted
            Pwt_denominator[k] += weighted
            Ptd[j, k] += weighted
    for k in range(T):
        # Skip topics with no accumulated mass: their row is already all zeros.
        if Pwt_denominator[k] != 0:
            for i in range(V):
                Pwt[k, i] = Pwt[k, i] / Pwt_denominator[k]
    for j in range(C):
        # Skip empty rows: nothing was accumulated for them.
        if clc_len[j] != 0:
            for k in range(T):
                Ptd[j, k] = Ptd[j, k] / clc_len[j]
    return Pwt, Ptd

# M step (fold-in)

In [10]:
@jit(nopython=True)
def M_step_fold_in(tf_data,tf_row,tf_col, Ptwd, doc_len, D, T):
    """Fold-in M step: re-estimate only the (D x T) document-topic array.

    Every stored entry ``ij`` in row ``j`` contributes
    ``tf_data[ij] * Ptwd[k, ij]`` to ``Ptd_fdn[j, k]``; each row is then
    divided by ``doc_len[j]``. ``tf_col`` is accepted for signature parity with
    ``M_step`` but is not read.

    Rows with ``doc_len[j] == 0`` stay all-zero instead of being divided by
    zero (NaN in NumPy, ZeroDivisionError under numba's default error model).
    """
    Ptd_fdn = np.zeros((D, T))
    for ij in range(len(tf_data)):
        j = tf_row[ij]
        for k in range(T):
            Ptd_fdn[j, k] += tf_data[ij] * Ptwd[k, ij]

    for j in range(D):
        # Skip empty documents: nothing was accumulated for them.
        if doc_len[j] != 0:
            for k in range(T):
                Ptd_fdn[j, k] = Ptd_fdn[j, k] / doc_len[j]
    return Ptd_fdn

# Likelihood

In [11]:
@jit(nopython=True)
def likelihood(tf_data, tf_row, tf_col, Pwt, Ptd, T):
    """Count-weighted log-likelihood of the stored entries.

    For every stored entry (row ``j``, column ``i``, count ``tf_data[e]``) the
    mixture ``sum_k Pwt[k, i] * Ptd[j, k]`` over the ``T`` topics is formed and
    ``count * log(mixture)`` is added to the running total, which is returned.
    """
    log_lik = 0
    for e in range(len(tf_data)):
        doc = tf_row[e]
        word = tf_col[e]
        mixture = 0
        for t in range(T):
            mixture += Pwt[t, word] * Ptd[doc, t]
        log_lik += tf_data[e] * np.log(mixture)
    return log_lik

# Similarity

In [12]:
def similarity(alpha, beta, Pwd, PLSA, queries=None, vocab=None, background=None):
    """Score every document against every query; return a (queries x docs) array.

    For each query word found in the vocabulary (column ``i``), the per-document
    value ``log(alpha * Pwd[:, i] + beta * PLSA[:, i]
    + (1 - alpha - beta) * background[word])`` is added to that query's row.
    Words missing from the vocabulary are skipped.

    Parameters
    ----------
    alpha, beta : float
        Weights of the ``Pwd`` and ``PLSA`` terms; the remainder
        ``1 - alpha - beta`` weights the background term.
    Pwd, PLSA : 2-D arrays indexed as ``[document, vocabulary column]``.
    queries : list of token lists, optional
        Defaults to the notebook-level ``qry_list``.
    vocab : dict mapping word -> column, optional
        Defaults to the notebook-level ``vocabulary``.
    background : dict mapping word -> float, optional
        Defaults to the notebook-level ``idf``.

    Raises
    ------
    KeyError
        If a query word is in ``vocab`` but absent from ``background``.

    Notes
    -----
    Deliberately not decorated with numba's ``@jit``: the body walks Python
    lists/dicts of strings and prints, which cannot be compiled in nopython
    mode, so the decorator could only fall back to (or fail without) object mode.
    """
    # Fall back to the notebook globals only for inputs the caller did not pass.
    if queries is None:
        queries = qry_list
    if vocab is None:
        vocab = vocabulary
    if background is None:
        background = idf

    # Sizes come from the inputs themselves rather than from globals Q and D.
    sim = np.zeros((len(queries), Pwd.shape[0]))
    for q, terms in enumerate(queries):
        for word in terms:
            if word in vocab:
                i = vocab[word]
                A = alpha * Pwd[:, i]
                B = beta * PLSA[:, i]
                C = (1 - alpha - beta) * background[word]
                sim[q, :] += np.log(A + B + C)
        # Progress report after every fourth query.
        if q % 4 == 3:
            print("query:", q + 1)
    return sim

# Train

In [14]:
# Problem sizes: C rows in clc_tf (collection entries followed by documents),
# V vocabulary terms, T topics.
C = len(clc_list) + len(doc_list)
V = len(vocabulary)
T = 256
Pwt = init_Pwt(T, V)
Ptd = init_Ptd(C, T)
# To resume from a saved checkpoint instead of a random start:
# Pwt = np.load("Pwt.npy")
# Ptd = np.load("Ptd.npy")
clc_data = clc_tf.data
clc_row = clc_tf.row
clc_col = clc_tf.col
# Row totals are taken on the sparse matrix directly; densifying the full
# C x V count matrix just to sum its rows would need several GB of memory.
clc_len = np.asarray(clc_tf.sum(axis=1)).reshape((C,))

In [15]:
# EM training: 200 iterations, with a checkpoint and likelihood report every 20.
print("iter:", 0, "\tlikelihood:", likelihood(clc_data, clc_row, clc_col, Pwt, Ptd, T))
for count in range(200):
    Ptwd = E_step(clc_data, clc_row, clc_col, Pwt, Ptd, T)
    Pwt, Ptd = M_step(clc_data, clc_row, clc_col, Ptwd, clc_len, V, C, T)
    # Only every 20th completed iteration is checkpointed and reported.
    if (count + 1) % 20 != 0:
        continue
    np.save('Pwt', Pwt)
    np.save('Ptd', Ptd)
    print("iter:", count+1, "\tlikelihood:", likelihood(clc_data, clc_row, clc_col, Pwt, Ptd, T))

iter: 20 	likelihood: -39473196.938522205
iter: 40 	likelihood: -38491133.57485001
iter: 60 	likelihood: -38251146.91385805
iter: 80 	likelihood: -38140396.86830778
iter: 100 	likelihood: -38075253.11421587
iter: 120 	likelihood: -38031666.210489586
iter: 140 	likelihood: -38000660.77803813
iter: 160 	likelihood: -37976712.224048235
iter: 180 	likelihood: -37957182.664669566
iter: 200 	likelihood: -37941024.915237986


# Fold-in

In [16]:
# Fold-in setup: Pwt is reloaded from the saved training checkpoint and only the
# document-topic array for the D documents is (re)initialised.
D = len(doc_list)
Pwt = np.load("Pwt.npy")
Ptd_fdn = init_Ptd(D, T)
# To resume from a saved fold-in checkpoint instead of a random start:
# Ptd_fdn = np.load("Ptd_fdn.npy")
doc_data = doc_tf.data
doc_row = doc_tf.row
doc_col = doc_tf.col
# Row totals are taken on the sparse matrix directly, avoiding a throw-away
# dense D x V copy of the counts.
doc_len = np.asarray(doc_tf.sum(axis=1)).reshape((D,))

In [17]:
# Fold-in EM: 200 iterations updating only Ptd_fdn (Pwt stays fixed), with a
# checkpoint and likelihood report every 20.
for count in range(200):
    Ptwd = E_step(doc_data, doc_row, doc_col, Pwt, Ptd_fdn, T)
    Ptd_fdn = M_step_fold_in(doc_data, doc_row, doc_col, Ptwd, doc_len, D, T)
    # Only every 20th completed iteration is checkpointed and reported.
    if (count + 1) % 20 != 0:
        continue
    np.save('Ptd_fdn', Ptd_fdn)
    print("iter:", count+1, "\tlikelihood:", likelihood(doc_data, doc_row, doc_col, Pwt, Ptd_fdn, T))

iter: 20 	likelihood: -2457532.0504565504
iter: 40 	likelihood: -2457284.071879985
iter: 60 	likelihood: -2457255.814712747
iter: 80 	likelihood: -2457248.4411781672
iter: 100 	likelihood: -2457245.674834265
iter: 120 	likelihood: -2457244.4017677656
iter: 140 	likelihood: -2457243.735669609
iter: 160 	likelihood: -2457243.3507889854
iter: 180 	likelihood: -2457243.1089782664
iter: 200 	likelihood: -2457242.947904937


# Relevance degree

In [18]:
Q, D = len(qry_list), len(doc_list)
# To score from saved arrays instead of the in-memory ones:
# Ptd_fdn = np.load("Ptd_fdn.npy")
# Pwt = np.load("Pwt.npy")
# Pwd: each document's term counts divided by that document's total count.
Pwd = doc_tf.toarray() / doc_len.reshape((D, 1))
# PLSA: (D x T) document-topic array times (T x V) topic-word array.
PLSA = Ptd_fdn @ Pwt

In [19]:
# alpha weights the Pwd term and beta the PLSA term inside similarity();
# the remaining 1 - alpha - beta goes to the background term.
alpha = beta = 0.1
sim = similarity(alpha, beta, Pwd, PLSA)

query: 4
query: 8
query: 12
query: 16
query: 20
query: 24
query: 28
query: 32


In [20]:
# Write one ranking line per query: "<query file>,<doc> <doc> ... " with
# documents ordered by decreasing similarity score.
fname = "./256_"+str(alpha)+"_"+str(beta)+".txt"
# The context manager flushes and closes the file even if a write fails midway.
with open(fname, 'w') as f:
    f.write("Query,RetrievedDocuments\n")

    for q in range(len(qry_list)):
        f.write(queries[q] + ",")
        # Negating the scores makes argsort return the highest score first.
        rank = np.argsort(-sim[q])
        for j in rank:
            f.write(docs[j]+" ")
        f.write("\n")

# Check result

# Compare the lines of two result files; the cell displays True when they match.
# NOTE(review): this reads "./result_<alpha>_<beta>.txt", whereas the results
# cell writes "./256_<alpha>_<beta>.txt" -- confirm which file is meant here.
with open("./result_"+str(alpha)+"_"+str(beta)+".txt", "r") as text_file:
    result = text_file.read().splitlines()

with open('./result_49749.txt', "r") as text_file:
    result_last = text_file.read().splitlines()

result==result_last