In [1]:
# Import required libraries for execution
from IPython.display import clear_output
from gensim import corpora
import numpy as np
import pandas as pd
import csv

In [2]:
# Load the persisted artefacts this notebook works on:
#   - dictionary:   gensim vocabulary stored in 'vocab.dict'
#   - doc_corpus:   document collection in Matrix Market format
#   - query_corpus: query collection in Matrix Market format
dictionary = corpora.Dictionary.load('vocab.dict')
doc_corpus = corpora.MmCorpus("doc_corpus.mm")
query_corpus = corpora.MmCorpus("query_corpus.mm")

# Tab-separated judgments file without a header row; label its two columns explicitly
df = pd.read_csv(
    './data/relevance-judgments.tsv',
    header=None,
    sep='\t',
)
df.columns = ['query', 'doc']

In [3]:
# Sanity check: show the second (term_id, value) entry of the first document in the corpus
print(doc_corpus[0][1])

(1, 1.0)


In [4]:
# Build the tf matrix first: one row per vocabulary term, one column per document
N = len(doc_corpus)
tf_matrix = np.zeros((len(dictionary), N), dtype=np.float32)
# Doc corpus loop
for doc_id, doc in enumerate(doc_corpus):
    # Log-scaled tf, log10(1 + value), for every (term_id, value) entry of the document
    for term_id, count in doc:
        tf_matrix[term_id, doc_id] = np.log10(1 + count)
# Document frequency: number of documents in which each term has a non-zero tf
doc_freq = np.count_nonzero(tf_matrix, axis=1)
# Build the idf array, idf = log10(N / df).
# Terms that appear in no document keep an idf of 0 instead of triggering a
# ZeroDivisionError, so such terms simply contribute nothing to any vector.
idf_array = np.zeros((len(dictionary),), dtype=np.float32)
in_corpus = doc_freq > 0
idf_array[in_corpus] = np.log10(N / doc_freq[in_corpus])
def dq_to_vec(d_q):
    """Turn a document or query into a dense tf-idf column vector.

    Parameters
    ----------
    d_q : iterable of (term_id, value) pairs
        Sparse representation of a document or a query.

    Returns
    -------
    numpy.ndarray
        float32 array of shape (len(dictionary), 1). Row ``term_id`` holds
        ``log10(1 + value) * idf_array[term_id]``; rows for terms not listed
        in ``d_q`` stay 0.
    """
    weights = np.zeros((len(dictionary), 1), dtype = np.float32)
    for term_id, count in d_q:
        log_tf = np.log10(1 + count)
        weights[term_id] = log_tf * idf_array[term_id]
    return weights

In [5]:
def cosine_similarity(doc,query):
    """Cosine similarity between the tf-idf vectors of a document and a query.

    Both arguments are converted with ``dq_to_vec`` and the result is
    ``dot(d, q) / (||d|| * ||q||)``, where ``||.||`` is the Euclidean norm.

    Parameters
    ----------
    doc, query : iterable of (term_id, value) pairs
        Sparse representations accepted by ``dq_to_vec``.

    Returns
    -------
    float
        The cosine of the angle between the two vectors, or 0.0 when either
        vector is all zeros (the similarity is undefined there, and a NaN
        would otherwise poison any ranking built from the scores).
    """
    # Flatten the (n, 1) column vectors and work in float64 for a stable dot product
    doc_vec = np.asarray(dq_to_vec(doc), dtype=np.float64).ravel()
    query_vec = np.asarray(dq_to_vec(query), dtype=np.float64).ravel()
    # Normalise by the product of the L2 norms, not by the sums of the components
    norm_product = np.linalg.norm(doc_vec) * np.linalg.norm(query_vec)
    if norm_product == 0:
        return 0.0
    return float(np.dot(doc_vec, query_vec) / norm_product)

In [6]:
def ranked_retrieval(query):
    """Rank every document in ``doc_corpus`` against ``query``.

    Parameters
    ----------
    query : iterable of (term_id, value) pairs
        Sparse query representation accepted by ``cosine_similarity``.

    Returns
    -------
    list of int
        1-based positions of the documents in ``doc_corpus`` whose score is
        greater than zero, ordered by decreasing score. Documents with equal
        scores keep their corpus order, and each document appears at most once.
    """
    scores = np.array(
        [cosine_similarity(doc, query) for doc in doc_corpus],
        dtype=np.float64,
    )
    # An undefined similarity (NaN) is treated as "no match"
    scores[np.isnan(scores)] = 0.0
    # A stable sort on the negated scores gives descending order with ties in
    # corpus order; looking positions up with list.index() would instead return
    # the first tied document repeatedly and drop the others
    order = np.argsort(-scores, kind='stable')
    return [int(idx) + 1 for idx in order if scores[idx] > 0]

In [7]:
# For every query, render its ranked document numbers as a comma-separated
# string of labels: 'd' followed by the number zero-padded to three digits
df_results = [
    ','.join(f'd{doc_number:03}' for doc_number in ranked_retrieval(query))
    for query in query_corpus
]
# One result string per row of df, in query_corpus order
df['results'] = df_results
# NOTE: this rebinds df with 'query' as its index, so the cell cannot be re-run as-is
df = df.set_index('query')
# drop column and export tsv file with results
# NOTE(review): the output filename starts with '(' - looks like a typo; confirm before renaming
df.drop('doc', axis=1).to_csv('./results/(RRDV-queries_results.tsv', sep='\t', header=False)