# HW01 Ranked Retrieval and Document Vectorization

## Import data

In [1]:
# Import required libraries for execution
from IPython.display import clear_output
from gensim import corpora
import numpy as np
import pandas as pd
import csv
import time

In [2]:
# Relevance judgments: tab-separated file without a header row; its two
# columns are named 'query' and 'doc' right after loading.
df = pd.read_csv('./data/relevance-judgments.tsv', sep='\t', header=None)
df.columns = ['query', 'doc']
# Previously saved gensim objects: the vocabulary and the document / query
# corpora stored in Matrix Market format.
query_corpus = corpora.MmCorpus("resources/query_corpus.mm")
doc_corpus = corpora.MmCorpus("resources/doc_corpus.mm")
dictionary = corpora.Dictionary.load('resources/vocab.dict')

In [3]:
# Show the summary line of the document corpus, then of the query corpus.
print(doc_corpus, query_corpus, sep='\n')

MmCorpus(331 documents, 17365 features, 81038 non-zero entries)
MmCorpus(35 documents, 16373 features, 110 non-zero entries)


In [4]:
# Inverse document frequency computed from the stored matrix: for row i,
# idf = log10(N / number of non-zero entries in that row), where N is the
# number of documents in doc_corpus.
# NOTE(review): presumably row i corresponds to dictionary id i and columns
# to documents -- confirm against the code that produced BSmatrix.npy.
binary_matrix = np.load('./resources/BSmatrix.npy')
N = len(doc_corpus)
idf_array = np.zeros((len(dictionary), 1), dtype=np.float32)
for i, row in enumerate(binary_matrix):
    # Plain int keeps the division in Python floats, as before.
    idf_array[i] = np.log10(N / int(np.count_nonzero(row)))
print(idf_array)

[[0.28178188]
 [1.5655855 ]
 [1.519828  ]
 ...
 [2.519828  ]
 [2.519828  ]
 [2.519828  ]]


# tf-idf model

In [5]:
# Term-frequency matrix: one row per dictionary entry, one column per
# document, holding log10(1 + value) for every (id, value) pair of the doc.
tf_matrix = np.zeros((len(dictionary), len(doc_corpus)), dtype=np.float32)
for doc_id, doc in enumerate(doc_corpus):
    for term in doc:
        term_id, count = term[0], term[1]
        tf_matrix[term_id, doc_id] = np.log10(1 + count)
# idf per row of the tf matrix: log10(N / number of non-zero columns).
N = len(doc_corpus)
idf_array1 = np.zeros((len(dictionary),), dtype=np.float32)
for i, tf_row in enumerate(tf_matrix):
    # Plain int keeps the division in Python floats, as before.
    idf_array1[i] = np.log10(N / int(np.count_nonzero(tf_row)))
print(idf_array1)

[0.28178188 1.5655855  1.519828   ... 2.519828   2.519828   2.519828  ]


In [6]:
# Sanity check: print every index at which the two idf computations differ
# (nothing is printed when they agree everywhere).
for i, binary_idf in enumerate(idf_array):
    if idf_array1[i] != binary_idf:
        print(i)

## Necessary functions

In [7]:
def dq_to_vec(d_q):
    """Build the tf-idf column vector of a document or query.

    Args:
        d_q (list): Document or query given as a sequence of entries whose
            first item is a term id and whose second item is its value.

    Returns:
        numpy.ndarray: float32 array of shape (len(dictionary), 1) holding
            log10(1 + value) * idf for every listed term id and 0 elsewhere.

    """
    weights = np.zeros((len(dictionary), 1), dtype=np.float32)
    for entry in d_q:
        term_id, count = entry[0], entry[1]
        # Log-scaled term frequency weighted by the module-level idf table.
        weights[term_id] = np.log10(1 + count) * idf_array[term_id]
    return weights

In [8]:
def cosine_similarity(doc, query):
    """Return the cosine similarity between a document and a query.

    Both arguments are vectorized with ``dq_to_vec`` before being compared.

    Args:
        doc (list): Document to be compared to the query.
        query (list): Given query to find related documents.

    Returns:
        numpy.float32: Cosine similarity between query and document, or 0
            when either vector has zero length (instead of the NaN that a
            0/0 division would produce).

    """
    doc_vec = dq_to_vec(doc)
    query_vec = dq_to_vec(query)
    # NumPy reductions replace the built-in sum(), which walked the column
    # vector element by element in Python.
    norm_product = np.linalg.norm(doc_vec) * np.linalg.norm(query_vec)
    if norm_product == 0:
        # An all-zero vector shares no weighted term with anything.
        return np.float32(0.0)
    return np.float32(np.sum(doc_vec * query_vec) / norm_product)

In [9]:
def ranked_retrieval(query):
    """Return the documents relevant to a query, best match first.

    Every document of ``doc_corpus`` is scored against the query with
    ``cosine_similarity``. Only documents with a strictly positive score
    are kept, so zero and NaN scores are left out.

    Args:
        query (list): Query to find related documents to.

    Returns:
        list: 1-based positions in ``doc_corpus`` of the relevant
            documents, ordered by decreasing score. Documents with equal
            scores keep their corpus order.

    """
    # Keep each score paired with its document position. Looking positions
    # up afterwards by score value (list.index) returns the first match
    # only, which repeats one document and drops the others on ties.
    scored = [
        (position, cosine_similarity(doc, query))
        for position, doc in enumerate(doc_corpus, start=1)
    ]
    relevant = [pair for pair in scored if pair[1] > 0]
    # list.sort is stable, also with reverse=True, so ties stay in corpus
    # order.
    relevant.sort(key=lambda pair: pair[1], reverse=True)
    return [position for position, _ in relevant]

## Querying

In [10]:
""" Runs ranked retrieval and document vectorizarion for each query

Args:
    query_corpus (gensim.corpora.mmcorpus.MmCorpus): corpus with the queries. 
    df (pandas.core.frame.DataFrame): Dataframe with read queries.
    
Returns:
    df (pandas.core.frame.DataFrame): Dataframe with new column with resulting documents for each query.
"""

# One entry per query: the retrieved document positions rendered as
# zero-padded ids ('d001', 'd012', ...) and joined with commas. A query
# with no retrieved document yields an empty string.
df_results = [
    ','.join(f'd{doc_id:03}' for doc_id in ranked_retrieval(query))
    for query in query_corpus
]
df['results'] = df_results
df = df.set_index('query')
# Export only the query index and its results: the 'doc' column is dropped
# and no header row is written.
results_only = df.drop('doc', axis=1)
results_only.to_csv('./results/RRDV-queries_results.tsv', sep='\t', header=False)