# HW01 Basic Ranked Retrieval (RRI)

## Import Data

In [1]:
# Import required libraries for execution

from IPython.display import clear_output
from gensim import corpora
import numpy as np
import pandas as pd
import csv

In [2]:
# Load the saved gensim vocabulary plus the document and query corpora
# (gensim MmCorpus files) using paths relative to the working directory.
dictionary = corpora.Dictionary.load('vocab.dict')
doc_corpus = corpora.MmCorpus("doc_corpus.mm")
query_corpus = corpora.MmCorpus("query_corpus.mm")
# Loading of the relevance judgments (query/doc pairs) is currently disabled.
#df = pd.read_csv('./data/relevance-judgments.tsv', sep='\t', header=None)
#df.columns = ['query', 'doc']

In [3]:
# Glimpse at the vocabulary: printing the Dictionary shows the number of
# unique tokens and a few sample tokens.
print(dictionary)
# Uncomment to dump the full token -> id mapping.
#print(dictionary.token2id)

Dictionary(17365 unique tokens: ['1', '1785', '1812', '1819', '1820']...)


In [4]:
# Glimpse at the document and query corpora (each prints its summary line)
for corpus in (doc_corpus, query_corpus):
    print(corpus)

MmCorpus(331 documents, 17365 features, 81038 non-zero entries)
MmCorpus(35 documents, 16373 features, 110 non-zero entries)


## Matrix to store Tfidf

In [5]:
# Build the term-frequency (tf) matrix first: one row per vocabulary term,
# one column per document, holding the log-scaled weight log10(1 + count).
tf_matrix = np.zeros((len(dictionary), len(doc_corpus)), dtype=np.float32)

# Each document is iterated as (term_id, count) pairs
for doc_id, doc in enumerate(doc_corpus):
    for term_id, count in doc:
        tf_matrix[term_id, doc_id] = np.log10(1 + count)

# Document frequency: number of documents in which each term occurs
N = len(doc_corpus)
doc_freq = np.count_nonzero(tf_matrix, axis=1)

# Build the idf array: idf = log10(N / df).
# Terms that occur in no document keep idf = 0; dividing by a document
# frequency of zero would otherwise raise ZeroDivisionError.
idf_array = np.zeros((len(dictionary),), dtype=np.float32)
in_corpus = doc_freq > 0
idf_array[in_corpus] = np.log10(N / doc_freq[in_corpus])

# Build the Tfidf matrix by scaling every term row by that term's idf
tfidf_matrix = tf_matrix * idf_array[:, np.newaxis]

#print(tfidf_matrix)

## Basic Ranked Retrieval

In [6]:
def basic_ranked_retrieval(tfidf_matrix, query):
    """Rank documents for a query by summing the tf-idf weights of its terms.

    Args:
        tfidf_matrix (numpy.ndarray): matrix of tf-idf scores indexed as
            [term row, document column].
        query (iterable): query entries whose first element is the term's
            row index in ``tfidf_matrix`` (e.g. ``(term_id, count)`` pairs).
            Only that first element is used.

    Returns:
        numpy.ndarray: structured array with fields ``doc_id`` (1-based
        column position) and ``score``, ordered by descending score and
        restricted to documents whose score is non-zero.

    """
    # Matrix rows addressed by the query terms
    term_rows = [entry[0] for entry in query]

    # Score of each document: column-wise total over the selected rows
    totals = tfidf_matrix[term_rows, :].sum(axis=0)

    # Pair every score with its document ID (column position shifted to 1-based)
    n_docs = len(totals)
    ranking = np.empty(n_docs, dtype=[('doc_id', int), ('score', float)])
    ranking['doc_id'] = np.arange(1, n_docs + 1)
    ranking['score'] = totals

    # Number of documents that matched at least one query term
    hits = np.count_nonzero(ranking['score'])

    # Ascending sort by score, reversed to get the best documents first;
    # keep only the leading entries with a non-zero score
    best_first = np.sort(ranking, order='score')[::-1]
    return best_first[:hits]

## Querying

In [7]:
# Run Basic Ranked Retrieval for every query in the query corpus
TOP_K = 5  # number of best-ranked documents shown per query

print('Basic Ranked Retrieval Query results:')
print('')

for query in query_corpus:
    relevant_docs = basic_ranked_retrieval(tfidf_matrix, query)
    # Show the IDs of at most TOP_K documents
    print(relevant_docs['doc_id'][:TOP_K])

Basic Ranked Retrieval Query results:

[ 16 186 254  85 259]
[147 283 293 149 318]
[291 283 152]
[ 49  19 270 275  10]
[297 329  26 257  29]
[  4 146  34 289  98]
[251 108 110 117 180]
[199 198 177 217 223]
[231  52 100  60  36]
[277 258 250 239 176]
[ 49 272 241  42 219]
[180 121 122 145  81]
[184 132 250 277 176]
[271 121  91  24 172]
[192 194 201 210 207]
[179 323 102   4  21]
[258  49  56 277  11]
[219 276 323  26 318]
[199 282 220 216 201]
[166 328  20 265 156]
[152 143 293 291 147]
[103  51 158 116 227]
[316 136  56  80 147]
[314 133 120 113   1]
[ 25  90 139  67  92]
[216 113  37  75 312]
[257 277 321  23 265]
[169 116  62 327 294]
[263 294 118  39 138]
[307  42 262 283 252]
[174 150 268 128  36]
[298  48 314 125 216]
[ 29 185 164 105 254]
[105 164 185  85 126]
[133 125 122 145 121]
