# HW1 Binary Search (BS)

## Load data

In [1]:
# Import required libraries for excecution
from gensim import corpora
import numpy as np
import pandas as pd
import time

In [2]:
# Load dictionary, doc_corpus, query_corpus and df with tags
dictionary = corpora.Dictionary.load('resources/vocab.dict')
doc_corpus = corpora.MmCorpus('resources/doc_corpus.mm')
query_corpus = corpora.MmCorpus('resources/query_corpus.mm')
df = pd.read_csv('./data/relevance-judgments.tsv', sep='\t', header=None)
df.columns = ['query', 'doc']

## Binary matrix

In [3]:
# Create binary matrix
matrix = np.zeros((len(doc_corpus), len(dictionary)), dtype = np.bool_)
for i in range(0, len(doc_corpus)):
    for j in range(0, len(doc_corpus[i])):
        matrix[i, doc_corpus[i][j][0]] = True
matrix = matrix.transpose()
np.save('./resources/BSmatrix.npy', matrix)
matrix = np.load('./resources/BSmatrix.npy')
print(matrix.shape)

(17365, 331)


## Binary search

In [4]:
def binary_search(matrix, query, logical_op):
    """ Returns a matrix with relevant documents based on a query according to binary search method.
    
    Args:
        matrix (numpy.ndarray): binary matrix with document ando word information
        query (list): words contained in the query
    
    Returns:
        list: with relevant documents according to query
    
    """
    
    # Convert query to binary vector
    query_vector = np.zeros((1, len(dictionary)))
    for i in query:
        query_vector[0,i[0]] = True
    
    # Create empty list to gather revelvant documents
    result_list = []
    for i in range(0, len(doc_corpus)):
        
        # Logical AND - OR between matrix and query vector
        vec = np.nonzero((np.logical_and(query_vector[0], matrix[:,i])))[0]
        vec1 = np.nonzero(query_vector[0])[0]
        if logical_op == 'OR':
            if len(vec) > 0:
                result_list.append(i + 1)
        elif logical_op == 'AND':
            if len(vec) == len(vec1):
                result_list.append(i + 1)
    return result_list

## Querying

In [5]:
""" Runs binary search for each query

Args:
    query_corpus (gensim.corpora.mmcorpus.MmCorpus): corpus with the queries. 
    df (pandas.core.frame.DataFrame): Dataframe with read queries.
    
Returns:
    df (pandas.core.frame.DataFrame): Dataframe with new column with resulting documents for each query.
"""
df_results = []
for query in query_corpus:
    result_list = ''
    results = binary_search(matrix, query, 'AND')
    for result in results:
        result_list = result_list + 'd' + str(f'{result:03}') + ','
    df_results.append(result_list[:-1])
df['results'] = df_results
df = df.set_index('query')
# drop column and export tsv file with results
df.drop('doc', axis=1).to_csv('./results/BS-queries_results_and.tsv', sep='\t', header=False)

In [6]:
""" Runs binary search for each query

Args:
    query_corpus (gensim.corpora.mmcorpus.MmCorpus): corpus with the queries. 
    df (pandas.core.frame.DataFrame): Dataframe with read queries.
    
Returns:
    df (pandas.core.frame.DataFrame): Dataframe with new column with resulting documents for each query.
"""
df_results = []
for query in query_corpus:
    result_list = ''
    results = binary_search(matrix, query, 'OR')
    for result in results:
        result_list = result_list + 'd' + str(f'{result:03}') + ','
    df_results.append(result_list[:-1])
df['results'] = df_results
# drop column and export tsv file with results
df.drop('doc', axis=1).to_csv('./results/BS-queries_results_or.tsv', sep='\t', header=False)

In [7]:
# Measures time spent to exceute each binary search (and, or) over all queries and averages over 30 runs
samples = 10
initial_time = time.time()
for i in range(0, samples):
    for query in query_corpus:
        results = binary_search(matrix, query, 'AND')
total_time = time.time() - initial_time
print('Average time BS_and: %.3fs' %(total_time/samples))

initial_time = time.time()
for i in range(0, samples):
    for query in query_corpus:
        results = binary_search(matrix, query, 'OR')
total_time = time.time() - initial_time
print('Average time BS_or: %.3f s' %(total_time/samples))

Average time BS_and: 1.558 s
Average time BS_or: 1.572 s
