# HW1 Binary Search (BS)

In [1]:
# Import required libraries for execution
from gensim import corpora
import numpy as np
import pandas as pd

In [2]:
# Load the persisted artefacts used by the rest of the notebook.
# Vocabulary saved as a gensim Dictionary.
dictionary = corpora.Dictionary.load("vocab.dict")
# Documents and queries stored as Matrix Market corpora.
doc_corpus = corpora.MmCorpus('doc_corpus.mm')
query_corpus = corpora.MmCorpus('query_corpus.mm')
# Relevance judgments: tab-separated file without a header row,
# so the two column names are assigned explicitly afterwards.
df = pd.read_csv(
    "./data/relevance-judgments.tsv",
    sep="\t",
    header=None,
)
df.columns = ["query", "doc"]

In [3]:
# Create the binary document-term matrix: one row per document, one column
# per dictionary term; matrix[d, t] is True when term t occurs in document d.
matrix = np.zeros((len(doc_corpus), len(dictionary)), dtype=np.bool_)

# Stream the corpus once instead of indexing doc_corpus[i] for every single
# term: each indexed access is a separate document lookup on the corpus,
# whereas iteration visits every document exactly once, in order.
for doc_index, document in enumerate(doc_corpus):
    # Each document is a sequence of (term_id, weight) pairs; only the
    # presence of the term matters for the binary model.
    for term_id, _weight in document:
        matrix[doc_index, term_id] = True

In [4]:
def binary_search(matrix, query):
    """Return the documents that share at least one term with the query.

    This is a Boolean OR retrieval: a document matches as soon as any of
    the query terms is present in it.

    Args:
        matrix (numpy.ndarray): binary document-term matrix of shape
            (n_documents, n_terms); a truthy entry at ``[d, t]`` means
            term ``t`` occurs in document ``d``.
        query (list): query as a sequence of ``(term_id, weight)`` pairs.
            Only the term ids (first element of each pair) are used.

    Returns:
        list: 1-based numbers of the matching documents, in ascending
        order. Empty when the query has no terms or nothing matches.

    Raises:
        IndexError: if a term id is outside the matrix's column range.

    """
    # All sizes come from the matrix argument itself rather than from the
    # module-level dictionary/corpus, so any document-term matrix works.
    term_ids = np.array([entry[0] for entry in query], dtype=np.intp)

    # Keep only the columns of the query terms; a row holding any truthy
    # value there shares at least one term with the query.
    matches = matrix[:, term_ids].any(axis=1)

    # Row indices are 0-based while document numbers are 1-based.
    return (np.flatnonzero(matches) + 1).tolist()

In [5]:
def convert_to_evaluation(binary_result, query, judgments=None):
    """Return a relevance vector for a result list, for evaluation metrics.

    Args:
        binary_result (list): document numbers returned by the binary search.
        query (int): index of the query row in the relevance judgments.
        judgments (pandas.DataFrame, optional): table with a ``'doc'``
            column holding comma-separated entries; the document number of
            each entry is parsed as the text between the letter ``d`` and
            the first ``:``. Defaults to the module-level ``df``.

    Returns:
        numpy.ndarray: float vector aligned with ``binary_result``; element
        ``i`` is 1 when ``binary_result[i]`` is judged relevant for the
        query and 0 otherwise.
    """
    # Fall back to the notebook-wide judgments table when none is supplied,
    # which keeps existing two-argument calls working unchanged.
    if judgments is None:
        judgments = df

    # Collect the relevant document numbers in a set so that each
    # membership test below is constant time.
    relevant_docs = {
        int(entry.split(':')[0].split('d')[1])
        for entry in judgments['doc'][query].split(',')
    }

    # Flag each retrieved document as relevant (1.0) or not (0.0).
    return np.array(
        [doc_id in relevant_docs for doc_id in binary_result],
        dtype=float,
    )