# HW1 Binary Search using Inverted Index

In [2]:
import time
import numpy as np
import pandas as pd
from gensim import corpora

In [None]:
# Loads the serialized gensim vocabulary and the document corpus from the
# resources folder. `dictionary` is only used below for its size (number of
# distinct tokens); `doc_corpus` supplies the documents that get indexed.
dictionary = corpora.Dictionary.load('resources/vocab.dict')
doc_corpus = corpora.MmCorpus('resources/doc_corpus.mm')

In [None]:
# SPIMI-like index construction.
#
# Step 1 collects, for every token ID, the IDs of the documents containing it.
# Step 2 packs those posting lists into one fixed-width integer matrix and
# saves it to disk.

# Maps token ID -> list of document IDs (in increasing order, because the
# corpus is scanned front to back).
index_dictionary = {}

# Document IDs are 1-based: the first document of the corpus is document 1.
for doc_id, document in enumerate(doc_corpus, start=1):

    # Every entry of a document is a tuple whose first item is the token ID.
    for entry in document:
        index_dictionary.setdefault(entry[0], []).append(doc_id)

# One row per vocabulary entry. Column 0 stores the number of postings of the
# token; columns 1.. store the document IDs, padded with zeros on the right.
inverted_index = np.zeros((len(dictionary), len(doc_corpus) + 1), dtype=np.int_)

for token_id, postings in index_dictionary.items():

    n_postings = len(postings)

    inverted_index[token_id, 0] = n_postings
    inverted_index[token_id, 1:n_postings + 1] = postings

# Persists the inverted index matrix
np.save('./data/IImatrix.npy', inverted_index)

In [3]:
# Reloads the inverted index matrix from the .npy file so the query cells can
# run without rebuilding the index.
inverted_index = np.load('./data/IImatrix.npy')

In [4]:
def merge(posting_list_0, posting_list_1, operator, n_docs, mod_term_0=False, mod_term_1=False):
    """Merge two posting lists with a boolean operator.

    Both posting lists are expected to be sorted in ascending order of
    document ID and free of duplicates; the merge walks them in lockstep
    with one cursor per list.

    Args:
        posting_list_0: First posting list (any sized, sliceable sequence of
            document IDs, e.g. a list or a 1-D numpy array).
        posting_list_1: Second posting list, same requirements.
        operator: "AND" for intersection, "OR" for union.
        n_docs: Bound used to build the complement when a modifier is set;
            the complement is taken over ``range(1, n_docs)``.
        mod_term_0: When true, the first posting list is replaced by its
            complement (logical NOT) before merging.
        mod_term_1: Same as ``mod_term_0`` for the second posting list.

    Returns:
        A new list holding the merged document IDs in ascending order.

    Raises:
        ValueError: If ``operator`` is neither "AND" nor "OR".
    """
    if operator not in ("OR", "AND"):
        raise ValueError("Not valid boolean operator. Operator must be either 'OR' or 'AND'.")

    # NOTE(review): range(1, n_docs) leaves document ID n_docs out of the
    # complement. That is only right if callers pass (number of documents + 1);
    # confirm against the corpus size before relying on the NOT modifiers.
    if mod_term_0:
        excluded = set(posting_list_0)
        posting_list_0 = [doc_id for doc_id in range(1, n_docs) if doc_id not in excluded]

    if mod_term_1:
        excluded = set(posting_list_1)
        posting_list_1 = [doc_id for doc_id in range(1, n_docs) if doc_id not in excluded]

    is_union = operator == "OR"

    # len() instead of truthiness so numpy arrays are accepted as well.
    len_0 = len(posting_list_0)
    len_1 = len(posting_list_1)

    merged = []
    i = j = 0

    # Advance the cursor that points at the smaller document ID; advance both
    # on a match. The loop ends when either list is exhausted, so every
    # element (including the last one of each list) takes part in a comparison.
    while i < len_0 and j < len_1:
        posting0 = posting_list_0[i]
        posting1 = posting_list_1[j]

        if posting0 == posting1:
            merged.append(posting0)
            i += 1
            j += 1
        elif posting0 < posting1:
            if is_union:
                merged.append(posting0)
            i += 1
        else:
            if is_union:
                merged.append(posting1)
            j += 1

    # A union also keeps whatever is left in the list that was not exhausted;
    # an intersection cannot match anything in the remainder.
    if is_union:
        merged.extend(posting_list_0[i:])
        merged.extend(posting_list_1[j:])

    return merged

In [5]:
def inverted_index_query(inverted_index, query, n_docs, conjunctive=True):
    """Run a boolean query against the inverted index matrix.

    Each row of ``inverted_index`` belongs to one term: column 0 is read as
    the number of postings of the term and the remaining columns as its
    document IDs, where zeros are treated as padding and dropped.

    Args:
        inverted_index: 2-D integer numpy array laid out as described above.
        query: Sequence of term IDs (row indexes into ``inverted_index``).
        n_docs: Forwarded to ``merge`` as its ``n_docs`` argument.
        conjunctive: True combines the terms with "AND", False with "OR".

    Returns:
        The matching document IDs in ascending order: a numpy array when the
        query has a single term, a list when two or more terms were merged,
        and an empty list for an empty query.
    """
    operator = "AND" if conjunctive else "OR"

    # An empty query has no posting list to start from.
    if len(query) == 0:
        return []

    def postings_of(term):
        # Skips the count column and strips the zero padding.
        row = inverted_index[term][1:]
        return row[row != 0]

    # Rarest terms first, so intermediate results of a conjunction stay small.
    ordered_terms = sorted(query, key=lambda term: inverted_index[term][0])

    result = postings_of(ordered_terms[0])

    for term in ordered_terms[1:]:
        # Uses the caller's n_docs rather than a hard-coded corpus size.
        result = merge(result, postings_of(term), operator, n_docs)

    return result
    
    

In [6]:
# Loads the query corpus, keeping only the first item (the token ID) of every
# two-item entry of each query
query_corpus = [
    [token_id for token_id, _ in query]
    for query in corpora.MmCorpus("./resources/query_corpus.mm")
]

# Takes the query names from the "query" column of the golden
# (relevance judgments) file
query_names = list(
    pd.read_csv(
        "./data/relevance-judgments.tsv", sep='\t', names=['query', 'd']
    )["query"]
)

FileNotFoundError: [Errno 2] No such file or directory: 'query_corpus.mm'

In [121]:
# Conjunctive queries
# Runs every query with the AND operator and writes one line per query:
# the query name, a tab, then the matching document IDs prefixed with 'd'
# and separated by commas.
# The with-block closes the file even if a query raises part-way through.
with open("./results/BSII-AND-queries-results.tsv", "w") as conj_file:

    for query_indx, query in enumerate(query_corpus):

        name = query_names[query_indx]

        result = inverted_index_query(inverted_index, query, 331, True)

        # join() puts commas strictly between IDs, so the separator does not
        # depend on comparing each value against the last one.
        doc_ids = ','.join('d' + str(r) for r in result)

        conj_file.write(name + '\t' + doc_ids + '\n')

In [121]:
# Disjunctive queries
# Runs every query with the OR operator and writes one line per query:
# the query name, a tab, then the matching document IDs prefixed with 'd'
# and separated by commas.
# The with-block closes the file even if a query raises part-way through.
with open("./results/BSII-OR-queries-results.tsv", "w") as disj_file:

    for query_indx, query in enumerate(query_corpus):

        name = query_names[query_indx]

        result = inverted_index_query(inverted_index, query, 331, False)

        # join() puts commas strictly between IDs, so the separator does not
        # depend on comparing each value against the last one.
        doc_ids = ','.join('d' + str(r) for r in result)

        disj_file.write(name + '\t' + doc_ids + '\n')