In [1]:
# Imports and one-time NLTK setup for this notebook.
import os
import re
from nltk.tokenize import sent_tokenize , word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import math
# Request the 'punkt' and 'stopwords' NLTK packages (the captured output below
# shows them reported as already up-to-date).
nltk.download('punkt')
nltk.download('stopwords')
import operator
import pickle
# English stop-word set. Its only visible reference (inside tokenize_text) is
# commented out, so stop words are currently not filtered.
Stopwords = set(stopwords.words('english'))
# Shared Porter stemmer instance used by stem_text.
ps = PorterStemmer()

import sys
    


[nltk_data] Downloading package punkt to /home/anirudh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/anirudh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#Total no. of documents
# NOTE(review): hard-coded; presumably this should match the number of entries
# in the per-document structures loaded below -- confirm against the indexer.
doc_cnt = 101


def _load_pickle(path):
    """Open ``path`` in binary mode and return the object unpickled from it.

    NOTE(security): unpickling can execute arbitrary code, so the index files
    read here must come from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


#unique tokens
vocabulary               = _load_pickle('vocabulary.txt')

#posting list, key=token,value=list of doc_id's
boolean_posting_list     = _load_pickle('boolean_posting_list.txt')

#dict,no of times a token appears in all documents, useful in idf calculation
token_document_frequency = _load_pickle('token_document_frequency.txt')

#document vector of tf-idf values
tfidf_doc_vector_list    = _load_pickle('tfidf_doc_vector_list.txt')

#stores norm of tfidf vectors size is doc_cnt
#(tfidf_query takes math.sqrt of each entry before dividing by it)
normalized_doc_len       = _load_pickle('normalized_doc_len.txt')

#no of tokens in a document
doc_len                  = _load_pickle('doc_len.txt')

#key=index,value=filename
filename_index_inv_dict  = _load_pickle('filename_index_inv_dict.txt')

#per-document term frequencies; bm25_query indexes each entry by token
document_tf_list         = _load_pickle('document_tf_list.txt')
    

In [3]:
def boolean_retrieval_query(query):
    """Rank documents by the number of distinct query tokens they match.

    Each distinct pre-processed query token contributes at most one point to a
    document. A token found in ``boolean_posting_list`` is matched exactly;
    a token that is not found falls back to substring matching against every
    posting-list term (a deliberate extension of plain boolean retrieval).

    Args:
        query: a query string or a list of query strings, as accepted by
            ``preprocess_query``.

    Returns:
        The result of ``rank_documents`` applied to the documents sorted by
        descending match count.
    """
    # Boolean retrieval does not consider term frequency: use distinct tokens.
    tokens = set(preprocess_query(query))

    # Every known document starts with zero matches so it still takes part in
    # the ranking even when nothing matches.
    similar_docs = {doc_id: 0 for doc_id in range(doc_cnt)}

    for token in tokens:
        if token in boolean_posting_list:
            matched_docs = set(boolean_posting_list[token])
        else:
            # Substring fallback. Collect the matching documents in a set so
            # that one query token adds one point per document, no matter how
            # many vocabulary terms happen to contain it.
            matched_docs = set()
            for posting_token, posting in boolean_posting_list.items():
                if token in posting_token:
                    matched_docs.update(posting)

        for doc_id in matched_docs:
            # .get() keeps both branches safe for doc ids outside range(doc_cnt).
            similar_docs[doc_id] = similar_docs.get(doc_id, 0) + 1

    similar_docs = dict(
        sorted(similar_docs.items(), key=operator.itemgetter(1), reverse=True)
    )
    return rank_documents(similar_docs)

In [4]:
#calculates idf
def calc_idf_tfidfalgo(token: str) -> float:
    """Return the smoothed inverse document frequency of ``token``.

    Computes ``ln(doc_cnt / (df + 1))`` where ``df`` is the entry for
    ``token`` in ``token_document_frequency``. The value is negative when
    ``df + 1`` exceeds ``doc_cnt``.

    Returns:
        The idf as a float, or 0.0 when the token has no document frequency.
    """
    if token not in token_document_frequency:
        return 0.0
    return math.log(doc_cnt / (token_document_frequency[token] + 1))

#calculates the tf-idf weight of a term
def calc_tfidf(freq: int, idf: float) -> float:
    """Return the log-scaled tf-idf weight ``(1 + ln(freq)) * idf``.

    Args:
        freq: raw count of the term.
        idf: inverse document frequency of the term.

    Returns:
        The weight as a float; 0.0 when ``freq`` is zero or negative (the
        logarithm is undefined there).
    """
    if freq <= 0:
        return 0.0
    return (1 + math.log(freq)) * idf


def tfidf_query(query) -> list:
    """Rank documents by cosine similarity between tf-idf vectors.

    The query is pre-processed, turned into a tf-idf vector with
    ``calc_idf_tfidfalgo`` / ``calc_tfidf``, and compared against every
    vector in ``tfidf_doc_vector_list``.

    Args:
        query: a query string or a list of query strings, as accepted by
            ``preprocess_query``.

    Returns:
        The result of ``rank_documents`` applied to the documents sorted by
        descending similarity.
    """
    tokens = preprocess_query(query)

    # Raw term frequency of every query token.
    tf_freq_map = {}
    for token in tokens:
        tf_freq_map[token] = tf_freq_map.get(token, 0) + 1

    # tf-idf weight per distinct query token, and the query vector's length.
    query_tfidf = {
        token: calc_tfidf(freq, calc_idf_tfidfalgo(token))
        for token, freq in tf_freq_map.items()
    }
    query_norm = math.sqrt(sum(weight * weight for weight in query_tfidf.values()))

    doc_similarity_dict = {}
    for doc_index, tfidf_doc_vector_dict in enumerate(tfidf_doc_vector_list):
        # Accumulate the full dot product first ...
        dot_product = 0.0
        for query_token, query_weight in query_tfidf.items():
            if query_token in tfidf_doc_vector_dict:
                dot_product += query_weight * tfidf_doc_vector_dict[query_token]

        # ... and normalise exactly once. Dividing inside the token loop would
        # shrink earlier terms again for every additional matching token.
        similarity_val = 0.0
        if dot_product:
            # sqrt is taken here, so normalized_doc_len presumably stores the
            # squared vector length -- TODO confirm against the indexer.
            denominator = math.sqrt(normalized_doc_len[doc_index]) * query_norm
            if denominator:  # guard against zero-length vectors
                similarity_val = dot_product / denominator
        doc_similarity_dict[doc_index] = similarity_val

    #greater cos(theta) value has greater similarity
    doc_similarity_dict = dict(
        sorted(doc_similarity_dict.items(), key=operator.itemgetter(1), reverse=True)
    )
    return rank_documents(doc_similarity_dict)
           

In [5]:
#idf of bm25 algorithm
def calc_idf_bm25algo(token: str) -> float:
    """Return the BM25 inverse document frequency of ``token``.

    Computes ``ln(1 + (doc_cnt - df + 0.5) / (df + 0.5))`` where ``df`` is the
    entry for ``token`` in ``token_document_frequency``.

    Returns:
        The idf as a float, or 0.0 when the token has no document frequency.
    """
    if token not in token_document_frequency:
        return 0.0
    df = token_document_frequency[token]
    return math.log(1 + ((doc_cnt - df + 0.5) / (df + 0.5)))
#tf of bm25 algorithm
def calc_bm25_tf(freq: int, idf: float, doc_id: int,
                 k: float = 1.5, b: float = 0.75,
                 avg_doc_len: float = None) -> float:
    """Return the BM25 term-frequency component for one term in one document.

    Computes ``freq * (k + 1) / (freq + k * (1 - b + b * len_d / avg_len))``
    with ``len_d = doc_len[doc_id]``.

    Args:
        freq: count of the term in the document.
        idf: unused by the formula; kept so existing callers keep working.
        doc_id: index of the document in ``doc_len``.
        k: term-frequency saturation parameter.
        b: length-normalisation parameter.
        avg_doc_len: average document length. When omitted it is computed
            from ``doc_len``; callers scoring many documents should pass it
            in to avoid recomputing it on every call.

    Returns:
        The term-frequency component as a float.
    """
    if avg_doc_len is None:
        total_docs = len(doc_len)
        avg_doc_len = sum(doc_len) / total_docs if total_docs else 0
    # With no length information, fall back to a neutral ratio instead of
    # dividing by zero.
    length_ratio = doc_len[doc_id] / avg_doc_len if avg_doc_len else 1.0
    return (freq * (k + 1)) / (freq + k * (1 - b + b * length_ratio))



def bm25_query(query, k: float = 1.5, b: float = 0.75) -> list:
    """Rank documents for ``query`` with the BM25 scoring function.

    Args:
        query: a query string or a list of query strings, as accepted by
            ``preprocess_query``.
        k: term-frequency saturation parameter passed to ``calc_bm25_tf``.
        b: length-normalisation parameter passed to ``calc_bm25_tf``.

    Returns:
        The result of ``rank_documents`` applied to the documents sorted by
        descending BM25 score.
    """
    tokens = preprocess_query(query)

    # Both values are the same for every document: compute them once instead
    # of once per (document, token) pair.
    total_docs = len(doc_len)
    avg_doc_len = sum(doc_len) / total_docs if total_docs else 0
    idf_by_token = {token: calc_idf_bm25algo(token) for token in set(tokens)}

    similarity_score_dict = {}
    for doc_id in range(doc_cnt):
        term_freqs = document_tf_list[doc_id]
        score = 0
        # Repeated query tokens are intentionally scored once per occurrence.
        for token in tokens:
            freq = term_freqs[token] if token in term_freqs else 0
            bm25_idf = idf_by_token[token]
            bm25_tf = calc_bm25_tf(freq, bm25_idf, doc_id, k, b, avg_doc_len)
            score += bm25_idf * bm25_tf
        similarity_score_dict[doc_id] = score

    similarity_score_dict = dict(
        sorted(similarity_score_dict.items(), key=operator.itemgetter(1), reverse=True)
    )
    return rank_documents(similarity_score_dict)

In [6]:
def rank_documents(doc_similarity_dict, top_k: int = 5) -> list:
    """Return the file names of the first ``top_k`` documents.

    Args:
        doc_similarity_dict: mapping whose keys are document indices, already
            ordered best-first by the caller.
        top_k: maximum number of documents to return (default 5, the value
            that used to be hard-coded).

    Returns:
        A list of at most ``top_k`` file names, looked up in
        ``filename_index_inv_dict``, in the mapping's iteration order.
    """
    top_doc_ids = list(doc_similarity_dict)[:max(top_k, 0)]
    return [filename_index_inv_dict[doc_id] for doc_id in top_doc_ids]

def clean_text(data: str)->str:
    """Normalise raw text before tokenisation.

    Backslashes and the punctuation characters ``. - , : / ; [ ] ( ) = _``
    (as well as spaces) are replaced by a single space each, single and
    double quotes are removed, and every non-ASCII character is dropped.

    Args:
        data: the text to clean.

    Returns:
        The cleaned, ASCII-only text.
    """
    # Raw string: identical pattern, but without the invalid escape sequences
    # that newer Python versions warn about in ordinary string literals.
    sub_space = r'[. \- ,: /; \[\]\(\)=_]' #substitutes the special chars with space
    data = data.replace("\\", " ") #replaces backslash with space
    data = re.sub(sub_space, ' ', data)
    data = data.replace('"', "")
    data = data.replace("'", "")  #replaces quotes with nothing
    cleaned_data = data.encode('ascii', errors='ignore').decode()  #ignores non-ascii chars
    return cleaned_data

def tokenize_text(data: str):
    """Split ``data`` into lower-cased word tokens.

    Tokens are produced by ``word_tokenize``. Single-character tokens are
    dropped, except the words "a" and "i", which are kept in any letter
    case. Stop words are not removed.

    Args:
        data: the (already cleaned) text to tokenise.

    Returns:
        A list of lower-cased tokens in their original order.
    """
    token_list = []
    for word in word_tokenize(data):
        lowered = word.lower()
        # Compare the lower-cased form: checking the raw word would discard
        # the pronoun "I" and a capitalised "A".
        # NOTE(review): the indexing pipeline should apply the same rule so
        # that query and document tokens stay comparable.
        if len(word) > 1 or lowered in ('a', 'i'):
            token_list.append(lowered)  #tokenizing text.Including stop words
    return token_list


def stem_text(token_data: []):
    """Return the stem of every token, in input order, using the shared
    Porter stemmer ``ps``."""
    return [ps.stem(token) for token in token_data]

def insert_unique_tokens(tokens: []):
    """Add every token to the global ``vocabulary`` and return that
    vocabulary object."""
    vocabulary.update(tokens)  # set semantics keep only the unique tokens
    return vocabulary

def preprocess(data: str):
    """Run the full text pipeline on ``data``: clean, tokenise, then stem.

    Returns the list of stemmed tokens.
    """
    return stem_text(tokenize_text(clean_text(data)))


def preprocess_query(query):
    """Convert a query into a flat list of pre-processed tokens.

    Args:
        query: either one query string, or a list/tuple of query strings
            whose tokens are concatenated in order.

    Returns:
        The list of tokens produced by ``preprocess``. Any other input type
        yields an empty list, as before.
    """
    # isinstance also accepts subclasses, which an exact type() comparison
    # silently turned into an empty token list.
    if isinstance(query, str):
        return list(preprocess(query))
    if isinstance(query, (list, tuple)):
        tokens = []
        for query_token in query:
            tokens.extend(preprocess(query_token))
        return tokens
    return []

def write_results_to_file(ranked_list, file_name, query_id):
    """Append one result line per ranked document to ``file_name``.

    Each line has the form ``<query_id>, 1, <doc_name>, 1`` where the first
    ``1`` is the iteration and the last ``1`` is the relevance. The file is
    opened in append mode, so it is created even when ``ranked_list`` is
    empty.
    """
    with open(file_name, 'a') as handle:
        for doc_name in ranked_list:
            # columns: query id, iteration, document, relevance
            file_line = "".join((query_id, ", ", "1, ", doc_name, ", ", "1"))
            handle.write(file_line + '\n')
            
            
            
        
        
def truncate_files(boolean_qrels, tfidf_qrels, bm25_qrels):
    """Empty the three result files (creating them if missing), in the
    order boolean, tf-idf, BM25."""
    for qrels_path in (boolean_qrels, tfidf_qrels, bm25_qrels):
        with open(qrels_path, "w") as fp:
            fp.truncate(0)
        

    
def query(file_name):
    """Run every query in ``file_name`` through the three retrieval models.

    The file holds one query per line as ``<query_id><TAB><query text>``.
    Reading stops at end of file or at the first line that is a single
    character long (a bare newline). The three QRels output files are
    emptied first; each model's ranked documents are then appended to its
    own file.
    """
    #output files
    truncate_files("QRels_Boolean.txt","QRels_Tfidf.txt", "QRels_Bm25.txt")
    with open(file_name) as fp:
        for line in fp:
            # A one-character line marks the end of the query list.
            if len(line) == 1:
                break
            fields = line.split("\t")
            query_id = fields[0]
            query_text = fields[1].rstrip('\n') #Removing newline if present at the end of query

            print('query_id {} query {}'.format(query_id, query_text))

            boolean_ranked = boolean_retrieval_query(query_text)
            write_results_to_file(boolean_ranked, "QRels_Boolean.txt", query_id)

            tfidf_ranked = tfidf_query(query_text)
            write_results_to_file(tfidf_ranked, "QRels_Tfidf.txt", query_id)

            bm25_ranked = bm25_query(query_text)
            write_results_to_file(bm25_ranked,  "QRels_Bm25.txt",query_id)

In [8]:
# Driver cell: run every query in the file below through query(), which
# empties and then fills the three QRels_*.txt result files.
file_name = "21111011-qrels/queries.txt"
print("file_name is ",file_name)
query(file_name)

file_name is  21111011-qrels/queries.txt
query_id Q01 query Deep learning in natural language processing (NLP)
query_id Q02 query Hindu religion and their temples
query_id Q03 query covid-19 pandemic vaccination
query_id Q04 query data structures and algorithms in computer programming
query_id Q05 query Infectious diseases prevention
query_id Q06 query Kurukshetra war in Mahabharata
query_id Q07 query literature, language and art
query_id Q08 query unsupervised learning methods in machine learning
query_id Q09 query quantum physics, quantum mechanics, quantum computing
query_id Q10 query Rama Ravana war
query_id Q11 query Object oriented computer programming
query_id Q12 query Graph theory computer science
query_id Q13 query human immune system
query_id Q14 query unesco world heritage sites in India
query_id Q15 query english fiction literature genre
query_id Q16 query Figures of speech
query_id Q17 query wildlife sanctuaries in India
query_id Q18 query database management system conce