In [6]:
from typing import List
import string
from spellchecker import SpellChecker
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords as nltk_stopwords
from nltk import pos_tag
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize , sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import inflect
import re
import contractions
from typing import Dict
from collections import defaultdict
from datetime import datetime
import pycountry
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def custom_preprocessor(text):
    lowercase_text = _lowercase_tokens(text)
    expanded_text = expand_contractions(lowercase_text)
    cleaned_text = remove_html_and_links(expanded_text)

    return cleaned_text
#==================================================================#
def custom_tokenizer(text,online=False):
    tokens, ddate = _get_tokenz(text)
    
    cleaned_tokens = _remove_punctuations(tokens)

    # apply Spell Checker for online query 
    if online:
        corrected_token = spell_checker(cleaned_tokens)
    else:
        corrected_token = cleaned_tokens

    stop_words_removed = _stop_words(corrected_token)
    
    dateNormalization = normalize_dates(ddate)
    
    tokenWithoutCountry, countryNormalization = _normalize_country_names(stop_words_removed)
    
    lemma_tokens = _lemma_tokens(tokenWithoutCountry)
    
    return lemma_tokens + dateNormalization + countryNormalization
#==================================================================#
def _lowercase_tokens(text:list) -> str:
    lowercase_words = text.lower()
    
    return lowercase_words 
#==================================================================#
def expand_contractions(text):

    expanded_text = contractions.fix(text)
    return expanded_text
    
#==================================================================#
def remove_html_and_links(text):
    clean_text = re.sub(r'<.*?>', '', text)
    clean_text = re.sub(r'http\S+', '', clean_text)
    
    return clean_text  
#==================================================================#
def _get_tokenz(text: str) -> list:
    date_pattern = r'(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})|'\
                r'(\d{4}[-/]\d{1,2}[-/]\d{1,2})|'\
                r'(\d{1,2}\s+(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\s+\d{2,4})|' \
                r'((jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\s+\d{1,2},\s+\d{2,4})|' \
                r'(\d{1,2}\s+(january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{2,4})|' \
                r'((january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2},\s+\d{2,4})'

    matches = re.findall(date_pattern, text)

    dates=[]
    for match in matches:
        date = next(item for item in match if item) 
        dates.append(date)
        text = text.replace(date,"")

    tokens = word_tokenize(text)

    return tokens , dates 
#==================================================================#
def _remove_punctuations(tokens: list) -> list:
    tokenizer = RegexpTokenizer(r'\w+')
    cleaned_token = tokenizer.tokenize(' '.join(tokens))
    
    return cleaned_token
#==================================================================#
def _stop_words(tokens:list) -> list:
    stopwords = nltk_stopwords.words("english")
    filtered_tokens = [token for token in tokens if token not in stopwords]
    
    return filtered_tokens
#==================================================================#
def get_wordnet_pos(tag_parameter):
    tag = tag_parameter[0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    
    return tag_dict.get(tag, wordnet.NOUN)

def _lemma_tokens(tokens:list) -> list:
    tagged_tokens = pos_tag(tokens)
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag)) for word, tag in tagged_tokens]
    
    return lemmatized_words
#==================================================================#
def normalize_dates(tokens:list) -> list:
    date_pattern = r'(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})|' \
                    r'(\d{4}[-/]\d{1,2}[-/]\d{1,2})|' \
                    r'(\d{1,2}\s+(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\s+\d{2,4})|' \
                    r'((jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\s+\d{1,2},\s+\d{2,4})|' \
                    r'(\d{1,2}\s+(january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{2,4})|' \
                    r'((january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2},\s+\d{2,4})'

    format_strings = ['%d-%m-%Y', '%d/%m/%Y', '%d.%m.%Y', '%Y-%m-%d', '%Y/%m/%d', '%Y.%m.%d', '%d %b %Y', '%b %d, %Y',
                        '%d %B %Y', '%B %d, %Y', '%m/%d/%Y', '%m-%d-%Y', '%m.%d.%Y', '%d-%m-%y', '%d/%m/%y', '%d.%m.%y',
                        '%y-%m-%d', '%y/%m/%d', '%y.%m.%d', '%d %b %y', '%b %d, %y',
                        '%d %B %y', '%B %d, %y', '%m/%d/%y', '%m-%d-%y', '%m.%d.%y']

    for token in tokens: 
        matches = re.findall(date_pattern, token)
        if matches:
            match = next(item for item in matches[0] if item)
            for fmt in format_strings:
                try:
                    date_obj = datetime.strptime(match, fmt)
                    break  
                except ValueError:
                    pass
            else:
             continue  
            normalized_date = date_obj.strftime('%Y-%m-%d')
            position = tokens.index(token)
            tokens[position]=token.replace(match, normalized_date)

    return tokens
#==================================================================#
def _normalize_country_names(tokens:list) -> list:
    country_codes = set(country.alpha_3 for country in pycountry.countries)
    countryName=[]
    for token in tokens.copy():
        if token.upper() in country_codes:
            try:
                country = pycountry.countries.lookup(token.upper())
                tokens.remove(token)
                countryName.append(country.name)
            except LookupError:
                pass

    return tokens,countryName
#=============================================================================================================================================#
def spell_checker(tokens:list) -> list:
    spell = SpellChecker()
    misspelled = spell.unknown(tokens)
   # print("--------",misspelled)
    for i, token in enumerate(tokens):
        if token in misspelled:
            suggestions = spell.candidates(token)
            #print("hello",suggestions)
            if suggestions:
                corrected = spell.correction(token)
                if corrected is not None:
                    tokens[i] = corrected
    return tokens

In [7]:
from typing import List
import string
import pandas as pd
import re
from typing import Dict
from collections import defaultdict

def load_corpus(dataset:str) -> Dict:
        data_dict = {}
        if dataset == "wikir":
                path = r'C:\Users\juman\Downloads\dataset\wikIR1k\wikIR1k\documents.csv'
                data = pd.read_csv(path, index_col=False)
                data_dict = {row["id_right"]: row["text_right"] for index, row in data.iterrows()} #second way

        elif dataset == "lotte":
                path = r'C:\Users\juman\Downloads\dataset\lifestyle\lifestyle\dev\collection.tsv'
                data = pd.read_csv(path, sep='\t', header=None)
                data_dict =  {row[0]: row[1] for index, row in data.iterrows()} 

        return data_dict
#---------------------------------------------------------------------------------#
def load_queries(dataset:str) -> Dict:
        queries_dict = {} 
        if dataset == "wikir":
                path = r'C:\Users\juman\Downloads\dataset\wikIR1k\wikIR1k\training\queries.csv'
                data = pd.read_csv(path, index_col=False)
                queries_dict = {row["id_left"]: row["text_left"] for index, row in data.iterrows()} #second way

        elif dataset == "lotte": 
                path1 = r'C:\Users\juman\Downloads\dataset\lifestyle\lifestyle\dev\questions.search.tsv'
                data = pd.read_csv(path1, sep='\t', header=None)
                queries_dict =  {row[0]: row[1] for index, row in data.iterrows()} 

        return queries_dict
#--------------------------------------------------------------------------------------#
def load_qrels(dataset:str):
        qrels_dict = {}
        if dataset == "lotte":
                path = r'C:\Users\juman\Downloads\dataset\lifestyle\lifestyle\dev\qas.search.jsonl'
                df = pd.read_json(path,lines=True)
                qrels_dict = pd.Series(df['answer_pids'].values, index=df['qid']).to_dict()

        else:
                path= r'C:\Users\juman\Downloads\dataset\wikIR1k\wikIR1k\training\BM25.qrels.csv'
                df = pd.read_csv(path)
                qrels_dict = df.groupby('id_left')['id_right'].apply(list).to_dict()

        return  qrels_dict

In [8]:
corpus=load_corpus("lotte")


In [None]:
import pickle

tfidf_vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, preprocessor=custom_preprocessor)
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus.values())

with open('wikir_tfidf_vectorizer_object.pkl', 'wb') as file:
    pickle.dump(tfidf_vectorizer, file)

with open('wikir_dtm.pkl', 'wb') as file:
    pickle.dump(tfidf_matrix, file)

In [9]:
import pickle

with open('lotte_tfidf_vectorizer_object.pkl', 'rb') as file:
        tfidf_vectorizer = pickle.load(file)

with open('lotte_dtm.pkl', 'rb') as file:
        tfidf_matrix = pickle.load(file)

In [10]:
def query_vectorizer(query:str,tfidfobj):
    x=custom_preprocessor(query)
    processed_query=custom_tokenizer(query)
    vector=tfidfobj.transform([' '.join(processed_query)])
    
    return vector ,processed_query

In [11]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from scipy.sparse import csr_matrix, lil_matrix
import pickle
import shelve
import time

def dtm_matched_doc(query_terms,tfidf_matrix,tifidfObj):
    terms_to_search=query_terms
    
    term_indices = [tifidfObj.vocabulary_[term] for term in terms_to_search if term in tifidfObj.vocabulary_]
     
    matching_docs = set()
    for term_index in term_indices:
        matching_docs.update(tfidf_matrix[:, term_index].nonzero()[0])
            
    matching_docs = list(matching_docs)
    
    submatrix = tfidf_matrix[matching_docs, :]
 
    index_mapping = {i: matching_docs[i] for i in range(submatrix.shape[0])}
    
    return submatrix ,index_mapping
#-------------------------------------------------------------------------------------------------------------------#
def matching_and_ranking(query:str,tfidf_matrix,tifidfObj,data_dict):
    query_vector , processed_query = query_vectorizer(query,tifidfObj)

    docs_tfidf_matrix ,index_mapping = dtm_matched_doc(processed_query,tfidf_matrix,tifidfObj)
     
    top_K_documents = []
    if docs_tfidf_matrix.nnz > 0:
                
        cosine_similarities = cosine_similarity(query_vector, docs_tfidf_matrix)

        top_ten_indices = np.argsort(-cosine_similarities[0])[:10]
            
        top_ten_doc_ids = [index_mapping[index] for index in top_ten_indices]
                    
        top_K_documents = [list(data_dict.keys())[i] for i in top_ten_doc_ids]
    
    return top_K_documents

In [13]:
from collections import defaultdict

def calculate_precision_at_10(retrieved, relevant, k=10):
    relevant_set = set(relevant)
    true_positives = len([doc for doc in retrieved if doc in relevant_set])
    return true_positives / k

def calculate_recall_at_10(retrieved, relevant, k=10):
    retrieved_at_k = retrieved
    relevant_set = set(relevant)
    true_positives = len([doc for doc in retrieved_at_k if doc in relevant_set])
    return true_positives / len(relevant_set)

def calculate_reciprocal_rank(retrieved, relevant):
    relevant_set = set(relevant)
    for index, doc in enumerate(retrieved):
        if doc in relevant_set:
            return 1 / (index + 1)
    return 0

def calculate_average_precision_at_10(retrieved, relevant, k=10):
    relevant_set = set(relevant)  
    precision_at_i = 0.0
    num_of_relevant_found = 0.0

    for i, doc in enumerate(retrieved):
        if doc in relevant and doc not in retrieved[:i]:
            num_of_relevant_found += 1.0
            precision_at_i += num_of_relevant_found / (i + 1.0)

    return precision_at_i / min(10, len(relevant_set))
#-------------------------------------------------------------------------------#
def all_metrices(tfidf_matrix,tifidfObj,corpus,queries,qrel):
    queries = queries
    actual_relevant_docs = qrel # this is from qrels  
    sum_of_rr = 0.0
    sum_of_aps = 0.0
    query_precision={}
    query_recall={}
    
    for query_id in queries.keys():
            predicted_result=matching_and_ranking(queries[query_id],tfidf_matrix,tifidfObj,corpus)
            relevant_docs = actual_relevant_docs[query_id]   
       
            precision = calculate_precision_at_10(predicted_result, relevant_docs)
            query_precision[query_id]=precision

            #------------------------------------------------------------------------------------------#
            recall = calculate_recall_at_10(predicted_result, relevant_docs)
            query_recall[query_id]=recall

            #------------------------------------------------------------------------------------------#
            rr = calculate_reciprocal_rank(predicted_result,relevant_docs)
            sum_of_rr  += rr

            #------------------------------------------------------------------------------------------#
            ap = calculate_average_precision_at_10(predicted_result,relevant_docs)
            sum_of_aps += ap
            
    mrr = sum_of_rr / len(queries)
    map=sum_of_aps / len(queries)
    print("========================== MRR ==================================")
    print(f"Mean Reciprocal Rank at 10: {mrr:.2f}")
    print("========================== MAP ==================================")
    print(f"Mean Average Precision at 10: {map:.2f}")
    print("======================== P@10 ===================================")
    print("precision at 10:", query_precision)
    print("===========================Recacll ==============================")
    print("recall at 10",query_recall)

In [None]:
queris=load_queries("wikir")
qrels=load_qrels("wikir")

In [15]:
all_metrices(tfidf_matrix,tfidf_vectorizer,corpus,queris,qrels)

Mean Reciprocal Rank at 10: 0.74
Mean Average Precision at 10: 0.60
precision at 10: {123839: 1.0, 188629: 0.6, 13898: 0.5, 316959: 1.0, 515031: 1.0, 123783: 0.8, 4332: 0.0, 9591: 1.0, 114982: 0.0, 563603: 1.0, 104206: 0.0, 1580851: 0.8, 23678: 0.0, 37480: 1.0, 109454: 1.0, 544250: 0.8, 12519: 0.0, 115015: 0.8, 84287: 1.0, 24998: 1.0, 11925: 1.0, 643: 0.5, 113075: 1.0, 1250387: 1.0, 72012: 1.0, 167865: 1.0, 1425120: 1.0, 24883: 1.0, 27200: 0.0, 136856: 1.0, 6031: 1.0, 1313191: 1.0, 187115: 1.0, 84110: 0.5, 208234: 1.0, 6431: 1.0, 10799: 1.0, 1417144: 1.0, 32512: 0.4, 677637: 1.0, 97937: 1.0, 113765: 0.0, 12900: 0.8, 1294909: 0.8, 73745: 0.0, 11692: 0.5, 22173: 1.0, 76662: 0.4, 164301: 0.9, 23344: 1.0, 9367: 0.6, 592217: 0.6, 98546: 1.0, 98175: 0.9, 15189: 1.0, 1256: 1.0, 1815937: 1.0, 1249076: 1.0, 80274: 1.0, 38489: 0.0, 11311: 1.0, 4754: 0.0, 257711: 0.3, 141668: 0.7, 100895: 1.0, 32945: 0.2, 112321: 0.3, 133725: 1.0, 424464: 0.0, 523456: 0.5, 1327515: 0.8, 12101: 0.0, 9739: 1.0, 189

In [None]:
queris=load_queries("lotte")
qrels=load_qrels("lotte")

In [14]:
all_metrices(tfidf_matrix,tfidf_vectorizer,corpus,queris,qrels)

Mean Reciprocal Rank at 10: 0.29
Mean Average Precision at 10: 0.17
precision at 10: {2076: 0.2, 2077: 0.4, 2078: 0.1, 2079: 0.1, 2080: 0.2, 2081: 0.1, 2082: 0.1, 2083: 0.1, 2084: 0.4, 2085: 0.1, 2086: 0.2, 2087: 0.1, 2088: 0.1, 2089: 0.0, 2090: 0.4, 2091: 0.1, 2092: 0.0, 2093: 0.0, 2094: 0.1, 2095: 0.1, 2096: 0.0, 2097: 0.0, 2098: 0.1, 2099: 0.1, 2100: 0.0, 2101: 0.0, 2102: 0.0, 2103: 0.0, 2104: 0.0, 2105: 0.0, 2106: 0.0, 2107: 0.1, 2108: 0.3, 2109: 0.1, 2110: 0.0, 2111: 0.1, 2112: 0.0, 2113: 0.0, 2114: 0.1, 2115: 0.1, 2116: 0.0, 2117: 0.0, 2118: 0.1, 2119: 0.1, 2120: 0.0, 2121: 0.2, 2122: 0.1, 2123: 0.1, 2124: 0.2, 2125: 0.2, 2126: 0.1, 2127: 0.2, 2128: 0.3, 2129: 0.0, 2130: 0.3, 2131: 0.1, 2132: 0.0, 2133: 0.3, 2134: 0.0, 2135: 0.2, 2136: 0.2, 2137: 0.2, 2138: 0.1, 2139: 0.1, 2140: 0.3, 2141: 0.1, 2142: 0.0, 2143: 0.0, 2144: 0.1, 2145: 0.2, 2146: 0.2, 2147: 0.1, 2148: 0.1, 2149: 0.1, 2150: 0.1, 2151: 0.0, 2152: 0.1, 2153: 0.1, 2154: 0.1, 2155: 0.0, 2156: 0.0, 2157: 0.3, 2158: 0.0, 2