In [2]:
!pip install nltk
!pip install contractions
!pip install python-dateutil
!pip install country_converter --upgrade
!pip install flask



In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from  dateutil.parser import parse
from datetime import datetime
from flask import Flask, request, jsonify,render_template

In [2]:
import string
import re
import contractions
import pandas as pd
import nltk as nk
import numpy as np
import re
import country_converter as coco
import pickle


In [3]:
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()
vectorizer = TfidfVectorizer()
vectorizer2 = TfidfVectorizer()
vec = CountVectorizer()
cc = coco.CountryConverter()

**1- PREPROCESSING**

In [4]:
def handle_numbers(text):
    # Replace numbers with a special token
    text = re.sub("(?<![\d/-])\d+(?![\d/-])", "[NUMBER]", text)
    return text

In [5]:
def handle_countries(tokens):
  text = cc.pandas_convert(series = pd.Series(tokens, name='country'), to='ISO3', not_found=None) 
  text = (' ').join(text)
  text = text.replace("united states", "USA")
  text = text.replace("united kingdom", "UK")
  return word_tokenize(text)

In [6]:
def handle_contractions(text):
    # Expand contractions in the text
    text = contractions.fix(text)
    return text

In [7]:
def remove_punctuation(text):
    # Define a translation table that excludes punctuation from dates
    date_punctuation = string.punctuation.replace('-', '')
    table = str.maketrans('', '', date_punctuation)

    # Split the text into words
    words = text.split()

    # Remove punctuation from each word, except for words that look like dates or contain special characters
    cleaned_words = []
    for word in words:
        if any(char.isdigit() for char in word) or any(char.isalpha() for char in word):
            # If the word contains alphanumeric characters, keep punctuation
            cleaned_word = word.translate(str.maketrans('', '', string.punctuation.replace('-', '')))
            cleaned_words.append(cleaned_word)
        else:
            # If the word contains only special characters or punctuation, remove it
            continue

    # Join the cleaned words back into a string
    cleaned_text = ' '.join(cleaned_words)
    return cleaned_text

In [8]:
def lowercase(text):
    # Convert the text to lowercase
    text = text.lower()
    return text

In [9]:
def handle_dates(text):
    # Define regular expression to match dates in text
    regex = r'\b\d{1,4}[/-]\d{1,2}[/-]\d{1,4}\b'

    # Search for all dates in text
    dates = re.findall(regex, text)

    # Loop through all dates found and convert to the desired format
    for date in dates:
        # Try to parse the date - skip if it is an invalid date
        try:
            datetime_obj = datetime.strptime(date, '%d/%m/%Y')
        except ValueError:
            try:
                datetime_obj = datetime.strptime(date, '%Y/%m/%d')
            except ValueError:
                try:
                    datetime_obj = datetime.strptime(date, '%m/%d/%Y')
                except ValueError:
                    continue

        # Convert date to desired format
        formatted_date = datetime_obj.strftime('%Y-%m-%d')

        # Replace the original date in the text with the formatted date
        text = text.replace(date, formatted_date)

    # Return the modified text with handled dates
    return text

In [10]:

# nlp = spacy.load('en_core_web_sm')

# def lemmatize_with_pos(text):
#     doc = nlp(text)
#     lemmatized_text = " ".join([token.lemma_ for token in doc])
#     return lemmatized_text
def lemmatize_with_pos(tokens):
    pos_tags = nk.pos_tag(tokens)
    pos_lemmas = []
    for word, pos in pos_tags:
        if pos.startswith('J'):
            # If the word is an adjective, use 'a' (adjective) as the POS tag
            pos_lemmas.append(lemmatizer.lemmatize(word, pos='a'))
        elif pos.startswith('V'):
            # If the word is a verb, use 'v' (verb) as the POS tag
            pos_lemmas.append(lemmatizer.lemmatize(word, pos='v'))
        elif pos.startswith('N'):
            # If the word is a noun, use 'n' (noun) as the POS tag
            pos_lemmas.append(lemmatizer.lemmatize(word, pos='n'))
        else:
            # For all other cases, use the default POS tag 'n' (noun)
            pos_lemmas.append(lemmatizer.lemmatize(word))
    return pos_lemmas

In [11]:
def preprocess_text(text_str):
    # text_str = str(text)
    #Normalization
    text_str = remove_punctuation(text_str)
    text_str = handle_dates(text_str)
    text_str = handle_numbers(text_str)
    text_str = handle_contractions(text_str)
    text_str = lowercase(text_str)

    #Tokenization
    tokens = word_tokenize(text_str)
    stop_words = set(stopwords.words('english'))
    #Removing stop worlds
    tokens = [token for token in tokens if token not in stop_words]

    #tokens = handle_countries(tokens)
    
    #Stemming
    tokens = [ps.stem(token) for token in tokens]

    #Lemmatization  
    lemmatized_tokens = lemmatize_with_pos(tokens)
    processed_text = ' '.join(lemmatized_tokens)
    return processed_text

**2- REPRESENTATION**

In [12]:
# Data Representation
def represent_text(text):
    x = vectorizer.fit_transform(text)
    return x

In [13]:
# Data Representation
def represent_text2(text):
    x = vectorizer2.fit_transform(text)
    return x

**3- INDEXING**

In [14]:
#Indexing
def build_index(data):
    inverse_index = {}
    for i, doc in enumerate(data):
        terms = doc.split(' ')
        for term in terms:
            if term not in inverse_index:
                inverse_index[term] = [i]
            else:
                inverse_index[term].append(i)
    return inverse_index

In [15]:
def build_index2(data):
    inverse_index = {}
    for i, doc in enumerate(data):
        terms = doc.split(' ')
        for term in terms:
            if term not in inverse_index:
                inverse_index[term] = [i]
            else:
                inverse_index[term].append(i)
    return inverse_index

**4- MATCHING & RANKING** 

In [17]:
def match_query(query_vector, document_vectors, candidate_docs, data, top_k):
    query_vector = vectorizer.transform([query_vector])
    similarities = cosine_similarity(query_vector, document_vectors).flatten()
    print(query_vector)
    # Rank the documents
    sorted_indices = similarities.argsort()[::-1]
    # print(sorted_indices)
    # print(data)
    ranked_documents = [data.iloc[i] for i in sorted_indices if i in candidate_docs]

    # Return the top-ranked documents
    search_results = ranked_documents[:top_k]
    return search_results


In [18]:
def match_query2(query_vector2, document_vectors2, candidate_docs2, data2, top_k):
    query_vector2 = vectorizer2.transform([query_vector2])
    similarities2 = cosine_similarity(query_vector2, document_vectors2).flatten()
    print(query_vector2)
    # Rank the documents
    sorted_indices2 = similarities2.argsort()[::-1]
    # print(sorted_indices)
    # print(data)
    ranked_documents2 = [data2.iloc[i] for i in sorted_indices2 if i in candidate_docs2]

    # Return the top-ranked documents
    search_results2 = ranked_documents2[:top_k]
    return search_results2

In [19]:
def get_candidate_docs(query_text, inverse_index):
    # Split the query text into terms
    query_terms = query_text.split(' ')

    # Retrieve the candidate document IDs for the query
    candidate_doc_ids = set(inverse_index.get(query_terms[0], []))
    for term in query_terms[1:]:
        candidate_doc_ids.intersection_update(set(inverse_index.get(term, [])))

    # Return the candidate document IDs
    return candidate_doc_ids

In [20]:
def get_candidate_docs2(query_text2, inverse_index2):
    # Split the query text into terms
    query_terms2 = query_text2.split(' ')

    # Retrieve the candidate document IDs for the query
    candidate_doc_ids2 = set(inverse_index2.get(query_terms2[0], []))
    for term in query_terms2[1:]:
        candidate_doc_ids2.intersection_update(set(inverse_index2.get(term, [])))

    # Return the candidate document IDs
    return candidate_doc_ids2

In [21]:
df = pd.read_csv("wikiest/documents.csv").head(400000)

In [22]:
preprocessed_text = df['text'].apply(preprocess_text)

In [23]:
document_vectors = represent_text(preprocessed_text)

In [24]:
inverse_index = build_index(preprocessed_text)

In [25]:
queries = pd.read_csv('wikiest/queries.csv')

In [26]:
qrels = pd.read_csv('wikiest/qrels.csv')

In [27]:
df2=pd.read_csv("lotte/docs.csv").head(200000)

In [28]:
preprocessed_text2=df2['text'].apply(preprocess_text)

In [29]:
document_vector2 = represent_text2(preprocessed_text2)

In [30]:
inverse_inde2 = build_index(preprocessed_text2)

In [31]:
queries2 = pd.read_csv('lotte/queries.csv')

In [32]:
qrels2 = pd.read_csv('lotte/qrels.csv')

In [33]:
def process_queries(queries, document_vectors, inverse_index, data):
    match_result = {}
    
    for (index, query_id, text) in queries.values:
        top_k = len(qrels.loc[qrels['query_id'] == query_id, 'doc_id'])
        # Preprocess the query text
        processed_query = preprocess_text(text)
        
        # Get the candidate documents for the query
        candidate_docs = get_candidate_docs(processed_query, inverse_index)
        
        # Rank the documents and retrieve the top results
        query_results = match_query(processed_query, document_vectors, candidate_docs, data, top_k)
        
        # Store the results for the query
        match_result[query_id] = [doc['doc_id'] for doc in query_results]
    
    return match_result

In [35]:
result_matches = process_queries(queries, document_vectors, inverse_index, df)

  (0, 3673)	0.612422750719358
  (0, 3259)	0.7905304386305028

  (0, 3672)	0.39202763283668246
  (0, 3321)	0.33038981331658174
  (0, 1886)	0.5804907124419155
  (0, 609)	0.6326036983118094


  (0, 1966)	1.0
  (0, 3673)	0.4352246671507473
  (0, 1870)	0.4925945907260671
  (0, 1659)	0.7536113443220848
  (0, 825)	1.0



  (0, 1839)	0.7071067811865476
  (0, 467)	0.7071067811865476





  (0, 2864)	1.0

  (0, 3672)	0.32401473046474294
  (0, 3321)	0.273070460710779
  (0, 3301)	0.4797813367597753
  (0, 1378)	0.3897742605953864
  (0, 609)	0.5228532369423347
  (0, 103)	0.40614943705772727

  (0, 1023)	0.6947536966339446
  (0, 569)	0.49907341431479857
  (0, 484)	0.5179218359343796
  (0, 896)	0.7761520616027761
  (0, 314)	0.6305457772991272
  (0, 606)	0.8219024820329071
  (0, 341)	0.5696282208845932
  (0, 3816)	1.0
  (0, 3509)	1.0
  (0, 3672)	0.6236325739231582
  (0, 1919)	0.7817176042164948
  (0, 1966)	1.0
  (0, 1966)	1.0
  (0, 2882)	1.0
  (0, 1443)	0.6134565880470041
  (0, 250)	0.7897284435688817


In [36]:
print(result_matches)

{158491: [], 5728: [], 13554: [], 32674: [], 406391: [], 5115: [], 15469: [], 62953: [], 152444: [], 104086: [], 145194: [], 73752: [], 1368508: [], 11534: [], 83078: [], 25174: [], 265104: [], 139082: [], 37743: [], 207224: [], 1313611: [], 641464: [], 75295: [], 2591: [890294], 107590: [], 362197: [], 13540: [1728654, 1671729, 502902], 25406: [], 128282: [], 267973: [], 116217: [503290], 16130: [], 471746: [], 113000: [], 87860: [], 25983: [], 11919: [], 22343: [], 81083: [], 172877: [], 7169: [], 13690: [], 8140: [], 206019: [], 104453: [], 73673: [], 996687: [], 121384: [], 87686: [], 410859: [], 108909: [], 92044: [], 371437: [], 151303: [], 101626: [], 79032: [], 17656: [], 15514: [], 1254734: [], 1238278: [], 21645: [], 1280405: [759572], 32559: [], 34939: [], 174151: [], 4900: [], 10166: [], 25726: [2426736], 720: [], 200237: [], 14410: [], 23920: [], 1897191: [], 196387: [], 25130: [], 28232: [], 16952: [], 28485: [], 113827: [], 32219: [], 6984: [], 124119: [], 34875: [], 791

In [41]:
def process_queries2(queries, document_vectors, inverse_index, data):
    match_result = {}
    
    for (index, query_id, text) in queries.values:
        top_k = len(qrels2.loc[qrels2['query_id'] == query_id, 'doc_id'])
        # Preprocess the query text
        processed_query = preprocess_text(text)
        
        # Get the candidate documents for the query
        candidate_docs = get_candidate_docs(processed_query, inverse_index)
        
        # Rank the documents and retrieve the top results
        query_results = match_query2(processed_query, document_vectors, candidate_docs, data, top_k)
        
        # Store the results for the query
        match_result[query_id] = [doc['doc_id'] for doc in query_results]
    
    return match_result

In [42]:
result_matches2 = process_queries2(queries2, document_vector2, inverse_inde2, df2)

  (0, 1252)	0.27630349580102564
  (0, 933)	0.5744086171278504
  (0, 835)	0.770526520489336
  (0, 1323)	0.41721643621639976
  (0, 686)	0.6475223714761291
  (0, 674)	0.38648952528968405
  (0, 128)	0.5072189572858761
  (0, 1270)	0.61458424635994
  (0, 459)	0.3955798437317365
  (0, 408)	0.61458424635994
  (0, 386)	0.2967979034385922
  (0, 1055)	0.4346244533079612
  (0, 907)	0.37077936513243503
  (0, 763)	0.5830164411555552
  (0, 476)	0.47448018444257156
  (0, 278)	0.32952182157240945
  (0, 1287)	0.3486484094644865
  (0, 867)	0.42585229146457426
  (0, 593)	0.5482175493448145
  (0, 278)	0.30985343246926755
  (0, 16)	0.5482175493448145
  (0, 1252)	0.14541211518384942
  (0, 1178)	0.4055102192058119
  (0, 553)	0.4055102192058119
  (0, 382)	0.4055102192058119
  (0, 299)	0.6968067903778795
  (0, 994)	0.44483053338735823
  (0, 815)	0.7221954721714559
  (0, 23)	0.5296786729153499
  (0, 1318)	0.44130668729282335
  (0, 1073)	0.44130668729282335
  (0, 1030)	0.5432136848312069
  (0, 856)	0.425246475252

In [43]:
print(result_matches2)

{0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: [], 10: [], 11: [], 12: [], 13: [49, 11], 14: [], 15: [], 16: [], 17: [], 18: [], 19: [], 20: [], 21: [], 22: [], 23: [], 24: [], 25: [83, 49], 26: [], 27: [92, 77], 28: [], 29: [], 30: [], 31: [], 32: [], 33: [], 34: [], 35: [], 36: [], 37: [], 38: [], 39: [49], 40: [90, 91], 41: [], 42: [], 43: [], 44: [], 45: [], 46: [], 47: [], 48: [], 49: [], 50: [83, 49], 51: [], 52: [], 53: [83], 54: [], 55: [], 56: [], 57: [], 58: [], 59: [], 60: [], 61: [], 62: [], 63: [], 64: [], 65: [], 66: [], 67: [], 68: [], 69: [], 70: [], 71: [], 72: [], 73: [], 74: [], 75: [], 76: [], 77: [], 78: [], 79: [], 80: [], 81: [], 82: [], 83: [], 84: [], 85: [], 86: [], 87: [], 88: [], 89: [], 90: [], 91: [93, 12, 21, 25, 43, 23, 82, 35, 32, 8, 30, 24, 91, 50], 92: [], 93: [], 94: [], 95: [], 96: [], 97: [], 98: [], 99: [71], 100: [], 101: [], 102: [], 103: [], 104: [], 105: [], 106: [], 107: [], 108: [], 109: [], 110: [45, 62, 54], 111: [], 11

In [55]:
def process_queries_api(query, document_vectors, inverse_index, data):
    id = queries.loc[queries['text'] == query, 'query_id']
    top_k = len(qrels.loc[qrels['query_id'] == id.get(2), 'doc_id'])
    processed_query = preprocess_text(query)
    candidate_docs = get_candidate_docs2(processed_query, inverse_index)
    query_results2 = match_query(processed_query, document_vectors, candidate_docs, data,6)
    arr2 = {}
    for i in query_results2:
      arr2[i['doc_id']] = i['text']  
    return arr2

In [57]:
result = process_queries_api(" world", document_vectors, inverse_index, df)
print(result)

  (0, 3873)	1.0
{1602376: 'in 2015 the firm refocused its business to provide capital and specialist advice exclusively for forbes 500 families the company works only with ultra high net worth investors in private off market initiatives providing specialist advice in the form of business development and strategic advisory services with a focus on emerging markets of asia latin america cis and the middle east in an investigative report euromoney described bmb as one of the world s most exclusive financial institutions the group calls to mind an era when courts of asian empires were the global centers of money technology culture and power its growing number of investors are drawn from eastern ruling families with vast riches gleaned from the long term growth demand for commodities the firm was founded in 2006 by prominent financier rayo withanage originally an attorney and strategist who developed a successful career as a direct investor mr withanage who was named in the financial times 

In [47]:
def process_queries_api2(query, document_vectors, inverse_index, data):
    id = queries.loc[queries['text'] == query, 'query_id']
    top_k = len(qrels2.loc[qrels2['query_id'] == id.get(2), 'doc_id'])
    processed_query = preprocess_text(query)
    candidate_docs = get_candidate_docs(processed_query, inverse_index)
    query_results = match_query2(processed_query, document_vectors, candidate_docs, data, 6)
    arr = {}
    for i in query_results:
      arr[i['doc_id']] = i['text']  
    return arr

In [54]:
result2 = process_queries_api2("the fifth element earth", document_vector2, inverse_inde2, df2)
print(result2)

  (0, 441)	0.5773502691896257
  (0, 373)	0.5773502691896257
  (0, 358)	0.5773502691896257
{1: '"quint" means fifth. The "quintessence" is the fifth essence. The fifth element was the one supposed to come after air, fire, earth, and water in the Medieval Age.', 4: '"Essence" in this context is a synonym for "element", and "essential" for "elemental". In pre-atomic theory, there were four "known" elements or essences — Earth, Air, Fire and Water — and a putative fifth element (quinta essentia). The fifth element was believed to be superior to the others, and so, "quintessential" has come to mean something that is superior.', 0: "It's the fifth element after earth, air, fire, and water, so it is presumably superior to those or completing those."}


In [79]:
print(result)

{219642: 'a barcode is a machine readable optical label that contains information about the item to which it is attached in practice qr codes often contain data for a locator identifier or tracker that points to a website or application a qr code uses four standardized encoding modes numeric alphanumeric byte binary and kanji to store data efficiently extensions may also be used the quick response system became popular outside the automotive industry due to its fast readability and greater storage capacity compared to standard upc barcodes applications include product tracking item identification time tracking document management and general marketing a qr code consists of black squares arranged in a square grid on a white background which can be read by an imaging device such as a camera and processed using reed solomon error correction until the image can be appropriately interpreted the required data is then extracted from patterns that are present in both horizontal and vertical co

In [50]:
print(df)

    Unnamed: 0   doc_id                                               text
0          1.0  1781133  it was used in landing craft during world war ...
1          2.0  2426736  after rejecting an offer from cambridge univer...
2          3.0  2224122  mat zan coached kuala lumpur fa in 1999 and wo...
3          4.0   219642  a barcode is a machine readable optical label ...
4          5.0  1728654  since the subordination of the monarchy under ...
..         ...      ...                                                ...
95        96.0  1008941  escaping from charisma is a 2006 south korean ...
96        97.0  1313953  after short scientific studies he was introduc...
97        98.0   414617  they formed in 1987 with michael sheridan also...
98        99.0   578027  baykam s father dr suphi baykam is a deputy in...
99       100.0  1834363  the name refers to their nature i e extraordin...

[100 rows x 3 columns]


In [62]:
print(preprocessed_text)

0     use land craft world war ii use today privat b...
1     reject offer cambridg univers move london [ nu...
2     mat zan coach kuala lumpur fa [ number ] malay...
3     barcod machin readabl optic label contain info...
4     sinc subordin monarchi parliament increasingli...
                            ...                        
95    escap charisma [ number ] south korean film st...
96    short scientif studi introduc artist moebiu se...
97    form [ number ] michael sheridan also max q gu...
98    baykam father dr suphi baykam deputi turkish p...
99    name refer natur e extraordinari choose men al...
Name: text, Length: 100, dtype: object


In [61]:
print(document_vectors)

  (0, 2768)	0.05684005900087589
  (0, 1261)	0.05684005900087589
  (0, 1837)	0.06615671690747109
  (0, 3788)	0.09504680218856137
  (0, 2036)	0.05684005900087589
  (0, 803)	0.13231343381494218
  (0, 1510)	0.06615671690747109
  (0, 796)	0.04594026798606497
  (0, 1766)	0.06615671690747109
  (0, 156)	0.06070682140006563
  (0, 2455)	0.030213227765721783
  (0, 1767)	0.06615671690747109
  (0, 3672)	0.04099764385616817
  (0, 1426)	0.12141364280013126
  (0, 3702)	0.06615671690747109
  (0, 3416)	0.06070682140006563
  (0, 3630)	0.06615671690747109
  (0, 2244)	0.06070682140006563
  (0, 1682)	0.05384076507028841
  (0, 397)	0.06070682140006563
  (0, 3003)	0.05384076507028841
  (0, 3744)	0.10768153014057683
  (0, 2753)	0.08199528771233634
  (0, 103)	0.05139016349347043
  (0, 242)	0.06070682140006563
  :	:
  (99, 3124)	0.05224064906357508
  (99, 965)	0.06903233027478041
  (99, 3422)	0.05224064906357508
  (99, 3180)	0.05404089762007699
  (99, 1931)	0.04662017918078253
  (99, 1332)	0.03789971657636809
  

In [69]:
print(inverse_index)

{'use': [0, 0, 3, 3, 3, 3, 5, 8, 10, 11, 17, 18, 31, 33, 43, 46, 50, 56, 56, 56, 56, 56, 56, 61, 64, 65, 65, 69, 69, 69, 69, 70, 73, 73, 74, 74, 75, 77, 77, 78, 78, 78, 78, 80, 81, 88, 90, 91], 'land': [0, 11, 31, 34, 34, 37, 37, 37, 51, 51, 56, 63, 71, 84], 'craft': [0, 51, 98], 'world': [0, 14, 19, 19, 19, 48, 49, 50, 52, 55, 55, 68, 69, 77, 84], 'war': [0, 49, 50, 61, 61, 61, 61, 84], 'ii': [0, 11, 93], 'today': [0, 27, 37, 43, 60, 61], 'privat': [0, 55, 55, 56, 57, 57], 'boat': [0, 0, 25, 25, 51], 'train': [0, 9, 13, 21, 32, 32, 32, 32, 70], 'facil': [0, 5, 14, 50, 62], '[': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 9, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 

**5- EVALUATION**

In [30]:
def precision(tp, fp):
    """
    Calculate precision given the number of true positives (tp) and false positives (fp).
    """
    return tp / (tp + fp + 1e-10)


In [31]:
def recall(tp, fn):
    """
    Calculate recall given the number of true positives (tp) and false negatives (fn).
    """
    return tp / (tp + fn + 1e-10)

In [32]:
def f1_score(precision, recall):
    """
    Calculate F1-score given precision and recall.
    """
    return 2 * precision * recall / (precision + recall + 1e-10)

In [33]:
def calculate_precision_at_k(relevant_docs, retrieved_docs, k):
    retrieved_docs_at_k = retrieved_docs[:k]
    relevant_and_retrieved = set(relevant_docs).intersection(set(retrieved_docs_at_k))
    precision = len(relevant_and_retrieved) / k
    return precision

In [34]:
def calculate_mrr(relevant_docs, retrieved_docs):
    for i, doc in enumerate(retrieved_docs):
        if doc in relevant_docs:
            return 1 / (i + 1)
    return 0

In [35]:
def evaluation(queries, result_matches, qrels, evaluation_results):
    f1_scores = []
    precesion_at_k = []
    average_precision = []
    mrr_scores = []
    for (index, query_id, text) in queries.values:
        if query_id in evaluation_results:
            # If this query has already been evaluated, skip it
            continue

        # Retrieve the ground truth relevance dataset for this query
        relevant_docs = list(qrels.loc[qrels['query_id'] == query_id, 'doc_id'])

        # Retrieve the relevant documents retrieved by the system for this query
        result_match = result_matches[query_id]
        # Compute tp (true positives)
        tp = len(np.intersect1d(relevant_docs, result_match))
        # Compute fp (false positives)
        fp = len(result_match) - tp

        # Compute fn (false negatives)
        fn = len(list(set(relevant_docs) - set(result_match)))

        # Compute precision, recall, and f1_score
        prec = precision(tp, fp)
        rec = recall(tp, fn)
        f1 = f1_score(prec, rec)
        pre_k = calculate_precision_at_k(relevant_docs, result_match, 10)
        mrr = calculate_mrr(relevant_docs, result_match)
        # Store the f1 score in the list
        f1_scores.append(f1)
        average_precision.append(prec)
        precesion_at_k.append(pre_k)
        mrr_scores.append(mrr)
        # Store the evaluation results for this query in the dictionary
        evaluation_results[query_id] = (prec, rec, f1)

    # Compute the average and return it
    avg_f1 = sum(f1_scores) / len(f1_scores)
    avg_pre = sum(average_precision) / len(average_precision)
    avg_pre_at_k = sum(precesion_at_k) / len(precesion_at_k)
    avg_mrr = sum(mrr_scores) / len(mrr_scores)
    return avg_f1, avg_pre, avg_pre_at_k, avg_mrr

In [44]:
evaluation_results = {};
avg_f1, avg_pre, avg_pre_at_k, avg_mrr = evaluation(queries, result_matches, qrels, evaluation_results)
print(avg_f1)
print(avg_pre)
print(avg_pre_at_k)
print(avg_mrr)


0.014792972466284113
0.2810254785057099
0.08099999999999997
0.43325


In [56]:
evaluation_results = {};
avg_f2, avg_pre2, avg_pre_at_k2, avg_mrr2 = evaluation(queries2, result_matches2, qrels2, evaluation_results)
print(avg_f2)
print(avg_pre2)
print(avg_pre_at_k2)
print(avg_mrr2)

0.030493393408092713
0.047276793342549535
0.010799999999999995
0.06405555555555555


**6- FAST API**

In [None]:

app = Flask(__name__, template_folder='templates', static_folder='static')
@app.route("/")
def index():
    return render_template("inde.html")
@app.route("/search",methods=["POST"])
   
def search():
  dataset_name=request.form.get('dataset')
  query = request.form.get('query')

  if dataset_name == 'df':  
     id = queries.loc[queries['text'] == query, 'query_id']
     top_k = len(qrels.loc[qrels['query_id'] == id.get(0), 'doc_id'])
     processed_query = preprocess_text(query)
     candidate_docs = get_candidate_docs(processed_query, inverse_index)
     results = match_query(processed_query, document_vectors, candidate_docs ,df, 10)
  elif dataset_name == 'df1':
     id = queries2.loc[queries2['text'] == query, 'query_id']
     top_k = len(qrels2.loc[qrels2['query_id'] == id.get(2), 'doc_id'])
     processed_query = preprocess_text(query)
     candidate_docs = get_candidate_docs(processed_query, inverse_inde2)
     results = match_query2(processed_query, document_vector2, candidate_docs ,df2, top_k)   
  else:
       return "Invalid dataset name", 400    
  return render_template("results.html",results=results)
@app.route("/results")
def results():
      return render_template("results.html")

if __name__=='__main__':
    app.run()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [31/May/2024 13:21:32] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [31/May/2024 13:21:32] "GET /static/style.css HTTP/1.1" 304 -
[2024-05-31 13:21:37,458] ERROR in app: Exception on /search [POST]
Traceback (most recent call last):
  File "C:\Users\hasan\anaconda3\Lib\site-packages\flask\app.py", line 2529, in wsgi_app
    response = self.full_dispatch_request()
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\hasan\anaconda3\Lib\site-packages\flask\app.py", line 1825, in full_dispatch_request
    rv = self.handle_user_exception(e)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\hasan\anaconda3\Lib\site-packages\flask\app.py", line 1823, in full_dispatch_request
    rv = self.dispatch_request()
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\hasan\anaconda3\Lib\site-packages\flask\app.py", line 1799, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
     

**6- OFFLINE PROCESSING**

In [None]:
# with open('vectorized.pickle', 'wb') as file:
#         pickle.dump(document_vectors, file)
#     with open('inverse_index.pickle', 'wb') as file:
#         pickle.dump(inverse_index, file)

In [None]:
#     with open('document_vectors.pickle', 'rb') as f:
#         document_vectors1 = pickle.load(f)
#     with open('inverse_index.pickle', 'rb') as f:
#         inverse_index1 = pickle.load(f)