## loading indexes from .json files

In [1]:
import json

# loading Inverted Index
with open("inverted_index_updt.json", "r") as f:
    inverted_index = {}
    for line in f:
        data = json.loads(line.strip())
        inverted_index.update(data)

# loading Positional Index
with open("positional_index_updt.json", "r") as f:
    positional_index = {}
    for line in f:
        data = json.loads(line.strip())
        positional_index.update(data)


In [2]:
inverted_index

{'aal': [327],
 'aapl': [114],
 'abbrevi': [128],
 'abc': [226],
 'abdomin': [222],
 'abduct': [284],
 'abil': [5,
  8,
  22,
  23,
  41,
  49,
  52,
  54,
  55,
  69,
  84,
  104,
  106,
  114,
  141,
  143,
  149,
  155,
  157,
  171,
  191,
  206,
  239,
  251,
  257,
  365,
  372,
  374,
  384,
  431],
 'abl': [18,
  50,
  58,
  66,
  68,
  93,
  95,
  135,
  136,
  154,
  155,
  158,
  162,
  168,
  197,
  205,
  225,
  239,
  256,
  259,
  260,
  267,
  288,
  291,
  292,
  297,
  310,
  313,
  326,
  328,
  335,
  358,
  386,
  408,
  439,
  447],
 'abnorm': [10, 131, 197, 203, 276, 321, 329, 405],
 'abound': [112],
 'about': [4,
  5,
  19,
  48,
  54,
  55,
  81,
  83,
  84,
  91,
  93,
  127,
  129,
  132,
  133,
  150,
  151,
  158,
  178,
  182,
  186,
  188,
  206,
  213,
  217,
  253,
  281,
  283,
  291,
  294,
  308,
  340,
  344,
  356,
  365,
  373,
  392,
  445],
 'abov': [30, 49, 271, 276],
 'abp': [138],
 'absenc': [51, 98, 140, 435],
 'absent': [258],
 'absolut': [

In [3]:
positional_index

{'aal': {'327': [20, 51, 100]},
 'aapl': {'114': [32]},
 'abbrevi': {'128': [88]},
 'abc': {'226': [93]},
 'abdomin': {'222': [4, 21, 61]},
 'abduct': {'284': [42]},
 'abil': {'5': [31, 36],
  '8': [59],
  '22': [41],
  '23': [68],
  '41': [30],
  '49': [34],
  '52': [114],
  '54': [104],
  '55': [96],
  '69': [110],
  '84': [82],
  '104': [140],
  '106': [150],
  '114': [106],
  '141': [65],
  '143': [140],
  '149': [39],
  '155': [21],
  '157': [40],
  '171': [104],
  '191': [81],
  '206': [66, 112],
  '239': [20],
  '251': [86],
  '257': [116],
  '365': [96],
  '372': [109],
  '374': [18],
  '384': [113],
  '431': [62]},
 'abl': {'18': [27],
  '50': [104],
  '58': [70],
  '66': [22],
  '68': [82],
  '93': [182, 191],
  '95': [88],
  '135': [34],
  '136': [28],
  '154': [137],
  '155': [33],
  '158': [150],
  '162': [30],
  '168': [121],
  '197': [169],
  '205': [73],
  '225': [100],
  '239': [144],
  '256': [133],
  '259': [169],
  '260': [91],
  '267': [139],
  '288': [27, 34, 98],

In [4]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

ps = PorterStemmer()



def validateQuery(query):
    lower_query = query.lower()
    
    if not lower_query.strip():
        return {"Error" : "Query can not be Empty."}
    
    words = lower_query.strip().split()
    
    if len(words)>5:
        return {"Error": "Query Can Contain 3 Terms and 2 Operators At Most."}
    
    
    valid_operators = {"and","or","not"}

    terms = []
    operators = []
    expecting_term = True # tracking what should come next
    
    for word in words:
        if word in valid_operators:
            
            if word == "not" and expecting_term:
                # NOT is valid at the start or after another operator
                operators.append(word)
                expecting_term = True  # expecting a term after NOT
                
            elif expecting_term:
                return {"Error": "Invalid Operator Placement!"}
            
            else:
                operators.append(word)
                expecting_term = True # now next must be an term
                
        else:
            
            if not expecting_term:
                return {"Error": "Operators Must Separate Terms Correctly."}
            stemmed_word = ps.stem(word)
            terms.append(stemmed_word)
            expecting_term = False  # next must be an operator or end of query
    
    
    if expecting_term and operators:  # query should not end with AND/OR
        return {"Error": "Query can not end with an operator."}
    
    
    if len(terms) == 1 and not operators:
        return {"terms":terms, "operators":None, "valid":True}
    
    return {"terms":terms, "operators":operators, "valid":True}
 

 

In [15]:
def merge_AND(l1,l2):
    # intersection of 2 lists 
    intersected = []
    
    for doc in l1:
        if doc in l2:
            intersected.append(doc)
    return intersected


def merge_OR(l1,l2):
    # unions of 2 lists
    union_set = set()
    
    for doc in l1:
        union_set.add(doc)
    
    for doc in l2:
        union_set.add(doc)
    
    return sorted(union_set) 



def merge_NOT(all_docs,l1):
    # return NOT operation
    complement_docs = []
    
    for doc in all_docs:
        if doc not in l1:
            complement_docs.append(doc)
    return complement_docs




In [16]:

def boolean_search(terms,operator,inverted_index):
    
    # fetch all docs first of all
    all_doc = set()
    
    for docs in inverted_index.values():
        for doc in docs:
            all_doc.add(doc) # adding each  docID to the set
            
    all_doc = sorted(all_doc)
    
    
    # 1) Handle single term quries:
    if len(terms) == 1:
        return inverted_index.get(terms[0],[])  # if term present in invrtd index return IDs list else []
    
    
    
    # geting the posting list for all terms
    l1 = inverted_index.get(terms[0],[])
    l2 = inverted_index.get(terms[1],[])
    
    
    if len(terms) == 3:
        l3 = inverted_index.get(terms[2],[])
    else:
        l3 = []  # when theres no 3rd term
        
        
    

    # processing the first operation
    if operator[0] == "and":
        result = merge_AND(l1,l2)
    elif operator[0] == "or":
        result = merge_OR(l1,l2)
    elif operator[0] == "not":
        result = merge_NOT(all_doc,l2)
    else:
        return [] # invalid operator
    
    
    # if there is second operation
    if len(operator) == 2:
        if ops[1] == 'and':
            result = merge_AND(result, l3)
        elif ops[1] == 'or':
            result = merge_OR(result, l3)
        elif ops[1] == 'not':
            result = merge_NOT(all_docs, result)
        else:
            return []  # Invalid operator
    return result

    
    
    

In [None]:
 # old validity checks
    
"""
def check_query_validity(tokens):
    
    # rule#01 : query can not start or ends with a operator
    if tokens[0] in {"and","or"} or tokens[-1] in {"and","or"}:
        print("Invalid Format! Query started or ended by an Operator.")
        return False
    
    # rule#02 : consecutive operators not allowed
    for i in range(len(tokens)-1):
        if tokens[i] in operators and tokens[i+1] in operators:
            print(f"Consecutive operators {tokens[i]} {tokens[i+1]} not allowed!")
            return False
    # rule#03 : NOT shpould not be before any operator
    for i in range(len(tokens)-1):
        if tokens[i] == "not" and tokens[i+1] in operators:
            print(f"NOT operator can not be before a {tokens[i+1]} Operator!")
            return False
        
    # if all cases failed means the query is valid
    return True
    


def preprocess_query(query):
    
    if not query:
        print("Query is empty")
        return None
    
    stemmer = PorterStemmer()
    tokens = word_tokenize(query.lower())  # breaking the query sentence into words
    
    
    # Ensemble AND Learning OR Neural  => implementing for MAXimum 5 words 
    query_len = len(tokens)
    if query_len > 5:
        print("Query Exceeded its Limit! Try 5 or <5 Words.\n")
        return None
    
    
   
    processed_query = []  # to store query wrds
    
    
    
    for token in tokens:
        if token in operators:
            processed_query.append(token)  
        else:
            stemmed_word = stemmer.stem(token)
            processed_query.append(stemmed_word)
    
    # now lets check the query is in valid format or not
    valid = check_query_validity(processed_query)
    
    if valid == False:
        return None
    else:
        return processed_query
    
    
"""


### testinng my valideQuery function for query preprocessing and validation

In [17]:
# validateQuery function and edge cases testing 

query1 = "Not ensemble"
print(validateQuery(query1))  

query2 = "apple AND banana OR cherry"
print(validateQuery(query2))  

query3 = "NOT banana AND apple"
print(validateQuery(query3))  

query4 = "image AND restoration"
print(validateQuery(query4))  

query5 = "AND banana apple"
print(validateQuery(query5))  


{'terms': ['ensembl'], 'operators': ['not'], 'valid': True}
{'terms': ['appl', 'banana', 'cherri'], 'operators': ['and', 'or'], 'valid': True}
{'terms': ['banana', 'appl'], 'operators': ['not', 'and'], 'valid': True}
{'terms': ['imag', 'restor'], 'operators': ['and'], 'valid': True}
{'Error': 'Invalid Operator Placement!'}


### Verifying using single query

In [18]:
query = "autoencoders"

query_data = validateQuery(query)

terms, ops, res = query_data['terms'], query_data['operators'], query_data['valid']
print(f"Query Terms: {terms}")

if res:
    retrieved = boolean_search(terms,ops,inverted_index)
    print(retrieved)
else:
    print("Invalid Query!")

Query Terms: ['autoencod']
[187, 273, 279, 325, 333, 405]


## For calculating the metrics (accuracy,precision and F1-score)

In [25]:
def compute_metrics(retrieved, relevant):   # for calculating Precision, Recall, F1-Score
   
    retrieved = set(retrieved)
    relevant = set(relevant)
    
    # True Positives: intersection of retrieved and relavant docIDs
    true_positives = len(retrieved.intersection(relevant))
    
    # Precision: what frction of retrieved docs are relevant
    if len(retrieved) > 0:
        precision = true_positives / len(retrieved)
    else:
        precision = 0
    
    
    
    # Recall: what fraction of all relevant docs were retrieved
    if len(relevant) > 0:
        recall = true_positives / len(relevant)
    else:
        recall = 0
    
    
    
    # F1-Score: Harmonic mean of precision and recall
    if precision + recall > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0
        
    return precision, recall, f1


In [26]:
gold_standard = {
    "image AND restoration": {359, 375},
    "deep AND learning": {24, 175, 177, 250, 254, 258, 267, 273, 279, 281, 333, 345, 346, 348, 352, 357, 358, 360, 362, 371, 374, 375, 380, 396, 397, 401, 404, 415, 421, 432},
    "autoencoders": {187, 273, 279, 325, 333, 405},
    "temporal AND deep AND learning": {279, 358, 373, 405},
    "time AND series": {40, 54, 110, 111, 112, 113, 158, 163, 173, 180, 181, 202, 220, 237, 238, 239, 240, 258, 277, 283, 295, 305, 350, 405, 421, 437, 438, 445},
    "time AND series AND classification": {40, 237, 283, 445},
    "time AND series OR classification": {4, 6, 9, 10, 16, 22, 24, 33, 34, 38, 40, 43, 45, 46, 49, 51, 54, 55, 56, 58, 59, 60, 63, 64, 66, 67, 71, 73, 75, 76, 77, 80, 84, 85, 94, 95, 98, 99, 106, 107, 110, 111, 112, 113, 120, 121, 122, 123, 125, 126, 128, 140, 143, 147, 158, 163, 164, 165, 167, 168, 169, 171, 173, 174, 175, 176, 177, 180, 181, 182, 187, 193, 197, 198, 202, 208, 210, 213, 215, 220, 228, 229, 234, 235, 236, 237, 238, 239, 240, 245, 247, 248, 249, 252, 256, 258, 259, 261, 265, 268, 272, 273, 277, 280, 283, 286, 287, 289, 295, 299, 302, 303, 305, 310, 313, 316, 317, 321, 327, 328, 334, 338, 341, 345, 348, 350, 352, 353, 354, 357, 363, 369, 371, 375, 377, 378, 382, 384, 385, 386, 387, 395, 397, 404, 405, 408, 420, 421, 424, 425, 427, 432, 437, 438, 439, 442, 445},
    "pattern": {9, 10, 18, 21, 23, 26, 30, 34, 40, 50, 73, 118, 126, 127, 139, 145, 148, 155, 180, 186, 189, 194, 201, 209, 214, 216, 230, 231, 234, 238, 279, 280, 288, 326, 343, 350, 351, 368, 369, 383, 394, 406, 412, 413, 424, 425, 429, 446, 447},
    "pattern AND clustering": {40, 73, 180, 216, 326, 350, 351, 413, 446},
    "pattern AND clustering AND heart": {73}
}


## Now Verifying Against Gold Standard Queries for (Boolean Search)

In [29]:
#query = "autoencoders"
gold_std_queries = [
    "image AND restoration",
    "deep AND learning",
    "autoencoders",
    "temporal AND deep AND learning",
    "time AND series",
    "time AND series AND classification",
    "time AND series OR classification",
    "pattern",
    "pattern AND clustering",
    "pattern AND clustering AND heart"
]

for query in gold_std_queries:
    query_data = validateQuery(query)

    terms, ops, res = query_data['terms'], query_data['operators'], query_data['valid']
    print(f"Original Query: {query}")
    print(f"Query Processed Terms: {terms}")
    print(f"Operators in Query: {ops}")
    

    if res:
        retrieved = boolean_search(terms,ops,inverted_index)
        print(f"Docs Retrieved: {retrieved}")
    else:
        print("Invalid Query!")
        
      
    # feteching the gold standard set for the query
    gold_docs = gold_standard.get(query, set())
    
    # calculating the metrics for reliablilty of results
    precision, recall, f1 = compute_metrics(retrieved, gold_docs)
    
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)
    print("****************************************************************************\n")

    

Original Query: image AND restoration
Query Processed Terms: ['imag', 'restor']
Operators in Query: ['and']
Docs Retrieved: [359, 375]
Precision: 1.0
Recall: 1.0
F1-score: 1.0
****************************************************************************

Original Query: deep AND learning
Query Processed Terms: ['deep', 'learn']
Operators in Query: ['and']
Docs Retrieved: [23, 24, 174, 175, 176, 177, 213, 245, 247, 250, 254, 258, 267, 272, 273, 278, 279, 281, 325, 333, 345, 346, 347, 348, 352, 357, 358, 360, 362, 371, 373, 374, 375, 380, 381, 382, 396, 397, 401, 404, 405, 415, 421, 432, 444]
Precision: 0.6666666666666666
Recall: 1.0
F1-score: 0.8
****************************************************************************

Original Query: autoencoders
Query Processed Terms: ['autoencod']
Operators in Query: None
Docs Retrieved: [187, 273, 279, 325, 333, 405]
Precision: 1.0
Recall: 1.0
F1-score: 1.0
****************************************************************************

Original Que

## Proximity Search

In [63]:
import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [64]:

ps = PorterStemmer()



def ProximityQuery_search(query,positional_index):
    
    
    lower_query = query.lower()
    
    if not lower_query.strip():
        return {"Error" : "Query can not be Empty."}
    
    
    # query : term   term   /   int   (handling all edge cases for spaces and whatever user inputs)
    pattren = r'^(\w+)\s+(\w+)\s*/\s*(\d+)$'
    
    match = re.match(pattren, lower_query)
    
    if not match:
        return {"Error": "Invalid proximity query format. Expected format: 'term1 term2 /k', e.g., 'neural information /2'"}
    

    term1_raw, term2_raw, dist_str = match.groups()
    
    
    # storing the stemmmed words
    term1 = ps.stem(term1_raw)
    term2 = ps.stem(term2_raw)
    
    if dist_str.isdigit():
        dist = int(dist_str)
    else:
        return {"Error": "Invalid proximity value. Please provide a numeric value after '/'."}

    
    result_docs = set()
    
    
    # chk if both terms persent in positional index
    if term1 not in positional_index or term2 not in positional_index:
        return {"terms": [term1, term2], "distance": dist, "result": []}

    
    # if they both appear in index
    docs_term1 = set(positional_index[term1].keys())  # get docIDs of term1
    docs_term2 = set(positional_index[term2].keys())  # get docIDs of term2
    
    # get the common docID by intersecting
    common_docs = docs_term1.intersection(docs_term2)
    
    
    # now for each common doc chk for positions using 2-pointer technique
    for doc in common_docs:
        pos1 = positional_index[term1][doc]
        pos2 = positional_index[term2][doc]
        
        i = 0
        j = 0
        while i<len(pos1) and j<len(pos2):
            
            gap = abs(pos1[i] - pos2[j])
            if gap <= dist:
                result_docs.add(doc)
                break   # stop chk futher positions of this doc
            
            if pos1[i] < pos2[j]:
                i += 1
            else:
                j += 1
            
    result_docs = sorted(result_docs)
    return {"terms": [term1, term2], "distance": dist, "result": result_docs}

        
        

In [65]:
# testing

ProximityQuery_search("neural information /3",positional_index)

{'terms': ['neural', 'inform'], 'distance': 3, 'result': ['26']}

## Now Verifying Against Gold Standard Queries for (Proximity Search)

In [79]:
gold_standard_proximity = {
    "neural information /2": {26},
    "feature track /5": {13,212}
}


quries = ["neural information /2", "feature track /5"]


In [80]:
for query in quries:
    result_data = ProximityQuery_search(query,positional_index)
    
    if "Error" in result_data:
        print("Query:", query)
        print(result_data["Error"])
        print("****************************************************************************\n")
        continue
    
    retrieved_docs = result_data.get("result",[])
    retrieved_docs = [int(doc) for doc in retrieved_docs if doc != ""]

    
    gold_docs = gold_standard_proximity.get(query,set())
    
    precision, recall, f1 = compute_metrics(retrieved_docs,gold_docs)
    
    print("Proximity Query:", query)
    print("Processed Terms:", result_data.get("terms"))
    print("Distance:", result_data.get("distance"))
    print("Docs Retrieved:", retrieved_docs)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)
    print("****************************************************************************\n")

    

    

Proximity Query: neural information /2
Processed Terms: ['neural', 'inform']
Distance: 2
Docs Retrieved: []
Precision: 0
Recall: 0.0
F1-score: 0
****************************************************************************

Proximity Query: feature track /5
Processed Terms: ['featur', 'track']
Distance: 5
Docs Retrieved: [13, 212]
Precision: 1.0
Recall: 1.0
F1-score: 1.0
****************************************************************************

