# Posting List operations

This notebook is to evaluate inverted index and tf_idf on any given datasets.
For usage adapt the last 4 variables in the first cell (inverted_index_path, tf_idf_path, out_path and query_files) accordingly, depending on whether everything should be executed fo train or eval.

todos:
-   review everything
-   run for train and test set
-   check output file parameter if rank -> tf_idf is correct or this is meant elsewise --> Ciwan confirmed is fine, must just be sorted :)

In [33]:
import shelve
import pickle
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
import os
import math 
from collections import Counter
from collections import defaultdict 
from datetime import datetime
import re

nltk.download('stopwords')

try:
    import orjson as _json
    loads = _json.loads
except Exception:
    import json as _json
    loads = _json.loads

INVERTED_INDEX_PATH = "./data/inverted_index_eval.db"
TF_IDF_PATH = "./data/tf_idf_eval.db"
OUT_PATH = "../data/runs"
query_files = ["../data/tot25/subsets/eval20/eval20-queries-dev1.jsonl", "../data/tot25/subsets/eval20/eval20-queries-dev2.jsonl", "../data/tot25/subsets/eval20/eval20-queries-dev3.jsonl", "../data/tot25/subsets/eval20/eval20-queries-train.jsonl"]
stop = set(stopwords.words('english') + list(string.punctuation))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Inverted Index

In [None]:

# Function to tokenize and preprocess a document
def preprocess_unique(text):
    tokens = set(word_tokenize(text.lower())) # get all tokens
    return [i for i in tokens if i not in stop] # get all tokens without stopwords

def AND(l1, l2):
    # note: this method assumes that both lists are sorted and do not contain any duplicates (which if the input comes from a posting list is given)
    # this asssumption is given considering the inverted index and tf_idf db's were created
    p1 = 0
    p2 = 0
    sqrtJump1 = max(math.isqrt(len(l1)), 1)
    sqrtJump2 = max(math.isqrt(len(l2)), 1)
    result = []
    while p1 < len(l1)  and p2 < len(l2):
        if l1[p1] == l2[p2]:
            result.append(l1[p1])
            p1 += 1
            p2 += 1
        # skip pointer in first list
        elif p1 % sqrtJump1 == 0 and p1 + sqrtJump1 < len(l1) and l1[p1 + sqrtJump1] <= l2[p2]:
            p1 += sqrtJump1
        # skip pointer in second list
        elif p2 % sqrtJump2 == 0 and p2 + sqrtJump2 < len(l2) and l2[p2 + sqrtJump2] <= l1[p1]:
            p2 += sqrtJump2
        elif l1[p1] < l2[p2]:
            p1 += 1
        else:
            p2 += 1
    return result

def OR(l1, l2):
    # note: this method assumes that both lists are sorted and do not contain any duplicates (which if the input comes from a posting list is given)
    # this asssumption is given considering the inverted index and tf_idf db's were created
    p1 = 0
    p2 = 0
    result = []
    while p1 < len(l1) and p2 < len(l2):
        if l1[p1] < l2[p2]:
            result.append(l1[p1])
            p1 += 1
        elif l1[p1] > l2[p2]:
            result.append(l2[p2])
            p2 += 1
        else: 
            result.append(l1[p1])
            p1 += 1
            p2 += 1
    while p1 < len(l1):
        result.append(l1[p1])
        p1 += 1
    while p2 < len(l2):
        result.append(l2[p2])
        p2 += 1
    return result

In [None]:
def query_inverted_index_and(query):
    with shelve.open(INVERTED_INDEX_PATH) as db:
        terms = [t for t in preprocess_unique(query)]
        terms.sort(key = lambda t:len(db[t]))
        result = db[terms[0]]
        for term in terms[1:]:
            result = AND(result, db[term])
        return result

def query_inverted_index_sorted_by_word_counts_top_n(query, n):
    with shelve.open(INVERTED_INDEX_PATH) as db:
        terms = [t for t in preprocess_unique(query)]
        result = list()
        for term in terms:
            if not term in db:
                continue
            result.extend(db[term])
        return Counter(result).most_common(n)
    
def evaluate_inverted_index(query_files, runID, output_path):
    os.makedirs(output_path, exist_ok=True) 
    for filename in query_files:
        print(filename)
        print(datetime.now())
        with open(output_path + "/" + filename.split("/")[-1].split(".")[0] + ".run", "w") as output_file:
            testdata = pd.read_json(filename, lines = True)
            for index, row in testdata.iterrows():
                result = query_inverted_index_sorted_by_word_counts_top_n(row["query"], 1000)
                rank_counter = 1
                for id, frequency in result:
                    output_file.write(f"{row.get('query_id')} Q0 {id} {rank_counter} {frequency} {runID}\n")
                    # print(f"{row.get("query_id")} Q0 {id} {rank_counter} {frequency} {runID}")
                    rank_counter += 1
                
                

evaluate_inverted_index(query_files, 1, OUT_PATH + "/inverted_index")
# 2 mins :)

../data/tot25/subsets/eval20/eval20-queries-dev1.jsonl
2025-11-17 22:44:36.277845
../data/tot25/subsets/eval20/eval20-queries-dev2.jsonl
2025-11-17 22:44:50.405052
../data/tot25/subsets/eval20/eval20-queries-dev3.jsonl
2025-11-17 22:45:00.711880
../data/tot25/subsets/eval20/eval20-queries-train.jsonl
2025-11-17 22:46:12.455231


## TF_IDF

In [None]:
def preprocess_unique_list(text):
    tokens = list(word_tokenize(text.lower())) # get all tokens
    return [i for i in tokens if i not in stop] # get all tokens without stopwords


def query_tf_idf_top_n(query, n):
    with shelve.open(TF_IDF_PATH) as db:
        terms = [t for t in preprocess_unique_list(query)]
        
        tf_idf_sums = defaultdict(float)
        
        for term in terms:
            if term not in db:
                continue
            data = db[term]
            idf = data["idf"]
            doc_ids = data["doc_ids"]
            tfs = data["tfs"]

            # in the database the tfs for each document is stored for the documents index in doc_ids, for this reason we can easily zip them together :)
            for doc_id, tf in zip(doc_ids, tfs):
                tf_idf_sums[doc_id] +=  tf * idf
        
        return sorted(tf_idf_sums.items(), key = lambda x: x[1], reverse = True)[:n]

def evaluate_tf_idf(query_files, runID, output_path):
    os.makedirs(output_path, exist_ok=True)
    for filename in query_files:
        print(filename)
        print(datetime.now())
        with open(output_path + "/" + filename.split("/")[-1].split(".")[0] + ".run", "w") as output_file:
            testdata = pd.read_json(filename, lines = True)
            for index, row in testdata.iterrows():
                result = query_tf_idf_top_n(row["query"], 1000)
                rank_counter = 1
                for id, tf_idf in result:
                    # output_file.write(f"{row.get('query_id')} Q0 {id} {rank_counter} {tf_idf} {runID}\n")
                    print(f"{row.get('query_id')} Q0 {id} {rank_counter} {tf_idf} {runID}")
                    rank_counter += 1
            
evaluate_tf_idf(query_files, 1, OUT_PATH + "/tf_idf")
# 3.5 mins :)

# Reduced queries without weights

In [3]:
OUT_PATH = "../data/runs/Reduced_Query"

In [None]:
import os

def preprocess_unique_list(text):
    tokens = list(word_tokenize(text.lower())) # get all tokens
    return [i for i in tokens if i not in stop] # get all tokens without stopwords


for filename in query_files:
    filename_improved_query = "./data/improved_queries/" + re.search(r'([^/]+)(?=\.[^.]+$)', filename).group(1) + "_improved_queries.jsonl"
    if os.path.isfile(filename_improved_query):
        print("File exists and is a file")
    else:
        print("No file found")
    df_improved_queries = pd.read_json(filename_improved_query, lines=True)
    
    break

File exists and is a file
id                                                        153
keywords    [{'term': 'martial arts', 'weight': 1.0}, {'te...
guesses                                                    []
Name: 1, dtype: object


In [35]:
def query_tf_idf_top_n(query, n):
    with shelve.open(TF_IDF_PATH) as db:
        terms = [t for t in preprocess_unique_list(query)]
        
        tf_idf_sums = defaultdict(float)
        
        for term in terms:
            if term not in db:
                continue
            data = db[term]
            idf = data["idf"]
            doc_ids = data["doc_ids"]
            tfs = data["tfs"]

            # in the database the tfs for each document is stored for the documents index in doc_ids, for this reason we can easily zip them together :)
            for doc_id, tf in zip(doc_ids, tfs):
                tf_idf_sums[doc_id] +=  tf * idf
        
        return sorted(tf_idf_sums.items(), key = lambda x: x[1], reverse = True)[:n]

def evaluate_tf_idf_reduced_query(query_files, runID, output_path):
    os.makedirs(output_path, exist_ok=True)
    for filename in query_files:
        print(filename)
        print(datetime.now())
        filename_improved_query = "./data/improved_queries/" + re.search(r'([^/]+)(?=\.[^.]+$)', filename).group(1) + "_improved_queries.jsonl"
        df_improved_queries = pd.read_json(filename_improved_query, lines = True)
        df_improved_queries["terms_string"] = df_improved_queries["keywords"].apply(lambda items: " ".join([d["term"] for d in items]))
        
        
        with open(output_path + "/" + filename.split("/")[-1].split(".")[0] + ".run", "w") as output_file:
            testdata = pd.read_json(filename, lines = True)
            for index, row in testdata.iterrows():
                result = query_tf_idf_top_n(df_improved_queries[df_improved_queries["id"] == row.get("query_id")]["terms_string"].iloc[0], 1000)
                rank_counter = 1
                for id, tf_idf in result:
                    # output_file.write(f"{row.get('query_id')} Q0 {id} {rank_counter} {tf_idf} {runID}\n")
                    print(f"{row.get('query_id')} Q0 {id} {rank_counter} {tf_idf} {runID}")
                    rank_counter += 1
                    break
            
evaluate_tf_idf_reduced_query(query_files, 1, OUT_PATH + "/tf_idf")

../data/tot25/subsets/eval20/eval20-queries-dev1.jsonl
2025-11-24 11:16:35.700753
152 Q0 71362650 1 3203.3987519361317 1
153 Q0 213472 1 1587.3387238090706 1
185 Q0 74998017 1 2877.841379854192 1
240 Q0 101888 1 2724.5155889333832 1
365 Q0 3883945 1 1877.3104658814264 1
385 Q0 13406737 1 2327.1467034988177 1
391 Q0 4035 1 1087.6573675854895 1
403 Q0 294286 1 3247.5551396219225 1
442 Q0 74998017 1 5071.727830631169 1
458 Q0 13763276 1 2255.6470977564754 1
490 Q0 3333003 1 2607.1318156824786 1
498 Q0 21013155 1 1920.6357716685457 1
512 Q0 57197 1 20031.792468411513 1
554 Q0 74998017 1 3530.652500502496 1
563 Q0 26349927 1 1654.1196354539575 1
625 Q0 28356988 1 3714.103977030983 1
639 Q0 74998017 1 2403.1170172844 1
722 Q0 314828 1 1677.024229501376 1
735 Q0 74998017 1 3107.0308366366835 1
761 Q0 54090522 1 3739.2825552375775 1
766 Q0 1303939 1 2179.0423352831876 1
786 Q0 57197 1 1574.6641012070495 1
813 Q0 21013155 1 1796.8114057738665 1
850 Q0 74998017 1 2307.337436065065 1
861 Q0 85578