# Posting List operations

This notebook evaluates the inverted index and the tf_idf index on a given dataset.
To use it, adapt the last 4 variables in the first cell (INVERTED_INDEX_PATH, TF_IDF_PATH, OUT_PATH and query_files) accordingly, depending on whether everything should be executed for train or eval.


todos:
-   review everything
-   run for train and test set
-   check whether the output file parameter (rank -> tf_idf) is correct or whether something else is meant --> Ciwan confirmed this is fine, the results just have to be sorted :)

In [28]:
import shelve
import pickle
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
import os
import math 
from collections import Counter
from collections import defaultdict 


# Fetch the NLTK English stopword corpus (reports "already up-to-date" when it is present).
# NOTE(review): only the stopword corpus is downloaded here; word_tokenize presumably
# also needs NLTK tokenizer data (punkt) to be installed -- confirm on a fresh environment.
nltk.download('stopwords')

# Prefer orjson when it can be imported, otherwise fall back to the stdlib json module.
# Either way `loads` ends up bound to the chosen module's `loads` function.
try:
    import orjson as _json
    loads = _json.loads
except Exception:
    import json as _json
    loads = _json.loads

# --- Configuration: adapt these four values for the train or eval run ---
# Shelve database opened by the inverted-index query functions.
INVERTED_INDEX_PATH = "./data/inverted_index_train.db"
# Shelve database holding the tf-idf data.
TF_IDF_PATH = "./data/tf_idf_train.db"
# Base directory for run files; the (commented-out) evaluate calls append a sub-folder to it.
# NOTE(review): the index paths are relative to ./data while the output and query
# paths use ../data -- confirm this difference is intended.
OUT_PATH = "../data/runs"
# JSON-lines query files; each is read with pd.read_json(..., lines=True).
query_files = ["../data/tot25/subsets/train80/train80-queries-dev1.jsonl"]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Inverted Index

In [5]:
# Tokens removed from every text: English stopwords plus single punctuation characters.
stop = set(stopwords.words('english') + list(string.punctuation))


def preprocess_unique(text):
    """Tokenize ``text`` and return its distinct tokens without stopwords.

    The text is lower-cased before tokenization. Each remaining token occurs
    exactly once in the result, in the order of its first occurrence in the
    text.

    Args:
        text: The raw document or query string.

    Returns:
        A list of unique tokens that are not contained in ``stop``.
    """
    tokens = word_tokenize(text.lower())
    # dict.fromkeys de-duplicates while keeping insertion order. Going through a
    # set instead would make the order depend on string hash randomisation, so
    # tie-breaking and float summation order downstream could change between
    # kernel restarts.
    return [token for token in dict.fromkeys(tokens) if token not in stop]

def AND(l1, l2):
    """Intersect two posting lists using simulated skip pointers.

    Both inputs are expected to be sorted ascending and free of duplicates,
    which holds for posting lists taken from the inverted index / tf_idf
    databases. Every position that is a multiple of isqrt(len(list)) acts as
    a skip pointer that may jump ahead by that same distance.

    Args:
        l1: First sorted posting list.
        l2: Second sorted posting list.

    Returns:
        A new list with the elements present in both inputs, in ascending order.
    """
    size1, size2 = len(l1), len(l2)
    # Skip distance per list; at least 1 so the modulo below is always defined.
    skip1 = max(math.isqrt(size1), 1)
    skip2 = max(math.isqrt(size2), 1)

    common = []
    i = j = 0
    while i < size1 and j < size2:
        left, right = l1[i], l2[j]

        if left == right:
            common.append(left)
            i += 1
            j += 1
            continue

        # Follow the skip pointer of the first list if its target does not overshoot.
        if i % skip1 == 0 and i + skip1 < size1 and l1[i + skip1] <= right:
            i += skip1
            continue

        # Otherwise try the skip pointer of the second list.
        if j % skip2 == 0 and j + skip2 < size2 and l2[j + skip2] <= left:
            j += skip2
            continue

        # No skip possible: advance whichever list currently holds the smaller value.
        if left < right:
            i += 1
        else:
            j += 1
    return common

def OR(l1, l2):
    """Merge two posting lists into their union.

    Both inputs are expected to be sorted ascending and free of duplicates,
    which holds for posting lists taken from the inverted index / tf_idf
    databases.

    Args:
        l1: First sorted posting list.
        l2: Second sorted posting list.

    Returns:
        A new list containing every element of either input; an element that
        is present in both inputs is emitted once.
    """
    size1, size2 = len(l1), len(l2)
    merged = []
    i = j = 0
    while i < size1 and j < size2:
        left, right = l1[i], l2[j]
        if left > right:
            merged.append(right)
            j += 1
        elif left < right:
            merged.append(left)
            i += 1
        else:
            # Same element in both lists: keep one copy and advance both.
            merged.append(left)
            i += 1
            j += 1
    # At most one of the lists still has elements left; append that tail.
    merged.extend(l1[i:])
    merged.extend(l2[j:])
    return merged

In [None]:
def query_inverted_index_and(query):
    """Return the ids of the documents that contain every term of ``query``.

    The query is preprocessed with ``preprocess_unique`` and the posting lists
    of its terms are intersected with ``AND``, shortest list first, so that
    intermediate results stay as small as possible.

    Args:
        query: The raw query string.

    Returns:
        The intersected posting list. An empty list is returned when the query
        contains no usable term or when at least one term is not in the index
        (a conjunction with a term that has no postings is empty).
    """
    # flag="r" opens the index read-only; the default flag "c" would silently
    # create a new, empty database if INVERTED_INDEX_PATH were wrong.
    with shelve.open(INVERTED_INDEX_PATH, flag="r") as db:
        postings = []
        for term in preprocess_unique(query):
            if term not in db:
                return []
            # Load every posting list exactly once; it is reused for sorting and intersecting.
            postings.append(db[term])

    if not postings:
        return []

    postings.sort(key=len)
    result = postings[0]
    for posting in postings[1:]:
        if not result:
            # Nothing left to intersect with.
            break
        result = AND(result, posting)
    return result

def query_inverted_index_sorted_by_word_counts_top_n(query, n):
    """Rank documents by how many distinct query terms they contain.

    The query is preprocessed with ``preprocess_unique``; terms that are not
    in the index are ignored.

    Args:
        query: The raw query string.
        n: Maximum number of results to return.

    Returns:
        A list of at most ``n`` ``(doc_id, matching_term_count)`` tuples,
        ordered by descending count.
    """
    term_hits = Counter()
    # flag="r" opens the index read-only; the default flag "c" would silently
    # create a new, empty database if INVERTED_INDEX_PATH were wrong.
    with shelve.open(INVERTED_INDEX_PATH, flag="r") as db:
        for term in preprocess_unique(query):
            if term in db:
                # Count directly instead of first concatenating all posting lists.
                term_hits.update(db[term])
    return term_hits.most_common(n)
    
def evaluate_inverted_index(query_files, runID, output_path):
    """Run every query of the given files against the inverted index and write run files.

    For each query file one text file named after the query file (its base
    name up to the first dot, with a ``.txt`` suffix) is written into
    ``output_path``. Each line has the form
    ``<query_id> Q0 <doc_id> <rank> <frequency> <runID>``, with ranks starting at 1.

    Args:
        query_files: Paths of JSON-lines files; each record is read for its
            ``query`` and ``query_id`` fields.
        runID: Identifier written as the last column of every line.
        output_path: Directory for the result files; created if missing.
    """
    os.makedirs(output_path, exist_ok=True)
    for filename in query_files:
        run_name = filename.split("/")[-1].split(".")[0]
        with open(output_path + "/" + run_name + ".txt", "w") as output_file:
            queries = pd.read_json(filename, lines=True)
            for _, row in queries.iterrows():
                ranked = query_inverted_index_sorted_by_word_counts_top_n(row["query"], 1000)
                query_id = row.get('query_id')
                for rank, (doc_id, frequency) in enumerate(ranked, start=1):
                    output_file.write(f"{query_id} Q0 {doc_id} {rank} {frequency} {runID}\n")
                
                

# evaluate_inverted_index(query_files, 1, OUT_PATH + "/inverted_index")

## TF_IDF

In [None]:
# TF_IDF_PATH is intentionally not reassigned here: the value from the
# configuration cell at the top of the notebook is used, so switching between
# train and eval only requires editing that one cell.

def query_tf_idf_top_n(query, n):
    """Rank documents by the sum of tf * idf over the distinct query terms.

    The query is preprocessed with ``preprocess_unique``; terms that are not
    in the tf_idf database are ignored.

    Args:
        query: The raw query string.
        n: Maximum number of results to return.

    Returns:
        A list of at most ``n`` ``(doc_id, score)`` tuples, ordered by
        descending score.
    """
    tf_idf_sums = defaultdict(float)

    # flag="r" opens the database read-only; the default flag "c" would silently
    # create a new, empty database if TF_IDF_PATH were wrong.
    with shelve.open(TF_IDF_PATH, flag="r") as db:
        for term in preprocess_unique(query):
            if term not in db:
                continue
            data = db[term]
            idf = data["idf"]
            # "tfs" is stored index-aligned with "doc_ids", so the two can be zipped.
            for doc_id, tf in zip(data["doc_ids"], data["tfs"]):
                tf_idf_sums[doc_id] += tf * idf

    return sorted(tf_idf_sums.items(), key=lambda item: item[1], reverse=True)[:n]

def evaluate_tf_idf(query_files, runID, output_path):
    """Run every query of the given files against the tf_idf database and write run files.

    For each query file one text file named after the query file (its base
    name up to the first dot, with a ``.txt`` suffix) is written into
    ``output_path``. Each line has the form
    ``<query_id> Q0 <doc_id> <rank> <tf_idf> <runID>``, with ranks starting at 1.

    Args:
        query_files: Paths of JSON-lines files; each record is read for its
            ``query`` and ``query_id`` fields.
        runID: Identifier written as the last column of every line.
        output_path: Directory for the result files; created if missing.
    """
    os.makedirs(output_path, exist_ok=True)
    for filename in query_files:
        run_name = filename.split("/")[-1].split(".")[0]
        with open(output_path + "/" + run_name + ".txt", "w") as output_file:
            queries = pd.read_json(filename, lines=True)
            for _, row in queries.iterrows():
                ranked = query_tf_idf_top_n(row["query"], 1000)
                query_id = row.get('query_id')
                for rank, (doc_id, tf_idf) in enumerate(ranked, start=1):
                    output_file.write(f"{query_id} Q0 {doc_id} {rank} {tf_idf} {runID}\n")
            
            

# evaluate_tf_idf(query_files, 1, OUT_PATH + "/tf_idf")