# Posting List operations

This notebook is to evaluate inverted index and tf_idf on any given datasets.
For usage adapt the last 4 variables in the first cell (inverted_index_path, tf_idf_path, out_path and query_files) accordingly, depending on whether everything should be executed fo train or eval.

todos:
-   review everything
-   run for train and test set
-   check output file parameter if rank -> tf_idf is correct or this is meant elsewise --> Ciwan confirmed is fine, must just be sorted :)

In [3]:
import shelve
import pickle
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import pandas as pd
import numpy as np
import os
import math 
from collections import Counter
from collections import defaultdict 
from datetime import datetime
import re
import math 

nltk.download('stopwords')

try:
    import orjson as _json
    loads = _json.loads
except Exception:
    import json as _json
    loads = _json.loads

stop = set(stopwords.words('english') + list(string.punctuation))

INVERTED_INDEX_PATH = "./data/inverted_index_eval.db"
TF_IDF_PATH = "./data/tf_idf_eval.db"
OUT_PATH = "../data/runs"
query_files = ["../data/tot25/subsets/eval20/eval20-queries-dev1.jsonl", "../data/tot25/subsets/eval20/eval20-queries-dev2.jsonl", "../data/tot25/subsets/eval20/eval20-queries-dev3.jsonl", "../data/tot25/subsets/eval20/eval20-queries-train.jsonl"]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Function for Lematization

In [4]:
# download for lematization
nltk.download('punkt_tab')   
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

def get_wordnet_pos(tag): # mapping POS tag from pos_tag to a format WordNetLemmatizer accepts.
    match tag[0]:
        case 'J':
            return wordnet.ADJ
        case 'V':
            return wordnet.VERB
        #case 'N':
        #    return wordnet.NOUN       online source: here, but twice no benefit so removed
        case 'R':
            return wordnet.ADV
        case _:
            return wordnet.NOUN

lemmatizer = WordNetLemmatizer()
def lematization(text):
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    return [lemmatizer.lemmatize(token, get_wordnet_pos(tag)).lower() for token, tag in tagged] # lower so can compare eg write friday instead of Friday

#sanity check
lem = lematization("This notebook is used to create an inverted_index and tf_idf database for the provided corpus file (train or eval). Adapt the variables in the first cell accordingly for files and output dir.Note that the executino time for eval is 1/3 that of train, despite the fact that eval contains 2.5 times the amount of docs. We assume this is ")
print(lem) # later need to remove stopwords

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Kai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Kai\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kai\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Kai\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


['this', 'notebook', 'be', 'use', 'to', 'create', 'an', 'inverted_index', 'and', 'tf_idf', 'database', 'for', 'the', 'provided', 'corpus', 'file', '(', 'train', 'or', 'eval', ')', '.', 'adapt', 'the', 'variable', 'in', 'the', 'first', 'cell', 'accordingly', 'for', 'file', 'and', 'output', 'dir.note', 'that', 'the', 'executino', 'time', 'for', 'eval', 'be', '1/3', 'that', 'of', 'train', ',', 'despite', 'the', 'fact', 'that', 'eval', 'contain', '2.5', 'time', 'the', 'amount', 'of', 'doc', '.', 'we', 'assume', 'this', 'be']


## Inverted Index

In [5]:
# Function to tokenize and preprocess a document
def preprocess_unique(text):
    #old function changed to lematization    tokens = set(word_tokenize(text.lower())) # get all tokens
    tokens = set(lematization(text))
    return [i for i in tokens if i not in stop] # get all tokens without stopwords

def AND(l1, l2):
    # note: this method assumes that both lists are sorted and do not contain any duplicates (which if the input comes from a posting list is given)
    # this asssumption is given considering the inverted index and tf_idf db's were created
    p1 = 0
    p2 = 0
    sqrtJump1 = max(math.isqrt(len(l1)), 1)
    sqrtJump2 = max(math.isqrt(len(l2)), 1)
    result = []
    while p1 < len(l1)  and p2 < len(l2):
        if l1[p1] == l2[p2]:
            result.append(l1[p1])
            p1 += 1
            p2 += 1
        # skip pointer in first list
        elif p1 % sqrtJump1 == 0 and p1 + sqrtJump1 < len(l1) and l1[p1 + sqrtJump1] <= l2[p2]:
            p1 += sqrtJump1
        # skip pointer in second list
        elif p2 % sqrtJump2 == 0 and p2 + sqrtJump2 < len(l2) and l2[p2 + sqrtJump2] <= l1[p1]:
            p2 += sqrtJump2
        elif l1[p1] < l2[p2]:
            p1 += 1
        else:
            p2 += 1
    return result

def OR(l1, l2):
    # note: this method assumes that both lists are sorted and do not contain any duplicates (which if the input comes from a posting list is given)
    # this asssumption is given considering the inverted index and tf_idf db's were created
    p1 = 0
    p2 = 0
    result = []
    while p1 < len(l1) and p2 < len(l2):
        if l1[p1] < l2[p2]:
            result.append(l1[p1])
            p1 += 1
        elif l1[p1] > l2[p2]:
            result.append(l2[p2])
            p2 += 1
        else: 
            result.append(l1[p1])
            p1 += 1
            p2 += 1
    while p1 < len(l1):
        result.append(l1[p1])
        p1 += 1
    while p2 < len(l2):
        result.append(l2[p2])
        p2 += 1
    return result

In [None]:
def query_inverted_index_and(query):
    with shelve.open(INVERTED_INDEX_PATH) as db:
        terms = [t for t in preprocess_unique(query)]
        terms.sort(key = lambda t:len(db[t]))
        result = db[terms[0]]
        for term in terms[1:]:
            result = AND(result, db[term])
        return result

def query_inverted_index_sorted_by_word_counts_top_n(query, n):
    with shelve.open(INVERTED_INDEX_PATH) as db:
        terms = [t for t in preprocess_unique(query)]
        result = list()
        for term in terms:
            if not term in db:
                continue
            result.extend(db[term])
        return Counter(result).most_common(n)
    
def evaluate_inverted_index(query_files, runID, output_path):
    os.makedirs(output_path, exist_ok=True) 
    for filename in query_files:
        print(filename)
        print(datetime.now())
        with open(output_path + "/" + filename.split("/")[-1].split(".")[0] + ".run", "w") as output_file:
            testdata = pd.read_json(filename, lines = True)
            for index, row in testdata.iterrows():
                result = query_inverted_index_sorted_by_word_counts_top_n(row["query"], 1000)
                rank_counter = 1
                for id, frequency in result:
                    output_file.write(f"{row.get('query_id')} Q0 {id} {rank_counter} {frequency} {runID}\n")
                    # print(f"{row.get("query_id")} Q0 {id} {rank_counter} {frequency} {runID}")
                    rank_counter += 1
                
                

evaluate_inverted_index(query_files, 1, OUT_PATH + "/inverted_index")
# 1 min :)

../data/tot25/subsets/eval20/eval20-queries-dev1.jsonl
2025-11-25 10:32:17.494569
../data/tot25/subsets/eval20/eval20-queries-dev2.jsonl
2025-11-25 10:32:23.910188
../data/tot25/subsets/eval20/eval20-queries-dev3.jsonl
2025-11-25 10:32:30.052930
../data/tot25/subsets/eval20/eval20-queries-train.jsonl
2025-11-25 10:33:08.426017


## TF_IDF

In [None]:
def preprocess_unique_list(text):
    #old function changed to lematization    tokens = list(word_tokenize(text.lower())) # get all tokens
    tokens = list(lematization(text)) # get all tokens
    return [i for i in tokens if i not in stop] # get all tokens without stopwords


def query_tf_idf_top_n(query, n):
    with shelve.open(TF_IDF_PATH) as db:
        terms = [t for t in preprocess_unique_list(query)]
        
        tf_idf_sums = defaultdict(float)
        
        for term in terms:
            if term not in db:
                continue
            data = db[term]
            idf = data["idf"]
            doc_ids = data["doc_ids"]
            tfs = data["tfs"]

            # in the database the tfs for each document is stored for the documents index in doc_ids, for this reason we can easily zip them together :)
            for doc_id, tf in zip(doc_ids, tfs):
                tf_idf_sums[doc_id] +=  math.log(tf + 1) * idf
        
        return sorted(tf_idf_sums.items(), key = lambda x: x[1], reverse = True)[:n]

def evaluate_tf_idf(query_files, runID, output_path):
    os.makedirs(output_path, exist_ok=True)
    for filename in query_files:
        print(filename)
        print(datetime.now())
        with open(output_path + "/" + filename.split("/")[-1].split(".")[0] + ".run", "w") as output_file:
            testdata = pd.read_json(filename, lines = True)
            for index, row in testdata.iterrows():
                result = query_tf_idf_top_n(row["query"], 1000)
                rank_counter = 1
                for id, tf_idf in result:
                    output_file.write(f"{row.get('query_id')} Q0 {id} {rank_counter} {tf_idf} {runID}\n")
                    # print(f"{row.get('query_id')} Q0 {id} {rank_counter} {tf_idf} {runID}")
                    rank_counter += 1
            
evaluate_tf_idf(query_files, 1, OUT_PATH + "/tf_idf")
# 2:49 mins :)

../data/tot25/subsets/eval20/eval20-queries-dev1.jsonl
2025-11-25 10:36:04.016110
../data/tot25/subsets/eval20/eval20-queries-dev2.jsonl
2025-11-25 10:36:23.149638
../data/tot25/subsets/eval20/eval20-queries-dev3.jsonl
2025-11-25 10:36:40.632113
../data/tot25/subsets/eval20/eval20-queries-train.jsonl
2025-11-25 10:38:35.481766


# Reduced queries without weights

In [26]:
def preprocess_unique_list(text):
    #old function changed to lematization    tokens = list(word_tokenize(text.lower())) # get all tokens
    tokens = list(lematization(text)) # get all tokens
    return [i for i in tokens if i not in stop] # get all tokens without stopwords

tokens = list(lematization("I am singing"))
print(tokens)

['i', 'be', 'sing']


In [None]:
def query_tf_idf_top_n(query, n):
    with shelve.open(TF_IDF_PATH) as db:
        terms = [t for t in preprocess_unique_list(query)]
        
        tf_idf_sums = defaultdict(float)
        
        for term in terms:
            if term not in db:
                continue
            data = db[term]
            idf = data["idf"]
            doc_ids = data["doc_ids"]
            tfs = data["tfs"]

            # in the database the tfs for each document is stored for the documents index in doc_ids, for this reason we can easily zip them together :)
            for doc_id, tf in zip(doc_ids, tfs):
                tf_idf_sums[doc_id] +=  math.log(tf + 1) * idf
        
        return sorted(tf_idf_sums.items(), key = lambda x: x[1], reverse = True)[:n]

def evaluate_tf_idf_reduced_query(query_files, runID, output_path):
    os.makedirs(output_path, exist_ok=True)
    for filename in query_files:
        print(filename)
        print(datetime.now())
        filename_improved_query = "./data/improved_queries/" + re.search(r'([^/]+)(?=\.[^.]+$)', filename).group(1) + "_improved_queries.jsonl"
        df_improved_queries = pd.read_json(filename_improved_query, lines = True)
        df_improved_queries["terms_string"] = df_improved_queries["keywords"].apply(lambda items: " ".join([d["term"] for d in items]))
        
        
        with open(output_path + "/" + filename.split("/")[-1].split(".")[0] + ".run", "w") as output_file:
            testdata = pd.read_json(filename, lines = True)
            for index, row in testdata.iterrows():
                result = query_tf_idf_top_n(df_improved_queries[df_improved_queries["id"] == row.get("query_id")]["terms_string"].iloc[0], 1000)
                rank_counter = 1
                for id, tf_idf in result:
                    output_file.write(f"{row.get('query_id')} Q0 {id} {rank_counter} {tf_idf} {runID}\n")
                    # print(f"{row.get('query_id')} Q0 {id} {rank_counter} {tf_idf} {runID}")
                    rank_counter += 1
            
evaluate_tf_idf_reduced_query(query_files, 1, OUT_PATH + "/tf_idf_reduced_queries")
# 30 seconds

../data/tot25/subsets/eval20/eval20-queries-dev1.jsonl
2025-11-25 10:39:51.421430
../data/tot25/subsets/eval20/eval20-queries-dev2.jsonl
2025-11-25 10:39:56.836661
../data/tot25/subsets/eval20/eval20-queries-dev3.jsonl
2025-11-25 10:40:01.444993
../data/tot25/subsets/eval20/eval20-queries-train.jsonl
2025-11-25 10:40:16.896775


# Distance Weighted retrieval

In this section we use TF-IDF but in addition weight the query terms by their index, i.e. first word gets weight 1, last gets minimum weight. 
The weighting function is 
1 - (1 - MIN_WEIGHT) * (index/ query_length)^2 

We chose this function for the following reasons:
1. Weights should be in the range of 1 max and MIN_WEIGHT = 0.4 min (minimum value worked the best with experience)
2. The function should be concave and decreasing slowly, hence the ^2
3. The weight should not only depend on the index, but also on the length of the query, i.e. first Token gets weight 1, last gets weight MIN_WEIGHT, no matter how long the query is. That is because our query lengths range from tens to hundreds.

In [None]:
MIN_WEIGHT = 0.4 
def query_tf_idf_top_n(query, n):
    with shelve.open(TF_IDF_PATH) as db:
        terms = [t for t in preprocess_unique_list(query)]
        term_count = len(terms)
        
        tf_idf_sums = defaultdict(float)
        
        counter = 0
        
        for term in terms:
            if term not in db:
                continue
            data = db[term]
            idf = data["idf"]
            doc_ids = data["doc_ids"]
            tfs = data["tfs"]

            # in the database the tfs for each document is stored for the documents index in doc_ids, for this reason we can easily zip them together :)
            for doc_id, tf in zip(doc_ids, tfs):
                tf_idf_sums[doc_id] +=  math.log(1 + tf) * idf * (1 - (1 - MIN_WEIGHT) * ((counter / term_count) ** 2))
            counter += 1
        
        return sorted(tf_idf_sums.items(), key = lambda x: x[1], reverse = True)[:n]

def evaluate_tf_idf_reduced_query(query_files, runID, output_path):
    os.makedirs(output_path, exist_ok=True)
    for filename in query_files:
        print(filename)
        print(datetime.now())
        filename_improved_query = "./data/improved_queries/" + re.search(r'([^/]+)(?=\.[^.]+$)', filename).group(1) + "_improved_queries.jsonl"
        df_improved_queries = pd.read_json(filename_improved_query, lines = True)
        df_improved_queries["terms_string"] = df_improved_queries["keywords"].apply(lambda items: " ".join([d["term"] for d in items]))
        
        
        with open(output_path + "/" + filename.split("/")[-1].split(".")[0] + ".run", "w") as output_file:
            testdata = pd.read_json(filename, lines = True)
            for index, row in testdata.iterrows():
                result = query_tf_idf_top_n(df_improved_queries[df_improved_queries["id"] == row.get("query_id")]["terms_string"].iloc[0], 1000)
                rank_counter = 1
                for id, tf_idf in result:
                    output_file.write(f"{row.get('query_id')} Q0 {id} {rank_counter} {tf_idf} {runID}\n")
                    # print(f"{row.get('query_id')} Q0 {id} {rank_counter} {tf_idf} {runID}")
                    rank_counter += 1
            
evaluate_tf_idf_reduced_query(query_files, 1, OUT_PATH + "/tf_idf_distance_weighted")
# 39 sec

../data/tot25/subsets/eval20/eval20-queries-dev1.jsonl
2025-11-25 10:41:47.004529
../data/tot25/subsets/eval20/eval20-queries-dev2.jsonl
2025-11-25 10:41:54.355193
../data/tot25/subsets/eval20/eval20-queries-dev3.jsonl
2025-11-25 10:42:00.804552
../data/tot25/subsets/eval20/eval20-queries-train.jsonl
2025-11-25 10:42:20.790891


# Reduced queries weights

No we also account for the weights given by ChatGPT. For each proposed phrase, like "action thriller", we preprocess it and split it into unique terms, then compute the min tf over all terms and the maximum corresponding idf over all unique terms in the phrase, based on that we compute a weighted sum tf-idf over all proposed phrases.
1. minimum tf: since we do not store phrases in our tf-idf, but only unique tokens, we make the assumption that the given terms in a phrase co-occur in the amount of the minimum tf  of the terms. We are aware that this might not be the case, however due to computational constraints and for storage reasons we cannot compute the tf-idf based on phrases, which is why we make this assumption.
2. maximum idf: Since in (1) we assume that the occurence of the phrase is defined by the least occuring term in the phrase, we take the maximum idf, as this corresponds to the least occuring term.

In [27]:
def query_tf_idf_top_n(terms_weights, n):
    with shelve.open(TF_IDF_PATH) as db:
        tf_idf_sums = defaultdict(float) # default value is 0 :)
        for entry in terms_weights:
            datas = []
            # db does not contain subterm --> ignore it for quering
            preprocessed_split = preprocess_unique_list(entry["term"])
            if any(term not in db for term in preprocessed_split):
                continue
            for term in preprocessed_split:
                datas.append(db[term])
            if len(datas) == 0:
                continue
            idf = max(data["idf"] for data in datas)
            doc_ids = set.intersection(*(set(data["doc_ids"]) for data in datas))
            min_tfs = defaultdict(lambda: float("inf"))
            for data in datas:
                for doc_id, tf in zip(data["doc_ids"], data["tfs"]):
                    min_tfs[doc_id] = min(min_tfs[doc_id], tf)
            for doc_id in doc_ids:
                tf_idf_sums[doc_id] += math.log(min_tfs[doc_id] + 1) *  idf * entry["weight"]
                
        return sorted(tf_idf_sums.items(), key = lambda x: x[1], reverse = True)[:n]
   

def evaluate_tf_idf_reduced_query(query_files, runID, output_path):
    os.makedirs(output_path, exist_ok=True)
    for filename in query_files:
        print(filename)
        print(datetime.now())
        filename_improved_query = "./data/improved_queries/" + re.search(r'([^/]+)(?=\.[^.]+$)', filename).group(1) + "_improved_queries.jsonl"
        df_improved_queries = pd.read_json(filename_improved_query, lines = True)
        
        with open(output_path + "/" + filename.split("/")[-1].split(".")[0] + ".run", "w") as output_file:
            testdata = pd.read_json(filename, lines = True)
            for index, row in testdata.iterrows():
                result = query_tf_idf_top_n(df_improved_queries[df_improved_queries["id"] == row.get("query_id")].iloc[0]["keywords"], 1000)
                rank_counter = 1
                for id, tf_idf in result:
                    output_file.write(f"{row.get('query_id')} Q0 {id} {rank_counter} {tf_idf} {runID}\n")
                    # print(f"{row.get('query_id')} Q0 {id} {rank_counter} {tf_idf} {runID}")
                    rank_counter += 1

            
evaluate_tf_idf_reduced_query(query_files, 1, OUT_PATH + "/tf_idf_reduced_queries_weights")

../data/tot25/subsets/eval20/eval20-queries-dev1.jsonl
2025-11-25 11:04:18.026441
../data/tot25/subsets/eval20/eval20-queries-dev2.jsonl
2025-11-25 11:04:23.552340
../data/tot25/subsets/eval20/eval20-queries-dev3.jsonl
2025-11-25 11:04:29.467181
../data/tot25/subsets/eval20/eval20-queries-train.jsonl
2025-11-25 11:04:45.572321


todos:
1. run evaluation of files
2. 

# Normalization by document length

In [28]:
from pathlib import Path
import subprocess
import os

root_path = Path(OUT_PATH).resolve()

for root, dirs, files in os.walk(root_path):
    root = Path(root).resolve()  # absolute path
    print("Directory:", root)
    for f in files:
        split = f.split(".")[0].split("-")[-1]
        file_path = Path(root, f).resolve()  # absolute path to file

        # Absolute path to the script
        script_path = Path("EvalPipelineSubSet/run_eval.py").resolve()

        env = os.environ.copy()
        env["PYTHONPATH"] = str(Path(".").resolve())

        cmd = [
            "python", str(script_path),
            "--split", split,
            "--run", str(file_path),
            "--metrics", "ndcg@10", "ndcg@1000", "R@1000", "rr", "map", "map@1000", "P@10", "R@10", "f1@10", "success@10"
        ]

        result = subprocess.run(cmd, capture_output=True, text=True, env=env)
        print("  File:", f)
        print(result.stdout)
        print("    stderr:", result.stderr)
        #print("    returncode:", result.returncode)

Directory: C:\Users\Kai\OneDrive\Vorlesungen\Information Retrieval\Github\IR_Project\data\runs
Directory: C:\Users\Kai\OneDrive\Vorlesungen\Information Retrieval\Github\IR_Project\data\runs\inverted_index
  File: eval20-queries-dev1.run

    stderr: python: can't open file 'C:\\Users\\Kai\\OneDrive\\Vorlesungen\\Information Retrieval\\Github\\IR_Project\\Inverted_index\\EvalPipelineSubSet\\run_eval.py': [Errno 2] No such file or directory

  File: eval20-queries-dev2.run

    stderr: python: can't open file 'C:\\Users\\Kai\\OneDrive\\Vorlesungen\\Information Retrieval\\Github\\IR_Project\\Inverted_index\\EvalPipelineSubSet\\run_eval.py': [Errno 2] No such file or directory

  File: eval20-queries-dev3.run

    stderr: python: can't open file 'C:\\Users\\Kai\\OneDrive\\Vorlesungen\\Information Retrieval\\Github\\IR_Project\\Inverted_index\\EvalPipelineSubSet\\run_eval.py': [Errno 2] No such file or directory

  File: eval20-queries-train.run

    stderr: python: can't open file 'C:\\Use

In [35]:
from pathlib import Path
import subprocess


root_path = Path(OUT_PATH)

for root, dirs, files in os.walk(root_path):
    print("Directory:", root)
    for d in dirs:
        print("  Subdirectory:", os.path.join(root, d))
    for f in files:
        result = subprocess.run(f"python ../EvalPipelineSubSet/run_eval.py --split {f.split(".")[0].split("-")[-1]} --run {os.path.join(root, f)} --metrics ndcg@10 ndcg@1000 R@1000 rr map map@1000 P@10 R@10 f1@10 success@10", shell=True, capture_output=True, text=True)
        #print("  File:", f, "\t|", result.stdout )
        #print("  File:", f)
        #print("    returncode:", result.returncode)
        print("    stdout:", result.stdout)
        #print("    stderr:", result.stderr)

Directory: ..\data\runs
  Subdirectory: ..\data\runs\inverted_index
  Subdirectory: ..\data\runs\tf_idf
  Subdirectory: ..\data\runs\tf_idf_distance_weighted
  Subdirectory: ..\data\runs\tf_idf_reduced_queries
  Subdirectory: ..\data\runs\tf_idf_reduced_queries_weights
Directory: ..\data\runs\inverted_index
Explanation: missing queries are scored as 0 and still averaged; this is standard TREC practice
Since the metric values that also exist on the website match ours 1:1, they handled it the same way, and our pipeline corresponds to their evaluation method.

Split: dev1
Queries (in qrels): 142
Run: C:\Users\Kai\OneDrive\Vorlesungen\Information Retrieval\Github\IR_Project\data\runs\inverted_index\eval20-queries-dev1.run
     NDCG@10 : 0.000
   NDCG@1000 : 0.002
      R@1000 : 0.014
          RR : 0.000
         MAP : 0.000
    MAP@1000 : 0.000
        P@10 : 0.000
        R@10 : 0.000
       F1@10 : 0.000
  success@10 : 0.000

Explanation: missing queries are scored as 0 and still averag