# Part 2: Indexing and Evaluation

In [None]:
# Imports
import os, collections, string, re, math
from collections import defaultdict
from array import array
import pandas as pd
import numpy as np
import numpy.linalg as la

from unidecode import unidecode
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [None]:
# Data import
# The data directory is resolved relative to the current working directory.
data_path =  os.path.join(os.getcwd(), '../../data/')

data = pd.read_csv(os.path.join(data_path, 'fashion_products_cleaned.csv'))
# Alternative column sets that were considered for indexing:
# text_columns = ['title', 'description']
# text_columns = ['title', 'description', 'brand', 'category', 'sub_category', 'seller']
text_columns = ['title']
# Replace missing values in the indexed columns with empty strings so they can be joined as text.
data[text_columns] = data[text_columns].fillna('')

# Show a slice of rows for a quick visual check.
data[2640:2660]

## 1. Indexing

### 1.1 Build inverted index

After having pre-processed the data, you can then create the inverted index.

HINT - you may use the vocabulary data structure, like the one seen during the
Practical Labs:

{
    Term_id_1: [document_1, document_2, document_4],
    Term_id_2: [document_1, document_3, document_5, document_6],
    etc…
}

Important: For this assignment, we will be using conjunctive queries (AND). This means that every returned document must contain all the words from the query in order to be considered a match.

In [None]:
# Preprocessing function used in PART-1
# Shared resources are built once at module level and reused by every call.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
translator = str.maketrans('', '', string.punctuation)

def preprocess_text(text):
    '''
    Turn a raw string into a list of normalized index terms.

    Steps, in order: lowercase, delete ASCII punctuation, transliterate with
    unidecode, tokenize, drop stopwords and non-alphabetic tokens, stem, and
    finally drop stems of length 2 or less.

    Argument:
    text -- the string to process

    Returns:
    list of stemmed tokens
    '''
    text = text.lower()  # Lowercase
    text = text.translate(translator)  # Remove punctuation
    text = unidecode(text)  # normalize
    tokens = word_tokenize(text)  # Tokenization
    # Remove stopwords and non-alphabetic tokens
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    stemmed_tokens = [stemmer.stem(word) for word in tokens]  # Stemming
    # Remove short tokens
    return [word for word in stemmed_tokens if len(word) > 2]

In [None]:
# 1. Combining text fields from the dataset to create a searchable document for each product.
def create_document_for_each_row(row, text_columns):
    '''
    Concatenate the selected fields of one row into a single lowercase string.

    Null values, empty strings and the literal string 'nan' are left out;
    the remaining values are converted with str() and joined by single spaces.
    Returns: string of concatenated text fields
    '''
    pieces = []
    for col in text_columns:
        value = row[col]
        if not pd.notnull(value):
            continue
        text = str(value)
        if text in ('nan', ''):
            continue
        pieces.append(text)
    return ' '.join(pieces).lower()

# 2. Building an inverted index to map terms to product IDs.
def build_inverted_index(data, text_columns):
    '''
    Build an inverted index from the dataset.

    Each row's selected text columns are concatenated, preprocessed with
    preprocess_text, and every distinct resulting term is mapped to the
    row's 'pid'.

    Argument:
    data -- DataFrame with a 'pid' column and the columns named in text_columns
    text_columns -- list of column names whose text is indexed

    Returns: dict mapping terms to list of document IDs (each document listed
    at most once per term, in row order)
    '''
    inverted_index = defaultdict(list)

    for _, row in data.iterrows():
        doc_id = row['pid']
        row_text = create_document_for_each_row(row, text_columns)

        # preprocess_text already returns the final tokens, so there is no
        # need to join them back into a string and tokenize again.
        tokens = preprocess_text(row_text)

        # dict.fromkeys removes duplicates while keeping first-occurrence order
        for term in dict.fromkeys(tokens):
            inverted_index[term].append(doc_id)

    return dict(inverted_index)

# 3. Implementing a simple search function to retrieve products based on keyword queries. (conjunctive queries)
def conjunctive_search(query, inverted_index, preprocess=None):
    '''
    Perform AND query: return documents containing ALL QUERY TERMS.

    The query is normalized with the same function used to build the index
    (preprocess_text by default), so stemming and stopword removal are applied
    to the query as well; otherwise a query such as "jeans" could never match
    the indexed stem.

    Argument:
    query -- raw query string
    inverted_index -- dict mapping terms to list of document IDs
    preprocess -- optional callable turning the query string into a list of
                  terms; defaults to preprocess_text

    Returns: list of document IDs, in the order of the shortest posting list
    '''
    tokenize = preprocess_text if preprocess is None else preprocess
    # Repeated query terms do not change an AND query; keep each term once
    query_terms = list(dict.fromkeys(tokenize(query)))
    if not query_terms:
        return []

    postings = []
    for term in query_terms:
        term_docs = inverted_index.get(term)
        if not term_docs:
            # A single unmatched term makes the whole conjunction empty
            return []
        postings.append(term_docs)

    # Start from the shortest posting list to keep the candidate set small
    postings.sort(key=len)
    candidates, *others = postings
    other_sets = [set(docs) for docs in others]

    # Filtering the candidate list (instead of intersecting sets) keeps the
    # output order deterministic across runs.
    return [
        doc for doc in dict.fromkeys(candidates)
        if all(doc in docs for docs in other_sets)
    ]

In [None]:
# TODO: detail_columns = ['detail_fabric', 'detail_color', 'detail_pattern', ...]
# For now we are building it without the details columns

# --- Our inverted index ---
# Maps each term to the list of product ids whose text_columns contain it.
inverted_index = build_inverted_index(data, text_columns)

### 1.2 Propose test queries

Define five queries that will be used to evaluate your search engine. (Be creative 😉)

HINT: How to choose the queries? The selection of the queries is up to you, but it’s suggested to select terms based on the popularity (keywords ranked by term frequencies or by TF-IDF, etc).

In [None]:
# Five hand-picked test queries, keyed by query id.
test_queries = {
    1: 'men cotton shirt',
    2: 'women casual polo neck',
    3: 'men regular fit tshirt',
    4: 'zipper sweater',
    5: 'solid round neck cotton'
}

# Run each query through the conjunctive (AND) search and report the matches.
for query_text in test_queries.values():
    results = conjunctive_search(query_text, inverted_index)
    print(
        f'\nTest query: "{query_text}"',
        f'Found {len(results)} matching documents',
        f'Sample results: {results[:5]}',
        sep='\n',
    )

### 1.3 Rank your results

Implement the TF-IDF algorithm and provide ranking-based results.

In [None]:
def create_index_tfidf(data, columns=('title', 'description', 'category'), preprocess=None):
    '''
    Build a positional inverted index and compute tf, df and idf.

    Argument:
    data -- DataFrame with a 'pid' column and the columns listed in `columns`
    columns -- names of the columns whose text is concatenated and indexed
    preprocess -- optional callable turning a string into a list of terms;
                  defaults to preprocess_text

    Returns:
    index - the inverted index (a Python dictionary) with terms as keys and, as values, the list of
    postings [pid, array of positions] of the documents each term appears in.
    tf - normalized term frequency of each term in each document (same order as the postings in index)
    df - number of documents each term appears in
    idf - inverse document frequency of each term
    '''
    tokenize = preprocess_text if preprocess is None else preprocess
    # A tuple would be read by pandas as a single (multi-level) key, so always select with a list
    columns = list(columns)

    index = defaultdict(list)
    tf = defaultdict(list)  # term frequencies of terms in documents (documents in the same order as in the main index)
    df = defaultdict(int)  # document frequencies of terms in the corpus
    idf = defaultdict(float)
    N = len(data.index)

    for _, row in data.iterrows():
        page_id = row['pid']
        # Skip missing cells and convert the rest with str() so that NaN or
        # numeric values cannot break the join.
        text = ' '.join(str(value) for value in row[columns].values if pd.notnull(value))
        terms = tokenize(text)

        ## ===============================================================
        ## Index of the **current document**:
        ## current_page_index ==> {'term1': [current_doc, [list of positions]], ...}
        ##
        ## Example: if the current doc has id 1 and its terms are
        ## 'web retrieval information retrieval':
        ## current_page_index ==> {'web': [1, [0]], 'retrieval': [1, [1, 3]], 'information': [1, [2]]}
        ## ===============================================================
        current_page_index = {}
        for position, term in enumerate(terms):
            if term in current_page_index:
                current_page_index[term][1].append(position)
            else:
                # 'I' indicates unsigned int
                current_page_index[term] = [page_id, array('I', [position])]

        # Euclidean norm of the raw term counts; it is the same for all terms of a document
        norm = math.sqrt(sum(len(posting[1]) ** 2 for posting in current_page_index.values()))

        for term, posting in current_page_index.items():
            # normalized tf = raw count of the term in this document / norm
            tf[term].append(np.round(len(posting[1]) / norm, 4))
            # one more document contains this term
            df[term] += 1
            # Appended in the same pass so that tf[term][i] always belongs to index[term][i]
            index[term].append(posting)

    # idf can only be computed once every document frequency is known
    for term, doc_freq in df.items():
        idf[term] = np.round(np.log(N / doc_freq), 4)

    return index, tf, df, idf

In [None]:
def rank_documents(terms, docs, index, tf, idf, score_foat_precision=7):
    '''
    Rank the documents matching a query by the dot product of tf-idf vectors.

    Argument:
    terms -- list of query terms
    docs -- list of documents, to rank, matching the query
    index -- inverted index data structure (term -> list of [doc, positions])
    tf -- term frequencies (term -> list aligned with index[term])
    idf -- inverted document frequencies
    score_foat_precision -- currently unused; kept so existing callers keep working

    Returns:
    List of [score, doc] pairs sorted by descending score (ties broken by
    descending doc id). Empty list when no document matches.
    '''
    # Set membership keeps the posting scan linear instead of O(len(docs)) per posting
    candidate_docs = set(docs)

    # Only the vector components of the query terms are needed: every other
    # component would be multiplied by 0 in the dot product.
    # Accessing a missing key creates a zero vector for that document.
    doc_vectors = defaultdict(lambda: [0] * len(terms))
    query_vector = [0] * len(terms)

    # Frequency of each term in the query, e.g.
    # collections.Counter(['hello','hello','world']) --> Counter({'hello': 2, 'world': 1})
    query_terms_count = collections.Counter(terms)
    query_norm = la.norm(list(query_terms_count.values()))

    for term_index, term in enumerate(terms):  # term_index is the position of the term in the query
        if term not in index:
            continue

        # Query weight: normalized tf (same normalization as documents) * idf
        query_vector[term_index] = query_terms_count[term] / query_norm * idf[term]

        # index[term][i] and tf[term][i] refer to the same document
        for doc_index, (doc, _positions) in enumerate(index[term]):
            if doc in candidate_docs:
                doc_vectors[doc][term_index] = tf[term][doc_index] * idf[term]

    # Score of each document: dot product between its vector and the query vector
    doc_scores = [[np.dot(doc_vector, query_vector), doc] for doc, doc_vector in doc_vectors.items()]
    doc_scores.sort(reverse=True)

    # No interactive retry here: prompting with input() blocks non-interactive
    # runs, and the retried search result was never returned to the caller.
    return doc_scores

def search_tf_idf(query, index, tf, idf):
    '''
    Retrieve and rank the documents that contain ANY of the query terms.

    The query is preprocessed with preprocess_text, the posting lists of its
    terms are merged (union), and the resulting documents are ranked with
    rank_documents.

    Argument:
    query -- raw query string
    index -- inverted index data structure (term -> list of [doc, positions])
    tf -- term frequencies
    idf -- inverted document frequencies

    Returns:
    The ranked list produced by rank_documents
    '''
    query_terms = preprocess_text(query)
    docs = set()
    for term in query_terms:
        # .get() instead of index[term]: when index is a defaultdict, plain
        # subscripting would insert an empty posting list for every unknown term.
        docs.update(posting[0] for posting in index.get(term, ()))
    return rank_documents(query_terms, list(docs), index, tf, idf)

In [None]:
# --- Our TF-IDF index ---
# NOTE: this rebinds `inverted_index` (previously the term -> [pid, ...] dict from
# build_inverted_index) to the positional index, whose postings are [pid, positions] pairs.
inverted_index, tf_index, df_index, idf_index = create_index_tfidf(data, text_columns)

In [None]:
# Prints done with copilot
# Show the five best-scored documents of every test query.
for qid, query_text in test_queries.items():
    ranked_results = search_tf_idf(query_text, inverted_index, tf_index, idf_index)

    print(f'\nTest query: "{query_text}"')
    print(f'Found {len(ranked_results)} ranked documents')

    if not ranked_results:
        print('  No results found')
    else:
        print('Top 5 results:')
        for rank, (score, doc_id) in enumerate(ranked_results[:5], start=1):
            # Look up the product row to display its title
            product = data[data['pid'] == doc_id].iloc[0]
            title = product['title']
            title_text = str(title)
            if len(title_text) > 50:
                # Truncate long titles
                title = title_text[:50] + '...'

            print(f'  {rank}. PID: {doc_id} | Score: {score}')
            print(f'     Title: {title}')
    print('-' * 70)

## 2. Evaluation

### 2.1 Implement the following evaluation metrics to assess the effectiveness of your retrieval solutions. 

These metrics will help you measure how well your system retrieves relevant documents for each query:

i. Precision@K (P@K)

ii. Recall@K (R@K)

iii. Average Precision@K (AP@K)

iv. F1-Score@K

v. Mean Average Precision (MAP)

vi. Mean Reciprocal Rank (MRR)

vii. Normalized Discounted Cumulative Gain (NDCG)

In [None]:
# We will define functions for each evaluation metric.

# Precision is the share of retrieved documents that are relevant.
def precision_at_k(ranked_docs: pd.DataFrame, val_data: pd.DataFrame, k) -> float:
    '''
    Precision among the judged documents with rank <= k.

    Argument:
    ranked_docs -- DataFrame with columns 'pid' and 'rank'
    val_data -- DataFrame with columns 'pid' and 'labels' (1 relevant, 0 not relevant)
    k -- rank cutoff (inclusive)

    Returns:
    tp / (tp + fp), where only documents that have a label are counted;
    0.0 when no labelled document has rank <= k.
    '''
    retrieved = set(ranked_docs.loc[ranked_docs['rank'] <= k, 'pid'])
    relevant = set(val_data.loc[val_data['labels'] == 1, 'pid'])
    not_relevant = set(val_data.loc[val_data['labels'] == 0, 'pid'])

    tp = len(retrieved & relevant)
    fp = len(retrieved & not_relevant)

    judged = tp + fp
    return tp / judged if judged else 0.0

# Recall is the share of relevant documents that are retrieved.
def recall_at_k(ranked_docs: pd.DataFrame, val_data: pd.DataFrame, k) -> float:
    '''
    Share of the relevant documents that appear with rank <= k.

    Argument:
    ranked_docs -- DataFrame with columns 'pid' and 'rank'
    val_data -- DataFrame with columns 'pid' and 'labels' (1 relevant, 0 not relevant)
    k -- rank cutoff (inclusive)

    Returns:
    tp / number of relevant documents; 0.0 when there are no relevant documents.
    '''
    retrieved = set(ranked_docs.loc[ranked_docs['rank'] <= k, 'pid'])
    relevant = set(val_data.loc[val_data['labels'] == 1, 'pid'])

    if not relevant:
        return 0.0

    # Every relevant document outside the top k is a false negative, including
    # those that were never retrieved at all, so the denominator is |relevant|.
    return len(retrieved & relevant) / len(relevant)

# Average Precision is the average of precision scores at each rank position where a relevant document is found
def average_precision_at_k(ranked_docs: pd.DataFrame, val_data: pd.DataFrame, n) -> float:
    '''
    Average precision over rank cutoffs 1..n, adapted to tied (dense) ranks.

    Slightly different from the textbook definition: since several documents
    can share one rank, the relevance indicator rel@k is replaced by a
    comparison of P@k with the last counted precision. P@k is only added to
    the sum when it is strictly greater than that reference value.

    Argument:
    ranked_docs -- DataFrame with columns 'pid' and 'rank'
    val_data -- DataFrame with columns 'pid' and 'labels' (1 relevant, 0 not relevant)
    n -- largest rank cutoff considered

    Returns:
    Sum of the counted precisions divided by the number of relevant documents
    with rank <= n; 0.0 when there is no such document.
    '''
    prec_at_i_list = []
    prev_precision = -1
    for k in range(1, n + 1):
        current_precision = precision_at_k(ranked_docs, val_data, k)
        # this if statement plays the role of rel@K
        if current_precision < prev_precision:
            prec_at_i_list.append(0)
            prev_precision = 0
        elif current_precision == prev_precision:
            # precision did not change (e.g. it stays at 1 because too few
            # results were returned): count nothing and keep the reference value
            prec_at_i_list.append(0)
        else:
            prec_at_i_list.append(current_precision)
            prev_precision = current_precision

    # Number of relevant documents with rank <= n. Only the value at the last
    # cutoff is used, so it is computed once here.
    retrieved = set(ranked_docs.loc[ranked_docs['rank'] <= n, 'pid'])
    relevant = set(val_data.loc[val_data['labels'] == 1, 'pid'])
    gtp = len(retrieved & relevant)

    if gtp == 0:
        # No relevant document retrieved: avoid dividing by zero
        return 0.0
    return float((1 / gtp) * np.sum(prec_at_i_list))

# F1-score is the harmonic mean of precision and recall.
def f1_score_at_k(ranked_docs: pd.Series, val_data: pd.Series, k):
    '''
    Harmonic mean of precision_at_k and recall_at_k at cutoff k.

    Returns 0.0 when precision and recall add up to zero.
    '''
    precision = precision_at_k(ranked_docs, val_data, k)
    recall = recall_at_k(ranked_docs, val_data, k)
    denominator = precision + recall
    if denominator == 0:
        return 0.0
    return 2 * (precision * recall) / denominator

# Normalized Discounted Cumulative Gain (NDCG) measures the graded relevance of the retrieved documents.
def ndcg_at_k(ranked_docs: pd.Series, val_data: pd.Series, k):
    '''
    NDCG over the documents with rank <= k, using binary gains.

    A retrieved document has gain 1 when it is labelled relevant in val_data
    and 0 otherwise. The ideal ordering is the same gains sorted descending.
    Returns 0.0 when the ideal DCG is zero.
    '''
    top_k_mask = ranked_docs['rank'] <= k
    retrieved = ranked_docs[top_k_mask]['pid'].values
    relevant = val_data[val_data['labels'] == 1]['pid'].values

    # binary relevance, even though more levels of relevance would be possible
    gains = []
    for doc in retrieved:
        gains.append(1 if doc in relevant else 0)

    def discounted_sum(values):
        # position 0 is discounted by log2(2), position 1 by log2(3), ...
        total = 0
        for position, gain in enumerate(values):
            total += gain / np.log2(position + 2)
        return total

    ideal_dcg = discounted_sum(sorted(gains, reverse=True))
    if ideal_dcg > 0:
        return float(discounted_sum(gains) / ideal_dcg)
    return 0.0

def rr_at_k(ranked_docs: pd.DataFrame, val_data: pd.DataFrame, k) -> float:
    '''
    Reciprocal rank of the first RELEVANT document for the current query.

    Argument:
    ranked_docs -- DataFrame with columns 'pid' and 'rank'
    val_data -- DataFrame with columns 'pid' and 'labels' (1 relevant, 0 not relevant)
    k -- rank cutoff (inclusive)

    Returns
    -------
    1 / rank of the best-ranked relevant document, or 0.0 when no relevant
    document was retrieved or its rank is greater than k.
    '''
    # Only documents labelled as relevant may define the reciprocal rank;
    # a well-ranked non-relevant document must not count.
    relevant = set(val_data.loc[val_data['labels'] == 1, 'pid'])
    relevant_ranks = ranked_docs.loc[ranked_docs['pid'].isin(relevant), 'rank']

    if relevant_ranks.empty:
        return 0.0

    first_relevant_rank = relevant_ranks.min()
    # min() is NaN when every matching rank is missing
    if pd.isna(first_relevant_rank) or first_relevant_rank > k:
        return 0.0

    return 1 / float(first_relevant_rank)

# Mean Average Precision (MAP) is the mean of average precision scores across multiple queries.
def mean_average_precision(results: dict):
    '''
    Mean of the 'AveragePrecision@K' entries of every query in results.
    '''
    ap_scores = [query_result['AveragePrecision@K'] for query_result in results.values()]
    return np.mean(ap_scores)

# Mean Reciprocal Rank (MRR) is the average of the reciprocal ranks of the first relevant document across multiple queries.
def mean_reciprocal_rank(results: dict):
    '''
    Mean of the 'RR@K' entries of every query in results.
    '''
    rr_scores = [query_result['RR@K'] for query_result in results.values()]
    return np.mean(rr_scores)

### 2.2 Apply Evaluation Metrics
Apply the evaluation metrics you have implemented to the search results and relevance judgments provided in validation_labels.csv for the predefined queries. When reporting evaluation results, provide only numeric values, rounded to three decimal places. Do not include textual explanations or additional statistics in this section.

a. Query 1: women full sleeve sweatshirt cotton

b. Query 2: men slim jeans blue

In [None]:
# Relevance judgments for the predefined queries; later cells read its
# 'pid', 'labels' and 'query_id' columns.
val_data = pd.read_csv(os.path.join(data_path, 'validation_labels.csv'))
display(val_data.head())

In [None]:
# The queries need to be preprocessed the same way as the full data
# (search_tf_idf does this by calling preprocess_text on the query).
queries = {
    1: 'women full sleeve sweatshirt cotton',
    2: 'men slim jeans blue'
}

# Rank cutoff used by all @K metrics
k = 20

In [None]:
# Rows of the validation data labelled as relevant (all queries)
queries_relevant = val_data[val_data['labels'] == 1]

print('Relevant queries: ')
display(queries_relevant)

# Retrieve and rank documents for each query
retrieved_columns = ['pid'] + text_columns + ['score', 'query_id']
per_query_frames = []
for qid, query_text in queries.items():
    ranked_results = search_tf_idf(query_text, inverted_index, tf_index, idf_index)
    # Build the frame directly from the [score, pid] pairs: this keeps the
    # original dtypes (np.column_stack would turn every value into a string)
    # and also works when a query returns no results.
    ranked_results_df = pd.DataFrame(ranked_results, columns=['score', 'pid'])
    ranked_results_df['query_id'] = qid
    ranked_results_df = ranked_results_df.merge(data[['pid'] + text_columns], on='pid', how='inner')
    per_query_frames.append(ranked_results_df)

if per_query_frames:
    queries_retrieved = pd.concat(per_query_frames, ignore_index=True)[retrieved_columns]
else:
    queries_retrieved = pd.DataFrame(columns=retrieved_columns)

queries_retrieved['query_id'] = queries_retrieved['query_id'].astype(int)
queries_retrieved['score'] = queries_retrieved['score'].astype(float)

# Dense rank per query: documents with the same score share the same rank.
# Grouping by query_id handles any number of queries, not only ids 1 and 2.
queries_retrieved['rank'] = queries_retrieved.groupby('query_id')['score'].rank(method='dense', ascending=False)

print('\nRetrieved queries: ')
display(queries_retrieved)

In [None]:
# Attach score and rank to each labelled document. Joining on the query id as
# well prevents a label of one query from being matched with the retrieval
# results of another query that happened to return the same product.
val_data.merge(queries_retrieved, on=['pid', 'query_id'], how='left')

In [None]:
# Abandoned experiment: tried to break ties between equally scored documents by adding
# a tiny random noise to the scores and searching for the best random seed. It did not work.
# The code is kept below, disabled inside a string literal, for reference only.
'''
best_rand_seeds = {1: [-1, -1], 2: [-1, -1]}
for qid in queries:
    retrieved = queries_retrieved[queries_retrieved['query_id'] == qid]
    relevant = queries_relevant[queries_relevant['query_id'] == qid]

    for rand_seed in range(0, 10000):
        np.random.seed(rand_seed)
        retrieved['score_randomized'] = retrieved['score'] + np.random.normal(0, 0.0000001, retrieved['score'].size)
        current_precision = precision_at_k(retrieved, relevant, k)
        if best_rand_seeds[qid][1] < current_precision and current_precision > 0.0:
            best_rand_seeds[qid][1] = current_precision
            best_rand_seeds[qid][0] = rand_seed

pprint(best_rand_seeds)
'''
# Instead, ties are handled with the ranking approach based on pandas rank().
# NOTE: this is only because the labels we have are very limited and there are too many
# equal-scored documents, which makes the precision really bad. We need a way to show that
# the ranking works with the validation data we have available.

In [None]:
# Evaluate every predefined query with all the per-query metrics at cutoff k.
results = {}

for qid in queries:
    retrieved = queries_retrieved[queries_retrieved['query_id'] == qid]
    val_data_query = val_data[val_data['query_id'] == qid]

    results[qid] = {
        metric_name: metric_function(retrieved, val_data_query, k)
        for metric_name, metric_function in (
            ('Precision@K', precision_at_k),
            ('Recall@K', recall_at_k),
            ('AveragePrecision@K', average_precision_at_k),
            ('F1Score@K', f1_score_at_k),
            ('NDCG@K', ndcg_at_k),
            ('RR@K', rr_at_k),
        )
    }

# Metrics aggregated over all queries
map_score = mean_average_precision(results)
mrr_score = mean_reciprocal_rank(results)

In [None]:
# Report the metrics rounded to three decimal places, as required in section 2.2.
for qid, result in results.items():
    print(f'Query {qid} "{queries[qid]}":\n')
    for metric, value in result.items():
        print(f'\t"{metric}": {value:.3f}')
    print()

print(f'\nMAP: {map_score:.3f}')
print(f'MRR: {mrr_score:.3f}')

### 2.3 Relevance Judgments and Analysis

You will act as expert judges by establishing the ground truth for each document and query.

a. For the test queries you defined in Part 1, Step 2 during indexing, assign a binary relevance label to each document: 1 if the document is relevant to the query, or 0 if it is not.

b. Comment on each of the evaluation metrics, stating how they differ, and which information gives each of them. Analyze your results.

c. Analyze the current search system and identify its main problems or limitations. For each issue you find, propose possible ways to resolve it. Consider aspects such as retrieval accuracy, ranking quality, handling of different field types, query formulation, and indexing strategies.

In [None]:
# Display the five test queries defined in section 1.2.
test_queries

In [None]:
# Used to create the general layout for the validation data, edited after.
# 'seller' is not merged in from the data, so it stays as an empty column of the layout.
layout_columns = ['pid', 'title', 'description', 'brand', 'seller', 'score', 'query_id']
layout_frames = []
for qid, query_text in test_queries.items():
    # Only the 20 best-scored results of each query are kept for labelling
    ranked_results = search_tf_idf(query_text, inverted_index, tf_index, idf_index)[:20]
    # Build the frame directly from the [score, pid] pairs: this keeps the
    # original dtypes (np.column_stack would turn every value into a string)
    # and also works when a query returns no results.
    ranked_results_df = pd.DataFrame(ranked_results, columns=['score', 'pid'])
    ranked_results_df['query_id'] = qid
    ranked_results_df = ranked_results_df.merge(data[['pid', 'title', 'description', 'brand']], on='pid', how='inner')
    layout_frames.append(ranked_results_df)

if layout_frames:
    queries_retrieved = pd.concat(layout_frames, ignore_index=True).reindex(columns=layout_columns)
else:
    queries_retrieved = pd.DataFrame(columns=layout_columns)

# One-off export of the layout (labels were edited by hand afterwards):
# queries_retrieved['labels'] = np.ones(queries_retrieved['pid'].size, dtype=int)
# queries_retrieved[['title', 'pid', 'query_id', 'labels']].to_csv(os.path.join(data_path, 'validation_labels2.csv'), index=False)
queries_retrieved

In [None]:
# Our own relevance judgments for the five test queries.
val_data2 = pd.read_csv(os.path.join(data_path, 'validation_labels2.csv'))
display(val_data2)

In [None]:
# Rows of our validation data labelled as relevant (all queries)
queries_relevant = val_data2[val_data2['labels'] == 1]

print('Relevant queries: ')
display(queries_relevant)

# Retrieve and rank documents for each query
retrieved_columns = ['pid'] + text_columns + ['score', 'query_id']
per_query_frames = []
for qid, query_text in test_queries.items():
    ranked_results = search_tf_idf(query_text, inverted_index, tf_index, idf_index)
    # Build the frame directly from the [score, pid] pairs: this keeps the
    # original dtypes (np.column_stack would turn every value into a string)
    # and also works when a query returns no results.
    ranked_results_df = pd.DataFrame(ranked_results, columns=['score', 'pid'])
    ranked_results_df['query_id'] = qid
    ranked_results_df = ranked_results_df.merge(data[['pid'] + text_columns], on='pid', how='inner')
    per_query_frames.append(ranked_results_df)

if per_query_frames:
    queries_retrieved = pd.concat(per_query_frames, ignore_index=True)[retrieved_columns]
else:
    queries_retrieved = pd.DataFrame(columns=retrieved_columns)

queries_retrieved['query_id'] = queries_retrieved['query_id'].astype(int)
queries_retrieved['score'] = queries_retrieved['score'].astype(float)

# Dense rank per query: documents with the same score share the same rank
queries_retrieved['rank'] = queries_retrieved.groupby('query_id')['score'].rank(method='dense', ascending=False)

print('\nRetrieved queries: ')
display(queries_retrieved)

In [None]:
# Evaluate our own test queries against our relevance judgments and print the report.
results = {}

for qid in test_queries:
    retrieved = queries_retrieved[queries_retrieved['query_id'] == qid]
    val_data_query = val_data2[val_data2['query_id'] == qid]

    results[qid] = {
        metric_name: metric_function(retrieved, val_data_query, k)
        for metric_name, metric_function in (
            ('Precision@K', precision_at_k),
            ('Recall@K', recall_at_k),
            ('AveragePrecision@K', average_precision_at_k),
            ('F1Score@K', f1_score_at_k),
            ('NDCG@K', ndcg_at_k),
            ('RR@K', rr_at_k),
        )
    }

# Metrics aggregated over all queries
map_score = mean_average_precision(results)
mrr_score = mean_reciprocal_rank(results)

for qid, result in results.items():
    print(f'Query {qid} "{test_queries[qid]}":\n')
    for metric, value in result.items():
        print(f'\t"{metric}": {value:.4f}')
    print()

print(f'\nMAP: {map_score:.4f}')
print(f'MRR: {mrr_score:.4f}')

We can see that, in general, the precision seems high, while the average precision is rather low. That is because of how our precision and average precision functions work (the following generally applies to all metrics). Since the documents are quite short in relation to the vocabulary length, ranking with TF-IDF produces a ton of repeated scores, which makes it impossible to rank anything fairly. We decided on the following approach: if the score is the same, the rank also has to be the same. Doing that in a practical scenario is probably a bad idea (you cannot show multiple documents in a single slot in a search engine). However, since we are dealing with very limited validation labels, we need to be able to (at least) get some of the ranked documents "on top" without introducing any bias. With this approach, most of the metrics' logic seems to fall apart, so we had to adapt them. One example is the precision and average precision.

Precision takes into account all results that have rank <=20 (which can be tons of results, not only 20, since there are a lot of repeated scores), but apart from that it is quite straight-forward. As for the average precision, we had to adapt the concept of the relevance function since for a single rank we can have multiple documents. To achieve that, we have leveraged the fact that P@k is zero when an error is made. When an error is made, the precision at k will be always lower or equal to the precision at k-1. We can know when that happens by taking into account the current and previous precision.

Knowing how average precision works, we can now understand that, even if the precision is high (due to the fact that there are very few correctly ranked results), AP will not take those precisions into account since they are always the same. This concept explains quite well why precision is not a good metric, since you can always have precision=1 if you return only one correct document in the top K results. However, in that same scenario, because of the relevance function, the average precision will be very low (since only a few of returned documents are relevant).

Apart from that, we can see that recall is always 1 because we have retrieved the validation labels from the results of this ranking and then we've modified its labels (similar thing happens with RR). Also F1 scores are relatively high since both P@K and R@K are high. NDCG values are also high because documents ranked with higher scores are usually tagged as relevant (we used binary relevance since having multiple levels of relevance would complicate this step a ton with how we've defined ranks).