In [None]:
import sys
import os
# Add the project root to the path so we can import our modules.
# `__file__` is not defined inside a Jupyter/IPython kernel, so referencing it
# there raises NameError. Fall back to the current working directory in that
# case (presumably the notebook's own folder -- verify for your launcher).
try:
    _base_dir = os.path.dirname(__file__)
except NameError:
    _base_dir = os.getcwd()
sys.path.append(os.path.join(_base_dir, '..', '..'))
import pandas as pd
import pickle
import numpy as np

In [None]:
'''
Here we evaluate our own set of 5 queries against a custom validation set stored in 'my_queries_validation_labels.csv'.

'''

In [None]:
# Import your TF-IDF search
from indexing import search_tfidf

In [None]:
# Import evaluation metrics
from evaluation import (
    compute_precision_at_K,
    compute_recall_at_K,
    compute_average_precision_at_K,
    compute_F1_score_at_K,
    compute_mean_average_precision,
    compute_mean_reciprocal_rank,
    compute_normalized_discounted_cumulative_gain
)

In [None]:
# -------------------------------------------------------------
# Helper Functions
# -------------------------------------------------------------
def load_index_data(path: str) -> dict:
    """Unpickle and return the precomputed index bundle stored at ``path``.

    Progress messages are printed before and after the load. Any error raised
    by ``open`` (e.g. ``FileNotFoundError``) propagates to the caller.

    Args:
        path: Location of the pickle file to read.

    Returns:
        The object stored in the pickle file.
    """
    print(f"Loading index from {path}...")
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(path, "rb") as pickle_file:
        index_bundle = pickle.load(pickle_file)
    print("Index loaded successfully.")
    return index_bundle

In [None]:
def load_ground_truth(path: str) -> dict:
    """Load the hand-made validation labels and group them per query.

    The CSV is read with columns 'query', 'pid' and 'labels'. Because the
    query text itself is the key, no query-id mapping is needed.

    Args:
        path: Location of the validation CSV.

    Returns:
        A dict mapping each query text to the set of 'pid' values whose
        'labels' value equals 1. A query with no such rows maps to an
        empty set.

    Exits:
        Prints an error and calls ``sys.exit(1)`` when the file is missing.
    """
    print(f"Loading own queries ground truth from {path}...")
    try:
        df = pd.read_csv(path)
    except FileNotFoundError:
        print(f"ERROR: Ground truth file not found at {path}")
        # The hint names the same file the rest of the notebook refers to.
        print("Please create 'my_queries_validation_labels.csv' in the 'data/' folder first.")
        sys.exit(1)

    # Iterate over the groups directly instead of using groupby().apply():
    # apply() on a frame that still contains the grouping column is
    # deprecated in recent pandas, and plain iteration always yields a dict
    # keyed by query, including for an empty file.
    ground_truth = {
        query: set(rows.loc[rows['labels'] == 1, 'pid'])
        for query, rows in df.groupby('query')
    }

    print(f"Loaded my queries ground truth for {len(ground_truth)} queries.")
    return ground_truth

In [None]:
def map_at_k(all_rankings, all_relevants, K):
    """Average the per-query AP@K values over all queries.

    Rankings and relevant collections are paired positionally; the sum is
    divided by the number of rankings. Returns 0.0 when ``all_rankings`` is
    empty.
    """
    ap_sum = sum(
        compute_average_precision_at_K(ranked_docs, relevant_docs, K)
        for ranked_docs, relevant_docs in zip(all_rankings, all_relevants)
    )
    if all_rankings:
        return ap_sum / len(all_rankings)
    return 0.0

In [None]:
def mrr(all_rankings, all_relevants):
    """Average, over all queries, the reciprocal rank of the first relevant hit.

    Ranks are 1-based. A query whose ranking contains no relevant document
    contributes 0. Returns 0.0 when ``all_rankings`` is empty.
    """
    reciprocal_sum = 0
    for ranked_docs, relevant_docs in zip(all_rankings, all_relevants):
        # Position of the first relevant document, or None if there is none.
        first_hit = next(
            (
                position
                for position, doc_id in enumerate(ranked_docs, start=1)
                if doc_id in relevant_docs
            ),
            None,
        )
        if first_hit is not None:
            reciprocal_sum += 1 / first_hit

    if all_rankings:
        return reciprocal_sum / len(all_rankings)
    return 0.0

In [None]:
# -------------------------------------------------------------
# Main Evaluation Script
# -------------------------------------------------------------
def main():
    """Evaluate the TF-IDF search on the hand-labelled queries and print metrics.

    Loads the pickled index and the validation labels, runs every query in
    ``my_queries`` through ``search_tfidf``, prints the per-query metrics and
    finally prints MAP and MRR over all queries.

    Exits with status 1 (via ``sys.exit``) when the index file is missing;
    ``load_ground_truth`` does the same for a missing labels file.
    """
    # File paths and configuration
    INDEX_PATH = "project_progress/part_2/irwa_index.pkl"
    LABELS_PATH = "data/my_queries_validation_labels.csv"
    K = 10  # Cut-off rank used for the @K metrics

    # Load the precomputed index exactly once; there is no need to rebuild it
    # or to read the pickle a second time.
    try:
        index_data = load_index_data(INDEX_PATH)
    except FileNotFoundError:
        print(f"ERROR: Index file not found at {INDEX_PATH}")
        sys.exit(1)

    ground_truth = load_ground_truth(LABELS_PATH)

    index = index_data['index']
    tf = index_data['tf']
    idf = index_data['idf']

    # Queries to evaluate
    my_queries = [
        "ARBO cotton track pants for men",
        "Multicolor track pants combo ECKO",
        "Black solid women track pants",
        "Elastic waist cotton blend track pants",
        "Self design multicolor track pants"
    ]

    all_results = []
    all_ground_truths = []

    print(f"\n--- Evaluation Results (K={K}) ---")
    print("=" * 44)

    for query in my_queries:
        ranked_pids = search_tfidf(query, index, tf, idf)
        relevant_set = ground_truth.get(query, set())

        # NOTE(review): the overall MAP/MRR calls below receive the sets,
        # while the per-query calls receive lists -- confirm both are accepted.
        all_results.append(ranked_pids)
        all_ground_truths.append(relevant_set)

        # Per the original author's note, the evaluation functions expect
        # lists, so the set is converted before the per-query calls.
        rel_list = list(relevant_set)

        # Calculate metrics
        p_k = compute_precision_at_K(ranked_pids, rel_list, K)
        r_k = compute_recall_at_K(ranked_pids, rel_list, K)
        f1_k = compute_F1_score_at_K(ranked_pids, rel_list, K)
        ap_k = compute_average_precision_at_K(ranked_pids, rel_list, K)
        rr_val = compute_mean_reciprocal_rank([ranked_pids], [rel_list])
        ndcg = compute_normalized_discounted_cumulative_gain(ranked_pids, rel_list)

        # Print a table for this query; labels follow K instead of a literal 10.
        print(f"\nQuery: '{query}'")
        print("-" * 60)
        if query not in ground_truth:
            # A query text that does not match the CSV exactly would otherwise
            # silently score zero on every metric.
            print("  WARNING: no ground truth labels found for this query")
        print(f"  P@{K}:    {p_k:.3f}")
        print(f"  R@{K}:    {r_k:.3f}")
        print(f"  F1@{K}:   {f1_k:.3f}")
        print(f"  AP@{K}:   {ap_k:.3f}")
        print(f"  RR:      {rr_val:.3f}")
        print(f"  NDCG: {ndcg:.3f}")

    # Print aggregate metrics over all evaluated queries
    print("\n" + "=" * 60)
    print(f"--- Overall Results for MY {len(my_queries)} Queries ---")

    map_val = compute_mean_average_precision(all_results, all_ground_truths)
    mrr_val = compute_mean_reciprocal_rank(all_results, all_ground_truths)

    print(f"Overall MAP: {map_val:.3f}")
    print(f"Overall MRR:    {mrr_val:.3f}")
    print("=" * 60)

In [None]:
# Run the evaluation only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()