In [None]:
import sys
import os
# Add the project root (two directories above this file) to the import path so
# the project's own modules can be imported.
# `__file__` is not defined when this code runs as a notebook cell, so fall
# back to the current working directory in that case.
# NOTE(review): the fallback assumes the kernel was started in this notebook's
# folder — confirm for your launch setup.
try:
    _HERE = os.path.dirname(os.path.abspath(__file__))
except NameError:
    _HERE = os.getcwd()
PROJECT_ROOT = os.path.abspath(os.path.join(_HERE, '..', '..'))
# Re-running the cell must not keep appending the same entry.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
import pandas as pd
import pickle
import numpy as np

In [None]:
'''
Here we evaluate the two given queries against a custom validation set stored in 'validation_labels.csv'.
'''

In [None]:
# Import your TF-IDF search
from indexing import search_tfidf

In [None]:
# Import evaluation metrics
from evaluation import (
    compute_precision_at_K,
    compute_recall_at_K,
    compute_average_precision_at_K,
    compute_F1_score_at_K,
    compute_mean_average_precision,
    compute_mean_reciprocal_rank,
    compute_normalized_discounted_cumulative_gain
)

In [None]:
# -------------------------------------------------------------
# Helper Functions
# -------------------------------------------------------------
def load_index_data(path: str) -> dict:
    """Read the pickled index bundle stored at ``path`` and return it.

    A progress message is printed before and after loading. Errors raised by
    ``open`` or ``pickle.load`` (for example ``FileNotFoundError``) are not
    handled here and propagate to the caller.

    NOTE(review): unpickling can execute arbitrary code, so only load index
    files that were produced by a trusted source.
    """
    print(f"Loading index from {path}...")
    with open(path, "rb") as index_file:
        payload = pickle.load(index_file)
    print("Index loaded successfully.")
    return payload

In [None]:
def load_ground_truth(path: str, query_map: "dict | None" = None) -> dict:
    """Load validation labels and return the relevant PIDs for each query.

    The CSV at ``path`` must contain the columns ``query_id``, ``pid`` and
    ``labels``. A row counts as relevant when ``labels == 1``.

    Args:
        path: Location of the validation-labels CSV file.
        query_map: Optional mapping from ``query_id`` to query text. When
            omitted, the two built-in validation queries are used.

    Returns:
        A dict mapping query text to the set of relevant ``pid`` values.
        A query whose rows are all non-relevant maps to an empty set; rows
        whose ``query_id`` is not in ``query_map`` are ignored.

    Raises:
        SystemExit: If the CSV file does not exist (an error is printed first).
    """
    print(f"Loading ground truth from {path}...")
    try:
        labels_df = pd.read_csv(path)
    except FileNotFoundError:
        print(f"ERROR: Ground truth file not found at {path}")
        sys.exit(1)

    if query_map is None:
        query_map = {
            1: "women full sleeve sweatshirt cotton",
            2: "men slim jeans blue",
        }

    # Attach the query text without mutating the frame that was just loaded.
    labelled = labels_df.assign(query=labels_df['query_id'].map(query_map))

    # Iterate over the groups explicitly instead of ``groupby(...).apply``:
    # this avoids pandas' deprecation warning about applying a function to
    # the grouping column and always yields a plain dict of sets, even when
    # the file contains no rows. Unmapped query ids become NaN keys, which
    # ``groupby`` drops by default.
    ground_truth = {
        query_text: set(rows.loc[rows['labels'] == 1, 'pid'])
        for query_text, rows in labelled.groupby('query')
    }

    print(f"Loaded ground truth for {len(ground_truth)} queries.")
    return ground_truth

In [None]:
def map_at_k(all_rankings, all_relevants, K):
    """Return the mean of AP@K over all queries.

    ``all_rankings`` and ``all_relevants`` are parallel sequences (one entry
    per query); each pair is scored with ``compute_average_precision_at_K``.
    The sum is divided by the number of rankings, and ``0.0`` is returned
    when there are no rankings at all.
    """
    if not all_rankings:
        return 0.0
    ap_sum = sum(
        compute_average_precision_at_K(ranked, relevant, K)
        for ranked, relevant in zip(all_rankings, all_relevants)
    )
    return ap_sum / len(all_rankings)

In [None]:
def mrr(all_rankings, all_relevants):
    """Return the mean reciprocal rank over all queries.

    For each (ranking, relevant) pair, the reciprocal of the 1-based position
    of the first relevant document is added to the total; a ranking with no
    relevant document contributes nothing. The total is divided by the number
    of rankings, and ``0.0`` is returned when there are no rankings at all.
    """
    if not all_rankings:
        return 0.0
    rr_sum = 0
    for ranked, relevant in zip(all_rankings, all_relevants):
        # Position of the first hit, or None when the ranking has no hit.
        first_hit = next(
            (pos for pos, doc in enumerate(ranked, start=1) if doc in relevant),
            None,
        )
        if first_hit is not None:
            rr_sum += 1 / first_hit
    return rr_sum / len(all_rankings)

In [None]:
# -------------------------------------------------------------
# Main Evaluation
# -------------------------------------------------------------
def main():
    """Evaluate the TF-IDF ranker on the two validation queries and print a report.

    Loads the pickled index and the labelled validation CSV, ranks documents
    for each query with ``search_tfidf``, computes P@K, R@K, F1@K, AP@K, RR
    and NDCG per query, prints them as a two-column table, and finally prints
    MAP and MRR aggregated over both queries.

    Exits via ``sys.exit(1)`` when the index file cannot be found.
    """
    # File paths and configuration
    INDEX_PATH = "project_progress/part_2/irwa_index.pkl"
    LABELS_PATH = "data/validation_labels.csv"
    K = 10  # Metrics will be computed @10

    # Loading data
    try:
        index_data = load_index_data(INDEX_PATH)
    except FileNotFoundError:
        print(f"ERROR: Index file not found at {INDEX_PATH}")
        sys.exit(1)

    ground_truth = load_ground_truth(LABELS_PATH)

    index = index_data['index']
    tf = index_data['tf']
    idf = index_data['idf']

    # Queries to evaluate
    queries = [
        "women full sleeve sweatshirt cotton",
        "men slim jeans blue"
    ]

    all_results = []
    all_ground_truths = []
    query_metrics = {q: {} for q in queries}

    print(f"\n--- Evaluation Results (K={K}) ---")
    print("=" * 44)
    print(f"{'Metric':<8} | {'Query 1 (women...)':<18} | {'Query 2 (men...)':<15}")
    print("-" * 44)

    for query in queries:
        ranked_pids = search_tfidf(query, index, tf, idf)
        relevant_set = ground_truth.get(query, set())

        # The evaluation functions expect lists, so convert once and hand the
        # SAME list to both the per-query metrics and the MAP/MRR aggregates.
        # Previously the aggregates received the raw set, so MAP could
        # disagree with the AP@K values shown in the table.
        rel_list = list(relevant_set)

        all_results.append(ranked_pids)
        all_ground_truths.append(rel_list)

        # Calculate metrics
        p_k = compute_precision_at_K(ranked_pids, rel_list, K)
        r_k = compute_recall_at_K(ranked_pids, rel_list, K)
        f1_k = compute_F1_score_at_K(ranked_pids, rel_list, K)
        ap_k = compute_average_precision_at_K(ranked_pids, rel_list, K)
        rr_val = compute_mean_reciprocal_rank([ranked_pids], [rel_list])
        ndcg = compute_normalized_discounted_cumulative_gain(ranked_pids, rel_list)

        # Store this query's metrics; the table is printed after the loop
        query_metrics[query]['P@K'] = p_k
        query_metrics[query]['R@K'] = r_k
        query_metrics[query]['F1@K'] = f1_k
        query_metrics[query]['AP@K'] = ap_k
        query_metrics[query]['RR'] = rr_val
        query_metrics[query]['NDCG'] = ndcg

    # Print metrics for both queries, one row per metric
    for metric_name in ['P@K', 'R@K', 'F1@K', 'AP@K', 'RR', 'NDCG']:
        q1_val = query_metrics[queries[0]][metric_name]
        q2_val = query_metrics[queries[1]][metric_name]
        print(f"{metric_name:<8} | {q1_val:<18.3f} | {q2_val:<15.3f}")

    # Aggregates across all evaluated queries
    map_val = map_at_k(all_results, all_ground_truths, K)
    mrr_val = mrr(all_results, all_ground_truths)

    print(f"Overall MAP: {map_val:.3f}")
    print(f"Overall MRR:    {mrr_val:.3f}")

In [None]:
# Run the evaluation only when this code is executed as the entry point,
# not when it is imported from another module.
if __name__ == "__main__":
    main()