In [None]:
!pip install faiss-cpu
!pip install -U sentence-transformers
!pip install pytrec_eval

In [2]:
import pandas as pd
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import json
import pytrec_eval
from sklearn.preprocessing import normalize

In [3]:
def retrieve(query_embeddings, top_k=10, faiss_index=None, doc_ids=None):
    """
    Retrieve top-k most similar documents for multiple queries

    :param query_embeddings: Query embeddings; a 1D vector is treated as a
        single query, a 2D array as one query per row
    :param top_k: Number of documents to retrieve
    :param faiss_index: Object exposing ``search(queries, k)``; when omitted,
        the notebook-level ``index`` is used
    :param doc_ids: Sequence mapping an index row position to a document id;
        when omitted, the notebook-level ``docnos`` is used
    :return: Dictionary of results for pytrec_eval, keyed by the query's
        position (as a string) and mapping document id -> score
    """
    # Fall back to the notebook globals only when the caller did not pass
    # explicit objects, so existing ``retrieve(query_embeddings)`` calls work.
    if faiss_index is None:
        faiss_index = index
    if doc_ids is None:
        doc_ids = docnos

    # FAISS operates on C-contiguous float32 matrices
    query_embeddings = np.ascontiguousarray(query_embeddings, dtype=np.float32)

    # Ensure query embeddings are 2D
    if query_embeddings.ndim == 1:
        query_embeddings = query_embeddings.reshape(1, -1)

    # Search index
    distances, indices = faiss_index.search(query_embeddings, top_k)

    # Convert results to dictionary format for pytrec_eval
    results = {}
    for i, (doc_indices, doc_distances) in enumerate(zip(indices, distances)):
        # Use query index as string key
        results[str(i)] = {
            # Plain Python floats instead of numpy scalars
            doc_ids[idx]: float(dist)
            for idx, dist in zip(doc_indices, doc_distances)
            if idx != -1  # -1 marks an unfilled result slot
        }

    return results

def load_qrels(qrels_path):
    """
    Load relevance judgments from TREC qrels file

    Blank or whitespace-only lines are skipped, so a trailing newline at the
    end of the file does not abort loading.

    :param qrels_path: Path to qrels file
    :return: Dictionary of relevance judgments: query_id -> {doc_id: relevance}
    :raises ValueError: If a non-blank line does not have exactly four
        whitespace-separated fields, or its relevance is not an integer
    """
    qrels = {}
    with open(qrels_path, 'r') as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                continue
            # Typical TREC qrels format: query_id 0 doc_id relevance
            if len(fields) != 4:
                raise ValueError(
                    f"{qrels_path}:{line_no}: expected 4 fields "
                    f"(query_id iteration doc_id relevance), got {len(fields)}"
                )
            query_id, _, doc_id, relevance = fields
            qrels.setdefault(query_id, {})[doc_id] = int(relevance)
    return qrels

def load_queries(queries_file):
    """
    Read a JSON file of query records and split it into parallel lists.

    Each record is looked up with ``.get``: a missing 'text' becomes an empty
    string, and a missing 'id' becomes the record's position as a string.

    :param queries_file: Path to queries JSON file
    :return: Tuple of (query_texts, query_ids)
    """
    with open(queries_file, 'r') as handle:
        records = json.load(handle)

    query_texts = []
    query_ids = []
    for position, record in enumerate(records):
        query_texts.append(record.get('text', ''))
        # Positional fallback keeps every query addressable
        query_ids.append(record.get('id', str(position)))

    return query_texts, query_ids

def compute_aggregated_measures(metrics):
    """
    Summarise per-query scores into mean / median / std per measure.

    Only a fixed set of measure names is considered; a measure that no query
    reports is left out of the result entirely.

    :param metrics: Dictionary of per-query metrics from pytrec_eval
    :return: Dictionary with '<measure>_mean', '<measure>_median' and
        '<measure>_std' entries
    """
    tracked_measures = (
        'ndcg', 'map', 'recip_rank',
        'P_5', 'P_10', 'P_20',
        'recall_5', 'recall_10', 'recall_20',
    )
    reducers = (('mean', np.mean), ('median', np.median), ('std', np.std))

    summary = {}
    for name in tracked_measures:
        # Gather this measure from every query that reports it
        observed = [
            per_query[name]
            for per_query in metrics.values()
            if name in per_query
        ]
        if not observed:
            continue
        for suffix, reducer in reducers:
            summary[f'{name}_{suffix}'] = reducer(observed)

    return summary

def encode_document(doc):
    """
    Embed ``doc`` with the notebook-level ``model``.

    :param doc: Input handed unchanged to ``model.encode``
    :return: Whatever ``model.encode`` returns for ``doc``
    """
    return model.encode(doc)

def normalize_vectors(vectors):
    """
    Scale vectors to unit L2 norm using scikit-learn's ``normalize``.

    :param vectors: Array of vectors to rescale
    :return: The L2-normalized array
    """
    unit_vectors = normalize(vectors, norm='l2')
    return unit_vectors

# Pipeline implementation

In [None]:

# Sentence-transformers checkpoints to benchmark. Order matters: it is the
# order in which the models are evaluated below.
models_to_evaluate = [
    # multi-qa (dot) and the first cosine mpnet variant
    'multi-qa-distilbert-dot-v1', 'multi-qa-MiniLM-L6-dot-v1',
    'multi-qa-mpnet-base-cos-v1',
    # general-purpose "all-*" models
    'all-mpnet-base-v2', 'all-distilroberta-v1',
    'all-MiniLM-L12-v2', 'all-MiniLM-L6-v2',
    # remaining multi-qa variants
    'multi-qa-distilbert-cos-v1', 'multi-qa-MiniLM-L6-cos-v1',
    'multi-qa-mpnet-base-dot-v1',
    # multilingual distiluse models
    'distiluse-base-multilingual-cased-v1', 'distiluse-base-multilingual-cased-v2',
]

# Queries (texts plus ids) and relevance judgments are shared by every model
query_texts, query_ids = load_queries('/kaggle/input/query-and-qrels/queries.json')
qrels = load_qrels('/kaggle/input/query-and-qrels/filtered_data.txt')

# model name -> aggregated metrics, filled in by the evaluation loop
all_results = dict()

def create_faiss_index(embeddings, model_name):
    """
    Build an exact inner-product FAISS index over L2-normalized embeddings.

    Every model is indexed the same way: vectors are normalized to unit
    length, so inner product equals cosine similarity. An earlier per-model
    switch (raw vectors with an L2 index for non-'cos' models) is disabled.

    :param embeddings: 2D array of document embeddings, one row per document
    :param model_name: Currently unused; kept so existing callers keep working
    :return: ``faiss.IndexFlatIP`` containing the normalized embeddings
    """
    # Normalize in the input precision first, then hand FAISS the C-contiguous
    # float32 matrix it works with (embeddings read from CSV are float64).
    normalized_embeddings = np.ascontiguousarray(
        normalize_vectors(embeddings), dtype=np.float32
    )

    dim = normalized_embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(normalized_embeddings)

    return index



# The qrels and the requested measures are the same for every model, so the
# evaluator is built once here instead of once per loop iteration.
evaluator = pytrec_eval.RelevanceEvaluator(
    qrels,
    {
        'ndcg', 'map', 'recip_rank',
        'P_5', 'P_10',
        'recall_5', 'recall_10',
    }
)

for model_name in models_to_evaluate:
    print(f"\nEvaluating model: {model_name}")
    # Load the precomputed document embeddings for the current model
    embeddings_file = f'/kaggle/input/{model_name.lower()}-ft-embeddings/{model_name}_FT_embeddings.csv'
    df = pd.read_csv(embeddings_file)

    # Split the frame into document ids and the embedding matrix.
    # NOTE: `docnos` and `index` are read as globals by retrieve().
    docnos = df['DOCNO'].values
    embeddings = df.drop(columns=['DOCNO']).values

    # Build the FAISS index (same normalized inner-product index for all models)
    index = create_faiss_index(embeddings, model_name)

    # Load model and encode queries
    model = SentenceTransformer(f'sentence-transformers/{model_name}')
    query_embeddings = model.encode(query_texts)

    # Queries are normalized for every model, matching the normalized index
    query_embeddings = normalize_vectors(query_embeddings)

    # Retrieve results; run keys are query positions, so map them back to ids
    run = retrieve(query_embeddings)
    run_with_query_ids = {
        query_ids[int(k)]: v for k, v in run.items()
    }

    # Prepare for evaluation: coerce ids to str and scores to Python floats
    corrected_version = {
        str(query_id): {str(doc_id): float(score) for doc_id, score in doc_scores.items()}
        for query_id, doc_scores in run_with_query_ids.items()
    }

    # Evaluate
    metrics = evaluator.evaluate(corrected_version)
    aggregated_measures = compute_aggregated_measures(metrics)

    # Store results
    all_results[model_name] = aggregated_measures


In [None]:

# Sort models by NDCG score (best first)
ndcg_scores = {name: scores['ndcg_mean'] for name, scores in all_results.items()}
sorted_models = dict(sorted(ndcg_scores.items(), key=lambda item: item[1], reverse=True))

# Create sorted results dictionary
sorted_results = {name: all_results[name] for name in sorted_models}

# Create and save sorted DataFrame
results_df = pd.DataFrame.from_dict(sorted_results, orient='index')
results_df.to_csv('model_evaluation_results.csv')

# Display ranking
print("\nModel Ranking by NDCG (Normalized Discounted Cumulative Gain):")
print("-" * 50)
# The loop variable is deliberately not called `model`: that would rebind the
# notebook-level SentenceTransformer used by encode_document() to a string.
for rank, (ranked_model, ndcg_score) in enumerate(sorted_models.items(), 1):
    print(f"{rank}. {ranked_model:<35} NDCG: {ndcg_score:.4f}")

print("\nDetailed Results (sorted by NDCG):")
# Show every column of the results table without truncation
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
print(results_df)
