In [10]:
import os
import json
import math
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from langchain_postgres import PGVector
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')
print('Imports ready')

Imports ready


In [11]:
# Paths. BASE defaults to the original author's checkout; set the SEMEVAL_BASE
# environment variable to point the notebook at a different checkout without
# editing this cell.
BASE = Path(os.environ.get('SEMEVAL_BASE', 'c:/Users/rayaa/OneDrive/Documents/VSCode/CSCI5832/Semeval'))

# Shared directory holding the cloud-domain queries and qrels.
_CLOUD_TASK_DIR = BASE / 'human' / 'retrieval_tasks_convid' / 'cloud'
FULL_QUERIES_PATH = _CLOUD_TASK_DIR / 'cloud_questions.jsonl'
REWRITE_QUERIES_PATH = _CLOUD_TASK_DIR / 'cloud_rewrite.jsonl'
QRELS_PATH = _CLOUD_TASK_DIR / 'qrels' / 'dev.tsv'
CORPUS_PATH = BASE / 'corpora' / 'passage_level' / 'cloud.jsonl'

# Hugging Face model name used to embed queries.
EMBED_MODEL = 'Snowflake/snowflake-arctic-embed-l-v2.0'
# IMPORTANT: whoever runs this notebook must provide a Postgres database;
# its connection settings are read from this dotenv file.
PG_ENV_PATH = BASE / '.pg_env'

In [12]:
# Load queries (jsonl) and qrels (tsv)
def load_queries(jsonl_path):
    """Read a JSONL query file into a ``{query_id: text}`` dict.

    Each non-blank line is parsed as a JSON object. The id is taken from the
    ``_id`` key (``None`` when absent) and the text from the ``text`` key
    (empty string when absent). Every ``|user|:`` marker is removed from the
    text and surrounding whitespace is stripped. Later lines overwrite
    earlier ones that share an id.
    """
    queries = {}
    with open(jsonl_path, 'r', encoding='utf-8') as fh:
        # Lazily strip each line, drop the blank ones, and decode the rest.
        records = (json.loads(raw) for raw in map(str.strip, fh) if raw)
        for record in records:
            cleaned = record.get('text', '').replace('|user|:', '').strip()
            queries[record.get('_id')] = cleaned
    return queries

def load_qrels(tsv_path):
    """Read a tab-separated qrels file into ``{query_id: {doc_id: score}}``.

    The file must have a header row with the columns ``query-id``,
    ``corpus-id`` and ``score``. Scores are converted to ``int``. When the
    same (query, document) pair appears more than once, the last row wins.
    """
    table = pd.read_csv(tsv_path, sep='\t')
    judgments = {}
    for _, record in table.iterrows():
        # Get (or create) the per-query mapping, then record this document's score.
        per_query = judgments.setdefault(record['query-id'], {})
        per_query[record['corpus-id']] = int(record['score'])
    return judgments

# Load both query variants (full conversation text and rewritten text) and the
# relevance judgments from the configured paths.
full_queries = load_queries(FULL_QUERIES_PATH)
rewrite_queries = load_queries(REWRITE_QUERIES_PATH)
qrels = load_qrels(QRELS_PATH)
# len(qrels) counts distinct query ids that have judgments, not individual rows.
print(f'Loaded {len(full_queries)} full queries, {len(rewrite_queries)} rewrite queries, and {len(qrels)} qrels entries')

Loaded 188 full queries, 188 rewrite queries, and 188 qrels entries


In [13]:
# Stream the corpus and set up retrieval backend (PGVector primary, local fallback)
def stream_corpus(jsonl_path, max_passages=None):
    """Read passage ids and texts from a JSONL corpus file.

    Blank or whitespace-only lines are skipped, as ``load_queries`` does,
    instead of raising ``json.JSONDecodeError``.

    Args:
        jsonl_path: Path to a file holding one JSON object per line.
        max_passages: Optional cap on the number of passages returned;
            ``None`` reads the whole file.

    Returns:
        A ``(ids, texts)`` pair of equally long lists. The id is the first
        truthy value among the ``id``, ``_id``, ``passage_id`` and ``docid``
        keys, always as ``str``; if none is truthy, ``line_<n>`` is used,
        where ``n`` is the zero-based physical line index. The text is the
        first truthy value among ``text``, ``contents``, ``passage`` and
        ``title``, or ``''``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    ids = []
    texts = []
    with open(jsonl_path, 'r', encoding='utf-8') as fh:
        for i, line in enumerate(fh):
            # Count collected passages, so skipped blank lines do not use up the cap.
            if max_passages is not None and len(ids) >= max_passages:
                break
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            pid = obj.get('id') or obj.get('_id') or obj.get('passage_id') or obj.get('docid')
            if pid is None:
                pid = f'line_{i}'
            text = obj.get('text') or obj.get('contents') or obj.get('passage') or obj.get('title') or ''
            ids.append(str(pid))
            texts.append(text)
    return ids, texts

# Try to connect to PGVector (preferred) using your .pg_env (as in embed_cloud_passages.ipynb)
# vectorstore stays None unless a connection string is found and PGVector is constructed below.
vectorstore = None
connection_string = None
# PG_CONNECTION_STRING is only read when the .pg_env file exists; the file is
# passed to load_dotenv first (presumably populating the process environment — confirm).
if PG_ENV_PATH.exists():
    print(f'Loading Postgres connection from {PG_ENV_PATH}')
    load_dotenv(PG_ENV_PATH)
    connection_string = os.getenv('PG_CONNECTION_STRING')

if connection_string:
    print('Initializing PGVector connection...')
    # Use the same embedding class you used when creating the DB entries (embed notebook used Snowflake)
    hf_emb = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
    vectorstore = PGVector(connection=connection_string, embeddings=hf_emb)
    print('PGVector ready')
else:
    # NOTE(review): the message mentions a local fallback, but this block only
    # leaves vectorstore as None; no .npy or local-embedding path is set up here.
    print('No PG connection found; falling back to local .npy or embedding from text if needed')

Loading Postgres connection from c:\Users\rayaa\OneDrive\Documents\VSCode\CSCI5832\Semeval\.pg_env
Initializing PGVector connection...
PGVector ready


In [14]:
# Retrieval + evaluation helpers
# retrieve will use PGVector if available

def retrieve(query_texts, model, top_k=100):
    """Retrieve a ranked list of document ids for every query.

    Args:
        query_texts: Mapping of query id to query text.
        model: Unused; kept so existing call sites keep working.
        top_k: Number of results requested from the vector store per query.

    Returns:
        Dict mapping each query id to a list of document-id strings, in the
        order the vector store returned them. When the module-level
        ``vectorstore`` is ``None`` an empty dict is returned, matching
        ``retrieve_with_scores``. Previously the function fell off the end
        and returned ``None`` in that case.
    """
    results = {}
    if vectorstore is None:
        return results

    # vectorstore.similarity_search_with_score returns list[(Document, score)]
    for qid, qtext in query_texts.items():
        docs_and_scores = vectorstore.similarity_search_with_score(qtext, k=top_k)
        retrieved = []
        for doc, _score in docs_and_scores:
            # Prefer an id stored in the metadata dict, if present.
            docid = None
            if hasattr(doc, 'metadata') and isinstance(doc.metadata, dict):
                docid = doc.metadata.get('doc_id') or doc.metadata.get('docid') or doc.metadata.get('id')
            # Otherwise fall back to Document.id, then to the first 64
            # characters of the page content.
            if not docid:
                docid = getattr(doc, 'id', None) or getattr(doc, 'page_content', '')[:64]
            retrieved.append(str(docid))
        results[qid] = retrieved
    return results


def compute_metrics(results, qrels, ks=(1,3,5,10,100)):
    """Compute mean retrieval metrics over all judged queries.

    Relevance is binary: a document is relevant when its qrels score is
    greater than zero, so explicit score-0 judgments no longer count as
    relevant. Queries with no relevant document are skipped.

    Args:
        results: Mapping of query id to a ranked list of document ids.
        qrels: Mapping of query id to ``{doc_id: score}``.
        ks: Cut-offs for which ``Recall@k`` is reported.

    Returns:
        Dict with keys ``Recall@k`` (one per ``k``), ``MRR``, ``MAP`` and
        ``NDCG@10``, each the mean over evaluated queries, or ``nan`` when no
        query was evaluated. ``Recall@k`` is the fraction of a query's
        relevant documents found in the top ``k``; previously it was a 0/1
        "at least one hit" indicator.
    """
    def _dcg_at_k(ranked_list, rel_set, k=10):
        # Binary-gain DCG: each relevant hit at 0-based position p adds 1 / log2(p + 2).
        return sum(
            1.0 / math.log2(pos + 2)
            for pos, docid in enumerate(ranked_list[:k])
            if docid in rel_set
        )

    # results: qid -> ranked list of docids
    metrics = {f'Recall@{k}': [] for k in ks}
    metrics['MRR'] = []
    metrics['MAP'] = []
    metrics['NDCG@10'] = []
    for qid, retrieved in results.items():
        rel_docs = {docid for docid, score in qrels.get(qid, {}).items() if score > 0}
        if not rel_docs:
            continue
        # Recall@k: share of the relevant documents present in the top k.
        for k in ks:
            found = len(rel_docs & set(retrieved[:k]))
            metrics[f'Recall@{k}'].append(found / len(rel_docs))
        # MRR: reciprocal rank of the first relevant document (0 if none retrieved).
        rr = 0.0
        for rank, docid in enumerate(retrieved, start=1):
            if docid in rel_docs:
                rr = 1.0 / rank
                break
        metrics['MRR'].append(rr)
        # AP (for MAP): precision at each relevant hit, normalised by the
        # total number of relevant documents.
        num_rel = 0
        sum_prec = 0.0
        for rank, docid in enumerate(retrieved, start=1):
            if docid in rel_docs:
                num_rel += 1
                sum_prec += num_rel / rank
        metrics['MAP'].append(sum_prec / len(rel_docs))
        # NDCG@10: the ideal ranking places every relevant document first.
        ideal_dcg = sum((1.0 / math.log2(i + 2)) for i in range(min(len(rel_docs), 10)))
        ndcg = (_dcg_at_k(retrieved, rel_docs, k=10) / ideal_dcg) if ideal_dcg > 0 else 0.0
        metrics['NDCG@10'].append(ndcg)
    # Aggregate means; nan marks metrics with no evaluated query.
    agg = {k: (np.mean(v) if len(v) else float('nan')) for k, v in metrics.items()}
    return agg

In [15]:
# Run retrieval on ALL queries
# NOTE(review): despite the printed messages, this cell performs no retrieval;
# it only builds the query subsets that have relevance judgments.
print("Running retrieval on all full queries...")
# Keep only the full-query ids that also appear in qrels.
all_qids_with_qrels = [q for q in full_queries.keys() if q in qrels]
all_full_queries = {qid: full_queries[qid] for qid in all_qids_with_qrels}

print("Running retrieval on all rewrite queries...")
# Same filter for the rewritten queries.
all_rqids_with_qrels = [q for q in rewrite_queries.keys() if q in qrels]
all_rewrite_queries = {qid: rewrite_queries[qid] for qid in all_rqids_with_qrels}

Running retrieval on all full queries...
Running retrieval on all rewrite queries...


In [16]:
def retrieve_with_scores(query_texts, top_k=100):
    """Retrieve ``(doc_id, score)`` pairs per query, where higher scores rank better.

    The value returned by the vector store alongside each document is treated
    as a distance (lower = closer) and negated, so the ordering is preserved
    while larger numbers mean "more similar". Returns ``{}`` when the
    module-level ``vectorstore`` is ``None``.
    """
    def _resolve_id(document):
        # Prefer an id from the metadata dict; otherwise use Document.id, then
        # the first 64 characters of the page content.
        meta = getattr(document, 'metadata', None)
        identifier = None
        if isinstance(meta, dict):
            identifier = meta.get('doc_id') or meta.get('docid') or meta.get('id')
        if not identifier:
            identifier = getattr(document, 'id', None) or getattr(document, 'page_content', '')[:64]
        return str(identifier)

    scored = {}
    if vectorstore is None:
        return scored

    for qid, qtext in tqdm(query_texts.items(), desc="Retrieving documents"):
        hits = vectorstore.similarity_search_with_score(qtext, k=top_k)
        # Negating the distance keeps the ranking and makes higher = more similar.
        scored[qid] = [(_resolve_id(document), float(-distance)) for document, distance in hits]
    return scored

def create_evaluation_format(results_with_scores, queries, collection_name="mt-rag-ibmcloud-elser-512-100-20240502"):
    """Convert scored retrieval results into the list of records written for evaluation.

    One record is produced per query id that is present in ``queries``; other
    ids are dropped. Each record carries the query id as ``task_id``, the
    collection name, and a ``contexts`` list of
    ``{"document_id", "score"}`` dicts in the retrieved order.
    """
    return [
        {
            "task_id": qid,
            "Collection": collection_name,
            "contexts": [
                {"document_id": doc_id, "score": similarity_score}
                for doc_id, similarity_score in retrieved_docs
            ],
        }
        for qid, retrieved_docs in results_with_scores.items()
        if qid in queries
    ]

def run_and_save_evaluation_script(queries, out_predictions_name, out_evaluation_name):
    """Retrieve for ``queries``, write predictions as JSONL, and run the evaluation script.

    Args:
        queries: Mapping of query id to query text.
        out_predictions_name: File stem (no extension) for the predictions
            file written under ``BASE / 'rayaan' / 'outputs'``.
        out_evaluation_name: File stem (no extension) for the evaluation
            output, passed to the script as ``--output_file``.

    Returns:
        None. Progress, the script's stdout/stderr and any non-zero exit
        status are printed.
    """
    import subprocess
    import sys

    out_predictions_file = out_predictions_name + '.jsonl'
    out_evaluation_file = out_evaluation_name + '.jsonl'

    # Run retrieval with PROPER scores
    print("Running retrieval with proper similarity scores...")
    results_with_scores = retrieve_with_scores(queries, top_k=100)

    # Create evaluation format
    print("Creating evaluation format...")
    evaluation_data = create_evaluation_format(results_with_scores, queries)

    # Save to file (one JSON record per line)
    output_dir = BASE / 'rayaan' / 'outputs'
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / out_predictions_file

    with open(output_file, 'w', encoding='utf-8') as f:
        for item in evaluation_data:
            f.write(json.dumps(item) + '\n')

    print(f"Saved evaluation data to: {output_file}")
    print(f"Processed {len(evaluation_data)} queries")

    # Run evaluation
    print("\nRunning evaluation script...")
    eval_script_path = BASE / 'scripts' / 'evaluation' / 'run_retrieval_eval.py'

    if not eval_script_path.exists():
        print(f"Evaluation script not found at: {eval_script_path}")
        return

    # Use the interpreter running this kernel rather than whatever "python"
    # resolves to on PATH, so the script sees the same environment and packages.
    result = subprocess.run([
        sys.executable, str(eval_script_path),
        '--input_file', str(output_file),
        '--output_file', str(output_dir / out_evaluation_file)
    ], capture_output=True, text=True)

    print("Evaluation script output:")
    print(result.stdout)
    if result.stderr:
        print("Errors:")
        print(result.stderr)
    # Report a failed run explicitly; stderr alone can hold harmless warnings.
    if result.returncode != 0:
        print(f"Evaluation script exited with status {result.returncode}")

In [17]:
# Evaluate retrieval with the full (un-rewritten) query text; writes the
# predictions and evaluation files under the given stems.
run_and_save_evaluation_script(all_full_queries, 'full_retrieval_predictions_cloud', 'full_retrieval_evaluated')

Running retrieval with proper similarity scores...


Retrieving documents:   0%|          | 0/188 [00:00<?, ?it/s]

Creating evaluation format...
Saved evaluation data to: c:\Users\rayaa\OneDrive\Documents\VSCode\CSCI5832\Semeval\rayaan\outputs\full_retrieval_predictions_cloud.jsonl
Processed 188 queries

Running evaluation script...
Evaluation script output:

collection_name: mt-rag-ibmcloud-elser-512-100-20240502
Retriever Evaluation Aggregate Scores: {'nDCG': [0.09574, 0.11955, 0.14179], 'Recall': [0.03945, 0.12438, 0.17261], 'collection': 'mt-rag-ibmcloud-elser-512-100-20240502', 'count': 188}
Weighted average Recall: [0.03945, 0.12438, 0.17261000000000004]
Weighted average nDCG: [0.09574, 0.11955, 0.14179]



In [18]:
# Evaluate retrieval with the rewritten queries; writes the predictions and
# evaluation files under the given stems.
run_and_save_evaluation_script(all_rewrite_queries, 'rewrite_retrieval_predictions_cloud', 'rewrite_retrieval_evaluated')

Running retrieval with proper similarity scores...


Retrieving documents:   0%|          | 0/188 [00:00<?, ?it/s]

Creating evaluation format...
Saved evaluation data to: c:\Users\rayaa\OneDrive\Documents\VSCode\CSCI5832\Semeval\rayaan\outputs\rewrite_retrieval_predictions_cloud.jsonl
Processed 188 queries

Running evaluation script...
Evaluation script output:

collection_name: mt-rag-ibmcloud-elser-512-100-20240502
Retriever Evaluation Aggregate Scores: {'nDCG': [0.28723, 0.28564, 0.31517], 'Recall': [0.1484, 0.28138, 0.34778], 'collection': 'mt-rag-ibmcloud-elser-512-100-20240502', 'count': 188}
Weighted average Recall: [0.1484, 0.28138, 0.34778]
Weighted average nDCG: [0.28723, 0.28564, 0.31517]

