In [None]:
# Install dependencies
!pip install -q sentence-transformers
!pip install -q faiss-cpu
!pip install -q beir
!pip install -q rank-bm25
!pip install -q pandas matplotlib seaborn

print("‚úÖ Dependencies installed")

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import re
from tqdm.auto import tqdm

from sentence_transformers import SentenceTransformer
import faiss

from beir import util
from beir.datasets.data_loader import GenericDataLoader

# BM25
from rank_bm25 import BM25Okapi

# Notebook-wide plotting defaults: seaborn grid theme and a 10x6 inch canvas
# for any figure that does not pass its own figsize.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

print("‚úÖ Libraries imported")

## Dataset Selection
Choose a BEIR dataset to benchmark. The default is FiQA (~57k documents), a medium-scale corpus.

In [None]:
# Select dataset: 'scifact', 'fiqa', 'trec-covid', 'webis-touche2020', 'quora', 'robust04', 'trec-news', or 'nq'
dataset_name = 'fiqa'  # pick from the list above; 'nq' is very large

# Every archive lives under the same base URL as <name>.zip, so the URL table
# is derived from the dataset names instead of repeating the prefix.
beir_base_url = 'https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets'
dataset_names = [
    'scifact',           # ~5k docs
    'fiqa',              # ~57k docs
    'trec-covid',        # ~171k docs
    'webis-touche2020',  # ~382k docs
    'quora',             # ~523k docs
    'robust04',          # ~528k docs  NOTE(review): may not be publicly downloadable - confirm
    'trec-news',         # ~595k docs  NOTE(review): may not be publicly downloadable - confirm
    # Note: NQ is very large; ensure sufficient resources
    'nq',                # ~2.6M docs
]
dataset_urls = {name: f'{beir_base_url}/{name}.zip' for name in dataset_names}

# Fail with an actionable message on a typo instead of a bare KeyError.
if dataset_name not in dataset_urls:
    raise ValueError(
        f"Unknown dataset {dataset_name!r}; choose one of: {', '.join(dataset_urls)}"
    )

url = dataset_urls[dataset_name]
print(f"Downloading {dataset_name} dataset...")
data_path = util.download_and_unzip(url, "datasets")

print("Loading dataset...")
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

# qrels maps query id -> {doc id: relevance grade}; len(qrels) would only count
# the judged queries, so sum the per-query dicts to get the number of judgments.
num_judgments = sum(len(doc_grades) for doc_grades in qrels.values())

print(f"\n✅ Dataset loaded!\n   Documents: {len(corpus):,}\n   Queries: {len(queries):,}\n   Relevance judgments: {num_judgments:,}")

## Prepare Model and Texts
Memory-safe batching for larger corpora.

In [None]:
# Instantiate the sentence-embedding model and record its output width,
# which every FAISS index below is sized with.
model_name = 'BAAI/bge-base-en-v1.5'
print(f"Loading model: {model_name}")
model = SentenceTransformer(model_name)
dimension = model.get_sentence_embedding_dimension()

# Flatten corpus and queries into parallel id/text lists. Position i in the
# *_ids list always corresponds to position i in the *_texts list
# (dict keys and values iterate in the same order).
doc_ids = list(corpus)
doc_texts = [doc['title'] + ' ' + doc['text'] for doc in corpus.values()]
query_ids = list(queries)
query_texts = list(queries.values())

print(f"‚úÖ Model loaded (dim={dimension})")

In [None]:
# Encode documents with memory-safe batching
batch_size_docs = 32 if len(doc_texts) <= 100_000 else 16
print(f"Encoding {len(doc_texts):,} documents (batch_size={batch_size_docs})...")

doc_embeddings = model.encode(
    doc_texts,
    batch_size=batch_size_docs,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True
)

print(f"‚úÖ Documents encoded! Shape: {doc_embeddings.shape}, Memory: {doc_embeddings.nbytes / (1024**2):.2f} MB")

# Encode queries
batch_size_queries = 32
print(f"Encoding {len(query_texts):,} queries (batch_size={batch_size_queries})...")
query_embeddings = model.encode(
    query_texts,
    batch_size=batch_size_queries,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True
)
print(f"‚úÖ Queries encoded! Shape: {query_embeddings.shape}")

## Build Indexes
Build three FAISS indexes: an exact Flat baseline, HNSW for approximate search on medium/large datasets, and HNSW with INT8 scalar quantization for reduced memory.

In [None]:
# The indexes are fed float32 matrices. Convert once up front: the previous
# code called .astype('float32') before every add/train, and each call
# allocates a fresh full-size copy of the corpus embeddings inside the timed
# region. ascontiguousarray is a no-op when the data is already float32.
doc_vectors = np.ascontiguousarray(doc_embeddings, dtype='float32')

# Flat Index (baseline): exhaustive inner-product search.
print("Building Flat Index (exact search)...")
start_time = time.perf_counter()
flat_index = faiss.IndexFlatIP(dimension)
flat_index.add(doc_vectors)
build_time_flat = time.perf_counter() - start_time
print(f"✅ Flat Index built in {build_time_flat:.3f}s (vectors={flat_index.ntotal:,})")

# HNSW Index. Larger corpora (>= 200k docs) get a denser graph and a wider
# search beam.
# NOTE(review): both HNSW indexes are built without an explicit metric, unlike
# the inner-product flat index; presumably they use FAISS's default metric, so
# their 'scores' may be distances rather than similarities - confirm before
# comparing raw scores across indexes (only the rankings are used below).
M = 16 if len(doc_ids) < 200_000 else 32
ef_construction = 100
ef_search = 50 if len(doc_ids) < 200_000 else 100

print("Building HNSW Index...")
start_time = time.perf_counter()
hnsw_index = faiss.IndexHNSWFlat(dimension, M)
hnsw_index.hnsw.efConstruction = ef_construction  # must be set before add()
hnsw_index.add(doc_vectors)
hnsw_index.hnsw.efSearch = ef_search
build_time_hnsw = time.perf_counter() - start_time
print(f"✅ HNSW built in {build_time_hnsw:.3f}s (M={M}, efSearch={ef_search})")

# HNSW + INT8 Quantization: same graph parameters, vectors stored with the
# 8-bit scalar quantizer, which has to be trained before vectors are added.
print("Building HNSW INT8 Index...")
start_time = time.perf_counter()
hnsw_int8_index = faiss.IndexHNSWSQ(dimension, faiss.ScalarQuantizer.QT_8bit, M)
hnsw_int8_index.hnsw.efConstruction = ef_construction
hnsw_int8_index.train(doc_vectors)
hnsw_int8_index.add(doc_vectors)
hnsw_int8_index.hnsw.efSearch = ef_search
build_time_hnsw_int8 = time.perf_counter() - start_time
print(f"✅ HNSW INT8 built in {build_time_hnsw_int8:.3f}s")

## BM25 Baseline
Tokenize and build BM25 index over documents.

In [None]:
def simple_tokenize(text):
    """Lower-case `text` and split it into word tokens.

    A token is a maximal run of word characters (letters, digits,
    underscore); punctuation and whitespace act as separators.

    Returns:
        list[str]: the tokens in order of appearance (empty for empty input).
    """
    return re.findall(r'\b\w+\b', text.lower())

# Tokenise the same "title + text" strings that were embedded above
# (doc_texts is ordered like doc_ids), so BM25 row i is document doc_ids[i].
print("Tokenizing documents for BM25...")
doc_texts_tokenized = list(map(simple_tokenize, doc_texts))

# Only the index construction is timed; tokenisation is excluded.
print("Building BM25 index...")
start_time = time.time()
bm25_index = BM25Okapi(doc_texts_tokenized)
build_time_bm25 = time.time() - start_time
print(f"‚úÖ BM25 built in {build_time_bm25:.3f}s")

## Search Functions
Shared utilities to run and measure searches.

In [None]:
def search_and_measure(index, query_embeddings, k=10, name="Index"):
    """Run one top-k search per query against `index` and record latency.

    Args:
        index: object exposing `search(x, k)` that returns `(scores, indices)`
            for a `(1, dim)` float32 query matrix (a FAISS index in this
            notebook).
        query_embeddings: one embedding per query; converted to float32 once.
        k: number of results requested per query.
        name: label used in progress output and stored in the result.

    Returns:
        dict with keys 'name', 'indices' and 'scores' (one row per query),
        'latencies' (per-query milliseconds) and the 'median_latency',
        'p95_latency' and 'p99_latency' summaries.
    """
    print(f"\nSearching with {name}...")
    # Convert once so the dtype conversion is not charged to every query.
    query_matrix = np.asarray(query_embeddings, dtype='float32')
    latencies = []; all_indices = []; all_scores = []
    for query_vec in tqdm(query_matrix, desc=f"{name} search"):
        single_query = query_vec.reshape(1, -1)
        # perf_counter is monotonic and high-resolution; time.time() can be
        # too coarse for sub-millisecond searches and can jump if the system
        # clock is adjusted.
        start = time.perf_counter()
        scores, indices = index.search(single_query, k)
        latencies.append((time.perf_counter() - start) * 1000)
        all_indices.append(indices[0])
        all_scores.append(scores[0])
    latencies = np.array(latencies)
    return {
        'name': name,
        'indices': np.array(all_indices),
        'scores': np.array(all_scores),
        'latencies': latencies,
        'median_latency': np.median(latencies),
        'p95_latency': np.percentile(latencies, 95),
        'p99_latency': np.percentile(latencies, 99),
    }

def search_bm25(bm25_index, query_texts, k=10):
    """Run one BM25 top-k search per query text and record latency.

    The timed region covers tokenisation, scoring and top-k selection, the
    same steps that hybrid_search times for its sparse half. Previously only
    `get_scores` was timed, which understated BM25 latency relative to the
    other methods.

    Args:
        bm25_index: object exposing `get_scores(tokens)` that returns one
            score per indexed document as a NumPy array.
        query_texts: iterable of raw query strings.
        k: number of results kept per query.

    Returns:
        dict with keys 'name' (always 'BM25'), 'indices' and 'scores' (one
        row per query, best first), 'latencies' (per-query milliseconds) and
        the 'median_latency', 'p95_latency' and 'p99_latency' summaries.
    """
    print("\nSearching with BM25...")
    latencies = []; all_indices = []; all_scores = []
    for query in tqdm(query_texts, desc="BM25 search"):
        # perf_counter: monotonic, high-resolution clock for short intervals.
        start = time.perf_counter()
        tokens = simple_tokenize(query)
        scores = bm25_index.get_scores(tokens)
        top_idx = np.argsort(-scores)[:k]  # highest score first
        latencies.append((time.perf_counter() - start) * 1000)
        all_indices.append(top_idx)
        all_scores.append(scores[top_idx])
    latencies = np.array(latencies)
    return {
        'name': 'BM25',
        'indices': np.array(all_indices),
        'scores': np.array(all_scores),
        'latencies': latencies,
        'median_latency': np.median(latencies),
        'p95_latency': np.percentile(latencies, 95),
        'p99_latency': np.percentile(latencies, 99),
    }

def merge_rankings(dense_indices, dense_scores, sparse_indices, sparse_scores, k=10, alpha=0.5, rrf_k=60):
    """Fuse a dense and a sparse ranking with weighted reciprocal-rank fusion.

    Each document receives `alpha / (rrf_k + rank)` from the dense list and
    `(1 - alpha) / (rrf_k + rank)` from the sparse list (ranks start at 1);
    the contributions are summed and the k best documents are returned.

    Args:
        dense_indices: corpus positions from the dense retriever, best first.
        dense_scores: unused (fusion is rank-based); kept so existing callers
            keep working.
        sparse_indices: corpus positions from the sparse retriever, best first.
        sparse_scores: unused; kept for interface compatibility.
        k: maximum number of fused results to return.
        alpha: weight of the dense ranking; the sparse ranking gets 1 - alpha.
        rrf_k: rank-smoothing constant of the fusion formula.

    Returns:
        np.ndarray of int64 corpus positions, best first, at most k entries.
    """
    fused = {}
    weighted_rankings = ((alpha, dense_indices), (1 - alpha, sparse_indices))
    for weight, ranking in weighted_rankings:
        for rank, idx in enumerate(ranking, 1):
            idx = int(idx)
            # A negative position is a "no result" placeholder (the evaluation
            # code filters idx < 0 for the same reason). Used as a list index
            # it would silently select the last document, so skip it.
            if idx < 0:
                continue
            # Accumulate directly on the corpus position: the previous
            # position -> doc id -> doc_ids.index(doc id) round trip scanned
            # the whole id list once per returned result.
            fused[idx] = fused.get(idx, 0) + weight / (rrf_k + rank)
    # sorted() is stable, so ties keep first-seen order (dense before sparse).
    best = sorted(fused.items(), key=lambda item: item[1], reverse=True)[:k]
    return np.array([idx for idx, _ in best], dtype=np.int64)

def hybrid_search(dense_index, query_embeddings, bm25_index, query_texts, alpha=0.5, k=10):
    """Run dense + BM25 retrieval per query and fuse the two top-k lists.

    For every query the dense index and BM25 are each asked for k results,
    which are combined by `merge_rankings` with dense weight `alpha`. The
    per-query latency covers the dense search, tokenisation, BM25 scoring,
    top-k selection and fusion.

    Args:
        dense_index: object exposing `search(x, k)` -> `(scores, indices)`.
        query_embeddings: one embedding per query, aligned with `query_texts`.
        bm25_index: object exposing `get_scores(tokens)`.
        query_texts: raw query strings, aligned with `query_embeddings`.
        alpha: weight of the dense ranking in the fusion.
        k: number of results requested from each retriever and returned.

    Returns:
        dict with keys 'name', 'indices' (fused corpus positions per query),
        'latencies' (per-query milliseconds) and the 'median_latency',
        'p95_latency' and 'p99_latency' summaries. No 'scores' key is
        returned.
    """
    print(f"\nSearching with Hybrid (α={alpha})...")
    # Convert once so the dtype conversion is not charged to every query.
    query_matrix = np.asarray(query_embeddings, dtype='float32')
    latencies = []; all_indices = []
    for query_vec, query in tqdm(zip(query_matrix, query_texts), total=len(query_matrix), desc="Hybrid search"):
        # perf_counter: monotonic, high-resolution clock for short intervals.
        start = time.perf_counter()
        dscores, dindices = dense_index.search(query_vec.reshape(1, -1), k)
        dindices = dindices[0]; dscores = dscores[0]
        tokens = simple_tokenize(query)
        sscores = bm25_index.get_scores(tokens)
        sindices = np.argsort(-sscores)[:k]  # highest BM25 score first
        sscores = sscores[sindices]
        merged = merge_rankings(dindices, dscores, sindices, sscores, k=k, alpha=alpha)
        latencies.append((time.perf_counter() - start) * 1000)
        all_indices.append(merged)
    latencies = np.array(latencies)
    return {
        # The label ends up in the comparison table, the CSVs and the plot
        # annotations, so it must contain a real alpha, not mojibake.
        'name': f"Hybrid (α={alpha})",
        'indices': np.array(all_indices),
        'latencies': latencies,
        'median_latency': np.median(latencies),
        'p95_latency': np.percentile(latencies, 95),
        'p99_latency': np.percentile(latencies, 99),
    }

## Run Searches
Collect top-10 results and latency stats.

In [None]:
k = 10

# Dense retrieval: the exact baseline followed by the two approximate indexes.
results_flat = search_and_measure(flat_index, query_embeddings, name="Flat", k=k)
results_hnsw = search_and_measure(hnsw_index, query_embeddings, name="HNSW", k=k)
results_hnsw_int8 = search_and_measure(hnsw_int8_index, query_embeddings, name="HNSW-INT8", k=k)

# Sparse retrieval.
results_bm25 = search_bm25(bm25_index, query_texts, k=k)

# Hybrid retrieval: fuse HNSW and BM25 rankings once per dense weight alpha.
alpha_values = [0.3, 0.5, 0.7]
hybrid_results = []
for alpha in alpha_values:
    hybrid_results.append(
        hybrid_search(hnsw_index, query_embeddings, bm25_index, query_texts, k=k, alpha=alpha)
    )
print("‚úÖ Searches complete")

## Evaluation
Compute Recall@10 and nDCG@10.

In [None]:
def calculate_recall(retrieved_indices, qrels, query_ids, doc_ids, k=10):
    """Mean Recall@k over the queries that have at least one relevant document.

    Args:
        retrieved_indices: per-query sequences of corpus positions, aligned
            with `query_ids`; negative positions are ignored.
        qrels: mapping query id -> {doc id: relevance grade}.
        query_ids: query ids in the same order as `retrieved_indices`.
        doc_ids: corpus position -> doc id lookup.
        k: cut-off; only the first k retrieved positions are considered.

    Returns:
        Mean over queries of |relevant ∩ retrieved@k| / |relevant|, or 0.0
        when no query qualifies.
    """
    recalls = []
    for i, qid in enumerate(query_ids):
        if qid not in qrels:
            continue
        # Only documents with a positive grade are relevant. Counting every
        # judged document would treat explicit grade-0 (non-relevant)
        # judgments as relevant and inflate the denominator.
        relevant_docs = {doc_id for doc_id, grade in qrels[qid].items() if grade > 0}
        if not relevant_docs:
            continue
        retrieved_docs = {doc_ids[idx] for idx in retrieved_indices[i][:k] if idx >= 0}
        recalls.append(len(relevant_docs & retrieved_docs) / len(relevant_docs))
    return np.mean(recalls) if recalls else 0.0

def calculate_ndcg(retrieved_indices, qrels, query_ids, doc_ids, k=10):
    """Mean nDCG@k over the queries present in `qrels`.

    Uses the exponential gain `2**grade - 1` with a `log2(position + 1)`
    discount (positions start at 1). The ideal ranking is the k highest
    grades judged for the query. Queries whose ideal DCG is zero contribute
    0; queries missing from `qrels` are skipped. Negative corpus positions
    in `retrieved_indices` are dropped before scoring.

    Returns:
        The mean per-query nDCG, or 0.0 when no query was scored.
    """
    def discounted_gain(grade, position):
        return (2 ** grade - 1) / np.log2(position + 1)

    per_query = []
    for row, qid in enumerate(query_ids):
        if qid not in qrels:
            continue
        grades = qrels[qid]
        ranked_docs = [doc_ids[pos] for pos in retrieved_indices[row][:k] if pos >= 0]

        # Unjudged documents count as grade 0.
        dcg = sum(
            discounted_gain(grades.get(doc_id, 0), position)
            for position, doc_id in enumerate(ranked_docs, 1)
        )
        best_grades = sorted(grades.values(), reverse=True)[:k]
        idcg = sum(
            discounted_gain(grade, position)
            for position, grade in enumerate(best_grades, 1)
        )

        if idcg > 0:
            per_query.append(dcg / idcg)
        else:
            per_query.append(0)

    if not per_query:
        return 0.0
    return np.mean(per_query)

# Attach both quality metrics to every result dict (mutated in place), so the
# later table-building cells can read them next to the latency statistics.
evaluated_runs = [results_flat, results_hnsw, results_hnsw_int8, results_bm25, *hybrid_results]
for results in evaluated_runs:
    ranked = results['indices']
    results.update({
        'recall@10': calculate_recall(ranked, qrels, query_ids, doc_ids, k=10),
        'ndcg@10': calculate_ndcg(ranked, qrels, query_ids, doc_ids, k=10),
    })

print("‚úÖ Evaluation complete")

## Comparison Tables

In [None]:
# Pair every result dict with its method family. All rows are then built by
# one comprehension instead of four copy-pasted dicts plus a concat, so the
# dense, sparse and hybrid rows cannot drift apart.
method_runs = (
    [(run, 'Dense') for run in (results_flat, results_hnsw, results_hnsw_int8)]
    + [(results_bm25, 'Sparse')]
    + [(run, 'Hybrid') for run in hybrid_results]
)

# 'Method' comes from each run's own 'name' ('Flat', 'HNSW', 'HNSW-INT8',
# 'BM25', and the hybrid labels), matching the previous hard-coded labels.
comparison_df = pd.DataFrame([
    {
        'Method': run['name'],
        'Type': method_type,
        'Recall@10': run['recall@10'],
        'nDCG@10': run['ndcg@10'],
        'Median Latency (ms)': run['median_latency'],
        'P95 Latency (ms)': run['p95_latency'],
    }
    for run, method_type in method_runs
])

print("\n📊 COMPARISON TABLE")
print("="*100)
print(comparison_df.to_string(index=False))
print("="*100)

## Visualizations

In [None]:
# Create temp directory for plots
import os
temp_plots_dir = 'temp_plots'
os.makedirs(temp_plots_dir, exist_ok=True)

# Marker colour per method family; any type not listed is drawn as dense.
type_colors = {'Sparse': 'orange', 'Hybrid': 'green'}

# Speed vs Quality: one labelled point per method.
fig, ax = plt.subplots(figsize=(10, 6))
for _, row in comparison_df.iterrows():
    point = (row['Median Latency (ms)'], row['nDCG@10'])
    ax.scatter(*point, s=200, alpha=0.75, color=type_colors.get(row['Type'], 'steelblue'), edgecolors='black')
    ax.annotate(row['Method'], point, xytext=(8, 8), textcoords='offset points', fontsize=9, fontweight='bold')
ax.set_xlabel('Median Latency (ms)')
ax.set_ylabel('nDCG@10')
# The title previously contained a mis-encoded em dash that was rendered
# into the saved PDF.
ax.set_title(f'Speed vs Quality — {dataset_name}')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plot1_path = os.path.join(temp_plots_dir, f'speed_vs_quality_{dataset_name}.pdf')
fig.savefig(plot1_path, dpi=300, bbox_inches='tight')
plt.show()
plt.close(fig)  # release the figure; harmless if the backend already closed it
print(f"✅ Plot saved as {plot1_path}")

# Bar chart quality
fig, ax = plt.subplots(figsize=(12, 5))
ax.bar(comparison_df['Method'], comparison_df['nDCG@10'], color='skyblue', edgecolor='black')
ax.set_ylabel('nDCG@10')
ax.set_title(f'Quality Comparison — {dataset_name}')
ax.grid(True, alpha=0.3, axis='y')
plt.xticks(rotation=30, ha='right')
fig.tight_layout()
plot2_path = os.path.join(temp_plots_dir, f'quality_comparison_{dataset_name}.pdf')
fig.savefig(plot2_path, dpi=300, bbox_inches='tight')
plt.show()
plt.close(fig)
print(f"✅ Plot saved as {plot2_path}")

## Save Results

In [None]:
# Detect environment and set output directory
import os
import shutil

def detect_environment():
    """Classify the runtime as 'colab', 'kaggle', 'modal' or 'local'.

    Detection relies only on the presence of marker environment variables,
    checked in the order Colab, Kaggle, Modal. The first match wins and
    'local' is returned when none of them is set.
    """
    markers = (
        ('colab', ('COLAB_GPU', 'COLAB_TPU_ADDR')),
        ('kaggle', ('KAGGLE_KERNEL_RUN_TYPE',)),
        ('modal', ('MODAL_PROJECT_NAME',)),
    )
    for label, variables in markers:
        if any(variable in os.environ for variable in variables):
            return label
    return 'local'

environment = detect_environment()
# str has no UPPER() method; the previous call raised AttributeError and
# stopped the cell before anything was saved.
print(f"🔍 Detected environment: {environment.upper()}")

# Set output directory based on environment
if environment == 'colab':
    # For Colab, save to /content/results/
    output_dir = '/content/results'
    save_message = f"💾 Saving to: {output_dir}"
elif environment == 'kaggle':
    # For Kaggle, save to /kaggle/working/
    output_dir = '/kaggle/working'
    save_message = f"💾 Saving to: {output_dir}"
elif environment == 'modal':
    # For Modal, save to a results directory (often mapped to a Volume)
    output_dir = f'/root/results/{dataset_name}'
    save_message = f"💾 Saving to Modal Volume path: {output_dir}"
else:
    # For local, save to a per-dataset results folder under the working directory
    output_dir = f'{dataset_name}_results'
    save_message = f"💾 Saving to: {output_dir}/"

# exist_ok makes this a no-op where the directory already exists, so it is
# safe to run for every environment.
os.makedirs(output_dir, exist_ok=True)
print(save_message)

# --- Full comparison table (all methods) ---
comparison_path = os.path.join(output_dir, f'experiment_results_{dataset_name}.csv')
comparison_df.to_csv(comparison_path, index=False)
print(f"‚úÖ Saved: {comparison_path}")

# --- Raw per-query latencies, one column per non-hybrid method ---
latency_sources = [
    ('Flat', results_flat),
    ('HNSW', results_hnsw),
    ('HNSW-INT8', results_hnsw_int8),
    ('BM25', results_bm25),
]
latency_df = pd.DataFrame({label: run['latencies'] for label, run in latency_sources})
latency_path = os.path.join(output_dir, f'latency_data_{dataset_name}.csv')
latency_df.to_csv(latency_path, index=False)
print(f"‚úÖ Saved: {latency_path}")

# --- Hybrid-only summary: CSV column -> key in each hybrid result dict ---
hybrid_columns = {
    'Method': 'name',
    'Recall@10': 'recall@10',
    'nDCG@10': 'ndcg@10',
    'Median Latency (ms)': 'median_latency',
    'P95 Latency (ms)': 'p95_latency',
}
hybrid_comparison_df = pd.DataFrame([
    {column: res[key] for column, key in hybrid_columns.items()}
    for res in hybrid_results
])
hybrid_path = os.path.join(output_dir, f'hybrid_results_{dataset_name}.csv')
hybrid_comparison_df.to_csv(hybrid_path, index=False)
print(f"‚úÖ Saved: {hybrid_path}")

# Mirror the PDFs written by the plotting cell into the results directory.
# A missing temp directory or missing plot file is silently skipped.
temp_plots_dir = 'temp_plots'
if os.path.exists(temp_plots_dir):
    plot_files = [
        f'speed_vs_quality_{dataset_name}.pdf',
        f'quality_comparison_{dataset_name}.pdf'
    ]
    for plot_file in plot_files:
        src = os.path.join(temp_plots_dir, plot_file)
        if not os.path.exists(src):
            continue
        dst = os.path.join(output_dir, plot_file)
        shutil.copy2(src, dst)  # copy2 also carries over file metadata
        print(f"‚úÖ Copied plot: {dst}")

# The user-facing messages below previously contained mis-encoded characters
# (UTF-8 emoji and arrows decoded with the wrong codec); they are restored to
# the intended symbols.
print("\n" + "="*80)
print(f"📁 All results saved to: {output_dir}/")
print("="*80)

# Environment-specific download instructions and automatic downloads
if environment == 'colab':
    print("\n📥 AUTO-DOWNLOADING FILES TO YOUR PC...")
    try:
        from google.colab import files
        # Download all result files
        for filename in os.listdir(output_dir):
            filepath = os.path.join(output_dir, filename)
            if os.path.isfile(filepath):
                print(f"   📦 Downloading: {filename}")
                files.download(filepath)
        print("✅ All files downloaded to your PC!")
    except Exception as e:
        # Best effort: the files are already on the Colab disk, so report the
        # failure and fall back to manual instructions instead of aborting.
        print(f"⚠️ Auto-download failed: {e}")
        print("\n📥 MANUAL DOWNLOAD INSTRUCTIONS:")
        print("   1. Click the folder icon on the left sidebar")
        print(f"   2. Navigate to {output_dir}/")
        print("   3. Right-click each file → Download")

elif environment == 'kaggle':
    print("\n📥 FILES READY FOR DOWNLOAD:")
    print("   1. Click 'Save Version' → 'Save & Run All'")
    print("   2. Once complete, go to the 'Output' tab")
    print("   3. Download the CSV and PDF files directly")

elif environment == 'modal':
    print("\n📥 TO ACCESS FILES IN MODAL:")
    print(f"   1. Files are stored in the volume at: {output_dir}")
    print("   2. Use 'modal volume get <volume_name> <remote_path> <local_path>' to download")

else:
    print(f"\n📂 Files saved locally in: {os.path.abspath(output_dir)}/")
    print("✅ All files (CSVs and plots) are already on your PC!")

# Create summary report
# "Best quality" is the row with the highest Recall@10 and "fastest" the row
# with the lowest median latency; a single .loc[row, column] lookup replaces
# the chained indexing.
best_quality_method = comparison_df.loc[comparison_df['Recall@10'].idxmax(), 'Method']
fastest_method = comparison_df.loc[comparison_df['Median Latency (ms)'].idxmin(), 'Method']

print("\n" + "="*80)
print("📊 EXPERIMENT SUMMARY")
print("="*80)
print(f"Dataset: {dataset_name}")
print(f"Documents: {len(corpus):,}")
print(f"Queries: {len(queries):,}")
print(f"\nBest Quality Method: {best_quality_method}")
print(f"Fastest Method: {fastest_method}")
print("="*80)