# 📥 Notebook 1: Dataset Loading & Embedding Generation
**Author:** Gabriele Righi

**Project:** Dense vs Sparse Retrieval Reproducibility

## 🎯 Objective
This is the foundational step of the project. It prepares the data for all subsequent experiments.
1.  **Data Acquisition:** Downloads the **Natural Questions (NQ)** dataset using the `BEIR` framework.
2.  **Environment Setup:** Installs Java (required for Pyserini/BM25 later) and GPU-optimized libraries.
3.  **Dense Encoding:** Uses the **BGE-Base (v1.5)** model to convert 2.6 million documents into dense vectors (embeddings).

## ⚙️ Key Outputs
This notebook saves the following artifacts to disk (which are then used by Notebooks 2 and 3):
* `doc_embeddings.npy`: The massive array of document vectors.
* `query_embeddings.npy`: The encoded test queries.

---

In [None]:
# Install specific versions to ensure compatibility between beir, pyserini and transformers
# Step 1: remove any copies already present in the environment (presumably so the
# pinned versions installed below are the ones that get imported).
!pip uninstall -y faiss-gpu faiss-cpu sentence-transformers transformers huggingface_hub
# Step 2: reinstall the CPU build of FAISS (the GPU build was uninstalled above).
!pip install faiss-cpu
# Step 3: huggingface-hub, transformers and sentence-transformers are pinned;
# pyserini, beir and the analysis/plotting packages are left unpinned.
!pip install huggingface-hub==0.23.0 transformers==4.36.2 sentence-transformers==2.2.2 pyserini beir pandas matplotlib seaborn scipy

# Install Java 21 for Lucene (required by Pyserini)
# `|| true` forces a zero exit status even when apt-get fails, so the success
# message printed below does not prove that the JDK was actually installed.
!apt-get -y install -qq openjdk-21-jdk-headless || true
print("‚úÖ Dependencies installed successfully")

In [None]:
import os
import json
import time
import shutil
import pathlib
import subprocess
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
from scipy.sparse import csr_matrix

# Expose the Java 21 JDK to child processes (Lucene, used by Pyserini, needs a JVM).
# The environment is left untouched when the JDK directory is not present.
java_home = "/usr/lib/jvm/java-21-openjdk-amd64"
if os.path.exists(java_home):
    os.environ.update(
        JAVA_HOME=java_home,
        # Prepend the JDK's bin directory so its `java` is found first.
        PATH=":".join((f"{java_home}/bin", os.environ.get("PATH", ""))),
    )

from sentence_transformers import SentenceTransformer
from beir import util
from beir.datasets.data_loader import GenericDataLoader
from pyserini.search.lucene import LuceneSearcher
import faiss

# Select seaborn's 'whitegrid' plot style for this session.
sns.set_style('whitegrid')
# NOTE: this message is printed unconditionally; JAVA_HOME is only set earlier
# in this cell when the JDK directory exists.
print("‚úÖ Libraries imported and Java configured")

## ⏱️ Execution Report
* **Total Wall Time:** 3h 40m
* **Hardware:** NVIDIA T4 (16GB VRAM)
* **Throughput:** ~200 documents/second
* **Implication:** Dense Retrieval encoding is computationally expensive compared to traditional inverted indexing.

In [None]:
# =================================================================
# SELECT DATASET
# =================================================================
# Accepts any key of `public_datasets` (e.g. scifact, trec-covid, fiqa), or
# 'cqadupstack/<sub>' where <sub> is a key of `cqa_sub_datasets`.
dataset_name = 'nq'

# =================================================================
# DATASET CONFIGURATION
# =================================================================
# Every archive is published under the same base URL as '<name>.zip'.
_beir_base_url = 'https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets'
public_datasets = {
    name: f'{_beir_base_url}/{name}.zip'
    for name in (
        'nfcorpus',
        'scifact',
        'arguana',
        'scidocs',
        'fiqa',
        'trec-covid',
        'webis-touche2020',
        'quora',
        'dbpedia-entity',
        'nq',
        'cqadupstack',
    )
}

# CQADupStack sub-datasets, each mapped to a short corpus-size label.
cqa_sub_datasets = dict(
    android='23K docs',
    english='41K docs',
    gaming='46K docs',
    gis='38K docs',
    mathematica='17K docs',
    physics='39K docs',
    programmers='33K docs',
    stats='42K docs',
    tex='71K docs',
    unix='48K docs',
    webmasters='17K docs',
    wordpress='49K docs',
)

# Download Logic
# `out_dir` is ./datasets under the current working directory; it is handed to
# util.download_and_unzip as the destination for the archive.
out_dir = os.path.join(pathlib.Path.cwd(), "datasets")

# Reject unknown names up front; 'cqadupstack/<sub>' names are validated below.
is_cqa_subset = dataset_name.startswith('cqadupstack/')
if not is_cqa_subset and dataset_name not in public_datasets:
    raise ValueError(f"Dataset '{dataset_name}' not found.")

if is_cqa_subset:
    # One shared archive holds all CQADupStack sub-datasets; the requested one
    # is a sub-folder of the extracted directory.
    sub_name = dataset_name.split('/')[1]
    if sub_name not in cqa_sub_datasets:
        raise ValueError(f"Invalid CQA sub-dataset '{sub_name}'")
    print(f"--- Processing CQADupStack: {sub_name} ---")
    url = public_datasets['cqadupstack']
    base_path = util.download_and_unzip(url, out_dir)
    data_path = os.path.join(base_path, sub_name)
else:
    print(f"--- Processing {dataset_name} ---")
    url = public_datasets[dataset_name]
    data_path = util.download_and_unzip(url, out_dir)

# Load Data
# Reads the BEIR "test" split from `data_path` and flattens it into parallel
# lists (`doc_ids`/`doc_texts`, `query_ids`/`query_texts`) for the encoder.
print(f"Loading data from: {data_path}")
try:
    corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")
except Exception as e:
    # Report and re-raise: swallowing the error here would only defer the
    # failure to the encoding cell as a confusing NameError on `doc_texts`.
    print(f"\n‚ùå Error loading dataset: {e}")
    raise

# Prepare lists for encoding (CRITICAL STEP)
# The order of `doc_ids` / `query_ids` defines the row order of the embedding
# matrices produced later, so ids and texts are built from the same sequence.
print("Preparing data lists...")
doc_ids = list(corpus.keys())
# A missing title (None) is treated as empty so the concatenation cannot raise;
# documents that do have a title produce exactly "<title> <text>" as before.
doc_texts = [
    (corpus[did].get('title') or '') + ' ' + corpus[did]['text']
    for did in doc_ids
]
query_ids = list(queries.keys())
query_texts = [queries[qid] for qid in query_ids]

# qrels maps query id -> {doc id: relevance}; count individual judgments rather
# than the number of judged queries.
num_judgments = sum(len(judged_docs) for judged_docs in qrels.values())

print(f"\n‚úÖ Dataset Loaded: {dataset_name}")
print(f"   Documents: {len(corpus):,}")
print(f"   Queries: {len(queries):,}")
print(f"   Relevance judgments: {num_judgments:,}")

In [None]:
# Modified Cell 4: Encoding with on-disk caching of finished embeddings
# Each embedding matrix is written once, after its encoding pass has completed;
# nothing is saved mid-run, so an interrupted pass restarts from scratch.
import numpy as np
import os
from sentence_transformers import SentenceTransformer


def _save_npy_atomic(path, array):
    """Write *array* to *path* in .npy format without exposing a partial file.

    The data goes to '<path>.tmp' first and is renamed over *path* only after
    the write has finished. An interrupted save therefore never leaves a
    truncated file under the final name, which the existence checks below
    would otherwise mistake for a complete result.
    """
    tmp_path = path + '.tmp'
    # Passing an open file object keeps np.save from appending '.npy' to the name.
    with open(tmp_path, 'wb') as fh:
        np.save(fh, array)
    os.replace(tmp_path, path)


# 1. Configure output directory (Kaggle Working Directory)
# This folder is temporary; you must create a Kaggle Dataset from the outputs to save them permanently.
base_dir = '/kaggle/working/nq_experiments'
os.makedirs(base_dir, exist_ok=True)

# File paths
# NOTE: the cache is keyed only by these fixed paths, not by dataset or model.
# After changing `dataset_name` or `model_name`, delete the old files first,
# otherwise the stale embeddings are silently reused.
doc_emb_path = os.path.join(base_dir, 'doc_embeddings.npy')
query_emb_path = os.path.join(base_dir, 'query_embeddings.npy')

# 2. Load Model
model_name = 'BAAI/bge-base-en-v1.5'
print(f"Loading BGE model: {model_name}")
model = SentenceTransformer(model_name)

# 3. Document Encoding (The heavy workload)
# When the file already exists it is NOT loaded: `doc_embeddings` stays undefined
# in that case, and consumers are expected to read the .npy file themselves.
if os.path.exists(doc_emb_path):
    print("‚úÖ Found existing document embeddings! Skipping encoding.")
else:
    print(f"‚ö†Ô∏è No file found. Starting encoding of {len(doc_texts):,} documents...")
    # Use batch_size 16 or 32 for safety on T4 GPU
    doc_embeddings = model.encode(
        doc_texts,
        batch_size=32,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True
    )
    print(f"üíæ Saving documents ({doc_embeddings.nbytes / 1e9:.2f} GB)...")
    _save_npy_atomic(doc_emb_path, doc_embeddings)
    print("‚úÖ Documents saved.")

# 4. Query Encoding
# Same skip-if-present logic as for the documents.
if os.path.exists(query_emb_path):
    print("‚úÖ Found existing query embeddings! Skipping encoding.")
else:
    print(f"Starting encoding of {len(query_texts):,} queries...")
    query_embeddings = model.encode(
        query_texts,
        batch_size=32,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True
    )
    print("üíæ Saving queries...")
    _save_npy_atomic(query_emb_path, query_embeddings)
    print("‚úÖ Queries saved.")

print(f"\nüéâ DONE! Files are in: {base_dir}")
print("Now you can stop and create the Kaggle Dataset from these outputs.")