<a href="https://colab.research.google.com/github/MichalSlowakiewicz/RAG_hybrid_search/blob/master/RAG_project_fixed_with_colab.ipynb" target="_blank">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>


In [None]:
!pip install -q sentence-transformers datasets rank_bm25 scikit-learn nltk tqdm

In [None]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from rank_bm25 import BM25Okapi
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from tqdm.auto import tqdm
import math
import multiprocessing
# Download the NLTK 'punkt' tokenizer data before word_tokenize is used in later cells.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Also download the 'punkt_tab' tokenizer data.
# NOTE(review): presumably needed by newer NLTK releases in place of 'punkt' — confirm against the installed version.
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # lightweight, free model
CHUNK_SIZE = 200          # tokens per passage (tunable)
CHUNK_OVERLAP = 50        # overlap tokens when chunking
TOP_K = 3                # retrieval cutoff k used for evaluation (Recall@k, MRR@k)
EVAL_SAMPLE = 200        # number of questions used for faster evaluation (set to None to use the whole dev set)
ALPHA = 0.65              # default weight of the embedding score in the hybrid (the paper used approx. 0.65) - experiment
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [None]:
# Load the SQuAD "validation" split, used in this notebook as the dev set.
ds = load_dataset("squad", split="validation")  # dev set
print("Liczba przykładów (dev):", len(ds))

# Optional downsampling for fast iteration: shuffle with the fixed seed and
# keep the first EVAL_SAMPLE examples. Skipped when EVAL_SAMPLE is None or
# not smaller than the dataset.
if EVAL_SAMPLE is not None and EVAL_SAMPLE < len(ds):
    ds = ds.shuffle(seed=RANDOM_SEED).select(range(EVAL_SAMPLE))
    print("Używam podzbioru:", len(ds))

Liczba przykładów (dev): 10570
Używam podzbioru: 200


In [None]:
from math import ceil

def tokenize_for_chunking(text):
    """Return the list of word tokens that ``word_tokenize`` produces for ``text``.

    ``chunk_context`` calls this to obtain the tokens it groups into chunks.
    """
    tokens = word_tokenize(text)
    return tokens

def chunk_context(context, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Split ``context`` into overlapping windows of word tokens.

    Args:
        context: Raw text to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared by two consecutive chunks; must be
            smaller than ``chunk_size``.

    Returns:
        A list of chunk strings, each being the space-joined tokens of one
        window. Text with at most ``chunk_size`` tokens yields exactly one
        chunk.

    Raises:
        ValueError: If ``chunk_size <= 0`` or ``overlap >= chunk_size``. In
            both cases the window start would never advance, so the chunking
            loop would not terminate.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive, got {}".format(chunk_size))
    if overlap >= chunk_size:
        raise ValueError(
            "overlap ({}) must be smaller than chunk_size ({})".format(overlap, chunk_size)
        )

    tokens = tokenize_for_chunking(context)
    if len(tokens) <= chunk_size:
        return [" ".join(tokens)]

    # Each window starts `step` tokens after the previous one, so consecutive
    # windows share `overlap` tokens.
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(tokens), step):
        end = start + chunk_size
        chunks.append(" ".join(tokens[start:end]))
        if end >= len(tokens):
            # This window already reached the last token; a further window
            # would only repeat the tail.
            break
    return chunks

# Build the passage corpus, chunking every distinct context exactly once.
# Several questions may share the same context paragraph; appending its chunks
# once per question would put duplicate passages into the corpus, which skews
# BM25 statistics and lets identical texts occupy several top-k slots.
#
# Each entry is a dict:
#   "passage":     chunk text
#   "id":          index of the entry in `passages`
#   "source_qid":  index (in `ds`) of the first question that uses this context
#   "source_qids": set of indices of all questions that use this context
#   "example":     the dataset example of that first question
passages = []
context_to_pids = {}  # context text -> indices of its chunks in `passages`
for i, ex in enumerate(tqdm(ds)):
    context = ex['context']
    if context in context_to_pids:
        # Context already chunked: only record the additional source question.
        for pid in context_to_pids[context]:
            passages[pid]["source_qids"].add(i)
        continue
    pids = []
    for c in chunk_context(context):
        pids.append(len(passages))
        passages.append({
            "passage": c,
            "id": len(passages),
            "source_qid": i,
            "source_qids": {i},
            "example": ex,
        })
    context_to_pids[context] = pids
print("Liczba passage'ów:", len(passages))

  0%|          | 0/200 [00:00<?, ?it/s]

Liczba passage'ów: 233


In [None]:
# Tokenize the lowercased text of every passage, in corpus order, and build
# the BM25 index over those token lists.
tokenized_corpus = []
for entry in passages:
    lowered_text = entry['passage'].lower()
    tokenized_corpus.append(word_tokenize(lowered_text))
bm25 = BM25Okapi(tokenized_corpus)

In [None]:
# Load the sentence-embedding model named by EMBEDDING_MODEL.
model = SentenceTransformer(EMBEDDING_MODEL)
# Encode all passage texts in batches of 64 and get the result as a NumPy array.
# `texts` follows the order of `passages`, so row i belongs to passages[i].
texts = [p['passage'] for p in passages]
passage_embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True, batch_size=64)
print("Embeddings shape:", passage_embeddings.shape)

# Build a nearest-neighbors index over the passage embeddings using the cosine
# metric; TOP_K is the default neighbor count. retrieve_hybrid additionally
# calls cosine_similarity directly on these embeddings.
nn = NearestNeighbors(n_neighbors=TOP_K, metric='cosine', n_jobs=-1)
nn.fit(passage_embeddings)

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Embeddings shape: (233, 384)


In [None]:
from collections import defaultdict

def retrieve_bm25(query, top_k=TOP_K):
    """Rank passages for ``query`` by BM25 score.

    The query is lowercased and tokenized with ``word_tokenize`` before being
    scored against the ``bm25`` index.

    Returns:
        A list of ``(passage_index, bm25_score)`` pairs for the ``top_k``
        highest-scoring passages, best first.
    """
    query_tokens = word_tokenize(query.lower())
    all_scores = bm25.get_scores(query_tokens)  # one score per passage
    # argsort is ascending: the best top_k sit at the tail; reverse them so the
    # highest score comes first.
    ascending = np.argsort(all_scores)
    best = ascending[-top_k:][::-1]
    best_scores = all_scores[best]
    return [(pid, score) for pid, score in zip(best.tolist(), best_scores.tolist())]

def retrieve_embedding(query, top_k=TOP_K):
    """Rank passages for ``query`` by embedding similarity.

    The query is encoded with ``model`` and looked up in the ``nn`` index,
    which was built with ``metric='cosine'``.

    Returns:
        A list of ``(passage_index, similarity)`` pairs for the ``top_k``
        neighbors in the order returned by the index, where
        ``similarity = 1 - distance``.
    """
    query_vec = model.encode([query], convert_to_numpy=True)
    distances, neighbours = nn.kneighbors(query_vec, n_neighbors=top_k)
    # A single query was passed, so only row 0 is populated.
    similarities = 1.0 - distances[0]
    passage_ids = neighbours[0].tolist()
    return list(zip(passage_ids, similarities.tolist()))

def retrieve_hybrid(query, alpha=ALPHA, top_k=TOP_K):
    """Rank passages by a weighted blend of BM25 and embedding similarity.

    Candidates are the union of the BM25 top-``top_k`` and the embedding
    top-``top_k`` passages. For every candidate both raw scores are collected,
    each score set is min-max normalized over the candidates, and the two are
    combined as ``alpha * embedding + (1 - alpha) * bm25``.

    Args:
        query: Question text.
        alpha: Weight of the normalized embedding score; ``1 - alpha`` is the
            weight of the normalized BM25 score.
        top_k: Number of results taken from each retriever and number of
            results returned.

    Returns:
        A list of up to ``top_k`` ``(passage_index, fused_score)`` pairs
        sorted by fused score, best first.
    """
    bm = retrieve_bm25(query, top_k=top_k)
    em = retrieve_embedding(query, top_k=top_k)
    candidate_ids = list({i for i, _ in bm} | {i for i, _ in em})
    if not candidate_ids:
        return []

    # Query-level work is independent of the candidate, so do it once instead
    # of once per candidate: score the whole corpus with BM25 and encode the
    # query a single time, then pick out the candidate entries.
    bm_all = np.asarray(bm25.get_scores(word_tokenize(query.lower())))
    q_emb = model.encode([query], convert_to_numpy=True)
    bm_scores = bm_all[candidate_ids]
    em_scores = cosine_similarity(q_emb, passage_embeddings[candidate_ids])[0]

    def minmax(x):
        """Rescale ``x`` to [0, 1]; a (near-)constant vector maps to all 0.5."""
        if len(x) == 0:
            return x
        xmin, xmax = x.min(), x.max()
        if xmax - xmin < 1e-8:
            return np.ones_like(x) * 0.5
        return (x - xmin) / (xmax - xmin)

    # Per-query normalization makes the two score scales comparable.
    final_scores = alpha * minmax(em_scores) + (1 - alpha) * minmax(bm_scores)

    # Highest fused score first, truncated to top_k.
    top_order = np.argsort(final_scores)[::-1][:top_k]
    return [(candidate_ids[i], float(final_scores[i])) for i in top_order]

In [None]:
def normalize_text(s):
    """Lowercase ``s``, tokenize it with ``word_tokenize`` and re-join the tokens with single spaces."""
    lowered = s.lower()
    tokens = word_tokenize(lowered)
    return " ".join(tokens)

def passage_contains_answer(passage_text, answers_list):
    """Tell whether any acceptable answer occurs in the passage.

    Both the passage and each answer are passed through ``normalize_text``
    and compared by substring containment.

    Args:
        passage_text: Text of the retrieved passage.
        answers_list: Iterable of acceptable answer strings.

    Returns:
        ``True`` if at least one non-empty normalized answer is a substring of
        the normalized passage, otherwise ``False`` (including when
        ``answers_list`` is empty).
    """
    normalized_passage = normalize_text(passage_text)
    for answer in answers_list:
        normalized_answer = normalize_text(answer)
        # The empty string is a substring of every string, so an answer that
        # normalizes to "" must not be counted as a match.
        if normalized_answer and normalized_answer in normalized_passage:
            return True
    return False

In [None]:
def evaluate_retriever(retrieve_fn, queries_dataset, top_k=TOP_K):
    """Compute recall@k and MRR@k of a retriever over a set of questions.

    A question counts as a hit when at least one retrieved passage contains
    one of its acceptable answers (see ``passage_contains_answer``). Its
    reciprocal rank is ``1 / rank`` of the first such passage, or 0 if there
    is none.

    Args:
        retrieve_fn: Callable invoked as ``retrieve_fn(question, top_k=top_k)``
            that returns a ranked list of ``(passage_index, score)`` pairs.
        queries_dataset: Sized iterable of examples exposing ``ex['question']``
            and ``ex['answers']['text']``.
        top_k: Cutoff passed to ``retrieve_fn`` and used in the metric names.

    Returns:
        A dict ``{"recall@<k>": float, "mrr@<k>": float, "n": int}``. For an
        empty ``queries_dataset`` both metrics are reported as 0.0 with
        ``n == 0`` rather than dividing by zero.
    """
    recall_key = "recall@{}".format(top_k)
    mrr_key = "mrr@{}".format(top_k)
    n = len(queries_dataset)
    if n == 0:
        return {recall_key: 0.0, mrr_key: 0.0, "n": 0}

    hits_at_k = 0
    rr_total = 0.0
    for ex in tqdm(queries_dataset):
        answers = ex['answers']['text']  # list of acceptable answers
        retrieved = retrieve_fn(ex['question'], top_k=top_k)
        # Only the first passage that contains an answer contributes.
        for rank, (pid, _score) in enumerate(retrieved, start=1):
            if passage_contains_answer(passages[pid]['passage'], answers):
                hits_at_k += 1
                rr_total += 1.0 / rank
                break
    return {recall_key: hits_at_k / n, mrr_key: rr_total / n, "n": n}

# Run the same evaluation for the three retrievers: BM25, embedding, hybrid.
sample_ds = ds  # the (optionally downsampled) dataset loaded earlier
print("Eval queries:", len(sample_ds))

res_bm25 = evaluate_retriever(retrieve_bm25, sample_ds, top_k=TOP_K)
res_emb  = evaluate_retriever(retrieve_embedding, sample_ds, top_k=TOP_K)
# The lambda adapts retrieve_hybrid to the (query, top_k=...) call made by
# evaluate_retriever while fixing alpha to the configured ALPHA.
res_hyb  = evaluate_retriever(lambda q, top_k: retrieve_hybrid(q, alpha=ALPHA, top_k=top_k), sample_ds, top_k=TOP_K)

print("BM25:", res_bm25)
print("Embedding:", res_emb)
print("Hybrid (alpha={}):".format(ALPHA), res_hyb)

Eval queries: 200


  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

BM25: {'recall@3': 0.905, 'mrr@3': 0.8250000000000002, 'n': 200}
Embedding: {'recall@3': 0.93, 'mrr@3': 0.8566666666666667, 'n': 200}
Hybrid (alpha=0.65): {'recall@3': 0.95, 'mrr@3': 0.905, 'n': 200}


In [None]:
# Sweep the hybrid weight: alpha multiplies the embedding score and
# (1 - alpha) the BM25 score inside retrieve_hybrid.
alphas = [0.0, 0.25, 0.5, 0.65, 0.8, 1.0]  # 0.0 = pure BM25, 1.0 = pure semantic
results_by_alpha = {}  # alpha -> metrics dict returned by evaluate_retriever
for a in alphas:
    # The lambda reads the loop variable `a`; it is only called inside this
    # evaluate_retriever call, i.e. while `a` still holds the current value.
    res = evaluate_retriever(lambda q, top_k: retrieve_hybrid(q, alpha=a, top_k=top_k), sample_ds, top_k=TOP_K)
    results_by_alpha[a] = res
    print("alpha", a, res)

  0%|          | 0/200 [00:00<?, ?it/s]

alpha 0.0 {'recall@3': 0.905, 'mrr@3': 0.8250000000000002, 'n': 200}


  0%|          | 0/200 [00:00<?, ?it/s]

alpha 0.25 {'recall@3': 0.925, 'mrr@3': 0.8658333333333336, 'n': 200}


  0%|          | 0/200 [00:00<?, ?it/s]

alpha 0.5 {'recall@3': 0.95, 'mrr@3': 0.8925000000000002, 'n': 200}


  0%|          | 0/200 [00:00<?, ?it/s]

alpha 0.65 {'recall@3': 0.95, 'mrr@3': 0.905, 'n': 200}


  0%|          | 0/200 [00:00<?, ?it/s]

alpha 0.8 {'recall@3': 0.95, 'mrr@3': 0.9008333333333334, 'n': 200}


  0%|          | 0/200 [00:00<?, ?it/s]

alpha 1.0 {'recall@3': 0.93, 'mrr@3': 0.8566666666666667, 'n': 200}


In [None]:
# @title
import csv

# Write one CSV row per alpha with its recall@k and MRR@k, preceded by a header.
recall_col = "recall@{}".format(TOP_K)
mrr_col = "mrr@{}".format(TOP_K)
with open("hybrid_results.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["alpha", recall_col, mrr_col])
    writer.writerows(
        [alpha_value, metrics[recall_col], metrics[mrr_col]]
        for alpha_value, metrics in results_by_alpha.items()
    )
print("Zapisano hybrid_results.csv")

Zapisano hybrid_results.csv
