In [1]:
%%capture
!pip install faiss-cpu
!pip install rank_bm25

In [36]:
import csv
import re
import json
import random
import numpy as np
import torch

from collections import defaultdict
from datasets import load_dataset
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
import faiss

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

## Datu sagatavošana

In [3]:
dataset = load_dataset("squad", split="train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

plain_text/validation-00000-of-00001.par(…):   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [4]:
titles = dataset.unique("title")
len(titles)

442

In [5]:
SEED = 42
N_SAMPLES = 50

In [6]:
by_title = defaultdict(list)
for ex in dataset:
    by_title[ex["title"]].append(ex)

rng = random.Random(SEED)
titles = sorted(by_title.keys())
selected_titles = rng.sample(titles, N_SAMPLES)

eval_samples = []
for t in selected_titles:
    eval_samples.append(by_title[t][0])

In [7]:
with open("KD-RAG-eval.json", "w") as f:
    json.dump(eval_samples, f, indent=2)

In [8]:
contexts = [ex["context"] for ex in eval_samples]

In [9]:
def chunk_text(text, chunk_size=80, overlap=20):
    words = text.split()
    chunks = []
    start = 0
    cid = 0

    while start < len(words):
        end = min(start + chunk_size, len(words))
        chunks.append({
            "chunk_id": cid,
            "text": " ".join(words[start:end]),
            "word_start": start,
            "word_end": end
        })
        cid += 1
        start += chunk_size - overlap

    return chunks

def char_to_word_span(text, char_start, char_end):
    words = text.split()
    pos = 0

    for i, w in enumerate(words):
        w_start = text.find(w, pos)
        w_end = w_start + len(w)
        pos = w_end

        if w_end > char_start:
            ws = i
            break

    for j in range(i, len(words)):
        w_start = text.find(words[j], pos)
        if w_start >= char_end:
            we = j
            break
        we = j + 1

    return ws, we

def overlaps(chunk, ans_ws, ans_we):
    return not (
        ans_we <= chunk["word_start"]
        or ans_ws >= chunk["word_end"]
    )


In [10]:
lengths = np.array([len(c.split()) for c in contexts])
print("N contexts:", len(lengths))
print("mean:", lengths.mean())
print("median:", np.median(lengths))
print("p75:", np.percentile(lengths, 75))
print("p90:", np.percentile(lengths, 90))
print("max:", lengths.max())


N contexts: 50
mean: 110.88
median: 102.0
p75: 121.75
p90: 150.4
max: 254


In [11]:
all_chunks = []
chunks_mapped = []
for ctx_id, ctx in enumerate(contexts):
    ctx_chunks = chunk_text(ctx)
    for ch in ctx_chunks:
        all_chunks.append(ch)
        chunks_mapped.append({
            "context_id": ctx_id
        })

In [12]:
len(all_chunks) == len(chunks_mapped)

True

In [13]:
question_data = []

for i, ex in enumerate(eval_samples):
    question_data.append({
        "question": ex["question"],
        "answers": ex["answers"],
        "context_id": i,
        "context": ex["context"]
    })

In [14]:
chunks_by_context = defaultdict(list)
for i, meta in enumerate(chunks_mapped):
    chunks_by_context[meta["context_id"]].append(i)

In [15]:
def char_to_word_span(text, char_start, char_end):
    words = text.split()
    pos = 0
    spans = []

    for w in words:
        s = text.find(w, pos)
        e = s + len(w)
        spans.append((s, e))
        pos = e

    ws = we = None
    for i, (s, e) in enumerate(spans):
        if ws is None and e > char_start:
            ws = i
        if s < char_end:
            we = i + 1

    return ws, we

def overlaps(chunk, ans_ws, ans_we):
    return not (
        ans_we <= chunk["word_start"]
        or ans_ws >= chunk["word_end"]
    )

In [16]:
question_to_chunks = {}

for q in question_data:
    relevant = set()

    for ans_text, ans_start in zip(
        q["answers"]["text"], q["answers"]["answer_start"]
    ):
        ans_end = ans_start + len(ans_text)
        ws, we = char_to_word_span(q["context"], ans_start, ans_end)

        for chunk_idx in chunks_by_context[q["context_id"]]:
            chunk = all_chunks[chunk_idx]
            if overlaps(chunk, ws, we):
                relevant.add(chunk_idx)

    question_to_chunks[q["question"]] = sorted(relevant)


## Izgūšanas komponentes bāzlīnijas izvērtēšana

#### Leksiskās izgūšanas metodes izvērtēšana

In [17]:
bm25_corpus = [ch["text"].lower().split() for ch in all_chunks]
bm25 = BM25Okapi(bm25_corpus)

In [18]:
def bm25_retrieve(query, k):
    scores = bm25.get_scores(query.lower().split())
    topk = np.argsort(scores)[::-1][:k]
    return list(topk)

In [19]:
def recall_at_k(retrieved, relevant):
    if not relevant:
        return 0.0
    return len(set(retrieved) & set(relevant)) / len(relevant)

In [20]:
K_VALUES = [1, 3, 5, 10]

bm25_results = {k: [] for k in K_VALUES}

for q in question_data:
    query = q["question"]
    relevant = question_to_chunks[query]

    for k in K_VALUES:
        retrieved = bm25_retrieve(query, k)
        bm25_results[k].append(
            recall_at_k(retrieved, relevant)
        )


In [21]:
for k in K_VALUES:
    print(f"Recall@{k}: {np.mean(bm25_results[k]):.3f}")

Recall@1: 0.720
Recall@3: 0.840
Recall@5: 0.860
Recall@10: 0.870


#### Semantiskās, blīvās izgūšanas bāzlīnijas izvērtēšana

In [22]:
embedder = SentenceTransformer("all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [23]:
chunk_texts = [ch["text"] for ch in all_chunks]

In [24]:
chunk_embeddings = embedder.encode(
    chunk_texts,
    convert_to_numpy=True,
    show_progress_bar=True
)

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

In [25]:
print(chunk_embeddings.shape)

(116, 384)


In [26]:
faiss.normalize_L2(chunk_embeddings)
dim = chunk_embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(chunk_embeddings)

In [27]:
def dense_retrieve(query, k):
    q_emb = embedder.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(q_emb)
    scores, indices = index.search(q_emb, k)
    return list(indices[0])

In [28]:
K_VALUES = [1, 3, 5, 10]

dense_results = {k: [] for k in K_VALUES}

for q in question_data:
    query = q["question"]
    relevant = question_to_chunks[query]

    for k in K_VALUES:
        retrieved = dense_retrieve(query, k)
        dense_results[k].append(
            recall_at_k(retrieved, relevant)
        )

In [29]:
print("\n=== Retrieval Comparison ===")
for k in K_VALUES:
    print(
        f"K={k:>2} | "
        f"BM25={np.mean(bm25_results[k]):.3f} | "
        f"Dense={np.mean(dense_results[k]):.3f}"
    )


=== Retrieval Comparison ===
K= 1 | BM25=0.720 | Dense=0.950
K= 3 | BM25=0.840 | Dense=1.000
K= 5 | BM25=0.860 | Dense=1.000
K=10 | BM25=0.870 | Dense=1.000


In [30]:
q = question_data[0]
idx = dense_retrieve(q["question"], 1)[0]

print("QUESTION:", q["question"])
print("\nRETRIEVED CHUNK:\n", all_chunks[idx]["text"])
print("\nGOLD CONTEXT:\n", q["context"])

QUESTION: What type of faith is Protestantism?

RETRIEVED CHUNK:
 Protestantism is a form of Christian faith and practice which originated with the Protestant Reformation,[a] a movement against what its followers considered to be errors in the Roman Catholic Church. It is one of the three major divisions of Christendom, together with Roman Catholicism and Eastern Orthodoxy. Anglicanism is sometimes considered to be independent from Protestantism.[b] The term derives from the letter of protestation from Lutheran princes in 1529 against an edict condemning the teachings of Martin Luther as heretical.

GOLD CONTEXT:
 Protestantism is a form of Christian faith and practice which originated with the Protestant Reformation,[a] a movement against what its followers considered to be errors in the Roman Catholic Church. It is one of the three major divisions of Christendom, together with Roman Catholicism and Eastern Orthodoxy. Anglicanism is sometimes considered to be independent from Protesta

#### Izgūšanas komponentes bāzlīnijas izvērtēšana

In [31]:
MODEL = "google/flan-t5-base"
device = "cuda" if torch.cuda.is_available() else "cpu"

tok = AutoTokenizer.from_pretrained(MODEL)
llm = AutoModelForSeq2SeqLM.from_pretrained(MODEL).to(device)
llm.eval()

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [32]:
@torch.inference_mode()
def generate_answer(prompt, max_new_tokens=32):
    inputs = tok(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512
    ).to(device)

    out = llm.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        num_beams=1
    )
    return tok.decode(out[0], skip_special_tokens=True).strip()

In [33]:
def build_prompt_no_rag(question):
    return (
        "Answer the question with a short factual phrase.\n"
        f"Question: {question}\n"
        "Answer:"
    )

def build_prompt_rag(question, retrieved_chunk_texts):
    context_block = "\n\n".join(retrieved_chunk_texts)
    return (
        "Use ONLY the context provided below to answer with a short factual phrase.\n"
        "If the answer is NOT in the context, say: unknown.\n\n"
        f"Context:\n{context_block}\n\n"
        f"Question: {question}\n"
        "Answer:"
    )

In [34]:
K_RAG = 3
results = []

for q in question_data:
    question = q["question"]
    gold_answers = q["answers"]["text"]

    prompt_no_rag = build_prompt_no_rag(question)
    pred_no_rag = generate_answer(prompt_no_rag)

    retrieved_ids = dense_retrieve(question, K_RAG)
    retrieved_chunks = [all_chunks[i]["text"] for i in retrieved_ids]

    prompt_rag = build_prompt_rag(question, retrieved_chunks)
    pred_rag = generate_answer(prompt_rag)

    results.append({
        "question": question,
        "gold_answers": gold_answers,
        "no_rag_answer": pred_no_rag,
        "rag_answer": pred_rag,
        "retrieved_chunks": retrieved_chunks
    })


In [35]:
import csv

with open("rag_eval.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow([
        "question",
        "gold_answers",
        "no_rag_answer",
        "rag_answer"
    ])

    for r in results:
        writer.writerow([
            r["question"],
            " | ".join(r["gold_answers"]),
            r["no_rag_answer"],
            r["rag_answer"]
        ])


with open("rag_eval_full.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

Eksportētie dati un atbilstošās metrikas tiek analizētas manuāli