In [None]:
import os

# Keep a key that is already present in the environment (shell export, Colab
# secret, earlier cell) instead of overwriting it with the empty placeholder.
# Do not paste a real key into this cell: notebooks are shared with their source.
os.environ.setdefault("GROQ_API_KEY", "")


In [None]:
# Colab-only cell: pick PDFs on the local machine and store them under /mnt/data/docs
from google.colab import files, drive
import os, pathlib

# Google Drive alternative (enable the next line to mount Drive instead)
# drive.mount('/content/drive')

# Destination folder for the uploaded documents
DOCS_DIR = "/mnt/data/docs"
os.makedirs(DOCS_DIR, exist_ok=True)

print("Upload PDFs now (you can upload multiple):")
uploaded = files.upload()  # browser file picker; indexed below by file name
for name, payload in uploaded.items():
    # Persist each upload under its original file name.
    with open(os.path.join(DOCS_DIR, name), "wb") as f:
        f.write(payload)

print("Saved uploaded files to", DOCS_DIR)
print("Files:", [entry for entry in pathlib.Path(DOCS_DIR).glob("*")])


Upload PDFs now (you can upload multiple):


Saving Retrieval-Augmented_Generation_for_Large_Language_Models_A_Survey[1].pdf to Retrieval-Augmented_Generation_for_Large_Language_Models_A_Survey[1].pdf
Saving Retrieval-AugmentedGenerationRAG-AdvancingAIwithDynamicKnowledgeIntegration.pdf to Retrieval-AugmentedGenerationRAG-AdvancingAIwithDynamicKnowledgeIntegration.pdf
Saved uploaded files to /mnt/data/docs
Files: [PosixPath('/mnt/data/docs/Retrieval-Augmented_Generation_for_Large_Language_Models_A_Survey[1].pdf'), PosixPath('/mnt/data/docs/Retrieval-AugmentedGenerationRAG-AdvancingAIwithDynamicKnowledgeIntegration.pdf')]


In [None]:
# Run in Colab (first cell) to install required packages
!pip install -q sentence-transformers faiss-cpu transformers ragas groq pdfminer.six PyPDF2 tqdm scikit-learn pandas

# cross-encoder from sentence-transformers
!pip install -q cross-encoder

print("Install complete.")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m104.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m366.7/366.7 kB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.3/137.3 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m128.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.9/160.9 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m98.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os, json, math, time, re
from pathlib import Path
from tqdm.auto import tqdm

# Groq client
from groq import Groq
# Build the Groq client only when a key is configured; otherwise warn and leave it as None.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if GROQ_API_KEY:
    groq_client = Groq(api_key=GROQ_API_KEY)
else:
    print("Warning: GROQ_API_KEY not set. Set os.environ['GROQ_API_KEY'] before using Groq generator/eval.")
    groq_client = None

# Embeddings & FAISS
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Re-ranker cross encoder
from sentence_transformers import CrossEncoder

# Generation fallback (HF pipeline) in case Groq unavailable
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# RAGAS (for evaluation)
from ragas.llms.base import llm_factory
from ragas.embeddings.base import embedding_factory
from ragas import evaluate

print("Imports done.")

Imports done.


In [None]:
from pathlib import Path
import PyPDF2
import nbformat

DOCS_DIR = Path("/mnt/data/docs")
docs = []

def read_pdf_text(path):
    """Return the text of every page of the PDF at `path`, joined with newlines.

    Pages for which `extract_text()` yields nothing contribute an empty string.
    Any failure (file cannot be opened, parser error) is reported on stdout and
    an empty string is returned instead of raising.
    """
    try:
        with open(path, "rb") as pdf_file:
            # Extraction has to happen while the file handle is still open.
            pages = PyPDF2.PdfReader(pdf_file).pages
            page_texts = [page.extract_text() or "" for page in pages]
    except Exception as err:
        print("PDF read failed:", path, err)
        return ""
    return "\n".join(page_texts)

# Load PDFs / TXT / MD files
MIN_DOC_CHARS = 20  # texts of this length or shorter are treated as empty and skipped
TEXT_SUFFIXES = (".txt", ".md")

if DOCS_DIR.exists():
    for p in sorted(DOCS_DIR.glob("*")):
        if not p.is_file():
            continue  # e.g. a directory named "notes.txt" cannot be read as text
        suffix = p.suffix.lower()
        if suffix == ".pdf":
            txt = read_pdf_text(p)
        elif suffix in TEXT_SUFFIXES:
            try:
                txt = p.read_text(encoding="utf-8")
            except (OSError, UnicodeDecodeError) as e:
                # Same policy as read_pdf_text: report the bad file, keep loading the rest.
                print("Text read failed:", p, e)
                txt = ""
        else:
            continue  # unsupported file type
        if len(txt) > MIN_DOC_CHARS:
            docs.append(f"Source: {p.name}\n\n{txt}")
print(f"Loaded {len(docs)} documents from {DOCS_DIR}")

# If the companion notebook was uploaded as well, add its longer markdown cells as documents
NB_PATH = Path("/mnt/data/Art_of_RAG_Evaluation_LangChain_&_Ragas.ipynb")
if NB_PATH.exists():
    nb = nbformat.read(str(NB_PATH), as_version=4)
    for nb_cell in nb.cells:
        if nb_cell.cell_type != "markdown":
            continue
        markdown_text = nb_cell.get("source", "").strip()
        if len(markdown_text) <= 50:
            continue  # ignore short cells (50 characters or fewer after stripping)
        docs.append("FromNotebook:\n" + markdown_text)
    print("Also added markdown content from uploaded notebook.")


Loaded 2 documents from /mnt/data/docs


In [None]:
# Simple token-based chunking using word tokens (deterministic)
def chunk_text(text, chunk_size=400, overlap=80):
    """Split `text` into overlapping chunks of whitespace-separated tokens.

    Args:
        text: The string to split; tokens are produced by `str.split()`.
        chunk_size: Maximum number of tokens per chunk (must be > 0).
        overlap: Number of tokens shared by consecutive chunks
            (must satisfy 0 <= overlap < chunk_size).

    Returns:
        A list of chunk strings (tokens re-joined with single spaces). The list
        is empty for empty/whitespace-only text. Chunking stops as soon as a
        chunk reaches the last token, so no trailing chunk that is fully
        contained in its predecessor is produced.

    Raises:
        ValueError: If `chunk_size` or `overlap` is out of range; otherwise the
            window would never advance (or would skip tokens).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    tokens = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(" ".join(tokens[start:start + chunk_size]))
        if start + chunk_size >= len(tokens):
            # This window already covers the tail; a further window would only
            # repeat tokens that are in this chunk.
            break
    return chunks

# Flatten every document into chunks; metadata[i] describes text_chunks[i]
text_chunks = []
metadata = []  # one dict per chunk: doc_id, chunk_id, source
for doc_idx, doc_text in enumerate(docs):
    for chunk_idx, piece in enumerate(chunk_text(doc_text, chunk_size=400, overlap=80)):
        text_chunks.append(piece)
        metadata.append({"doc_id": doc_idx, "chunk_id": chunk_idx, "source": f"doc_{doc_idx}"})
print(f"Created {len(text_chunks)} text chunks.")


Created 83 text chunks.


In [None]:
# Embed all chunks in batches and build an inner-product FAISS index over them.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
batch_size = 64
embs = []

for i in tqdm(range(0, len(text_chunks), batch_size), desc="Embedding batches"):
    batch = text_chunks[i:i+batch_size]
    e = embed_model.encode(batch, show_progress_bar=False)
    embs.append(e)
if embs:
    embs = np.vstack(embs)
else:
    # No chunks: keep a correctly shaped, empty matrix so the index can still be built.
    embs = np.zeros((0, embed_model.get_sentence_embedding_dimension()))

# FAISS only accepts C-contiguous float32 matrices; np.zeros defaults to float64,
# so the empty fallback above would otherwise be rejected by normalize_L2/add.
embs = np.ascontiguousarray(embs, dtype=np.float32)

# normalize for cosine-sim via inner product
faiss.normalize_L2(embs)
d = embs.shape[1]
index = faiss.IndexFlatIP(d)
index.add(embs)
print("Built FAISS index with", index.ntotal, "vectors")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding batches:   0%|          | 0/2 [00:00<?, ?it/s]

Built FAISS index with 83 vectors


In [None]:
def retrieve_knn(query, k=5):
    """Return up to `k` chunks most similar to `query` from the FAISS index.

    The query is embedded with `embed_model`, L2-normalised in place and
    searched against `index`.

    Args:
        query: Query string.
        k: Number of neighbours requested from the index.

    Returns:
        A list of dicts with keys "chunk" (chunk text), "meta" (the matching
        entry of `metadata`) and "score" (the similarity reported by the
        index), in the order returned by the index. It holds fewer than `k`
        items when the index has fewer than `k` vectors.
    """
    q_emb = embed_model.encode([query])
    faiss.normalize_L2(q_emb)
    D, I = index.search(q_emb, k)
    results = []
    # Pair each id with its own score directly instead of looking the score up
    # again with list.index (quadratic, and wrong for repeated ids).
    for score, idx in zip(D[0], I[0]):
        idx = int(idx)
        if idx < 0:
            # FAISS pads missing neighbours with -1, which would otherwise
            # silently select the last chunk via negative indexing.
            continue
        results.append({"chunk": text_chunks[idx], "meta": metadata[idx], "score": float(score)})
    return results

# quick test
print(retrieve_knn("What is retrieval-augmented generation?")[:2])


[{'chunk': "Source: Retrieval-AugmentedGenerationRAG-AdvancingAIwithDynamicKnowledgeIntegration.pdf See discussions, st ats, and author pr ofiles f or this public ation at : https://www .researchgate.ne t/public ation/388414789 Retrieval-Augmented Generation (RAG): Advancing AI with Dynamic Knowledge Integration Preprint · Januar y 2025 DOI: 10.13140/RG.2.2.30888.89606 CITATIONS 0READS 1,394 1 author: Douglas C Y ouv an 4,341 PUBLICA TIONS 6,209 CITATIONS SEE PROFILE All c ontent f ollo wing this p age was uplo aded b y Douglas C Y ouv an on 27 Januar y 2025. The user has r equest ed enhanc ement of the do wnlo aded file. 1 Retrieval -Augmented Generation (RAG): Advancing AI with Dynamic Knowledge Integration Douglas C. Youvan doug@youvan.com January 27, 2025 Retrieval -Augmented Generation (RAG) represents a significant advancement in artificial intelligence by combining the capabilities of generative models with real - time information retrieval from external knowledge sources. Unlik

In [None]:
# Answer quality metrics cell: ROUGE, BLEU, METEOR, BERTScore
# Paste into Colab/Jupyter and run in the same kernel as your RAG/prediction objects.

# Install required packages (run once)
!pip install -q rouge-score sacrebleu bert-score nltk transformers

# Imports
import os, json, csv
from pathlib import Path
from statistics import mean
import numpy as np

# Metric imports
from rouge_score import rouge_scorer
import sacrebleu
from bert_score import score as bert_score
import nltk
from nltk.translate.meteor_score import single_meteor_score

# Fetch the NLTK corpora needed by METEOR
for _nltk_resource in ('wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource, quiet=True)

# Input (evaluation examples) and output (metric summaries) locations
EVAL_CSV, OUT_JSON, OUT_CSV = (
    Path("/mnt/data/eval_examples.csv"),
    Path("/mnt/data/answer_quality_metrics.json"),
    Path("/mnt/data/answer_quality_metrics.csv"),
)

# Load evaluation pairs: tries CSV -> prepared (list) -> fallback inline
def _as_text(value):
    """Return `value` as stripped text; None and NaN (an empty CSV cell) become ""."""
    # NaN is the only float that differs from itself; str(NaN) would give the
    # literal "nan", which would then be scored as if it were a real answer.
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value).strip()


def _predict_for_question(question):
    """Return the system answer for `question`, or "" when none can be obtained.

    Looks the question up in the `prepared` list first (if the kernel has one)
    and otherwise calls `generate_rag_answer` (if the kernel defines it).
    """
    pred = ""
    if 'prepared' in globals():
        found = next((p for p in prepared if p.get("question", "") == question), None)
        if found:
            pred = _as_text(found.get("answer", ""))
    if not pred and 'generate_rag_answer' in globals():
        try:
            gen = generate_rag_answer(question, strategy="multiquery", k=6, compress=True, rerank_top=5)
            pred = _as_text(gen["answer"] if isinstance(gen, dict) else gen)
        except Exception as exc:
            # Best effort: keep evaluating, but make the failure visible.
            print(f"generate_rag_answer failed for question {question!r}: {exc}")
            pred = ""
    return pred


pairs = []
if EVAL_CSV.exists():
    import pandas as pd
    df = pd.read_csv(EVAL_CSV)
    columns = set(df.columns)
    if {"answer", "ground_truth"} <= columns:
        # generated answers were already saved into the CSV
        pairs = [(_as_text(a), _as_text(g)) for a, g in zip(df["answer"], df["ground_truth"])]
    elif {"question", "ground_truth"} <= columns:
        # only question + gold present: obtain predictions from the notebook state
        for q, g in zip(df["question"], df["ground_truth"]):
            pairs.append((_predict_for_question(_as_text(q)), _as_text(g)))
    else:
        # fallback: accept alternative column names for either side
        pred_col = next((c for c in ("answer", "system_answer") if c in columns), None)
        gold_col = next((c for c in ("ground_truth", "gold_answer") if c in columns), None)
        for _, r in df.iterrows():
            pred = _as_text(r[pred_col]) if pred_col else ""
            gold = _as_text(r[gold_col]) if gold_col else ""
            if pred or gold:
                pairs.append((pred, gold))
elif 'prepared' in globals() and isinstance(prepared, (list, tuple)) and len(prepared) > 0:
    pairs = [(_as_text(ex.get("answer", "")), _as_text(ex.get("ground_truth", ""))) for ex in prepared]
else:
    # fallback inline (so cell can run)
    pairs = [
        ("RAG combines retrieval with generation to ground model outputs.", "RAG combines retrieval and generation to produce more factual answers."),
        ("FAISS enables efficient similarity search for dense vectors.", "FAISS enables fast vector similarity search for retrieved documents.")
    ]

# Keep only pairs that carry a non-blank gold answer (every metric needs a reference)
filtered = []
for raw_pred, raw_gold in pairs:
    if raw_gold and str(raw_gold).strip():
        filtered.append((raw_pred, raw_gold))
if len(filtered) == 0:
    raise RuntimeError("No gold references found. Ensure /mnt/data/eval_examples.csv or 'prepared' list contains ground_truth/gold answers.")

# Split into two aligned lists of stripped strings
preds = [str(pair[0]).strip() for pair in filtered]
golds = [str(pair[1]).strip() for pair in filtered]

print(f"Computing metrics on {len(preds)} examples...")

# ROUGE-1 / ROUGE-2 / ROUGE-L F-measures, averaged over all examples
scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'], use_stemmer=True)
rouge1_f, rouge2_f, rougeL_f = [], [], []
for gold_text, pred_text in zip(golds, preds):
    # argument order is score(target, prediction)
    per_example = scorer.score(gold_text, pred_text)
    rouge1_f.append(per_example['rouge1'].fmeasure)
    rouge2_f.append(per_example['rouge2'].fmeasure)
    rougeL_f.append(per_example['rougeL'].fmeasure)

rouge_results = {}
for metric_name, values in (("rouge1_f", rouge1_f), ("rouge2_f", rouge2_f), ("rougeL_f", rougeL_f)):
    # an empty list would make np.mean return NaN, so report 0.0 instead
    rouge_results[metric_name] = float(np.mean(values)) if values else 0.0

# BLEU (corpus-level via sacrebleu).
# corpus_bleu(hypotheses, references) takes one list per reference *stream*, each
# aligned with the hypotheses; with a single gold answer per example that is [golds].
list_of_references = [list(golds)]
try:
    bleu = sacrebleu.corpus_bleu(preds, list_of_references)
    bleu_score = float(bleu.score)  # percent BLEU
except Exception as e:
    # Fallback: mean of per-sentence BLEU. Say so, because the reported number is
    # then no longer a corpus-level score.
    print("Corpus BLEU failed, falling back to mean sentence BLEU:", e)
    bleu_vals = []
    for h, r in zip(preds, golds):
        try:
            bleu_vals.append(sacrebleu.sentence_bleu(h, [r]).score)
        except Exception as sent_err:
            print("Sentence BLEU failed for one example (skipped):", sent_err)
    bleu_score = float(np.mean(bleu_vals)) if bleu_vals else 0.0

# METEOR (sentence-level average using nltk implementation)
def _meteor(reference, hypothesis):
    """Score one (reference, hypothesis) string pair with NLTK's METEOR.

    Current NLTK requires pre-tokenised input and raises TypeError for raw
    strings, so whitespace tokens are passed.
    """
    try:
        return single_meteor_score(reference.split(), hypothesis.split())
    except TypeError:
        # NOTE(review): older NLTK releases appear to expect raw strings and to
        # tokenise internally; retry in that form. Confirm against the
        # installed NLTK version.
        return single_meteor_score(reference, hypothesis)


meteor_scores = []
meteor_failures = 0
for r, h in zip(golds, preds):
    try:
        meteor_scores.append(_meteor(r, h))
    except Exception as e:
        # Keep the run going, but never let a broken metric pass as a real 0.0.
        if meteor_failures == 0:
            print("METEOR failed (affected examples are scored 0.0):", e)
        meteor_failures += 1
        meteor_scores.append(0.0)
if meteor_failures:
    print(f"METEOR could not be computed for {meteor_failures} of {len(meteor_scores)} examples.")
meteor_avg = float(np.mean(meteor_scores)) if meteor_scores else 0.0

# BERTScore precision / recall / F1, averaged over all examples
print("Computing BERTScore (this may download models)...")
try:
    P, R, F1 = bert_score(list(preds), list(golds), lang="en", rescale_with_baseline=True)
    import numpy as _np
    # .tolist() turns each returned score container into plain Python floats before averaging
    _bert_means = [float(_np.mean(scores.tolist())) for scores in (F1, P, R)]
except Exception as e:
    print("BERTScore failed:", e)
    _bert_means = [0.0, 0.0, 0.0]
bert_f1, bert_p, bert_r = _bert_means

# Collect every metric into one nested summary
results = dict()
results["n_examples"] = len(preds)
results["rouge"] = rouge_results
results["bleu_corpus_percent"] = bleu_score
results["meteor_avg"] = meteor_avg
results["bertscore"] = {"precision": bert_p, "recall": bert_r, "f1": bert_f1}

# Persist the summary as JSON
with open(OUT_JSON, "w", encoding="utf-8") as f:
    f.write(json.dumps(results, indent=2))

# Persist a flat metric,value table as CSV
summary_rows = [
    ["metric", "value"],
    ["n_examples", results["n_examples"]],
    ["rouge1_f", results["rouge"]["rouge1_f"]],
    ["rouge2_f", results["rouge"]["rouge2_f"]],
    ["rougeL_f", results["rouge"]["rougeL_f"]],
    ["bleu_corpus_percent", results["bleu_corpus_percent"]],
    ["meteor_avg", results["meteor_avg"]],
    ["bertscore_precision", results["bertscore"]["precision"]],
    ["bertscore_recall", results["bertscore"]["recall"]],
    ["bertscore_f1", results["bertscore"]["f1"]],
]
with open(OUT_CSV, "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(summary_rows)

print("Saved metrics JSON ->", OUT_JSON)
print("Saved metrics CSV ->", OUT_CSV)
print(json.dumps(results, indent=2))


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
Computing metrics on 2 examples...
Computing BERTScore (this may download models)...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Saved metrics JSON -> /mnt/data/answer_quality_metrics.json
Saved metrics CSV -> /mnt/data/answer_quality_metrics.csv
{
  "n_examples": 2,
  "rouge": {
    "rouge1_f": 0.6160990712074303,
    "rouge2_f": 0.3764705882352941,
    "rougeL_f": 0.5572755417956656
  },
  "bleu_corpus_percent": 16.550405241272124,
  "meteor_avg": 0.0,
  "bertscore": {
    "precision": 0.6745836138725281,
    "recall": 0.6428963243961334,
    "f1": 0.6592361330986023
  }
}


In [None]:
# Replacement retrieval evaluation cell — pure local FAISS (no Groq)
import numpy as np, random, re, json
from pathlib import Path
import faiss

print("Running FAISS-only Retrieval Evaluation (no Groq)...")

# sanity checks
assert 'text_chunks' in globals(), "text_chunks not found in kernel. Run chunking cell first."
assert 'embed_model' in globals(), "embed_model not found. Load sentence-transformers model first."
assert 'index' in globals(), "FAISS index 'index' not found. Build embeddings + index first."

EVAL_SEED = 42  # fixed seed so the sampled evaluation set (and the metrics) are reproducible
NUM_EVAL = min(40, len(text_chunks))
# A private Random instance keeps the global random state untouched.
indices = random.Random(EVAL_SEED).sample(range(len(text_chunks)), NUM_EVAL)


# Local KNN retrieval using embed_model + index
def retrieve_knn_local(query, k=10):
    """Return up to `k` nearest chunks for `query` as dicts with "chunk", "id" and "score"."""
    q_emb = embed_model.encode([query])
    faiss.normalize_L2(q_emb)
    D, I = index.search(q_emb, k)
    hits = []
    for score, idx in zip(D[0], I[0]):
        idx = int(idx)
        if idx < 0:
            continue  # FAISS pads with -1 when the index holds fewer than k vectors
        hits.append({"chunk": text_chunks[idx], "id": idx, "score": float(score)})
    return hits


# Compute Recall@k and MRR
def recall_at_k_and_mrr(eval_data, k_list=(1, 3, 5, 10)):
    """Compute Recall@k for every k in `k_list` and the mean reciprocal rank.

    Args:
        eval_data: Iterable of dicts with keys "query" and "gold_id".
        k_list: Cut-offs to evaluate; retrieval depth is `max(k_list)`.

    Returns:
        (recall, mrr) where `recall` maps each k to the fraction of examples
        whose gold chunk is ranked within the top k, and `mrr` is the mean of
        1/rank (0 for examples whose gold chunk is not retrieved). Both are
        all zeros for an empty `eval_data`.
    """
    k_list = list(k_list)
    eval_data = list(eval_data)
    n = len(eval_data)
    if n == 0:
        # Nothing to evaluate: avoid dividing by zero.
        return {k: 0.0 for k in k_list}, 0.0

    depth = max(k_list)
    recall_counts = {k: 0 for k in k_list}
    mrr_total = 0.0
    for ex in eval_data:
        ids = [h["id"] for h in retrieve_knn_local(ex["query"], k=depth)]
        if ex["gold_id"] not in ids:
            continue
        rank = ids.index(ex["gold_id"]) + 1  # 1-based rank of the gold chunk
        mrr_total += 1.0 / rank
        for k in k_list:
            if rank <= k:
                recall_counts[k] += 1
    recall = {k: recall_counts[k] / n for k in recall_counts}
    return recall, mrr_total / n


# Build the evaluation set: each sampled chunk is its own gold answer.
# Caveat: the query is taken verbatim from the start of the gold chunk, so these
# numbers measure self-retrieval rather than retrieval for real user questions.
eval_data = []
for idx in indices:
    chunk = text_chunks[idx]
    # query = first meaningful sentence (heuristic)
    sents = re.split(r'(?<=[.!?])\s+', chunk)
    query = sents[0] if sents and len(sents[0]) > 20 else chunk[:180]
    eval_data.append({"query": query, "gold_chunk": chunk, "gold_id": idx})

K_VALUES = [1, 3, 5, 10]
recall, mrr = recall_at_k_and_mrr(eval_data, k_list=K_VALUES)

print("\n===== RETRIEVAL METRICS (FAISS-only) =====")
for k in K_VALUES:
    print(f"  Recall@{k}: {recall[k]:.4f}")
print(f"\nMRR: {mrr:.4f}")
print("========================================")

# Save results
OUT = Path("/mnt/data/retrieval_metrics_faiss.json")
with open(OUT, "w") as f:
    json.dump({"recall": recall, "mrr": mrr}, f, indent=2)
    print("Saved metrics to:", OUT)


Running FAISS-only Retrieval Evaluation (no Groq)...

===== RETRIEVAL METRICS (FAISS-only) =====
  Recall@1: 0.5500
  Recall@3: 0.7750
  Recall@5: 0.8000
  Recall@10: 0.8500

MRR: 0.6587
Saved metrics to: /mnt/data/retrieval_metrics_faiss.json
