<a href="https://colab.research.google.com/github/MahsaSetoode/Final_Project/blob/master/E5_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

--------- NEW CODE -----------

In [1]:
# --- Setup (Colab) ---
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Install dependencies
# !pip install -q sentence-transformers faiss-cpu openai tiktoken
!pip -q install sentence-transformers faiss-cpu openai tiktoken transformers

import os, glob, re, json, unicodedata, math
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
from openai import OpenAI
!pip -q install -U sentence-transformers

from sentence_transformers import CrossEncoder

# OPENAI_API_KEY = input("Enter your OpenAI API key: ").strip()
# !pip install --quiet openai
# client = OpenAI(api_key=OPENAI_API_KEY)

Mounted at /content/drive
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m61.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# ---- Config ----
TXT_FOLDER = '/content/drive/MyDrive/extracted_texts/English_writtings'

# Paragraph windowing
PARA_WINDOW = 3
PARA_OVERLAP = 1

# Token guard
LLM_MODEL_NAME = "google/flan-t5-base"    # local tokenizer only
RESERVED_TOKENS = 128
EMBED_MODEL = "intfloat/e5-base-v2"
EMBED_BATCH = 32
TOPK_RETRIEVE = 8
TOPK_USE = 4
MAX_NEW_TOKENS = 180

# Paths to persist
ARTIFACT_DIR = "/content/drive/MyDrive/rag_cache_bahai"
os.makedirs(ARTIFACT_DIR, exist_ok=True)
INDEX_PATH = os.path.join(ARTIFACT_DIR, "faiss.index")
META_PATH  = os.path.join(ARTIFACT_DIR, "metadata.jsonl")

# OpenAI (paid API key, even if your ChatGPT plan is free)
OPENAI_API_KEY = input("Enter your OpenAI API key: ").strip()
client = OpenAI(api_key=OPENAI_API_KEY)

# Re-ranking
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')


Enter your OpenAI API key: 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


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

In [3]:
# ---- Helpers ----
def normalize_text(s: str) -> str:
    s = unicodedata.normalize("NFKC", s)
    # unify newlines, collapse >2 newlines
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    s = re.sub(r"[ \t]+", " ", s)
    s = re.sub(r"\n{3,}", "\n\n", s)
    return s.strip()

def read_txts(folder):
    paths = sorted(glob.glob(os.path.join(folder, "**", "*.txt"), recursive=True))
    docs = []
    for p in paths:
        try:
            with open(p, "r", encoding="utf-8", errors="ignore") as f:
                t = normalize_text(f.read())
                if t:
                    docs.append({"path": p, "text": t})
        except Exception as e:
            print(f"Skip {p}: {e}")
    return docs

In [4]:
# ---- Paragraph-based chunking with token guarantees ----
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME, use_fast=True)
model_max_len = tokenizer.model_max_length if tokenizer.model_max_length < 100000 else 512
CTX_LIMIT = max(64, model_max_len - RESERVED_TOKENS)
print("Model max:", model_max_len, "| context limit:", CTX_LIMIT)

def para_split(text: str):
    # split on blank lines (two or more newlines)
    paras = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
    return paras

def token_len(s: str) -> int:
    return len(tokenizer.encode(s, add_special_tokens=False))

def split_by_tokens(s: str, max_tokens: int, overlap_tokens: int = 40):
    ids = tokenizer.encode(s, add_special_tokens=False)
    if len(ids) <= max_tokens:
        return [s]
    chunks = []
    step = max(1, max_tokens - overlap_tokens)
    for start in range(0, len(ids), step):
        end = min(len(ids), start + max_tokens)
        window_ids = ids[start:end]
        chunks.append(tokenizer.decode(window_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True))
        if end >= len(ids):
            break
    return chunks


# (Optional) Quiet down HF warnings
from transformers.utils import logging as hf_logging
hf_logging.set_verbosity_error()

def split_by_tokens_strict(s: str, max_tokens: int, overlap_tokens: int = 40):
    """Split by token windows and guarantee every returned piece <= max_tokens."""
    ids = tokenizer.encode(s, add_special_tokens=False)
    if len(ids) <= max_tokens:
        return [s]

    out_texts = []
    step = max(1, max_tokens - overlap_tokens)
    for start in range(0, len(ids), step):
        end = min(len(ids), start + max_tokens)
        window_ids = ids[start:end]

        # Decode WITHOUT cleanup to avoid whitespace normalization surprises
        text = tokenizer.decode(
            window_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )

        # Final hard guard: if re-encoding somehow exceeds, trim by ids directly
        tid = tokenizer.encode(text, add_special_tokens=False)
        if len(tid) > max_tokens:
            tid = tid[:max_tokens]
            text = tokenizer.decode(
                tid,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False
            )

        out_texts.append(text)
        if end >= len(ids):
            break
    return out_texts

def build_chunks(docs, para_window=3, para_overlap=1, ctx_limit=CTX_LIMIT):
    uid = 0
    final = []
    for d in docs:
        paras = para_split(d["text"])
        if not paras:
            continue

        step = max(1, para_window - para_overlap)
        for start in range(0, len(paras), step):
            end = min(len(paras), start + para_window)
            block = "\n\n".join(paras[start:end])

            if token_len(block) > ctx_limit:
                pieces = split_by_tokens_strict(block, ctx_limit, overlap_tokens=50)
                for i, piece in enumerate(pieces):
                    # One more microscopic guard (rarely needed but free):
                    # drop empty or whitespace-only fragments
                    if not piece.strip():
                        continue
                    final.append({
                        "id": uid,
                        "source_path": d["path"],
                        "para_start": start,
                        "para_end": end-1,
                        "subpiece": i,
                        "text": piece,
                    })
                    uid += 1
            else:
                final.append({
                    "id": uid,
                    "source_path": d["path"],
                    "para_start": start,
                    "para_end": end-1,
                    "subpiece": 0,
                    "text": block,
                })
                uid += 1

    # verify: should be 0 now
    overflow = [c for c in final if token_len(c["text"]) > ctx_limit]
    print("Built chunks:", len(final), "| overflow:", len(overflow))
    return final

docs = read_txts(TXT_FOLDER)
print(f"Loaded {len(docs)} .txt files")
chunks = build_chunks(docs, PARA_WINDOW, PARA_OVERLAP, CTX_LIMIT)

lengths = [token_len(c["text"]) for c in chunks]
print("max tokens:", max(lengths), " | any over?:", any(t > CTX_LIMIT for t in lengths))

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Model max: 512 | context limit: 384
Loaded 37 .txt files
Built chunks: 8427 | overflow: 0
max tokens: 384  | any over?: False


In [5]:
# ---- Embeddings & FAISS (save / load) ----
embedder = SentenceTransformer(EMBED_MODEL)

def encode_passages(batch_texts):
    # e5 expects "passage: ..." for docs
    return embedder.encode([f"passage: {t}" for t in batch_texts],
                           batch_size=EMBED_BATCH,
                           convert_to_numpy=True,
                           show_progress_bar=False)

def build_or_load_index(chunks):
    if os.path.exists(INDEX_PATH) and os.path.exists(META_PATH):
        print("Loading index & metadata from cache...")
        index = faiss.read_index(INDEX_PATH)
        meta = [json.loads(line) for line in open(META_PATH, "r", encoding="utf-8")]
        return index, meta

    print("Building embeddings...")
    texts = [c["text"] for c in chunks]
    embs = encode_passages(texts)
    # L2-normalize (for IP cosine)
    embs = embs / np.linalg.norm(embs, axis=1, keepdims=True)
    index = faiss.IndexFlatIP(embs.shape[1])
    index.add(embs)
    print("FAISS index size:", index.ntotal)

    print("Persisting...")
    faiss.write_index(index, INDEX_PATH)
    with open(META_PATH, "w", encoding="utf-8") as f:
        for c in chunks:
            f.write(json.dumps(c, ensure_ascii=False) + "\n")
    return index, chunks

index, meta = build_or_load_index(chunks)

modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

Loading index & metadata from cache...


In [6]:
# ---- Retrieval ----
def retrieve(query: str, k=TOPK_RETRIEVE):
    q_emb = embedder.encode([f"query: {query}"], convert_to_numpy=True)
    q_emb = q_emb / np.linalg.norm(q_emb, axis=1, keepdims=True)
    scores, inds = index.search(q_emb, k)
    results = []
    for s, i in zip(scores[0], inds[0]):
        if i < 0:
            continue
        m = meta[i]
        results.append({
            "score": float(s),
            "i": int(i),
            "source": os.path.basename(m["source_path"]),
            "para_start": m["para_start"],
            "para_end": m["para_end"],
            "text": m["text"]
        })
    return results

# OPTIONAL RERANK (commented: requires extra model, adds cost/time)
def rerank(query, candidates, k=TOPK_USE):
    pairs = [(query, c["text"]) for c in candidates]
    scores = reranker.predict(pairs)
    for c, s in zip(candidates, scores):
        c["rerank"] = float(s)
    return sorted(candidates, key=lambda x: x["rerank"], reverse=True)[:k]

In [7]:
# ---- Generation (single, cited answer) ----
SYS = "You are a precise research assistant. Answer ONLY from the given context. If the answer is not present, say: I don't know."

def make_context_blocks(hits, k=TOPK_USE):
    picked = hits[:k]
    blocks = []
    cites = []
    for j, h in enumerate(picked, start=1):
        blocks.append(f"[{j}] Source: {h['source']} (paras {h['para_start']}-{h['para_end']})\n{h['text']}")
        # small snippet for display
        snippet = re.sub(r"\s+", " ", h["text"]).strip()
        if len(snippet) > 220: snippet = snippet[:220] + "..."
        cites.append({
            "cite_id": j,
            "source": h["source"],
            "paragraphs": f"{h['para_start']}-{h['para_end']}",
            "snippet": snippet
        })
    return "\n\n".join(blocks), cites

def answer(question: str, hits):
    ctx, cites = make_context_blocks(hits, k=TOPK_USE)
    user_prompt = (
        "Use ONLY the context blocks (with [#] markers) to answer.\n"
        "Cite evidence by listing the [#] you used.\n\n"
        f"{ctx}\n\nQuestion: {question}\nAnswer (include [#] citations):"
    )
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role":"system", "content": SYS},
                  {"role":"user", "content": user_prompt}],
        temperature=0.0,
        max_tokens=MAX_NEW_TOKENS,
    )
    return resp.choices[0].message.content.strip(), cites

In [8]:
# ---- Demo ----
question = "Where is Mount Carmel mentioned in the texts?"
hits = retrieve(question, k=TOPK_RETRIEVE)
hits = rerank(question, hits, k=TOPK_USE)   # if you enable reranker
final, citations = answer(question, hits)

print("Q:", question, "\n")
print("A:", final, "\n")
print("Citations:")
for c in citations:
    print(f"  [{c['cite_id']}] {c['source']} (paras {c['paragraphs']}) — {c['snippet']}")

Q: Where is Mount Carmel mentioned in the texts? 

A: Mount Carmel is mentioned in the following texts: 

1. In the first source, it is described as the location where a sacred edifice is being raised for the remains of the Báb, highlighting its significance and the efforts made to acquire land there ([1]).
2. The second source refers to Mount Carmel as a place where the banner of the Lord was raised and where ‘Abdu’-áli served as a gardener, emphasizing its spiritual importance ([2]).
3. The third source mentions Mount Carmel as a sacred garden where prayers were offered by the Prophets, indicating its revered status ([3]).
4. The fourth source describes Mount Carmel as the "Hill of God and His Vineyard," where Bahá’u’lláh's tent was raised and where significant events took place, including the revelation of the Tablet of Carmel ([4]). 

Citations:
  [1] light-of-the-world.txt (paras 0-0) — and sacred Lote-Tree! When the Ancient Beauty, the Most Great Name—may my life be offered up fo

Chat with Data

In [10]:
!pip -q install langchain langchain-community langchain-openai
# !pip -q install requests==2.32.4

import os, json
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS as LCFAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain.embeddings.base import Embeddings

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/2.5 MB[0m [31m34.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.5/2.5 MB[0m [31m47.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the fol

Re-ranking

In [16]:
from typing import List, Any
from langchain_core.retrievers import BaseRetriever
from langchain.schema import Document
from sentence_transformers import CrossEncoder
from pydantic import BaseModel, Field
from langchain.embeddings import HuggingFaceEmbeddings

class HybridFaissCrossEncoderRetriever(BaseRetriever, BaseModel):
    vectorstore: Any = Field()
    cross_encoder: Any = Field()
    top_k: int = 10
    top_n: int = 4

    def _get_relevant_documents(self, query: str) -> List[Document]:
        docs = self.vectorstore.similarity_search(query, k=self.top_k)
        if not docs:
            return []

        pairs = [(query, doc.page_content) for doc in docs]
        scores = self.cross_encoder.predict(pairs)

        reranked = sorted(zip(docs, scores), key=lambda x: x[1], reverse=True)[:self.top_n]
        for doc, score in reranked:
            doc.metadata["rerank_score"] = float(score)
            doc.metadata["citation"] = f'{doc.metadata.get("source")} (paras {doc.metadata.get("paragraph_start")}-{doc.metadata.get("paragraph_end")})'
        return [doc for doc, _ in reranked]

def format_citation(metadata: dict) -> str:
    """
    Format a citation string from a document's metadata.
    Expected keys in metadata: 'source' (filename or title),
    'paragraph_start' and 'paragraph_end' (ints, start and end paragraph indices).
    """
    source = metadata.get("source", "Unknown Source")
    para_start = metadata.get("paragraph_start")
    para_end = metadata.get("paragraph_end")
    if para_start is not None and para_end is not None:
        if para_start == para_end:
            return f"{source} (paragraph {para_start})"
        else:
            return f"{source} (paragraphs {para_start}-{para_end})"
    # Fallback: if only a single 'paragraph' or none
    single_para = metadata.get("paragraph")
    if single_para is not None:
        return f"{source} (paragraph {single_para})"
    return source



from sentence_transformers import CrossEncoder
from langchain.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI

# Initialize the FAISS vector store and CrossEncoder model (already containing Baha'i text embeddings)
# (In practice, these would be created/loaded elsewhere and passed in.)
# Load index and metadata
index = faiss.read_index(INDEX_PATH)

with open(META_PATH, "r", encoding="utf-8") as f:
    chunks = [json.loads(line) for line in f]

documents = [
    Document(
        page_content=c["text"],
        metadata={
            "source": os.path.basename(c["source_path"]),
        }
    )
    for c in chunks
]

embedding_model = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
docstore = InMemoryDocstore(dict(enumerate(documents)))
index_to_docstore_id = {i: i for i in range(len(documents))}

vectorstore = FAISS(index=index, docstore=docstore, index_to_docstore_id=index_to_docstore_id, embedding_function=embedding_model)
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")  # load the reranker model:contentReference[oaicite:5]{index=5}

# Create the hybrid retriever
hybrid_retriever = HybridFaissCrossEncoderRetriever(
    vectorstore=vectorstore,
    cross_encoder=cross_encoder,
    top_k=10,
    top_n=4
)


# Use the retriever in a ConversationalRetrievalChain (or any QA chain)
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0, openai_api_key=OPENAI_API_KEY)
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key="answer"  # 👈 This tells memory what to store
)

qa_chain = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=hybrid_retriever,
    memory=memory,
    return_source_documents=True,
    output_key="answer"
)

# Ask a question (the retriever will fetch relevant Baha'i text chunks, rerank them, and pass to the LLM)
query = "What is the concept of unity in the Baha'i Faith?"
result = qa_chain.invoke({
    "question": query,
    "chat_history": []
})
answer = result["answer"]
sources = result.get("source_documents", [])
print("Question:\n", query)

print("Answer:\n", answer)
print("\nSources:")
for doc in sources:
    print("-", format_citation(doc.metadata))

print("CHAT:\n \n")
while True:
    query = input("You: ").strip()
    if query.lower() in {"exit", "quit"}:
        break

    result = qa_chain.invoke({"question": query})
    answer = result["answer"]
    sources = result.get("source_documents", [])

    print("\nAssistant:", answer)
    print("\nSources:")
    for doc in sources:
        print("-", format_citation(doc.metadata))

Question:
 What is the concept of unity in the Baha'i Faith?
Answer:
 The concept of unity in the Baha'i Faith is central to its teachings and is expressed in several key principles. It emphasizes the oneness of humanity, asserting that all people are part of a single human family. This unity is seen as indissoluble, changeless, eternal, and everlasting, with each divine Manifestation representing and expressing this unity.

Baha'is believe that denying one of the Manifestations of God equates to denying all, and that persecution of one is persecution of all. The faith promotes solidarity among mankind and encourages believers to cultivate characteristics such as agreement, fellowship, and love. 

In practice, Baha'is strive to eliminate prejudices and foster a sense of brotherhood among individuals of different backgrounds, religions, and cultures. They aim for the unification of religious beliefs and the reconciliation of religion with science and reason. The ultimate goal is to achi

Evaluation

In [9]:
# ================== GENERATE 20 QA PAIRS ==================
import random, json, re, os
from datetime import datetime

SAVE_EVAL = "/content/drive/MyDrive/rag_cache_bahai/bahai_eval_qa.jsonl"

def pick_candidate_chunks(meta, n=20, min_chars=300):
    # Prefer decently sized chunks and diversify sources
    by_source = {}
    for i, m in enumerate(meta):
        src = os.path.basename(m["source_path"])
        by_source.setdefault(src, []).append((i, m))
    # flatten with a bias: take up to 3 per file randomly
    candidates = []
    for src, items in by_source.items():
        random.shuffle(items)
        take = min(3, len(items))
        for i, m in items[:take]:
            if len(m["text"]) >= min_chars:
                candidates.append((i, m))
    random.shuffle(candidates)
    return candidates[:n]

def clean(s):
    s = s.strip()
    s = re.sub(r"\s+", " ", s)
    return s

GEN_SYS = (
    "You are a careful dataset creator. "
    "Given ONLY the provided CONTEXT, write ONE question whose answer is explicitly stated as a short span. "
    "Return strict JSON with keys: question, answer, rationale. "
    "Rules: (1) The answer MUST be quoted verbatim from the context. "
    "(2) Avoid yes/no. (3) Avoid requiring world knowledge. (4) Keep the question 5-25 words. "
    "(5) Keep the answer under 25 words."
)

def make_qa_from_chunk(chunk, max_attempts=2):
    context = chunk["text"]
    user = (
        "CONTEXT:\n"
        + context
        + "\n\nCreate exactly ONE QA in strict JSON: "
        '{"question":"...", "answer":"...", "rationale":"..."}'
    )
    for _ in range(max_attempts):
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role":"system","content":GEN_SYS},
                      {"role":"user","content":user}],
            temperature=0.7,
            max_tokens=220
        )
        txt = resp.choices[0].message.content.strip()
        # try to find json (robust to stray text)
        m = re.search(r"\{.*\}", txt, flags=re.S)
        if not m:
            continue
        try:
            obj = json.loads(m.group(0))
            q = clean(obj.get("question",""))
            a = clean(obj.get("answer",""))
            r = clean(obj.get("rationale",""))
            if not q or not a:
                continue
            # Ensure the answer is actually a substring of the context (verbatim span)
            if a in context:
                return {
                    "question": q,
                    "answer": a,
                    "rationale": r,
                    "source_file": os.path.basename(chunk["source_path"]),
                    "para_range": f"{chunk.get('para_start','?')}-{chunk.get('para_end','?')}",
                    "chunk_id": chunk["id"],
                    "created_at": datetime.utcnow().isoformat(timespec="seconds") + "Z",
                    "context_preview": clean(context[:300]) + ("..." if len(context) > 300 else "")
                }
        except Exception:
            pass
    return None

# --- Sample chunks and generate ---
random.seed(42)
cands = pick_candidate_chunks(meta, n=20, min_chars=300)
qa_items = []
for i, ch in cands:
    qa = make_qa_from_chunk(ch)
    if qa: qa_items.append(qa)

print(f"Generated {len(qa_items)} QA pairs.")

# Save to JSONL
with open(SAVE_EVAL, "w", encoding="utf-8") as f:
    for item in qa_items:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")
print("✅ Saved eval set to", SAVE_EVAL)

  "created_at": datetime.utcnow().isoformat(timespec="seconds") + "Z",


Generated 19 QA pairs.
✅ Saved eval set to /content/drive/MyDrive/rag_cache_bahai/bahai_eval_qa.jsonl


In [31]:
# ================== EVALUATE RAG: EM / F1 ==================
import json, re
from collections import Counter

def normalize_text(s):
    s = s.lower()
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def f1_score(pred, gold):
    p_tokens = normalize_text(pred).split()
    g_tokens = normalize_text(gold).split()
    if len(p_tokens) == 0 or len(g_tokens) == 0:
        return float(p_tokens == g_tokens)
    common = Counter(p_tokens) & Counter(g_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(p_tokens)
    recall = num_same / len(g_tokens)
    return 2 * precision * recall / (precision + recall)

def is_exact_match(pred, gold):
    return normalize_text(pred) == normalize_text(gold)

# Load eval set
eval_items = [json.loads(l) for l in open(SAVE_EVAL, "r", encoding="utf-8")]

rows = []
exact_match_total = 0
f1_total = 0

for ex in eval_items:
    q = ex["question"]
    gold = ex["answer"]
    # Retrieve & answer with your existing functions
    hits = retrieve(q, k=8)   # you can bump to 8–10 if desired
    hits = rerank(question, hits, k=4)
    if not hits:
        pred = ""
    else:
        # Use the top retrieved chunk only (simple baseline)
        pred, src = answer(q, hits)
        print("predict print", pred)

    exact_match = 1.0 if is_exact_match(pred, gold) else 0.0
    f1 = f1_score(pred, gold)
    exact_match_total += exact_match
    f1_total += f1

    rows.append({
        "question": q,
        "pred": pred,
        "gold": gold,
        "Exact Match": round(exact_match, 3),
        "F1": round(f1, 3),
        "source_ref": f"{ex['source_file']} (paras {ex['para_range']})",
        "top_source": src if hits else "N/A"
    })

print(f"\nResults on {len(eval_items)} items:")
print(f"  Exact Match: {exact_match_total/len(eval_items):.3f}")
print(f"  F1: {f1_total/len(eval_items):.3f}\n")

# Show a few rows
for r in rows[:5]:
    print("Q:", r["question"])
    print("PRED:", r["pred"])
    print("GOLD:", r["gold"])
    print("Exact Match / F1:", r["Exact Match"], "/", r["F1"])
    print("REF:", r["source_ref"])
    print("TOP SOURCE:", r["top_source"])
    print("-"*80)

predict print To achieve the lofty Cause mentioned in the text, it is required to teach the Cause of God with ardor, to proclaim His message, and to strive persistently despite setbacks and challenges. The text emphasizes the importance of unity, determination, and the guidance of the nations and peoples of the world. Additionally, it highlights the necessity of appointing those who can promote the Cause if one is unable to do so themselves. 

Evidence: [2], [3], [4]
predict print The root of the tree from which all mankind has sprung is God, who is described as the Creator and the one who has created all humanity as branches and leaves of the same tree. This is evidenced by the statements that "all are the children of God, fruit upon the one tree of His love" and "God alone is Creator, and all are creatures of His might" [3]. Additionally, Bahá’u’lláh emphasizes that "ye are the fruits of one tree and the leaves of one branch" [2]. 

Citations: [2], [3]
predict print In response to th

In [17]:
JUDGE_SYS = "You are a strict evaluator of groundedness. Reply with only Supported or Not Supported."

def judge_supported(context_snip, question, pred):
    prompt = (
        f"Context:\n{context_snip}\n\n"
        f"Question: {question}\n"
        f"Answer: {pred}\n\n"
        "Is the answer fully supported by the context?"
    )
    out = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role":"system","content":JUDGE_SYS},
                  {"role":"user","content":prompt}],
        temperature=0,
        max_tokens=2
    )
    return out.choices[0].message.content.strip()

# Example on first item
if eval_items:
    h = retrieve(eval_items[0]["question"], k=1)
    ctx = h if h else ""
    print("Judge:", judge_supported(ctx, eval_items[0]["question"], rows[0]["pred"]))


Judge: Not Supported


--------- OLD CODE -----------

In [None]:

# ---- Configuration ----
txt_folder_path = '/content/drive/MyDrive/extracted_texts/English_writtings'
CHUNK_SIZE_PARAGRAPHS = 3
CHUNK_OVERLAP_PARAGRAPHS = 1
EMBEDDING_MODEL_NAME = "intfloat/e5-base-v2"
LLM_MODEL_NAME = "google/flan-t5-base"   # small, free; max length typically 512
RESERVED_TOKENS = 128    # tokens reserved for prompt + answer (tune if needed)
OVERLAP_TOKENS = 50      # token-level overlap when splitting very long chunks
EMBED_BATCH_SIZE = 32
TOP_K_RETRIEVE = 5
TOP_K_RETURN = 3
ANSWER_MAX_NEW_TOKENS = 128

# ---- Imports ----
import os, glob, re, json
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import torch
from transformers import AutoTokenizer, pipeline


In [None]:
# ---- 1. Load documents ----
def load_txt_files(folder_path):
    txt_files = sorted(glob.glob(os.path.join(folder_path, "**", "*.txt"), recursive=True))
    documents = []
    for p in txt_files:
        with open(p, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read().strip()
            if text:
                documents.append({"path": p, "text": text})
    return documents

docs = load_txt_files(txt_folder_path)
print(f"Loaded {len(docs)} txt files.")

# ---- 2. Paragraph split and chunking ----
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME, use_fast=True)
model_max_len = tokenizer.model_max_length if tokenizer.model_max_length < 100000 else 512
context_token_limit = max(64, model_max_len - RESERVED_TOKENS)
print("Model max length:", model_max_len, "| context token limit:", context_token_limit)

def split_text_by_token_windows(text, max_tokens, overlap_tokens=50):
    ids = tokenizer.encode(text, add_special_tokens=False)
    if len(ids) <= max_tokens:
        return [text]
    windows = []
    step = max(1, max_tokens - overlap_tokens)
    for start in range(0, len(ids), step):
        end = min(len(ids), start + max_tokens)
        window_ids = ids[start:end]
        window_text = tokenizer.decode(window_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        windows.append(window_text)
        if end >= len(ids):
            break
    return windows

# ------------ 3) split ALL docs safely by tokens ------------
final_chunks = []
uid = 0
for doc in docs:
    # first split by markers and blank lines for readability
    sections = re.split(r'(\* \* \*|\n\s*\n)', doc["text"])
    sections = [s.strip() for s in sections if s.strip()]
    for sec in sections:
        windows = split_text_by_token_windows(sec, context_token_limit, OVERLAP_TOKENS)
        for wi, w in enumerate(windows):
            ids = tokenizer.encode(w, add_special_tokens=False)
            final_chunks.append({
                "id": uid,
                "source_path": doc["path"],
                "subchunk_index": wi,
                "text": w,
                "tok_len": len(ids)
            })
            uid += 1

print("Final (token-safe) chunks:", len(final_chunks))
long_chunks = [c for c in final_chunks if c["tok_len"] > context_token_limit]
print("Chunks still > limit (should be 0):", len(long_chunks))



Loaded 37 txt files.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1107 > 512). Running this sequence through the model will result in indexing errors


Model max length: 512 | context token limit: 384
Final (token-safe) chunks: 9072
Chunks still > limit (should be 0): 1060


In [None]:
# ------------ 4) embed + FAISS ------------
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)
texts_for_embedding = [f"passage: {c['text']}" for c in final_chunks]

embeddings = embedder.encode(texts_for_embedding, batch_size=EMBED_BATCH_SIZE,
                             convert_to_numpy=True, show_progress_bar=True)
embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)
print("Built FAISS index, vectors:", index.ntotal)

with open("/content/final_chunks_metadata.json", "w", encoding="utf-8") as f:
    json.dump(final_chunks, f, ensure_ascii=False, indent=2)


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

Batches:   0%|          | 0/284 [00:00<?, ?it/s]

Built FAISS index, vectors: 9072


In [None]:
# ------------ 5) QA pipeline ------------
device = 0 if torch.cuda.is_available() else -1
qa_pipe = pipeline("text2text-generation", model=LLM_MODEL_NAME, tokenizer=tokenizer, device=device)

def retrieve_top_k(query, k=5):
    q_emb = embedder.encode([f"query: {query}"], convert_to_numpy=True)
    q_emb = q_emb / np.linalg.norm(q_emb, axis=1, keepdims=True)
    scores, inds = index.search(q_emb, k)
    results = []
    for score, i in zip(scores[0], inds[0]):
        if i < 0: continue
        results.append({"score": float(score), "chunk": final_chunks[i]})
    return results

def generate_answer_with_chunk_old(question, chunk_meta):
    context = chunk_meta["text"]
    prompt = (
        "Use ONLY the context below to answer the question. "
        "If the answer is not present, say 'I don't know'.\n\n"
        f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
    )
    out = qa_pipe(prompt, max_new_tokens=ANSWER_MAX_NEW_TOKENS, do_sample=False)[0]["generated_text"]
    snippet = context.strip().replace("\n", " ")
    if len(snippet) > 300:   # limit to 300 chars
        snippet = snippet[:300] + "..."

    source_info = f"{os.path.basename(chunk_meta['source_path'])}: \"{snippet}\""
    return out.strip(), source_info
    # source = os.path.basename(chunk_meta["source_path"])
    # return out.strip(), f"{source} (subchunk {chunk_meta['subchunk_index']})"

def generate_answer_with_chunk(question, chunk_meta):
    context = chunk_meta["text"]

    prompt = (
        "You are a precise research assistant.\n"
        "Use ONLY the context below to answer the question.\n"
        "If the context does not contain the answer, reply 'I don't know'.\n\n"
        f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
    )

    completion = client.chat.completions.create(
        model="gpt-4o-mini",          # ✅ best value on free/cheap tier
        messages=[
            {"role": "system", "content": "You are a factual assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=150,
        temperature=0.0
    )

    answer = completion.choices[0].message.content.strip()

    # make a short text snippet to show as the source instead of chunk number
    snippet = context.strip().replace("\n", " ")
    if len(snippet) > 200:
        snippet = snippet[:200] + "..."
    source_info = f"{os.path.basename(chunk_meta['source_path'])}: \"{snippet}\""

    return answer, source_info

def answer_question(question, top_k_return=3, retrieval_k=6):
    retrieved = retrieve_top_k(question, k=retrieval_k)
    answers = []
    for r in retrieved[:top_k_return]:
        ans, source = generate_answer_with_chunk(question, r["chunk"])
        answers.append({"question": question, "answer": ans, "source": source, "score": r["score"]})
    return answers

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu


In [None]:
# ------------ 6) Example run ------------
q = "who is the baha'i?"
question = "Where is Mount Carmel mentioned in the text?"
# results = answer_question(q, top_k_return=TOP_K_RETURN, retrieval_k=TOP_K_RETRIEVE)
results = answer_question(question, top_k_return=3, retrieval_k=10)

for a in results:
    print("question:\n", a["question"], "\n")
    print("answer:\n", a["answer"], "\n")
    print("source:\n", a["source"], "\n")
    print("-"*80)

NameError: name 'answer_question' is not defined