# 1. Install required packages

We install all the dependencies needed for building a
Retrieval-Augmented Generation (RAG) pipeline.
These include LangChain components, Hugging Face models,
ChromaDB for vector storage, and PyTorch for GPU acceleration.

In [None]:
# NOTE(review): `somepackage` looks like a leftover placeholder rather than a
# real dependency of this notebook — confirm before running the install.
%pip install somepackage -qq langchain langchain-community langchain-core langchain-text-splitters langchain-huggingface sentence-transformers chromadb transformers torch accelerate unstructured

# 2. Import libraries and set configuration

Here we import the necessary modules and define paths, constants,
and model settings.
We also suppress warnings to keep the notebook output clean.

In [None]:
from pathlib import Path
import json
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain_core.prompts import PromptTemplate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch
import warnings
warnings.filterwarnings("ignore")

# JSON file used both as the document source (default argument of
# load_documents_from_json) and as the list of evaluation prompts.
PROMPTS_FILE = "data/test_data.json"
# Directory in which the Chroma vector store is persisted.
PERSIST_DIR = "data/chroma_db"
# Model name handed to HuggingFaceEmbeddings.
EMBED_MODEL = "all-MiniLM-L6-v2"
# chunk_size / chunk_overlap handed to RecursiveCharacterTextSplitter.
CHUNK_SIZE = 400
CHUNK_OVERLAP = 50
# Default number of hits requested from each similarity search.
TOP_K_RESULTS = 5
# Minimum relevance score the best hit must reach; otherwise a query is
# treated as having no relevant results.
RELEVANCE_THRESHOLD = 0.3
# Model id loaded by initialize_local_llm (tokenizer and seq2seq model).
LLM_MODEL = "MBZUAI/LaMini-Flan-T5-248M"
# Generation length cap passed to the text2text-generation pipeline.
MAX_NEW_TOKENS = 100
# NOTE(review): passed to the pipeline together with do_sample=False, so it
# presumably has no effect on generation — confirm the intended decoding mode.
LLM_TEMPERATURE = 0.2
# True when a CUDA device is available; selects dtype and device placement.
USE_GPU = torch.cuda.is_available()

# Prompt sent to the LLM; {context} and {question} are filled in by
# generate_rag_response via PromptTemplate.
PROMPT_TEMPLATE = """Answer the question about Apollo 11 based on the context below. If you cannot answer based on the context, say "I don't have enough information to answer that."

Context:
{context}

Question: {question}

Answer:"""

# 3. Initialize embedding model and text splitter

The embedding model converts text into numeric vectors, while the text
splitter breaks long documents into manageable chunks for retrieval.

In [None]:
# Embedding model used to turn chunks and queries into vectors.
embedder = HuggingFaceEmbeddings(
    model_name=EMBED_MODEL,
)

# Splitter that cuts the source text into overlapping chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

# 4. Load the local language model

We initialize a small, local LLM (LaMini-Flan-T5) that can run on CPU or GPU.
This model will later generate answers based on retrieved context.

In [None]:
def initialize_local_llm():
    """Load the local seq2seq model and wrap it as a LangChain LLM.

    The tokenizer and model named by ``LLM_MODEL`` are loaded and placed in a
    Hugging Face ``text2text-generation`` pipeline. On GPU the model is loaded
    in float16 with ``device_map="auto"``; on CPU it is loaded in float32.

    Returns:
        HuggingFacePipeline: LangChain wrapper around the generation pipeline.
    """
    tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        LLM_MODEL,
        torch_dtype=torch.float16 if USE_GPU else torch.float32,
        device_map="auto" if USE_GPU else None,
        low_cpu_mem_usage=True,
    )

    pipeline_kwargs = {
        "model": model,
        "tokenizer": tokenizer,
        "max_new_tokens": MAX_NEW_TOKENS,
        # NOTE(review): temperature/top_p are sampling settings, but
        # do_sample=False selects non-sampling decoding, so they are presumably
        # ignored. Kept unchanged to preserve the current outputs.
        "temperature": LLM_TEMPERATURE,
        "repetition_penalty": 1.2,
        "do_sample": False,
        "top_p": 0.95,
    }
    if not USE_GPU:
        # Only pin the device on the CPU path. With device_map="auto" the model
        # has already been placed by accelerate, and the pipeline must not be
        # given an explicit `device` for such a model.
        pipeline_kwargs["device"] = -1

    pipe = pipeline("text2text-generation", **pipeline_kwargs)
    return HuggingFacePipeline(pipeline=pipe)


llm = initialize_local_llm()

# 5. Load documents from JSON

We read the source text and its metadata directly from a JSON file.
We then clean the metadata and split the text into chunks.

In [None]:
def load_documents_from_json(json_path=PROMPTS_FILE):
    """Load the source text from a JSON file and split it into chunks.

    The JSON object is expected to provide a ``"source_text"`` string and an
    optional ``"metadata"`` object. Every chunk receives a copy of the
    metadata (list/dict values converted to strings) plus a fixed ``"topic"``
    and a ``"section"`` label built from the metadata's ``"sections"`` entry.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        list: The chunk documents produced by ``splitter``, or an empty list
        when the file does not exist or contains no source text.
    """
    data_path = Path(json_path)
    if not data_path.exists():
        print(f"JSON file not found at: {json_path}")
        return []

    with open(data_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # `or` also covers an explicit JSON null for either field.
    source_text = data.get("source_text") or ""
    raw_metadata = data.get("metadata") or {}

    if not source_text.strip():
        print("No source text found in JSON.")
        return []

    # Build the section label from the raw value, before lists are turned
    # into strings below. Joining the stringified list would iterate over its
    # characters and produce "[, ', a, ..." instead of "a, b".
    sections = raw_metadata.get("sections", ["General"])
    if isinstance(sections, (list, tuple)):
        section_label = ", ".join(str(section) for section in sections)
    else:
        section_label = str(sections)

    # Lists and dicts are converted to strings so that every metadata value is
    # a scalar; other values are kept as they are.
    base_metadata = {
        key: str(value) if isinstance(value, (list, dict)) else value
        for key, value in raw_metadata.items()
    }
    base_metadata["topic"] = "Apollo 11"
    base_metadata["section"] = section_label

    split_docs = splitter.create_documents([source_text])
    for doc in split_docs:
        # Each chunk gets its own dict so later edits do not leak across chunks.
        doc.metadata = base_metadata.copy()

    print(f"Loaded and split {len(split_docs)} chunks from JSON.")
    return split_docs

# 6. Build Chroma vector store

Here we embed the document chunks and save them into a local vector database (Chroma).
This enables fast similarity-based retrieval of relevant context later.

In [None]:
def build_chroma_store(docs, persist_dir=PERSIST_DIR):
    """Embed the chunks and store them in a persisted Chroma database.

    Any collection already stored under ``persist_dir`` is dropped first, so
    running this again replaces the previous contents instead of adding the
    same chunks a second time.

    Args:
        docs: Chunk documents to embed and index.
        persist_dir: Directory in which Chroma keeps its data.

    Returns:
        Chroma: The populated vector store.

    Raises:
        ValueError: If ``docs`` is empty.
    """
    if not docs:
        # Fail early with a clear message, before touching an existing store.
        raise ValueError(
            "No documents to index; check that the source JSON was loaded."
        )

    if Path(persist_dir).exists():
        # Chroma.from_documents adds to an existing collection, so remove the
        # previous one to avoid duplicate chunks on a re-run.
        Chroma(
            persist_directory=persist_dir,
            embedding_function=embedder,
        ).delete_collection()

    db = Chroma.from_documents(
        documents=docs,
        embedding=embedder,
        persist_directory=persist_dir,
    )
    # NOTE(review): newer Chroma releases persist automatically and this call
    # is presumably a deprecated no-op there; kept for older versions.
    db.persist()
    return db


# 7. Calling the Load Document Function 

This cell loads the source document (text and metadata) from the JSON file, and
splits it into smaller chunks for embedding.

In [None]:
# Read the JSON source and split it into chunks. An empty list means the file
# was not found or contained no source text.
documents = load_documents_from_json()

# 8. Calling the Build Chroma Function

This cell builds a Chroma vector database
that stores those embeddings for efficient similarity search.

Once the database is built, it’s saved to disk,
so you only need to run this cell once, unless you change or add new data.

Running it again will overwrite the existing database.

In [None]:
# Embed the chunks and persist them under PERSIST_DIR. `db` is the vector store
# that the query functions search.
db = build_chroma_store(documents)

# 9. Define query and response generation

These functions retrieve the most relevant text chunks and use the
LLM to answer a question.

In [None]:
def query_database(query_text, k=TOP_K_RESULTS, threshold=RELEVANCE_THRESHOLD):
    """Return the scored hits for a query, or nothing if the best hit is weak.

    Args:
        query_text: Text to search the vector store for.
        k: Number of hits to request.
        threshold: Score the first hit must not fall below.

    Returns:
        list: The ``(document, score)`` pairs from the store, or ``[]`` when
        there are no hits or the first hit scores below ``threshold``.
    """
    hits = db.similarity_search_with_relevance_scores(query_text, k=k)

    # Only the first hit is compared with the threshold; when it passes, all
    # hits are returned unchanged.
    if not hits:
        return []
    top_score = hits[0][1]
    return [] if top_score < threshold else hits

def generate_rag_response(
    query_text, k=TOP_K_RESULTS, threshold=RELEVANCE_THRESHOLD, verbose=False
):
    """Answer a question from retrieved context using the local LLM.

    Args:
        query_text: The question to answer.
        k: Number of chunks to retrieve.
        threshold: Relevance score the best chunk must reach.
        verbose: When True, print the query, answer and source names.

    Returns:
        dict: Always contains the keys ``"answer"``, ``"sources"``,
        ``"context"``, ``"prompt"`` and ``"scores"``. When nothing relevant is
        found, the answer is a fixed message and the other fields are empty.
    """
    # Reuse the shared retrieval + threshold logic instead of duplicating it.
    results = query_database(query_text, k=k, threshold=threshold)

    if not results:
        return {
            "answer": "No relevant information found.",
            "sources": [],
            "context": "",
            "prompt": "",
            "scores": [],
        }

    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    scores = [score for _doc, score in results]
    prompt_template = PromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    if llm is None:
        return {
            "answer": "LLM not initialized.",
            "sources": [],
            "context": context_text,
            "prompt": prompt,
            "scores": scores,
        }

    response_text = llm.invoke(prompt)
    sources = [doc.metadata.get("source", "Unknown") for doc, _score in results]

    if verbose:
        print(f"\nQuery: {query_text}")
        print(f"\nAnswer: {response_text}")
        # str() guards against non-string source values in the metadata.
        print(f"\nSources: {', '.join([Path(str(s)).name for s in sources])}")

    return {
        "answer": response_text,
        "sources": sources,
        "context": context_text,
        "prompt": prompt,
        "scores": scores,
    }
    
def ask(query_text):
    """Answer ``query_text`` verbosely and return only the answer text."""
    return generate_rag_response(query_text, verbose=True)["answer"]

# 10. Load evaluation prompts

We load a list of test questions from a JSON file.
Each question is labeled with a category (e.g., summarization, reasoning, or RAG).

In [None]:
# Read the evaluation prompts. UTF-8 is stated explicitly, matching how
# load_documents_from_json reads the same file, so the result does not depend
# on the platform's default encoding.
with open(PROMPTS_FILE, "r", encoding="utf-8") as f:
    prompts_data = json.load(f)

prompts = prompts_data["prompts"]
print(f"Loaded {len(prompts)} evaluation prompts")
print("\nCategories:")
# Report how many prompts fall into each of the known categories.
for category in ["summarization", "reasoning", "rag"]:
    count = sum(1 for p in prompts if p["category"] == category)
    print(f"  - {category.title()}: {count} prompts")

# 11. Run automated evaluation

For each question, we generate an answer using the RAG system and print
both the model’s response and the expected answer (if provided).

In [None]:
# One record per evaluated prompt.
results = []

for p in prompts:
    question = p["prompt"]
    expected = p.get("expected_answer")
    print(f"\nTesting Prompt {p['id']}: {question}")

    # Run the full retrieve-then-generate pipeline without its own printing.
    result = generate_rag_response(question, verbose=False)
    answer = result["answer"]

    record = {
        "id": p["id"],
        "category": p["category"],
        "difficulty": p["difficulty"],
        "prompt": question,
        "answer": answer,
        "expected": expected,
        # Length of the context string in characters.
        "context_used": len(result["context"]),
        "top_sources": result["sources"],
    }
    results.append(record)

    print(f" Model Answer: {answer}")
    # Show the reference answer only when one was provided.
    if expected:
        print(f" Expected: {expected}")
