In [None]:
# Upgrade pip to avoid dependency resolution issues
!pip install --upgrade pip

# Core utilities
!pip install python-dotenv tqdm

# LangChain modular packages (post-0.1.x architecture)
!pip install \
    langchain-core \
    langchain-community \
    langchain-groq

# Vector store backend
!pip install chromadb

# Embeddings dependencies
!pip install sentence-transformers huggingface-hub

# Optional but commonly required by Chroma + LangChain
!pip install pydantic typing-extensions


Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.3
Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-groq
  Downloading langchain_groq-1.1.1-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain-community)
  Downloading langchain_classic-1.0.0-py3-none-any.whl.metadata (3.9 kB)
Collecting requests>=2.0.0 (from langsmith<1.0.0,>=0.3.45->langchain-core)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-comm

In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

import json
from tqdm import tqdm
from langchain_groq import ChatGroq
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda
from operator import itemgetter

ModuleNotFoundError: No module named 'langchain_groq'

In [None]:
# Mount Google Drive under /content/drive so the project folder stored there
# can be reached from this Colab session.
from google.colab import drive
drive.mount('/content/drive')




Mounted at /content/drive


In [None]:
# List the Drive root to confirm the project folder is present.
!ls /content/drive/MyDrive


 Agentic-RAG-Chatbot  'Colab Notebooks'   Colab_Notebooks   extra


In [None]:
# Work from the project folder so the local modules (dataset_loader, metrics)
# import by name and relative paths resolve inside the project.
%cd "/content/drive/MyDrive/Agentic-RAG-Chatbot"


/content/drive/MyDrive/Agentic-RAG-Chatbot


In [None]:
# List the project folder contents (current working directory).
!ls

app.py		   evaluation_results_1_50.json  notebook.ipynb
Chatbot.ipynb	   main.py			 __pycache__
dataset_loader.py  metrics.py			 requirements.txt


In [None]:
import dataset_loader



In [None]:
# Import existing modules
from dataset_loader import load_squad_v2, prepare_contexts_for_rag
from metrics import evaluate_batch, evaluate_unanswerable

In [None]:
# CONFIGURATION
# Groq API key read from the environment (load_dotenv() is called in the
# imports cell). os.getenv returns None when the variable is not set.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Model name passed to HuggingFaceEmbeddings in setup_rag_system.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Model name passed to ChatGroq in setup_rag_system.
LLM_MODEL = "openai/gpt-oss-20b"
# Forwarded to prepare_contexts_for_rag as chunk_size / chunk_overlap.
# NOTE(review): the unit (characters vs. tokens) is decided in
# dataset_loader.py — confirm there.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
# Number of documents the retriever returns per question (search_kwargs "k").
K_RETRIEVED = 3
# Default for evaluate_on_squad's max_samples; also passed explicitly by main().
MAX_SAMPLES = 50  # Set to None to use all samples

In [None]:
# RAG SETUP
def format_docs(docs):
    """Render retrieved documents as numbered "[Source N]" sections.

    Each document's ``page_content`` is placed under a 1-based
    ``[Source N]`` header; sections are separated by a blank line.
    An empty ``docs`` yields an empty string.
    """
    sections = (
        f"[Source {position}]\n{document.page_content}"
        for position, document in enumerate(docs, start=1)
    )
    return "\n\n".join(sections)


def setup_rag_system(contexts, persist_directory="./chroma_db"):
    """Build the retrieval-augmented QA chain (embeddings, vector store, LLM, prompt).

    If ``persist_directory`` exists and is non-empty, the Chroma index stored
    there is loaded and ``contexts`` is not embedded again. Otherwise a new
    index is built from ``contexts`` and persisted to that directory.

    Note: a reused index is not checked against ``contexts``; delete
    ``persist_directory`` to force a rebuild after changing the chunking or
    the embedding model.

    Args:
        contexts: Documents to index when no persisted index exists; passed to
            ``Chroma.from_documents`` as ``documents``.
        persist_directory: Directory holding the persisted Chroma index.

    Returns:
        A runnable that accepts ``{"question": <str>}`` and returns
        ``{"answer": <str>}``.
    """

    # Embeddings
    embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

    # Reuse the persisted index when present; only embed the corpus when it
    # is missing, so repeated runs do not re-embed and duplicate every chunk.
    if os.path.exists(persist_directory) and os.listdir(persist_directory):
        print(f"Loading existing vector store from {persist_directory}...")
        vectorstore = Chroma(
            persist_directory=persist_directory,
            embedding_function=embedding_model,
        )
        print("Loaded existing index!")
    else:
        print(f"Creating new vector store and saving to {persist_directory}...")
        vectorstore = Chroma.from_documents(
            documents=contexts,
            embedding=embedding_model,
            persist_directory=persist_directory,
        )
        print("Created and saved new index!")

    # Retriever
    retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": K_RETRIEVED}
    )

    # LLM
    llm = ChatGroq(
        model=LLM_MODEL,
        groq_api_key=GROQ_API_KEY,
        temperature=0
    )

    # Prompt
    prompt = ChatPromptTemplate.from_messages([
        (
            "system",
            "You are a question-answering assistant. "
            "Answer the question using ONLY the provided context. "
            "If the answer cannot be found in the context, respond with 'I don't know' or 'The answer is not available in the provided context.'"
        ),
        (
            "human",
            "Context:\n{context}\n\nQuestion:\n{question}\n\nAnswer:"
        )
    ])

    # Built once here instead of being re-composed inside the lambda on every call.
    answer_chain = prompt | llm | StrOutputParser()

    # RAG Chain: retrieve -> format context -> generate, wrapped as {"answer": ...}
    rag_chain = (
        {
            "docs": itemgetter("question") | retriever,
            "question": itemgetter("question"),
        }
        | RunnableLambda(lambda x: {
            "question": x["question"],
            "context": format_docs(x["docs"]),
        })
        | RunnableLambda(lambda x: {
            "answer": answer_chain.invoke({
                "question": x["question"],
                "context": x["context"],
            })
        })
    )

    print("RAG system ready!")
    return rag_chain

In [None]:
# EVALUATION
def evaluate_on_squad(rag_chain, examples, max_samples=MAX_SAMPLES):
    """Run the RAG chain over SQuAD examples and score the predictions.

    Each example is read through the keys ``'question'``, ``'answers'['text']``,
    ``'is_impossible'`` and ``'id'``. When the chain call fails for an example
    (for instance an API rate limit), the error is printed, an empty string is
    recorded as the prediction so the lists stay aligned, and the failure is
    counted in ``failed_samples``. Empty predictions are still scored, so check
    ``failed_samples`` before trusting the metrics.

    Args:
        rag_chain: Runnable invoked with ``{"question": ...}`` whose result
            exposes the prediction under ``"answer"``.
        examples: Sliceable sequence of example dicts.
        max_samples: When truthy, only the first ``max_samples`` examples are
            evaluated; otherwise all of them are.

    Returns:
        ``(results, predictions)`` where ``results`` merges the outputs of
        ``evaluate_batch`` and ``evaluate_unanswerable`` with the sample
        counts, and ``predictions`` holds one string per evaluated example.
    """
    if max_samples:
        examples = examples[:max_samples]
        print(f"Using {max_samples} samples for evaluation")

    predictions = []
    ground_truths_list = []
    is_impossible_list = []
    failed_samples = 0

    print(f"\nEvaluating on {len(examples)} examples...")

    for example in tqdm(examples, desc="Processing"):
        # Only the chain call is guarded: the bookkeeping below runs exactly
        # once per example, so the three lists cannot drift out of step.
        try:
            result = rag_chain.invoke({"question": example['question']})
            prediction = result["answer"]
        except Exception as e:
            print(f"Error processing example {example['id']}: {e}")
            prediction = ""
            failed_samples += 1

        predictions.append(prediction)
        ground_truths_list.append(example['answers']['text'])
        is_impossible_list.append(example['is_impossible'])

    # Evaluate using metrics.py
    metrics = evaluate_batch(predictions, ground_truths_list)
    unanswerable_metrics = evaluate_unanswerable(predictions, is_impossible_list)

    # Combine results
    results = {
        **metrics,
        **unanswerable_metrics,
        'total_samples': len(examples),
        'answerable_samples': sum(1 - x for x in is_impossible_list),
        'unanswerable_samples': sum(is_impossible_list),
        # Examples whose prediction is "" because the chain call raised.
        'failed_samples': failed_samples,
    }

    return results, predictions

In [None]:
def main():
    """Run the SQuAD v2 RAG evaluation end to end and write a JSON report.

    Loads the validation split, chunks the contexts, builds the RAG chain,
    evaluates it with ``MAX_SAMPLES``, prints the metrics, and saves the
    metrics, the first 20 predictions and the run configuration to
    ``evaluation_results.json`` in the current working directory.
    """
    rule = "=" * 60
    print("SQuAD v2 RAG Evaluation Pipeline")

    # Step 1: load the dataset through dataset_loader.py
    print("\nStep 1: Loading dataset...")
    squad_examples = load_squad_v2(split='validation')
    print(f"Loaded {len(squad_examples)} examples")

    # Step 2: chunk the contexts through dataset_loader.py
    print("\nStep 2: Preparing contexts...")
    chunks = prepare_contexts_for_rag(
        squad_examples, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
    )
    print(f"Created {len(chunks)} document chunks")

    # Step 3: build the retrieval + generation chain
    print("\nStep 3: Setting up RAG system...")
    chain = setup_rag_system(chunks)

    # Step 4: score the chain on the examples
    print("\nStep 4: Running evaluation...")
    results, predictions = evaluate_on_squad(chain, squad_examples, MAX_SAMPLES)

    # Step 5: print the summary
    print("\n" + rule)
    print("EVALUATION RESULTS")
    print(rule)
    print(f"F1 Score:              {results['f1']:.4f}")
    print(f"Exact Match (EM):      {results['em']:.4f}")
    print(f"Unanswerable Detection: {results['unanswerable_detection_accuracy']:.4f}")
    print("\nDataset Statistics:")
    print(f"  Total Samples:       {results['total_samples']}")
    print(f"  Answerable:          {results['answerable_samples']}")
    print(f"  Unanswerable:        {results['unanswerable_samples']}")
    print(rule)

    # Step 6: persist metrics, a sample of predictions, and the configuration
    run_config = {
        'embedding_model': EMBEDDING_MODEL,
        'llm_model': LLM_MODEL,
        'chunk_size': CHUNK_SIZE,
        'chunk_overlap': CHUNK_OVERLAP,
        'k_retrieved': K_RETRIEVED,
        'max_samples': MAX_SAMPLES,
    }
    report = {
        'metrics': results,
        'sample_predictions': predictions[:20],  # first 20 kept for inspection
        'config': run_config,
    }
    report_path = 'evaluation_results.json'
    with open(report_path, 'w') as handle:
        json.dump(report, handle, indent=2)

    print(f"\nResults saved to {report_path}")
    print("Evaluation complete!")


In [None]:
# Run the full pipeline: load data, build the RAG chain, evaluate, save the report.
main()

SQuAD v2 RAG Evaluation Pipeline

Step 1: Loading dataset...
Loaded 11873 examples

Step 2: Preparing contexts...
Created 14591 document chunks

Step 3: Setting up RAG system...
Creating new vector store and saving to {persist_directory}...
Created and saved new index!
RAG system ready!

Step 4: Running evaluation...
Using 50 samples for evaluation

Evaluating on 50 examples...


Processing:  66%|██████▌   | 33/50 [01:48<00:53,  3.15s/it]

Error processing example 56dde27d9a695914005b9651: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-20b` in organization `org_01k5stx7hwez8968pj3m4yf370` service tier `on_demand` on tokens per day (TPD): Limit 200000, Used 200000, Requested 448. Please try again in 3m13.535999999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
Error processing example 56dde27d9a695914005b9652: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-20b` in organization `org_01k5stx7hwez8968pj3m4yf370` service tier `on_demand` on tokens per day (TPD): Limit 200000, Used 200000, Requested 632. Please try again in 4m33.024s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}


Processing:  70%|███████   | 35/50 [01:48<00:24,  1.61s/it]

Error processing example 5ad3af11604f3c001a3fec63: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-20b` in organization `org_01k5stx7hwez8968pj3m4yf370` service tier `on_demand` on tokens per day (TPD): Limit 200000, Used 200000, Requested 481. Please try again in 3m27.792s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
Error processing example 5ad3af11604f3c001a3fec64: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-20b` in organization `org_01k5stx7hwez8968pj3m4yf370` service tier `on_demand` on tokens per day (TPD): Limit 200000, Used 199999, Requested 482. Please try again in 3m27.792s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}


Processing:  74%|███████▍  | 37/50 [01:49<00:10,  1.19it/s]

Error processing example 5ad3af11604f3c001a3fec65: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-20b` in organization `org_01k5stx7hwez8968pj3m4yf370` service tier `on_demand` on tokens per day (TPD): Limit 200000, Used 199999, Requested 525. Please try again in 3m46.368s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
Error processing example 56dde2fa66d3e219004dad9b: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-20b` in organization `org_01k5stx7hwez8968pj3m4yf370` service tier `on_demand` on tokens per day (TPD): Limit 200000, Used 199999, Requested 356. Please try again in 2m33.36s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}


Processing:  78%|███████▊  | 39/50 [01:49<00:05,  2.12it/s]

Error processing example 5ad3c626604f3c001a3ff011: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-20b` in organization `org_01k5stx7hwez8968pj3m4yf370` service tier `on_demand` on tokens per day (TPD): Limit 200000, Used 199998, Requested 739. Please try again in 5m18.384s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
Error processing example 5ad3c626604f3c001a3ff012: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-20b` in organization `org_01k5stx7hwez8968pj3m4yf370` service tier `on_demand` on tokens per day (TPD): Limit 200000, Used 199998, Requested 356. Please try again in 2m32.928s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}


Processing:  80%|████████  | 40/50 [01:49<00:03,  2.68it/s]

Error processing example 5ad3c626604f3c001a3ff013: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-20b` in organization `org_01k5stx7hwez8968pj3m4yf370` service tier `on_demand` on tokens per day (TPD): Limit 200000, Used 199998, Requested 453. Please try again in 3m14.832s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}


Processing:  84%|████████▍ | 42/50 [01:49<00:02,  3.80it/s]

Error processing example 56de0f6a4396321400ee257f: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-20b` in organization `org_01k5stx7hwez8968pj3m4yf370` service tier `on_demand` on tokens per day (TPD): Limit 200000, Used 199997, Requested 618. Please try again in 4m25.68s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
Error processing example 5ad3dbc6604f3c001a3ff3e9: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-20b` in organization `org_01k5stx7hwez8968pj3m4yf370` service tier `on_demand` on tokens per day (TPD): Limit 200000, Used 199997, Requested 445. Please try again in 3m10.944s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}


Processing:  88%|████████▊ | 44/50 [01:49<00:01,  5.44it/s]

Error processing example 5ad3dbc6604f3c001a3ff3ea: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-20b` in organization `org_01k5stx7hwez8968pj3m4yf370` service tier `on_demand` on tokens per day (TPD): Limit 200000, Used 199997, Requested 610. Please try again in 4m22.224s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
Error processing example 5ad3dbc6604f3c001a3ff3eb: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-20b` in organization `org_01k5stx7hwez8968pj3m4yf370` service tier `on_demand` on tokens per day (TPD): Limit 200000, Used 199997, Requested 355. Please try again in 2m32.064s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}


Processing:  92%|█████████▏| 46/50 [01:50<00:00,  6.98it/s]

Error processing example 5ad3dbc6604f3c001a3ff3ec: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-20b` in organization `org_01k5stx7hwez8968pj3m4yf370` service tier `on_demand` on tokens per day (TPD): Limit 200000, Used 199996, Requested 614. Please try again in 4m23.52s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
Error processing example 56de0ffd4396321400ee258d: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-20b` in organization `org_01k5stx7hwez8968pj3m4yf370` service tier `on_demand` on tokens per day (TPD): Limit 200000, Used 199996, Requested 542. Please try again in 3m52.416s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}


Processing:  96%|█████████▌| 48/50 [01:50<00:00,  7.94it/s]

Error processing example 56de0ffd4396321400ee258e: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-20b` in organization `org_01k5stx7hwez8968pj3m4yf370` service tier `on_demand` on tokens per day (TPD): Limit 200000, Used 199996, Requested 543. Please try again in 3m52.848s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
Error processing example 56de0ffd4396321400ee258f: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-20b` in organization `org_01k5stx7hwez8968pj3m4yf370` service tier `on_demand` on tokens per day (TPD): Limit 200000, Used 199996, Requested 311. Please try again in 2m12.624s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}


Processing: 100%|██████████| 50/50 [01:50<00:00,  2.21s/it]

Error processing example 5ad3de8b604f3c001a3ff467: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-20b` in organization `org_01k5stx7hwez8968pj3m4yf370` service tier `on_demand` on tokens per day (TPD): Limit 200000, Used 199995, Requested 546. Please try again in 3m53.712s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
Error processing example 5ad3de8b604f3c001a3ff468: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-20b` in organization `org_01k5stx7hwez8968pj3m4yf370` service tier `on_demand` on tokens per day (TPD): Limit 200000, Used 199995, Requested 542. Please try again in 3m51.983999999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}

EVALUATION RESULTS
F1 Score:              0.1123
Exact Match (EM):      0.3000
Unan


