In [1]:
import os
import glob

from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_openai import ChatOpenAI

import gradio as gr

load_dotenv(override=True)

An error occurred: module 'importlib.metadata' has no attribute 'packages_distributions'


  from .autonotebook import tqdm as notebook_tqdm


True

# Using langchain loaders to load dataset from local directory -> corpus.jsonl

üëâ Each JSONL line is being passed as a full Python dict
üëâ LangChain expects Document.page_content to be a string

In [2]:
from langchain_community.document_loaders import JSONLoader

loader = JSONLoader(
    file_path="corpus.jsonl",
    # jq_schema="._id",
    jq_schema=".text",
    json_lines=True
)

documents = loader.load()
print(f"Loaded {len(documents)} documents")


Loaded 973 documents


In [3]:
print(type(documents[0]))
documents[0].page_content[:500]  # Print the first 500 characters of the first document

<class 'langchain_core.documents.base.Document'>


'"Privileged" Nominations Every year the Senate routinely considers whether to give its advice and consent to hundreds of nominations submitted by the President. From start to finish, the confirmation process can be a lengthy one, even for relatively noncontroversial nominees. Each nomination is typically referred to one or more committees having subject matter jurisdiction over the position. Committees may bear a significant workload in examining nominees√¢\x80\x94often including questionnaires, option'

In [4]:
documents[4].metadata  # Print the metadata of the first document


{'source': 'C:\\Users\\hitan\\OneDrive\\Desktop\\me space\\Projects_RAG\\gov_docs_rag\\gov_docs_rag\\corpus.jsonl',
 'seq_num': 5}

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

textsplitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

chunks = textsplitter.split_documents(documents)
print(f"Split into {len(chunks)} chunks")

Split into 61375 chunks


In [6]:
chunks[0].page_content # Print the first chunk

'"Privileged" Nominations Every year the Senate routinely considers whether to give its advice and consent to hundreds of nominations submitted by the President. From start to finish, the confirmation process can be a lengthy one, even for relatively noncontroversial nominees. Each nomination is typically referred to one or more committees having subject matter jurisdiction over the position. Committees may bear a significant workload in examining nominees√¢\x80\x94often including questionnaires, optional public hearings, and individual meetings with Senators√¢\x80\x94to determine whether to report a nomination to the full Senate. Once a committee has reported a nomination or been discharged from its further consideration, the Senate may take up a nomination for deliberation, though a cloture process may be required to ensure a final vote to confirm. As part of an effort to streamline the nominations process during the 112 th Congress (2011-2012), a standing order of the Senate, S.Res.

In [7]:
# embeddings = HuggingFaceEmbeddings(model_name = "BAAI/bge-large-en-v1.5")

# db_name = "vector_db"

# if os.path.exists(db_name):
#     Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()
    
# vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
# print(vectorstore._collection.count())

In [8]:
## relaoding the vectordb without re-embedding

embedding = HuggingFaceEmbeddings(
    model_name="BAAI/bge-large-en-v1.5"
)

vectordb = Chroma(
    persist_directory="./vector_db",
    embedding_function=embedding
)

setting up langchain objects

In [9]:
load_dotenv(override=True)

key = os.getenv("api-key")
print("KEY:", key)
print("Starts with sk-or-:", key.startswith("sk-or-") if key else None)

KEY: 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
Starts with sk-or-: True


In [11]:
retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k":5})
llm = ChatOpenAI(model_name="mistralai/devstral-2512:free", 
                 openai_api_base="https://openrouter.ai/api/v1",
                 temperature=0.3,
                 max_tokens=512,
                 openai_api_key=os.getenv("api-key"))

In [13]:
retriever.invoke("What is Defense Acquisition System?")

[Document(id='9ee824a8-0899-4f42-b359-989b22099a94', metadata={'seq_num': 274, 'source': 'C:\\Users\\hitan\\OneDrive\\Desktop\\me space\\Projects_RAG\\gov_docs_rag\\gov_docs_rag\\corpus.jsonl'}, page_content='Enterprise Acquisition Services. Services Provided: Computing Services operates the DISA Data Centers, which provide mainframe and server processing operations, data storage, and other information technology services and support across the Department of Defense (DOD). Telecommunications Services provides secure telecommunications services, including the Defense Information Systems Network. Enterprise Acquisitions Services provides contracting services for information technology and telecommunications acquisitions from the commercial sector and contracting support to the Defense Information Systems Network programs and other customers through DISA‚Äôs Defense Information Technology Contracting Organization. Approach to Allocating Costs: The Defense Information Systems Agency (DISA)

In [12]:
llm.invoke("What is Defense Acquisition System?")

AIMessage(content='The **Defense Acquisition System (DAS)** is the structured process used by the **U.S. Department of Defense (DoD)** to acquire weapons, equipment, services, and other capabilities needed to support national defense. It is governed by **DoD Directive 5000.01** and **DoD Instruction 5000.02**, which outline policies and procedures for acquiring defense systems efficiently, cost-effectively, and in a timely manner.\n\n### **Key Components of the Defense Acquisition System:**\n1. **Acquisition Framework**\n   - The system follows a **phased approach** (from concept to disposal) to manage risk, cost, and performance.\n   - It emphasizes **affordability, innovation, and rapid fielding** of capabilities.\n\n2. **Acquisition Phases (DoD 5000.02)**\n   The process is divided into **six key phases**:\n   - **Materiel Solution Analysis (MSA)** ‚Äì Identifies capabilities needed and explores potential solutions.\n   - **Technology Maturation & Risk Reduction (TMRR)** ‚Äì Develop

In [None]:
SYSTEM_PROMPT_TEMPLATE = """
You are a knowledgeable, strict assistant representing the details from government documents.
You are chatting with a user about government policies.
If relevant, use the given context to answer any question.
If you don't know the answer, say so.
Context:
{context}
"""

In [None]:
def answer_question(question: str, history):
    docs = retriever.invoke(question)
    context = "\n\n".join(doc.page_content for doc in docs)####################
    system_prompt = SYSTEM_PROMPT_TEMPLATE.format(context=context)
    response = llm.invoke([SystemMessage(content=system_prompt), HumanMessage(content=question)])
    return response.content

In [None]:
# answer_question("which program had an estimated combined value of $20 billion?", [])

"Based on the provided context, none of the mentioned programs had an estimated combined value of $20 billion. The largest funding figure mentioned is the **$1.2 trillion** estimate over 30 years for the **U.S. nuclear arsenal modernization program**, as referenced in the Congressional Budget Office's projection.\n\nIf you're referring to a different program or need further clarification, please specify."

In [None]:
# gr.ChatInterface(answer_question).launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [None]:
llm1 = ChatOpenAI(
    model="nvidia/nemotron-3-nano-30b-a3b:free",  # example
    openai_api_key=os.getenv("api-key"),
    openai_api_base="https://openrouter.ai/api/v1",
    temperature=0.3,
    max_tokens=512,
)

In [None]:
#create another llm instance with different model and test the performance


In [None]:
from rag_evaluator import RAGEvaluator
from langchain_core.messages import SystemMessage, HumanMessage
import json
from dataclasses import dataclass
from typing import List, Tuple

# ============================================================================
# DATA CLASSES FOR DETAILED EVALUATION
# ============================================================================

@dataclass
class RetrievalResult:
    """Results from retrieval evaluation"""
    mrr: float  # Mean Reciprocal Rank
    ndcg: float  # Normalized Discounted Cumulative Gain
    keywords_found: int
    total_keywords: int
    keyword_coverage: float  # percentage
    retrieved_docs: List[str]

@dataclass
class AnswerResult:
    """Results from answer evaluation"""
    accuracy: float  # 0-5
    completeness: float  # 0-5
    relevance: float  # 0-5
    feedback: str

# ============================================================================
# EVALUATION FUNCTIONS
# ============================================================================

def evaluate_retrieval(question, keywords, k=4):
    """
    Evaluate retrieval quality
    Returns: RetrievalResult with metrics
    """
    docs = retreiver.invoke(question)
    
    # Get doc content
    doc_contents = [doc.page_content.lower() for doc in docs]
    combined_content = " ".join(doc_contents)
    
    # Count keywords found
    keywords_found = 0
    for keyword in keywords:
        if keyword.lower() in combined_content:
            keywords_found += 1
    
    # Calculate keyword coverage
    keyword_coverage = (keywords_found / len(keywords) * 100) if keywords else 0
    
    # Calculate MRR (Mean Reciprocal Rank)
    mrr = 0.0
    for idx, doc in enumerate(docs):
        doc_lower = doc.page_content.lower()
        found_keywords = sum(1 for kw in keywords if kw.lower() in doc_lower)
        if found_keywords > 0:
            mrr = 1.0 / (idx + 1)
            break
    
    # Calculate nDCG (simplified version)
    dcg = 0.0
    idcg = 0.0
    for idx in range(min(len(docs), len(keywords))):
        doc_lower = docs[idx].page_content.lower()
        found_keywords = sum(1 for kw in keywords if kw.lower() in doc_lower)
        dcg += found_keywords / (idx + 1)
        idcg += 1 / (idx + 1)
    
    ndcg = dcg / idcg if idcg > 0 else 0.0
    
    return RetrievalResult(
        mrr=mrr,
        ndcg=ndcg,
        keywords_found=keywords_found,
        total_keywords=len(keywords),
        keyword_coverage=keyword_coverage,
        retrieved_docs=[doc.page_content[:200] for doc in docs]
    )

def evaluate_answer(question, reference_answer, keywords):
    """
    Evaluate generated answer quality
    Returns: (AnswerResult, generated_answer, retrieved_docs)
    """
    # Get retrieval
    docs = retreiver.invoke(question)
    context = "\n\n".join(doc.page_content for doc in docs)
    
    # Generate answer
    system_prompt = SYSTEM_PROMPT_TEMPLATE.format(context=context)
    response = llm1.invoke([
        SystemMessage(content=system_prompt),
        HumanMessage(content=question)
    ])
    generated_answer = response.content
    
    # Evaluate accuracy (keyword matching)
    answer_lower = generated_answer.lower()
    keywords_matched = sum(1 for kw in keywords if kw.lower() in answer_lower)
    accuracy = (keywords_matched / len(keywords) * 5) if keywords else 5.0
    accuracy = min(accuracy, 5.0)
    
    # Evaluate completeness (length and detail)
    ref_words = len(reference_answer.split())
    gen_words = len(generated_answer.split())
    length_ratio = gen_words / ref_words if ref_words > 0 else 1.0
    completeness = 5.0 if 0.7 <= length_ratio <= 1.3 else 3.0
    
    # Evaluate relevance (semantic similarity)
    ref_words_set = set(reference_answer.lower().split())
    gen_words_set = set(generated_answer.lower().split())
    similarity = len(ref_words_set & gen_words_set) / len(ref_words_set | gen_words_set)
    relevance = similarity * 5.0
    
    # Generate feedback
    feedback_parts = []
    if keywords_matched < len(keywords):
        missing = [kw for kw in keywords if kw.lower() not in answer_lower]
        feedback_parts.append(f"Missing keywords: {', '.join(missing[:3])}")
    if length_ratio < 0.7:
        feedback_parts.append("Answer too short - lacks detail")
    elif length_ratio > 1.3:
        feedback_parts.append("Answer too long - could be more concise")
    if similarity < 0.5:
        feedback_parts.append("Answer diverges from reference - may have hallucinations")
    
    feedback = " | ".join(feedback_parts) if feedback_parts else "Good answer!"
    
    return AnswerResult(
        accuracy=accuracy,
        completeness=completeness,
        relevance=relevance,
        feedback=feedback
    ), generated_answer, docs

# ============================================================================
# MAIN EVALUATION LOOP
# ============================================================================

def run_detailed_evaluation(test_cases, num_tests=None):
    """
    Run detailed evaluation on test cases
    """
    if num_tests:
        test_cases = test_cases[:num_tests]
    
    results = {
        'individual_tests': [],
        'summary': {}
    }
    
    print("\n" + "=" * 80)
    print("DETAILED RAG SYSTEM EVALUATION")
    print("=" * 80)
    print(f"Total tests to run: {len(test_cases)}\n")
    
    # Track metrics
    all_accuracy = []
    all_completeness = []
    all_relevance = []
    all_mrr = []
    all_ndcg = []
    all_keyword_coverage = []
    
    for test_number, test in enumerate(test_cases, 1):
        # Print test info
        print(f"\n{'=' * 80}")
        print(f"Test #{test_number}")
        print(f"{'=' * 80}")
        print(f"Question: {test['question']}")
        print(f"Keywords: {test['keywords']}")
        print(f"Category: {test['category']}")
        print(f"Reference Answer: {test['reference_answer'][:150]}...")
        
        # Retrieval Evaluation
        print(f"\n{'=' * 80}")
        print("Retrieval Evaluation")
        print(f"{'=' * 80}")
        
        retrieval_result = evaluate_retrieval(
            test['question'], 
            test['keywords']
        )
        
        print(f"MRR: {retrieval_result.mrr:.4f}")
        print(f"nDCG: {retrieval_result.ndcg:.4f}")
        print(f"Keywords Found: {retrieval_result.keywords_found}/{retrieval_result.total_keywords}")
        print(f"Keyword Coverage: {retrieval_result.keyword_coverage:.1f}%")
        print(f"\nRetrieved Documents:")
        for i, doc in enumerate(retrieval_result.retrieved_docs, 1):
            print(f"  {i}. {doc}...")
        
        # Answer Evaluation
        print(f"\n{'=' * 80}")
        print("Answer Evaluation")
        print(f"{'=' * 80}")
        
        answer_result, generated_answer, retrieved_docs = evaluate_answer(
            test['question'],
            test['reference_answer'],
            test['keywords']
        )
        
        print(f"\nGenerated Answer:\n{generated_answer}")
        print(f"\nFeedback:\n{answer_result.feedback}")
        print("\nScores:")
        print(f"  Accuracy: {answer_result.accuracy:.2f}/5")
        print(f"  Completeness: {answer_result.completeness:.2f}/5")
        print(f"  Relevance: {answer_result.relevance:.2f}/5")
        print(f"  Overall: {(answer_result.accuracy + answer_result.completeness + answer_result.relevance)/3:.2f}/5")
        print(f"{'=' * 80}\n")
        
        # Store results
        results['individual_tests'].append({
            'test_number': test_number,
            'question': test['question'],
            'category': test['category'],
            'keywords': test['keywords'],
            'generated_answer': generated_answer,
            'reference_answer': test['reference_answer'],
            'retrieval': {
                'mrr': retrieval_result.mrr,
                'ndcg': retrieval_result.ndcg,
                'keyword_coverage': retrieval_result.keyword_coverage,
                'keywords_found': retrieval_result.keywords_found
            },
            'answer': {
                'accuracy': answer_result.accuracy,
                'completeness': answer_result.completeness,
                'relevance': answer_result.relevance,
                'feedback': answer_result.feedback
            }
        })
        
        # Collect metrics
        all_accuracy.append(answer_result.accuracy)
        all_completeness.append(answer_result.completeness)
        all_relevance.append(answer_result.relevance)
        all_mrr.append(retrieval_result.mrr)
        all_ndcg.append(retrieval_result.ndcg)
        all_keyword_coverage.append(retrieval_result.keyword_coverage)
    
    # Calculate summary
    results['summary'] = {
        'total_tests': len(test_cases),
        'avg_accuracy': sum(all_accuracy) / len(all_accuracy) if all_accuracy else 0,
        'avg_completeness': sum(all_completeness) / len(all_completeness) if all_completeness else 0,
        'avg_relevance': sum(all_relevance) / len(all_relevance) if all_relevance else 0,
        'avg_mrr': sum(all_mrr) / len(all_mrr) if all_mrr else 0,
        'avg_ndcg': sum(all_ndcg) / len(all_ndcg) if all_ndcg else 0,
        'avg_keyword_coverage': sum(all_keyword_coverage) / len(all_keyword_coverage) if all_keyword_coverage else 0,
    }
    
    return results

# ============================================================================
# PRINT SUMMARY REPORT
# ============================================================================

def print_summary_report(results):
    """Print summary statistics"""
    summary = results['summary']
    
    print("\n" + "=" * 80)
    print("EVALUATION SUMMARY REPORT")
    print("=" * 80)
    print(f"\nTotal Tests Run: {summary['total_tests']}")
    print(f"\nAverage Scores:")
    print(f"  Accuracy:         {summary['avg_accuracy']:.2f}/5.0")
    print(f"  Completeness:     {summary['avg_completeness']:.2f}/5.0")
    print(f"  Relevance:        {summary['avg_relevance']:.2f}/5.0")
    print(f"  Overall:          {(summary['avg_accuracy'] + summary['avg_completeness'] + summary['avg_relevance'])/3:.2f}/5.0")
    
    print(f"\nRetrieval Metrics:")
    print(f"  Mean Reciprocal Rank (MRR): {summary['avg_mrr']:.4f}")
    print(f"  nDCG (Normalized Gains):    {summary['avg_ndcg']:.4f}")
    print(f"  Keyword Coverage:           {summary['avg_keyword_coverage']:.1f}%")
    print(f"\n{'=' * 80}\n")

# ============================================================================
# USAGE
# ============================================================================

# Load test data
import json
test_cases = []
with open('tests.jsonl', 'r') as f:
    for line in f:
        test_cases.append(json.loads(line.strip()))

# Run detailed evaluation (first 5 tests for quick demo)
results = run_detailed_evaluation(test_cases, num_tests=None)

# Print summary
print_summary_report(results)

# Save all results
with open('detailed_evaluation_results.json', 'w') as f:
    json.dump(results, f, indent=2)
    print("‚úÖ Saved detailed results to: detailed_evaluation_results.json")


FileNotFoundError: [Errno 2] No such file or directory: 'tests.jsonl'