In [40]:
import os
import glob

from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_openai import ChatOpenAI

import gradio as gr



In [41]:
load_dotenv()


True

In [42]:
#using lanchain's loaders
folders = glob.glob("knowledge-base/*")

documents=[]

for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs={'encoding' : 'utf-8'})
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
        documents.append(doc)
print(len(documents))

76


In [43]:
documents[1]

Document(metadata={'source': 'knowledge-base\\company\\careers.md', 'doc_type': 'company'}, page_content="# Careers at Insurellm\n\n## Why Join Insurellm?\n\nAt Insurellm, we're not just building software—we're revolutionizing an entire industry. Since our founding in 2015, we've evolved from a high-growth startup to a lean, profitable company with 32 highly talented employees managing 32 active contracts across all eight of our product lines.\n\nAfter reaching 200 employees in 2020, we strategically restructured in 2022-2023 to focus on sustainable growth, operational excellence, and building a world-class remote-first culture. Today, we're a tight-knit team of exceptional professionals who deliver outsized impact through automation, AI, and strategic focus on high-value enterprise clients—from regional insurers to global reinsurance partners.\n\n### Our Culture\n\nWe live by our core values every day:\n- **Innovation First**: We encourage experimentation and creative problem-solving\

In [44]:
# divide into chunks using the recursivetextsplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 800, chunk_overlap = 250)
chunks= text_splitter.split_documents(documents) 
chunks[0]

Document(metadata={'source': 'knowledge-base\\company\\about.md', 'doc_type': 'company'}, page_content='# About Insurellm\n\nInsurellm was founded by Avery Lancaster in 2015 as an insurance tech startup designed to disrupt an industry in need of innovative products. Its first product was Markellm, the marketplace connecting consumers with insurance providers.\n\nThe company experienced rapid growth in its first five years, expanding its product portfolio to include Carllm (auto insurance portal), Homellm (home insurance portal), and Rellm (enterprise reinsurance platform). By 2020, Insurellm had reached a peak of 200 employees with 12 offices across the US.')

In [45]:
chunks[100]

Document(metadata={'source': 'knowledge-base\\contracts\\Contract with Evergreen Life Insurance for Lifellm.md', 'doc_type': 'contracts'}, page_content='---\n\n## Features\n\nEvergreen Life Insurance will receive the following Starter Tier features:\n\n1. **AI-Powered Underwriting:** Accelerated underwriting process analyzing:\n   - Medical histories and prescription databases (Milliman IntelliScript)\n   - Motor vehicle records (MVRs)\n   - Credit-based insurance scores\n   - Lifestyle and occupation risk factors\n   - Automated underwriting decisions for standard risks\n\n2. **Risk Assessment:** AI-driven mortality risk modeling considering:\n   - Age, gender, and family medical history\n   - Current health conditions and lab values\n   - Tobacco and alcohol use\n   - High-risk activities and occupations')

### Pick an embedding model and create a vector database

In [46]:
embeddings = HuggingFaceEmbeddings(model_name = "all-MiniLM-L6-v2")

db_name = "vector_db"

if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()
    
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
print(vectorstore._collection.count())

552


### setting up langchain objects: 
### 1)retreiver    2)llm

In [47]:
retreiver = vectorstore.as_retriever(search_type="similarity",search_kwargs={"k": 4})
# llm = ChatGoogleGenerativeAI(
#     model="gemini-3-flash-preview",
#     temperature=0.3,
#     max_output_tokens=512,
#     google_api_key=os.getenv("GOOGLE_API_KEY")
# )
llm = ChatOpenAI(
    model="xiaomi/mimo-v2-flash:free",  # example
    openai_api_key=os.getenv("OPENROUTER_API_KEY"),
    openai_api_base="https://openrouter.ai/api/v1",
    temperature=0.3,
    max_tokens=512,
)

In [48]:
retreiver.invoke("Who is Avery?")

[Document(id='1a0a718c-6438-4757-b76a-18f55b29c68f', metadata={'source': 'knowledge-base\\employees\\Avery Lancaster.md', 'doc_type': 'employees'}, page_content="## Other HR Notes\n- **Professional Development**: Avery has actively participated in leadership training programs and industry conferences, representing Insurellm and fostering partnerships.  \n- **Diversity & Inclusion Initiatives**: Avery has championed a commitment to diversity in hiring practices, seeing visible improvements in team representation since 2021.  \n- **Work-Life Balance**: Feedback revealed concerns regarding work-life balance, which Avery has approached by implementing flexible working conditions and ensuring regular check-ins with the team.\n- **Community Engagement**: Avery led community outreach efforts, focusing on financial literacy programs, particularly aimed at underserved populations, improving Insurellm's corporate social responsibility image."),
 Document(id='b5450b57-cf81-48d4-ad88-aea2d7094309'

In [49]:
llm.invoke("Who is Avery?")

AIMessage(content='Because "Avery" is a common name, it could refer to several different people or entities. To give you the best answer, here are the most notable individuals and uses of the name:\n\n**1. Avery Brooks (Actor)**\n*   **Known for:** Playing **Captain Benjamin Sisko** in *Star Trek: Deep Space Nine* and Hawk in *Spenser: For Hire*.\n*   **Significance:** He is one of the most prominent actors in the *Star Trek* franchise.\n\n**2. Avery (Rapper)**\n*   **Real name:** Avery Sexton.\n*   **Known for:** Being a member of the hip-hop collective **Seshollowaterboyz** alongside artists like Bones, Xavier Wulf, and Chris Travis. He is known for his lo-fi, underground rap style.\n\n**3. Avery Schreiber (Comedian)**\n*   **Known for:** His stand-up career in the 1970s and 80s, as well as his memorable role as "Jack" in the 1985 film *The Garbage Pail Kids Movie*.\n\n**4. Avery (The Sims)**\n*   **Context:** In the life simulation video game *The Sims 4*, Avery is a pre-made charac

### Putting it all together

In [50]:
SYSTEM_PROMPT_TEMPLATE = """
You are a knowledgeable, friendly assistant representing the company Insurellm.
You are chatting with a user about Insurellm.
If relevant, use the given context to answer any question.
If you don't know the answer, say so.
Context:
{context}
"""

In [51]:
def answer_question(question: str, history):
    docs = retreiver.invoke(question)
    context = "\n\n".join(doc.page_content for doc in docs)
    system_prompt = SYSTEM_PROMPT_TEMPLATE.format(context=context)
    response = llm.invoke([SystemMessage(content=system_prompt), HumanMessage(content=question)])
    return response.content
    

In [52]:
# answer_question("Who is Averi Lancaster?", [])

In [53]:
# gr.ChatInterface(answer_question).launch(inbrowser=True)


In [54]:
llm1 = ChatOpenAI(
    model="mistralai/devstral-2512:free",  # example
    openai_api_key=os.getenv("OPENROUTER_API_KEY"),
    openai_api_base="https://openrouter.ai/api/v1",
    temperature=0.3,
    max_tokens=512,
)

In [57]:
from rag_evaluator import RAGEvaluator
from langchain_core.messages import SystemMessage, HumanMessage
import json
from dataclasses import dataclass
from typing import List, Tuple

# ============================================================================
# DATA CLASSES FOR DETAILED EVALUATION
# ============================================================================

@dataclass
class RetrievalResult:
    """Results from retrieval evaluation"""
    mrr: float  # Mean Reciprocal Rank
    ndcg: float  # Normalized Discounted Cumulative Gain
    keywords_found: int
    total_keywords: int
    keyword_coverage: float  # percentage
    retrieved_docs: List[str]

@dataclass
class AnswerResult:
    """Results from answer evaluation"""
    accuracy: float  # 0-5
    completeness: float  # 0-5
    relevance: float  # 0-5
    feedback: str

# ============================================================================
# EVALUATION FUNCTIONS
# ============================================================================

def evaluate_retrieval(question, keywords, k=4):
    """
    Evaluate retrieval quality
    Returns: RetrievalResult with metrics
    """
    docs = retreiver.invoke(question)
    
    # Get doc content
    doc_contents = [doc.page_content.lower() for doc in docs]
    combined_content = " ".join(doc_contents)
    
    # Count keywords found
    keywords_found = 0
    for keyword in keywords:
        if keyword.lower() in combined_content:
            keywords_found += 1
    
    # Calculate keyword coverage
    keyword_coverage = (keywords_found / len(keywords) * 100) if keywords else 0
    
    # Calculate MRR (Mean Reciprocal Rank)
    mrr = 0.0
    for idx, doc in enumerate(docs):
        doc_lower = doc.page_content.lower()
        found_keywords = sum(1 for kw in keywords if kw.lower() in doc_lower)
        if found_keywords > 0:
            mrr = 1.0 / (idx + 1)
            break
    
    # Calculate nDCG (simplified version)
    dcg = 0.0
    idcg = 0.0
    for idx in range(min(len(docs), len(keywords))):
        doc_lower = docs[idx].page_content.lower()
        found_keywords = sum(1 for kw in keywords if kw.lower() in doc_lower)
        dcg += found_keywords / (idx + 1)
        idcg += 1 / (idx + 1)
    
    ndcg = dcg / idcg if idcg > 0 else 0.0
    
    return RetrievalResult(
        mrr=mrr,
        ndcg=ndcg,
        keywords_found=keywords_found,
        total_keywords=len(keywords),
        keyword_coverage=keyword_coverage,
        retrieved_docs=[doc.page_content[:200] for doc in docs]
    )

def evaluate_answer(question, reference_answer, keywords):
    """
    Evaluate generated answer quality
    Returns: (AnswerResult, generated_answer, retrieved_docs)
    """
    # Get retrieval
    docs = retreiver.invoke(question)
    context = "\n\n".join(doc.page_content for doc in docs)
    
    # Generate answer
    system_prompt = SYSTEM_PROMPT_TEMPLATE.format(context=context)
    response = llm1.invoke([
        SystemMessage(content=system_prompt),
        HumanMessage(content=question)
    ])
    generated_answer = response.content
    
    # Evaluate accuracy (keyword matching)
    answer_lower = generated_answer.lower()
    keywords_matched = sum(1 for kw in keywords if kw.lower() in answer_lower)
    accuracy = (keywords_matched / len(keywords) * 5) if keywords else 5.0
    accuracy = min(accuracy, 5.0)
    
    # Evaluate completeness (length and detail)
    ref_words = len(reference_answer.split())
    gen_words = len(generated_answer.split())
    length_ratio = gen_words / ref_words if ref_words > 0 else 1.0
    completeness = 5.0 if 0.7 <= length_ratio <= 1.3 else 3.0
    
    # Evaluate relevance (semantic similarity)
    ref_words_set = set(reference_answer.lower().split())
    gen_words_set = set(generated_answer.lower().split())
    similarity = len(ref_words_set & gen_words_set) / len(ref_words_set | gen_words_set)
    relevance = similarity * 5.0
    
    # Generate feedback
    feedback_parts = []
    if keywords_matched < len(keywords):
        missing = [kw for kw in keywords if kw.lower() not in answer_lower]
        feedback_parts.append(f"Missing keywords: {', '.join(missing[:3])}")
    if length_ratio < 0.7:
        feedback_parts.append("Answer too short - lacks detail")
    elif length_ratio > 1.3:
        feedback_parts.append("Answer too long - could be more concise")
    if similarity < 0.5:
        feedback_parts.append("Answer diverges from reference - may have hallucinations")
    
    feedback = " | ".join(feedback_parts) if feedback_parts else "Good answer!"
    
    return AnswerResult(
        accuracy=accuracy,
        completeness=completeness,
        relevance=relevance,
        feedback=feedback
    ), generated_answer, docs

# ============================================================================
# MAIN EVALUATION LOOP
# ============================================================================

def run_detailed_evaluation(test_cases, num_tests=None):
    """
    Run detailed evaluation on test cases
    """
    if num_tests:
        test_cases = test_cases[:num_tests]
    
    results = {
        'individual_tests': [],
        'summary': {}
    }
    
    print("\n" + "=" * 80)
    print("DETAILED RAG SYSTEM EVALUATION")
    print("=" * 80)
    print(f"Total tests to run: {len(test_cases)}\n")
    
    # Track metrics
    all_accuracy = []
    all_completeness = []
    all_relevance = []
    all_mrr = []
    all_ndcg = []
    all_keyword_coverage = []
    
    for test_number, test in enumerate(test_cases, 1):
        # Print test info
        print(f"\n{'=' * 80}")
        print(f"Test #{test_number}")
        print(f"{'=' * 80}")
        print(f"Question: {test['question']}")
        print(f"Keywords: {test['keywords']}")
        print(f"Category: {test['category']}")
        print(f"Reference Answer: {test['reference_answer'][:150]}...")
        
        # Retrieval Evaluation
        print(f"\n{'=' * 80}")
        print("Retrieval Evaluation")
        print(f"{'=' * 80}")
        
        retrieval_result = evaluate_retrieval(
            test['question'], 
            test['keywords']
        )
        
        print(f"MRR: {retrieval_result.mrr:.4f}")
        print(f"nDCG: {retrieval_result.ndcg:.4f}")
        print(f"Keywords Found: {retrieval_result.keywords_found}/{retrieval_result.total_keywords}")
        print(f"Keyword Coverage: {retrieval_result.keyword_coverage:.1f}%")
        print(f"\nRetrieved Documents:")
        for i, doc in enumerate(retrieval_result.retrieved_docs, 1):
            print(f"  {i}. {doc}...")
        
        # Answer Evaluation
        print(f"\n{'=' * 80}")
        print("Answer Evaluation")
        print(f"{'=' * 80}")
        
        answer_result, generated_answer, retrieved_docs = evaluate_answer(
            test['question'],
            test['reference_answer'],
            test['keywords']
        )
        
        print(f"\nGenerated Answer:\n{generated_answer}")
        print(f"\nFeedback:\n{answer_result.feedback}")
        print("\nScores:")
        print(f"  Accuracy: {answer_result.accuracy:.2f}/5")
        print(f"  Completeness: {answer_result.completeness:.2f}/5")
        print(f"  Relevance: {answer_result.relevance:.2f}/5")
        print(f"  Overall: {(answer_result.accuracy + answer_result.completeness + answer_result.relevance)/3:.2f}/5")
        print(f"{'=' * 80}\n")
        
        # Store results
        results['individual_tests'].append({
            'test_number': test_number,
            'question': test['question'],
            'category': test['category'],
            'keywords': test['keywords'],
            'generated_answer': generated_answer,
            'reference_answer': test['reference_answer'],
            'retrieval': {
                'mrr': retrieval_result.mrr,
                'ndcg': retrieval_result.ndcg,
                'keyword_coverage': retrieval_result.keyword_coverage,
                'keywords_found': retrieval_result.keywords_found
            },
            'answer': {
                'accuracy': answer_result.accuracy,
                'completeness': answer_result.completeness,
                'relevance': answer_result.relevance,
                'feedback': answer_result.feedback
            }
        })
        
        # Collect metrics
        all_accuracy.append(answer_result.accuracy)
        all_completeness.append(answer_result.completeness)
        all_relevance.append(answer_result.relevance)
        all_mrr.append(retrieval_result.mrr)
        all_ndcg.append(retrieval_result.ndcg)
        all_keyword_coverage.append(retrieval_result.keyword_coverage)
    
    # Calculate summary
    results['summary'] = {
        'total_tests': len(test_cases),
        'avg_accuracy': sum(all_accuracy) / len(all_accuracy) if all_accuracy else 0,
        'avg_completeness': sum(all_completeness) / len(all_completeness) if all_completeness else 0,
        'avg_relevance': sum(all_relevance) / len(all_relevance) if all_relevance else 0,
        'avg_mrr': sum(all_mrr) / len(all_mrr) if all_mrr else 0,
        'avg_ndcg': sum(all_ndcg) / len(all_ndcg) if all_ndcg else 0,
        'avg_keyword_coverage': sum(all_keyword_coverage) / len(all_keyword_coverage) if all_keyword_coverage else 0,
    }
    
    return results

# ============================================================================
# PRINT SUMMARY REPORT
# ============================================================================

def print_summary_report(results):
    """Print summary statistics"""
    summary = results['summary']
    
    print("\n" + "=" * 80)
    print("EVALUATION SUMMARY REPORT")
    print("=" * 80)
    print(f"\nTotal Tests Run: {summary['total_tests']}")
    print(f"\nAverage Scores:")
    print(f"  Accuracy:         {summary['avg_accuracy']:.2f}/5.0")
    print(f"  Completeness:     {summary['avg_completeness']:.2f}/5.0")
    print(f"  Relevance:        {summary['avg_relevance']:.2f}/5.0")
    print(f"  Overall:          {(summary['avg_accuracy'] + summary['avg_completeness'] + summary['avg_relevance'])/3:.2f}/5.0")
    
    print(f"\nRetrieval Metrics:")
    print(f"  Mean Reciprocal Rank (MRR): {summary['avg_mrr']:.4f}")
    print(f"  nDCG (Normalized Gains):    {summary['avg_ndcg']:.4f}")
    print(f"  Keyword Coverage:           {summary['avg_keyword_coverage']:.1f}%")
    print(f"\n{'=' * 80}\n")

# ============================================================================
# USAGE
# ============================================================================

# Load test data
import json
test_cases = []
with open('tests.jsonl', 'r') as f:
    for line in f:
        test_cases.append(json.loads(line.strip()))

# Run detailed evaluation (first 5 tests for quick demo)
results = run_detailed_evaluation(test_cases, num_tests=None)

# Print summary
print_summary_report(results)

# Save all results
with open('detailed_evaluation_results.json', 'w') as f:
    json.dump(results, f, indent=2)
    print("✅ Saved detailed results to: detailed_evaluation_results.json")



DETAILED RAG SYSTEM EVALUATION
Total tests to run: 150


Test #1
Question: Who won the prestigious IIOTY award in 2023?
Keywords: ['Maxine', 'Thompson', 'IIOTY']
Category: direct_fact
Reference Answer: Maxine Thompson won the prestigious Insurellm Innovator of the Year (IIOTY) award in 2023....

Retrieval Evaluation
MRR: 1.0000
nDCG: 1.0909
Keywords Found: 2/3
Keyword Coverage: 66.7%

Retrieved Documents:
  1. ## Other HR Notes
- Maxine participated in various company-sponsored trainings related to big data technologies and cloud infrastructure.  
- She was recognized for her contributions with the prestigi...
  2. ## Annual Performance History
- **2018**: **3/5** - Adaptable team player but still learning to take initiative.
- **2019**: **4/5** - Demonstrated strong problem-solving skills, outstanding contribut...
  3. ## Annual Performance History
- **2020:**  
  - Completed onboarding successfully.  
  - Met expectations in delivering project milestones.  
  - Received positive fee