In [10]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API keys read later via os.getenv — confirm.
load_dotenv()

True

In [11]:
import pandas as pd

# Golden QA set: each question in `inputs` is paired, by position, with its
# reference answer in `outputs`.
inputs = [
    "What is RAG system?",
    "What is decision-making loop?",
    "Explain feedback loop mechanisms.",
]

outputs = [
    "A Retrieval-Augmented Generation (RAG) system is a paradigm in Natural Language Processing (NLP) that combines generative and retrieval systems. In its traditional form, it works by retrieving relevant documents or information from external sources (like a database or search engine). This retrieved external knowledge is then used by a generative model to produce responses that are more relevant and contextual.",
    "In the context of Agentic RAG, a decision-making loop is a context-aware process that decides how to process and use retrieved information. Guided by the system's goals and priorities, these loops evaluate the retrieved data, determine its relevance, and decide on the best response to align with the task's objectives.",
    "Feedback loop mechanisms are a central feature of Agentic RAG that enable iterative and continuous learning. Unlike traditional systems where the process ends after a response is generated , these loops allow the system to evaluate its generated content against the desired goals. Based on this feedback, the system refines its performance by adjusting both its retrieval and generation processes in real-time.",
]

# Tabular view of the same pairs: one record (and one DataFrame row) per question.
qa_pairs = [
    dict(question=question, answer=answer)
    for question, answer in zip(inputs, outputs)
]
df = pd.DataFrame(qa_pairs, columns=["question", "answer"])

# Write to csv
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing (e.g. on a fresh checkout).
import os

csv_path = "/GenAI Hackathon/2.0/data/qnadata.csv"
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
df.to_csv(csv_path, index=False)

In [12]:
from langsmith import Client

client = Client()
dataset_name = "llmoops_dataset"

# Store
# Make the cell safe to re-run: creating a dataset whose name already exists
# is rejected by LangSmith, and uploading the examples a second time would
# duplicate them. Reuse the existing dataset when there is one.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
    print(f"Dataset '{dataset_name}' already exists; skipping example upload.")
else:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Input and expected output pairs for llmoops_dataset",
    )
    # One example per golden question, paired by position with its reference answer.
    upload_result = client.create_examples(
        inputs=[{"question": q} for q in inputs],
        outputs=[{"answer": a} for a in outputs],
        dataset_id=dataset.id,
    )
    print(upload_result)

{'example_ids': ['094a0be4-024f-48bc-9241-88ded2268bfe',
  '0c74f7fd-d97f-4667-bfbc-14d3499f3271',
  'ed09d237-a7cc-47e6-b0b4-b25c6d45587a'],
 'count': 3}

In [13]:
import sys
sys.path.append("/GenAI Hackathon/2.0")

from pathlib import Path
from cementGPT_llm_chat.src.document_ingestion.data_ingestion import ChatIngestor
from cementGPT_llm_chat.src.document_chat.retrieval import ConversationalRAG
import os

# Simple file adapter for local file paths
class LocalFileAdapter:
    """Adapter for local file paths to work with ChatIngestor.

    Wraps a file on disk and exposes a ``name`` attribute plus a
    ``getbuffer()`` method.
    NOTE(review): presumably this is the subset of the uploaded-file
    interface that ChatIngestor reads — confirm against the ingestor.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, file_path: str):
        # Full path to the wrapped file.
        self.path = Path(file_path)
        # Base file name only (no directory part).
        self.name = self.path.name

    def getbuffer(self) -> bytes:
        """Return the entire content of the wrapped file as bytes."""
        return self.path.read_bytes()


def answer_ai_report_question(
    inputs: dict,
    data_path: str = "/GenAI Hackathon/2.0/data/AgenticRAGRedefiningRetrieval-AugmentedGenerationforAdaptiveIntelligence.pdf",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    k: int = 5
) -> dict:
    """
    Answer a question about the document at ``data_path`` using RAG.

    Every call builds a fresh index: a new ChatIngestor is created, the
    document is ingested, and a ConversationalRAG instance is loaded from the
    resulting FAISS directory before the question is asked.

    Args:
        inputs: Dictionary containing the question, e.g., {"question": "What is RAG?"}
        data_path: Path to the source document (defaults to the Agentic RAG PDF)
        chunk_size: Size of text chunks for splitting
        chunk_overlap: Overlap between chunks
        k: Number of documents to retrieve

    Returns:
        Dictionary with the answer, e.g., {"answer": "RAG stands for..."}.
        Failures are not raised: a missing/blank question, a missing data
        file, or any exception is reported as text under the "answer" key.
    """
    # Single source of truth for the FAISS base directory, so the ingestor
    # configuration and the load path below cannot drift apart.
    faiss_base = "faiss_index"

    try:
        # Extract question from inputs; a None or whitespace-only question
        # counts as missing instead of being sent through the pipeline.
        question = inputs.get("question") or ""
        if isinstance(question, str):
            question = question.strip()
        if not question:
            return {"answer": "No question provided"}

        # Check if file exists
        if not Path(data_path).exists():
            return {"answer": f"Data file not found: {data_path}"}

        # Create file adapter
        file_adapter = LocalFileAdapter(data_path)

        # Build index using ChatIngestor
        ingestor = ChatIngestor(
            temp_base="data",
            faiss_base=faiss_base,
            use_session_dirs=True
        )

        # Build retriever
        ingestor.built_retriver(
            uploaded_files=[file_adapter],
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            k=k
        )

        # Get session ID and index path (index is stored per session under faiss_base)
        session_id = ingestor.session_id
        index_path = f"{faiss_base}/{session_id}"

        # Create RAG instance and load retriever
        rag = ConversationalRAG(session_id=session_id)
        rag.load_retriever_from_faiss(
            index_path=index_path,
            k=k,
            index_name=os.getenv("FAISS_INDEX_NAME", "index")
        )

        # Get answer (no prior conversation turns)
        answer = rag.invoke(question, chat_history=[])

        return {"answer": answer}

    except Exception as e:
        # Deliberate best-effort: report the failure as the answer text so a
        # batch evaluation keeps running instead of aborting on one example.
        return {"answer": f"Error: {str(e)}"}

In [14]:
# Smoke-test the pipeline end to end on a single sample question
test_input = dict(question="What is RAG system?")
result = answer_ai_report_question(test_input)
print(f"Question: {test_input['question']}")
print(f"\nAnswer: {result['answer']}")

{"timestamp": "2025-10-26T10:39:29.811432Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-10-26T10:39:29.813626Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-10-26T10:39:29.815737Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-10-26T10:39:29.822255Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251026_160929_2ef8a429", "temp_dir": "C:\\GenAI Hackathon\\2.0\\notebook\\data\\session_20251026_160929_2ef8a429", "faiss_dir": "C:\\GenAI Hackathon\\2.0\\notebook\\faiss_index\\session_20251026_160929_2ef8a429", "sessionized": true, "timestamp": "2025-10-26T10:39:29.825687Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "AgenticRAGRedefiningRetrieval-AugmentedGenerationforAdaptiveIntelligence.pdf", "saved_as": "C:\\GenAI Hackathon\\2.0\\n

Question: What is RAG system?

Answer: The RAG system uses available external knowledge from a retrieval component as an extension to the raw knowledge passed to the generative model. It empowers the system to draw abundant external knowledge with a degree of dynamism, capable of dealing with multiple situations and various complex questions. In its traditional form, relevant documents or pieces of information are retrieved from external sources and then consumed by a generative model to produce contextual responses.


In [15]:
from langsmith.evaluation import evaluate, LangChainStringEvaluator

In [16]:
# Run every golden question through the pipeline and print each answer,
# separated by a horizontal rule.
print("Testing all questions from the dataset:\n")
for i, q in enumerate(inputs, start=1):
    test_input = dict(question=q)
    result = answer_ai_report_question(test_input)
    print(f"Q{i}: {q}", f"A{i}: {result['answer']}\n", sep="\n")
    print("-" * 80, end="\n\n")

Testing all questions from the dataset:



{"timestamp": "2025-10-26T10:40:51.340995Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-10-26T10:40:51.342508Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-10-26T10:40:51.344252Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-10-26T10:40:51.350116Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251026_161051_2b2e2301", "temp_dir": "C:\\GenAI Hackathon\\2.0\\notebook\\data\\session_20251026_161051_2b2e2301", "faiss_dir": "C:\\GenAI Hackathon\\2.0\\notebook\\faiss_index\\session_20251026_161051_2b2e2301", "sessionized": true, "timestamp": "2025-10-26T10:40:51.355189Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "AgenticRAGRedefiningRetrieval-AugmentedGenerationforAdaptiveIntelligence.pdf", "saved_as": "C:\\GenAI Hackathon\\2.0\\n

Q1: What is RAG system?
A1: The RAG system uses available external knowledge from a retrieval component as an extension to the raw knowledge passed to the generative model. This integration empowers the system to draw abundant external knowledge with a degree of dynamism, capable of dealing with multiple situations and various complex questions. In its traditional form, relevant documents or pieces of information are retrieved from external sources and then consumed by a generative model to produce contextual responses.

--------------------------------------------------------------------------------



{"timestamp": "2025-10-26T10:40:58.512313Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-10-26T10:40:58.513579Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-10-26T10:40:58.514799Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-10-26T10:40:58.520334Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251026_161058_ae650ef7", "temp_dir": "C:\\GenAI Hackathon\\2.0\\notebook\\data\\session_20251026_161058_ae650ef7", "faiss_dir": "C:\\GenAI Hackathon\\2.0\\notebook\\faiss_index\\session_20251026_161058_ae650ef7", "sessionized": true, "timestamp": "2025-10-26T10:40:58.526468Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "AgenticRAGRedefiningRetrieval-AugmentedGenerationforAdaptiveIntelligence.pdf", "saved_as": "C:\\GenAI Hackathon\\2.0\\n

Q2: What is decision-making loop?
A2: Agentic RAG uses context-aware decision-making loops to decide how to process and use retrieved information, which are guided by the system's goals and priorities. The system evaluates harvested data, determines its relevance, and makes decisions to create the best responses that are consistent with the stated mission. The system continues to refine its decision making and make further decisions from the feedback of previous actions.

--------------------------------------------------------------------------------



{"timestamp": "2025-10-26T10:41:05.045771Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-10-26T10:41:05.047032Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-10-26T10:41:05.047892Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-10-26T10:41:05.052354Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251026_161105_d3c86576", "temp_dir": "C:\\GenAI Hackathon\\2.0\\notebook\\data\\session_20251026_161105_d3c86576", "faiss_dir": "C:\\GenAI Hackathon\\2.0\\notebook\\faiss_index\\session_20251026_161105_d3c86576", "sessionized": true, "timestamp": "2025-10-26T10:41:05.058069Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "AgenticRAGRedefiningRetrieval-AugmentedGenerationforAdaptiveIntelligence.pdf", "saved_as": "C:\\GenAI Hackathon\\2.0\\n

Q3: Explain feedback loop mechanisms.
A3: Agentic RAG contains feedback loop mechanisms that enable the system to evaluate retrieved and generated content with respect to desired goals and outcomes. The system's performance is refined simultaneously on the retrieval and generation processes based on this feedback. This iterative learning model allows the system to evolve with every interaction.

--------------------------------------------------------------------------------



In [18]:
import os
from langsmith.evaluation import evaluate, LangChainStringEvaluator
from langchain_google_genai import ChatGoogleGenerativeAI

# Judge model for the built-in evaluator: a Gemini chat model, configured
# with temperature 0.0 and a 512-token output cap.
eval_llm = ChatGoogleGenerativeAI(
    google_api_key=os.getenv("GOOGLE_API_KEY"),
    model="gemini-2.0-flash",
    max_output_tokens=512,
    temperature=0.0,
)

# The "cot_qa" string evaluator receives the judge model through its config.
qa_evaluator = [LangChainStringEvaluator("cot_qa", config=dict(llm=eval_llm))]

# Run the RAG function over the stored dataset and grade it with the evaluator above.
experiment_results = evaluate(
    answer_ai_report_question,
    data="llmoops_dataset",
    evaluators=qa_evaluator,
    experiment_prefix="test-llmoops_dataset-qa-rag",
    metadata=dict(variant="Agentic RAG ...", chunk_size=1000, chunk_overlap=200, k=5),
)


View the evaluation results for experiment: 'test-llmoops_dataset-qa-rag-ae551b4f' at:
https://smith.langchain.com/o/2df5139e-03a9-47c7-8f97-fe54947713a4/datasets/517197ea-52e0-44a3-8e3b-131a2e1c400e/compare?selectedSessions=5ff0007d-6f0d-43a0-96ad-cbdb8e3d8fd7




  from .autonotebook import tqdm as notebook_tqdm
0it [00:00, ?it/s]{"timestamp": "2025-10-26T10:50:31.147192Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-10-26T10:50:31.150165Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-10-26T10:50:31.152882Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-10-26T10:50:31.164731Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251026_162031_7de5525b", "temp_dir": "C:\\GenAI Hackathon\\2.0\\notebook\\data\\session_20251026_162031_7de5525b", "faiss_dir": "C:\\GenAI Hackathon\\2.0\\notebook\\faiss_index\\session_20251026_162031_7de5525b", "sessionized": true, "timestamp": "2025-10-26T10:50:31.171677Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "AgenticRAGRedefiningRetrieval-AugmentedGenerationfo

### Custom Correctness Evaluator
This section defines an LLM-as-a-Judge evaluator that scores how well each generated answer aligns, semantically and factually, with its reference answer.

In [19]:
from langsmith.schemas import Run, Example
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate

def _message_text(content) -> str:
    """Flatten chat-message content (a string or a list of parts) into plain text."""
    if isinstance(content, str):
        return content
    parts = []
    for part in content or []:
        if isinstance(part, str):
            parts.append(part)
        elif isinstance(part, dict) and isinstance(part.get("text"), str):
            parts.append(part["text"])
    return "".join(parts)


def _parse_judge_response(response_text: str) -> tuple:
    """Extract (reasoning, verdict, score) from the judge's 'Reasoning:/Verdict:' reply."""
    reasoning = ""
    verdict = ""

    for raw_line in response_text.splitlines():
        # Tolerate list numbering and markdown decoration before the label,
        # e.g. "2. **Verdict:** CORRECT".
        line = raw_line.strip().lstrip("0123456789.)*#>- ").strip()
        lowered = line.lower()
        if lowered.startswith("reasoning:"):
            reasoning = line[len("reasoning:"):].strip(" *")
        elif lowered.startswith("verdict:"):
            verdict = line[len("verdict:"):].strip(" *")

    # A substring test ('"CORRECT" in verdict') is wrong here because
    # "INCORRECT" contains "CORRECT"; compare from the start of the token.
    normalized = verdict.upper().strip(" *_`\"'[].")
    score = 1 if normalized.startswith("CORRECT") else 0
    return reasoning, verdict, score


def correctness_evaluator(run: Run, example: Example) -> dict:
    """
    Custom LLM-as-a-Judge evaluator for correctness.

    Correctness means how well the actual model output matches the reference output
    in terms of factual accuracy, coverage, and meaning.

    Args:
        run: The Run object containing the actual outputs
        example: The Example object containing the expected outputs

    Returns:
        dict with 'key' ("correctness"), 'score' (1 for correct, 0 for
        incorrect), 'reasoning' (the judge's explanation) and 'comment'.
        The score is 0 when the judge's verdict cannot be parsed or when the
        judge call raises.
    """
    # Extract actual and expected outputs; any of these dicts may be None
    # (for example when the run produced no outputs), so fall back to empty.
    actual_output = (run.outputs or {}).get("answer", "")
    expected_output = (example.outputs or {}).get("answer", "")
    input_question = (example.inputs or {}).get("question", "")

    # Define the evaluation prompt
    eval_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an evaluator whose job is to judge correctness.

Correctness means how well the actual model output matches the reference output in terms of factual accuracy, coverage, and meaning.

- If the actual output matches the reference output semantically (even if wording differs), it should be marked correct.
- If the output misses key facts, introduces contradictions, or is factually incorrect, it should be marked incorrect.

Do not penalize for stylistic or formatting differences unless they change meaning."""),
        ("human", """<example>
<input>
{input}
</input>

<output>
Expected Output: {expected_output}

Actual Output: {actual_output}
</output>
</example>

Please grade the following agent run given the input, expected output, and actual output.
Focus only on correctness (semantic and factual alignment).

Respond with:
1. A brief reasoning (1-2 sentences)
2. A final verdict: either "CORRECT" or "INCORRECT"

Format your response as:
Reasoning: [your reasoning]
Verdict: [CORRECT or INCORRECT]""")
    ])

    # Initialize the judge LLM (Gemini)
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.5-pro",
        temperature=0
    )

    # Create chain and invoke
    chain = eval_prompt | llm

    try:
        response = chain.invoke({
            "input": input_question,
            "expected_output": expected_output,
            "actual_output": actual_output
        })

        # Parse the reply and convert the verdict to a score
        # (1 for correct, 0 for incorrect or unparseable).
        reasoning, verdict, score = _parse_judge_response(
            _message_text(response.content)
        )

        return {
            "key": "correctness",
            "score": score,
            "reasoning": reasoning,
            "comment": f"Verdict: {verdict}"
        }

    except Exception as e:
        # Best-effort: a failed judge call is recorded as score 0 rather than
        # aborting the whole experiment. Same keys as the success branch.
        return {
            "key": "correctness",
            "score": 0,
            "reasoning": f"Error during evaluation: {str(e)}",
            "comment": f"Error during evaluation: {str(e)}"
        }

In [20]:
# Run evaluation with the custom correctness evaluator
from langsmith.evaluation import evaluate

# Evaluators to apply: only the custom LLM-as-a-Judge correctness check
evaluators = [correctness_evaluator]

dataset_name = "llmoops_dataset"

# Launch the experiment: run the RAG function over the stored dataset and
# grade each answer; the metadata is attached to the experiment.
experiment_results = evaluate(
    answer_ai_report_question,
    data=dataset_name,
    evaluators=evaluators,
    experiment_prefix="llmoops-correctness-eval",
    description="Evaluating RAG system with custom correctness evaluator (LLM-as-a-Judge)",
    metadata=dict(
        variant="AgenticRAGRedefiningRetrieval-AugmentedGenerationforAdaptiveIntelligence",
        evaluator="custom_correctness_llm_judge",
        model="gemini-2.5-pro",
        chunk_size=1000,
        chunk_overlap=200,
        k=5,
    ),
)

print("\nEvaluation completed! Check the LangSmith UI for detailed results.")

View the evaluation results for experiment: 'llmoops-correctness-eval-f63d32fe' at:
https://smith.langchain.com/o/2df5139e-03a9-47c7-8f97-fe54947713a4/datasets/517197ea-52e0-44a3-8e3b-131a2e1c400e/compare?selectedSessions=63488257-b575-422f-b259-0dfcdc8d7805




0it [00:00, ?it/s]{"timestamp": "2025-10-26T11:58:44.787285Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-10-26T11:58:44.789251Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-10-26T11:58:44.791330Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-10-26T11:58:44.798045Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251026_172844_c1b03e06", "temp_dir": "C:\\GenAI Hackathon\\2.0\\notebook\\data\\session_20251026_172844_c1b03e06", "faiss_dir": "C:\\GenAI Hackathon\\2.0\\notebook\\faiss_index\\session_20251026_172844_c1b03e06", "sessionized": true, "timestamp": "2025-10-26T11:58:44.803158Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "AgenticRAGRedefiningRetrieval-AugmentedGenerationforAdaptiveIntelligence.pdf", "saved_as": "C:\\GenAI


Evaluation completed! Check the LangSmith UI for detailed results.
