In [9]:

import getpass
import os

from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# --- Credentials ------------------------------------------------------------
# Secrets must never be committed inside a notebook. Read the key from the
# environment and fall back to an interactive, non-echoing prompt.
# NOTE(review): a real key was previously hardcoded on this line; treat it as
# leaked and revoke/rotate it with the provider.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your Groq API key: ").strip()

# Cheap sanity check on the key format before any network call is attempted.
if os.environ["GROQ_API_KEY"].startswith("gsk_"):
    print("‚úÖ API Key successfully loaded.")
else:
    raise ValueError(
        "‚ùå GROQ_API_KEY is missing or malformed. "
        "Set the GROQ_API_KEY environment variable or enter the key at the prompt."
    )

# --- Vector store / retriever -----------------------------------------------
# Best-effort: a missing or broken database is reported, not raised. In that
# case `retriever` is never assigned by this cell, so later cells that use it
# will fail.
try:
    embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    if os.path.exists("./chroma_db"):
        db = Chroma(persist_directory="./chroma_db", embedding_function=embedding)
        # k=2: only the two top-ranked chunks are handed to the judge.
        retriever = db.as_retriever(search_kwargs={"k": 2})
        print("‚úÖ Vector Database Connected.")
    else:
        print("‚ö†Ô∏è Warning: 'chroma_db' folder not found. RAG functionality will fail.")
except Exception as e:
    print(f"‚ùå Error loading Database: {e}")

# --- Judge LLM --------------------------------------------------------------
# temperature=0 keeps the grading as repeatable as the model allows.
try:
    llm_judge = ChatGroq(model="llama-3.3-70b-versatile", temperature=0)
    print("‚úÖ Judge LLM Initialized.")
except Exception as e:
    print(f"‚ùå LLM Connection Failed: {e}")

‚úÖ API Key successfully loaded.
‚úÖ Vector Database Connected.
‚úÖ Judge LLM Initialized.


In [10]:
# Evaluation cases: each entry pairs a question with the answer the retrieved
# context is expected to support.
# NOTE: both ground truths below are placeholders -- CHANGE THEM to match your PDF,
# otherwise every case will be graded against content the document may not contain.
test_set = [
    dict(
        question="What are the primary risks mentioned for this company?",
        ground_truth="Volatility in oil prices and regulatory changes.",
    ),
    dict(
        question="What is the outlook on future growth?",
        ground_truth="Focus on renewable energy and 5G expansion.",
    ),
]

In [11]:
def evaluate_rag(question, ground_truth):
    """Grade the retriever's output for one question using the judge LLM.

    Relies on two module-level objects created in the setup cell:
    ``retriever`` (queried via ``.invoke(question)``) and ``llm_judge``
    (queried via ``.invoke(prompt)``).

    Args:
        question: The user question sent to the retriever.
        ground_truth: The expected answer the retrieved context should support.

    Returns:
        A ``(score, retrieved_context)`` tuple. ``score`` is the judge's raw
        reply text, which is asked to begin with 'PASS' or 'FAIL' followed by
        a one-sentence explanation. ``retrieved_context`` is the page content
        of the retrieved documents joined with newlines.
    """
    # 1. Run Retrieval
    docs = retriever.invoke(question)
    retrieved_context = "\n".join(d.page_content for d in docs)

    # 2. Ask the Judge
    # The verdict is requested as the FIRST word so that it can be parsed
    # reliably; the previous wording ("ONLY one word ... then an explanation")
    # contradicted itself.
    prompt = f"""
    You are a strict teacher grading an AI's homework.

    User Question: {question}
    Expected Answer (Ground Truth): {ground_truth}

    Actual Retrieved Context by AI:
    {retrieved_context}

    ---
    TASK:
    Does the "Actual Retrieved Context" contain the information needed to answer the question according to the "Ground Truth"?

    Begin your reply with exactly one word, either 'PASS' or 'FAIL'.
    Then provide a 1-sentence explanation.
    """

    score = llm_judge.invoke(prompt).content
    return score, retrieved_context

In [12]:
# Run every test case through the judge and collect the raw verdict strings.
results = []

print("üß™ STARTING RAG EVALUATION...\n")

for item in test_set:
    print(f"‚ùì Testing: {item['question']}")
    # `context` is returned for inspection/debugging; only the verdict is scored.
    score, context = evaluate_rag(item['question'], item['ground_truth'])
    print(f"üìù Result: {score}")
    print("-" * 30)
    results.append(score)

# Calculate Accuracy
# Only the leading word counts as the verdict: a plain substring test would
# also count a reply such as "FAIL. ... would not PASS" as a pass. Leading
# whitespace, quotes and markdown emphasis are skipped before comparing.
pass_count = sum(
    1 for r in results
    if r.lstrip(" \t\r\n'\"*`").upper().startswith("PASS")
)
# Guard against an empty test set, which previously raised ZeroDivisionError.
accuracy = (pass_count / len(results)) * 100 if results else 0.0

print(f"\nüèÜ FINAL SYSTEM GRADE: {accuracy}% Accuracy")
if accuracy < 50:
    print("‚ö†Ô∏è Recommendation: Improve chunk size or PDF quality.")
else:
    print("‚úÖ System Ready for Production.")

üß™ STARTING RAG EVALUATION...

‚ùì Testing: What are the primary risks mentioned for this company?
üìù Result: FAIL. The "Actual Retrieved Context" does not mention volatility in oil prices and regulatory changes, which are the primary risks mentioned in the "Ground Truth".
------------------------------
‚ùì Testing: What is the outlook on future growth?
üìù Result: FAIL. The "Actual Retrieved Context" does not contain the specific information about focusing on renewable energy and 5G expansion as required by the "Ground Truth" to answer the question about the outlook on future growth.
------------------------------

üèÜ FINAL SYSTEM GRADE: 0.0% Accuracy
‚ö†Ô∏è Recommendation: Improve chunk size or PDF quality.
