## Ragas Evaluation

In [None]:
import os
import json
import pandas as pd
from typing import List, Dict, Any
from dotenv import load_dotenv
import warnings
warnings.filterwarnings('ignore')

In [None]:
import random
import time
from ragas.dataset import Dataset
import ast
from datasets import Dataset as HFDataset

In [2]:
# RAGAS imports
from ragas import evaluate
from ragas.metrics import (
    Faithfulness,AnswerRelevancy,
    ContextPrecision,ContextRecall,
    AnswerCorrectness,AnswerSimilarity
)
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

# LangChain imports
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from datasets import Dataset

# Import your existing components
import sys
sys.path.append('.')
from weaviate_client import init_weaviate, WeaviateHybridRetriever
from rag_utils import run_agent_query, create_rag_agent
from config import GROQ_API_KEY, COLLECTION_EMBED_MAP

load_dotenv()

True

In [4]:
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
load_dotenv()

True

In [None]:
weaviate_client = init_weaviate()

# Initialize Groq LLM for evaluation
groq_llm = ChatGroq(
    # model="openai/gpt-oss-120b",
    model="llama-3.3-70b-versatile",
    groq_api_key=GROQ_API_KEY,
    temperature=0.1,
    max_tokens=2000,
    n = 1
)

# Initialize embeddings
embeddings = HuggingFaceEmbeddings(
    model_name="all-mpnet-base-v2",
    model_kwargs={'device': 'cpu'}
)

# Wrap for RAGAS
wrapped_llm = LangchainLLMWrapper(groq_llm)
wrapped_embeddings = LangchainEmbeddingsWrapper(embeddings)

2025-09-22 16:22:51.294 
  command:

    streamlit run c:\Users\KarthikKodam(Quadran\vs\Capstone\genai\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
  wrapped_embeddings = LangchainEmbeddingsWrapper(embeddings)
  return self.new_target(*args, **kwargs)


In [8]:
## Evaluation Queries and Ground Truth
evaluation_data = [
    {
        "question": """What was the Supreme Court of Canada's decision in the case concerning Fisheries Jurisdiction between Spain and Canada?""",
        "ground_truth": """The Supreme Court of Canada did not make a decision in the case concerning Fisheries Jurisdiction between Spain and Canada, as the case was heard by the International Court of Justice. The ICJ ruled that it had no jurisdiction to adjudicateon the dispute brought by Spain in 1995."""
    },
    {
        "question": """Why did Judge Koroma dissent from the Court's finding that it lacked jurisdiction to respond to the WHO's request? from Legality of the Use by a State of Nuclear Weapons in Armed Conflict case""",
        "ground_truth": """Judge Koroma dissented because he believed the Court misconstrued the question. In his view, the question related to the health and environmental effects of nuclear weapons, which he maintained fell "eminently within the competence and scope of the agency's activities."""
    },
    {
        "question": """What was Judge Weeramantry's view on the Court's application of the "principle of speciality" to the WHO? from Legality of the Use by a State of Nuclear Weapons in Armed Conflict case """,
        "ground_truth": """Judge Weeramantry disagreed with the Court's rigid application of the "principle of speciality," which took the question of legality out of the WHO's area of concern just because peace and security were within the concerns of the Security Council."""
    },
    {
        "question": """What did the Federal Court of Appeal hold regarding the proper constitution of the Charter challenge? from Canadian Council for Refugees v. Canada (Citizenship and Immigration) case""",
        "ground_truth": """The Federal Court of Appeal held that the Charter challenge was not properly constituted because it should have been directed at other forms of state action, specifically the administrative reviews required by s. 102(3) of the IRPA, rather than s. 159.3 of the IRPR."""
    },
    {
        "question": """Does the evidence support the Federal Court judge's finding that detention of returnees in the United States is "automatic"? from case Canadian Council for Refugees v. Canada (Citizenship and Immigration)""",
        "ground_truth": """The evidence does not support the finding that detention is "automatic." The text states that detention is not universally applied and that returnees' risks of detention vary on a case-by-case basis."""
    }
]

In [9]:
# Initialize RAG System

# Create retriever
collection_name = "InLegalBERT_Chunks"
retriever = WeaviateHybridRetriever(
    client=weaviate_client,
    collection_name=collection_name,
    embedding_model_name=COLLECTION_EMBED_MAP[collection_name]["model"],
    alpha=0.5,
    k=3
)

# Create RAG agent
agent = create_rag_agent(
    retriever=retriever,
    # selected_model="openai/gpt-oss-120b",
    selected_model="llama-3.3-70b-versatile",
    temperature=0.1,
    max_tokens=2000,
    alpha=0.5,
    top_k=3,
    show_metadata=False
    )

No sentence-transformers model found with name law-ai/InLegalBERT. Creating a new one with mean pooling.
  memory = ConversationSummaryBufferMemory(
  agent = initialize_agent(


In [None]:
# Generate Answers and Retrieve Contexts

for i, data in enumerate(evaluation_data, 1):
    print(f"\nProcessing query {i}/{len(evaluation_data)}")
    print(f"Query: {data['question'][:80]}...")
    
    try:
        # Run RAG system
        result = run_agent_query(
            agent=agent,
            query=data["question"],
            retriever=retriever,
            show_metadata=False
        )
        
        # Store results
        data["answer"] = result["generated_answer"]
        data["contexts"] = result["contexts"]
        
        print(f"  ✅ Contexts retrieved ({len(data['contexts'])} docs)")
        print(f"  Answer preview: {data['answer'][:100]}...")
        
    except Exception as e:
        print(f"  ❌ Error: {e}")
        data["answer"] = f"Error: {str(e)}"
        data["contexts"] = []

time.sleep(10)

In [9]:
# Create dataset for RAGAS
ragas_dataset = Dataset.from_list([
    {
        "question": data["question"],
        "answer": data["answer"],
        "contexts": data["contexts"],
        "ground_truth": data["ground_truth"]
    }
    for data in evaluation_data
])

print(f"📊 RAGAS dataset created with {len(ragas_dataset)} samples")

📊 RAGAS dataset created with 3 samples


In [20]:
ragas_dataset

Dataset({
    features: ['question', 'answer', 'contexts', 'ground_truth'],
    num_rows: 3
})

In [10]:
print("\n🔍 Sample data:")
for i in range(min(2, len(ragas_dataset))):
    sample = ragas_dataset[i]
    print(f"\nSample {i+1}:")
    print(f"Question: {sample['question'][:60]}...")
    print(f"Answer: {sample['answer'][:80]}...")
    print(f"Contexts: {len(sample['contexts'])} retrieved")


🔍 Sample data:

Sample 1:
Question: What was the Supreme Court of Canada's decision in the case ...
Answer: The Supreme Court of Canada did not hear the case concerning Fisheries Jurisdict...
Contexts: 3 retrieved

Sample 2:
Question: Why did Judge Koroma dissent from the Court's finding that i...
Answer: Judge Koroma dissented from the Court's finding that it lacked jurisdiction to r...
Contexts: 3 retrieved


In [11]:
# Initialize RAGAS Metrics
metrics = [
    Faithfulness(llm=wrapped_llm),
    AnswerRelevancy(llm=wrapped_llm, embeddings=wrapped_embeddings),
    ContextPrecision(llm=wrapped_llm),
    ContextRecall(llm=wrapped_llm),
    AnswerCorrectness(llm=wrapped_llm, embeddings=wrapped_embeddings),
    AnswerSimilarity(embeddings=wrapped_embeddings)
]

In [12]:
print("🧮 RAGAS metrics initialized:")
for metric in metrics:
    print(f"  - {metric.name}")

🧮 RAGAS metrics initialized:
  - faithfulness
  - answer_relevancy
  - context_precision
  - context_recall
  - answer_correctness
  - answer_similarity


In [None]:
# try:
#     results = evaluate(
#         dataset=ragas_dataset,
#         metrics=metrics
#     )
#     print("✅ Evaluation completed!")
# except Exception as e:
#     print(f"❌ Evaluation failed: {e}")
#     import traceback
#     traceback.print_exc()

In [14]:
# if 'results' in locals():    
#     df = results.to_pandas()

In [15]:
# df

In [16]:
# df.to_csv("eval_dataset.csv", index=False)

In [17]:
df2 = pd.read_csv("eval_dataset.csv") 
eval_df = df2[['user_input','retrieved_contexts','response','reference']].head()
eval_df

Unnamed: 0,user_input,retrieved_contexts,response,reference
0,What was the Supreme Court of Canada's decisio...,['lack of title to act on the high seas agains...,The Supreme Court of Canada did not hear the c...,The Supreme Court of Canada did not make a dec...
1,Why did Judge Koroma dissent from the Court's ...,['principles of treaty interpretation and shou...,Judge Koroma dissented from the Court's findin...,Judge Koroma dissented because he believed the...
2,What was Judge Weeramantry's view on the Court...,['principles of treaty interpretation and shou...,Judge Weeramantry disagreed with the Court's r...,Judge Weeramantry disagreed with the Court's r...


In [19]:
# --- Preprocess retrieved_contexts ---
eval_df["retrieved_contexts"] = eval_df["retrieved_contexts"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

In [None]:
results_list = []

for i, row in eval_df.iterrows():
    # Create a 1-row DataFrame
    single_row_df = pd.DataFrame([row])

    # Convert pandas → HuggingFace Dataset
    hf_dataset = HFDataset.from_pandas(single_row_df)

    try:
        eval_result = evaluate(hf_dataset, metrics=metrics)
        eval_df_row = eval_result.to_pandas()
        results_list.append(eval_df_row)
    except Exception as e:
        print(f"❌ Error at row {i}: {e}")
        continue

    # Sleep to respect rate limits
    time.sleep(20)

final_results = pd.concat(results_list, ignore_index=True)
final_results.to_csv("ragas_results.csv", index=False)
print("🎉 All metrics calculated and saved to ragas_results.csv")

  user_id = json.load(open(uuid_filepath))["userid"]
  self._single_turn_ascore(sample=sample, callbacks=group_cm),
  return await self._ascore(row, callbacks)
  return await self._ascore(row, callbacks)
Exception raised in Job[1]: IndexError(list index out of range)
  similarity_score = await self.answer_similarity.ascore(
Evaluating: 100%|██████████| 6/6 [00:39<00:00,  6.60s/it]
  self._single_turn_ascore(sample=sample, callbacks=group_cm),
  return await self._ascore(row, callbacks)
  return await self._ascore(row, callbacks)
Exception raised in Job[1]: IndexError(list index out of range)
  similarity_score = await self.answer_similarity.ascore(
Evaluating: 100%|██████████| 6/6 [00:57<00:00,  9.65s/it]
  self._single_turn_ascore(sample=sample, callbacks=group_cm),
  return await self._ascore(row, callbacks)
  return await self._ascore(row, callbacks)
Exception raised in Job[1]: IndexError(list index out of range)
  similarity_score = await self.answer_similarity.ascore(
Evaluating: 

🎉 All metrics calculated and saved to ragas_results.csv


In [4]:
df3 = pd.read_csv("ragas_results.csv") 
df3

Unnamed: 0,user_input,retrieved_contexts,response,reference,faithfulness,answer_relevancy,context_precision,context_recall,answer_correctness,answer_similarity
0,What was the Supreme Court of Canada's decisio...,['lack of title to act on the high seas agains...,The Supreme Court of Canada did not hear the c...,The Supreme Court of Canada did not make a dec...,0.571429,,0.833333,1.0,0.53699,0.947959
1,Why did Judge Koroma dissent from the Court's ...,['principles of treaty interpretation and shou...,Judge Koroma dissented from the Court's findin...,Judge Koroma dissented because he believed the...,0.833333,,1.0,1.0,0.35213,0.741853
2,What was Judge Weeramantry's view on the Court...,['principles of treaty interpretation and shou...,Judge Weeramantry disagreed with the Court's r...,Judge Weeramantry disagreed with the Court's r...,0.857143,,0.833333,1.0,0.357129,0.761849
