In [1]:
print("ok")

ok


In [3]:
import os
import warnings
warnings.filterwarnings('ignore')

from dotenv import load_dotenv
load_dotenv()

True

In [4]:
import numpy as np
import pandas as pd
import asyncio
import json

from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

from ragas import SingleTurnSample, EvaluationDataset, evaluate
from ragas.metrics import (
    Faithfulness,
    ResponseRelevancy,
    LLMContextPrecisionWithReference,
    LLMContextRecall,
    ContextEntityRecall,
    NoiseSensitivity
)

from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

print("‚úÖ All imports successful")

‚úÖ All imports successful


In [5]:
llm = AzureChatOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    deployment_name=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
)

embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    model=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME")
)

ragas_llm = LangchainLLMWrapper(llm)
ragas_embeddings = LangchainEmbeddingsWrapper(embeddings)

print("‚úÖ LLM initialized: gpt-4o")
print("‚úÖ Embeddings initialized: text-embedding-ada-002")
print("‚úÖ RAGAS wrappers ready")

‚úÖ LLM initialized: gpt-4o
‚úÖ Embeddings initialized: text-embedding-ada-002
‚úÖ RAGAS wrappers ready


In [19]:
def run_async(coro):
    try:
        loop = asyncio.get_event_loop()
        if loop.is_running():
            import nest_asyncio
            nest_asyncio.apply()
            return loop.run_until_complete(coro)
        else:
            return asyncio.run(coro)
    except RuntimeError:
        return asyncio.run(coro)


In [7]:
test_response = "The first Super Bowl was held on January 15, 1967 in Los Angeles. It was a sunny day with clear skies."
test_context = [
    "The First AFL-NFL World Championship Game was played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles, California."
]
print("üìù Response to evaluate:")
print(f"   '{test_response}'")
print("\nüìö Retrieved context:")
print(f"   '{test_context[0]}'")

üìù Response to evaluate:
   'The first Super Bowl was held on January 15, 1967 in Los Angeles. It was a sunny day with clear skies.'

üìö Retrieved context:
   'The First AFL-NFL World Championship Game was played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles, California.'


In [8]:
claim_extraction_prompt = ChatPromptTemplate.from_template("""
Given the following response, extract ALL factual claims as a numbered list.
Each claim should be a single, verifiable statement.

Response: {response}

Extract each factual claim:
""")

claim_chain = claim_extraction_prompt | llm | StrOutputParser()
extracted_claim_raw = claim_chain.invoke({"response":test_context})

print("üîç STEP 1: Extracted Claims from Response")
print("=" * 50)
print(extracted_claim_raw)

üîç STEP 1: Extracted Claims from Response
1. The First AFL-NFL World Championship Game was played on January 15, 1967.  
2. The game took place at the Los Angeles Memorial Coliseum.  
3. The game was held in Los Angeles, California.


In [9]:
claims = [
    "The first Super Bowl was held on January 15, 1967",
    "The first Super Bowl was held in Los Angeles",
    "It was a sunny day",
    "There were clear skies"
]
print("üìã Claims to verify:")
for i, claim in enumerate(claims, 1):
    print(f"   {i}. {claim}")

üìã Claims to verify:
   1. The first Super Bowl was held on January 15, 1967
   2. The first Super Bowl was held in Los Angeles
   3. It was a sunny day
   4. There were clear skies


In [12]:
verification_prompt = ChatPromptTemplate.from_template("""
Given the following context and claim, determine if the claim is SUPPORTED by the context.

Context: {context}

Claim: {claim}

Answer with:
- "SUPPORTED" if the claim can be verified from the context
- "NOT SUPPORTED" if the claim cannot be verified or contradicts the context

Also provide a brief explanation.

Verdict:
""")

verify_claims = verification_prompt | llm | StrOutputParser()

print("üîç STEP 2: Verifying Each Claim Against Context")
print("=" * 60)

verification_results = []
for claim in claims:
    result = verify_claims.invoke({
        "context":test_context[0],
        "claim":claim
    })
    is_supported = "SUPPORTED" in result.upper() and "NOT SUPPORTED" not in result.upper()
    verification_results.append({
        "claim":claim,
        "supported": is_supported,
        "explanation":result,
    })
    status = "‚úÖ" if is_supported else "‚ùå"
    print(f"\n{status} Claims: {claim}")
    print(f"    Result: {result[:100]}..." if len(result)>100 else f"    Result: {result[:100]}")

üîç STEP 2: Verifying Each Claim Against Context

‚úÖ Claims: The first Super Bowl was held on January 15, 1967
    Result: **SUPPORTED**

**Explanation:** The context states that the First AFL-NFL World Championship Game wa...

‚úÖ Claims: The first Super Bowl was held in Los Angeles
    Result: SUPPORTED

Explanation: The context states that the First AFL-NFL World Championship Game (the event...

‚ùå Claims: It was a sunny day
    Result: **NOT SUPPORTED**

**Explanation:** The context provides information about the date, location, and e...

‚ùå Claims: There were clear skies
    Result: NOT SUPPORTED

Explanation: The context does not provide any information about the weather condition...


In [13]:
print("\nüìä Claim Verification Summary")
print("=" * 80)

df_verification = pd.DataFrame([
    {
        "Clain": r['claim'],
        "Supported?":"‚úÖ Yes" if r["supported"] else "‚ùå No",
        "Reason":"Found in context" if r['supported'] else "HALLUCINATION - Not in context!"

    }
    for r in verification_results
])
df_verification.head()


üìä Claim Verification Summary


Unnamed: 0,Clain,Supported?,Reason
0,"The first Super Bowl was held on January 15, 1967",‚úÖ Yes,Found in context
1,The first Super Bowl was held in Los Angeles,‚úÖ Yes,Found in context
2,It was a sunny day,‚ùå No,HALLUCINATION - Not in context!
3,There were clear skies,‚ùå No,HALLUCINATION - Not in context!


In [17]:
supported_count = sum(1 for r in verification_results if r['supported'])
total_claims = len(verification_results)

manual_faithfulness = supported_count/total_claims

print("üî¢ STEP 3: Calculate Faithfulness Score")
print("=" * 50)
print(f"\n   Supported claims: {supported_count}")
print(f"   Total claims: {total_claims}")
print(f"\n   Formula: Faithfulness = {supported_count} / {total_claims}")
print(f"\n   üìä Manual Faithfulness Score: {manual_faithfulness:.2f}")

üî¢ STEP 3: Calculate Faithfulness Score

   Supported claims: 2
   Total claims: 4

   Formula: Faithfulness = 2 / 4

   üìä Manual Faithfulness Score: 0.50


In [None]:
faithfulness_sample = SingleTurnSample(
    user_input="When was the first Super Bowl?",
    response=test_response,
    retrieved_contexts=test_context
)
faithfulness_metric = Faithfulness(llm=ragas_llm)
ragas_faithfulness = run_async(faithfulness_metric.single_turn_ascore(faithfulness_sample))

print("üî¨ RAGAS Faithfulness Result")
print("=" * 50)
print(f"\n   Manual calculation:  {manual_faithfulness:.2f}")
print(f"   RAGAS metric score:  {ragas_faithfulness:.2f}")
print(f"\n   Difference: {abs(manual_faithfulness - ragas_faithfulness):.2f}")

üî¨ RAGAS Faithfulness Result

   Manual calculation:  0.50
   RAGAS metric score:  0.50

   Difference: 0.00


In [25]:
faithfulness_examples = [
    {
        "name": "Perfect Faithfulness (No hallucinations)",
        "response": "The first Super Bowl was played on January 15, 1967 at the Los Angeles Memorial Coliseum.",
        "context": ["The First AFL-NFL World Championship Game was played on January 15, 1967, at the Los Angeles Memorial Coliseum."]
    },
    {
        "name": "Partial Faithfulness (Some hallucinations)",
        "response": "The first Super Bowl was on January 15, 1967. The Green Bay Packers won 35-10 with Bart Starr as MVP.",
        "context": ["The First AFL-NFL World Championship Game was played on January 15, 1967."]
    },
    {
        "name": "Zero Faithfulness (Complete hallucination)",
        "response": "The first Super Bowl was held in Miami in 1970 and attracted over 100,000 spectators.",
        "context": ["The First AFL-NFL World Championship Game was played on January 15, 1967, at the Los Angeles Memorial Coliseum."]
    }
]

print("üìä Faithfulness Comparison: Different Scenarios")
print("=" * 70)

for example in faithfulness_examples:
    sample = SingleTurnSample(
        user_input="Tell me about the first Super Bowl",
        response=example["response"],
        retrieved_contexts=example['context']
    )
    score = run_async(faithfulness_metric.single_turn_ascore(sample))

    print(f"\n{example['name']}: ")
    print(f"   Response: {example['response'][:80] if len(example['response'])> 80 else example['response']}")
    print(f"Score: { score:.2f}")

üìä Faithfulness Comparison: Different Scenarios

Perfect Faithfulness (No hallucinations): 
   Response: The first Super Bowl was played on January 15, 1967 at the Los Angeles Memorial 
Score: 1.00

Partial Faithfulness (Some hallucinations): 
   Response: The first Super Bowl was on January 15, 1967. The Green Bay Packers won 35-10 wi
Score: 0.33

Zero Faithfulness (Complete hallucination): 
   Response: The first Super Bowl was held in Miami in 1970 and attracted over 100,000 specta
Score: 0.00


In [26]:
original_question = "When was the first Super Bowl?"
test_answer = "The first Super Bowl was held on January 15, 1967"

print("üìù Original Question:")
print(f"   '{original_question}'")
print("\nüìù Answer to Evaluate:")
print(f"   '{test_answer}'")

üìù Original Question:
   'When was the first Super Bowl?'

üìù Answer to Evaluate:
   'The first Super Bowl was held on January 15, 1967'


In [28]:
question_gen_prompt = ChatPromptTemplate.from_template("""
Given the following answer, generate exactly 3 different questions that this answer would be a good response to.
The questions should be varied but all answerable by this response.

Answer: {answer}

Generate 3 questions (one per line):
1.
2.
3.
""")

question_gen_chain = question_gen_prompt | llm | StrOutputParser()

generated_question = question_gen_chain.invoke({"answer":test_answer})

print("üîç STEP 1: Generated Hypothetical Questions")
print("=" * 50)
print(generated_question)

üîç STEP 1: Generated Hypothetical Questions
1. When was the first Super Bowl held?  
2. What significant sports event took place on January 15, 1967?  
3. Can you tell me the date of the inaugural Super Bowl?  


In [30]:
generated_questions = [
    "When was the first Super Bowl held?",
    "What date was the inaugural Super Bowl?",
    "On what day did the first Super Bowl take place?"
]

print("üìã Questions for embedding comparison:")
print("   Original: ",original_question)
print("   Generated:")
for i,q in enumerate(generated_questions,1):
    print(f"      {i}. {q}")

üìã Questions for embedding comparison:
   Original:  When was the first Super Bowl?
   Generated:
      1. When was the first Super Bowl held?
      2. What date was the inaugural Super Bowl?
      3. On what day did the first Super Bowl take place?


In [33]:
def cosine_similarity(vec1, vec2):
    vec1 = np.array(vec1)
    vec2 = np.array(vec2)
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

print("‚úÖ Cosine similarity function ready")
print("\nüìê Formula: cos(Œ∏) = (A ¬∑ B) / (||A|| √ó ||B||)")

‚úÖ Cosine similarity function ready

üìê Formula: cos(Œ∏) = (A ¬∑ B) / (||A|| √ó ||B||)


In [35]:
print("üîç STEP : Computing Embeddings and Similarities")
print("=" * 60)

original_embedding = embeddings.embed_query(original_question)
print(f"\n‚úÖ Original question embedded (dim={len(original_embedding)})")

similarities = []
for i, gen_q in enumerate(generated_questions,1):
    gen_embedding = embeddings.embed_query(gen_q)
    sim = cosine_similarity(original_embedding, gen_embedding)
    similarities.append(sim)

    print(f"\n   Question {i}. '{gen_q}'")
    print(f"     Similarity to original: {sim:.4f}")

üîç STEP : Computing Embeddings and Similarities

‚úÖ Original question embedded (dim=1536)

   Question 1. 'When was the first Super Bowl held?'
     Similarity to original: 0.9821

   Question 2. 'What date was the inaugural Super Bowl?'
     Similarity to original: 0.9415

   Question 3. 'On what day did the first Super Bowl take place?'
     Similarity to original: 0.9574


In [39]:
manual_relavancy = np.mean(similarities)
print("üî¢ STEP 3: Calculate Answer Relevancy Score")
print("=" * 50)
print(f"   Similarities: {[f'{s:.4f}'for s in similarities]}")

print(f"   Formula: Average of similarities")
print(f"\n  ({'+'.join([f'{s:.4f}' for s in similarities])})/{len(similarities)}")
print(f"\n   üìä Manual Answer Relevancy: {manual_relavancy:.4f}")

üî¢ STEP 3: Calculate Answer Relevancy Score
   Similarities: ['0.9821', '0.9415', '0.9574']
   Formula: Average of similarities

  (0.9821+0.9415+0.9574)/3

   üìä Manual Answer Relevancy: 0.9604


In [None]:
relevancy_sample = SingleTurnSample(
    user_input=original_question,
    response=test_answer,
    retrieved_contexts=["The First AFL-NFL World Championship Game was played on January 15, 1967."]
)

relevancy_metric = ResponseRelevancy(llm=ragas_llm, embeddings=ragas_embeddings)
ragas_relevancy = run_async(relevancy_metric.single_turn_ascore(relevancy_sample))

print("üî¨ RAGAS Answer Relevancy Result")
print("=" * 50)

print(f" \n    Manual calculation: {manual_relavancy:.4f}")
print(f"     Ragas metric score: {ragas_relevancy:.4f}")

üî¨ RAGAS Answer Relevancy Result

   Manual calculation: 0.9604
     Ragas metric score: 0.9821


In [45]:
relevancy_examples = [
    {
        "name": "Highly Relevant (Directly answers WHEN)",
        "question": "When was the first Super Bowl?",
        "answer": "The first Super Bowl was held on January 15, 1967.",
    },
    {
        "name": "Partially Relevant (Answers but adds extra info)",
        "question": "When was the first Super Bowl?",
        "answer": "The Super Bowl is the annual championship game of the NFL, first held on January 15, 1967.",
    },
    {
        "name": "Low Relevancy (Doesn't answer WHEN)",
        "question": "When was the first Super Bowl?",
        "answer": "The Super Bowl is the annual championship game of the National Football League.",
    },
    {
        "name": "Off-topic (Completely irrelevant)",
        "question": "When was the first Super Bowl?",
        "answer": "Pizza is a popular Italian dish that spread worldwide in the 20th century.",
    }
]

print("Answer Relevancy Comparison: ")
print("="*70)

for example in relevancy_examples:
    sample = SingleTurnSample(
        user_input=example['question'],
        response=example['answer'],
        retrieved_contexts=["Context not relevant for this metric."]
    )
    score = run_async(relevancy_metric.single_turn_ascore(sample))

    print(f"\n   {example['name']}")
    print(f"   Q: '{example['question']}'")
    print(f"   A: '{example['answer'][:60]}...'"  if len(example['answer'])>60 else f"   A: '{example['answer']}'")

    print(f"   Score: {score:.4f}")

Answer Relevancy Comparison: 

   Highly Relevant (Directly answers WHEN)
   Q: 'When was the first Super Bowl?'
   A: 'The first Super Bowl was held on January 15, 1967.'
   Score: 0.9821

   Partially Relevant (Answers but adds extra info)
   Q: 'When was the first Super Bowl?'
   A: 'The Super Bowl is the annual championship game of the NFL, f...'
   Score: 0.9455

   Low Relevancy (Doesn't answer WHEN)
   Q: 'When was the first Super Bowl?'
   A: 'The Super Bowl is the annual championship game of the Nation...'
   Score: 0.8970

   Off-topic (Completely irrelevant)
   Q: 'When was the first Super Bowl?'
   A: 'Pizza is a popular Italian dish that spread worldwide in the...'
   Score: 0.7797


In [46]:
question = "Where is the Eiffel Tower located?"
reference = "The Eiffel Tower is located in Paris, France."

chunks_with_relevance = [
    ("The Eiffel Tower is located in Paris, France.", True),      # Directly relevant
    ("Paris is the capital of France.", True),                     # Somewhat relevant
    ("The tower was built in 1889.", False),                       # Not relevant to WHERE
    ("Pizza originated in Italy.", False),                         # Completely irrelevant
]

print("Question: '{}'\n".format(question))
print("Retrieved Chunks (with relevance):")
for i, (chunk, relevant) in enumerate(chunks_with_relevance,1):
    status = "‚úÖ Relevant" if relevant else "‚ùå Not relevant"
    print(f"   {i}. {status}: {chunk}")

Question: 'Where is the Eiffel Tower located?'

Retrieved Chunks (with relevance):
   1. ‚úÖ Relevant: The Eiffel Tower is located in Paris, France.
   2. ‚úÖ Relevant: Paris is the capital of France.
   3. ‚ùå Not relevant: The tower was built in 1889.
   4. ‚ùå Not relevant: Pizza originated in Italy.


In [50]:
relevance_prompt = ChatPromptTemplate.from_template("""
Given the question and reference answer, determine if the following context chunk is RELEVANT.

Question: {question}
Reference Answer: {reference}
Context Chunk: {chunk}

Is this chunk relevant for answering the question? Answer only "RELEVANT" or "NOT RELEVANT".
""")

relevance_chain = relevance_prompt | llm | StrOutputParser()

print("üîç Manual Relevance Classification")
print("=" * 60)

relevance_result = []
for chunk,expected in chunks_with_relevance:
    result = relevance_chain.invoke({
        "question":question,
        "reference":reference,
        "chunk":chunk,
    })
    is_relevant = "RELEVANT" in result.upper() and "NOT RELEVANT" not in result.upper()
    relevance_result.append(is_relevant)
    status = "‚úÖ" if is_relevant else "‚ùå"
    print(f"{status} '{chunk[:50]}...'-> {result.strip()}")

üîç Manual Relevance Classification
‚úÖ 'The Eiffel Tower is located in Paris, France....'-> RELEVANT
‚úÖ 'Paris is the capital of France....'-> RELEVANT
‚ùå 'The tower was built in 1889....'-> NOT RELEVANT
‚ùå 'Pizza originated in Italy....'-> NOT RELEVANT


In [53]:
good_ranking = [True, True, False, False]

print("üìä GOOD RANKING: Relevant chunks at TOP")
print("=" * 60)
print("\nRanking: [‚úÖ Relevant, ‚úÖ Relevant, ‚ùå Not Rel, ‚ùå Not Rel]")
print("\nPrecision@K calculation:")

precisions_good = []
relevant_count = 0

for k, is_relevant in enumerate(good_ranking,1):
    if is_relevant:
        relevant_count +=1
    precision_at_k = relevant_count / k
    contribute = "‚Üí Contributes" if is_relevant else "‚Üí Does NOT contribute"
    print(f"   Position {k}: Precision@{k} = {relevant_count}/{k} = {contribute}")
    if is_relevant:
        precisions_good.append(precision_at_k)

total_relevant = sum(good_ranking)
context_precision_good = sum(precisions_good)/total_relevant if total_relevant >0 else 0

print(f"\n   Sum of contributing precisions: {sum(precisions_good):.2f}")
print(f"   Total relevant items: {total_relevant}")
print(f"\n   üìä Context Precision (Good Ranking): {context_precision_good:.2f}")

üìä GOOD RANKING: Relevant chunks at TOP

Ranking: [‚úÖ Relevant, ‚úÖ Relevant, ‚ùå Not Rel, ‚ùå Not Rel]

Precision@K calculation:
   Position 1: Precision@1 = 1/1 = ‚Üí Contributes
   Position 2: Precision@2 = 2/2 = ‚Üí Contributes
   Position 3: Precision@3 = 2/3 = ‚Üí Does NOT contribute
   Position 4: Precision@4 = 2/4 = ‚Üí Does NOT contribute

   Sum of contributing precisions: 2.00
   Total relevant items: 2

   üìä Context Precision (Good Ranking): 1.00


In [55]:
bad_ranking = [False, False, True, True]

print("üìä BAD RANKING: Relevant chunks at BOTTOM")
print("=" * 60)
print("\nRanking: [‚ùå Not Rel, ‚ùå Not Rel, ‚úÖ Relevant, ‚úÖ Relevant]")
print("\nPrecision@K calculation:")

precisions_bad = []
relevant_count = 0

for k, is_relevant in enumerate(bad_ranking,1):
    if is_relevant:
        relevant_count+=1
    precision_at_k = relevant_count / k
    contributes = "‚Üí Contributes" if is_relevant else "‚Üí Does NOT contribute"
    print(f"   Position {k}: Precision@{k}={relevant_count}/{k} = {precision_at_k}")

    if is_relevant:
        precisions_bad.append(precision_at_k)
total_relevant = sum(bad_ranking)
context_precision_bad = sum(precisions_bad)/total_relevant if total_relevant > 0 else 0 

print(f"\n   Sum of contributing precision: {sum(precisions_bad):.2f}")
print(f"   Total relevant items: {total_relevant}")
print(f"\n   üìä Context Precision (Bad Ranking): {context_precision_bad:.2f}")



üìä BAD RANKING: Relevant chunks at BOTTOM

Ranking: [‚ùå Not Rel, ‚ùå Not Rel, ‚úÖ Relevant, ‚úÖ Relevant]

Precision@K calculation:
   Position 1: Precision@1=0/1 = 0.0
   Position 2: Precision@2=0/2 = 0.0
   Position 3: Precision@3=1/3 = 0.3333333333333333
   Position 4: Precision@4=2/4 = 0.5

   Sum of contributing precision: 0.83
   Total relevant items: 2

   üìä Context Precision (Bad Ranking): 0.42


In [None]:
good_sample = SingleTurnSample(
    user_input=question,
    reference=reference,
    retrieved_contexts=[
        "The Eiffel Tower is located in Paris, France.",
        "Paris is the capital of France.",
        "The tower was built in 1889.",
        "Pizza originated in Italy."
    ]
)

bad_sample = SingleTurnSample(
    user_input=question,
    reference=reference,
    retrieved_contexts=[
        "Pizza originated in Italy.",
        "The tower was built in 1889.",
        "Paris is the capital of France.",
        "The Eiffel Tower is located in Paris, France."
    ]
)

precision_metrics = LLMContextPrecisionWithReference(llm=ragas_llm)

good_score = run_async(precision_metrics.single_turn_ascore(good_sample))
bad_score = run_async(precision_metrics.single_turn_ascore(bad_sample))

print("üî¨ RAGAS Context Precision Results")
print("=" * 50)
print(f"\n   Good Ranking (relevant at top): {good_score:.2f}")
print(f"   Bad Ranking (relevant at botton): {bad_score:.2f}")
print(f"\n   Difference: {good_score-bad_score:.2f}")

üî¨ RAGAS Context Precision Results

 Good Ranking (relevant at top): 1.00
   Bad Ranking (relevant at botton): 0.42

   Difference: 0.58


In [59]:
recall_question = "Tell me about the Eiffel Tower."
recall_reference = "The Eiffel Tower is located in Paris. It was built in 1889. It is 330 meters tall."

# Retrieved context (missing the height information)
recall_context = [
    "The Eiffel Tower is a landmark located in Paris, France.",
    "The tower was completed in 1889 for the World's Fair."
]
print("üìù Reference Answer (Ground Truth):")
print(f"   '{recall_reference}'")
print("\nüìö Retrieved Context:")
for i, ctx in enumerate(recall_context, 1):
    print(f"   {i}. '{ctx}'")

üìù Reference Answer (Ground Truth):
   'The Eiffel Tower is located in Paris. It was built in 1889. It is 330 meters tall.'

üìö Retrieved Context:
   1. 'The Eiffel Tower is a landmark located in Paris, France.'
   2. 'The tower was completed in 1889 for the World's Fair.'


In [60]:
reference_claims = [
    "The Eiffel Tower is located in Paris",
    "It was built in 1889",
    "It is 330 meters tall"
]

print("üîç STEP 1: Reference Claims")
print("=" * 50)
for i, claim in enumerate(reference_claims, 1):
    print(f"   {i}. {claim}")

üîç STEP 1: Reference Claims
   1. The Eiffel Tower is located in Paris
   2. It was built in 1889
   3. It is 330 meters tall


In [62]:
attribution_prompt = ChatPromptTemplate.from_template("""
Can the following claim be attributed to (found in) the given context?

Context:
{context}

Claim: {claim}

Answer "YES" if the claim is supported by the context, "NO" if it cannot be found.
"""
)

attribution_chain = attribution_prompt | llm | StrOutputParser()

print("üîç STEP 2: Claim Attribution Check")
print("=" * 60)

combined_context = "\n".join(recall_context)
attribution_result=[]

for claim in reference_claims:
    result = attribution_chain.invoke({
        "context":combined_context,
        "claim":claim
    })
    found = "YES" in result.upper()
    attribution_result.append(found)
    print(attribution_result)
    status = "‚úÖ Found" if found else "‚ùå MISSING"
    print(f"   {status}:  '{claim}'")
    if not found:
        print(f"      ‚ö†Ô∏è This information was NOT retrieved!")

üîç STEP 2: Claim Attribution Check
[True]
   ‚úÖ Found:  'The Eiffel Tower is located in Paris'
[True, True]
   ‚úÖ Found:  'It was built in 1889'
[True, True, False]
   ‚ùå MISSING:  'It is 330 meters tall'
      ‚ö†Ô∏è This information was NOT retrieved!


In [66]:
claims_found = sum(attribution_result)
total_claims = len(reference_claims)
manual_recall = claims_found / total_claims

print("üî¢ STEP 3: Calculate Context Recall")
print("=" * 50)
print(f"\n   Claims found in context: {claims_found}")
print(f"   Total claims in reference: {total_claims}")
print(f"\n   Formula: {claims_found} / {total_claims} = {manual_recall:.2f}")
print(f"\n   üìä Context Recall: {manual_recall:.2f}")
print(f"\n   ‚ö†Ô∏è Interpretation: {100 - manual_recall*100:.0f}% of required info was NOT retrieved!")

üî¢ STEP 3: Calculate Context Recall

   Claims found in context: 2
   Total claims in reference: 3

   Formula: 2 / 3 = 0.67

   üìä Context Recall: 0.67

   ‚ö†Ô∏è Interpretation: 33% of required info was NOT retrieved!


In [67]:
recall_sample = SingleTurnSample(
    user_input=recall_question,
    response="The Eiffel Tower is in Paris and was built in 1889.",
    reference=recall_reference,
    retrieved_contexts=recall_context
)

recall_metric = LLMContextRecall(llm=ragas_llm)
ragas_recall = run_async(recall_metric.single_turn_ascore(recall_sample))

print("üî¨ RAGAS Context Recall Result")
print("=" * 50)
print(f"\n   Manual calculation: {manual_recall:.2f}")
print(f"   RAGAS metric score: {ragas_recall:.2f}")

üî¨ RAGAS Context Recall Result

   Manual calculation: 0.67
   RAGAS metric score: 0.67


In [68]:
entity_reference = "Albert Einstein developed the theory of relativity at Princeton University in 1905."
entity_context = [
    "Albert Einstein was a famous physicist who worked at Princeton."
]

print("üìù Reference Answer:")
print(f"   '{entity_reference}'")
print("\nüìö Retrieved Context:")
print(f"   '{entity_context[0]}'")

üìù Reference Answer:
   'Albert Einstein developed the theory of relativity at Princeton University in 1905.'

üìö Retrieved Context:
   'Albert Einstein was a famous physicist who worked at Princeton.'


In [71]:
entity_extraction_prompt = ChatPromptTemplate.from_template("""
Extract all named entities from the following text. 
Include: PERSON, ORGANIZATION, LOCATION, DATE, and other proper nouns.

Text: {text}

List each entity on a new line with its type:
"""

)
entity_chain = entity_extraction_prompt | llm | StrOutputParser()
print("üîç Entity Extraction")
print("=" * 60)

print("   Referance Entity:")
ref_entities = entity_chain.invoke({
    "text":entity_reference
})
print(ref_entities)

print("\n   Context Entities: ")
ctx_entities = entity_chain.invoke(
    {"text":entity_context[0]}
)
print(ctx_entities)


üîç Entity Extraction
   Referance Entity:
Albert Einstein - PERSON  
Princeton University - ORGANIZATION  
1905 - DATE  

   Context Entities: 
Albert Einstein - PERSON  
Princeton - LOCATION


In [86]:
reference_entities = {
    "Albert Einstein": "PERSON",
    "Princeton University": "ORGANIZATION",
    "1905": "DATE"
}

context_entities = {
    "Albert Einstein": "PERSON",
    "Princeton": "ORGANIZATION"  # Partial match
}

print("üìä Entity Comparison")
print("=" * 60)

print("\n| Entity in Reference  |     Type     |Found in Context? |")
print("|" + "-" * 22 + "|" + "-" * 14 + "|" + "-" * 18 + "|")

found_count = 0
for entity, entity_type in reference_entities.items():
    found = any(entity.lower() in ctx.lower() or ctx.lower() in entity.lower() for ctx in context_entities.keys())
    if found:
        found_count+=1
    status = "‚úÖ Yes" if found else "‚ùå MISSING"
    print(f"| {entity:20} | {entity_type:12} |{status:16}|")

entity_recall = found_count / len(reference_entities)
print(f"\n   Entity Recall: {found_count}/{len(reference_entities)} = {entity_recall:.2f}")
print(f"‚ö†Ô∏è Missing: '1905' - Critical date not retrieved!")

üìä Entity Comparison

| Entity in Reference  |     Type     |Found in Context? |
|----------------------|--------------|------------------|
| Albert Einstein      | PERSON       |‚úÖ Yes           |
| Princeton University | ORGANIZATION |‚úÖ Yes           |
| 1905                 | DATE         |‚ùå MISSING       |

   Entity Recall: 2/3 = 0.67
‚ö†Ô∏è Missing: '1905' - Critical date not retrieved!


In [87]:
entity_sample = SingleTurnSample(
    reference=entity_reference,
    retrieved_contexts=entity_context
)
entity_metric = ContextEntityRecall(llm=ragas_llm)
ragas_entity_recall = run_async(entity_metric.single_turn_ascore(entity_sample))

print("üî¨ RAGAS Context Entity Recall Result")
print("=" * 50)
print(f"\n   Manual estimate: {entity_recall:.2f}")
print(f"   RAGAS metric score: {ragas_entity_recall:.2f}")

üî¨ RAGAS Context Entity Recall Result

   Manual estimate: 0.67
   RAGAS metric score: 0.25


In [88]:

noise_question = "What is LIC known for?"
noise_response = "LIC is the largest insurance company in India, known for its vast portfolio. LIC contributes to financial stability."
noise_reference = "LIC is the largest insurance company in India, established in 1956. It is known for managing a large portfolio of investments."

noise_contexts = [
    "LIC was established in 1956 following nationalization.",           # ‚úÖ Relevant
    "LIC is the largest insurance company with huge investments.",      # ‚úÖ Relevant
    "LIC manages substantial funds for financial stability.",           # ‚úÖ Relevant
    "The Indian economy is one of the fastest-growing economies..."     # ‚ùå NOISE!
]

print("üìù Question: '{}'\n".format(noise_question))
print("üìù Response to evaluate:")
print(f"   '{noise_response}'")
print("\nüìù Reference (Ground Truth):")
print(f"   '{noise_reference}'")
print("\nüìö Retrieved Contexts:")
for i, ctx in enumerate(noise_contexts, 1):
    noise_tag = " ‚Üê NOISE!" if i == 4 else " ‚úÖ"
    print(f"   {i}. '{ctx[:60]}...'{noise_tag}")

üìù Question: 'What is LIC known for?'

üìù Response to evaluate:
   'LIC is the largest insurance company in India, known for its vast portfolio. LIC contributes to financial stability.'

üìù Reference (Ground Truth):
   'LIC is the largest insurance company in India, established in 1956. It is known for managing a large portfolio of investments.'

üìö Retrieved Contexts:
   1. 'LIC was established in 1956 following nationalization....' ‚úÖ
   2. 'LIC is the largest insurance company with huge investments....' ‚úÖ
   3. 'LIC manages substantial funds for financial stability....' ‚úÖ
   4. 'The Indian economy is one of the fastest-growing economies.....' ‚Üê NOISE!


In [89]:
# Analyze claims in response

response_claims = [
    ("LIC is the largest insurance company in India", True, "Matches reference"),
    ("LIC is known for its vast portfolio", True, "Matches reference (portfolio)"),
    ("LIC contributes to financial stability", False, "NOT in reference - possible hallucination from noise!")
]

print("üîç Claim Analysis")
print("=" * 70)

print("\n| Claim | Correct? | Reason |")
print("|" + "-" * 45 + "|" + "-" * 10 + "|" + "-" * 40 + "|")

incorrect_count = 0
for claim, is_correct, reason in response_claims:
    status = "‚úÖ Yes" if is_correct else "‚ùå No"
    if not is_correct:
        incorrect_count += 1
    print(f"| {claim[:43]:43} | {status:8} | {reason[:38]:38} |")

üîç Claim Analysis

| Claim | Correct? | Reason |
|---------------------------------------------|----------|----------------------------------------|
| LIC is the largest insurance company in Ind | ‚úÖ Yes    | Matches reference                      |
| LIC is known for its vast portfolio         | ‚úÖ Yes    | Matches reference (portfolio)          |
| LIC contributes to financial stability      | ‚ùå No     | NOT in reference - possible hallucinat |


In [90]:
total_claims = len(response_claims)
noise_sensitivity = incorrect_count / total_claims

print("üî¢ Noise Sensitivity Calculation")
print("=" * 50)
print(f"\n   Incorrect claims: {incorrect_count}")
print(f"   Total claims: {total_claims}")
print(f"\n   Formula: {incorrect_count} / {total_claims} = {noise_sensitivity:.2f}")
print(f"\n   üìä Noise Sensitivity: {noise_sensitivity:.2f}")

if noise_sensitivity < 0.3:
    print("   ‚úÖ Good! Model is mostly resistant to noise.")
elif noise_sensitivity < 0.6:
    print("   ‚ö†Ô∏è Warning! Model is sometimes confused by noise.")
else:
    print("   üö® Bad! Model is highly susceptible to noise.")

üî¢ Noise Sensitivity Calculation

   Incorrect claims: 1
   Total claims: 3

   Formula: 1 / 3 = 0.33

   üìä Noise Sensitivity: 0.33


In [91]:
noise_sample = SingleTurnSample(
    user_input=noise_question,
    response=noise_response,
    reference=noise_reference,
    retrieved_contexts=noise_contexts
)
noise_metric_relevant = NoiseSensitivity(llm=ragas_llm, mode="relevant")
ragas_noise = run_async(noise_metric_relevant.single_turn_ascore(noise_sample))

print("üî¨ RAGAS Noise Sensitivity Result")
print("=" * 50)
print(f"\n   Mode: relevant")
print(f"   Score: {ragas_noise:.2f}")
print(f"\n   Remember: Lower is better for this metric!")

üî¨ RAGAS Noise Sensitivity Result

   Mode: relevant
   Score: 0.33

   Remember: Lower is better for this metric!


In [92]:
complete_sample = SingleTurnSample(
    user_input="What is the Eiffel Tower and where is it located?",
    response="The Eiffel Tower is a famous iron lattice tower located in Paris, France. It was built in 1889.",
    reference="The Eiffel Tower is a wrought-iron lattice tower in Paris, France. It was constructed from 1887 to 1889.",
    retrieved_contexts=[
        "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France.",
        "The tower was constructed from 1887 to 1889 as the centerpiece of the 1889 World's Fair.",
        "The Eiffel Tower is named after Gustave Eiffel, whose company designed and built the tower.",
        "Paris is known for its cafe culture and fashion industry."  # Some noise
    ]
)
print("üìä Complete Sample for Evaluation")
print("=" * 60)
print(f"\nQuestion: {complete_sample.user_input}")
print(f"\nResponse: {complete_sample.response}")
print(f"\nReferance: {complete_sample.reference}")
print(f"\nContexts: {len(complete_sample.retrieved_contexts)}")

üìä Complete Sample for Evaluation

Question: What is the Eiffel Tower and where is it located?

Response: The Eiffel Tower is a famous iron lattice tower located in Paris, France. It was built in 1889.

Referance: The Eiffel Tower is a wrought-iron lattice tower in Paris, France. It was constructed from 1887 to 1889.

Contexts: 4


In [93]:
print("üî¨ Running All 6 RAGAS Metrics")
print("=" * 60)

all_metric = {
    "Faithfulness":Faithfulness(llm=ragas_llm),
    "Answer Relevancy": ResponseRelevancy(llm=ragas_llm,embeddings=ragas_embeddings),
    "Context Precision": LLMContextPrecisionWithReference(llm=ragas_llm),
    "Context Recall": LLMContextRecall(llm=ragas_llm),
    "Context Entity Recall":ContextEntityRecall(llm=ragas_llm),
    "Noise Sensitivity":NoiseSensitivity(llm=ragas_llm)
}
result = {}
for name, metric in all_metric.items():
    try:
        score = run_async(metric.single_turn_ascore(complete_sample))
        result[name] = score

        if name == "Noise Sensitivity":
            quality = "Good" if score < 0.3 else "Concerning" if score < 0.6 else "Poor"
            direction = "(lower is better)"
        else:
            quality = "Good" if score > 0.7 else "Concerning" if score > 0.5 else "Poor"
            direction = "(higher is better)"
        print(f"\n   {name}: {score:.3f} {direction}")
        print(f"   Assessment: {quality}")



    except Exception as e:
        print(f"\n  {name}: Error - {str(e)[:50]}")
        result[name] = None

üî¨ Running All 6 RAGAS Metrics

   Faithfulness: 1.000 (higher is better)
   Assessment: Good

   Answer Relevancy: 0.970 (higher is better)
   Assessment: Good

   Context Precision: 1.000 (higher is better)
   Assessment: Good

   Context Recall: 1.000 (higher is better)
   Assessment: Good

   Context Entity Recall: 1.000 (higher is better)
   Assessment: Good

   Noise Sensitivity: 0.000 (lower is better)
   Assessment: Good


In [94]:
summary_data = []
for name, score in result.items():
    if score is not None:
        if name == "Noise Sensitivity":
            ideal = "0.0"
            status = "‚úÖ" if score<0.3 else "‚ö†Ô∏è" if score < 0.6 else "‚ùå"
        else:
            ideal = "1.0"
            status = "‚úÖ" if score > 0.7 else "‚ö†Ô∏è" if score >0.5 else "‚ùå"
        summary_data.append({
            "Metric":name,
            "Score": score,
            "Ideal": ideal,
            "Status": status
        }) 
df_summary = pd.DataFrame(summary_data)
print(df_summary.to_string(index=False))

               Metric    Score Ideal Status
         Faithfulness 1.000000   1.0      ‚úÖ
     Answer Relevancy 0.969854   1.0      ‚úÖ
    Context Precision 1.000000   1.0      ‚úÖ
       Context Recall 1.000000   1.0      ‚úÖ
Context Entity Recall 1.000000   1.0      ‚úÖ
    Noise Sensitivity 0.000000   0.0      ‚úÖ


In [95]:
# Create a batch of test samples

test_samples = [
    SingleTurnSample(
        user_input="What is RAG?",
        response="RAG stands for Retrieval Augmented Generation. It combines retrieval systems with LLMs to provide accurate, grounded responses.",
        reference="RAG (Retrieval Augmented Generation) is a technique that enhances LLM responses by retrieving relevant documents and using them as context.",
        retrieved_contexts=[
            "RAG combines retrieval with generation for accurate responses.",
            "Retrieval Augmented Generation uses external knowledge bases."
        ]
    ),
    SingleTurnSample(
        user_input="What are embeddings?",
        response="Embeddings are numerical vector representations of text that capture semantic meaning.",
        reference="Embeddings are dense vector representations that encode semantic information about text into numerical format.",
        retrieved_contexts=[
            "Embeddings convert text to dense vectors.",
            "Vector representations capture semantic similarity."
        ]
    ),
    SingleTurnSample(
        user_input="What is chunking?",
        response="Chunking is the process of breaking documents into smaller pieces for processing.",
        reference="Chunking divides large documents into smaller segments that can be individually embedded and retrieved.",
        retrieved_contexts=[
            "Document chunking breaks text into manageable pieces.",
            "Chunk size affects retrieval quality."
        ]
    )
]

print(f"üìä Created {len(test_samples)} test samples for batch evaluation")

üìä Created 3 test samples for batch evaluation


In [96]:
from ragas import EvaluationDataset

eval_dataset = EvaluationDataset(samples=test_samples)

batch_metrics = [
    Faithfulness(llm=ragas_llm),
    ResponseRelevancy(llm=ragas_llm, embeddings=ragas_embeddings),
    LLMContextRecall(llm=ragas_llm)
]

print("üî¨ Running Batch Evaluation...")
print("=" * 50)

batch_result = evaluate(
    dataset=eval_dataset,
    metrics=batch_metrics
)
print("\n‚úÖ Batch evaluation complete!")

üî¨ Running Batch Evaluation...


Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.



‚úÖ Batch evaluation complete!


In [97]:
print("üìä Batch Evaluation Results")
print("=" * 60)

result_df = batch_result.to_pandas()
print(result_df.to_string())

print("\nüìà Average Scores:")
for col in result_df.columns:
    if col not in ['user_input', 'response', 'reference', 'retrieved_contexts']:
        avg = result_df[col].mean()
        print(f"   {col}: {avg:.3f}")

üìä Batch Evaluation Results
             user_input                                                                                                               retrieved_contexts                                                                                                                         response                                                                                                                                    reference  faithfulness  answer_relevancy  context_recall
0          What is RAG?  [RAG combines retrieval with generation for accurate responses., Retrieval Augmented Generation uses external knowledge bases.]  RAG stands for Retrieval Augmented Generation. It combines retrieval systems with LLMs to provide accurate, grounded responses.  RAG (Retrieval Augmented Generation) is a technique that enhances LLM responses by retrieving relevant documents and using them as context.      0.666667          0.940876             0.0
1  What are embeddings?     