# Metrics

### LLM-based metrics

All LLM-based metrics in ragas inherit from the `MetricWithLLM` class. These metrics expect an LLM object to be set before scoring.

In [2]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# os.environ only accepts str values, so assigning os.getenv(...) directly
# raises an opaque TypeError when the key is missing. Fail early with an
# actionable message instead.
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set; add it to your .env file or export it "
        "before running this notebook."
    )
os.environ["GROQ_API_KEY"] = groq_api_key

In [3]:
from langchain_groq import ChatGroq

# Chat model served through Groq; later cells wrap this object for the
# LLM-based ragas metrics.
llm = ChatGroq(
    model="llama3-8b-8192",
)

In [3]:
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import FactualCorrectness

# `llm` is a raw LangChain chat model; wrap it in LangchainLLMWrapper before
# handing it to the metric, as the other LLM-based metric cells in this
# notebook do.
scorer = FactualCorrectness(llm=LangchainLLMWrapper(llm))

### Non-LLM-based metrics

These metrics rely on traditional methods to evaluate the performance of the AI application, such as string similarity, BLEU score, etc. Because of this, they are known to have a lower correlation with human evaluation.

In [12]:
from ragas import SingleTurnSample, EvaluationDataset

# Single-turn record: the user's question, the retrieved passage, the
# generated answer and the expected (reference) answer.
sample = SingleTurnSample(
    reference="Berlin",
    response="The capital of Germany is Berlin.",
    retrieved_contexts=["Berlin is the capital and largest city of Germany."],
    user_input="What is the capital of Germany?",
)

In [None]:
## SingleTurn Metrics
from ragas.metrics import FactualCorrectness

scorer = FactualCorrectness()
await scorer.single_turn_ascore(sample)

In [None]:
##MultiTurn Metrics
from ragas.metrics import AgentGoalAccuracyWithoutReference
from ragas import MultiTurnSample

scorer = AgentGoalAccuracyWithoutReference()
await scorer.multi_turn_ascore(sample)


# Context Precision

- It is a metric that measures the proportion of relevant chunks in the retrieved_contexts.
- It is calculated as the mean of the precision@k for each chunk in the context.
- Precision@k is the ratio of the number of relevant chunks at rank k to the total number of chunks at rank k.

## LLM Based Context Precision

In [6]:
# Context Precision without referen
from ragas import SingleTurnSample
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import LLMContextPrecisionWithoutReference

evaluator_llm = LangchainLLMWrapper(llm)
context_precision = LLMContextPrecisionWithoutReference(llm=evaluator_llm)

sample = SingleTurnSample(
    user_input="Where is the Eiffel Tower located?",
    response="The Eiffel Tower is located in Paris.",
    retrieved_contexts=["The Eiffel Tower is located in Paris."], 
)

await context_precision.single_turn_ascore(sample)

0.9999999999

In [7]:
# Context Precision with reference
from ragas.metrics import LLMContextPrecisionWithReference

ref_context_precision = LLMContextPrecisionWithReference(llm= evaluator_llm)

sample_2 = SingleTurnSample(
    user_input="Where is the Eiffel Tower located?",
    reference="The Eiffel Tower is located in Paris.",
    retrieved_contexts=["The Eiffel Tower is located in Paris."],
)

await ref_context_precision.single_turn_ascore(sample_2)

0.9999999999

## Non LLM Based Context Precision

In [8]:
from ragas.metrics import NonLLMContextPrecisionWithReference

NonLLM_context_precision = NonLLMContextPrecisionWithReference()

sample_3 = SingleTurnSample(
    retrieved_contexts=["The Eiffel Tower is located in Paris."], 
    reference_contexts=["Paris is the capital of France.", "The Eiffel Tower is one of the most famous landmarks in Paris."]
)

await NonLLM_context_precision.single_turn_ascore(sample_3)

0.9999999999

# Context Recall

- Context Recall measures how many of the relevant documents (or pieces of information) were successfully retrieved.

In [9]:
# LLM Based Context Recall

from ragas import SingleTurnSample
from ragas.metrics import LLMContextRecall

sample_4 = SingleTurnSample(
    user_input="Where is the Eiffel Tower located?",
    response="The Eiffel Tower is located in Paris.",
    reference="The Eiffel Tower is located in Paris.",
    retrieved_contexts=["Paris is the capital of France."], 
)

context_recall = LLMContextRecall(llm= evaluator_llm)
await context_recall.single_turn_ascore(sample_4)

1.0

In [10]:
# Non LLM Based Context Recall
from ragas.metrics import NonLLMContextRecall

sample_1 = SingleTurnSample(
    retrieved_contexts=["Paris is the capital of France."], 
    reference_contexts=["Paris is the capital of France.", "The Eiffel Tower is one of the most famous landmarks in Paris."]
)

nonllm_context_recall = NonLLMContextRecall()
await nonllm_context_recall.single_turn_ascore(sample_1)

0.5

# Context Entities Recall

ContextEntityRecall metric gives the measure of recall of the retrieved context, based on the number of entities present in both reference and retrieved_contexts relative to the number of entities present in the reference alone.

In [11]:
from ragas.metrics import ContextEntityRecall

sample2 = SingleTurnSample(
    reference="The Eiffel Tower is located in Paris.",
    retrieved_contexts=["The Eiffel Tower is located in Paris."], 
)

CER = ContextEntityRecall(llm= evaluator_llm)
await CER.single_turn_ascore(sample2)

0.999999995

# Noise Sensitivity

In [12]:
from ragas.metrics import NoiseSensitivity

sample3 = SingleTurnSample(
    user_input="What is the Life Insurance Corporation of India (LIC) known for?",
    response="The Life Insurance Corporation of India (LIC) is the largest insurance company in India, known for its vast portfolio of investments. LIC contributes to the financial stability of the country.",
    reference="The Life Insurance Corporation of India (LIC) is the largest insurance company in India, established in 1956 through the nationalization of the insurance industry. It is known for managing a large portfolio of investments.",
    retrieved_contexts=[
        "The Life Insurance Corporation of India (LIC) was established in 1956 following the nationalization of the insurance industry in India.",
        "LIC is the largest insurance company in India, with a vast network of policyholders and huge investments.",
        "As the largest institutional investor in India, LIC manages substantial funds, contributing to the financial stability of the country.",
        "The Indian economy is one of the fastest-growing major economies in the world, thanks to sectors like finance, technology, manufacturing etc."
    ]
)

ns = NoiseSensitivity(llm= evaluator_llm)
await ns.single_turn_ascore(sample3)

np.float64(0.3333333333333333)

In [14]:
scorer = NoiseSensitivity(llm=evaluator_llm, mode="irrelevant")
await scorer.single_turn_ascore(sample3)

np.float64(0.0)

# Response Relevancy

- The ResponseRelevancy metric measures how relevant a response is to the user input.

In [17]:
from ragas.metrics import ResponseRelevancy
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_ollama import OllamaEmbeddings

sample4 = SingleTurnSample(
        user_input="When was the first super bowl?",
        response="The first superbowl was held on Jan 15, 1967",
        retrieved_contexts=[
            "The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles."
        ]
    )

evaluator_embeddings = LangchainEmbeddingsWrapper(OllamaEmbeddings(model="llama3.2:1b"))
RR = ResponseRelevancy(llm= evaluator_llm, embeddings=evaluator_embeddings)
await RR.single_turn_ascore(sample4)

np.float64(0.9352738308044745)

# Faithfulness

- The Faithfulness metric measures how factually consistent a response is with the retrieved context.

In [18]:
from ragas.metrics import Faithfulness

sample4 = SingleTurnSample(
        user_input="When was the first super bowl?",
        response="The first superbowl was held on Jan 15, 1967",
        retrieved_contexts=[
            "The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles."
        ]
    )

ff = Faithfulness(llm= evaluator_llm)
await ff.single_turn_ascore(sample4)

1.0

## Faithfulness with HHEM-2.1-Open

- Vectara's HHEM-2.1-Open is a classifier model (T5) that is trained to detect hallucinations from LLM generated text.

In [None]:
## Need to install HuggingFace Transformers Library

# Faithfullness with HHEM-2.1-Open
from ragas.metrics import FaithfulnesswithHHEM

sample = SingleTurnSample(
        user_input="When was the first super bowl?",
        response="The first superbowl was held on Jan 15, 1967",
        retrieved_contexts=[
            "The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles."
        ]
    )

hhem_scorer = FaithfulnesswithHHEM(llm= evaluator_llm)
await hhem_scorer.single_turn_ascore(sample)

In [None]:
my_device = "cuda:0"
my_batch_size = 10

scorer = FaithfulnesswithHHEM(device=my_device, batch_size=my_batch_size)
await scorer.single_turn_ascore(sample)