# Natural Language Comparison

In [1]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into os.environ.
load_dotenv()

# Fail fast with an actionable message when the key is absent or empty.
# Copying os.getenv("GROQ_API_KEY") back into os.environ is a no-op when the
# key is set and raises an opaque TypeError when it is not (os.environ only
# accepts str values), so an explicit check replaces that round-trip.
if not os.getenv("GROQ_API_KEY"):
    raise EnvironmentError(
        "GROQ_API_KEY is not set. Add it to your .env file or export it "
        "before running this notebook."
    )

In [2]:
from langchain_groq import ChatGroq
from ragas.llms import LangchainLLMWrapper

# Groq-hosted chat model used as the judge for the LLM-based metrics below.
# NOTE(review): confirm "llama3-8b-8192" is still an available Groq model id.
groq_llm = ChatGroq(model="llama3-8b-8192")
# Wrap the LangChain model so it can be passed as `llm=` to a ragas metric.
evaluator_llm = LangchainLLMWrapper(groq_llm)

  from .autonotebook import tqdm as notebook_tqdm


# Factual Correctness

- Factual correctness compares the generated response with the reference and evaluates how factually accurate the response is.
- It is used to determine the extent to which the generated response aligns with the reference.

In [3]:
from ragas.metrics import FactualCorrectness
from ragas import SingleTurnSample

sample = SingleTurnSample(
    response="The Eiffel Tower is located in Paris.",
    reference="The Eiffel Tower is located in Paris. I has a height of 1000ft."
)

scorer = FactualCorrectness(llm= evaluator_llm, mode="f1")
await scorer.single_turn_ascore(sample)

np.float64(0.67)

### Controlling the Number of Claims

- Atomicity refers to how much a sentence is broken down into its smallest, meaningful components.
- Coverage refers to how comprehensively the claims represent the information in the original sentence. 

# Semantic Similarity

- The concept of Answer Semantic Similarity pertains to the assessment of the semantic resemblance between the generated answer and the ground truth.
- This evaluation utilizes a bi-encoder model to calculate the semantic similarity score.

In [4]:
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import SemanticSimilarity
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_ollama import OllamaEmbeddings

sample = SingleTurnSample(
    response="The Eiffel Tower is located in Paris.",
    reference="The Eiffel Tower is located in Paris. It has a height of 1000ft."
)

evaluator_embeddings = LangchainEmbeddingsWrapper(OllamaEmbeddings(model="llama3.2:1b"))
scorer = SemanticSimilarity(embeddings=LangchainEmbeddingsWrapper(evaluator_embeddings))
await scorer.single_turn_ascore(sample)

0.8968347134901562

# Traditional Non-LLM Metrics

# Traditional NLP Metrics

In [7]:
# Non-LLM string similarity
# Scores how close the response is to the reference using a traditional string
# distance measure such as Levenshtein, Hamming, or Jaro.

from ragas.dataset_schema import SingleTurnSample
from ragas.metrics._string import NonLLMStringSimilarity, DistanceMeasure

# Response and reference are identical except for the final word.
# `sample` is reused by the next cell, so run this cell first.
sample = SingleTurnSample(
    response="The Eiffel Tower is located in India.",
    reference="The Eiffel Tower is located in Paris."
)

# No distance_measure is passed, so the metric's default measure is used.
scorer = NonLLMStringSimilarity()
# Top-level await: the awaited score is displayed as the cell output.
await scorer.single_turn_ascore(sample)

0.8918918918918919

In [8]:
# Same metric with the Hamming distance selected explicitly.
# Relies on `sample`, `NonLLMStringSimilarity` and `DistanceMeasure` from the
# previous cell, so that cell must have been run first.
scorer = NonLLMStringSimilarity(distance_measure= DistanceMeasure.HAMMING)
await scorer.single_turn_ascore(sample)

0.8918918918918919

# BLEU Score

- The BleuScore metric is used to evaluate the quality of a response by comparing it with the reference.
- It measures the similarity between the response and the reference based on n-gram precision and brevity penalty.
- BLEU score was originally designed to evaluate machine translation systems, but it is also used in other natural language processing tasks.

In [9]:
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import BleuScore

# Response and reference are identical except for the final word.
sample = SingleTurnSample(
    response="The Eiffel Tower is located in India.",
    reference="The Eiffel Tower is located in Paris."
)

# BleuScore is constructed with no arguments here (no LLM or embeddings passed).
scorer = BleuScore()
# Top-level await: the awaited score is displayed as the cell output.
await scorer.single_turn_ascore(sample)

0.7071067811865478

# ROUGE Score

- The RougeScore score is a set of metrics used to evaluate the quality of natural language generations. 
- It measures the overlap between the generated response and the reference text based on n-gram recall, precision, and F1 score. 

In [11]:
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import RougeScore

# Response and reference are identical except for the final word.
sample = SingleTurnSample(
    response="The Eiffel Tower is located in India.",
    reference="The Eiffel Tower is located in Paris."
)

# RougeScore is constructed with its default settings (no arguments passed).
scorer = RougeScore()
# Top-level await: the awaited score is displayed as the cell output.
await scorer.single_turn_ascore(sample)

0.8571428571428571

# Exact Match

- The ExactMatch metric checks if the response is exactly the same as the reference text.

In [12]:
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import ExactMatch

# Deliberately mismatched response/reference pair.
sample = SingleTurnSample(
    response="India",
    reference="Paris"
)

# ExactMatch is constructed with no arguments.
scorer = ExactMatch()
# Top-level await: the awaited score is displayed as the cell output.
await scorer.single_turn_ascore(sample)

0.0

# String Presence

- The StringPresence metric checks if the response contains the reference text.

In [13]:
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import StringPresence

# The reference "Eiffel Tower" appears verbatim inside the response.
sample = SingleTurnSample(
    response="The Eiffel Tower is located in India.",
    reference="Eiffel Tower"
)
# StringPresence is constructed with no arguments.
scorer = StringPresence()
# Top-level await: the awaited score is displayed as the cell output.
await scorer.single_turn_ascore(sample)

1.0