# General Purpose Metrics

In [1]:
import os
from dotenv import load_dotenv
# Pull variables defined in a local .env file into the process environment.
load_dotenv()

# os.getenv returns None when the variable is missing, and os.environ only
# accepts string values, so an unconditional assignment would raise TypeError
# on machines without a Groq key. Re-export the key only when it is present.
_groq_api_key = os.getenv("GROQ_API_KEY")
if _groq_api_key is not None:
    os.environ["GROQ_API_KEY"] = _groq_api_key

In [None]:
from langchain_groq import ChatGroq
from langchain_ollama import ChatOllama
from ragas.llms import LangchainLLMWrapper

# Alternative Groq-hosted backend, kept for reference:
#   groq_llm = ChatGroq(model="gemma2-9b-it")  # or "llama3-8b-8192"

# Name of the Ollama model that acts as the judge for the metrics.
OLLAMA_MODEL = "llama3.2:1b"

llm = ChatOllama(model=OLLAMA_MODEL)

# Ragas metrics take the LangChain chat model through this wrapper.
evaluator_llm = LangchainLLMWrapper(llm)

### Aspect Critic

- AspectCritic is an evaluation metric that judges a response against a predefined aspect described in free-form natural language (for example, maliciousness, as in the cell below).

In [None]:
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import AspectCritic

sample = SingleTurnSample(
    user_input="Where is the Eiffel Tower located?",
    response="The Eiffel Tower is located in Paris.",
)

scorer =  AspectCritic(
        name="maliciousness",
        definition="Is the submission intended to harm, deceive, or exploit users?",
        llm=evaluator_llm
    )

await scorer.single_turn_ascore(sample)

### Simple Criteria Scoring

- Simple Criteria Scoring is a coarse-grained evaluation metric that scores responses (as an integer) against a single, predefined, free-form scoring criterion.

In [None]:
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import SimpleCriteriaScore

sample = SingleTurnSample(
    user_input="Where is the Eiffel Tower located?",
    response="The Eiffel Tower is located in Paris.",
    reference="The Eiffel Tower is located in Egypt"
)

scorer =  SimpleCriteriaScore(
    name="course_grained_score", 
    definition="Score 0 to 5 by similarity",
    llm=evaluator_llm
)

await scorer.single_turn_ascore(sample)

### Rubrics based criteria scoring

- The Rubric-Based Criteria Scoring Metric is used to do evaluations based on user-defined rubrics.

In [None]:
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import RubricsScore

# A factually wrong response paired with a correct reference statement.
flat_earth_claim = "The Earth is flat and does not orbit the Sun."
scientific_reference = "Scientific consensus, supported by centuries of evidence, confirms that the Earth is a spherical planet that orbits the Sun. This has been demonstrated through astronomical observations, satellite imagery, and gravity measurements."

sample = SingleTurnSample(
    response=flat_earth_claim,
    reference=scientific_reference,
)

# Meaning of each score, ordered from 1 (entirely incorrect) to 5 (completely
# accurate); the position in the tuple determines the score level.
_score_meanings = (
    "The response is entirely incorrect and fails to address any aspect of the reference.",
    "The response contains partial accuracy but includes major errors or significant omissions that affect its relevance to the reference.",
    "The response is mostly accurate but lacks clarity, thoroughness, or minor details needed to fully address the reference.",
    "The response is accurate and clear, with only minor omissions or slight inaccuracies in addressing the reference.",
    "The response is completely accurate, clear, and thoroughly addresses the reference without any errors or omissions.",
)

# Keys follow the "score<N>_description" pattern, N = 1..5.
rubrics = {
    f"score{level}_description": meaning
    for level, meaning in enumerate(_score_meanings, start=1)
}


scorer = RubricsScore(rubrics=rubrics, llm=evaluator_llm)
await scorer.single_turn_ascore(sample)

### Instance Specific rubrics criteria scoring

- Instance Specific Evaluation Metric is a rubric-based method used to evaluate each item in a dataset individually. 

This differs from the Rubric Based Criteria Scoring Metric, where a single rubric is applied to uniformly evaluate all items in the dataset. In the Instance-Specific Evaluation Metric, you decide which rubric to use for each item.

In [None]:
from ragas import EvaluationDataset
from ragas.evaluation import evaluate
from ragas.metrics import InstanceRubrics

# Each row carries its own rubric, so every item is judged on a different
# criterion. The question is stored under "user_input", the field name that
# SingleTurnSample uses elsewhere in this notebook; under another key such as
# "user_query" the question would presumably not reach that field (TODO confirm
# how the sample schema treats unknown keys).
dataset = [
    # Relevance to Query
    {
        "user_input": "How do I handle exceptions in Python?",
        "response": "To handle exceptions in Python, use the `try` and `except` blocks to catch and handle errors.",
        "reference": "Proper error handling in Python involves using `try`, `except`, and optionally `else` and `finally` blocks to handle specific exceptions or perform cleanup tasks.",
        "rubrics": {
            "score0_description": "The response is off-topic or irrelevant to the user query.",
            "score1_description": "The response is fully relevant and focused on the user query.",
        },
    },
    # Code Efficiency
    {
        "user_input": "How can I create a list of squares for numbers 1 through 5 in Python?",
        "response": """
            # Using a for loop
            squares = []
            for i in range(1, 6):
                squares.append(i ** 2)
            print(squares)
                """,
        "reference": """
            # Using a list comprehension
            squares = [i ** 2 for i in range(1, 6)]
            print(squares)
                """,
        "rubrics": {
            "score0_description": "The code is inefficient and has obvious performance issues (e.g., unnecessary loops or redundant calculations).",
            "score1_description": "The code is efficient, optimized, and performs well even with larger inputs.",
        },
    },
]


# Convert the list of row dicts into a ragas evaluation dataset.
evaluation_dataset = EvaluationDataset.from_list(dataset)

# The metric takes its rubric from each row; the evaluator LLM is passed both
# to the metric and to evaluate().
instance_rubrics_metric = InstanceRubrics(llm=evaluator_llm)

result = evaluate(
    dataset=evaluation_dataset,
    metrics=[instance_rubrics_metric],
    llm=evaluator_llm,
)

# Last expression: the notebook displays the evaluation result.
result

# Tasks Metrics

### Summarization Score

- The SummarizationScore metric measures how well the summary (`response`) captures the important information from the source contexts (passed as `reference_contexts` in the sample below).

In [None]:
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import SummarizationScore


sample = SingleTurnSample(
    response="A company is launching a fitness tracking app that helps users set exercise goals, log meals, and track water intake, with personalized workout suggestions and motivational reminders.",
    reference_contexts=[
        "A company is launching a new product, a smartphone app designed to help users track their fitness goals. The app allows users to set daily exercise targets, log their meals, and track their water intake. It also provides personalized workout recommendations and sends motivational reminders throughout the day."
    ]
)

scorer = SummarizationScore(llm=evaluator_llm)
await scorer.single_turn_ascore(sample)