# Evaluating using RAGAS
The notebook shows how to perform evaluation of the performance of a RAG pipeline using the RAGAS library.

In [1]:
import nest_asyncio
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
from datasets import Dataset
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings
from ragas import evaluate
from ragas.run_config import RunConfig

  from .autonotebook import tqdm as notebook_tqdm

For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from ragas.metrics._answer_correctness import AnswerCorrectness, answer_correctness

For example, replace imports like: `from langchain.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from ragas.metrics._context_entities_recall import (


In [2]:
nest_asyncio.apply()  # apply the event loop async fix

First load the evaluation set from a file. This should contain not only the synthetic (or human generated) test questions and `ground_truth`s, but also the answer generated by the RAG pipeline as well as the contexts used by the pipeline to generate that answer

In [31]:
df = pd.read_csv(
    "../data/evaluation-sets/eidc-eval-sample.csv", converters={"contexts": pd.eval}
)
eval_dataset = Dataset.from_pandas(df)

In [12]:
eval_dataset["contexts"]

[['The dataset entitled "Snow Survey of Great Britain: transcribed data for Scotland, 1945 to 2007" contains the following information in it\'s "description" metadata field: This dataset comprises observations of snowline from the Snow Survey of Great Britain (SSGB) at 140 sites across Scotland . Daily observations were made between 1945 and 2007. Observations were made by a ground observer who looked out from a given location at 0900 GMT each day and noted the elevation at which snow cover was greater than 50%. \n\nThe initial aim was to \'secure representative data relating to the occurrence of snow cover at different altitudes in the various upland districts over the period October to June\'. \n\nThe data were collated by the British Glaciological Society until 1954 and thereafter by the Met Office. It has been transcribed from paper records held in the Met Office archives in Edinburgh.',
  'The dataset entitled "Global Navigation Satellite System (GNSS) survey of Ciste Mhearad snow

In [13]:
df

Unnamed: 0,question,ground_truth,answer,contexts
0,What was the frequency of snowline observation...,The frequency of snowline observations made da...,The available information does not provide a c...,"[The dataset entitled ""Snow Survey of Great Br..."
1,What was the primary focus of studying the Eur...,The primary focus of studying the European sha...,The available information does not clearly sta...,"[The dataset entitled ""Diet, timing of egg lay..."
2,What are the UKCEH Land Cover Classes used to ...,The UKCEH Land Cover Classes used to describe ...,The UKCEH Land Cover Classes used to describe ...,"[The dataset entitled ""Land Cover Map 2020 (la..."
3,What method was used to classify the pixels in...,The Random Forest classification method was us...,"Based on the available information, it appears...","[The dataset entitled ""Land Cover Map 2017 (la..."
4,What were the specific locations where the exp...,The answer to given question is not present in...,"Based on the available information, it does no...","[The dataset entitled ""Ammonia measurements fr..."


In [14]:
llm = ChatOllama(model="mistral-nemo", num_ctx=16384)
embeddings = OllamaEmbeddings(model="mistral-nemo", num_ctx=16384)

In [15]:
from ragas.metrics import (
    answer_correctness,
    answer_relevancy,
    answer_similarity,
    context_entity_recall,
    context_precision,
    context_recall,
    faithfulness,
)

In [16]:
result = evaluate(
    eval_dataset,
    metrics=[
        faithfulness,
        answer_relevancy,
        context_precision,
        context_recall,
        context_entity_recall,
        answer_similarity,
        answer_correctness,
    ],
    llm=llm,
    embeddings=embeddings,
    raise_exceptions=False,
    run_config=RunConfig(max_workers=1),
)
result

Evaluating:  14%|█▍        | 5/35 [00:35<03:02,  6.07s/it]Failed to parse output. Returning None.
Evaluating: 100%|██████████| 35/35 [03:40<00:00,  6.31s/it]


{'faithfulness': 0.6956, 'answer_relevancy': 0.1845, 'context_precision': 0.3775, 'context_recall': 0.8000, 'context_entity_recall': 0.3667, 'answer_similarity': 0.2146, 'answer_correctness': 0.0534}

In [26]:
result_df = result.to_pandas()
pio.templates.default = "gridon"
fig = go.Figure()
metrics = [
    metric
    for metric in result_df.columns.to_list()
    if metric not in ["question", "ground_truth", "answer", "contexts"]
]
for metric in metrics:
    fig.add_trace(
        go.Violin(
            y=result_df[metric],
            name=metric,
            points="all",
            box_visible=True,
            meanline_visible=True,
        )
    )
fig.update_yaxes(range=[-0.02, 1.02])
with open("eval.png", "wb") as f:
    f.write(fig.to_image(format="png"))

In [30]:
import json

with open("metrics.json", "w") as f:
    json.dump(result, f)