In [32]:
import os
import pickle
import numpy as np
import pandas as pd
import nest_asyncio
from datasets import Dataset
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from ragas import evaluate, RunConfig
from ragas.metrics import faithfulness
from langchain_huggingface import HuggingFaceEmbeddings
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    answer_similarity,
    answer_correctness,
)

# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# os.getenv returns None when the variable is absent; that None would otherwise be
# handed straight to the Groq client. Fail here with an actionable message instead.
api_key = os.getenv('GROQ_API_KEY')
if not api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set; export it or add it to a .env file before running this notebook."
    )

# Names of the models whose generated answers are scored. Each name is also the
# stem of the pickle file that holds that model's samples.
models = ['llama3-70b-8192', 'llama3-8b-8192', 'mixtral-8x7b-32768', 'gemma-7b-it', 'gemma2-9b-it']
# ragas metrics to compute; one results column is produced per metric (keyed by metric.name).
metrics = [answer_relevancy, faithfulness, context_recall, context_precision, answer_similarity, answer_correctness]

In [33]:
# Judge LLM handed to ragas `evaluate` for the LLM-based metrics.
# NOTE(review): this judge ("llama3-8b-8192") is also one of the entries in `models`,
# so it grades its own answers — confirm that is intended.
llm = ChatGroq(
    groq_api_key=api_key,
    model_name="llama3-8b-8192",
    temperature=0,  # presumably chosen to keep judgments as repeatable as possible
)

In [34]:
# Sentence-embedding model handed to ragas `evaluate` alongside the judge LLM.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

In [35]:
# Directory holding one pickled sample set per model (`<model>.pkl`).
# This is the Colab default location; change it when running elsewhere.
SAMPLES_DIR = '/content/sample_data'

# Execution limits shared by every evaluate() call; built once instead of per call.
run_config = RunConfig(timeout=180, max_retries=20, max_wait=120, max_workers=16)

# Column-oriented accumulator: one 'model' column plus one column per metric name.
scores_dict = {'model': [], **{metric.name: [] for metric in metrics}}

# Allow nested event loops so ragas' async executor can run inside the notebook kernel.
nest_asyncio.apply()

for model in models:
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load sample files that were produced by a trusted source.
    with open(f'{SAMPLES_DIR}/{model}.pkl', 'rb') as pickle_file:
        loaded_data_samples = pickle.load(pickle_file)

    # The dataset depends only on the model, so build it once per model rather
    # than once per (model, metric) pair.
    dataset = Dataset.from_dict(loaded_data_samples)

    scores_dict['model'].append(model)
    for metric in metrics:
        # Each metric is evaluated in its own call, so one progress bar is shown per
        # (model, metric) pair.
        # NOTE(review): jobs that hit the timeout are logged by ragas as
        # "Exception raised in Job[n]: TimeoutError()" — confirm how such samples are
        # reflected in the aggregate score before trusting the affected cells.
        score = evaluate(
            dataset,
            metrics=[metric],
            llm=llm,
            embeddings=embeddings,
            run_config=run_config,
        )

        scores_dict[metric.name].append(np.round(score[metric.name], 3))

# One row per model, one column per metric.
scores_df = pd.DataFrame(scores_dict)

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[3]: TimeoutError()


Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[4]: TimeoutError()


Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

In [37]:
# Display the first rows of the per-model score table (5 is the pandas default).
scores_df.head(5)

Unnamed: 0,model,answer_relevancy,faithfulness,context_recall,context_precision,answer_similarity,answer_correctness
0,llama3-70b-8192,0.892,0.715,1.0,1.0,0.713,0.611
1,llama3-8b-8192,0.762,0.542,1.0,1.0,0.605,0.487
2,mixtral-8x7b-32768,0.905,0.461,1.0,1.0,0.704,0.577
3,gemma-7b-it,0.196,0.342,1.0,1.0,0.595,0.456
4,gemma2-9b-it,0.447,0.65,1.0,1.0,0.514,0.481
