
# EVALUATORS

In [None]:
import os
# Placeholder OpenAI credential -- replace with a real key before running.
# It is kept as a plain variable (handed explicitly to UpTrain's EvalLLM
# further down) and also exported through the process environment.
openai_api_key = "your-key"
os.environ["OPENAI_API_KEY"] = openai_api_key

# Evaluation records consumed by every section below. Each record is read as
# a dict with the keys 'question', 'answer', 'contexts' and 'ground_truths'.
result = []

### DEEPEVAL

In [None]:
from deepeval import evaluate
from deepeval.metrics import ContextualPrecisionMetric
from deepeval.test_case import LLMTestCase

# Contextual-precision metric, configured with a 0.7 threshold, the
# gpt-3.5-turbo model and reason generation switched on.
metric = ContextualPrecisionMetric(
    threshold=0.7,
    model="gpt-3.5-turbo",
    include_reason=True,
)

# One DeepEval test case per collected RAG record.
test_cases = [
    LLMTestCase(
        input=item['question'],
        actual_output=item['answer'],
        expected_output=item['ground_truths'],
        retrieval_context=item['contexts'],
    )
    for item in result
]

# Measure each case on its own; the score is read back from `metric.score`
# after every `measure` call.
scores = []
for test_case in test_cases:
    metric.measure(test_case)
    scores.append(metric.score)

# With no records (e.g. the placeholder `result = []`) a plain mean would
# raise ZeroDivisionError, so report NaN instead of crashing the cell.
average_score = sum(scores) / len(scores) if scores else float("nan")

print("Average Context precision Score:", average_score)

### UPTRAIN

In [None]:
from uptrain import EvalLLM, ResponseMatching
# `data` holds the payload for the record currently being evaluated (it is
# rebuilt on every iteration); `scores` collects one value per evaluation row.
data = []
scores = []
eval_llm = EvalLLM(openai_api_key=openai_api_key)

for item in result:
    # NOTE(review): every field is wrapped in a single-element list, exactly
    # as before -- confirm this is the shape ResponseMatching expects.
    data = [{
        "question": [item['question']],
        "response": [item['answer']],
        "ground_truth": [item['ground_truths']],
    }]

    # One evaluation call per record, using the LLM-based matching method.
    res = eval_llm.evaluate(
        data=data,
        checks=[ResponseMatching(method='llm')],
    )

    scores.extend(
        evaluation_result['score_response_match_llm']
        for evaluation_result in res
    )

# With no records a plain mean would raise ZeroDivisionError; report NaN.
average_score = sum(scores) / len(scores) if scores else float("nan")
# This cell measures response matching, so the label says so (the previous
# "Context precision" label had been copied from the DeepEval cell).
print("Average Response Matching Score:", average_score)

### TRULENS

In [None]:
from trulens_eval import Tru, Feedback, Select
from trulens_eval.tru_custom_app import instrument
from trulens_eval.feedback.provider.openai import OpenAI
from trulens_eval import TruCustomApp

import numpy as np

# TruLens session object and the OpenAI-backed feedback provider.
tru = Tru()
provider = OpenAI()

# Context-relevance feedback: fed with the `query` argument and the return
# value of the instrumented `retrieve` call, then aggregated with the mean.
f_context_relevance = (
    Feedback(provider.context_relevance_with_cot_reasons)
    .on(Select.RecordCalls.retrieve.args.query)
    .on(Select.RecordCalls.retrieve.rets)
    .aggregate(np.mean)
)

class DummyRAG:
    """Replay pre-computed RAG records through instrumented methods.

    No retrieval or generation happens here: every method looks up the
    stored record whose 'question' equals the query and returns the value
    saved in it.
    """

    def __init__(self, results):
        # Records are dicts read via the keys 'question', 'contexts', 'answer'.
        self.results = results

    @instrument
    def retrieve(self, query):
        """Return the stored contexts of the first record matching *query*.

        An empty list is returned when no record has that question.
        """
        stored_contexts = (
            record['contexts']
            for record in self.results
            if record['question'] == query
        )
        return next(stored_contexts, [])

    @instrument
    def generate_completion(self, query, contexts):
        """Return the stored answer of the first record matching *query*.

        *contexts* is accepted for interface parity but is not consulted.
        An empty string is returned when no record has that question.
        """
        stored_answers = (
            record['answer']
            for record in self.results
            if record['question'] == query
        )
        return next(stored_answers, "")

    @instrument
    def query(self, query):
        """Run the retrieve -> generate_completion pipeline for *query*."""
        return self.generate_completion(query, self.retrieve(query))

# Wrap the replay app for TruLens under app id 'RAG v1', attaching the
# context-relevance feedback defined above in this cell.
dummy_rag = DummyRAG(result)
tru_rag = TruCustomApp(dummy_rag, app_id='RAG v1', feedbacks=[f_context_relevance])

# Send every stored question through the app inside the recording context.
with tru_rag as recording:
    for i in result:
        dummy_rag.query(i['question'])

# NOTE(review): feedback results may be computed asynchronously; if the
# leaderboard looks incomplete, confirm whether the records need to be
# waited on before this call.
print(tru.get_leaderboard(app_ids=["RAG v1"]))

### TONIC VALIDATE

In [None]:
from tonic_validate import Benchmark
from tonic_validate import ValidateScorer
from tonic_validate.metrics import AnswerConsistencyMetric, AnswerSimilarityMetric,AugmentationPrecisionMetric
# Benchmark built from the collected records: each question is paired with
# its 'ground_truths' entry as the reference answer.
benchmark = Benchmark(
    questions=[record['question'] for record in result],
    answers=[record['ground_truths'] for record in result],
)

def get_rag_response(question):
    """Look up the pre-computed RAG output for *question*.

    Scans the module-level ``result`` records and, for the first one whose
    'question' equals *question*, returns a dict with the stored answer
    under "llm_answer" and the stored contexts under "llm_context_list".
    Returns ``None`` when no record matches.
    """
    match = next(
        (record for record in result if record['question'] == question),
        None,
    )
    if match is None:
        return None
    return {
        "llm_answer": match['answer'],
        "llm_context_list": match['contexts'],
    }


# Score with answer similarity only; the other metrics imported in this cell
# (AnswerConsistencyMetric, AugmentationPrecisionMetric) are not used here.
scorer = ValidateScorer(metrics=[AnswerSimilarityMetric()])

# The scorer is given the benchmark plus `get_rag_response` as the callback
# that supplies the stored answer and contexts for a question.
run = scorer.score(benchmark, get_rag_response)
# Bare expression so the notebook displays the run object as the cell output.
run

### RAGAS

In [None]:
from ragas.metrics import context_recall
from ragas import evaluate
from datasets import Dataset 
import pandas as pd
# Load the collected records into a DataFrame (one row per record).
dfresult = pd.DataFrame(result)
# Wrap plain-string ground truths in a one-element list; values that are not
# strings are left untouched.
# NOTE(review): presumably ragas expects a list of strings per row in
# 'ground_truths' -- confirm against the installed ragas version.
dfresult['ground_truths'] = dfresult['ground_truths'].apply(lambda x: [x] if isinstance(x, str) else x)
dataset = Dataset.from_pandas(dfresult)
# `evaluate` is the ragas function imported in this cell; it rebinds the
# DeepEval `evaluate` imported in the DeepEval cell.
score = evaluate(dataset,metrics=[context_recall])
scoredf=score.to_pandas()
# Mean of the per-row 'context_recall' column.
avg_score = scoredf['context_recall'].mean()
print("Average Score:", avg_score)

### FALCON EVALUATE

In [None]:
from falcon_evaluate.fevaluate_results import ModelScoreSummary
from falcon_evaluate.fevaluate_plot import ModelPerformancePlotter
import pandas as pd
import nltk

def convert_to_dataframe(results):
    """Convert RAG records into the three-column frame used for Falcon.

    Each record's 'question', 'ground_truths' and 'answer' values become the
    'prompt', 'reference' and 'Model A' columns respectively, one row per
    record and in the original record order.
    """
    # Output column name -> key read from every record.
    column_sources = {
        'prompt': 'question',
        'reference': 'ground_truths',
        'Model A': 'answer',
    }
    # Materialise once so one-shot iterables can feed all three columns.
    records = list(results)
    return pd.DataFrame({
        column: [record[key] for record in records]
        for column, key in column_sources.items()
    })

# Reshape the collected records into the prompt / reference / 'Model A' frame.
df = convert_to_dataframe(result)
model_score_summary = ModelScoreSummary(df)
# `execute_summary` returns two objects; `resultt` (distinct from the `result`
# list of records) is the one written to disk below.
resultt,agg_score_df = model_score_summary.execute_summary()
# Saved as falconresult.csv, relative to the current working directory.
resultt.to_csv("falconresult.csv")