In [1]:
from eval_fusion_core.utils.loaders import load_evaluation_inputs
from eval_fusion_test.settings import get_openai_settings

## `ragas`

In [None]:
from eval_fusion_ragas.evaluator import RagasEvaluator
from eval_fusion_ragas.metrics import RagasMetric

In [None]:
def test_evaluator(input_count: int):
    """Run the synchronous Ragas faithfulness evaluation and print every output entry.

    Inputs are read from ``../assets/amnesty_qa.json`` and cut down with the
    slice ``[:input_count]`` before being handed to ``RagasEvaluator.evaluate``.

    Args:
        input_count: Stop index of the slice applied to the loaded inputs.
    """
    llm_cfg, em_cfg = get_openai_settings()
    samples = load_evaluation_inputs('../assets/amnesty_qa.json')[:input_count]

    with RagasEvaluator(llm_cfg, em_cfg) as ragas_evaluator:
        results = ragas_evaluator.evaluate(samples, metrics=[RagasMetric.FAITHFULNESS])

    # Entries are printed only after the evaluator context has been exited.
    for result in results:
        for entry in result.output_entries:
            print(entry, end='\n\n')

In [None]:
test_evaluator(3)  # roughly 2.5 min in the recorded run (per-entry times below sum to ~149 s)

metric_name='faithfulness' score=0.84 reason=None error=None time=44.40238550002687

metric_name='faithfulness' score=0.5172413793103449 reason=None error=None time=61.594070750055835

metric_name='faithfulness' score=0.42857142857142855 reason=None error=None time=43.207843584008515



In [2]:
async def a_test_evaluator(input_count: int):
    """Await the Ragas faithfulness evaluation and print every output entry.

    Inputs are read from ``../assets/amnesty_qa.json`` and cut down with the
    slice ``[:input_count]`` before being passed to ``RagasEvaluator.a_evaluate``.

    Args:
        input_count: Stop index of the slice applied to the loaded inputs.
    """
    llm_cfg, em_cfg = get_openai_settings()
    samples = load_evaluation_inputs('../assets/amnesty_qa.json')[:input_count]

    with RagasEvaluator(llm_cfg, em_cfg) as ragas_evaluator:
        results = await ragas_evaluator.a_evaluate(
            samples, metrics=[RagasMetric.FAITHFULNESS]
        )

    # Lazily flatten outputs -> entries so printing order matches a nested loop.
    flattened = (entry for result in results for entry in result.output_entries)
    for entry in flattened:
        print(entry, end='\n\n')

In [3]:
await a_test_evaluator(3)

metric_name='faithfulness' score=0.68 reason=None error=None time=33.963236584095284

metric_name='faithfulness' score=0.041666666666666664 reason=None error=None time=34.28584862500429

metric_name='faithfulness' score=0.3076923076923077 reason=None error=None time=19.234456334030256



## `deepeval`

In [None]:
from eval_fusion_deepeval.evaluator import DeepEvalEvaluator
from eval_fusion_deepeval.metrics import DeepEvalMetric

In [3]:
async def a_test_evaluator(input_count: int):
    """Await the DeepEval faithfulness evaluation and print every output entry.

    Only the first element of the pair returned by ``get_openai_settings()``
    is used; the second is discarded. Inputs come from
    ``../assets/amnesty_qa.json`` and are sliced with ``[:input_count]``.

    Args:
        input_count: Stop index of the slice applied to the loaded inputs.
    """
    llm_cfg, _unused = get_openai_settings()
    samples = load_evaluation_inputs('../assets/amnesty_qa.json')[:input_count]

    with DeepEvalEvaluator(llm_cfg) as deepeval_evaluator:
        results = await deepeval_evaluator.a_evaluate(
            samples, metrics=[DeepEvalMetric.FAITHFULNESS]
        )

    # Entries are printed only after the evaluator context has been exited.
    for result in results:
        for entry in result.output_entries:
            print(entry, end='\n\n')

In [4]:
await a_test_evaluator(3)

metric_name='Faithfulness' score=1.0 reason='The score is 1.00 because there are no contradictions, indicating full alignment between the actual output and the retrieval context. Great job!' error=None time=12.18744883290492

metric_name='Faithfulness' score=1.0 reason='The score is 1.00 because there are no contradictions present, indicating that the actual output aligns perfectly with the retrieval context.' error=None time=13.545768999960274

metric_name='Faithfulness' score=1.0 reason='The score is 0.92 because although the actual output asserts that ConocoPhillips Company is among the largest GHG emitters, this company is not referenced in the retrieval context, leading to a minor inconsistency.' error=None time=11.912813208065927



## `arize-phoenix`

In [5]:
from eval_fusion_phoenix.evaluator import PhoenixEvaluator
from eval_fusion_phoenix.metrics import PhoenixMetric

In [8]:
async def a_test_evaluator(input_count: int):
    """Await the Phoenix relevance evaluation and print every output entry.

    Only the first element of the pair returned by ``get_openai_settings()``
    is used; the second is discarded. Inputs come from
    ``../assets/amnesty_qa.json`` and are sliced with ``[:input_count]``.

    Args:
        input_count: Stop index of the slice applied to the loaded inputs.
    """
    llm_cfg, _unused = get_openai_settings()
    samples = load_evaluation_inputs('../assets/amnesty_qa.json')[:input_count]

    with PhoenixEvaluator(llm_cfg) as phoenix_evaluator:
        results = await phoenix_evaluator.a_evaluate(
            samples, metrics=[PhoenixMetric.RELEVANCE]
        )

    # Lazily flatten outputs -> entries so printing order matches a nested loop.
    flattened = (entry for result in results for entry in result.output_entries)
    for entry in flattened:
        print(entry, end='\n\n')

In [9]:
await a_test_evaluator(3)

metric_name='relevance' score=1.0 reason='To determine if the reference text contains information relevant to the question regarding the global implications of the USA Supreme Court ruling on abortion, we can analyze the content step by step:\n\n1. **Identify the Question**: The question asks about the global implications of a specific ruling by the USA Supreme Court regarding abortion.\n\n2. **Examine the Reference Text**: The reference text outlines the Supreme Court ruling, its immediate impact on abortion access in the USA, and explicitly mentions its global implications.\n\n3. **Global Context in the Reference Text**: \n   - It states that the ruling has had effects beyond national borders, indicating the USA\'s geopolitical and cultural influence.\n   - It mentions reactions from organizations and activists worldwide who fear that the ruling could encourage anti-abortion legislation in other countries.\n   - There are observations about the stalling of progressive law reform in A

## `llama-index`

In [2]:
from eval_fusion_llama_index.evaluator import LlamaIndexEvaluator
from eval_fusion_llama_index.metrics import LlamaIndexMetric

In [3]:
def test_evaluator(input_count: int):
    """Run the synchronous LlamaIndex faithfulness evaluation and print every output entry.

    Inputs are read from ``../assets/amnesty_qa.json`` and cut down with the
    slice ``[:input_count]`` before being handed to ``LlamaIndexEvaluator.evaluate``.

    Args:
        input_count: Stop index of the slice applied to the loaded inputs.
    """
    llm_cfg, em_cfg = get_openai_settings()
    samples = load_evaluation_inputs('../assets/amnesty_qa.json')[:input_count]

    with LlamaIndexEvaluator(llm_cfg, em_cfg) as llama_evaluator:
        results = llama_evaluator.evaluate(
            samples, metrics=[LlamaIndexMetric.FAITHFULNESS]
        )

    # Entries are printed only after the evaluator context has been exited.
    for result in results:
        for entry in result.output_entries:
            print(entry, end='\n\n')

In [4]:
test_evaluator(1)

metric_name='faithfulness' score=1.0 reason='YES' error=None time=1.083593624876812



In [3]:
async def a_test_evaluator(input_count: int):
    """Await the LlamaIndex faithfulness evaluation and print every output entry.

    Inputs are read from ``../assets/amnesty_qa.json`` and cut down with the
    slice ``[:input_count]`` before being passed to
    ``LlamaIndexEvaluator.a_evaluate``.

    Args:
        input_count: Stop index of the slice applied to the loaded inputs.
    """
    llm_cfg, em_cfg = get_openai_settings()
    samples = load_evaluation_inputs('../assets/amnesty_qa.json')[:input_count]

    with LlamaIndexEvaluator(llm_cfg, em_cfg) as llama_evaluator:
        results = await llama_evaluator.a_evaluate(samples, metrics=[LlamaIndexMetric.FAITHFULNESS])

    # Lazily flatten outputs -> entries so printing order matches a nested loop.
    flattened = (entry for result in results for entry in result.output_entries)
    for entry in flattened:
        print(entry, end='\n\n')

In [4]:
await a_test_evaluator(3)

metric_name='faithfulness' score=1.0 reason='YES' error=None time=0.7645129591692239

metric_name='faithfulness' score=1.0 reason='YES' error=None time=0.6694967080838978

metric_name='faithfulness' score=0.0 reason='NO' error=None time=0.6710254161152989



## `tonic-validate`

In [2]:
from eval_fusion_tonic_validate.evaluator import TonicValidateEvaluator
from eval_fusion_tonic_validate.metrics import TonicValidateMetric

In [None]:
def test_evaluator(input_count: int):
    """Run the synchronous Tonic Validate answer-consistency evaluation and print every output entry.

    Only the first element of the pair returned by ``get_openai_settings()``
    is used; the second is discarded. Inputs come from
    ``../assets/amnesty_qa.json`` and are sliced with ``[:input_count]``.

    Args:
        input_count: Stop index of the slice applied to the loaded inputs.
    """
    llm_cfg, _unused = get_openai_settings()
    samples = load_evaluation_inputs('../assets/amnesty_qa.json')[:input_count]

    with TonicValidateEvaluator(llm_cfg) as tonic_evaluator:
        results = tonic_evaluator.evaluate(samples, metrics=[TonicValidateMetric.ANSWER_CONSISTENCY])

    # Entries are printed only after the evaluator context has been exited.
    for result in results:
        for entry in result.output_entries:
            print(entry, end='\n\n')

In [4]:
test_evaluator(1)

Scoring responses: 100%|██████████| 1/1 [00:05<00:00,  5.25s/it]


metric_name='answer_consistency' score=1.0 reason=None error=None time=6.0304533750750124



In [3]:
async def a_test_evaluator(input_count: int):
    """Await the Tonic Validate answer-consistency evaluation and print every output entry.

    Only the first element of the pair returned by ``get_openai_settings()``
    is used; the second is discarded. Inputs come from
    ``../assets/amnesty_qa.json`` and are sliced with ``[:input_count]``.

    Args:
        input_count: Stop index of the slice applied to the loaded inputs.
    """
    llm_cfg, _unused = get_openai_settings()
    samples = load_evaluation_inputs('../assets/amnesty_qa.json')[:input_count]

    with TonicValidateEvaluator(llm_cfg) as tonic_evaluator:
        results = await tonic_evaluator.a_evaluate(samples, metrics=[TonicValidateMetric.ANSWER_CONSISTENCY])

    # Lazily flatten outputs -> entries so printing order matches a nested loop.
    flattened = (entry for result in results for entry in result.output_entries)
    for entry in flattened:
        print(entry, end='\n\n')

In [4]:
await a_test_evaluator(3)

Scoring responses:   0%|          | 0/1 [00:00<?, ?it/s]
Scoring responses: 100%|██████████| 1/1 [00:07<00:00,  7.19s/it]

Scoring responses: 100%|██████████| 1/1 [00:09<00:00,  9.49s/it]
Scoring responses: 100%|██████████| 1/1 [00:15<00:00, 15.22s/it]


metric_name='answer_consistency' score=1.0 reason=None error=None time=7.912690416909754

metric_name='answer_consistency' score=0.6923076923076923 reason=None error=None time=16.03793100011535

metric_name='answer_consistency' score=0.4166666666666667 reason=None error=None time=10.303196958033368



## `ragchecker`

In [2]:
from eval_fusion_ragchecker.evaluator import RagCheckerEvaluator
from eval_fusion_ragchecker.metrics import RagCheckerMetric

W0726 19:05:44.317000 93633 torch/distributed/elastic/multiprocessing/redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.




In [3]:
def test_evaluator(input_count: int):
    """Run the synchronous RAGChecker faithfulness evaluation and print every output entry.

    Only the first element of the pair returned by ``get_openai_settings()``
    is used; the second is discarded. Inputs come from
    ``../assets/amnesty_qa.json`` and are sliced with ``[:input_count]``.

    Args:
        input_count: Stop index of the slice applied to the loaded inputs.
    """
    llm_cfg, _unused = get_openai_settings()
    samples = load_evaluation_inputs('../assets/amnesty_qa.json')[:input_count]

    with RagCheckerEvaluator(llm_cfg) as ragchecker_evaluator:
        results = ragchecker_evaluator.evaluate(
            samples, metrics=[RagCheckerMetric.FAITHFULNESS]
        )

    # Entries are printed only after the evaluator context has been exited.
    for result in results:
        for entry in result.output_entries:
            print(entry, end='\n\n')

In [4]:
test_evaluator(1)

[32m2025-07-26 19:02:33.696[0m | [1mINFO    [0m | [36mragchecker.evaluator[0m:[36mextract_claims[0m:[36m113[0m - [1mExtracting claims for response of 1 RAG results.[0m
100%|██████████| 1/1 [00:08<00:00,  8.46s/it]
[32m2025-07-26 19:02:42.179[0m | [1mINFO    [0m | [36mragchecker.evaluator[0m:[36mcheck_claims[0m:[36m173[0m - [1mChecking retrieved2response for 1 RAG results.[0m
100%|██████████| 15/15 [00:10<00:00,  1.37it/s]

metric_name='faithfulness' score=71.4 reason=None error=None time=19.43028845777735






In [3]:
async def a_test_evaluator(input_count: int):
    """Await the RAGChecker faithfulness evaluation and print every output entry.

    Only the first element of the pair returned by ``get_openai_settings()``
    is used; the second is discarded. Inputs come from
    ``../assets/amnesty_qa.json`` and are sliced with ``[:input_count]``.

    Args:
        input_count: Stop index of the slice applied to the loaded inputs.
    """
    llm_cfg, _unused = get_openai_settings()
    samples = load_evaluation_inputs('../assets/amnesty_qa.json')[:input_count]

    with RagCheckerEvaluator(llm_cfg) as ragchecker_evaluator:
        results = await ragchecker_evaluator.a_evaluate(samples, metrics=[RagCheckerMetric.FAITHFULNESS])

    # Lazily flatten outputs -> entries so printing order matches a nested loop.
    flattened = (entry for result in results for entry in result.output_entries)
    for entry in flattened:
        print(entry, end='\n\n')

In [4]:
await a_test_evaluator(3)

[32m2025-07-26 19:05:45.990[0m | [1mINFO    [0m | [36mragchecker.evaluator[0m:[36mextract_claims[0m:[36m113[0m - [1mExtracting claims for response of 1 RAG results.[0m
[32m2025-07-26 19:05:45.990[0m | [1mINFO    [0m | [36mragchecker.evaluator[0m:[36mextract_claims[0m:[36m113[0m - [1mExtracting claims for response of 1 RAG results.[0m
[32m2025-07-26 19:05:45.990[0m | [1mINFO    [0m | [36mragchecker.evaluator[0m:[36mextract_claims[0m:[36m113[0m - [1mExtracting claims for response of 1 RAG results.[0m
  0%|          | 0/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:06<00:00,  6.02s/it]
[32m2025-07-26 19:05:52.026[0m | [1mINFO    [0m | [36mragchecker.evaluator[0m:[36mcheck_claims[0m:[36m173[0m - [1mChecking retrieved2response for 1 RAG results.[0m
 11%|█         | 1/9 [00:01<00:08,  1.12s/it]
100%|██████████| 1/1 [00:07<00:00,  7.14s/it]
[32m2025-07-26 19:05:53.155[0m | [1mINFO    [0m | [36mragchecker.evaluator[0m:[36mcheck_claims[0m:[

metric_name='faithfulness' score=66.7 reason=None error=None time=13.306292458903044

metric_name='faithfulness' score=40.0 reason=None error=None time=23.13434824999422

metric_name='faithfulness' score=80.0 reason=None error=None time=17.682423416990787




