In [1]:
# Taken from the documentation; I have not run this command myself.
# %pip install llama-index-llms-openai llama-index-embeddings-openai

# These are the packages I actually have installed:
# pip install llama-index
# pip install llama-index-embeddings-huggingface
# pip install llama-index-llms-ollama
# pip install matplotlib


In [1]:
# Attach to the same event loop: the notebook kernel already runs an asyncio
# loop, and nest_asyncio patches asyncio so that loop can be re-entered.
import nest_asyncio

nest_asyncio.apply()

In [2]:
import os
import openai

# Use the key already exported in the environment when there is one; the
# "TU KEY" placeholder is only a fallback and must be replaced with a real key
# (or, better, OPENAI_API_KEY exported before starting the kernel). The previous
# unconditional assignment overwrote a valid key with the placeholder.
os.environ.setdefault("OPENAI_API_KEY", "TU KEY")
# openai.api_key = os.environ["OPENAI_API_KEY"]

In [3]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Response
from llama_index.llms.openai import OpenAI
from llama_index.core.evaluation import (
    FaithfulnessEvaluator,
    RelevancyEvaluator,
    CorrectnessEvaluator,
    GuidelineEvaluator,
    SemanticSimilarityEvaluator
)
from llama_index.core.node_parser import SentenceSplitter
import pandas as pd

# Set pandas' column-width display option to 0 — presumably so long query /
# response strings are not truncated when frames are shown (TODO confirm).
pd.set_option("display.max_colwidth", 0)

In [4]:
# Judge model: gpt-3.5-turbo at temperature 0, shared by the three LLM-based
# evaluators created below.
gpt3_5 = OpenAI(temperature=0, model="gpt-3.5-turbo")

faithfulness_gpt3_5 = FaithfulnessEvaluator(llm=gpt3_5)
relevancy_gpt3_5 = RelevancyEvaluator(llm=gpt3_5)
correctness_gpt3_5 = CorrectnessEvaluator(llm=gpt3_5)
semantic = SemanticSimilarityEvaluator()  # uses the embedding model from Settings when none is passed.

In [5]:
# Create the model you want to evaluate here.

from llama_index.llms.ollama import Ollama

# Model under evaluation: llama3.1 served through Ollama, with a 360 s request
# timeout. It is handed to the query engine in the evaluation loop.
llm = Ollama(model="llama3.1", request_timeout=360.0)

In [6]:
def combinar_diccionarios(dict1, dict2):
    """Combine two dictionaries of sequences key by key.

    Only keys present in *both* dictionaries are kept; for each of them the
    value is ``dict1[key] + dict2[key]`` (for lists: the concatenation, with
    the items of ``dict1`` first). Keys that appear in only one input are
    dropped. Key order follows ``dict1``.

    Args:
        dict1: First mapping; its values must support ``+`` with the
            corresponding values of ``dict2``.
        dict2: Second mapping.

    Returns:
        A new dict with the combined values for the shared keys.
    """
    return {
        clave: dict1[clave] + dict2[clave]
        for clave in dict1
        if clave in dict2
    }

In [7]:
# Adapted from the upstream example so that it averages result.score instead of
# result.passing; for some evaluation types this gives a more precise figure.
def get_eval_results(key, eval_results):
    """Print and return the mean score of one evaluator's results.

    Args:
        key: Name of the evaluator whose results are summarised; used to index
            ``eval_results`` and as the label in the printed line.
        eval_results: Mapping from evaluator name to an iterable of result
            objects exposing a ``score`` attribute.

    Returns:
        The arithmetic mean of all scores that are not ``None``, or ``None``
        when there is no such score (a notice is printed in that case).
    """
    all_scores = [item.score for item in eval_results[key]]
    valid_scores = [value for value in all_scores if value is not None]

    # Guard clause: nothing to average.
    if not valid_scores:
        print(f"{key} Score: No valid scores")
        return None

    # Accumulate left to right with plain additions so the floating-point
    # result matches a simple running total.
    running_total = 0
    for value in valid_scores:
        running_total += value

    mean_score = running_total / len(valid_scores)
    print(f"{key} Score: {mean_score}")
    return mean_score

In [8]:
from llama_index.core.llama_dataset import download_llama_dataset, LabelledRagDataset
from llama_index.core.evaluation import BatchEvalRunner

datasets = ["paul_graham", "Blockchain", "Alexnet", "Covid", "Llama2Paper"]
total_results = {"correctness": [], "faithfulness": [], "relevancy": [], "semantic": []}

for dataset_name in datasets:
    path_dataset = f"./eval_data/rag_data/{datasets[0]}/rag_dataset.json"
    path_documents = f"./eval_data/rag_data/{datasets[0]}/source_files"
    rag_dataset = LabelledRagDataset.from_json(path_dataset)
    documents = SimpleDirectoryReader(input_dir=path_documents).load_data()

    rag_dataset_pandas = rag_dataset.to_pandas()
    queries = rag_dataset_pandas["query"]
    reference_answers = rag_dataset_pandas["reference_answer"]
    
    queries = queries[:20]
    reference_answers = reference_answers[:20].to_list()

    splitter = SentenceSplitter(chunk_size=512)
    vector_index = VectorStoreIndex.from_documents(
        documents, transformations=[splitter]
    )

    # ponemos todos los evaluadores que queremos usar aquí.
    runner = BatchEvalRunner(
        {"correctness": correctness_gpt3_5,
         "faithfulness": faithfulness_gpt3_5,
         "relevancy": relevancy_gpt3_5,
         "semantic": semantic},
        workers=8,
    )
    
    # he corregido una cosa de aquí del ejemplo, que no estaba (llm=llm), supongo que por error.
    eval_results = await runner.aevaluate_queries(
        vector_index.as_query_engine(llm=llm),
        queries=queries,
        reference=reference_answers,
    )
    
    total_results = combinar_diccionarios(total_results, eval_results)
    print(f"length of correctcness results: {len(total_results["correctness"])}")

    print(f"results of datasets {dataset_name} of model: {llm.model}")
    score_faithfulness = get_eval_results("faithfulness", eval_results)
    score_relevancy = get_eval_results("relevancy", eval_results)
    score_correctness = get_eval_results("correctness", eval_results)
    score_semantic = get_eval_results("semantic", eval_results)

print(f"total results of all datasets of model: {llm.model}")
score_faithfulness = get_eval_results("faithfulness", total_results)
score_relevancy = get_eval_results("relevancy", total_results)
score_correctness = get_eval_results("correctness", total_results)
score_semantic = get_eval_results("semantic", total_results)

length of correctcness results: 20
results of datasets paul_graham of model: llama3.1
faithfulness Score: 0.95
relevancy Score: 0.85
correctness Score: 4.027777777777778
semantic Score: 0.9462717242268106
length of correctcness results: 40
results of datasets Blockchain of model: llama3.1
faithfulness Score: 0.95
relevancy Score: 0.9
correctness Score: 4.029411764705882
semantic Score: 0.9500067004670466
length of correctcness results: 60
results of datasets Alexnet of model: llama3.1
faithfulness Score: 0.95
relevancy Score: 0.85
correctness Score: 4.03125
semantic Score: 0.9461367867337019
length of correctcness results: 80
results of datasets Covid of model: llama3.1
faithfulness Score: 0.95
relevancy Score: 0.8
correctness Score: 4.027777777777778
semantic Score: 0.9470647153637957
length of correctcness results: 100
results of datasets Llama2Paper of model: llama3.1
faithfulness Score: 1.0
relevancy Score: 0.9
correctness Score: 4.133333333333334
semantic Score: 0.9468532805779579

In [149]:
# Inspect the structure of the most recent evaluation run: the evaluator names,
# then the fields and a few values of the first correctness result.
first_correctness = eval_results["correctness"][0]

print(eval_results.keys())

print(first_correctness.dict().keys())

for field_value in (
    first_correctness.passing,
    first_correctness.response,
    first_correctness.contexts,
):
    print(field_value)

dict_keys(['correctness', 'faithfulness', 'relevancy', 'semantic'])
dict_keys(['query', 'contexts', 'response', 'passing', 'feedback', 'score', 'pairwise_source', 'invalid_result', 'invalid_reason'])
True
The article details several potential early signs of Covid-19 infection. These include sore throat, fever, chills, muscle aches, gastrointestinal disturbances such as diarrhea and nausea, and changes in sense of smell or taste. Some individuals may experience painful red and purple lesions on their fingers and toes, which are referred to as "Covid toe." Symptoms can vary widely among different people; some might not show many symptoms at all.

More serious cases have been associated with inflammation and organ damage even without difficulty breathing. There has also been a noted incidence of dangerous blood clots, strokes, and brain impairments in severe Covid-19 infections.

None


In [131]:
# Show the first correctness result of the most recent run as a one-element list.
print(eval_results["correctness"][:1])

[EvaluationResult(query='What are some traditional application domains where deep learning has been successfully applied?', contexts=None, response="Deep learning applications have transcended conventional boundaries in various fields. Here's a concise list of some significant areas:\n\n1. **Computer Vision**:\n   - Object localization and detection.\n   - Image or video captioning.\n\n2. **Multimedia Entertainment**: \n   - Media processing, including generating content like images from text (text-to-image synthesis).\n\n3. **Autonomous Vehicles**: \n   - Vehicle segmentation in autonomous cars to enhance safety features such as collision avoidance systems.\n\n4. **Language Processing and Natural Language Understanding**:\n   - Machine translation.\n   - Speech recognition tasks involving voice-based commands or interaction with AI assistants like Siri, Alexa etc.\n\n5. **Healthcare and Medicine**: \n   - Brian cancer detection using medical imaging data analysis techniques powered by