In [1]:
# Taken from the documentation; I have not run this line myself.
# %pip install llama-index-llms-openai llama-index-embeddings-openai

# These are the packages I actually have installed:
# pip install llama-index
# pip install llama-index-embeddings-huggingface
# pip install llama-index-llms-ollama
# pip install matplotlib


In [2]:
# Attach to the same event loop: the notebook kernel already runs one, and a
# later cell awaits the batch evaluation at top level, so nest_asyncio is
# applied up front to permit re-entrant use of that loop.
import nest_asyncio

nest_asyncio.apply()

In [3]:
import os
import openai

# Use the placeholder only when no key is already present in the environment.
# An unconditional assignment would overwrite a real OPENAI_API_KEY exported
# from the shell with "TU KEY" and would invite pasting the secret into the
# notebook. Prefer exporting the key outside the notebook.
os.environ.setdefault("OPENAI_API_KEY", "TU KEY")
# openai.api_key = os.environ["OPENAI_API_KEY"]

In [4]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Response
from llama_index.llms.openai import OpenAI
from llama_index.core.evaluation import (
    FaithfulnessEvaluator,
    RelevancyEvaluator,
    CorrectnessEvaluator,
    GuidelineEvaluator,
    SemanticSimilarityEvaluator
)
from llama_index.core.node_parser import SentenceSplitter
import pandas as pd

# Widen text columns when DataFrames are displayed. NOTE(review): 0 is
# presumably meant as "do not truncate"; pandas documents None for unlimited
# width — TODO confirm 0 behaves the same on the installed pandas version.
pd.set_option("display.max_colwidth", 0)

In [5]:
# Judge LLM: gpt-3.5-turbo at temperature 0, shared by the three LLM-based
# evaluators below.
gpt3_5 = OpenAI(temperature=0, model="gpt-3.5-turbo")

faithfulness_gpt3_5 = FaithfulnessEvaluator(llm=gpt3_5)
relevancy_gpt3_5 = RelevancyEvaluator(llm=gpt3_5)
correctness_gpt3_5 = CorrectnessEvaluator(llm=gpt3_5)
# No embed model is passed, so the evaluator takes the one from Settings.
# NOTE(review): this line runs BEFORE the cell that assigns
# Settings.embed_model. If the fallback is resolved at construction time, the
# HuggingFace model configured there is not the one used here — TODO confirm,
# or construct this evaluator after Settings.embed_model has been set.
semantic = SemanticSimilarityEvaluator()

In [6]:
# Create the embedding model to be tested and register it in the global
# Settings object. Presumably this is what VectorStoreIndex.from_documents
# uses later, since no embed model is passed there explicitly — TODO confirm.

from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

Settings.embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1"
)



In [7]:
# Sanity check: show the name of the embedding model currently set in Settings.
print(Settings.embed_model.model_name)

sentence-transformers/multi-qa-mpnet-base-dot-v1


In [8]:
# Create here the model you want to evaluate; it is the LLM handed to the
# query engine in the evaluation loop.
# NOTE(review): request_timeout is presumably expressed in seconds — confirm.

from llama_index.llms.ollama import Ollama

llm = Ollama(model="llama3_1_sauerkraut", request_timeout=360.0)

In [9]:
def combinar_diccionarios(dict1, dict2):
    """Merge two dictionaries of sequences key by key.

    Only keys present in BOTH dictionaries are kept. For each such key the
    value of ``dict1`` is concatenated (``+``) with the value of ``dict2``,
    producing a new object. Keys that appear in only one of the inputs are
    dropped. Key order follows ``dict1``; neither input is modified.

    Args:
        dict1: Mapping whose values support ``+`` (e.g. lists).
        dict2: Mapping with values of a compatible type.

    Returns:
        A new dict with the concatenated values of the shared keys.
    """
    return {
        clave: dict1[clave] + dict2[clave]
        for clave in dict1
        if clave in dict2
    }

In [10]:
# Adapted from the documentation example so that it averages result.score
# instead of result.passing; for some evaluation types this gives a more
# precise figure.
def get_eval_results(key, eval_results):
    """Print and return the mean score of the results stored under ``key``.

    Results whose ``score`` is ``None`` are ignored. When no result carries a
    score, a "No valid scores" line is printed and ``None`` is returned.

    Args:
        key: Name of the metric to look up in ``eval_results``.
        eval_results: Mapping from metric name to an iterable of objects
            exposing a ``score`` attribute.

    Returns:
        The arithmetic mean of the non-None scores, or ``None`` if there are
        none.
    """
    acumulado = 0
    validos = 0
    # Plain left-to-right accumulation is kept on purpose (no sum()), so the
    # floating-point result is bit-for-bit the same on every Python version.
    for item in eval_results[key]:
        if item.score is None:
            continue
        acumulado += item.score
        validos += 1

    if not validos:
        print(f"{key} Score: No valid scores")
        return None

    score = acumulado / validos
    print(f"{key} Score: {score}")
    return score

In [11]:
from llama_index.core.llama_dataset import download_llama_dataset, LabelledRagDataset
from llama_index.core.evaluation import BatchEvalRunner

datasets = ["paul_graham", "Blockchain", "Alexnet", "Covid", "Llama2Paper"]
total_results = {"correctness": [], "faithfulness": [], "relevancy": [], "semantic": []}

for dataset_name in datasets:
    path_dataset = f"./eval_data/rag_data/{datasets[0]}/rag_dataset.json"
    path_documents = f"./eval_data/rag_data/{datasets[0]}/source_files"
    rag_dataset = LabelledRagDataset.from_json(path_dataset)
    documents = SimpleDirectoryReader(input_dir=path_documents).load_data()

    rag_dataset_pandas = rag_dataset.to_pandas()
    queries = rag_dataset_pandas["query"]
    reference_answers = rag_dataset_pandas["reference_answer"]
    
    queries = queries[:20]
    reference_answers = reference_answers[:20].to_list()

    splitter = SentenceSplitter(chunk_size=512)
    vector_index = VectorStoreIndex.from_documents(
        documents, transformations=[splitter]
    )

    # ponemos todos los evaluadores que queremos usar aquí.
    runner = BatchEvalRunner(
        {"correctness": correctness_gpt3_5,
         "faithfulness": faithfulness_gpt3_5,
         "relevancy": relevancy_gpt3_5,
         "semantic": semantic},
        workers=8,
    )
    
    # he corregido una cosa de aquí del ejemplo, que no estaba (llm=llm), supongo que por error.
    eval_results = await runner.aevaluate_queries(
        vector_index.as_query_engine(llm=llm),
        queries=queries,
        reference=reference_answers,
    )
    
    total_results = combinar_diccionarios(total_results, eval_results)
    print(f"length of correctcness results: {len(total_results["correctness"])}")

    print(f"results of dataset {dataset_name} of embeddings model: {Settings.embed_model.model_name}")
    score_faithfulness = get_eval_results("faithfulness", eval_results)
    score_relevancy = get_eval_results("relevancy", eval_results)
    score_correctness = get_eval_results("correctness", eval_results)
    score_semantic = get_eval_results("semantic", eval_results)

print(f"total results of all datasets of embeddings model: {Settings.embed_model.model_name}")
score_faithfulness = get_eval_results("faithfulness", total_results)
score_relevancy = get_eval_results("relevancy", total_results)
score_correctness = get_eval_results("correctness", total_results)
score_semantic = get_eval_results("semantic", total_results)

length of correctcness results: 20
results of datasets paul_graham of embeddings model: sentence-transformers/multi-qa-mpnet-base-dot-v1
faithfulness Score: 0.95
relevancy Score: 0.9
correctness Score: 4.0
semantic Score: 0.9483131462711073
length of correctcness results: 40
results of datasets Blockchain of embeddings model: sentence-transformers/multi-qa-mpnet-base-dot-v1
faithfulness Score: 0.9
relevancy Score: 0.75
correctness Score: 4.0
semantic Score: 0.9429335376457614
length of correctcness results: 60
results of datasets Alexnet of embeddings model: sentence-transformers/multi-qa-mpnet-base-dot-v1
faithfulness Score: 0.9
relevancy Score: 0.9
correctness Score: 3.9
semantic Score: 0.9441007259660952
length of correctcness results: 80
results of datasets Covid of embeddings model: sentence-transformers/multi-qa-mpnet-base-dot-v1
faithfulness Score: 0.9
relevancy Score: 0.8
correctness Score: 4.029411764705882
semantic Score: 0.945689379815042
length of correctcness results: 100


In [12]:
# Inspect `eval_results` as left by the last iteration of the evaluation loop:
# first the metric names it contains...
print(eval_results.keys())

# ...then the field names of a single correctness result...
print(eval_results["correctness"][0].dict().keys())

# ...and finally a few individual fields of that same result.
print(eval_results["correctness"][0].passing)
print(eval_results["correctness"][0].response)
print(eval_results["correctness"][0].contexts)

dict_keys(['correctness', 'faithfulness', 'relevancy', 'semantic'])
dict_keys(['query', 'contexts', 'response', 'passing', 'feedback', 'score', 'pairwise_source', 'invalid_result', 'invalid_reason'])
True
The author's first experience with programming was on an IBM 1401 in the basement of his junior high school. He used an early version of Fortran as the programming language. The biggest challenge he faced was not having any data to input into the program, which severely limited what he could do with the computer.
None


In [13]:
# Print the first correctness result in full (a one-element slice, so it is
# shown as a list).
print(eval_results["correctness"][:1])

[EvaluationResult(query='In the essay, the author mentions his early experiences with programming. Describe the first computer he used for programming, the language he used, and the challenges he faced.', contexts=None, response="The author's first experience with programming was on an IBM 1401 in the basement of his junior high school. He used an early version of Fortran as the programming language. The biggest challenge he faced was not having any data to input into the program, which severely limited what he could do with the computer.", passing=True, feedback="The generated answer provides relevant information about the first computer the author used for programming (IBM 1401), the programming language used (Fortran), and the challenges faced (lack of data input). The details are accurate and align closely with the reference answer, but it lacks some specific details such as the author's age when he started using the computer.", score=4.5, pairwise_source=None, invalid_result=False