In [1]:
# Taken from the documentation; I have not run this myself.
# %pip install llama-index-llms-openai llama-index-embeddings-openai

# These are the packages I have installed:
# pip install llama-index
# pip install llama-index-embeddings-huggingface
# pip install llama-index-llms-ollama
# pip install matplotlib


In [2]:
# Attach to the same event loop: nest_asyncio.apply() patches asyncio so an
# already-running loop can be re-entered. Presumably needed because the
# notebook kernel runs its own loop while the evaluation code further down
# issues async calls — TODO confirm it is still required in this environment.
import nest_asyncio

nest_asyncio.apply()

In [3]:
import os
import openai

# Do not hard-code the key and do not clobber a valid one with a placeholder:
# reuse OPENAI_API_KEY when it is already set in the environment, otherwise
# ask for it interactively so the secret never ends up in the notebook source.
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
# openai.api_key = os.environ["OPENAI_API_KEY"]

In [4]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Response
from llama_index.llms.openai import OpenAI
from llama_index.core.evaluation import (
    FaithfulnessEvaluator,
    RelevancyEvaluator,
    CorrectnessEvaluator,
    GuidelineEvaluator,
    SemanticSimilarityEvaluator
)
from llama_index.core.node_parser import SentenceSplitter
import pandas as pd

# NOTE(review): sets pandas' "display.max_colwidth" option to 0, presumably so
# long query/response text is not truncated when frames are displayed —
# confirm; pandas documents None as the "unlimited" value for this option.
pd.set_option("display.max_colwidth", 0)

In [5]:
# Judge model: gpt-3.5-turbo with temperature 0.
gpt3_5 = OpenAI(model="gpt-3.5-turbo", temperature=0)

# One LLM-backed evaluator per metric, all sharing the same judge model.
faithfulness_gpt3_5, relevancy_gpt3_5, correctness_gpt3_5 = (
    evaluator_cls(llm=gpt3_5)
    for evaluator_cls in (
        FaithfulnessEvaluator,
        RelevancyEvaluator,
        CorrectnessEvaluator,
    )
)

# Uses the embedding model from Settings when none is passed explicitly.
semantic = SemanticSimilarityEvaluator()

In [6]:
# Create the model you want to evaluate here.

from llama_index.llms.ollama import Ollama

llm = Ollama(
    request_timeout=360.0,
    model="phi3_5_mini",
)

In [7]:
def combinar_diccionarios(dict1, dict2):
    """Merge two dicts of sequences, keeping only the keys present in both.

    For every key of ``dict1`` that also exists in ``dict2`` the result holds
    ``dict1[key] + dict2[key]`` (for lists: the concatenation). Keys found in
    only one of the inputs are left out. Neither input is modified and the
    result follows the key order of ``dict1``.
    """
    return {
        clave: dict1[clave] + dict2[clave]
        for clave in dict1
        if clave in dict2
    }

In [8]:
# Modified from the example to use result.score instead of result.passing; for some evaluation types this gives a more precise result.
def get_eval_results(key, eval_results):
    """Print and return the mean score of the results stored under ``key``.

    Args:
        key: Name of the evaluator whose results are averaged.
        eval_results: Mapping from evaluator name to an iterable of result
            objects exposing a ``score`` attribute.

    Returns:
        The arithmetic mean of all scores that are not ``None``, or ``None``
        when no result carries a score (a notice is printed in that case).
    """
    total = 0
    valid = 0
    for item in eval_results[key]:
        # Results without a score are ignored rather than counted as zero.
        if item.score is None:
            continue
        total += item.score
        valid += 1

    if not valid:
        print(f"{key} Score: No valid scores")
        return None

    mean_score = total / valid
    print(f"{key} Score: {mean_score}")
    return mean_score

In [9]:
from llama_index.core.llama_dataset import download_llama_dataset, LabelledRagDataset
from llama_index.core.evaluation import BatchEvalRunner

datasets = ["paul_graham", "Blockchain", "Alexnet", "Covid", "Llama2Paper"]
total_results = {"correctness": [], "faithfulness": [], "relevancy": [], "semantic": []}

for dataset_name in datasets:
    path_dataset = f"./eval_data/rag_data/{datasets[0]}/rag_dataset.json"
    path_documents = f"./eval_data/rag_data/{datasets[0]}/source_files"
    rag_dataset = LabelledRagDataset.from_json(path_dataset)
    documents = SimpleDirectoryReader(input_dir=path_documents).load_data()

    rag_dataset_pandas = rag_dataset.to_pandas()
    queries = rag_dataset_pandas["query"]
    reference_answers = rag_dataset_pandas["reference_answer"]
    
    queries = queries[:20]
    reference_answers = reference_answers[:20].to_list()

    splitter = SentenceSplitter(chunk_size=512)
    vector_index = VectorStoreIndex.from_documents(
        documents, transformations=[splitter]
    )

    # ponemos todos los evaluadores que queremos usar aquí.
    runner = BatchEvalRunner(
        {"correctness": correctness_gpt3_5,
         "faithfulness": faithfulness_gpt3_5,
         "relevancy": relevancy_gpt3_5,
         "semantic": semantic},
        workers=8,
    )
    
    # he corregido una cosa de aquí del ejemplo, que no estaba (llm=llm), supongo que por error.
    eval_results = await runner.aevaluate_queries(
        vector_index.as_query_engine(llm=llm),
        queries=queries,
        reference=reference_answers,
    )
    
    total_results = combinar_diccionarios(total_results, eval_results)
    print(f"length of correctcness results: {len(total_results["correctness"])}")

    print(f"results of datasets {dataset_name} of model: {llm.model}")
    score_faithfulness = get_eval_results("faithfulness", eval_results)
    score_relevancy = get_eval_results("relevancy", eval_results)
    score_correctness = get_eval_results("correctness", eval_results)
    score_semantic = get_eval_results("semantic", eval_results)

print(f"total results of all datasets of model: {llm.model}")
score_faithfulness = get_eval_results("faithfulness", total_results)
score_relevancy = get_eval_results("relevancy", total_results)
score_correctness = get_eval_results("correctness", total_results)
score_semantic = get_eval_results("semantic", total_results)

length of correctcness results: 20
results of datasets paul_graham of model: phi3_5_mini
faithfulness Score: 0.85
relevancy Score: 0.7
correctness Score: 3.0
semantic Score: 0.8805704617135282
length of correctcness results: 40
results of datasets Blockchain of model: phi3_5_mini
faithfulness Score: 0.65
relevancy Score: 0.55
correctness Score: 2.5277777777777777
semantic Score: 0.8665848776203695
length of correctcness results: 60
results of datasets Alexnet of model: phi3_5_mini
faithfulness Score: 0.95
relevancy Score: 0.7
correctness Score: 2.85
semantic Score: 0.8954530883128797
length of correctcness results: 80
results of datasets Covid of model: phi3_5_mini
faithfulness Score: 0.65
relevancy Score: 0.65
correctness Score: 2.5526315789473686
semantic Score: 0.8695089954398046
length of correctcness results: 100
results of datasets Llama2Paper of model: phi3_5_mini
faithfulness Score: 0.75
relevancy Score: 0.65
correctness Score: 2.264705882352941
semantic Score: 0.87152304351097

In [13]:
# Inspect the evaluation output of the last dataset processed: first the
# evaluator names, then the fields and a few values of the first
# correctness result.
first_correctness = eval_results["correctness"][0]

print(eval_results.keys())

print(first_correctness.dict().keys())

for field_name in ("passing", "response", "contexts"):
    print(getattr(first_correctness, field_name))

dict_keys(['correctness', 'faithfulness', 'relevancy', 'semantic'])
dict_keys(['query', 'contexts', 'response', 'passing', 'feedback', 'score', 'pairwise_source', 'invalid_result', 'invalid_reason'])
False

In those days I was quite young (13 or 14) so it's not surprising that my main focus in life is on writing what we call "short stories" today! But back then short story were good enough as far as you know, right? It seemed like a better use of time to write them than learn programming. I didn't understand the appeal either because everything was so hard and slow compared with computers nowadays...
In those days there weren't microprocessors or high-speed internet for anything other that writing essays! That said, you should give me a good idea of how to write an effective short story in this context. I know it would be pretty easy if everything else were available today (easier prose editors and the like), but can't imagine life without them either...
Anyway what was your experience

In [14]:
# Show the first correctness result, kept as a one-element list.
first_correctness_results = eval_results["correctness"][:1]
print(first_correctness_results)

[EvaluationResult(query='In the essay, the author mentions his early experiences with programming. Describe the first computer he used for programming, the language he used, and the challenges he faced.', contexts=None, response='\nIn those days I was quite young (13 or 14) so it\'s not surprising that my main focus in life is on writing what we call "short stories" today! But back then short story were good enough as far as you know, right? It seemed like a better use of time to write them than learn programming. I didn\'t understand the appeal either because everything was so hard and slow compared with computers nowadays...\nIn those days there weren\'t microprocessors or high-speed internet for anything other that writing essays! That said, you should give me a good idea of how to write an effective short story in this context. I know it would be pretty easy if everything else were available today (easier prose editors and the like), but can\'t imagine life without them either...\n