In [27]:
!pip install -q llama-index pypdf
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install -q llama-index-embeddings-huggingface

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spacy 3.7.4 requires typer<0.10.0,>=0.3.0, but you have typer 0.12.3 which is incompatible.
weasel 0.3.4 requires typer<0.10.0,>=0.3.0, but you have typer 0.12.3 which is incompatible.


Initial setup

In [1]:
import nest_asyncio

nest_asyncio.apply()

from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    ServiceContext,
)
from llama_index.core.evaluation import (
    DatasetGenerator,
    FaithfulnessEvaluator,
    RelevancyEvaluator
)
import time


Data load

In [11]:
# Load Data
reader = SimpleDirectoryReader("./LLama3/data")
documents = reader.load_data()

Para seleccionar el tamaño de chunks adecuado, se tomarán medias del tiempo medio de respuesta, fidelidad y reelevancia para diferentes tamaños de chunks. Con la libreria DatassetGenerator podemos generar preguntar para los documentos.

  return cls(
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Evaluadores de fidelidad y reelevancia.

In [20]:
from llama_index.llms.huggingface import HuggingFaceLLM
import torch
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

Settings.embed_model = HuggingFaceEmbedding(
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
)

In [21]:
# LLM que evaluará las respuestas (Llama3-8B)
llm_evaluator = HuggingFaceLLM(
    context_window=2048,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.25, "do_sample": False},
    #query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="meta-llama/Meta-Llama-3-8B",
    model_name="meta-llama/Meta-Llama-3-8B",
    device_map="auto",
    tokenizer_kwargs={"max_length": 2048},
    # uncomment this if using CUDA to reduce memory usage
    model_kwargs={"torch_dtype": torch.float16}
)

Settings.llm = llm_evaluator

# Service context for LLM evaluation
service_context_llama3_8b = ServiceContext.from_defaults(llm=llm_evaluator, embed_model="local")

# Define Faithfulness and Relevancy Evaluators which are based on the LLM
faithfulness_llama3_8b = FaithfulnessEvaluator(service_context=service_context_llama3_8b)
relevancy_llama3_8b = RelevancyEvaluator(service_context=service_context_llama3_8b)

Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 4/4 [00:06<00:00,  1.56s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  service_context_llama3_8b = ServiceContext.from_defaults(llm=llm_evaluator, embed_model="local")


ValueError: The `model_name` argument must be provided.

In [None]:
# To evaluate for each chunk size, we will first generate a set of 20 questions from first 20 documents.
eval_documents = documents[:20]
data_generator = DatasetGenerator.from_documents(eval_documents)
eval_questions = data_generator.generate_questions_from_nodes(num = 20)

Evaluación de cada chunk_size basado en 3 métricas:

1. Tiempo medio de respuesta
2. Fidelidad media
3. Reelevancia media

In [23]:
# Define function to calculate average response time, average faithfulness and average relevancy metrics for given chunk size
# We use GPT-3.5-Turbo to generate response and GPT-4 to evaluate it.
def evaluate_response_time_and_accuracy(chunk_size, eval_questions):
    """
    Evaluate the average response time, faithfulness, and relevancy of responses generated by an LLM for a given chunk size.

    Parameters:
    chunk_size (int): The size of data chunks being processed.

    Returns:
    tuple: A tuple containing the average response time, faithfulness, and relevancy metrics.
    """

    total_response_time = 0
    total_faithfulness = 0
    total_relevancy = 0

    # create vector index (Llama3-8B-Instruct)
    llm = HuggingFaceLLM(
        context_window=2048,
        max_new_tokens=256,
        generate_kwargs={"temperature": 0.25, "do_sample": False},
        #query_wrapper_prompt=query_wrapper_prompt,
        tokenizer_name="meta-llama/Meta-Llama-3-8B-Instruct",
        model_name="meta-llama/Meta-Llama-3-8B-Instruct",
        device_map="auto",
        tokenizer_kwargs={"max_length": 2048},
        # uncomment this if using CUDA to reduce memory usage
        model_kwargs={"torch_dtype": torch.float16}
    )

    service_context = ServiceContext.from_defaults(llm=llm, chunk_size=chunk_size)
    vector_index = VectorStoreIndex.from_documents(
        eval_documents, service_context=service_context
    )
    # build query engine
    # By default, similarity_top_k is set to 2. To experiment with different values, pass it as an argument to as_query_engine()
    query_engine = vector_index.as_query_engine()
    num_questions = len(eval_questions)

    # Iterate over each question in eval_questions to compute metrics.
    # While BatchEvalRunner can be used for faster evaluations (see: https://docs.llamaindex.ai/en/latest/examples/evaluation/batch_eval.html),
    # we're using a loop here to specifically measure response time for different chunk sizes.
    for question in eval_questions:
        start_time = time.time()
        response_vector = query_engine.query(question)
        elapsed_time = time.time() - start_time

        faithfulness_result = faithfulness_gpt4.evaluate_response(
            response=response_vector
        ).passing

        relevancy_result = relevancy_gpt4.evaluate_response(
            query=question, response=response_vector
        ).passing

        total_response_time += elapsed_time
        total_faithfulness += faithfulness_result
        total_relevancy += relevancy_result

    average_response_time = total_response_time / num_questions
    average_faithfulness = total_faithfulness / num_questions
    average_relevancy = total_relevancy / num_questions

    return average_response_time, average_faithfulness, average_relevancy

In [24]:
# Iterate over different chunk sizes to evaluate the metrics to help fix the chunk size.

for chunk_size in [128, 256, 512, 1024, 2048]:
  avg_response_time, avg_faithfulness, avg_relevancy = evaluate_response_time_and_accuracy(chunk_size)
  print(f"Chunk size {chunk_size} - Average Response time: {avg_response_time:.2f}s, Average Faithfulness: {avg_faithfulness:.2f}, Average Relevancy: {avg_relevancy:.2f}")

TypeError: evaluate_response_time_and_accuracy() missing 1 required positional argument: 'eval_questions'