In [None]:
# !pip install llama_index==0.11.4

In [3]:
import os

import openai
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI

# Read the key from the environment instead of hardcoding it in the notebook,
# so the secret never ends up in version control or in shared copies.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this notebook."
    )

# Default LLM used by every LlamaIndex component that does not receive one explicitly.
Settings.llm = OpenAI(model="gpt-4o-mini", temperature=0.2)

In [9]:
from llama_index.core import Document, VectorStoreIndex
from llama_index.core.node_parser import TokenTextSplitter

# Single sample sentence that serves as the whole corpus for this demo.
text = "Hôm nay trời nắng, tôi đi ăn kem, lạnh buốt cả răng!"
doc = Document(text=text)

# Token-based splitter configured with chunk_size=20, chunk_overlap=5 and a space separator.
splitter = TokenTextSplitter(chunk_size=20, chunk_overlap=5, separator=" ")
nodes = splitter.get_nodes_from_documents([doc])

# Index the resulting nodes and expose a query engine over them.
vector_index = VectorStoreIndex(nodes)
query_engine = vector_index.as_query_engine()

Metadata length (2) is close to chunk size (20). Resulting chunks are less than 50 tokens. Consider increasing the chunk size or decreasing the size of your metadata to avoid this.


In [10]:

from llama_index.core.evaluation import (
    BatchEvalRunner,
    CorrectnessEvaluator,
    FaithfulnessEvaluator,
    RelevancyEvaluator
)
import asyncio
import pandas as pd

In [11]:
import nest_asyncio
from tqdm.asyncio import tqdm_asyncio
# Patch asyncio via nest_asyncio; a later cell calls asyncio.run() from inside the notebook.
# NOTE(review): presumably required because the notebook kernel already runs an event loop - confirm.
nest_asyncio.apply()

In [12]:
from llama_index.core.llama_dataset.generator import RagDatasetGenerator

# Generate one evaluation question per node and collect the questions in a DataFrame.
dataset_generator2 = RagDatasetGenerator(
    nodes,
    num_questions_per_chunk=1,
)
eval_questions2 = dataset_generator2.generate_questions_from_nodes()
df = eval_questions2.to_pandas()

In [13]:
# Display the generated evaluation questions (the `query` column is used by the evaluation below).
df

Unnamed: 0,query,reference_contexts,reference_answer,reference_answer_by,query_by
0,Hôm nay thời tiết như thế nào và bạn đã làm gì...,"[Hôm nay trời nắng, tôi đi ăn]",,,ai (gpt-4o-mini)
1,What sensory experience is described in the ph...,"[đi ăn kem, lạnh buốt cả răng!]",,,ai (gpt-4o-mini)


In [14]:
# Evaluators used by the batch evaluation run.

# Correctness: useful for measuring if the response is correct against a reference answer.
correctness_evaluator = CorrectnessEvaluator()

# Faithfulness: useful for measuring if the response is hallucinated.
faithfulness_evaluator = FaithfulnessEvaluator()

# Relevancy: useful for measuring if the query is actually answered by the response.
relevancy_evaluator = RelevancyEvaluator()

# Asynchronous batch evaluation over the generated questions
async def evaluate_async():
    """Run the three evaluators over every generated query and return the batch result.

    Reads the notebook-level ``vector_index``, ``df`` and the three evaluator
    objects. No reference answers are passed to the runner here.
    """
    evaluators = {
        "correctness": correctness_evaluator,
        "faithfulness": faithfulness_evaluator,
        "relevancy": relevancy_evaluator,
    }
    runner = BatchEvalRunner(evaluators, show_progress=True)

    # A fresh query engine is built from the index for this run.
    engine = vector_index.as_query_engine()
    queries = list(df['query'])

    return await runner.aevaluate_queries(query_engine=engine, queries=queries)

In [15]:
# Drive the evaluation coroutine to completion and keep the per-evaluator results.
# NOTE(review): inside a notebook kernel this presumably only works because
# nest_asyncio.apply() was called in an earlier cell - confirm.
result = asyncio.run(evaluate_async())


100%|██████████| 2/2 [00:03<00:00,  1.91s/it]
100%|██████████| 6/6 [00:02<00:00,  2.84it/s]


In [16]:
# Display the raw evaluation results (indexed by evaluator name: correctness, faithfulness, relevancy).
result

{'correctness': [EvaluationResult(query='Hôm nay thời tiết như thế nào và bạn đã làm gì trong ngày hôm đó?', contexts=None, response='Hôm nay trời nắng và bạn đã đi ăn kem.', passing=True, feedback="The generated answer is relevant to the user query as it provides information about the weather and an activity done during the day. However, it lacks detail about the specific weather conditions and the user's activities, which could enhance the response.", score=4.0, pairwise_source=None, invalid_result=False, invalid_reason=None),
  EvaluationResult(query='What sensory experience is described in the phrase "lạnh buốt cả răng" in relation to eating ice cream?', contexts=None, response='The phrase "lạnh buốt cả răng" describes the sensation of extreme cold that affects the teeth when eating ice cream. It conveys a sharp, intense feeling of coldness that can be uncomfortable or even painful.', passing=True, feedback='The generated answer accurately describes the sensory experience related t

In [17]:
# Flatten the evaluation results into one record per query: for each question,
# look up the three evaluator results at the same position, then copy their
# response / passing / feedback / score fields into a flat dict.
data = [
    {
        'Query': question,
        'Correctness response': correctness_result.response,
        'Correctness passing': correctness_result.passing,
        'Correctness feedback': correctness_result.feedback,
        'Correctness score': correctness_result.score,
        'Faithfulness response': faithfulness_result.response,
        'Faithfulness passing': faithfulness_result.passing,
        'Faithfulness feedback': faithfulness_result.feedback,
        'Faithfulness score': faithfulness_result.score,
        'Relevancy response': relevancy_result.response,
        'Relevancy passing': relevancy_result.passing,
        'Relevancy feedback': relevancy_result.feedback,
        'Relevancy score': relevancy_result.score,
    }
    for question, correctness_result, faithfulness_result, relevancy_result in (
        (
            question,
            result['correctness'][i],
            result['faithfulness'][i],
            result['relevancy'][i],
        )
        for i, question in enumerate(df['query'])
    )
]

# One row per query, one column per (evaluator, field) pair.
df3 = pd.DataFrame(data)

In [18]:
# Display the flattened per-query evaluation table.
df3

Unnamed: 0,Query,Correctness response,Correctness passing,Correctness feedback,Correctness score,Faithfulness response,Faithfulness passing,Faithfulness feedback,Faithfulness score,Relevancy response,Relevancy passing,Relevancy feedback,Relevancy score
0,Hôm nay thời tiết như thế nào và bạn đã làm gì...,Hôm nay trời nắng và bạn đã đi ăn kem.,True,The generated answer is relevant to the user q...,4.0,Hôm nay trời nắng và bạn đã đi ăn kem.,True,YES,1.0,Hôm nay trời nắng và bạn đã đi ăn kem.,True,YES,1.0
1,What sensory experience is described in the ph...,"The phrase ""lạnh buốt cả răng"" describes the s...",True,The generated answer accurately describes the ...,5.0,"The phrase ""lạnh buốt cả răng"" describes the s...",False,NO,0.0,"The phrase ""lạnh buốt cả răng"" describes the s...",True,YES,1.0


In [19]:
# Mean score per evaluator across all queries.
correctness_scores, Faithfulness_scores, Relevancy_scores = (
    df3[column].mean()
    for column in ('Correctness score', 'Faithfulness score', 'Relevancy score')
)

print(f"Correctness scores: {correctness_scores}")
print(f"Faithfulness scores: {Faithfulness_scores}")
print(f"Relevancy scores: {Relevancy_scores}")

Correctness scores: 4.5
Faithfulness scores: 0.5
Relevancy scores: 1.0


In [23]:
import openai
from llama_index.core import Settings, Document, VectorStoreIndex
from llama_index.llms.openai import OpenAI
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.evaluation import (
    BatchEvalRunner,
    CorrectnessEvaluator,
    FaithfulnessEvaluator,
    RelevancyEvaluator
)
from llama_index.core.llama_dataset.generator import RagDatasetGenerator
import asyncio
import pandas as pd
import nest_asyncio
from tqdm.asyncio import tqdm_asyncio

# LLM configuration
def setup_openai(api_key: str, model: str = "gpt-4o-mini", temperature: float = 0.2):
    """Register the OpenAI key and install the default LLM on ``Settings``.

    Args:
        api_key: value assigned to ``openai.api_key``.
        model: model name passed to ``OpenAI``.
        temperature: temperature passed to ``OpenAI``.
    """
    openai.api_key = api_key
    llm = OpenAI(model=model, temperature=temperature)
    Settings.llm = llm

# Text -> Document -> nodes
def create_document_and_splitter(text: str, chunk_size: int = 20, chunk_overlap: int = 5, separator: str = " "):
    """Wrap ``text`` in a ``Document`` and split it into nodes.

    Args:
        text: raw text to split.
        chunk_size: ``chunk_size`` forwarded to ``TokenTextSplitter``.
        chunk_overlap: ``chunk_overlap`` forwarded to ``TokenTextSplitter``.
        separator: ``separator`` forwarded to ``TokenTextSplitter``.

    Returns:
        The nodes produced by ``get_nodes_from_documents`` for the single document.
    """
    document = Document(text=text)
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "separator": separator,
    }
    return TokenTextSplitter(**splitter_options).get_nodes_from_documents([document])

# Nodes -> vector index -> query engine
def create_vector_store_index(nodes):
    """Build a ``VectorStoreIndex`` over ``nodes`` and return a query engine for it.

    Only the query engine is returned; the index itself is not exposed to the caller.
    """
    return VectorStoreIndex(nodes).as_query_engine()

# Nodes -> evaluation questions
def generate_questions(nodes, num_questions_per_chunk: int = 1):
    """Generate evaluation questions from ``nodes`` and return them as a DataFrame.

    Args:
        nodes: nodes handed to ``RagDatasetGenerator``.
        num_questions_per_chunk: forwarded to ``RagDatasetGenerator``.

    Returns:
        The result of ``to_pandas()`` on the generated questions.
    """
    generator = RagDatasetGenerator(
        nodes,
        num_questions_per_chunk=num_questions_per_chunk,
    )
    return generator.generate_questions_from_nodes().to_pandas()

# Asynchronous batch evaluation
async def evaluate_async(query_engine, df):
    """Evaluate every query in ``df['query']`` with three evaluators.

    Args:
        query_engine: engine handed to ``BatchEvalRunner.aevaluate_queries``.
        df: DataFrame whose ``query`` column holds the questions to evaluate.

    Returns:
        The value returned by ``aevaluate_queries``. No reference answers are
        passed to the runner here.
    """
    evaluators = {
        # Correctness against a reference answer
        "correctness": CorrectnessEvaluator(),
        # Hallucination of the response
        "faithfulness": FaithfulnessEvaluator(),
        # Whether the response actually answers the query
        "relevancy": RelevancyEvaluator(),
    }
    runner = BatchEvalRunner(evaluators, show_progress=True)

    queries = list(df['query'])
    return await runner.aevaluate_queries(query_engine=query_engine, queries=queries)

# Flatten evaluation results into a table
def aggregate_results(df, eval_result):
    """Build one DataFrame row per query from the per-evaluator results.

    Args:
        df: DataFrame whose ``query`` column lists the evaluated questions.
        eval_result: mapping with ``correctness``, ``faithfulness`` and
            ``relevancy`` entries, each indexable by query position and holding
            objects with ``response``, ``passing``, ``feedback`` and ``score``.

    Returns:
        A DataFrame with a ``Query`` column followed by response / passing /
        feedback / score columns for each of the three evaluators.
    """
    rows = [
        {
            'Query': question,
            'Correctness response': correctness.response,
            'Correctness passing': correctness.passing,
            'Correctness feedback': correctness.feedback,
            'Correctness score': correctness.score,
            'Faithfulness response': faithfulness.response,
            'Faithfulness passing': faithfulness.passing,
            'Faithfulness feedback': faithfulness.feedback,
            'Faithfulness score': faithfulness.score,
            'Relevancy response': relevancy.response,
            'Relevancy passing': relevancy.passing,
            'Relevancy feedback': relevancy.feedback,
            'Relevancy score': relevancy.score,
        }
        # Results are matched to questions purely by position.
        for question, correctness, faithfulness, relevancy in (
            (
                question,
                eval_result['correctness'][i],
                eval_result['faithfulness'][i],
                eval_result['relevancy'][i],
            )
            for i, question in enumerate(df['query'])
        )
    ]
    return pd.DataFrame(rows)

# Report the mean score per evaluator
def print_average_scores(df):
    """Print the mean of the three score columns of ``df``.

    Args:
        df: DataFrame containing ``Correctness score``, ``Faithfulness score``
            and ``Relevancy score`` columns.
    """
    # All three means are computed before anything is printed.
    correctness_scores, faithfulness_scores, relevancy_scores = (
        df[column].mean()
        for column in ('Correctness score', 'Faithfulness score', 'Relevancy score')
    )
    print(f"Correctness scores: {correctness_scores}")
    print(f"Faithfulness scores: {faithfulness_scores}")
    print(f"Relevancy scores: {relevancy_scores}")

# Main function to execute the steps
def main():
    """Run the whole demo: index a sample sentence, generate questions, evaluate, print averages.

    The OpenAI API key is read from the ``OPENAI_API_KEY`` environment variable
    instead of being hardcoded in the notebook. The key is validated before any
    other work so a missing key fails fast with a clear message.

    Raises:
        RuntimeError: if ``OPENAI_API_KEY`` is unset or empty.
    """
    import os  # local import keeps this cell self-contained

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the OPENAI_API_KEY environment variable before calling main()."
        )

    # Apply nested asyncio (asyncio.run() is called further down)
    nest_asyncio.apply()

    # Setup OpenAI
    setup_openai(api_key=api_key)

    # Create document and split into nodes
    text = "Hôm nay trời nắng, tôi đi ăn kem, lạnh buốt cả răng!"
    nodes = create_document_and_splitter(text)

    # Create vector store index and query engine
    query_engine = create_vector_store_index(nodes)

    # Generate evaluation questions
    df = generate_questions(nodes)

    # Evaluate and aggregate results
    eval_result = asyncio.run(evaluate_async(query_engine, df))
    df_result = aggregate_results(df, eval_result)

    # Print average scores
    print_average_scores(df_result)




In [24]:
# Run the full pipeline; it prints the average correctness, faithfulness and relevancy scores.
main()

Metadata length (2) is close to chunk size (20). Resulting chunks are less than 50 tokens. Consider increasing the chunk size or decreasing the size of your metadata to avoid this.


100%|██████████| 2/2 [00:02<00:00,  1.13s/it]
100%|██████████| 6/6 [00:03<00:00,  1.88it/s]

Correctness scores: 4.5
Faithfulness scores: 0.5
Relevancy scores: 1.0



