# Evaluating response / source nodes

## Setup

In [1]:
from llama_index.core.evaluation import (
    RelevancyEvaluator,
    GuidelineEvaluator,
)

from llama_index.core.query_engine import (
    FLAREInstructQueryEngine,
    RetrieverQueryEngine,
    RetryQueryEngine,
    RetryGuidelineQueryEngine,
    KnowledgeGraphQueryEngine,
)

from llama_index.core import (
    TreeIndex,
    VectorStoreIndex,
    SimpleDirectoryReader,
    Response,
)

import pandas as pd

import os

import chromadb
import chromadb.utils.embedding_functions as embedding_functions
from chromadb import Settings
from IPython.display import Markdown, display
from llama_index.core import PromptTemplate, SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from openai import OpenAI, AzureOpenAI
from llama_index.vector_stores.chroma import ChromaVectorStore

import importlib
import util

In [2]:
from dotenv import load_dotenv
# Import the OpenAI SDK client under an alias. A later import cell binds the bare
# name `AzureOpenAI` to llama_index's LLM class, so using the bare name here would
# build the wrong kind of object whenever this cell is re-run after that one.
from openai import AzureOpenAI as AzureOpenAIClient

# Load variables from a local .env file into the process environment.
load_dotenv(override=True)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Read for reference; the client below pins its API version explicitly.
OPENAI_API_VERSION = os.getenv("OPENAI_API_VERSION")

# Raw OpenAI SDK client for the Azure OpenAI resource.
openai_client = AzureOpenAIClient(
    api_key=OPENAI_API_KEY,
    api_version="2024-05-01-preview",  # https://learn.microsoft.com/en-us/azure/ai-services/openai/reference?WT.mc_id=AZ-MVP-5004796
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)

In [3]:
# Chroma embedding function backed by the Azure OpenAI embedding model.
# The endpoint is passed explicitly (api_base) so this client is configured the
# same way as the other Azure clients in this notebook instead of relying on an
# implicit lookup of the endpoint.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=OPENAI_API_KEY,
    api_base=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_type="azure",
    api_version="2024-05-01-preview",
    model_name="text-embedding-ada-002",
)

In [4]:
from dotenv import load_dotenv
from IPython.display import Markdown, display
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.prompts.default_prompt_selectors import \
    DEFAULT_TREE_SUMMARIZE_PROMPT_SEL
from llama_index.core.query_engine import (RouterQueryEngine,
                                           TransformQueryEngine)
from llama_index.core.response_synthesizers import TreeSummarize
from llama_index.core.selectors import LLMMultiSelector
from llama_index.core.tools import QueryEngineTool
from llama_index.llms.openai import OpenAI
# from llama_index.postprocessor.cohere_rerank import CohereRerank

from util.helpers import create_and_save_wiki_md_files, get_wiki_pages
from util.query_engines import VerboseHyDEQueryTransform, WeatherQueryEngine

import chromadb
from chromadb import Settings
from llama_index.llms.openai import OpenAI
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore

from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding

In [5]:
# Open (or create) the on-disk Chroma database.
# The settings class is reached through the chromadb module rather than the bare
# name `Settings`: a later cell imports llama_index.core.Settings under that same
# name, and with the bare name a re-run of this cell would hit the wrong class.
chroma_client = chromadb.PersistentClient(
    path="./data/baseline-rag-pdf-docs/chromadb",
    settings=chromadb.Settings(allow_reset=True),
)
# Reset the database so every run starts from a clean state
# (this is what allow_reset=True above is for).
chroma_client.reset()

# Collection that will hold the embedded chunks, configured for cosine space.
collection = chroma_client.get_or_create_collection(
    name="landsforsoeg",
    metadata={"hnsw:space": "cosine"},
)
vector_store = ChromaVectorStore(chroma_collection=collection)

# Connection settings shared by the chat model and the embedding model, kept in
# one mapping so the two clients cannot drift apart.
azure_client_kwargs = {
    "api_key": os.getenv("OPENAI_API_KEY"),
    # Pinned API version, see
    # https://learn.microsoft.com/en-us/azure/ai-services/openai/reference?WT.mc_id=AZ-MVP-5004796
    "api_version": "2024-05-01-preview",
    "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
}

# Chat completion model (Azure deployment "gpt4").
llm = AzureOpenAI(
    model="gpt-4",
    deployment_name="gpt4",
    **azure_client_kwargs,
)

# You need to deploy your own embedding model as well as your own chat completion model
embed_model = AzureOpenAIEmbedding(
    model="text-embedding-ada-002",
    deployment_name="text-embedding-ada-002",
    **azure_client_kwargs,
)

# Ingestion pipeline that feeds the vector store: documents are first split
# into chunks and the chunks are then embedded.
pipeline = IngestionPipeline(
    vector_store=vector_store,
    transformations=[
        SentenceSplitter(chunk_size=512, chunk_overlap=20),  # step 1: chunking
        embed_model,  # step 2: embedding
    ],
)

In [None]:
# Read the source files from ./data/docs
documents = SimpleDirectoryReader("./data/docs").load_data()

# Send them through the ingestion pipeline defined above
pipeline.run(documents=documents)

print("Indexing complete")

In [13]:
# Connection settings shared by both Azure clients. The API version comes from
# OPENAI_API_VERSION; if that variable is missing or empty, fall back to the
# version pinned elsewhere in this notebook instead of passing None.
# https://learn.microsoft.com/en-us/azure/ai-services/openai/reference?WT.mc_id=AZ-MVP-5004796
azure_client_kwargs = {
    "api_key": os.getenv("OPENAI_API_KEY"),
    "api_version": os.getenv("OPENAI_API_VERSION") or "2024-05-01-preview",
    "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
}

# Chat completion model (Azure deployment "gpt4").
llm = AzureOpenAI(
    model="gpt-4",
    deployment_name="gpt4",
    **azure_client_kwargs,
)

# You need to deploy your own embedding model as well as your own chat completion model
embed_model = AzureOpenAIEmbedding(
    model="text-embedding-ada-002",
    deployment_name="text-embedding-ada-002",
    **azure_client_kwargs,
)

In [6]:
# NOTE: from here on the bare name `Settings` refers to llama_index's global
# settings object; the earlier import cells bound that name to chromadb's class.
from llama_index.core import Settings

documents = SimpleDirectoryReader("./data/docs").load_data()

# Make the Azure models the llama_index-wide defaults.
Settings.llm = llm
Settings.embed_model = embed_model

# Build a vector index straight from the loaded documents.
index = VectorStoreIndex.from_documents(
    documents=documents,
    embed_model=embed_model,
    llm=llm,
    verbose=True,
)

In [7]:
# NOTE: This is ONLY necessary in a Jupyter notebook.
# Details: Jupyter already runs an event loop behind the scenes, so starting
#          another event loop to make async queries results in nested event loops.
#          Nested loops are normally not allowed; nest_asyncio is applied here
#          to permit them for convenience.
import nest_asyncio
nest_asyncio.apply()

In [16]:
## Prompt of RelevancyEvaluator
# Asks for a YES/NO verdict on whether the response to the query is in line
# with the supplied context. Placeholders: {query_str}, {context_str}.
# The sentences are joined with adjacent string literals rather than a
# backslash continuation inside the literal, which would embed the next
# line's indentation in the prompt text.
DEFAULT_EVAL_TEMPLATE = PromptTemplate(
    "Your task is to evaluate if the response for the query "
    "is in line with the context information provided.\n"
    "You have two options to answer. Either YES/ NO.\n"
    "Answer - YES, if the response for the query "
    "is in line with context information otherwise NO.\n"
    "Query and Response: \n {query_str}\n"
    "Context: \n {context_str}\n"
    "Answer: "
)

# Refinement prompt: given an existing YES/NO answer and additional context,
# keep YES if it was already YES, answer YES if the new context contains the
# information, otherwise answer NO.
# Placeholders: {query_str}, {existing_answer}, {context_msg}.
DEFAULT_REFINE_TEMPLATE = PromptTemplate(
    # The trailing space after "is" matters: the adjacent literals are
    # concatenated, and without it the prompt would read "isin line".
    "We want to understand if the following query and response is "
    "in line with the context information: \n {query_str}\n"
    "We have provided an existing YES/NO answer: \n {existing_answer}\n"
    "We have the opportunity to refine the existing answer "
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{context_msg}\n"
    "------------\n"
    "If the existing answer was already YES, still answer YES. "
    "If the information is present in the new context, answer YES. "
    "Otherwise answer NO.\n"
)

In [8]:
# Test question, written in Danish; roughly "How do I control væselhale?"
# (væselhale is presumably a grass weed, going by the retrieved source text — confirm).
query = "hvordan bekæmper jeg væselhale?"

In [9]:
# Relevancy evaluator using its built-in prompts. To try the custom prompts
# instead, also pass eval_template=DEFAULT_EVAL_TEMPLATE and
# refine_template=DEFAULT_REFINE_TEMPLATE.
evaluator = RelevancyEvaluator(llm=llm)

# Retry engine combining the index's own query engine with the evaluator.
query_engine = RetryQueryEngine(
    evaluator=evaluator,
    query_engine=index.as_query_engine(llm=llm),
)

In [10]:
# Run the question through the retry query engine.
response = query_engine.query(str_or_query_bundle=query)

# Show the question and the answer as rendered Markdown.
answer_markdown = Markdown(f"Query: {query}\n\nResponse:\n\n {response}")
display(answer_markdown)

Query: hvordan bekæmper jeg væselhale?

Response:

 For at bekæmpe væselhale, kan du anvende 0,7 l. Mateno Duo 600 SC pr. ha. Dette har vist sig at være mest effektivt, især når det anvendes tidligt, for eksempel den 20. august. Tilsætning af Boxer kan også reducere mængden af væselhale i frøet, men det kan ikke anvendes i praksis på de behandlingstidspunkter, der er blevet anvendt i dette forsøg, på grund af fordampning af prosulfocarb. Andre muligheder inkluderer kombinationer af Mateno Duo og Boxer i forskellige mængder, samt tilføjelse af Atlantis OD i nogle tilfælde. Det er vigtigt at bemærke, at resultaterne kan variere, og det optimale tidspunkt for anvendelse kan variere.

In [11]:
from llama_index.core.evaluation import EvaluationResult

# define jupyter display function
def display_eval_df(
    query: str, response: Response, eval_result: EvaluationResult
) -> None:
    """Display a one-row summary table for an evaluated query response.

    The table holds the query, the response text, a preview of the first
    source node's text, a Pass/Fail verdict and the evaluator's feedback.

    Args:
        query: The question that was asked.
        response: Response object; its string form and ``source_nodes`` are shown.
        eval_result: Evaluation outcome; ``passing`` and ``feedback`` are read.

    Returns:
        None. The styled table is rendered through ``display``.
    """
    max_source_chars = 1000

    # Guard against a response without source nodes: show an empty preview
    # instead of raising IndexError on source_nodes[0].
    source_nodes = response.source_nodes
    if source_nodes:
        source_text = source_nodes[0].node.text
        source_preview = source_text[:max_source_chars]
        # Add the ellipsis only when text was actually cut off.
        if len(source_text) > max_source_chars:
            source_preview += "..."
    else:
        source_preview = ""

    eval_df = pd.DataFrame(
        {
            "Query": query,
            "Response": str(response),
            "Source": source_preview,
            "Evaluation Result": "Pass" if eval_result.passing else "Fail",
            "Reasoning": eval_result.feedback,
        },
        index=[0],
    )
    # Constrain the two long text columns and let them wrap.
    styled_df = eval_df.style.set_properties(
        subset=["Response", "Source"],
        **{
            "inline-size": "600px",
            "overflow-wrap": "break-word",
        },
    )
    display(styled_df)

In [17]:
# Evaluate here so this cell also works on a fresh top-to-bottom run: in notebook
# order it comes before the cell that otherwise first assigns eval_result, so
# reading eval_result directly would raise NameError after a kernel restart.
# NOTE: this costs one additional evaluator call.
eval_result = evaluator.evaluate_response(query=query, response=response)
eval_result.passing

True

In [12]:
# Have the evaluator judge the response produced for the query ...
eval_result = evaluator.evaluate_response(response=response, query=query)

# ... and show the verdict next to the query, response and source preview.
display_eval_df(query=query, response=response, eval_result=eval_result)

Unnamed: 0,Query,Response,Source,Evaluation Result,Reasoning
0,hvordan bekæmper jeg væselhale?,"For at bekæmpe væselhale, kan du anvende 0,7 l. Mateno Duo 600 SC pr. ha. Dette har vist sig at være mest effektivt, især når det anvendes tidligt, for eksempel den 20. august. Tilsætning af Boxer kan også reducere mængden af væselhale i frøet, men det kan ikke anvendes i praksis på de behandlingstidspunkter, der er blevet anvendt i dette forsøg, på grund af fordampning af prosulfocarb. Andre muligheder inkluderer kombinationer af Mateno Duo og Boxer i forskellige mængder, samt tilføjelse af Atlantis OD i nogle tilfælde. Det er vigtigt at bemærke, at resultaterne kan variere, og det optimale tidspunkt for anvendelse kan variere.","Den bed - ste bekæmpelse af væselhale er opnået i led 3, hvor der har været anvendt 0,7 l. Mateno Duo 600 SC pr. ha den 20. august – altså en tidlig sprøjtning. Indholdet af væ - selhale i frøet i led 3 er dog stadig for højt i forhold til certificering, men er nedbragt væsentligt sammenlignet med ubehandlet. Tilsætning af Boxer giver i dette forsøg færre væselhaler i frøet, men kan grundet fordampning af prosulfocarb ikke anvendes i praksis på de behand - lingstidspunkter, der har været anvendt i dette forsøg. Der har været en god bekæmpelse af alm. rapgræs i Slemning efter kraftig nedbør umiddelbart efter såning giver udfordringer med fremspiringen.FOTO: KRISTIAN JURANICH, SEGES INNOVATION...",Pass,YES
