# RAG Triad of metrics

In [None]:
import warnings
# Ignore every Python warning so warning messages do not clutter the cell output.
warnings.filterwarnings('ignore')

In [None]:
import utils

import os
import openai
# The key is obtained through the project's `utils` helper and installed as the
# module-level key of the `openai` package.
openai.api_key = utils.get_openai_api_key()     # TODO: Add your OpenAI API key

In [None]:
from trulens_eval import Tru

# TruLens handle; used further down to fetch records, the leaderboard and the dashboard.
tru = Tru()
# NOTE(review): presumably wipes results stored by earlier runs so this
# evaluation starts from a clean state — confirm against the TruLens docs.
tru.reset_database()

In [None]:
from llama_index import SimpleDirectoryReader

# Load the source PDF. The result is iterated below and each item exposes a
# `.text` attribute that is merged into a single document.
documents = SimpleDirectoryReader(
    input_files=["./eBook-How-to-Build-a-Career-in-AI.pdf"]
).load_data()

In [None]:
from llama_index import Document

# Merge the text of every loaded document into one Document, separating the
# individual pieces with a blank line.
document = Document(
    text="\n\n".join(doc.text for doc in documents)
)

In [None]:
from utils import build_sentence_window_index

from llama_index.llms import OpenAI

# LLM handed to the index builder; temperature=0.1 requests low-randomness sampling.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

# Build the sentence-window index over the merged document via the project helper.
# NOTE(review): `save_dir` presumably names the folder where the index is
# persisted and reloaded from — confirm in utils.build_sentence_window_index.
sentence_index = build_sentence_window_index(
    document,
    llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    save_dir="sentence_index"
)

In [None]:
from utils import get_sentence_window_query_engine

# Wrap the index in a query engine; no extra options are passed, so the
# helper's own defaults apply.
sentence_window_engine = get_sentence_window_query_engine(
    sentence_index
)

In [None]:
# Smoke-test the engine with a single question before wiring up evaluation.
output = sentence_window_engine.query(
    "How do you create your AI portfolio?")
# Last expression of the cell: the notebook displays the `response` attribute of the result.
output.response

## Feedback functions

A feedback function provides a *score* after reviewing an LLM app's *inputs*, *outputs*, and *intermediate results*.

In [None]:
import nest_asyncio

# Patch asyncio to allow nested event loops.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop while the feedback functions use asyncio — confirm.
nest_asyncio.apply()

In [None]:
# Imported under the alias `fOpenAI` so it does not shadow the `OpenAI` class
# imported from llama_index.llms earlier in this notebook.
from trulens_eval import OpenAI as fOpenAI

provider = fOpenAI()        # LLM used to run feedback

### 1. Answer Relevance

Is the final response useful?

In [None]:
from trulens_eval import Feedback

# Answer Relevance: evaluates the app's final output against the user query.
f_qa_relevance = Feedback(
    provider.relevance_with_cot_reasons,    # Feedback function method (cot = Chain-of-Thought, provides justification)
    name="Answer Relevance"                 # Human readable name for dashboard
).on_input_output()                         # Pointer to user query and app output

### 2. Context Relevance

How good is the retrieval?

In [None]:
from trulens_eval import TruLlama

# Selector for the text of the source nodes retrieved by the engine; it is
# passed to the feedback functions below as the "retrieved contexts" argument.
context_selection = TruLlama.select_source_nodes().node.text

In [None]:
import numpy as np

# Context Relevance (plain variant, without chain-of-thought reasons).
f_qs_relevance = (
    Feedback(
        provider.qs_relevance,              # Feedback function method
        name="Context Relevance")           # Human readable name for dashboard
    .on_input()                             # Pointer to user query
    .on(context_selection)                  # Pointer to retrieved contexts (intermediate results)
    .aggregate(np.mean)                     # Aggregate score across all retrieved contexts
)

In [None]:
import numpy as np

# Context Relevance with chain-of-thought reasons. This rebinds `f_qs_relevance`,
# so this variant (not the plain one defined just before) is the one used later.
f_qs_relevance = (
    Feedback(
        provider.qs_relevance_with_cot_reasons,             # Feedback function method (cot = Chain-of-Thought, provides justification)
        name="Context Relevance")                           # Human readable name for dashboard
    .on_input()                                             # Pointer to user query
    .on(context_selection)                                  # Pointer to retrieved contexts (intermediate results)
    .aggregate(np.mean)                                     # Mean score across all retrieved contexts
)

### 3. Groundedness

In [None]:
from trulens_eval.feedback import Groundedness

# Groundedness evaluator backed by the same feedback LLM provider as the other metrics.
grounded = Groundedness(groundedness_provider=provider)

In [None]:
# Groundedness: evaluates the app's output against the retrieved contexts.
f_groundedness = (
    Feedback(
        grounded.groundedness_measure_with_cot_reasons,     # Feedback function method (cot = Chain-of-Thought, provides justification)
        name="Groundedness"                                 # Human readable name for dashboard
    )
    .on(context_selection)                                  # Pointer to retrieved contexts (intermediate results)
    .on_output()                                            # Pointer to app output
    .aggregate(grounded.grounded_statements_aggregator)     # Aggregate the individual scores with the evaluator's own aggregator
)

## Evaluation of the RAG application

In [None]:
from trulens_eval import TruLlama
from trulens_eval import FeedbackMode  # NOTE(review): not used in the cells shown here

# Wrap the query engine in a recorder that carries the three RAG-triad
# feedback functions; `app_id` is the label this app version is registered under.
tru_recorder = TruLlama(
    sentence_window_engine,
    app_id="App_1",
    feedbacks=[
        f_qa_relevance,
        f_qs_relevance,
        f_groundedness
    ]
)

In [None]:
# Load the evaluation questions, one per line, from eval_questions.txt.
# Surrounding whitespace (including the trailing newline) is stripped, and
# blank lines are skipped so that no empty query is later sent to the engine.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale.
with open('eval_questions.txt', 'r', encoding='utf-8') as file:
    eval_questions = [line.strip() for line in file if line.strip()]

In [None]:
# Display the questions read from the file.
eval_questions

In [None]:
# Add one more question on top of those read from the file.
eval_questions.append("How can I be successful in AI?")

In [None]:
# Display the list again to confirm the appended question is present.
eval_questions

In [None]:
# Run every evaluation question through the engine. Each query is issued inside
# the recorder's context manager so the recorder can capture the call.
# `recording` is bound by the `with` statement but not used in this cell.
for question in eval_questions:
    with tru_recorder as recording:
        sentence_window_engine.query(question)

In [None]:
# Fetch the recorded calls together with the feedback column names.
# NOTE(review): an empty `app_ids` list presumably means "all apps" — confirm.
records, feedback = tru.get_records_and_feedback(app_ids=[])
# `records` is used as a DataFrame in the next cell; show its first rows.
records.head()

In [None]:
import pandas as pd

# Do not truncate long cell contents, so full questions and answers are visible.
pd.set_option("display.max_colwidth", None)
# Show the query, the answer and one column per feedback name in `feedback`.
records[["input", "output"] + feedback]

In [None]:
# Display the leaderboard.
# NOTE(review): an empty `app_ids` list presumably selects all apps — confirm.
tru.get_leaderboard(app_ids=[])

In [None]:
# Launch the TruLens dashboard to browse the recorded results interactively.
tru.run_dashboard()

## Evaluate and Iterate

- Start with LlamaIndex Basic RAG.
- Evaluate with TruLens RAG Triad.
- - Failure modes related to context size.
- Iterate with LlamaIndex Sentence Window RAG.
- Re-evaluate with TruLens RAG Triad.
- - Do we see improvements in Context Relevance?
- - What about other metrics?
- Experiment with different window sizes.
- - What window size results in the best eval metrics?

TruLens RAG Triad:
- Failure modes usually arise because context size is too small. Once you increase context past a certain point, you might see improvements in Context Relevance.
- When Context Relevance goes up, we often find improvements in Groundedness as well, because the LLM in the completion step has enough relevant context to produce the summary.
- When the LLM does not have enough relevant context, it tends to leverage its own internal knowledge from the pre-training dataset to fill the gaps, which causes a loss of Groundedness.

Window size:
- If window size is too small, there might not be enough relevant context to get a good score on Context Relevance and Groundedness.
- If window size is too big, irrelevant context can creep into the response, leading to bad scores in Context Relevance and Groundedness.

Feedback Functions can be implemented in different ways:
- Ground Truth Evals: Pretty meaningful, expensive, hard to scale. (Usual starting point, experts evaluate response)
- Human Evals: Very meaningful, hard to scale. (Non-experts evaluate response -> Lower confidence)
- LLM Evals: Very meaningful, easier to scale. (Research shows humans and LLMs evaluate responses about the same, with ~80% agreement.)
- MLM Evals: Very meaningful, easier to scale.
- Traditional NLP Evals: ROUGE and BLEU scores, very syntactic (overlap of words), less meaningful, easy to scale.