In [1]:
#!pip install trulens_eval llama_index llama_hub html2text llmsherpa tenacity protobuf==3.20

In [2]:
from trulens_eval import Tru

tru = Tru(database_redact_keys = True)

Package protobuf is installed but has a version conflict:
	(protobuf 3.20.0 (c:\users\28263\anaconda3\lib\site-packages), Requirement.parse('protobuf>=4.23.2'))

This package is optional for trulens_eval so this may not be a problem but if
you need to use the related optional features and find there are errors, you
will need to resolve the conflict:

    ```bash
    pip install 'protobuf>=4.23.2'
    ```

If you are running trulens_eval in a notebook, you may need to restart the
kernel after resolving the conflict. If your distribution is in a bad place
beyond this package, you may need to reinstall trulens_eval so that all of the
dependencies get installed and hopefully corrected:
    
    ```bash
    pip uninstall -y trulens_eval
    pip install trulens_eval
    ```

Package watchdog is installed but has a version conflict:
	(watchdog 2.1.6 (c:\users\28263\anaconda3\lib\site-packages), Requirement.parse('watchdog>=3.0.0'))

This package is optional for trulens_eval so this may not be

🦑 Tru initialized with db url sqlite:///default.sqlite .
🔒 Secret keys will not be included in the database.


In [3]:
from llama_index.readers.smart_pdf_loader import SmartPDFLoader

llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
pdf_loader = SmartPDFLoader(llmsherpa_api_url=llmsherpa_api_url)

documents = pdf_loader.load_data("https://www.iii.org/sites/default/files/docs/pdf/Insurance_Handbook_20103.pdf")

In [43]:
from langchain.llms import OpenAI

llm = OpenAI(model="gpt-3.5-turbo", temperature=0.5)


In [44]:
from llama_index.core.schema import Document
from llama_index.core.indices.vector_store.base import VectorStoreIndex
#join all the documents
document = Document(text = "/n/n".join([doc.text for doc in documents]))

from llama_index.core.indices.service_context import ServiceContext 
from llama_index.embeddings.huggingface import HuggingFaceEmbedding 
from langchain.embeddings.openai import OpenAIEmbeddings
import os

from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings

embed_model = OpenAIEmbedding()

Settings.embed_model = OpenAIEmbeddings(
    model='text-embedding-ada-002',
    openai_api_key=os.environ["OPENAI_API_KEY"]
)
Settings.llm=OpenAI(model = 'text-davinci-003', temperature = 0)

# service_context = ServiceContext.from_defaults(
#     llm=llm,
#     embed_model =embed_model,
# )

#from llama_index import VectorStoreIndex 
index = VectorStoreIndex.from_documents([document])


from llama_index.core import PromptTemplate 

system_prompt = PromptTemplate(
    "We have provided some context information bellow that you may use, \n"
    "---------------------------------------\n"
    "{context_str}"
    "---------------------------------------\n"
    "Please answer the question {query_str}\n"
)

rag_basic = index.as_query_engine(text_qa_template=system_prompt)

In [45]:
honest_evals = [
    "What are the typical coverage options for homeowners insurance",
    "What are the requirements for long term care insurance to start",
    "How much in losses does fraud account for in property and casualty insurance",
    "What was the most costly earthquake in US History for insurers"
]

# Set up evaluation

In [46]:
import os 
import numpy as np
from trulens_eval import Feedback


from trulens_eval.feedback.provider import OpenAI as fOpenAI

openai = fOpenAI()


#Answer relevance

#on_input_output() --> what text we wanna evaluate this feedback function on. This text may be located in a
# variety places. Here we use helper funciton
qa_relevance = (
    Feedback(openai.relevance_with_cot_reasons, 
                        name = "Answer Relevance").on_input_output()
)


from trulens_eval import TruLlama

#context relevance
#We wanna look at each peaice of context that we are treating and how relevant it is with users queary
# ,on(Truelama) --> where the context is located in a of llama index app
#.aggregate --> since we are gonna measure relevance of multiple peices of context -- use mean
from trulens_eval import TruLlama
qs_relevance = (
    Feedback(openai.qs_relevance_with_cot_reasons, name = "Context Relevance")
    .on_input()
    .on(TruLlama.select_source_nodes().node.text)
    .aggregate(np.mean)
)

#embedding distance


# embedding distance
from langchain.embeddings.openai import OpenAIEmbeddings
from trulens_eval.feedback import Embeddings



model_name = 'text-embedding-ada-002'
Settings.embed_model = OpenAIEmbedding()

# embed_model = OpenAIEmbeddings(
#     model=model_name,
#     openai_api_key=os.environ["OPENAI_API_KEY"]
# )


#groundedness

from trulens_eval.feedback import Groundedness

grounded = Groundedness(groundedness_provider=openai)

f_groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons, name="Groundedness")
        .on(TruLlama.select_source_nodes().node.text.collect()) #context
        .on_output()#output 
        .aggregate(grounded.grounded_statements_aggregator)#aggregator
)

honest_feedbacks = [qa_relevance, qs_relevance, f_groundedness]

#honest_feedbacks = [qa_relevance, qs_relevance, f_embed_dist, f_groundedness]

# embed = Embeddings(embed_model=embed_model)

# f_embed_dist = (
#     Feedback(embed.cosine_distance)
#     .on_input()
#     .on(TruLlama.select_source_nodes().node.text)
# )





✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input statement will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text.collect() .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .


In [47]:
from trulens_eval import TruLlama
tru_recorder_rag_basic = TruLlama(
        rag_basic,
        app_id='1) Basic RAG - Honest Eval',
        feedbacks=honest_feedbacks
    )



In [48]:
# Run evaluation on sample questions
with tru_recorder_rag_basic as recording:
    for question in honest_evals:
        response = rag_basic.query(question)

Error calling wrapped function wrapped_llm_predict.
Traceback (most recent call last):
  File "c:\Users\28263\anaconda3\lib\site-packages\trulens_eval\instruments.py", line 733, in tru_wrapper
    rets, cost = Endpoint.track_all_costs_tally(
  File "c:\Users\28263\anaconda3\lib\site-packages\trulens_eval\feedback\provider\endpoint\base.py", line 496, in track_all_costs_tally
    result, cbs = Endpoint.track_all_costs(
  File "c:\Users\28263\anaconda3\lib\site-packages\trulens_eval\feedback\provider\endpoint\base.py", line 477, in track_all_costs
    return Endpoint._track_costs(
  File "c:\Users\28263\anaconda3\lib\site-packages\trulens_eval\feedback\provider\endpoint\base.py", line 574, in _track_costs
    result: T = __func(*args, **kwargs)
  File "c:\Users\28263\anaconda3\lib\site-packages\llama_index\core\llms\callbacks.py", line 219, in wrapped_llm_predict
    f_return_val = f(_self, *args, **kwargs)
  File "c:\Users\28263\anaconda3\lib\site-packages\llama_index\llms\langchain\bas

NotFoundError: Error code: 404 - {'error': {'message': 'The model `text-davinci-003` has been deprecated, learn more here: https://platform.openai.com/docs/deprecations', 'type': 'invalid_request_error', 'param': None, 'code': 'model_not_found'}}

In [None]:
# get leaderboard
tru.get_leaderboard(app_ids=["1) Basic RAG - Honest Eval"])

Unnamed: 0_level_0,latency,total_cost
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1
