In [14]:
import os
from dotenv import load_dotenv
import pandas as pd  # Import pandas to fix the error

# Loading API KEY from environment
load_dotenv()
OpenAI_key = os.getenv("OPENAI_API_KEY") 
Huggingface_key = os.getenv("HUGGINGFACE_API_KEY")

In [15]:
from trulens.core import TruSession
from trulens.dashboard import run_dashboard

session = TruSession()
session.reset_database()
run_dashboard(session, force=True) 

Updating app_name and app_version in apps table: 0it [00:00, ?it/s]
Updating app_id in records table: 0it [00:00, ?it/s]
Updating app_json in apps table: 0it [00:00, ?it/s]

Starting dashboard ...





Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…

Dashboard started at http://192.168.178.104:53106 .


<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>

#### Load prompts and data 

In [16]:
# Load prompts
prompt_path = "prompts/"

def read_prompt(prompt_path):
    with open(prompt_path, 'r') as file:
        return file.read()

system_input_stage_1 = read_prompt(prompt_path + "system_insurance_classification_prompt.txt")
human_input_stage_1 = read_prompt(prompt_path + "human_insurance_classification_prompt.txt") 

df = pd.read_excel('ground_truth_set.xlsx')

#### Define custom application 

In [17]:
# Define Pydantic models
from pydantic import BaseModel, Field
from langchain_core.output_parsers import PydanticOutputParser 

class classify_category_model(BaseModel):
    category: str = Field(description="determine the category of the insurance product",
                          enum = [                             
                             "Term Life Insurance", "Whole Life Insurance", "Pension Insurance", "Disability Insurance", 
                             "Long-Term Care Pension Insurance", "Health Insurance", "Critical Illness Insurance", "Basic Ability Insurance", 
                             "Long-Term Care Cost Insurance", "Long-Term Care Daily Allowance Insurance", "Liability Insurance", 
                             "Business Interruption Insurance", "Home Contents Insurance", "Building Insurance", "Business Property Insurance", 
                             "Commercial Insurance", "Loan Repayment Insurance", "Construction Performance Insurance", 
                             "Machinery Breakdown and Machinery Insurance", "Credit Insurance", "Fidelity Guarantee Insurance", "Erection Insurance", 
                             "Natural Disaster Insurance", "Accident Insurance", "Travel Insurance", "Transport Insurance", "Private Unemployment Insurance", 
                             "Pet Insurance", "Driver Protection Insurance", "Legal Protection Insurance"]
    )

parser = PydanticOutputParser(pydantic_object=classify_category_model)  # Updated to use the renamed class
format_inst = parser.get_format_instructions() 


In [18]:
# Define a first app to use

from trulens.apps.custom import instrument
import openai 

class CategorizationApp:
    @instrument
    def categorize(self, human_input_query):
        client = openai.OpenAI()
        response = (
            client.beta.chat.completions.parse(
                model="gpt-4o",
                messages=[
                    {
                        "role": "system", 
                        "content": system_input_stage_1.format(format_instructions=format_inst)
                    },
                    {
                        "role": "user", 
                        "content": human_input_query 
                    }
                ],
                response_format=classify_category_model
            )
            .choices[0].message.parsed
        )
        return response.category

In [19]:
categorization_app = CategorizationApp() 

#### Define a golden data set  per each task (e.g., categorization, summarization, etc )

In [20]:
# build a complete prompt by passing the variables to the prompt 
query_dic = {'query': []}
for index, row in df.iterrows():
    query_dic['query'].append(human_input_stage_1.format(company=row['company'], title=row['title'], content=row['content']))

In [21]:
# build a QA dataset 
import pprint 

categorization_golden_set = pd.DataFrame({
    'query': query_dic['query'],
    'expected_response': df['category']
}).to_dict("records")

#### Define feedback functions

In [22]:
from trulens.core import Feedback
from trulens.feedback import GroundTruthAgreement
from trulens.providers.openai import OpenAI
from trulens.apps.custom import TruCustomApp
from trulens.core import Select
from trulens.providers.huggingface import Huggingface

# from trulens.core import Select
# from trulens.providers.huggingface import Huggingface

provider = OpenAI(model_engine="gpt-4o")
hug_provider = Huggingface() # for groundedness evaluation 

gta = GroundTruthAgreement(categorization_golden_set, provider=provider)

f_groundtruth = Feedback(
    gta.agreement_measure, name="Ground Truth Similarity (LLM)"
).on_input_output() 

f_groundedness_llm = (
    Feedback(
        provider.groundedness_measure_with_cot_reasons,
        name="Groundedness_LLM",
    )
    .on(Select.RecordInput)
    .on(Select.RecordOutput)
)

f_groundedness_nli = (
    Feedback(
        hug_provider.groundedness_measure_with_nli,
        name="Groundedness_NLI",
    )
    .on(Select.RecordInput)
    .on(Select.RecordOutput)
)

✅ In Ground Truth Similarity (LLM), input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Ground Truth Similarity (LLM), input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Groundedness_LLM, input source will be set to __record__.main_input or `Select.RecordInput` .
✅ In Groundedness_LLM, input statement will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Groundedness_NLI, input source will be set to __record__.main_input or `Select.RecordInput` .
✅ In Groundedness_NLI, input statement will be set to __record__.main_output or `Select.RecordOutput` .


In [23]:
# Define a wrapper ( Instrument the callable for logging with TruLens )
categorization_recorder = TruCustomApp(
    categorization_app,
    app_name="categorization",
    app_version="v1",
    feedbacks=[
        f_groundtruth, f_groundedness_llm, f_groundedness_nli
    ],
)

## Evaluate the prompt 

In [25]:
# #  test a single run of the App as so. This should show up on the dashboard.
# for row in categorization_golden_set[:50]:
#     with categorization_recorder:
#         categorization_app.categorize(human_input_query = row['query'])   

from tenacity import retry
from tenacity import stop_after_attempt
from tenacity import wait_random_exponential 

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def run_with_backoff(doc):
    return categorization_recorder.with_record(categorization_app.categorize, human_input_query = row['query'])

for row in categorization_golden_set:
    llm_response = run_with_backoff(row["query"])
    print(llm_response)

('Commercial Insurance', Record(record_id='record_hash_c0feae2379f9f13624998d2538154343', app_id='app_hash_064b90faeb1d9d62a9ae967cdb89ccf7', cost=Cost(n_requests=0, n_successful_requests=0, n_completion_requests=0, n_classification_requests=0, n_classes=0, n_embedding_requests=0, n_embeddings=0, n_tokens=0, n_stream_chunks=0, n_prompt_tokens=0, n_completion_tokens=0, n_cortex_guardrails_tokens=0, cost=0.0, cost_currency='USD'), perf=Perf(start_time=datetime.datetime(2025, 1, 17, 4, 10, 45, 623386), end_time=datetime.datetime(2025, 1, 17, 4, 10, 46, 491492)), ts=datetime.datetime(2025, 1, 17, 4, 10, 46, 491627), tags='-', meta=None, main_input='Here are the examples of insurance categories \n\n[start of insurance categories]\n- Term Life Insurance\n- Whole Life Insurance\n- Pension Insurance\n- Disability Insurance\n- Long-Term Care Pension Insurance\n- Health Insurance\n- Critical Illness Insurance\n- Basic Ability Insurance\n- Long-Term Care Cost Insurance\n- Long-Term Care Daily All

In [39]:
# for row in categorization_golden_set:
#     with categorization_recorder:
#             categorization_app.categorize(human_input_query = row['query'])

# To avoid the token per minute limit, we can use the following code
# https://www.trulens.org/cookbook/use_cases/summarization_eval/#write-feedback-functions
from tenacity import retry
from tenacity import stop_after_attempt
from tenacity import wait_random_exponential 

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def run_with_backoff(doc):
    return categorization_recorder.with_record(categorization_app.categorize, human_input_query = row['query'])

for row in categorization_golden_set:
    llm_response = run_with_backoff(row["query"])
    print(llm_response)

('Commercial Insurance', Record(record_id='record_hash_b3ad8b467c95fb5333a10962512936eb', app_id='app_hash_064b90faeb1d9d62a9ae967cdb89ccf7', cost=Cost(n_requests=0, n_successful_requests=0, n_completion_requests=0, n_classification_requests=0, n_classes=0, n_embedding_requests=0, n_embeddings=0, n_tokens=0, n_stream_chunks=0, n_prompt_tokens=0, n_completion_tokens=0, n_cortex_guardrails_tokens=0, cost=0.0, cost_currency='USD'), perf=Perf(start_time=datetime.datetime(2025, 1, 17, 2, 55, 38, 859439), end_time=datetime.datetime(2025, 1, 17, 2, 55, 39, 905472)), ts=datetime.datetime(2025, 1, 17, 2, 55, 39, 906132), tags='-', meta=None, main_input='Here are the examples of insurance categories \n\n[start of insurance categories]\n- Term Life Insurance\n- Whole Life Insurance\n- Pension Insurance\n- Disability Insurance\n- Long-Term Care Pension Insurance\n- Health Insurance\n- Critical Illness Insurance\n- Basic Ability Insurance\n- Long-Term Care Cost Insurance\n- Long-Term Care Daily All

ERROR [trulens.core.instruments] Error calling wrapped function categorize.
ERROR [trulens.core.instruments] Traceback (most recent call last):
  File "/Users/jaeyeopchung/.pyenv/versions/test-en/lib/python3.10/site-packages/trulens/core/instruments.py", line 769, in tru_wrapper
    rets, tally = core_endpoint.Endpoint.track_all_costs_tally(
  File "/Users/jaeyeopchung/.pyenv/versions/test-en/lib/python3.10/site-packages/trulens/core/feedback/endpoint.py", line 589, in track_all_costs_tally
    result, cbs = Endpoint.track_all_costs(
  File "/Users/jaeyeopchung/.pyenv/versions/test-en/lib/python3.10/site-packages/trulens/core/feedback/endpoint.py", line 551, in track_all_costs
    return Endpoint._track_costs(
  File "/Users/jaeyeopchung/.pyenv/versions/test-en/lib/python3.10/site-packages/trulens/core/feedback/endpoint.py", line 666, in _track_costs
    result: T = __func(*args, **kwargs)
  File "/var/folders/7_/lzvh2hfd7nbfj2n6q9k0hb980000gn/T/ipykernel_39040/2287920619.py", line 11,

('Legal Protection Insurance', Record(record_id='record_hash_394e937c9032413a48db24e2e307b7c5', app_id='app_hash_064b90faeb1d9d62a9ae967cdb89ccf7', cost=Cost(n_requests=0, n_successful_requests=0, n_completion_requests=0, n_classification_requests=0, n_classes=0, n_embedding_requests=0, n_embeddings=0, n_tokens=0, n_stream_chunks=0, n_prompt_tokens=0, n_completion_tokens=0, n_cortex_guardrails_tokens=0, cost=0.0, cost_currency='USD'), perf=Perf(start_time=datetime.datetime(2025, 1, 17, 2, 57, 16, 722583), end_time=datetime.datetime(2025, 1, 17, 2, 57, 31, 811929)), ts=datetime.datetime(2025, 1, 17, 2, 57, 31, 812305), tags='-', meta=None, main_input='Here are the examples of insurance categories \n\n[start of insurance categories]\n- Term Life Insurance\n- Whole Life Insurance\n- Pension Insurance\n- Disability Insurance\n- Long-Term Care Pension Insurance\n- Health Insurance\n- Critical Illness Insurance\n- Basic Ability Insurance\n- Long-Term Care Cost Insurance\n- Long-Term Care Dai

ERROR [trulens.core.instruments] Error calling wrapped function categorize.
ERROR [trulens.core.instruments] Traceback (most recent call last):
  File "/Users/jaeyeopchung/.pyenv/versions/test-en/lib/python3.10/site-packages/trulens/core/instruments.py", line 769, in tru_wrapper
    rets, tally = core_endpoint.Endpoint.track_all_costs_tally(
  File "/Users/jaeyeopchung/.pyenv/versions/test-en/lib/python3.10/site-packages/trulens/core/feedback/endpoint.py", line 589, in track_all_costs_tally
    result, cbs = Endpoint.track_all_costs(
  File "/Users/jaeyeopchung/.pyenv/versions/test-en/lib/python3.10/site-packages/trulens/core/feedback/endpoint.py", line 551, in track_all_costs
    return Endpoint._track_costs(
  File "/Users/jaeyeopchung/.pyenv/versions/test-en/lib/python3.10/site-packages/trulens/core/feedback/endpoint.py", line 666, in _track_costs
    result: T = __func(*args, **kwargs)
  File "/var/folders/7_/lzvh2hfd7nbfj2n6q9k0hb980000gn/T/ipykernel_39040/2287920619.py", line 11,

('Health Insurance', Record(record_id='record_hash_a4f76af9609c1b0dca1390605d0ecbe2', app_id='app_hash_064b90faeb1d9d62a9ae967cdb89ccf7', cost=Cost(n_requests=0, n_successful_requests=0, n_completion_requests=0, n_classification_requests=0, n_classes=0, n_embedding_requests=0, n_embeddings=0, n_tokens=0, n_stream_chunks=0, n_prompt_tokens=0, n_completion_tokens=0, n_cortex_guardrails_tokens=0, cost=0.0, cost_currency='USD'), perf=Perf(start_time=datetime.datetime(2025, 1, 17, 2, 58, 13, 392081), end_time=datetime.datetime(2025, 1, 17, 2, 58, 28, 726429)), ts=datetime.datetime(2025, 1, 17, 2, 58, 28, 726740), tags='-', meta=None, main_input='Here are the examples of insurance categories \n\n[start of insurance categories]\n- Term Life Insurance\n- Whole Life Insurance\n- Pension Insurance\n- Disability Insurance\n- Long-Term Care Pension Insurance\n- Health Insurance\n- Critical Illness Insurance\n- Basic Ability Insurance\n- Long-Term Care Cost Insurance\n- Long-Term Care Daily Allowan

ERROR [trulens.core.instruments] Error calling wrapped function categorize.
ERROR [trulens.core.instruments] Traceback (most recent call last):
  File "/Users/jaeyeopchung/.pyenv/versions/test-en/lib/python3.10/site-packages/trulens/core/instruments.py", line 769, in tru_wrapper
    rets, tally = core_endpoint.Endpoint.track_all_costs_tally(
  File "/Users/jaeyeopchung/.pyenv/versions/test-en/lib/python3.10/site-packages/trulens/core/feedback/endpoint.py", line 589, in track_all_costs_tally
    result, cbs = Endpoint.track_all_costs(
  File "/Users/jaeyeopchung/.pyenv/versions/test-en/lib/python3.10/site-packages/trulens/core/feedback/endpoint.py", line 551, in track_all_costs
    return Endpoint._track_costs(
  File "/Users/jaeyeopchung/.pyenv/versions/test-en/lib/python3.10/site-packages/trulens/core/feedback/endpoint.py", line 666, in _track_costs
    result: T = __func(*args, **kwargs)
  File "/var/folders/7_/lzvh2hfd7nbfj2n6q9k0hb980000gn/T/ipykernel_39040/2287920619.py", line 11,

('Legal Protection Insurance', Record(record_id='record_hash_2cfe20400db3414fe68f744766cecf7a', app_id='app_hash_064b90faeb1d9d62a9ae967cdb89ccf7', cost=Cost(n_requests=0, n_successful_requests=0, n_completion_requests=0, n_classification_requests=0, n_classes=0, n_embedding_requests=0, n_embeddings=0, n_tokens=0, n_stream_chunks=0, n_prompt_tokens=0, n_completion_tokens=0, n_cortex_guardrails_tokens=0, cost=0.0, cost_currency='USD'), perf=Perf(start_time=datetime.datetime(2025, 1, 17, 2, 59, 45, 771479), end_time=datetime.datetime(2025, 1, 17, 3, 0, 1, 202273)), ts=datetime.datetime(2025, 1, 17, 3, 0, 1, 202440), tags='-', meta=None, main_input='Here are the examples of insurance categories \n\n[start of insurance categories]\n- Term Life Insurance\n- Whole Life Insurance\n- Pension Insurance\n- Disability Insurance\n- Long-Term Care Pension Insurance\n- Health Insurance\n- Critical Illness Insurance\n- Basic Ability Insurance\n- Long-Term Care Cost Insurance\n- Long-Term Care Daily A

In [30]:
from trulens_eval import Tru
tru = Tru()
records, feedback = tru.get_records_and_feedback(app_ids=[])
import pandas as pd

# Convert records to a DataFrame
records_df = pd.DataFrame(records)

# Save the DataFrame to an Excel file
records_df.to_excel("records.xlsx", index=False)

records_df[['Ground Truth Similarity (LLM)', 'Groundedness_NLI']]

KeyError: "['Groundedness_NLI'] not in index"

In [57]:
# show the result 
session.get_leaderboard()["Ground Truth Similarity (LLM)"]


app_name        app_version
categorization  v1             0.402632
Name: Ground Truth Similarity (LLM), dtype: float64

In [21]:
# show the result on the web 
run_dashboard(session, force=True) 

Starting dashboard ...


Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…

Dashboard started at http://192.168.178.104:52147 .


<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>