# Dev Notebook

This notebook loads the version of trulens_eval from the enclosing repo folder. You can use this to debug or develop trulens_eval features.

In [1]:
# pip uninstall -y trulens_eval
# pip install git+https://github.com/truera/trulens@piotrm/azure_bugfixes#subdirectory=trulens_eval

# trulens_eval notebook dev

%load_ext autoreload
%autoreload 2
from pathlib import Path
import sys

# Walk upwards from the current working directory until we reach the folder
# that contains a `trulens_eval` entry.
base = Path().cwd()
while not (base / "trulens_eval").exists():
    if base.parent == base:
        # At the filesystem root `parent` returns the same path, so without
        # this check the loop would never terminate when the notebook is run
        # from outside the repository.
        raise FileNotFoundError(
            "Could not find a `trulens_eval` folder in the current working "
            "directory or any of its parents."
        )
    base = base.parent

print(base)

# If running from github repo, can use this:
# (guarded so that re-running the cell does not add duplicate entries)
if str(base) not in sys.path:
    sys.path.append(str(base))

# Uncomment for more debugging printouts.
"""
import logging
root = logging.getLogger()
root.setLevel(logging.DEBUG)

handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
root.addHandler(handler)
"""

from trulens_eval.keys import check_keys

check_keys(
    "OPENAI_API_KEY",
    "HUGGINGFACE_API_KEY"
)

from trulens_eval import Tru
tru = Tru()
# tru.reset_database()

# tru.run_dashboard(_dev=base, force=True)

/Volumes/dev_new/trulens/trulens_eval
✅ Key OPENAI_API_KEY set from environment (same value found in .env file at /Volumes/dev_new/.env).
✅ Key HUGGINGFACE_API_KEY set from environment (same value found in .env file at /Volumes/dev_new/.env).
🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of Tru` to prevent this.


In [None]:
from trulens_eval.feedback.provider import LiteLLM
from trulens_eval.feedback.provider.endpoint import Endpoint
from litellm import completion

from trulens_eval.keys import check_or_set_keys

import os

check_or_set_keys(
    "AZURE_OPENAI_ENDPOINT",
    "AZURE_OPENAI_API_KEY",
    "OPENAI_API_TYPE",
    "OPENAI_API_VERSION",
    "AZURE_OPENAI_DEPLOYMENT_NAME"
)

import litellm
from litellm.utils import get_llm_provider, completion_cost

prov = get_llm_provider(model="azure/truera-gpt-35-turbo")

"""
lm_res = litellm.completion(
    model="azure/truera-gpt-35-turbo",
    api_base="https://truera-dev-azure-openai.openai.azure.com/",
    messages=[{"role": "user", "content": "hello there"}]
)
"""
    #azure_client_params = { 
    #    "api_key": os.environ["AZURE_OPENAI_API_KEY"], 
    #    "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"], 
    #    "api_version": os.environ["OPENAI_API_VERSION"], 
    #}


In [None]:
get_llm_provider("azure/truera-gpt-35-turbo")

In [None]:
#prov = LiteLLM(
#    model_engine="azure/" + os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"],
#    completion_kwargs=dict(api_base="https://truera-dev-azure-openai.openai.azure.com/")
#)
from trulens_eval.feedback.provider.bedrock import Bedrock
from trulens_eval.feedback.provider import LiteLLM

# Have to delete litellm endpoint singleton as it may have been created
# with the wrong underlying litellm provider in a prior test.
# Endpoint.delete_singleton_by_name("litellm")

#provider = LiteLLM(f"bedrock/{Bedrock.DEFAULT_MODEL_ID}")
provider = Bedrock()

In [None]:
from pprint import pprint
provider.endpoint

In [None]:
provider.sentiment("hello there")

In [None]:
res, costs = await Endpoint.atrack_all_costs_tally(prov.sentiment, text="This rocks!")


In [None]:
costs

In [None]:
# `create` takes keyword-only arguments and needs both `model` and `messages`.
# NOTE(review): `op` is created in the following cell, which has to be run
# before this one.
op.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello?"}],
    stream=True,
)

In [None]:
from trulens_eval.feedback.provider import Huggingface
from trulens_eval.feedback.provider import OpenAI
from trulens_eval.feedback.provider.endpoint import Endpoint
# Alias the OpenAI SDK client class so that it does not shadow the trulens
# `OpenAI` feedback provider imported above.
from openai import OpenAI as OpenAIClient

# OpenAI SDK client (not a trulens feedback provider).
op = OpenAIClient()

# Huggingface feedback provider.
hugs = Huggingface()

In [None]:
res, cost = Endpoint.track_all_costs_tally(hugs.positive_sentiment, text="This rocks!")

In [None]:
res

In [None]:
import json

import boto3
from langchain.docstore.document import Document
from langchain_community.retrievers.bedrock import \
    AmazonKnowledgeBasesRetriever
from langchain_community.retrievers.bedrock import RetrievalConfig


class MyKnowledgeBaseRetriever(AmazonKnowledgeBasesRetriever):
    """Retriever that queries an Amazon Bedrock Knowledge Base through boto3.

    Args:
        knowledge_base_id: Passed to the `retrieve` call as `knowledgeBaseId`.
        region_name: AWS region used to create the boto3 session.
        retrieval_config: Passed unchanged to the `retrieve` call as
            `retrievalConfiguration`.
    """

    def __init__(self, knowledge_base_id, region_name, retrieval_config):
        super().__init__(
            knowledge_base_id=knowledge_base_id,
            region_name=region_name,
            retrieval_config=retrieval_config
        )
        # Re-assign the raw constructor arguments after the parent initializer.
        # NOTE(review): presumably the parent converts `retrieval_config` into
        # its own model type, in which case this keeps the plain dict that is
        # handed to boto3 below -- confirm before removing.
        self.knowledge_base_id = knowledge_base_id
        self.region_name = region_name
        self.retrieval_config = retrieval_config

    def get_relevant_documents(self, query):
        """Retrieve the documents relevant to `query` from the knowledge base.

        Args:
            query: Text of the retrieval query.

        Returns:
            A list with one `Document` per retrieval result. Each document's
            metadata has the keys `location` (the S3 URI, or None when the
            result carries no S3 location) and `score` (or None when the
            result carries no score).
        """
        # Create a session using your AWS credentials
        session = boto3.Session(region_name=self.region_name)

        # Create a client for the Bedrock Agent Runtime service
        bedrock_client = session.client('bedrock-agent-runtime')

        # Retrieve relevant documents from the Knowledge Base
        response = bedrock_client.retrieve(
            knowledgeBaseId=self.knowledge_base_id,
            retrievalQuery={'text': query},
            retrievalConfiguration=self.retrieval_config
        )

        docs = []
        for result in response['retrievalResults']:
            # `location`, `s3Location` and `score` are looked up defensively so
            # that a result lacking any of them yields None instead of a
            # KeyError that aborts the whole retrieval.
            s3_location = (result.get('location') or {}).get('s3Location') or {}
            docs.append(
                Document(
                    page_content=result['content']['text'],
                    metadata={
                        'location': s3_location.get('uri'),
                        'score': result.get('score'),
                    }
                )
            )

        return docs

In [None]:
from langchain.llms.bedrock import Bedrock
from langchain.chains.retrieval_qa.base import RetrievalQA

retriever = MyKnowledgeBaseRetriever(
    knowledge_base_id="MY KNOWLEDGE BASE ID",
    region_name="us-east-1",
    retrieval_config={"vectorSearchConfiguration": {"numberOfResults": 6}}
)

# Create an instance of the Bedrock LLM
llm = Bedrock(
    model_id='meta.llama2-70b-chat-v1',
    model_kwargs={"temperature": 0.1, "top_p": 0.9, "max_gen_len": 1200}
)

# Create an instance of the RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)

# Garbage collection testing

In [None]:
from trulens_eval import TruChain, Tru
tru = Tru()

from operator import itemgetter
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser

prompt1 = ChatPromptTemplate.from_template("what is the city {person} is from?")
prompt2 = ChatPromptTemplate.from_template(
    "what country is the city {city} in? respond in {language}"
)

model = ChatOpenAI()

chain1 = prompt1 | model | StrOutputParser()

chain2 = (
    {"city": chain1, "language": itemgetter("language")}
    | prompt2
    | model
    | StrOutputParser()
)

chain2_input = {"person": "obama", "language": "spanish"}
# chain2.invoke(chain2_input)

tru_recorder2 = TruChain(
    chain2,
    app_id='Chain1'
)

with tru_recorder2 as recs:
    llm_response = chain2.invoke(chain2_input)

In [None]:
chain2 = (
    {"city": chain1, "language": itemgetter("language")}
    | prompt2
    | model
    | StrOutputParser()
)

tru_recorder2 = TruChain(
    chain2,
    app_id='Chain2'
)

with tru_recorder2 as recs:
    llm_response = chain2.invoke(chain2_input)

In [None]:
chain2 = (
    {"city": chain1, "language": itemgetter("language")}
    | prompt2
    | model
    | StrOutputParser()
)

tru_recorder2 = TruChain(
    chain2,
    app_id='Chain3'
)

with tru_recorder2 as recs:
    llm_response = chain2.invoke(chain2_input)

In [None]:
chain2 = (
    {"city": chain1, "language": itemgetter("language")}
    | prompt2
    | model
    | StrOutputParser()
)

tru_recorder2 = TruChain(
    chain2,
    app_id='Chain4'
)

with tru_recorder2 as recs:
    llm_response = chain2.invoke(chain2_input)

In [None]:
chain2 = (
    {"city": chain1, "language": itemgetter("language")}
    | prompt2
    | model
    | StrOutputParser()
)

tru_recorder2 = TruChain(
    chain2,
    app_id='Chain5'
)

In [None]:
with tru_recorder2 as recs:
    llm_response = chain2.invoke(chain2_input)

In [None]:
from concurrent.futures import as_completed
import json
import os
from pathlib import Path
from time import sleep

import dotenv
from tqdm.auto import tqdm

from trulens_eval import Feedback
from trulens_eval import Tru
from trulens_eval.feedback.provider.endpoint.base import Endpoint
from trulens_eval.feedback.provider.hugs import Dummy
from trulens_eval.schema import Cost
from trulens_eval.schema import FeedbackMode
from trulens_eval.schema import Record
from trulens_eval.tru_custom_app import TruCustomApp
from trulens_eval.utils.threading import TP

# Context selection tests

In [None]:
import boto3
import json
from langchain_community.retrievers.bedrock import (
    AmazonKnowledgeBasesRetriever,
    RetrievalConfig,
)
from langchain.docstore.document import Document

from langchain.llms.bedrock import Bedrock
from langchain.chains.retrieval_qa.base import RetrievalQA


class MyKnowledgeBaseRetriever(AmazonKnowledgeBasesRetriever):
    """Bedrock Knowledge Base retriever that calls the `retrieve` API via boto3.

    Args:
        knowledge_base_id: Sent as `knowledgeBaseId` in the `retrieve` call.
        region_name: AWS region for the boto3 session.
        retrieval_config: Sent unchanged as `retrievalConfiguration` in the
            `retrieve` call.
    """

    def __init__(self, knowledge_base_id, region_name, retrieval_config):
        super().__init__(
            knowledge_base_id=knowledge_base_id,
            region_name=region_name,
            retrieval_config=retrieval_config
        )
        # The raw arguments are stored again after the parent initializer ran.
        # NOTE(review): this looks redundant, but the parent may replace
        # `retrieval_config` with a model object; the plain dict is what gets
        # passed to boto3 below -- confirm before removing.
        self.knowledge_base_id = knowledge_base_id
        self.region_name = region_name
        self.retrieval_config = retrieval_config

    def get_relevant_documents(self, query):
        """Query the knowledge base and wrap each result in a `Document`.

        Args:
            query: Text of the retrieval query.

        Returns:
            A list of `Document`s whose metadata contains `location` (S3 URI,
            or None if the result has no S3 location) and `score` (or None if
            the result has no score).
        """
        # Create a session using your AWS credentials
        session = boto3.Session(region_name=self.region_name)

        # Create a client for the Bedrock Agent Runtime service
        bedrock_client = session.client('bedrock-agent-runtime')

        # Retrieve relevant documents from the Knowledge Base
        response = bedrock_client.retrieve(
            knowledgeBaseId=self.knowledge_base_id,
            retrievalQuery={'text': query},
            retrievalConfiguration=self.retrieval_config
        )

        docs = []
        for result in response['retrievalResults']:
            # Missing `location` / `s3Location` / `score` entries become None
            # rather than raising KeyError for the whole query.
            s3_location = (result.get('location') or {}).get('s3Location') or {}
            metadata = {
                'location': s3_location.get('uri'),
                'score': result.get('score'),
            }
            docs.append(
                Document(page_content=result['content']['text'], metadata=metadata)
            )

        # Return the relevant documents
        return docs


# Configure the AWS credentials and region
# config = Config(region_name='us-east-1')

retriever = MyKnowledgeBaseRetriever(
    knowledge_base_id="MY KNOWLEDGE BASE ID",
    region_name="us-east-1",
    retrieval_config={"vectorSearchConfiguration": {
        "numberOfResults": 6
    }}
)

# Create an instance of the Bedrock LLM
llm = Bedrock(
    model_id='meta.llama2-70b-chat-v1',
    model_kwargs={
        "temperature": 0.1,
        "top_p": 0.9,
        "max_gen_len": 1200
    }
)

# Create an instance of the RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm, retriever=retriever, return_source_documents=True
)

#qa_chain.invoke("What steps should be taken during the demo ride in the buying cycle to improve the likelihood of closing the deal?")

In [None]:
from trulens_eval.app import App

import numpy as np

tru = Tru()

context = App.select_context(qa_chain)

In [None]:
context

In [None]:
# Setup Tru and/or dashboard.

tru = Tru()

#tru.reset_database()

tru.start_dashboard(
    force = True,
    _dev=Path().cwd().parent.parent.resolve()
)

# Pydantic testing

In [None]:
from typing import Any

from pydantic import BaseModel
from pydantic import Field
from pydantic import field_validator
from pydantic import model_validator
from pydantic import PydanticUndefinedAnnotation
from pydantic import SerializeAsAny
from pydantic import ValidationInfo
from pydantic import validator
from pydantic_core import PydanticUndefined


class CustomLoader(BaseModel):
    """Base model that records its concrete class in the `cls` field.

    The `before` validator uses `cls` entries found in nested dicts to validate
    each nested dict with the class it names instead of the annotated field
    type.
    """
    # Concrete class of the instance; filled in by `__init__`.
    cls: Any

    def __init__(self, *args, **kwargs):
        # Always record the concrete class, overriding any `cls` passed in.
        kwargs['cls'] = type(self)
        super().__init__(*args, **kwargs)

    @model_validator(mode='before')
    @staticmethod
    def my_model_validate(obj, info: ValidationInfo):
        """Pre-validate `obj`, dispatching nested dicts to their own `cls`.

        Args:
            obj: Raw input to validate. Anything that is not a dict with a
                `cls` key is returned unchanged so that pydantic's regular
                validation (and error reporting) applies.
            info: Validation info supplied by pydantic (unused).

        Returns:
            A dict with one entry per field of `obj['cls']` that was either
            supplied in `obj` or has a default; or `obj` itself when there is
            nothing to dispatch on.
        """
        if not isinstance(obj, dict) or 'cls' not in obj:
            # Without a `cls` entry there is nothing to dispatch on; returning
            # the input lets pydantic raise a ValidationError for the missing
            # field instead of this validator raising KeyError.
            return obj

        cls = obj['cls']

        validated = dict()
        for k, finfo in cls.model_fields.items():
            print(k, finfo)
            typ = finfo.annotation

            if k in obj:
                # A supplied value always wins over the field's default.
                val = obj[k]
            else:
                val = finfo.get_default()
                if val is PydanticUndefined:
                    # Required field that was not supplied: leave it out so
                    # that pydantic reports it as missing.
                    continue

            print(typ, type(typ))
            if isinstance(typ, type) \
            and issubclass(typ, CustomLoader) \
            and isinstance(val, dict) and "cls" in val:
                # Validate the nested dict with the class it names, which may
                # be a subclass of the annotated type.
                subcls = val['cls']
                val = subcls.model_validate(val)

            validated[k] = val

        return validated

# Base class for the nested models used in this experiment.
class SubModel(CustomLoader):
    sm: int = 3

# Top-level model; `sub` is annotated with the base class SubModel but is
# given a SubModelA instance below.
class Model(CustomLoader):
    m: int = 2
    sub: SubModel

# Second-level nested model (a SubModel subclass with one extra field).
class SubSubModelA(SubModel):
    ssma: int = 42

# SubModel subclass that itself contains a nested SubSubModelA.
class SubModelA(SubModel):
    sma: int = 0
    subsub: SubSubModelA

# Another SubModel subclass; not instantiated in this cell.
class SubModelB(SubModel):
    smb: int = 1

# Build a three-level nested instance: Model -> SubModelA -> SubSubModelA.
c = Model(sub=SubModelA(subsub=SubSubModelA()))

In [None]:
c

In [None]:
c.model_dump()

In [None]:
Model.model_validate({'cls': Model, 'm': 2, 'sub': {'cls': SubModelA, 'sma':3, 'subsub': {'cls': SubSubModelA, 'ssma': 42}}})

In [None]:
Model.model_validate({'c': 2, 'sub': {}})

# Keys testing

In [None]:
# Show keys.

import os
# Print the names (not the values) of environment variables containing "KEY".
for k in os.environ:
    if "KEY" not in k:
        continue
    print(k)

# Bedrock testing


In [None]:
from trulens_eval import Bedrock
bedrock = Bedrock(
    model_id = "amazon.titan-tg1-large",
    region_name="us-west-2"
)

In [None]:
Endpoint.print_instrumented()

In [None]:
# Pass the callable positionally, like the other `track_all_costs_tally` call
# in this notebook, so the call does not depend on the parameter's name.
bedrock_response, cost = Endpoint.track_all_costs_tally(
    lambda: bedrock.endpoint.client.invoke_model_with_response_stream(
        body=json.dumps({'inputText': "Hello there."}),
        modelId="amazon.titan-tg1-large"
    )
)

# Huggingface testing

In [None]:
from trulens_eval import Huggingface
Huggingface()

In [None]:
Endpoint.print_instrumented()

# AzureOpenAI Testing

In [None]:
from trulens_eval.keys import check_keys
check_keys(
    "AZURE_OPENAI_API_KEY",
    "AZURE_OPENAI_ENDPOINT",
    "OPENAI_API_VERSION",
    "OPENAI_API_TYPE",
    "AZURE_OPENAI_DEPLOYMENT_NAME",
)

In [None]:
from llama_index.llms import AzureOpenAI as AzureOpenAIChat
import os

gpt_35_turbo = AzureOpenAIChat(
    deployment_name="gpt-35-turbo",
    model="gpt-35-turbo",
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2023-05-15",
    model_version="0613",
    temperature=0.0,
)
c = gpt_35_turbo._get_client()
gpt_35_turbo._get_credential_kwargs()

In [None]:
c.base_url

In [None]:
import os
from trulens_eval import feedback
azopenai = feedback.AzureOpenAI(
    deployment_name=os.environ['AZURE_OPENAI_DEPLOYMENT_NAME']
)

In [None]:
azopenai.endpoint.client.client_kwargs

In [None]:
# azopenai.relevance(prompt="Where is Germany?", response="Germany is in Europe.")

In [None]:
# reval = feedback.AzureOpenAI.model_validate(azopenai.model_dump())
# reval.relevance(prompt="Where is Germany?", response="Poland is in Europe.")

In [None]:
azureOpenAI = azopenai

from trulens_eval.feedback.provider import AzureOpenAI
from trulens_eval.feedback import Groundedness, GroundTruthAgreement
from trulens_eval import TruLlama, Feedback
from trulens_eval.app import App
import numpy as np
# Initialize provider class
#azureOpenAI = AzureOpenAI(deployment_name="gpt-35-turbo")

grounded = Groundedness(groundedness_provider=azureOpenAI)
# Define a groundedness feedback function
f_groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons)
    .on_input_output()
    .aggregate(grounded.grounded_statements_aggregator)
)

# Question/answer relevance between overall question and answer.
f_answer_relevance = Feedback(azureOpenAI.relevance_with_cot_reasons).on_input_output()
# Question/statement relevance between question and each context chunk.
f_context_relevance = (
    Feedback(azureOpenAI.qs_relevance_with_cot_reasons)
    .on_input_output()
    .aggregate(np.mean)
)

# GroundTruth for comparing the Answer to the Ground-Truth Answer
#ground_truth_collection = GroundTruthAgreement(golden_set, provider=azureOpenAI)
#f_answer_correctness = (
#    Feedback(ground_truth_collection.agreement_measure)
#    .on_input_output()
#)

In [None]:
f_groundedness.model_dump()

In [None]:
from trulens_eval.utils.pyschema import WithClassInfo
from trulens_eval.utils.serial import SerialModel
from trulens_eval.feedback.groundedness import Groundedness
import pydantic

In [None]:
Groundedness.model_validate(grounded.model_dump())

In [None]:
f2 = Feedback.model_validate(f_groundedness.model_dump())

In [None]:
f2.implementation.obj.init_bindings.kwargs

In [None]:
f2.imp

In [None]:
def test_serial(f):
    """Call a feedback's `imp` before and after a dump/validate round trip.

    Args:
        f: Object providing `imp(...)` and `model_dump()`.

    Returns:
        The object produced by `Feedback.model_validate` from the dump of `f`.
    """
    question = "Where is Poland?"

    print("Before serialization:")
    before = f.imp(question, "Poland is in Europe")
    print(before)

    # Round trip: dump to plain data, then rebuild through validation.
    restored = Feedback.model_validate(f.model_dump())

    print("After serialization:")
    after = restored.imp(question, "Germany is in Europe")
    print(after)

    return restored

f2 = test_serial(f_groundedness)

In [None]:
f_groundedness.imp

In [None]:
f2.imp

In [None]:
f_answer_relevance = Feedback(azureOpenAI.relevance_with_cot_reasons).on_input_output()

# test without serialization
print(f_answer_relevance.imp(prompt="Where is Germany?", response="Germany is in Europe."))

# serialize/deserialize
f_answer_relevance2 = Feedback.model_validate(f_answer_relevance.model_dump())

# test after deserialization
print(f_answer_relevance2.imp(prompt="Where is Germany?", response="Poland is in Europe."))

In [None]:
# `f` only exists as a parameter inside `test_serial`; round-trip the
# relevance feedback defined above instead (the next cell calls `fr.imp` with
# `prompt`/`response`, matching the relevance feedback).
fr = feedback.Feedback.model_validate(f_answer_relevance.model_dump())

In [None]:
fr.imp(prompt="Where is Germany?", response="Germany is in Europe.")

# Dummy endpoint testing

In [None]:
tp = TP()

d = Dummy(
    loading_prob=0.1,
    freeze_prob=0.0, # we expect requests to have their own timeouts so freeze should never happen
    error_prob=0.01,
    overloaded_prob=0.1,
    rpm=6000
)

# Langchain testing

In [None]:
# Imports from langchain to build app. You may need to install langchain first
# with the following:
# ! pip install langchain>=0.0.170
from langchain.chains import LLMChain
from langchain_community.llms import OpenAI
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.prompts import HumanMessagePromptTemplate

# Initialize Huggingface-based feedback function collection class:
# bedrock = Bedrock(model_engine="Bedrock", model_id = "anthropic.claude-v2", region_name="us-west-2")

# Define a language match feedback function using HuggingFace.
#f_relevance = Feedback(bedrock.relevance).on_input_output()
# By default this will check language match on the main app input and main app
# output.

full_prompt = HumanMessagePromptTemplate(
    prompt=PromptTemplate(
        template=
        "Provide a helpful response with relevant background information for the following: {prompt}",
        input_variables=["prompt"],
    )
)

chat_prompt_template = ChatPromptTemplate.from_messages([full_prompt])

llm = OpenAI(temperature=0.9, max_tokens=128)

chain = LLMChain(llm=llm, prompt=chat_prompt_template, verbose=True)

tru_recorder1 = tru.Chain(
    chain,
    app_id='Chain1_ChatApplication',
    #feedbacks=[f_relevance]
)

with tru_recorder1 as recording:
    llm_response = chain.run("What's the capital of the USA?")

display(llm_response)

# Langchain expressions testing

In [None]:
from trulens_eval import TruChain, Tru
tru = Tru()

from operator import itemgetter
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser

prompt1 = ChatPromptTemplate.from_template("what is the city {person} is from?")
prompt2 = ChatPromptTemplate.from_template(
    "what country is the city {city} in? respond in {language}"
)

model = ChatOpenAI()

chain1 = prompt1 | model | StrOutputParser()

chain2 = (
    {"city": chain1, "language": itemgetter("language")}
    | prompt2
    | model
    | StrOutputParser()
)

chain2_input = {"person": "obama", "language": "spanish"}

chain2.invoke(chain2_input)

tru_recorder2 = TruChain(
    chain2,
    app_id='Chain2'
)

with tru_recorder2 as recs:
    llm_response = chain2.invoke({"person": "obama", "language": "spanish"})

In [None]:
record = recs.get()

In [None]:
record.calls

In [None]:
! pip install 'llama_index<0.10'

# Llama-Index testing


In [2]:
from trulens_eval import Feedback, Tru, TruLlama
from trulens_eval.feedback import Groundedness
from trulens_eval.feedback.provider.openai import OpenAI

tru = Tru()

from llama_index import VectorStoreIndex, QueryBundle
from llama_index.readers.web import SimpleWebPageReader

documents = SimpleWebPageReader(
    html_to_text=True
).load_data(["http://paulgraham.com/worked.html"])
index = VectorStoreIndex.from_documents(documents)

query_engine = index.as_query_engine()

ImportError: cannot import name 'global_handler' from 'llama_index.core' (/opt/homebrew/Caskroom/miniconda/base/envs/py311_trulens/lib/python3.11/site-packages/llama_index/core/__init__.py)

In [3]:
! pip list | grep llama

llama-index                              0.9.48
llama-index-agent-openai                 0.1.4
llama-index-core                         0.10.9
llama-index-embeddings-openai            0.1.5
llama-index-indices-managed-llama-cloud  0.1.2
llama-index-legacy                       0.9.48
llama-index-llms-openai                  0.1.5
llama-index-multi-modal-llms-openai      0.1.3
llama-index-program-openai               0.1.3
llama-index-question-gen-openai          0.1.2
llama-index-readers-file                 0.1.4
llama-index-readers-llama-parse          0.1.2
llama-parse                              0.3.4
llamaindex-py-client                     0.1.13
pyllama                                  0.0.9


In [None]:
tru_query_engine_recorder = TruLlama(query_engine,
    app_id='LlamaIndex_App1',
    feedbacks=[])

In [None]:
# or as context manager
with tru_query_engine_recorder as recording:
    print(query_engine.query(QueryBundle("What did the author do growing up?")))

In [None]:
rec = recording.get().layout_calls_as_app() # important
from trulens_eval.utils.serial import Lens
from trulens_eval.schema import Select
all_args = next(Lens().app.query[0].args.str_or_query_bundle.get(rec))

Select.render_for_dashboard(Select.RecordRets)

# TruBasicApp testing

In [None]:
from trulens_eval import TruBasicApp

SCRIPT_DIR = Path().cwd()
dotenv.load_dotenv(SCRIPT_DIR / "my.env")

tru = Tru(database_redact_keys=True)#database_url=os.environ.get("database_url"))

def llm_standalone(prompt):
    """Stand-in text-to-text app: return the prompt unchanged."""
    response = prompt
    return response

f_sentiment = Feedback(bedrock.sentiment).on_output()

# The app name belongs in `app_id`.
# NOTE(review): `app` appears to expect an already wrapped app object rather
# than a name, so the string "default" passed there was not used as the
# identifier -- confirm against the TruBasicApp signature.
recorder = TruBasicApp(llm_standalone, app_id="default", feedbacks=[f_sentiment])