In [1]:
%%time
%%capture

!pip install trulens_eval llama_index

CPU times: user 13.2 ms, sys: 8.99 ms, total: 22.2 ms
Wall time: 2.11 s


In [2]:
%%time

import nltk
# Route NLTK's network access through the local HTTP proxy.
# NOTE(review): this is done before trulens_eval is imported, presumably so that
# any NLTK resource download goes through the proxy -- keep this order unless
# confirmed otherwise.
nltk.set_proxy('http://myproxy:7890')

from trulens_eval import Tru
# Instantiate Tru with its defaults (no arguments are passed).
tru = Tru()
# NOTE(review): presumably clears previously stored apps/records so this run
# starts clean -- confirm before running against a database you want to keep.
tru.reset_database()

🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `Tru` to prevent this.
CPU times: user 6.4 s, sys: 496 ms, total: 6.89 s
Wall time: 6.57 s


## 最基本的检索查询

In [3]:
%%time

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core import Settings
from llama_index.llms.openai_like import OpenAILike
from llama_index.embeddings.ollama import OllamaEmbedding

# Global chunking configuration (chunk size 128, overlap 16), set before the
# documents are indexed below.
Settings.chunk_size, Settings.chunk_overlap = 128, 16

# Chat LLM: the "qwen2" model reached through an OpenAI-compatible endpoint.
chat_llm = OpenAILike(
    model="qwen2",
    api_base="http://192.168.0.73:11434/v1",
    api_key="ollama",
    is_chat_model=True,
    temperature=0.1,
    request_timeout=60.0,
)
Settings.llm = chat_llm

# Embedding model served by Ollama.
# NOTE(review): the LLM and the embedding model point at different hosts;
# confirm that this is intentional.
embedding_model = OllamaEmbedding(
    model_name="quentinz/bge-large-zh-v1.5",
    base_url="http://monkey:11434",
    # mirostat N: Mirostat sampling mode (0 presumably disables it -- confirm
    # against the Ollama parameter reference).
    ollama_additional_kwargs={"mirostat": 0},
)
Settings.embed_model = embedding_model

# Load everything found under the "data" directory and index it.
reader = SimpleDirectoryReader("data")
documents = reader.load_data()
index = VectorStoreIndex.from_documents(documents)

# Query engine over that index, retrieving with similarity_top_k=3.
query_engine = index.as_query_engine(similarity_top_k=3)

CPU times: user 416 ms, sys: 20.4 ms, total: 436 ms
Wall time: 8.47 s


In [4]:
%%time

# Smoke-test the plain query engine with one question ("Where is Beihai Park?").
question = "北海公园在哪里？"
response = query_engine.query(question)
# print() shows the response as text.
print(response)

北海公园位于北京市西城区文津街1号。
CPU times: user 64.8 ms, sys: 6 µs, total: 64.8 ms
Wall time: 664 ms


## 评估

In [5]:
%%time
%%capture

!pip install litellm

CPU times: user 7.53 ms, sys: 8.64 ms, total: 16.2 ms
Wall time: 1.79 s


In [6]:
%%time

import litellm

from trulens_eval.feedback.provider import LiteLLM

# Tip: assign litellm.set_verbose (True/False) to switch LiteLLM's
# request/response logging on or off.

# Feedback provider: LiteLLM talking to the "qwen2" model on the Ollama server.
ollama_provider = LiteLLM(model_engine="ollama/qwen2", api_base="http://192.168.0.73:11434")

CPU times: user 56 ms, sys: 8.05 ms, total: 64.1 ms
Wall time: 63.3 ms


In [7]:
%%time

from trulens_eval.feedback.provider import OpenAI
from trulens_eval import Feedback
import numpy as np

from trulens_eval.app import App

# All feedback functions below are evaluated by the Ollama-backed LiteLLM provider.
provider = ollama_provider

# Selector for the retrieved context chunks. Where the context lives inside a
# record is app specific, so it is derived from the query engine itself.
context = App.select_context(query_engine)

# Groundedness: the collected context chunks (as one list) against the final output.
f_groundedness = Feedback(
    provider.groundedness_measure_with_cot_reasons,
    name="Groundedness",
).on(context.collect()).on_output()

# Answer relevance: the overall question against the overall answer.
f_answer_relevance = Feedback(
    provider.relevance_with_cot_reasons,
    name="Answer Relevance",
).on_input_output()

# Context relevance: the question against each context chunk, aggregated with the mean.
f_context_relevance = Feedback(
    provider.context_relevance_with_cot_reasons,
    name="Context Relevance",
).on_input().on(context).aggregate(np.mean)

✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text.collect() .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input context will be set to __record__.app.query.rets.source_nodes[:].node.text .
CPU times: user 29.1 ms, sys: 67 µs, total: 29.2 ms
Wall time: 29.1 ms


In [8]:
%%time

from trulens_eval import TruLlama
# Recorder around the unfiltered query engine, registered as 'LlamaIndex_App1'
# with the three feedback functions defined earlier in the notebook.
evaluation_feedbacks = [f_groundedness, f_answer_relevance, f_context_relevance]
tru_query_engine_recorder = TruLlama(
    query_engine,
    app_id='LlamaIndex_App1',
    feedbacks=evaluation_feedbacks,
)

CPU times: user 225 ms, sys: 7.3 ms, total: 233 ms
Wall time: 243 ms


In [9]:
%%time

# Log every LiteLLM request while the feedback functions run (very noisy output).
litellm.set_verbose = True

# Run one query with the recorder active; the captured records are then
# available through `recording`.
recorded_question = "北海公园在哪里？"
with tru_query_engine_recorder as recording:
    query_engine.query(recorded_question)



[92mRequest to litellm:[0m
[92mlitellm.completion(temperature=0.0, model='ollama/qwen2', messages=[{'role': 'system', 'content': "You are a RELEVANCE grader; providing the relevance of the given RESPONSE to the given PROMPT.\n        Respond only as a number from 0 to 10 where 0 is the least relevant and 10 is the most relevant. \n\n        A few additional scoring guidelines:\n\n        - Long RESPONSES should score equally well as short RESPONSES.\n\n        - Answers that intentionally do not answer the question, such as 'I don't know' and model refusals, should also be counted as the most RELEVANT.\n\n        - RESPONSE must be relevant to the entire PROMPT to get a score of 10.\n\n        - RELEVANCE score should increase as the RESPONSE provides RELEVANT context to more parts of the PROMPT.\n\n        - RESPONSE that is RELEVANT to none of the PROMPT should get a score of 0.\n\n        - RESPONSE that is RELEVANT to some of the PROMPT should get as score of 2, 3, or 4. Highe

In [10]:
%%time

from trulens_eval.utils.display import get_feedback_result

# Take the most recent record of the recording and look up its
# "Context Relevance" feedback result (displayed as the cell's value).
last_record = recording.records[-1]
get_feedback_result(last_record, "Context Relevance")

CPU times: user 63.7 ms, sys: 8.01 ms, total: 71.7 ms
Wall time: 14 s


In [11]:
%%time

from trulens_eval.guardrails.llama import WithFeedbackFilterNodes

# A guardrail feedback function must return only a score (no reasons), so a
# separate score-only context-relevance feedback is defined for it.
f_context_relevance_score = Feedback(provider.context_relevance)

# Wrap the query engine so retrieved nodes are filtered with that feedback at a
# 0.5 threshold (presumably dropping nodes that score below it -- confirm).
filtered_query_engine = WithFeedbackFilterNodes(
    query_engine,
    feedback=f_context_relevance_score,
    threshold=0.5,
)

CPU times: user 8.74 ms, sys: 53 µs, total: 8.79 ms
Wall time: 8.72 ms


In [None]:
%%time

import litellm
# Keep LiteLLM request logging switched on for this run as well.
litellm.set_verbose = True

# Recorder for the guardrailed engine, registered under its own app id with the
# same three feedback functions.
tru_recorder = TruLlama(
    filtered_query_engine,
    app_id='LlamaIndex_App1_Filtered',
    feedbacks=[f_answer_relevance, f_context_relevance, f_groundedness],
)

# Ask the same question through the filtered engine while recording.
# Note: this rebinds `recording`, a name an earlier cell also assigns, so any
# cell reading `recording` afterwards sees this run's records.
with tru_recorder as recording:
    llm_response = filtered_query_engine.query("北海公园在哪里？")