### Step1 导入相关包

In [1]:
import os
# TruLens entry points: TruChain (used further down to wrap the chain for
# recording) and Tru (the workspace handle created right below).
from trulens_eval import TruChain, Tru
tru = Tru()
# NOTE(review): presumably wipes previously recorded apps/records so the run
# starts from an empty database — confirm this is intended on every re-run.
tru.reset_database()

# Imports from LangChain to build app
import bs4
from langchain import hub
from langchain_community.llms import BaichuanLLM
# from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import WebBaseLoader
from langchain.schema import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

  from .autonotebook import tqdm as notebook_tqdm


🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `Tru` to prevent this.


WARNI [langchain_community.utils.user_agent] USER_AGENT environment variable not set, consider setting it to identify your requests.


### Step2 加载数据

In [2]:
# Directory handed to the PDF directory loader in the next cell.
data = "../demo/"
# An arbitrary PDF was picked here purely for demonstration purposes.

In [3]:

# Load the PDF documents found under `data`.
loader = PyPDFDirectoryLoader(data)
docs_before_split = loader.load()

# Filter out the table of contents and the appendix: keep only documents
# whose `page` metadata lies strictly between 7 and 275.
docs_before_split = [
    page_doc
    for page_doc in docs_before_split
    if 7 < page_doc.metadata['page'] < 275
]

# Split the documents into chunks; chunk_size is the size of each chunk and
# can be tuned as needed.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=30)
docs_after_split = text_splitter.split_documents(docs_before_split)

# Show the first chunk as a sanity check.
docs_after_split[0]

Document(page_content='第一章 电力现货市场基础   \n             \n1   \n第一章 \n \n \n电力现货市场基础  \n \n \n \n  \n 1. 什么是电力市场？电力市场与 普通商品市场有哪 些差异？电力市场有\n哪些特征？ \n（1）电力市场的概念。  \n我国关于电力市场的权威解释始见于《中国电力百科全书  电力系统卷（第二版） 》。\n电力市场的定义为：基于市场经济原则，电力市场的定义为基于市场经济原则，为实现', metadata={'source': '..\\demo\\电力现货市场101问.pdf', 'page': 8})

### Step3 创建向量数据库

In [4]:
# From past experience the embedding model has a large impact on RAG quality;
# OpenAI embeddings are usually the first choice for this step, open-source
# models tend to perform only moderately.
#
# m3e is not a BGE model. HuggingFaceBgeEmbeddings prepends a BGE-specific
# query instruction to every query by default, which would make query vectors
# inconsistent with the document vectors. Blank the instruction so queries and
# documents are embedded the same way.
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="moka-ai/m3e-base",  # use the m3e model for embeddings
    model_kwargs={'device':'cpu'},
    encode_kwargs={'normalize_embeddings': True},
    query_instruction="",
)



RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
d:\miniconda3\envs\transformers\lib\distutils\core.py

In [None]:
# Build the FAISS vector store from the chunks using the embedding model.
vectorstore = FAISS.from_documents(
    documents=docs_after_split,
    embedding=huggingface_embeddings,
)

### Step4 创建RAG

In [None]:
# Create the retriever: similarity search configured with k=3.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [None]:
# Create the prompt: the template has two placeholders, the retrieved
# context and the user's question.
prompt_template = """
请用下面相关文本回答问题，如果不知道答案，就回复不知道，

{context}

Question: {question}

Helpful Answer:
"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)


In [None]:
# Create the Baichuan LLM.
# NOTE(review): no credentials are passed here — presumably the API key is
# picked up from the environment; confirm before running.
llm = BaichuanLLM()

In [None]:
def format_docs(docs):
    """Concatenate the ``page_content`` of each document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The page contents joined by a blank line (``"\\n\\n"``); an empty
        string when ``docs`` is empty.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

# RAG pipeline: the input is piped through the retriever and format_docs to
# fill "context", and passed through unchanged to fill "question"; the
# resulting dict feeds the prompt, then the LLM, then a string output parser.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Run the chain once on a sample question and print the answer.
res = rag_chain.invoke("电力市场与普通商品市场的差异?")
print(res)

电力市场与普通商品市场的差异主要体现在以下几个方面：

1. 无仓储性：电力商品的生产、交割和消费几乎是同时完成的，其交割速度远快于一般商品，因此也不存在一般商品一手交钱一手交货的交易方式。

2. 同质性：电能不带有任何生产者的标识，电能生产者将生产的电能输入电网，消费者从电网中获取电能，无法区分电能的来源。

3. 网络产业特性：电力市场具有网络产业特性，无仓储性的市场供需关系以及整个销售的网络性特征使得电力市场与普通商品市场在交易模式、价格形成机制等方面存在显著差异。


### Step5 初始化反馈函数

In [None]:
import numpy as np
from trulens_eval import Feedback
from trulens_eval.app import App
from trulens_eval.feedback.provider import OpenAI

# Provider whose methods implement the feedback functions defined below.
provider = OpenAI()

# Selector for the context used in feedback; its location is app specific.
context = App.select_context(rag_chain)

# Groundedness: evaluated on the collected context chunks and the app output.
f_groundedness = Feedback(
    provider.groundedness_measure_with_cot_reasons,
    name="Groundedness",
).on(
    context.collect()  # gather the context chunks into a single list
).on_output()

# Answer relevance: evaluated on the overall question and the answer.
f_answer_relevance = Feedback(
    provider.relevance_with_cot_reasons,
    name="Answer Relevance",
).on_input_output()

# Context relevance: evaluated on the question and each context chunk, then
# averaged over the chunks.
f_context_relevance = Feedback(
    provider.context_relevance_with_cot_reasons,
    name="Context Relevance",
).on_input().on(context).aggregate(np.mean)

✅ In Groundedness, input source will be set to __record__.app.first.steps__.context.first.invoke.rets[:].page_content.collect() .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input context will be set to __record__.app.first.steps__.context.first.invoke.rets[:].page_content .


In [None]:
# Wrap the chain in a TruChain recorder registered under the app id
# 'initial_rag', with the three feedback functions attached.
tru_recorder = TruChain(
    rag_chain,
    app_id='initial_rag',
    feedbacks=[f_answer_relevance, f_context_relevance, f_groundedness],
)


In [None]:
# Invoke the chain inside the recorder context; `recording` is read back in a
# later cell via recording.get().
with tru_recorder as recording:
    llm_response = rag_chain.invoke("电力市场与普通商品市场的差异？")

# Show the answer returned by the chain.
display(llm_response)

'电力市场与普通商品市场的差异主要体现在电力商品的自然属性和社会属性上。电力商品具有无仓储性，即电能的生产、交割和消费几乎是同时完成的，其交割速度远快于一般商品，因此不存在一般商品一手交钱一手交货的交易方式。此外，电力商品具有同质性，电能不带有任何生产者的标识，交易规则对电力也适用，不需要任何特殊的交易规则和协议。然而，电力商品具有难以储存的特殊性，这一特点导致供需双方提前达成的交易结果与需求方实时消费的电能量在数量上必然存在偏差，为弥补这种偏差，需要维持实时平衡的特殊的市场规则。'

In [None]:
# Display the leaderboard table for the apps stored in the TruLens database.
tru.get_leaderboard()

Unnamed: 0_level_0,latency,total_cost
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1
Chain1_ChatApplication,8.8,0.0
initial_rag,8.666667,0.0


In [None]:
# Fetch the record captured by the recording context above and display it.
rec = recording.get()
display(rec)

Record(record_id='record_hash_a886e6f9860688478dd2de62dccafff4', app_id='initial_rag', cost=Cost(n_requests=0, n_successful_requests=0, n_classes=0, n_tokens=0, n_stream_chunks=0, n_prompt_tokens=0, n_completion_tokens=0, cost=0.0), perf=Perf(start_time=datetime.datetime(2024, 7, 22, 11, 10, 4, 357161), end_time=datetime.datetime(2024, 7, 22, 11, 10, 24, 482862)), ts=datetime.datetime(2024, 7, 22, 11, 10, 24, 482862), tags='-', meta=None, main_input='电力市场与普通商品市场的差异？', main_output='电力市场与普通商品市场的差异主要体现在电力商品的自然属性和社会属性上。电力商品具有无仓储性，即电能的生产、交割和消费几乎是同时完成的，其交割速度远快于一般商品，因此不存在一般商品一手交钱一手交货的交易方式。此外，电力商品具有同质性，电能不带有任何生产者的标识，交易规则对电力也适用，不需要任何特殊的交易规则和协议。然而，电力商品具有难以储存的特殊性，这一特点导致供需双方提前达成的交易结果与需求方实时消费的电能量在数量上必然存在偏差，为弥补这种偏差，需要维持实时平衡的特殊的市场规则。', main_error=None, calls=[RecordAppCall(call_id='2555e38b-894b-405a-89be-565c3c36f62a', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=2001493412544, init_bindings=None), name='invoke')), RecordApp

In [81]:
# Wait for the feedback results of this record, then print each feedback's
# name and score.
for feedback, feedback_result in rec.wait_for_feedback_results().items():
    print(feedback.name, feedback_result.result)
    if feedback_result.result is None:
        # A missing score would otherwise be reported as a bare "None" with
        # no hint of what went wrong; surface whatever status/error the
        # result object carries so the failure can be diagnosed.
        print(
            f"  status={getattr(feedback_result, 'status', None)}, "
            f"error={getattr(feedback_result, 'error', None)}"
        )

Answer Relevance None
Context Relevance None
Groundedness None
