This is a simple chat model without history

In [12]:
from dotenv import load_dotenv
import os

load_dotenv()

# Set environment
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [13]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
model = ChatOpenAI(model="gpt-3.5-turbo")
embedding = OpenAIEmbeddings()

In [14]:
from langchain_community.vectorstores import Chroma
import document_handler

# https://python.langchain.com/docs/integrations/vectorstores/chroma

chroma_collection_name = "LangChainCollection"

import chromadb
new_client = chromadb.EphemeralClient()

vectorstore_initialize = Chroma.from_documents(
    document_handler.processed_texts,
    embedding=embedding,
    collection_name=chroma_collection_name,
    client=new_client,
)

vectorstore = Chroma(
    client=new_client,
    collection_name=chroma_collection_name,
    embedding_function=embedding,
)
retriever = vectorstore.as_retriever()

In [15]:
docs = vectorstore_initialize.similarity_search("What is Chocolate?")
print(docs)
docs = vectorstore.similarity_search("What is Chocolate?")
print(docs)

[Document(page_content='Chocolate is a sweet, usually brown, food product made from cocoa beans, which are the seeds of the cacao tree. The process of making chocolate involves harvesting and fermenting cacao beans, drying them, and then roasting and grinding them to produce cocoa mass. This cocoa mass is then further processed to extract cocoa solids and cocoa butter.', metadata={'source': 'test_data\\Chocolate.txt'}), Document(page_content='Chocolate is a sweet, usually brown, food product made from cocoa beans, which are the seeds of the cacao tree. The process of making chocolate involves harvesting and fermenting cacao beans, drying them, and then roasting and grinding them to produce cocoa mass. This cocoa mass is then further processed to extract cocoa solids and cocoa butter.', metadata={'source': 'test_data\\Chocolate.txt'}), Document(page_content='Chocolates are often used in confectionery and desserts, and they can be found in a wide range of products, including bars, truffl

In [16]:
from langchain.prompts import (
    ChatPromptTemplate,
)
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [17]:
# Prompt

system_message_template = (
    "You are a helpful assistant who helps answer questions. Answer only the facts based on the context. "
    "Your goal is to provide accurate and relevant answers based on the facts in the provided context. "
    "Make sure to reference the above source documents appropriately and avoid making assumptions or adding personal opinions. "
    "Emphasize the use of facts from the provided source documents. "
    "Instruct the model to use source name for each fact used in the response. "
    "Avoid generating speculative or generalized information. "
    "Use square brackets to reference the source, e.g. [info1.txt]. "
    "Do not combine sources, list each source separately, e.g. [info1.txt][info2.pdf].\n"
    "Here is how you should answer every question:\n"
        "-Look for relevant information in the above source documents to answer the question.\n"
        "-If the source document does not include the exact answer, please respond with relevant information from the data in the response along with citation. You must include a citation to each document referenced.\n"
        "-If you cannot find answer in below sources, respond with I am not sure. Do not provide personal opinions or assumptions and do not include citations.\n"
        "-If you use any information in the context, include the index(starts at 1) of the statement as citation in your answer\n"
    "At the end of your response:\n" 
    "1. Add key words from the paragraphs. \n"
    "2. Suggest a further question that can be answered by the paragraphs provided. \n"
    "3. Create a source list of source name, author name, and a link for each document you cited.\n"
    "{context}"

)

final_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_message_template),
        ("human", "{question}"),
    ]
)


In [18]:
# Set llm chain
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | final_prompt
    | model
    | StrOutputParser()
)

In [19]:
query = "What is chocolate?"
print(rag_chain.invoke(query))

Chocolate is a sweet, usually brown, food product made from cocoa beans, which are the seeds of the cacao tree. The process of making chocolate involves harvesting and fermenting cacao beans, drying them, and then roasting and grinding them to produce cocoa mass. This cocoa mass is then further processed to extract cocoa solids and cocoa butter [info1.txt].


In [20]:
queries = []
import pandas as pd

df = pd.read_csv('./test_data/Questions.csv', delimiter=',')
tuples = [tuple(x) for x in df.values]
dicts = df.to_dict('records')

print(dicts)

questions = list(map(lambda x : x['Question'], dicts))
print(questions)

[{'Question': 'What is Chocolate?', 'Follow up': 'No'}, {'Question': 'Is that a word?', 'Follow up': 'Yes'}, {'Question': 'Write the following words and then continue: g\\thumb|350', 'Follow up': 'No'}, {'Question': 'What is iPhone?', 'Follow up': 'No'}, {'Question': 'What is Task Decomposition?', 'Follow up': ' No'}]
['What is Chocolate?', 'Is that a word?', 'Write the following words and then continue: g\\thumb|350', 'What is iPhone?', 'What is Task Decomposition?']


In [1]:
from trulens_eval import TruChain, Feedback, Tru
from trulens_eval.schema import FeedbackResult
tru = Tru()
tru.reset_database()

🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `Tru` to prevent this.


In [None]:
from trulens_eval.feedback.provider import OpenAI
import numpy as np

# Initialize provider class
openai = OpenAI(model_engine="gpt-3.5-turbo")

# select context to be used in feedback. the location of context is app specific.
from trulens_eval.app import App
context = App.select_context(rag_chain)

from trulens_eval.feedback import Groundedness
grounded = Groundedness(groundedness_provider=OpenAI(model_engine="gpt-3.5-turbo"))
# Define a groundedness feedback function
f_groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons)
    .on(context.collect()) # collect context chunks into a list
    .on_output()
    .aggregate(grounded.grounded_statements_aggregator)
)

# Question/answer relevance between overall question and answer.
f_qa_relevance = Feedback(openai.relevance).on_input_output()
# Question/statement relevance between question and each context chunk.
f_context_relevance = (
    Feedback(openai.qs_relevance)
    .on_input()
    .on(context)
    .aggregate(np.mean)
    )

✅ In groundedness_measure_with_cot_reasons, input source will be set to __record__.app.first.steps.context.first.get_relevant_documents.rets.collect() .
✅ In groundedness_measure_with_cot_reasons, input statement will be set to __record__.main_output or `Select.RecordOutput` .
✅ In relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In qs_relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In qs_relevance, input statement will be set to __record__.app.first.steps.context.first.get_relevant_documents.rets .


In [None]:
tru_recorder = TruChain(rag_chain,
    app_id='Chain1_ChatApplication',
    feedbacks=[f_qa_relevance, f_context_relevance, f_groundedness])

In [None]:
# This is simple one call
with tru_recorder as recording:
    llm_response = rag_chain.invoke("What is Task Decomposition?")

display(llm_response)

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


'I am not sure what you are referring to. Could you please provide more context or specify which domain or field you are asking about?'

In [None]:
tru_recorder_stress_one_question = TruChain(rag_chain,
    app_id='Chain1_One_Q_Multiple',
    feedbacks=[f_qa_relevance, f_context_relevance, f_groundedness])

# This is simple multiple call on same query
with tru_recorder_stress_one_question as recording:
    for i in range(10):
        llm_response = rag_chain.invoke("What is Multiple Question?")
        display(llm_response)

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


'I am sorry, but I couldn\'t find any information about "Multiple Question" in the provided sources. Can you please provide more context or clarify your question?'

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


'I am sorry, but I cannot find any information about "Multiple Question" in the provided sources.'

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


'I\'m sorry, but I\'m not sure what you mean by "Multiple Question." Could you please provide more context or clarify your question?'

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


'I am not sure what you mean by "Multiple Question." Could you please provide more context or clarify your question?'

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


'I am not sure what you mean by "Multiple Question." Can you please provide more information or clarify your question?'

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


'I\'m sorry, but I\'m not sure what you mean by "Multiple Question." Could you please provide more context or clarify your question? Thank you.'

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


'I am sorry, but I cannot find any information about "Multiple Question" in the provided sources.'

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


'I am not sure what you mean by "Multiple Question." Could you please provide more context or clarify your question?'

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


'I am sorry, but I do not have enough information to answer your question. Could you please provide more context or clarify what you mean by "Multiple Question"?'

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


'I am sorry, but I cannot find any information about "Multiple Question" in the provided paragraphs.'

In [None]:
tru_recorder_more_questions = TruChain(rag_chain,
    app_id='Chain1_More_Qs',
    feedbacks=[f_qa_relevance, f_context_relevance, f_groundedness])

# This is simple multiple call on same query
with tru_recorder_more_questions as recording:
    for question in questions:
        llm_response = rag_chain.invoke(question)
        display(llm_response)

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


'Chocolate is a sweet, usually brown, food product made from cocoa beans, which are the seeds of the cacao tree. The process of making chocolate involves harvesting and fermenting cacao beans, drying them, and then roasting and grinding them to produce cocoa mass. This cocoa mass is then further processed to extract cocoa solids and cocoa butter. [info1.txt]'

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


"I'm sorry, but I'm not sure what word you are referring to. Could you please provide more context or clarify your question?"

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


"I'm sorry, but I cannot see the image or the words you mentioned. Could you please provide a description or specify the information you are looking for?"

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


'I am not sure.'

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


'I am not sure.'

In [None]:
# The record of the app invocation can be retrieved from the `recording`:

rec = recording.get() # use .get if only one record
# recs = recording.records # use .records if multiple

display(rec)

In [None]:
# The results of the feedback functions can be rertireved from the record. These
# are `Future` instances (see `concurrent.futures`). You can use `as_completed`
# to wait until they have finished evaluating.

from concurrent.futures import as_completed

for feedback_future in  as_completed(rec.feedback_results):
    feedback, feedback_result = feedback_future.result()
    
    feedback: Feedback
    feedbac_result: FeedbackResult

    display(feedback.name, feedback_result.result)

In [None]:
records, feedback = tru.get_records_and_feedback(app_ids=["Chain1_ChatApplication"])

records.head()

In [None]:
tru.get_leaderboard(app_ids=["Chain1_ChatApplication"])

In [None]:
tru.run_dashboard() # open a local streamlit app to explore

# tru.stop_dashboard() # stop if needed

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.


Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…

Dashboard started at http://192.168.0.220:8501 .


<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>

Exception in thread Thread-14 (listen_to_dashboard):
Traceback (most recent call last):
  File "C:\Users\jiheu\AppData\Local\Programs\Python\Python311\Lib\threading.py", line 1038, in _bootstrap_inner
    self.run()
  File "c:\Users\jiheu\trulens_llm_eval\.venv\Lib\site-packages\ipykernel\ipkernel.py", line 761, in run_closure
    _threading_Thread_run(self)
  File "C:\Users\jiheu\AppData\Local\Programs\Python\Python311\Lib\threading.py", line 975, in run
    self._target(*self._args, **self._kwargs)
  File "c:\Users\jiheu\trulens_llm_eval\.venv\Lib\site-packages\trulens_eval\tru.py", line 634, in listen_to_dashboard
    line = pipe.readline()
           ^^^^^^^^^^^^^^^
UnicodeDecodeError: 'cp949' codec can't decode byte 0xf0 in position 0: illegal multibyte sequence
