This notebook builds a simple chat model (without chat history) and evaluates its answers with DeepEval.

In [1]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment.
load_dotenv()

# Set environment
# os.environ only accepts string values, so assigning the None that os.getenv
# returns for a missing key fails with an opaque TypeError. Check explicitly and
# fail fast with an actionable message instead.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or export it "
        "before running this notebook."
    )
os.environ["OPENAI_API_KEY"] = openai_api_key
# Folder that DeepEval is pointed at for its result files.
os.environ["DEEPEVAL_RESULTS_FOLDER"] = "./data"

In [2]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
# Chat model that generates the answers in the RAG chain.
# NOTE(review): no temperature is passed here, while the hyperparameters cell
# further down reports "temperature": 0.3 — confirm which value is intended.
model = ChatOpenAI(model="gpt-3.5-turbo")
# Embedding model handed to the Chroma vector store.
embedding = OpenAIEmbeddings()

In [3]:
from langchain_community.vectorstores import Chroma
import document_handler

# https://python.langchain.com/docs/integrations/vectorstores/chroma

# Name of the Chroma collection shared by the two vector store handles below.
chroma_collection_name = "LangChainCollection"

import chromadb
# Ephemeral Chroma client (presumably in-memory only — confirm against the chromadb docs).
new_client = chromadb.EphemeralClient()

# Embed the pre-processed documents and add them to the collection.
vectorstore_initialize = Chroma.from_documents(
    document_handler.processed_texts,
    embedding=embedding,
    collection_name=chroma_collection_name,
    client=new_client,
)

# Open the same collection through the same client; this is the handle the
# retriever is built from.
vectorstore = Chroma(
    client=new_client,
    collection_name=chroma_collection_name,
    embedding_function=embedding,
)
retriever = vectorstore.as_retriever()

In [4]:
from langchain.prompts import (
    ChatPromptTemplate,
)
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

Feedback function trulens

In [5]:
# Prompt
# Try changing the prompt values to see the impact

# System prompt for the RAG chain. The pieces are concatenated without any
# separator, so every space and "\n" inside the literals is significant.
# The single "{context}" placeholder at the end is filled with the retrieved text.
system_message_template = "".join(
    [
        # Role and grounding rules.
        "You are a helpful assistant who helps answer questions. Answer only the facts based on the context. ",
        "Your goal is to provide accurate and relevant answers based on the facts in the provided context. ",
        "Make sure to reference the above source documents appropriately and avoid making assumptions or adding personal opinions. ",
        "Emphasize the use of facts from the provided source documents. ",
        "Instruct the model to use source name for each fact used in the response. ",
        "Avoid generating speculative or generalized information. ",
        # Citation format.
        "Use square brackets to reference the source, e.g. [info1.txt]. ",
        "Do not combine sources, list each source separately, e.g. [info1.txt][info2.pdf].\n",
        # Step-by-step answering procedure.
        "Here is how you should answer every question:\n",
        "-Look for relevant information in the above source documents to answer the question.\n",
        "-If the source document does not include the exact answer, please respond with relevant information from the data in the response along with citation. You must include a citation to each document referenced.\n",
        "-If you cannot find answer in below sources, respond with I am not sure. Do not provide personal opinions or assumptions and do not include citations.\n",
        "-If you use any information in the context, include the index(starts at 1) of the statement as citation in your answer\n",
        # Required closing sections of every response.
        "At the end of your response:\n",
        "1. Add key words from the paragraphs. \n",
        "2. Suggest a further question that can be answered by the paragraphs provided. \n",
        "3. Create a source list of source name, author name, and a link for each document you cited.\n",
        # Retrieved documents are injected here.
        "{context}",
    ]
)


# Two-message chat prompt: the system instructions (which embed the retrieved
# {context}) followed by the user's {question}.
prompt_messages = [
    ("system", system_message_template),
    ("human", "{question}"),
]
final_prompt = ChatPromptTemplate.from_messages(prompt_messages)


In [64]:
# DeepEval needs the question, the retrieved document text and the answer for
# every test case, so the model is asked to return all three as one JSON record
# through a function-call schema.
# Try changing the value of these to see the change.

# Fields of the JSON record; all three are marked as required in the schema below.
qa_record_properties = {
    "input": {"type": "string", "description": "The question"},
    "document_body": {
        "type": "string",
        "description": "Information from a document from the vectorstore",
    },
    "output": {"type": "string", "description": "The answer or response to the input(question)"},
}

functions = [
    {
        "name": "QandARecord",
        "description": "The llm's what's needed to provide response",
        "parameters": {
            "type": "object",
            "properties": qa_record_properties,
            "required": ["input", "document_body", "output"],
        },
    }
]

In [65]:
# Set llm chain
def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values joined by blank lines. The merged text is
        also printed, so the retrieved context shows up in the cell output.
    """
    page_texts = [document.page_content for document in docs]
    merged = "\n\n".join(page_texts)
    print(merged)
    return merged

from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
from operator import itemgetter

# Stage 1: take the question from the input dict, retrieve matching documents
# and merge them into one context string.
retrieve_context = itemgetter("question") | retriever | format_docs

# Stage 2: the two variables the prompt template expects.
prompt_inputs = {"context": retrieve_context, "question": itemgetter("question")}

# Stage 3: force the model to answer through the QandARecord function call so
# the reply comes back as structured JSON.
function_calling_model = model.bind(function_call={"name": "QandARecord"}, functions=functions)

# Full pipeline: inputs -> prompt -> model -> parsed function-call arguments.
rag_chain = prompt_inputs | final_prompt | function_calling_model | JsonOutputFunctionsParser()

In [62]:
queries = []
import pandas as pd

# Load the evaluation set; every CSV row becomes one record dict.
df = pd.read_csv('./test_data/Questions.csv', delimiter=',')
tuples = list(map(tuple, df.values))
dicts = df.to_dict('records')

print(dicts)

# Only the 'Question' column is needed to drive the chain.
questions = [record['Question'] for record in dicts]
print(questions)

[{'Question': 'What is Chocolate?', 'Chat_history': "{'history': []}", 'Answer': 'Chocolate is a sweet, usually brown, food product made from cocoa beans, which are the seeds of the cacao tree. The process of making chocolate involves harvesting and fermenting cacao beans, drying them, and then roasting and grinding them to produce cocoa mass. This cocoa mass is then further processed to extract cocoa solids and cocoa butter [info1.txt].'}, {'Question': 'Is that a word?', 'Chat_history': "{'history': [HumanMessage(content='What is Chocolate?'), AIMessage(content='Chocolate is a sweet food product that is usually brown in color and is made from cocoa beans, which are the seeds of the cacao tree. The process of making chocolate involves harvesting and fermenting cacao beans, drying them, and then roasting and grinding them to produce cocoa mass. This cocoa mass is further processed to extract cocoa solids and cocoa butter [info1.txt].')]}", 'Answer': 'Yes, Chocolate is a word.'}, {'Quest

In [9]:
from deepeval.test_case import LLMTestCase
from deepeval.dataset import EvaluationDataset
from deepeval.metrics import HallucinationMetric
from deepeval.metrics import FaithfulnessMetric, ContextualRelevancyMetric, AnswerRelevancyMetric
from deepeval import assert_test
from deepeval import run_test
from deepeval import evaluate

In [10]:
import deepeval
@deepeval.set_hyperparameters
def hyperparameters():
    """Return the run settings handed to DeepEval through ``set_hyperparameters``.

    NOTE(review): the "chunck_size" key is spelled as in the original and kept
    because it is a runtime value; confirm whether "chunk_size" was intended.
    NOTE(review): the ChatOpenAI model in this notebook is created without an
    explicit temperature; confirm that 0.3 reflects the real setting.
    """
    run_settings = {}
    run_settings["temperature"] = 0.3
    run_settings["chunck_size"] = 1000
    run_settings["model"] = "gpt-3.5-turbo"
    run_settings["prompt_template"] = final_prompt
    return run_settings

In [66]:
# One question
query = "What is Chocolate?"
result = rag_chain.invoke({'question': query})
print(result)

# The QandARecord function schema defines the keys "input", "document_body" and
# "output". There is no "reference" key, so the retrieved text has to be read
# from "document_body".
document_body = result["document_body"]

test_case = LLMTestCase(
    # The evaluated input is the fully rendered prompt (instructions, context and question).
    input=final_prompt.format(question=result["input"], context=document_body),
    actual_output=result["output"],
    # Passed as a one-element list, matching how the evaluation loop builds it.
    retrieval_context=[document_body],
)

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


Chocolate is a sweet, usually brown, food product made from cocoa beans, which are the seeds of the cacao tree. The process of making chocolate involves harvesting and fermenting cacao beans, drying them, and then roasting and grinding them to produce cocoa mass. This cocoa mass is then further processed to extract cocoa solids and cocoa butter.

Chocolates are often used in confectionery and desserts, and they can be found in a wide range of products, including bars, truffles, candies, and baked goods. Chocolate is enjoyed worldwide and is often associated with indulgence and celebration. Additionally, cocoa has been linked to various potential health benefits, such as antioxidants and mood enhancement, when consumed in moderation.

Chocolate comes in various forms, such as dark chocolate, milk chocolate, and white chocolate. Dark chocolate contains a higher percentage of cocoa solids and less sugar, giving it a more intense and bitter flavor. Milk chocolate, on the other hand, includ

DeepEval evaluation: to run the evaluation and view the results in the DeepEval UI, export this notebook as a Python file named 'test_[name_of_the_file].py' (for example, test_example.py), then run it from a terminal by following these steps:
1. In the terminal, go to the project root directory
2. python3 -m venv venv
3. source venv/bin/activate
4. deepeval login [your_api_key]
5. Comment out any unnecessary code in the exported file
6. deepeval test run test_example.py

In [None]:
# The hallucination metric determines whether your LLM generates factually correct
# information by comparing the actual_output to the provided context.

# Test-case fields listed for this metric:
# input
# actual_output
# context
# NOTE(review): the test cases built in this notebook pass retrieval_context, not
# context; presumably that is why measuring this metric produced
# "Output or context cannot be None" — confirm.
metric = HallucinationMetric(threshold=0.5)

In [30]:
# The three metrics share the same pass threshold and judge model, and each one
# is asked to report the reason behind its score.
judge_settings = {"threshold": 0.5, "model": "gpt-3.5-turbo", "include_reason": True}

# Answer relevancy: how relevant the answer is to the input.
# (TruLens counterpart: QA Relevance - is the response relevant to the input?)
metricQR = AnswerRelevancyMetric(**judge_settings)

# Faithfulness: whether the output factually aligns with the retrieved contexts.
# (TruLens counterpart: Groundedness - is the response relevant to the context?)
metricF = FaithfulnessMetric(**judge_settings)

# Contextual relevancy: how relevant the retrieved context is to the input.
# (TruLens counterpart: Context Relevance - is the context relevant to the input?)
metricCR = ContextualRelevancyMetric(**judge_settings)


In [None]:
# metric.measure(test_case)
# print(metric.score)
# print(metric.reason)

ValueError: Output or context cannot be None

In [40]:
from deepeval import evaluate
from deepeval import assert_test
from deepeval import track

metrics = [metricQR, metricF, metricCR]

test_cases = []

for question in questions:
    result = rag_chain.invoke({"question": question})
    # result = rag_chain.invoke({"question": "What is iPhone?"})
    print(result)

    # The JSON record is written by the LLM, so any field may be missing.
    # Fall back to the question that was actually asked if "input" is omitted.
    question = result.get("input") or question
    actual_output = result.get("output")
    document_body = result.get("document_body")
    retrieval_context = [document_body]

    print(question)
    print(actual_output)
    print(retrieval_context)

    # The evaluation fails when the output or the context is None, so skip
    # incomplete records instead of aborting the whole run.
    if not actual_output or not document_body:
        print(f"Skipping {question!r}: the model returned no output or no document_body")
        continue

    # Render the prompt exactly once. Previously the rendered prompt was fed back
    # into final_prompt.format() as the question, nesting the prompt inside itself.
    # NOTE: `input` shadows the builtin, but the name is kept because later
    # example cells read `input`, `actual_output` and `retrieval_context`.
    input = final_prompt.format(question=question, context=document_body)

    test_case = LLMTestCase(
        input=input,
        actual_output=actual_output,
        retrieval_context=retrieval_context,
    )

    # print(evaluate(test_case, metrics))
    test_cases.append(test_case)

# Only call evaluate() when at least one complete record was collected.
if test_cases:
    print(evaluate(test_cases, metrics))
else:
    print("No complete test cases to evaluate")

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


Chocolate is a sweet, usually brown, food product made from cocoa beans, which are the seeds of the cacao tree. The process of making chocolate involves harvesting and fermenting cacao beans, drying them, and then roasting and grinding them to produce cocoa mass. This cocoa mass is then further processed to extract cocoa solids and cocoa butter.

Chocolates are often used in confectionery and desserts, and they can be found in a wide range of products, including bars, truffles, candies, and baked goods. Chocolate is enjoyed worldwide and is often associated with indulgence and celebration. Additionally, cocoa has been linked to various potential health benefits, such as antioxidants and mood enhancement, when consumed in moderation.

Chocolate comes in various forms, such as dark chocolate, milk chocolate, and white chocolate. Dark chocolate contains a higher percentage of cocoa solids and less sugar, giving it a more intense and bitter flavor. Milk chocolate, on the other hand, includ

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


Chocolate is a sweet, usually brown, food product made from cocoa beans, which are the seeds of the cacao tree. The process of making chocolate involves harvesting and fermenting cacao beans, drying them, and then roasting and grinding them to produce cocoa mass. This cocoa mass is then further processed to extract cocoa solids and cocoa butter.

Chocolates are often used in confectionery and desserts, and they can be found in a wide range of products, including bars, truffles, candies, and baked goods. Chocolate is enjoyed worldwide and is often associated with indulgence and celebration. Additionally, cocoa has been linked to various potential health benefits, such as antioxidants and mood enhancement, when consumed in moderation.

Chocolate comes in various forms, such as dark chocolate, milk chocolate, and white chocolate. Dark chocolate contains a higher percentage of cocoa solids and less sugar, giving it a more intense and bitter flavor. Milk chocolate, on the other hand, includ

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


Chocolate is a sweet, usually brown, food product made from cocoa beans, which are the seeds of the cacao tree. The process of making chocolate involves harvesting and fermenting cacao beans, drying them, and then roasting and grinding them to produce cocoa mass. This cocoa mass is then further processed to extract cocoa solids and cocoa butter.

Chocolates are often used in confectionery and desserts, and they can be found in a wide range of products, including bars, truffles, candies, and baked goods. Chocolate is enjoyed worldwide and is often associated with indulgence and celebration. Additionally, cocoa has been linked to various potential health benefits, such as antioxidants and mood enhancement, when consumed in moderation.

Chocolate comes in various forms, such as dark chocolate, milk chocolate, and white chocolate. Dark chocolate contains a higher percentage of cocoa solids and less sugar, giving it a more intense and bitter flavor. Milk chocolate, on the other hand, includ

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


Chocolate is a sweet, usually brown, food product made from cocoa beans, which are the seeds of the cacao tree. The process of making chocolate involves harvesting and fermenting cacao beans, drying them, and then roasting and grinding them to produce cocoa mass. This cocoa mass is then further processed to extract cocoa solids and cocoa butter.

Chocolates are often used in confectionery and desserts, and they can be found in a wide range of products, including bars, truffles, candies, and baked goods. Chocolate is enjoyed worldwide and is often associated with indulgence and celebration. Additionally, cocoa has been linked to various potential health benefits, such as antioxidants and mood enhancement, when consumed in moderation.

Chocolate comes in various forms, such as dark chocolate, milk chocolate, and white chocolate. Dark chocolate contains a higher percentage of cocoa solids and less sugar, giving it a more intense and bitter flavor. Milk chocolate, on the other hand, includ

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


Chocolate is a sweet, usually brown, food product made from cocoa beans, which are the seeds of the cacao tree. The process of making chocolate involves harvesting and fermenting cacao beans, drying them, and then roasting and grinding them to produce cocoa mass. This cocoa mass is then further processed to extract cocoa solids and cocoa butter.

Chocolate comes in various forms, such as dark chocolate, milk chocolate, and white chocolate. Dark chocolate contains a higher percentage of cocoa solids and less sugar, giving it a more intense and bitter flavor. Milk chocolate, on the other hand, includes milk solids in addition to cocoa, creating a sweeter and creamier taste. White chocolate consists mainly of cocoa butter, sugar, and milk solids but lacks cocoa solids.

Chocolates are often used in confectionery and desserts, and they can be found in a wide range of products, including bars, truffles, candies, and baked goods. Chocolate is enjoyed worldwide and is often associated with in

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Here are some more examples. Comment these out when you run the evaluation.

In [None]:
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams

# Custom GEval metric that scores how well the actual output summarises the input.
summarization_metric = GEval(
    name="Summarization",
    criteria="Summarization - determine if the actual output is an accurate and concise summarization of the input.",
    # NOTE: you can only provide either criteria or evaluation_steps, and not both
    # NOTE(review): both criteria and evaluation_steps are passed here, which
    # contradicts the note above — confirm which one should be kept.
    evaluation_steps=["Check whether the 'actual output' has omitted any detail from 'input'"],
    # Test-case fields the metric is given.
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
)

In [None]:
from deepeval.metrics import SummarizationMetric

# Closed questions passed to the summarization metric as assessment_questions.
summary_questions = [
    "Is the inclusion score based on a percentage of 'yes' answers?",
    "Does the score ensure the summary's accuracy with the source?",
    "Does a higher score mean a more comprehensive summary?"
]

# `input` and `actual_output` are the values left behind by the evaluation loop
# cell, so that cell has to be run before this one.
test_case = LLMTestCase(input=input, actual_output=actual_output)
metric = SummarizationMetric(
    threshold=0.5,
    model="gpt-3.5-turbo",
    assessment_questions=summary_questions,
)

# Score the test case and show the result.
metric.measure(test_case)
print(metric.score)

In [None]:
from deepeval.metrics.ragas import RagasMetric

# Ragas metric example. This cell only builds the metric and a test case; it
# does not call metric.measure().
metric = RagasMetric(threshold=0.5, model="gpt-3.5-turbo")
# `actual_output` and `retrieval_context` are the values left behind by the
# evaluation loop cell, so that cell has to be run first.
test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    actual_output=actual_output,
    retrieval_context=retrieval_context
)


In [None]:
# pip install Dbias

from deepeval.metrics import BiasMetric
# Bias metric example with a hard-coded question/answer pair. This cell only
# builds the metric and a test case; it does not call metric.measure().
metric = BiasMetric(threshold=0.5)
test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    # Replace this with the actual output from your LLM application
    actual_output = "We offer a 30-day full refund at no extra cost."
)


In [None]:
# pip install detoxify
from deepeval.metrics import ToxicityMetric
# Toxicity metric example with a hard-coded question/answer pair.
metric = ToxicityMetric(threshold=0.5)
test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    # Replace this with the actual output from your LLM application
    actual_output = "We offer a 30-day full refund at no extra cost."
)

# Score the test case and print the resulting score.
metric.measure(test_case)
print(metric.score)



In [None]:
# Score with the GEval summarization metric and show both the score and the reason.
# NOTE(review): `test_case` is whichever test case was assigned most recently, so
# the result depends on which cell ran last — confirm the intended test case.
summarization_metric.measure(test_case)
print(summarization_metric.score)
print(summarization_metric.reason)

0.4
The 'actual output' misses several key requirements outlined in the 'input' instructions. It does not add keywords from the paragraph, suggest a further question that could be answered by the paragraph provided, or create a source list with source name, author name, and a link for each document cited. Furthermore, the citation format '[info1.txt]' does not match the instruction to use square brackets with the source name and does not provide a separate listing for each source as no sources were combined.


In [None]:
from deepeval.metrics import JudgementalGPT
from deepeval.test_case import LLMTestCaseParams
# `Languages` is used below but was never imported, which raised a NameError.
from deepeval.types import Languages

# JudgementalGPT metric that judges whether the actual output produces valid JSON.
code_correctness_metric = JudgementalGPT(
    name="Code Correctness",
    criteria="Code Correctness - determine whether the code in the 'actual output' produces a valid JSON.",
    # Only the actual output is given to the judge.
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
    language=Languages.SPANISH,
    threshold=0.5,
)

In [None]:
from deepeval.metrics.ragas import ConcisenessMetric
from deepeval.metrics.ragas import CorrectnessMetric
from deepeval.metrics.ragas import CoherenceMetric
from deepeval.metrics.ragas import MaliciousnessMetric



In [None]:
import deepeval

# At the end of your LLM call
# Example deepeval.track() call; every argument below is a placeholder value.
tracking = deepeval.track(
    event_name="Chatbot",
    model="gpt-3.5-turbo",
    input="input",
    output="output",
    distinct_id="a user Id",
    conversation_id="a conversation thread Id",
    retrieval_context=["..."],
    completion_time=8.23,
    token_usage=134,
    token_cost=0.23,
    additional_data={"example": "example"},
    # NOTE(review): presumably suppresses tracking errors and sends the event
    # from a background thread — confirm against the DeepEval docs.
    fail_silently=True,
    run_on_background_thread=True
)

# The recorded output for this cell is None.
print (tracking)

None
