## Import libraries

In [20]:
import openai
from langsmith.wrappers import wrap_openai
from langsmith import traceable
import os
from langchain_pinecone import PineconeVectorStore
import time
from pinecone import Pinecone, ServerlessSpec
from langchain_openai import OpenAIEmbeddings
from langchain import hub
from langchain_openai import ChatOpenAI
from langsmith.evaluation import evaluate

# Grade prompt
grade_prompt_answer_accuracy = prompt = hub.pull("langchain-ai/rag-answer-vs-reference")

In [2]:
os.environ["OPENAI_API_KEY"]=os.getenv("openai_key")
os.environ["LANGCHAIN_TRACING_V2"]="true"
os.environ["LANGCHAIN_ENDPOINT"]="https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"]=os.getenv("langsmith_api_key")
pinecone_api_key=os.environ.get("PINECONE_API_KEY")
index_name="test-cs-240711"  


## Dataset Creation

In [34]:
from langsmith import Client
from langsmith.evaluation import evaluate

client = Client()

# Define dataset: these are your test cases
dataset_name = "test-cs-240711-updated"
dataset = client.create_dataset(dataset_name, description="A sample dataset for hr system in dialog 360")
client.create_examples(
    inputs=[
        {"input_question": "What specific topics does the Invoicing Support team offer support for?"},
        {"input_question": "What should a CS agent do before escalating an issue to Invoicing Support if messaging is impacted?"},
        {"input_question": "What is the purpose of the Synchronize WABA feature in the WABA section?"},
        {"input_question": "What should you do if a client encounters a 402 error when sending a message in WABA?"},
        {"input_question": "What is the role of a Business Solutions Provider (BSP) in accessing the WhatsApp API?"},
        {"input_question": "What does On-Premise refer to in the context of WhatsApp API setup?"},
        {"input_question": "What does the abbreviation WABA stand for, and what does it represent?"},
        {"input_question": "What is the first step in migrating from an On-Premise setup to a 360Dialog hosted solution?"},
        {"input_question": "Why might disabling two-factor authentication be helpful during the migration process?"},
        {"input_question": "What does 360Dialog do after setting up the new WhatsApp Business API client?"},
    ],
    outputs=[
        {"output_answer": "The Invoicing Support team provides assistance with direct billing invoice disputes, partner paid invoice disputes, refunds over 50 EUR/USD for subscription fees, increasing billing for Multiconnect setups, and payment enforcement topics."},
        {"output_answer": "A CS agent should first resolve any messaging issues before escalating to Invoicing Support, except if a number is disabled due to non-payment."},
        {"output_answer": "The Synchronize WABA feature is used to pull the latest data for WABA."},
        {"output_answer": " If a client encounters a 402 error, the solution is to set the payment method for WABA."},
        {"output_answer": "A Business Solutions Provider (BSP) like 360dialog allows access to the WhatsApp API, with about 80 such providers available globally."},
        {"output_answer": "On-Premise refers to a setup where the technology stack for the WhatsApp API is hosted either by 360dialog on their infrastructure or by the client on their own infrastructure."},
        {"output_answer": "WABA stands for WhatsApp Business Account, representing the account used for business communication on WhatsApp."},
        {"output_answer": "The first step is to back up the settings data from the current WhatsApp Business API client"},
        {"output_answer": "Disabling two-factor authentication is helpful if the authentication code is forgotten and re-registration is needed, although re-registration is not required for a smooth migration."},
        {"output_answer": "After setting up the new client, 360Dialog logs into the new instance, performs a restore of the settings, and conducts a health check to ensure everything is working properly."},
    ],
    dataset_id=dataset.id,
)


LangSmithConflictError: Conflict for /datasets. HTTPError('409 Client Error: Conflict for url: https://api.smith.langchain.com/datasets', '{"detail":"Dataset with this name already exists."}')

In [35]:
dataset_name

'test-cs-240711-updated'

## Get the cost and timeline from API

https://api.smith.langchain.com/public/b37ca9b1-60cd-4a2a-817e-3c4e4443fdc0/run?exclude_s3_stored_attributes=false


## RAG evaluation

In [25]:
dataset_name = "test-cs-240711"
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [26]:
pinecone_api_key

'34bd534b-0900-43bd-b9a3-846b3b8fd823'

In [27]:
pc = Pinecone(api_key=pinecone_api_key)
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)
retriever = vectorstore.as_retriever()

In [28]:
### RAG bot

import openai
from langsmith import traceable
from langsmith.wrappers import wrap_openai

class RagBot:

    def __init__(self, retriever, model: str = "gpt-4-0125-preview"):
        self._retriever = retriever
        # Wrapping the client instruments the LLM
        self._client = wrap_openai(openai.Client())
        self._model = model

    @traceable()
    def retrieve_docs(self, question):
        return self._retriever.invoke(question)

    @traceable()
    def invoke_llm(self, question, docs):
        response = self._client.chat.completions.create(
            model=self._model,
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful AI code assistant with expertise in cs related queries."
                    " Use the following docs to produce a concise code solution to the user question.\n\n"
                    f"## Docs\n\n{docs}",
                },
                {"role": "user", "content": question},
            ],
        )

        # Evaluators will expect "answer" and "contexts"
        return {
            "answer": response.choices[0].message.content,
            "contexts": [str(doc) for doc in docs],
        }

    @traceable()
    def get_answer(self, question: str):
        docs = self.retrieve_docs(question)
        return self.invoke_llm(question, docs)

rag_bot = RagBot(retriever)

In [29]:
response = rag_bot.get_answer("What specific topics does the Invoicing Support team offer support for?")
response["answer"][:150]

'The Invoicing Support team offers support for the following specific topics:\n\n1. Direct Billing Invoice disputes (overcharging, undercharging, client '

In [30]:
def predict_rag_answer(example: dict):
    """Use this for answer evaluation"""
    response = rag_bot.get_answer(example["input_question"])
    return {"answer": response["answer"]}

def predict_rag_answer_with_context(example: dict):
    """Use this for evaluation of retrieved documents and hallucinations"""
    response = rag_bot.get_answer(example["input_question"])
    return {"answer": response["answer"], "contexts": response["contexts"]}

## Response vs reference answer (Answer/response correctness)


In [48]:
print(grade_prompt_answer_accuracy[0].prompt.template)

You are a teacher grading a quiz. 

You will be given a QUESTION, the GROUND TRUTH (correct) ANSWER, and the STUDENT ANSWER. 

Here is the grade criteria to follow:
(1) Grade the student answers based ONLY on their factual accuracy relative to the ground truth answer. 
(2) Ensure that the student answer does not contain any conflicting statements.
(3) It is OK if the student answer contains more information than the ground truth answer, as long as it is factually accurate relative to the  ground truth answer.

Score:
A score of 1 means that the student's answer meets all of the criteria. This is the highest (best) score. 
A score of 0 means that the student's answer does not meet all of the criteria. This is the lowest possible score you can give.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 

Avoid simply stating the correct answer at the outset.


In [16]:


def answer_evaluator(run, example) -> dict:
    """
    A simple evaluator for RAG answer accuracy
    """

    # Get question, ground truth answer, RAG chain answer
    input_question = example.inputs["input_question"]
    reference = example.outputs["output_answer"]
    prediction = run.outputs["answer"]

    # LLM grader
    llm = ChatOpenAI(model="gpt-4-turbo", temperature=0)

    # Structured prompt
    answer_grader = grade_prompt_answer_accuracy | llm

    # Run evaluator
    score = answer_grader.invoke({"question": input_question,
                                  "correct_answer": reference,
                                  "student_answer": prediction})
    score = score["Score"]

    return {"key": "answer_v_reference_score", "score": score}

In [None]:
experiment_results = evaluate(
    predict_rag_answer,
    data=dataset_name,
    evaluators=[answer_evaluator],
    experiment_prefix="cs-rag-answer-v-reference",
    metadata={"version": "cs context, gpt-4-0125-preview"},
)

## Response vs input (response relevance)


In [56]:
grade_prompt_answer_helpfulness = prompt = hub.pull("langchain-ai/rag-answer-helpfulness")


In [57]:
print(grade_prompt_answer_helpfulness[0].prompt.template)

You are a teacher grading a quiz. 

You will be given a QUESTION and a STUDENT ANSWER. 

Here is the grade criteria to follow:
(1) Ensure the STUDENT ANSWER is concise and relevant to the QUESTION
(2) Ensure the STUDENT ANSWER helps to answer the QUESTION

Score:
A score of 1 means that the student's answer meets all of the criteria. This is the highest (best) score. 
A score of 0 means that the student's answer does not meet all of the criteria. This is the lowest possible score you can give.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 

Avoid simply stating the correct answer at the outset.


In [31]:
# Grade prompt

def answer_helpfulness_evaluator(run, example) -> dict:
    """
    A simple evaluator for RAG answer helpfulness
    """

    # Get question, ground truth answer, RAG chain answer
    input_question = example.inputs["input_question"]
    prediction = run.outputs["answer"]

    # LLM grader
    llm = ChatOpenAI(model="gpt-4-turbo", temperature=0)

    # Structured prompt
    answer_grader = grade_prompt_answer_helpfulness | llm

    # Run evaluator
    score = answer_grader.invoke({"question": input_question,
                                  "student_answer": prediction})
    score = score["Score"]

    return {"key": "answer_helpfulness_score", "score": score}

In [36]:
dataset_name

'test-cs-240711-updated'

In [50]:
experiment_results = evaluate(
    predict_rag_answer,
    data=dataset_name,
    evaluators=[answer_helpfulness_evaluator],
    experiment_prefix="cs-rag-answer-helpfulness",
    metadata={"version": "cs context, gpt-4-0125-preview"},
    blocking=False
)

View the evaluation results for experiment: 'cs-rag-answer-helpfulness-ec989b02' at:
https://smith.langchain.com/o/9bdcd0b2-8b47-501f-b66b-d379cce39a32/datasets/b30a204e-c905-40d8-a9a9-530289e86bcf/compare?selectedSessions=d0194078-619f-4a4c-b003-b0e2c4207e41




0it [00:00, ?it/s]

10it [00:27,  2.75s/it]


In [55]:
experiment_results
for i, result in enumerate(experiment_results):
    print(result['evaluation_results'])

{'results': [EvaluationResult(key='answer_helpfulness_score', score=1, value=None, comment=None, correction=None, evaluator_info={}, feedback_config=None, source_run_id=UUID('458fc167-4dcf-4f6d-b5fe-7ff38390fd6e'), target_run_id=None)]}
{'results': [EvaluationResult(key='answer_helpfulness_score', score=1, value=None, comment=None, correction=None, evaluator_info={}, feedback_config=None, source_run_id=UUID('83b18c3c-6436-46d2-be8d-955cd9cdfbc6'), target_run_id=None)]}
{'results': [EvaluationResult(key='answer_helpfulness_score', score=1, value=None, comment=None, correction=None, evaluator_info={}, feedback_config=None, source_run_id=UUID('ba76ea1a-31a6-4276-9742-5fa584851cf1'), target_run_id=None)]}
{'results': [EvaluationResult(key='answer_helpfulness_score', score=1, value=None, comment=None, correction=None, evaluator_info={}, feedback_config=None, source_run_id=UUID('53b1fde4-d39a-40e5-b36d-44605d67c4cd'), target_run_id=None)]}
{'results': [EvaluationResult(key='answer_helpfulnes

## Response vs retrieved docs (Groundidness/faithfulness)


In [59]:
grade_prompt_hallucinations = prompt = hub.pull("langchain-ai/rag-answer-hallucination")
print(grade_prompt_hallucinations[0].prompt.template)

You are a teacher grading a quiz. 

You will be given FACTS  and a STUDENT ANSWER. 

Here is the grade criteria to follow:
(1) Ensure the STUDENT ANSWER is grounded in the FACTS. 
(2) Ensure the STUDENT ANSWER does not contain "hallucinated" information outside the scope of the FACTS.

Score:
A score of 1 means that the student's answer meets all of the criteria. This is the highest (best) score. 
A score of 0 means that the student's answer does not meet all of the criteria. This is the lowest possible score you can give.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 

Avoid simply stating the correct answer at the outset.


In [60]:

def answer_hallucination_evaluator(run, example) -> dict:
    """
    A simple evaluator for generation hallucination
    """

    # RAG inputs
    input_question = example.inputs["input_question"]
    contexts = run.outputs["contexts"]

    # RAG answer
    prediction = run.outputs["answer"]

    # LLM grader
    llm = ChatOpenAI(model="gpt-4-turbo", temperature=0)

    # Structured prompt
    answer_grader = grade_prompt_hallucinations | llm

    # Get score
    score = answer_grader.invoke({"documents": contexts,
                                  "student_answer": prediction})
    score = score["Score"]

    return {"key": "answer_hallucination", "score": score}

In [61]:
experiment_results = evaluate(
    predict_rag_answer_with_context,
    data=dataset_name,
    evaluators=[answer_hallucination_evaluator],
    experiment_prefix="rag-answer-hallucination",
    metadata={"version": "cv context, gpt-4-0125-preview"},
)

View the evaluation results for experiment: 'rag-answer-hallucination-f2d2cbc3' at:
https://smith.langchain.com/o/9bdcd0b2-8b47-501f-b66b-d379cce39a32/datasets/b30a204e-c905-40d8-a9a9-530289e86bcf/compare?selectedSessions=5682d77d-d656-4ff5-b293-4e5e89d0ec52




10it [00:29,  2.99s/it]


In [62]:
for i, result in enumerate(experiment_results):
    print(result['evaluation_results'])

{'results': [EvaluationResult(key='answer_hallucination', score=1, value=None, comment=None, correction=None, evaluator_info={}, feedback_config=None, source_run_id=UUID('20a4c8ac-cba4-4f4e-ad04-8383f84384ec'), target_run_id=None)]}
{'results': [EvaluationResult(key='answer_hallucination', score=1, value=None, comment=None, correction=None, evaluator_info={}, feedback_config=None, source_run_id=UUID('ac25beab-138e-49bd-b79b-42be30ced29d'), target_run_id=None)]}
{'results': [EvaluationResult(key='answer_hallucination', score=1, value=None, comment=None, correction=None, evaluator_info={}, feedback_config=None, source_run_id=UUID('c3389fe2-b1f6-4b6f-acfc-27f64053d677'), target_run_id=None)]}
{'results': [EvaluationResult(key='answer_hallucination', score=1, value=None, comment=None, correction=None, evaluator_info={}, feedback_config=None, source_run_id=UUID('392d0a2d-a4bb-4a2f-8cf3-50e3381858ce'), target_run_id=None)]}
{'results': [EvaluationResult(key='answer_hallucination', score=1, v

## Retrieved docs vs input (context relevance)

In [63]:
grade_prompt_doc_relevance = hub.pull("langchain-ai/rag-document-relevance")
print(grade_prompt_doc_relevance[0].prompt.template)

You are a teacher grading a quiz. 

You will be given a QUESTION and a set of FACTS provided by the student. 

Here is the grade criteria to follow:
(1) You goal is to identify FACTS that are completely unrelated to the QUESTION
(2) If the facts contain ANY keywords or semantic meaning related to the question, consider them relevant
(3) It is OK if the facts have SOME information that is unrelated to the question (2) is met 

Score:
A score of 1 means that the FACT contain ANY keywords or semantic meaning related to the QUESTION and are therefore relevant. This is the highest (best) score. 
A score of 0 means that the FACTS are completely unrelated to the QUESTION. This is the lowest possible score you can give.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 

Avoid simply stating the correct answer at the outset.


In [64]:
def docs_relevance_evaluator(run, example) -> dict:
    """
    A simple evaluator for document relevance
    """

    # RAG inputs
    input_question = example.inputs["input_question"]
    contexts = run.outputs["contexts"]

    # LLM grader
    llm = ChatOpenAI(model="gpt-4-turbo", temperature=0)

    # Structured prompt
    answer_grader = grade_prompt_doc_relevance | llm

    # Get score
    score = answer_grader.invoke({"question":input_question,
                                  "documents":contexts})
    score = score["Score"]

    return {"key": "document_relevance", "score": score}

In [65]:
experiment_results = evaluate(
    predict_rag_answer_with_context,
    data=dataset_name,
    evaluators=[docs_relevance_evaluator],
    experiment_prefix="rag-doc-relevance",
    metadata={"version": "cv context, gpt-4-0125-preview"},
)

View the evaluation results for experiment: 'rag-doc-relevance-3fba8dfe' at:
https://smith.langchain.com/o/9bdcd0b2-8b47-501f-b66b-d379cce39a32/datasets/b30a204e-c905-40d8-a9a9-530289e86bcf/compare?selectedSessions=25222e7b-d46a-43dc-9d21-75badfbfae7f




10it [00:35,  3.52s/it]
