## Evalute LLM output

### Built-in evaluators

In [11]:
### Evaluate LLM by LLM
# before start fill env variables .env file:
# LANGCHAIN_API_KEY="put_here_your_langchain(langsmith)_api_token"
# OPENAI_API_KEY="put_here_your_openai_token"
# HUGGINGFACE_API_TOKEN="put_here_your_huggingface_token"
# to use OpenAI API you need to add billing details https://platform.openai.com/settings/organization/billing/overview
# for langchain token remember to add read permissions associated with token
from dotenv import load_dotenv
from langchain.chains import LLMChain
from langchain.prompts.prompt import PromptTemplate
from langchain.llms import OpenAI
from langchain_openai import ChatOpenAI
from langchain.evaluation import load_evaluator

#load dotenv (API key from .env)
load_dotenv()

True

In [12]:
llm = ChatOpenAI(model_name="gpt-4o")
template = """
You are base of knowledge about star wars. Respond to question below with only name without any additional text.
{input}
"""
prompt_template = PromptTemplate.from_template(template=template)
chain = LLMChain(llm=llm, prompt=prompt_template)
prediction = chain.predict(input="What is the capital of star wars Sith Empire?")

evaluator = load_evaluator("labeled_score_string", llm=ChatOpenAI(model="gpt-4o"))
eval_result = evaluator.evaluate_strings(
    prediction=prediction,
    reference="Dromund Kaas",
    input="What is the capital of star wars Sith Empire?",
)
print(eval_result)



{'reasoning': 'The response provided by the AI assistant is "Dromund Kaas," which is indeed the correct answer to the user\'s question about the capital of the Sith Empire in the Star Wars universe. \n\n- **Helpfulness**: The response is helpful as it directly answers the user\'s question.\n- **Relevance**: The response is relevant because it correctly identifies the capital of the Sith Empire.\n- **Correctness**: The information is accurate and factual.\n- **Depth**: While the response is correct, it is very brief and lacks additional context or details that could provide more depth, such as mentioning its importance or role in the Star Wars lore.\n\nGiven these considerations, the response is accurate but lacks depth. Therefore, it merits a good but not perfect score.\n\nRating: [[8]]', 'score': 8}


### Evaluate LLM by LLM

In [2]:
model = OpenAI(temperature=0)
template = """You are an expert in grading answers.
You are grading the following question:
{query}
Here is the correct expected answer:
{answer}
You are grading the following predicted answer:
{result}
What grade do you give from 0 to 5, where 0 is the lowest for low similarity and 5 is for the high similarity?
"""

prompt = PromptTemplate(
    input_variables=["query", "answer", "result"], template=template
)

  warn_deprecated(


In [3]:
context_examples = [
    {
        "question": "Why people don't brief underwater?",
        "context": "Because people don't have gills",
    },
    {
        "question": "Why the sky is blue?",
        "context": "Sky isn't blue. Its just optical effect related to sun rays coming to eye through atmosphere and interpretation in our mind.",
    },
    {
        "question": "What is in my pocket?",
        "context": "",
    },
]
prompt_qa = "Answer the question based on the  context\nContext:{context}\nQuestion:{question}\nAnswer:"
template = PromptTemplate(input_variables=["context", "question"], template=prompt_qa)
qa_chain = LLMChain(llm=model, prompt=template)
predictions = qa_chain.apply(context_examples)
predictions

[{'text': " People don't breathe underwater because they do not have gills, which are necessary for extracting oxygen from water."},
 {'text': " The sky appears blue due to an optical effect caused by the sun's rays passing through the Earth's atmosphere and our brain's interpretation of this phenomenon. In reality, the sky does not have a color."},
 {'text': ' I am not able to answer that question as I do not have access to your pocket.'}]

In [4]:
from langchain.evaluation.qa import ContextQAEvalChain

eval_chain = ContextQAEvalChain.from_llm(model)
graded_outputs = eval_chain.evaluate(
    context_examples, predictions, question_key="question", prediction_key="text"
)
print(graded_outputs)

[{'text': ' CORRECT'}, {'text': ' CORRECT'}, {'text': ' CORRECT'}]


### Evaluation criteria

In [5]:
import os
from langsmith import Client
from langchain_openai import ChatOpenAI

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "GEMMA EVALUATIONs"
os.environ["LANGCHAIN_API_KEY"] = "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"

In [6]:
client = Client()
# Inputs are provided to your model, so it know what to generate
dataset_inputs = [
    "Why people don't have 3 legs?",
    "Why people are not flying?",
]

#use 1st LLM for generating texts
llm_test= ChatOpenAI(model="gpt-3.5-turbo", temperature=0.1,max_tokens=256)
# llm_gen = ChatOpenAI(base_url="https://api-inference.huggingface.com/v1", model="google/gemma-2b-it",temperature=0.1,max_tokens=256)
# and 2nd to evaluate different criteria of response generated with 1st LLM
llm_gen = ChatOpenAI(model="gpt-4o", temperature=0.1,max_tokens=256)

In [7]:

dataset_outputs = [
    {"result": llm_test.invoke(50*dataset_inputs[0])},
    {"result": llm_test.invoke(50*dataset_inputs[1])},
]
print(dataset_outputs)

[{'result': AIMessage(content='Humans do not have three legs because we are bipedal creatures, meaning we walk on two legs. Our bodies have evolved over millions of years to be efficient at walking and running on two legs, and having a third leg would not provide any significant advantage. Additionally, having three legs would likely be cumbersome and hinder our ability to move effectively.', response_metadata={'token_usage': {'completion_tokens': 69, 'prompt_tokens': 457, 'total_tokens': 526}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-b6b2fad6-73d5-45ec-8c9b-ae7f16445b90-0')}, {'result': AIMessage(content='There could be several reasons why people are not flying:\n\n1. Fear of COVID-19: Many people are hesitant to fly due to concerns about contracting the virus while traveling.\n\n2. Travel restrictions: Some countries have implemented travel restrictions or quarantine requirements, making it difficult for people to 

In [8]:
import uuid

dataset_name = "existential questions run:" + uuid.uuid4().__str__() #need to modify this value on every run of notebook

# Storing inputs in a dataset lets us
# run chains and LLMs over a shared set of examples.
dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="evaluate LLM output",
)
client.create_examples(
    inputs=[{"question": q} for q in dataset_inputs],
    outputs=dataset_outputs,
    dataset_id=dataset.id,
)

In [9]:
from langchain.smith import RunEvalConfig
from langsmith.evaluation import EvaluationResult, run_evaluator

@run_evaluator
def custom_evaluator(run, example) -> EvaluationResult:
    """
    checks if output contains specific word
    :param run: 
    :param example: 
    :return: int
    """
    generated = run.outputs["generations"][0][0]["text"]
    if 'human' in generated:
        score = 1
    else:
        score = 0
    return EvaluationResult(key="result", score=score)

In [10]:
eval_config = RunEvalConfig(
    custom_evaluators=[custom_evaluator],
    evaluators=[
        "criteria",
        "qa",         #directly grade a response as "correct" or "incorrect" based on the reference answer
        "context_qa", #use the provided reference context in determining correctness
        "cot_qa",     #use chain of thought "reasoning" before determining a final verdict. This tends to lead to responses that better correlate with human labels
        RunEvalConfig.Criteria("insensitivity"),
        RunEvalConfig.Criteria("relevance"),
        RunEvalConfig.Criteria("helpfulness"),
        RunEvalConfig.Criteria("maliciousness"),
        RunEvalConfig.Criteria("harmfulness"),
        RunEvalConfig.Criteria("coherence"),
        RunEvalConfig.Criteria("conciseness"),
        RunEvalConfig.Criteria("misogyny"),
        RunEvalConfig.Criteria("criminality"),
        RunEvalConfig.Criteria("controversiality"),
        RunEvalConfig.Criteria( #custom defined criteria related to specific problem we want to solve and problems detected in output
            {
                "valuation": "Do texts contain valuation of subject, like glorifying some characteristic or judging someone?"
                " Respond Y if they do, N if they're entirely objective and stick to the facts without additions."
            }
        )
    ],
)

In [11]:
#in case of error ‘model is currently loading;’, wait couple of minutes and run notebook again
scores = client.run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=llm_gen,
    evaluation=eval_config,
    verbose=True,
    project_name=dataset_name,
)
print(scores)

View the evaluation results for project 'existential questions run:99803b99-df87-43b5-8305-22f37b26547d' at:
https://smith.langchain.com/o/3e1f981e-76ef-5491-9a42-e33f3bdfeba4/datasets/e6a6c9c0-db5c-4bc8-a94f-069b21023138/compare?selectedSessions=3328bc53-19b2-4846-91b6-968bce63f2a4

View all tests for Dataset existential questions run:99803b99-df87-43b5-8305-22f37b26547d at:
https://smith.langchain.com/o/3e1f981e-76ef-5491-9a42-e33f3bdfeba4/datasets/e6a6c9c0-db5c-4bc8-a94f-069b21023138
[------------------------------------------------->] 2/2

Unnamed: 0,feedback.helpfulness,feedback.correctness,feedback.Contextual Accuracy,feedback.COT Contextual Accuracy,feedback.insensitivity,feedback.relevance,feedback.maliciousness,feedback.harmfulness,feedback.coherence,feedback.conciseness,feedback.misogyny,feedback.criminality,feedback.controversiality,feedback.valuation,feedback.result,error,execution_time,run_id
count,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,2.0,2
unique,,,,,,,,,,,,,,,,0.0,,2
top,,,,,,,,,,,,,,,,,,a20298ef-4326-404b-b3dd-f9cd29d7efec
freq,,,,,,,,,,,,,,,,,,1
mean,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.5,,6.865795,
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.707107,,1.192045,
min,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,6.022892,
25%,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.25,,6.444343,
50%,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.5,,6.865795,
75%,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.75,0.0,0.0,0.0,0.0,0.75,,7.287247,


{'project_name': 'existential questions run:99803b99-df87-43b5-8305-22f37b26547d', 'results': {'bd2cf407-6caa-4b09-a8cb-7f789173704a': {'input': {'question': "Why people don't have 3 legs?"}, 'feedback': [EvaluationResult(key='helpfulness', score=1, value='Y', comment='The criterion for this task is "helpfulness". The submission should be helpful, insightful, and appropriate.\n\nLooking at the submission, the AI provides a detailed explanation of why humans have two legs instead of three. It gives four reasons, each backed by scientific and evolutionary facts. \n\n1. The first point about efficiency in movement is helpful and insightful as it explains how bipedalism is beneficial for long-distance travel and energy conservation.\n2. The second point about evolutionary history is also insightful, providing a background on how humans evolved from primate ancestors.\n3. The third point about balance and coordination is appropriate and insightful, explaining how the human body is designed 