## Prepare Dataset

In [None]:
import os
from dotenv import load_dotenv
# Load settings from a local .env file (if present) into the process
# environment, so the os.getenv(...) lookup used when building the model
# can find its value.
load_dotenv()

In [None]:
%env LANGCHAIN_PROJECT="nj-community"

### Assets

In [None]:
# Questions put to the model, one per dataset example.
questions = [
    "What's the name of the actor who played Neo in 'Matrix'?",
    "Who is the founder of Amazon?",
    "Give me the name of the singer and one of the other Beatles member",
]

# Ground truths, aligned by position with `questions`.
#   must_mention: phrases listed as required for the answer
#   is_optional:  further accepted phrases (only present on some examples)
dataset_outputs = [
    dict(must_mention=["Keanu Reeves"]),
    # Flagged "ERROR" by the author: this reference does not name the founder
    # of Amazon (presumably a deliberately failing example - confirm).
    dict(must_mention=["Elon Musk"]),
    dict(
        must_mention=["John Lennon"],
        is_optional=["Paul McCartney", "George Harrison", "Ringo Starr"],
    ),
]

### Create

In [None]:
from langsmith import Client
# LangSmith client built with no arguments, so its connection settings are
# not set here (presumably taken from environment variables - TODO confirm).
client = Client()
# Name of the dataset that the following cells create and evaluate against.
dataset_name = "Simple-Evaluation"


In [None]:

# Create the dataset only when it does not exist yet, so that re-running this
# cell (e.g. Restart & Run All) neither asks the service to create a second
# dataset with the same name nor inserts the same examples twice.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
    print(f"Dataset already exists with Id: {dataset.id}")
else:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Q/A about general knowledge",
    )

    # One example per question; inputs and outputs are paired by position.
    client.create_examples(
        inputs=[{"question": q} for q in questions],
        outputs=dataset_outputs,
        dataset_id=dataset.id,
    )

    print(f"Dataset Created with Id: {dataset.id}")

### Evaluate 

In [None]:
from langchain_openai import AzureChatOpenAI
    
# Azure OpenAI chat model. Later cells use it both as the model under test
# and as the grading LLM (eval_llm).
# The deployment name comes from the environment; os.getenv yields None when
# the variable is not set.
azure_deployment_name = os.getenv("OPENAI_CHAT_DEPLOYMENT_NAME")
model = AzureChatOpenAI(
    azure_deployment=azure_deployment_name,
    openai_api_type="azure",
    temperature=0.0,
)

### Basic Evaluator 

In [7]:
from langchain.evaluation import Criteria
# Enumerate the members of the Criteria enum; the names shown in the output
# are the strings passed to RunEvalConfig.Criteria(...) in later cells.
list(Criteria)

[<Criteria.CONCISENESS: 'conciseness'>,
 <Criteria.RELEVANCE: 'relevance'>,
 <Criteria.CORRECTNESS: 'correctness'>,
 <Criteria.COHERENCE: 'coherence'>,
 <Criteria.HARMFULNESS: 'harmfulness'>,
 <Criteria.MALICIOUSNESS: 'maliciousness'>,
 <Criteria.HELPFULNESS: 'helpfulness'>,
 <Criteria.CONTROVERSIALITY: 'controversiality'>,
 <Criteria.MISOGYNY: 'misogyny'>,
 <Criteria.CRIMINALITY: 'criminality'>,
 <Criteria.INSENSITIVITY: 'insensitivity'>,
 <Criteria.DEPTH: 'depth'>,
 <Criteria.CREATIVITY: 'creativity'>,
 <Criteria.DETAIL: 'detail'>]

In [10]:
from langchain.smith import RunEvalConfig, run_on_dataset
from langsmith.evaluation import EvaluationResult, run_evaluator
import uuid

# Grade every answer against two built-in criteria. The grading LLM is the
# same `model` that produces the answers.
eval_config = RunEvalConfig(
    evaluators=[
        RunEvalConfig.Criteria(criterion)
        for criterion in ("harmfulness", "helpfulness")
    ],
    eval_llm=model,
)

# Project name for this run, made unique with a random UUID suffix.
name = "ga_" + str(uuid.uuid4())
client.run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=model,
    evaluation=eval_config,
    project_name=name,
    verbose=True,
)

View the evaluation results for project 'ga_d5a27b79-982a-4b7c-ac5e-da040306a95a' at:
https://smith.langchain.com/o/1a3b231d-e8b9-4837-8416-2403db894308/datasets/64740203-7224-4480-baaa-42502e50c23c/compare?selectedSessions=dcd45269-4fe4-4a61-b649-eabc6114c029

View all tests for Dataset Simple-Evaluation at:
https://smith.langchain.com/o/1a3b231d-e8b9-4837-8416-2403db894308/datasets/64740203-7224-4480-baaa-42502e50c23c
[------------------------------------------------->] 3/3

Unnamed: 0,feedback.harmfulness,feedback.helpfulness,error,execution_time,run_id
count,3.0,3.0,0.0,3.0,3
unique,,,0.0,,3
top,,,,,0794be70-a302-4784-9cbc-6114217e8866
freq,,,,,1
mean,0.0,0.666667,,0.623326,
std,0.0,0.57735,,0.053098,
min,0.0,0.0,,0.566089,
25%,0.0,0.5,,0.599499,
50%,0.0,1.0,,0.63291,
75%,0.0,1.0,,0.651945,


{'project_name': 'ga_d5a27b79-982a-4b7c-ac5e-da040306a95a',
 'results': {'58d9ab61-8102-4cc0-9705-2443afc2566b': {'input': {'question': 'Give me the name of the singer and one of the other Beatles member'},
   'feedback': [EvaluationResult(key='harmfulness', score=0, value='N', comment='To determine if the submission meets the criteria of harmfulness, offensive, or inappropriate, we need to analyze the content of the submission:\n\n1. Content analysis:\n   - The submission states that the singer is Paul McCartney, which is a correct answer.\n   - The submission also states that one of the other Beatles members is John Lennon, which is also a correct answer.\n\n2. Evaluation:\n   - The submission does not contain any harmful, offensive, or inappropriate content. It provides accurate information about the singer and one of the other Beatles members.\n\nBased on the above analysis, the submission does not meet the criteria of harmfulness, offensive, or inappropriate.', correction=None, ev

### Custom Evaluators (compare expected results)

In [12]:
from langchain.smith import RunEvalConfig, run_on_dataset
from langsmith.evaluation import EvaluationResult, run_evaluator
import uuid

@run_evaluator
def must_mention(run, example) -> EvaluationResult:
    """Check that the generated text contains every required phrase.

    The text is read from the first generation in ``run.outputs`` and the
    phrases from the ``must_mention`` entry of ``example.outputs``. A missing
    or empty entry means there is nothing to require, which scores ``True``.
    Matching is a case-sensitive substring test.

    Returns:
        EvaluationResult with key ``"must_mention"`` and a boolean score.
    """
    generated_text = run.outputs["generations"][0][0]["text"]
    required_phrases = example.outputs.get("must_mention") or []
    for phrase in required_phrases:
        if phrase not in generated_text:
            # One missing phrase is enough to fail the example.
            return EvaluationResult(key="must_mention", score=False)
    return EvaluationResult(key="must_mention", score=True)

@run_evaluator
def is_optional(run, example) -> EvaluationResult:
    """Check whether the generated text contains any of the optional phrases.

    The text is read from the first generation in ``run.outputs`` and the
    phrases from the ``is_optional`` entry of ``example.outputs``. Matching
    is a case-sensitive substring test.

    Returns:
        EvaluationResult with key ``"is_optional"``. The score is a boolean
        when the example lists optional phrases, and the sentinel ``-1``
        when it lists none.
    """
    generated_text = run.outputs["generations"][0][0]["text"]
    optional_phrases = example.outputs.get("is_optional") or []
    if not optional_phrases:
        # Nothing to look for in this example: report the sentinel value.
        score = -1
    else:
        score = any(phrase in generated_text for phrase in optional_phrases)
    return EvaluationResult(key="is_optional", score=score)

# Combine the two custom evaluators defined in this cell with the built-in
# "harmfulness" criterion. The grading LLM is the same `model` that produces
# the answers.
eval_config = RunEvalConfig(
    evaluators=[RunEvalConfig.Criteria("harmfulness")],
    custom_evaluators=[must_mention, is_optional],
    eval_llm=model,
)

# Project name for this run, made unique with a random UUID suffix.
name = "ga_" + str(uuid.uuid4())
client.run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=model,
    evaluation=eval_config,
    project_name=name,
    verbose=True,
)

View the evaluation results for project 'ga_f2f61e6a-ff26-4e5a-8a93-c4be18d03bca' at:
https://smith.langchain.com/o/1a3b231d-e8b9-4837-8416-2403db894308/datasets/64740203-7224-4480-baaa-42502e50c23c/compare?selectedSessions=4f3f26ab-904d-40d6-b20c-1dad5e4648fe

View all tests for Dataset Simple-Evaluation at:
https://smith.langchain.com/o/1a3b231d-e8b9-4837-8416-2403db894308/datasets/64740203-7224-4480-baaa-42502e50c23c
[------------------------------------------------->] 3/3

Unnamed: 0,feedback.harmfulness,feedback.must_mention,feedback.is_optional,error,execution_time,run_id
count,3.0,3,3.0,0.0,3.0,3
unique,,2,2.0,0.0,,3
top,,True,-1.0,,,851c149a-c9c9-4aaa-abf2-ae6c5ce272ba
freq,,2,2.0,,,1
mean,0.0,,,,0.614671,
std,0.0,,,,0.042732,
min,0.0,,,,0.568365,
25%,0.0,,,,0.595714,
50%,0.0,,,,0.623063,
75%,0.0,,,,0.637824,


{'project_name': 'ga_f2f61e6a-ff26-4e5a-8a93-c4be18d03bca',
 'results': {'58d9ab61-8102-4cc0-9705-2443afc2566b': {'input': {'question': 'Give me the name of the singer and one of the other Beatles member'},
   'feedback': [EvaluationResult(key='harmfulness', score=0, value='N', comment='To determine if the submission meets the criteria of harmfulness, offensive, or inappropriate, we need to analyze the content of the submission:\n\n1. Content analysis:\n   - The submission states that the singer is Paul McCartney, which is a correct answer.\n   - The submission also states that one of the other Beatles members is John Lennon, which is also a correct answer.\n\n2. Evaluation:\n   - The submission does not contain any harmful, offensive, or inappropriate content. It provides accurate information about the singer and one of the other Beatles members.\n\nBased on the above analysis, the submission does not meet the criteria of harmfulness, offensive, or inappropriate.', correction=None, ev

### Use Criteria

In [16]:

from langchain.smith import RunEvalConfig, run_on_dataset
from langsmith.evaluation import EvaluationResult, run_evaluator
import uuid


# Feedback key -> topic word used in the grading question. The keys are the
# column names reported in the results (feedback.Movie, feedback.Music, ...).
topic_criteria = {
    "Movie": "movies",
    "Music": "music",
    "Cooking": "cooking",
}

eval_config = RunEvalConfig(
    eval_llm=model,  # the grading LLM is the same model that is being evaluated
    evaluators=[
        RunEvalConfig.Criteria("harmfulness"),
        # One custom yes/no criterion per topic. The prompt is a single
        # string with a space between its two sentences; the previous
        # adjacent string literals were joined without one
        # ("...topic?Respond...").
        *[
            RunEvalConfig.Criteria(
                {
                    feedback_key: (
                        f"Is it related to the {topic} topic? "
                        "Respond Y if it is, N if it's not."
                    )
                },
                # NOTE(review): "question" is the key of the dataset inputs,
                # not of its outputs - confirm that reference_key (rather
                # than input_key) is what is intended here.
                reference_key="question",
            )
            for feedback_key, topic in topic_criteria.items()
        ],
    ],
)

# Project name for this run, made unique with a random UUID suffix.
name = f"ga_{uuid.uuid4()}"
client.run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=model,
    evaluation=eval_config,
    verbose=True,
    project_name=name,
)

View the evaluation results for project 'ga_2afe9ad5-4a37-4a9f-a4d7-1bc1aac6edac' at:
https://smith.langchain.com/o/1a3b231d-e8b9-4837-8416-2403db894308/datasets/64740203-7224-4480-baaa-42502e50c23c/compare?selectedSessions=70e63371-299d-414e-b5a2-2b3f01fc3176

View all tests for Dataset Simple-Evaluation at:
https://smith.langchain.com/o/1a3b231d-e8b9-4837-8416-2403db894308/datasets/64740203-7224-4480-baaa-42502e50c23c
[------------------------------------------------->] 3/3

Unnamed: 0,feedback.harmfulness,feedback.Movie,feedback.Music,feedback.Cooking,error,execution_time,run_id
count,3.0,3.0,2.0,2.0,0.0,3.0,3
unique,,,,,0.0,,3
top,,,,,,,5610927e-235b-4c19-9115-9c92f0ec02e1
freq,,,,,,,1
mean,0.0,0.333333,0.5,0.0,,0.57338,
std,0.0,0.57735,0.707107,0.0,,0.048068,
min,0.0,0.0,0.0,0.0,,0.519481,
25%,0.0,0.0,0.25,0.0,,0.554168,
50%,0.0,0.0,0.5,0.0,,0.588855,
75%,0.0,0.5,0.75,0.0,,0.600329,


{'project_name': 'ga_2afe9ad5-4a37-4a9f-a4d7-1bc1aac6edac',
 'results': {'58d9ab61-8102-4cc0-9705-2443afc2566b': {'input': {'question': 'Give me the name of the singer and one of the other Beatles member'},
   'feedback': [EvaluationResult(key='harmfulness', score=0, value='N', comment='To determine if the submission meets the criteria of harmfulness, offensive, or inappropriate, we need to analyze the content of the submission:\n\n1. Content analysis:\n   - The submission states that the singer is Paul McCartney, which is a correct answer.\n   - The submission also states that one of the other Beatles members is John Lennon, which is also a correct answer.\n\n2. Evaluation:\n   - The submission does not contain any harmful, offensive, or inappropriate content. It provides accurate information about the singer and one of the other Beatles members.\n\nBased on the above analysis, the submission does not meet the criteria of harmfulness, offensive, or inappropriate.', correction=None, ev

In [17]:


from langchain.smith import RunEvalConfig, run_on_dataset
from langsmith.evaluation import EvaluationResult, run_evaluator
import uuid


# Rubric text for the "accuracy" criterion. The string (including its leading
# whitespace) is passed on unchanged.
accuracy_rubric = """
                    Score 1: The answer is completely unrelated to the reference.
                    Score 5: The answer has moderate relevance but contains inaccuracies.
                    Score 10: The answer is completely accurate and aligns perfectly with the reference."""

# Score each answer against the example's "must_mention" reference.
# With normalize_by=10, the recorded results show a rating of 10 reported as
# a score of 1.0.
eval_config = RunEvalConfig(
    evaluators=[
        RunEvalConfig.LabeledScoreString(
            {"accuracy": accuracy_rubric},
            prediction_key="generations",
            reference_key="must_mention",
            normalize_by=10,
        ),
    ],
    eval_llm=model,  # the grading LLM is the same model that is being evaluated
)

# Project name for this run, made unique with a random UUID suffix.
name = "ga_" + str(uuid.uuid4())
client.run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=model,
    evaluation=eval_config,
    project_name=name,
    verbose=True,
)

View the evaluation results for project 'ga_fe6dd889-6d2f-4639-9245-a279f15ba2e3' at:
https://smith.langchain.com/o/1a3b231d-e8b9-4837-8416-2403db894308/datasets/64740203-7224-4480-baaa-42502e50c23c/compare?selectedSessions=f9d8178f-1d9a-4b0b-be03-16e6686d151c

View all tests for Dataset Simple-Evaluation at:
https://smith.langchain.com/o/1a3b231d-e8b9-4837-8416-2403db894308/datasets/64740203-7224-4480-baaa-42502e50c23c
[------------------------------------------------->] 3/3

Unnamed: 0,feedback.score_string:accuracy,error,execution_time,run_id
count,3.0,0.0,3.0,3
unique,,0.0,,3
top,,,,52ccb766-d42c-4cde-9750-fce887d01e54
freq,,,,1
mean,0.833333,,0.534579,
std,0.288675,,0.058744,
min,0.5,,0.488369,
25%,0.75,,0.501524,
50%,1.0,,0.514679,
75%,1.0,,0.557683,


{'project_name': 'ga_fe6dd889-6d2f-4639-9245-a279f15ba2e3',
 'results': {'58d9ab61-8102-4cc0-9705-2443afc2566b': {'input': {'question': 'Give me the name of the singer and one of the other Beatles member'},
   'feedback': [EvaluationResult(key='score_string:accuracy', score=1.0, value=None, comment="The assistant's response is accurate and relevant to the user's question. It correctly identifies Paul McCartney as the singer and John Lennon as one of the other Beatles members. The response aligns perfectly with the reference. \n\nRating: [[10]]", correction=None, evaluator_info={'__run': RunInfo(run_id=UUID('fd9c7188-6522-44e4-bd35-7ee92cc0e289'))}, source_run_id=None, target_run_id=None)],
   'execution_time': 0.600688,
   'run_id': '52ccb766-d42c-4cde-9750-fce887d01e54',
   'output': AIMessage(content='The singer is Paul McCartney and one of the other Beatles members is John Lennon.'),
   'reference': {'is_optional': ['Paul McCartney',
     'George Harrison',
     'Ringo Starr'],
    

### Evaluate the model 

Use one of the custom evaluators described in the LangSmith documentation:
https://docs.smith.langchain.com/evaluation/custom-evaluators



### Add to dataset from Existing Results

In [None]:
# Point LANGCHAIN_PROJECT at "test" for the cells that follow (presumably the
# LangSmith project that receives their traces - confirm).
%env LANGCHAIN_PROJECT=test


In [18]:
# Ask the model one of the dataset questions directly; its reply is the cell output.
model.invoke("Who is the founder of Amazon?")

AIMessage(content='The founder of Amazon is Jeff Bezos.')

In [19]:

import uuid

import langsmith
from langchain import chat_models, smith
from langchain.evaluation import EvaluatorType


# Define the evaluators to apply.
# RunEvalConfig is reached through `smith` throughout, and `uuid` is imported
# here, so this cell does not depend on imports made by earlier cells.
eval_config = smith.RunEvalConfig(
    evaluators=[
        "cot_qa",
        smith.RunEvalConfig.LabeledCriteria("correctness"),
    ],
    custom_evaluators=[],
    eval_llm=model,  # the grading LLM is the same model that is being evaluated
)

client = langsmith.Client()

# Project name for this run, made unique with a random UUID suffix.
name = f"check_{uuid.uuid4()}"
# NOTE(review): no cell in this notebook creates "Simple-Evaluation-Chat", and
# the recorded output mentions a dataset called "ds-chat" - confirm that this
# dataset exists before running.
chain_results = client.run_on_dataset(
    dataset_name="Simple-Evaluation-Chat",
    llm_or_chain_factory=model,
    evaluation=eval_config,
    project_name=name,
    concurrency_level=5,
    verbose=True,
)

View the evaluation results for project 'check_3a018af2-99f6-4501-b686-59a75057e958' at:
https://smith.langchain.com/o/1a3b231d-e8b9-4837-8416-2403db894308/datasets/c3e3fe27-31a5-4fe7-a5e6-6d5313d68ee2/compare?selectedSessions=07e8f6ba-99e7-42b6-b010-0325ec385f87

View all tests for Dataset ds-chat at:
https://smith.langchain.com/o/1a3b231d-e8b9-4837-8416-2403db894308/datasets/c3e3fe27-31a5-4fe7-a5e6-6d5313d68ee2
[------------------------------------------------->] 1/1

Unnamed: 0,feedback.COT Contextual Accuracy,feedback.correctness,error,execution_time,run_id
count,1.0,1.0,0.0,1.0,1
unique,,,0.0,,1
top,,,,,56ee4b92-64a5-4e81-bb14-91c80b96757f
freq,,,,,1
mean,1.0,1.0,,0.567459,
std,,,,,
min,1.0,1.0,,0.567459,
25%,1.0,1.0,,0.567459,
50%,1.0,1.0,,0.567459,
75%,1.0,1.0,,0.567459,
