## Prepare Dataset

In [None]:
import os
from dotenv import load_dotenv
# Load settings from a .env file before anything reads the environment;
# presumably this supplies the LangSmith / Azure OpenAI variables used later
# (e.g. OPENAI_CHAT_DEPLOYMENT_NAME) -- confirm against the project's .env.
load_dotenv()

In [None]:
%env LANGCHAIN_PROJECT="nj-community"

### Assets

In [None]:
# Prompts sent to the model, one dataset example per entry.
questions = [
    "What's the name of the actor who played Neo in 'Matrix'?",
    "Who is the founder of Amazon?",
    "Give me the name of the singer and one of the other Beatles member",
]

# Ground truths, aligned by position with `questions`.
#   must_mention -> phrases the answer has to contain
#   is_optional  -> phrases of which at least one is expected, when present
dataset_outputs = [
    {"must_mention": ["Keanu Reeves"]},
    # Marked "ERROR" by the author: presumably a deliberately wrong ground
    # truth so that an evaluator reports a failure -- confirm before "fixing".
    {"must_mention": ["Elon Musk"]},
    {
        "must_mention": ["John Lennon"],
        "is_optional": ["Paul McCartney", "George Harrison", "Ringo Starr"],
    },
]

### Create

In [None]:
from langsmith import Client
# Built with no arguments; presumably the endpoint and API key are picked up
# from the environment loaded earlier -- confirm.
client = Client()
# Name shared by the dataset-creation cell and the evaluation runs below.
dataset_name = "Simple-Evaluation"


In [None]:

# Register the (initially empty) dataset under the shared name.
# NOTE(review): re-running this cell calls create_dataset again with the same
# name; that presumably fails or duplicates data -- confirm before re-running.
dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="Q/A about general knowledge",
)

# One example per question; inputs and outputs are paired by position.
example_inputs = [{"question": question} for question in questions]
client.create_examples(
    inputs=example_inputs,
    outputs=dataset_outputs,
    dataset_id=dataset.id,
)

print(f"Dataset Created with Id: {dataset.id}")

### Evaluate 

In [None]:
from langchain_openai import AzureChatOpenAI
    
# Chat model used both as the system under test and as the judge in the
# evaluation cells. The deployment name is read from the environment, and
# temperature 0.0 is requested for the model's sampling.
model = AzureChatOpenAI(
    azure_deployment=os.getenv("OPENAI_CHAT_DEPLOYMENT_NAME"),
    openai_api_type="azure",
    temperature=0.0,
)

### Basic Evaluator 

In [None]:
from langchain.evaluation import Criteria
# Show every member of `Criteria` as the cell output; presumably these are the
# built-in criterion names accepted by RunEvalConfig.Criteria(...) below -- confirm.
list(Criteria)

In [None]:
from langchain.smith import RunEvalConfig, run_on_dataset
from langsmith.evaluation import EvaluationResult, run_evaluator
import uuid

# Two built-in criteria; the model under test is also passed as the grading LLM.
eval_config = RunEvalConfig(
    evaluators=[
        RunEvalConfig.Criteria("harmfulness"),
        RunEvalConfig.Criteria("helpfulness"),
    ],
    eval_llm=model,
)

# A random UUID suffix gives every run its own project name.
name = f"ga_{uuid.uuid4()}"
client.run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=model,
    evaluation=eval_config,
    project_name=name,
    verbose=True,
)

### Custom Evaluators (compare expected results)

In [None]:
from langchain.smith import RunEvalConfig, run_on_dataset
from langsmith.evaluation import EvaluationResult, run_evaluator
import uuid

@run_evaluator
def must_mention(run, example) -> EvaluationResult:
    """Score whether the generated text contains every required phrase.

    Args:
        run: Traced run; the text is read from
            ``run.outputs["generations"][0][0]["text"]`` (assumes that
            output layout -- confirm for non-LLM runs).
        example: Dataset example; required phrases come from its
            ``must_mention`` output field (missing or empty means none).

    Returns:
        EvaluationResult keyed ``must_mention`` whose score is True when all
        required phrases occur in the text (also when there are none),
        otherwise False. Matching is a plain, case-sensitive substring test.
    """
    generated_text = run.outputs["generations"][0][0]["text"]
    required_phrases = example.outputs.get("must_mention") or []

    every_phrase_found = True
    for phrase in required_phrases:
        if phrase not in generated_text:
            every_phrase_found = False
            break  # one miss is enough to fail

    return EvaluationResult(key="must_mention", score=every_phrase_found)

@run_evaluator
def is_optional(run, example) -> EvaluationResult:
    """Score whether the generated text contains at least one optional phrase.

    Args:
        run: Traced run; the text is read from
            ``run.outputs["generations"][0][0]["text"]`` (assumes that
            output layout -- confirm for non-LLM runs).
        example: Dataset example; optional phrases come from its
            ``is_optional`` output field.

    Returns:
        EvaluationResult keyed ``is_optional``. The score is -1 when the
        example lists no optional phrases, otherwise True/False depending on
        whether any of them occurs in the text (case-sensitive substring).
    """
    generated_text = run.outputs["generations"][0][0]["text"]
    optional_phrases = example.outputs.get("is_optional") or []

    if not optional_phrases:
        # Sentinel: nothing to check for this example.
        score = -1
    else:
        score = any(phrase in generated_text for phrase in optional_phrases)

    return EvaluationResult(key="is_optional", score=score)

# Combine the two phrase-matching evaluators with one built-in criterion;
# the model under test is also passed as the grading LLM.
eval_config = RunEvalConfig(
    evaluators=[
        RunEvalConfig.Criteria("harmfulness"),
    ],
    custom_evaluators=[must_mention, is_optional],
    eval_llm=model,
)

# A random UUID suffix gives every run its own project name.
name = f"ga_{uuid.uuid4()}"
client.run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=model,
    evaluation=eval_config,
    project_name=name,
    verbose=True,
)

### Use Criteria

In [None]:

from langchain.smith import RunEvalConfig, run_on_dataset
from langsmith.evaluation import EvaluationResult, run_evaluator
import uuid


# Custom criteria are given as {name: grading prompt}. Each prompt is split
# over two adjacent string literals, which Python joins with no separator, so
# the first literal ends with a space to keep the two sentences apart.
#
# NOTE(review): reference_key="question" names an *input* field of the
# dataset examples (their outputs only hold must_mention / is_optional);
# confirm the evaluator accepts it or whether input_key was intended.
eval_config = RunEvalConfig(
    eval_llm=model,  # the model under test is also passed as the grading LLM
    evaluators=[
        RunEvalConfig.Criteria("harmfulness"),
        RunEvalConfig.Criteria(
            {
                "Movie": "Does it related movies topic? "
                "Respond Y if it is , N if it's not."
            },
            reference_key="question",
        ),
        RunEvalConfig.Criteria(
            {
                "Music": "Does it related music topic? "
                "Respond Y if it is , N if it's not."
            },
            reference_key="question",
        ),
        RunEvalConfig.Criteria(
            {
                "Cooking": "Does it related cooking topic? "
                "Respond Y if it is , N if it's not."
            },
            reference_key="question",
        ),
    ],
)

# A random UUID suffix gives every run its own project name.
name = f"ga_{uuid.uuid4()}"
client.run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=model,
    evaluation=eval_config,
    verbose=True,
    project_name=name,
)

In [None]:


from langchain.smith import RunEvalConfig, run_on_dataset
from langsmith.evaluation import EvaluationResult, run_evaluator
import uuid


# Reference-based 1-10 "accuracy" rubric. normalize_by=10 is passed so the
# raw score is presumably rescaled to 0-1 -- confirm. The reference is taken
# from each example's `must_mention` output field.
eval_config = RunEvalConfig(
    evaluators=[
        RunEvalConfig.LabeledScoreString(
            {
                "accuracy": """
                    Score 1: The answer is completely unrelated to the reference.
                    Score 5: The answer has moderate relevance but contains inaccuracies.
                    Score 10: The answer is completely accurate and aligns perfectly with the reference."""
            },
            prediction_key="generations",
            reference_key="must_mention",
            normalize_by=10,
        ),
    ],
    eval_llm=model,  # the model under test is also passed as the grading LLM
)

# A random UUID suffix gives every run its own project name.
name = f"ga_{uuid.uuid4()}"
client.run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=model,
    evaluation=eval_config,
    project_name=name,
    verbose=True,
)

### Evaluate the model 

Pick one of the evaluator approaches described in the LangSmith custom-evaluators guide:
https://docs.smith.langchain.com/evaluation/custom-evaluators



### Add to dataset from Existing Results

In [None]:
# Switch the project name for the cells that follow; presumably traces from
# the ad-hoc model.invoke call below are then recorded under "test" -- confirm.
%env LANGCHAIN_PROJECT=test


In [None]:
# Single ad-hoc call; as the cell's last expression its reply is displayed.
# Presumably the resulting trace is what gets added to a dataset afterwards
# (per the section heading) -- confirm.
model.invoke("Who is the founder of Amazon?")

In [None]:

import uuid

import langsmith
from langchain import chat_models, smith
from langchain.evaluation import EvaluatorType


# Define the evaluators to apply. Everything is reached through the `smith`
# module imported in this cell (and `uuid` is imported here too), so the cell
# does not depend on names left behind by earlier cells.
eval_config = smith.RunEvalConfig(
    evaluators=[
        "cot_qa",
        smith.RunEvalConfig.LabeledCriteria("correctness"),
    ],
    custom_evaluators=[],
    eval_llm=model,  # the model under test is also passed as the grading LLM
)

client = langsmith.Client()

# A random UUID suffix gives every run its own project name.
name = f"check_{uuid.uuid4()}"
# NOTE(review): "Simple-Evaluation-Chat" is not created anywhere in this
# notebook; presumably it is built from existing runs beforehand -- confirm.
chain_results = client.run_on_dataset(
    dataset_name="Simple-Evaluation-Chat",
    llm_or_chain_factory=model,
    evaluation=eval_config,
    project_name=name,
    concurrency_level=5,
    verbose=True,
)