In [None]:
import os
import uuid

import langsmith
from dotenv import load_dotenv
from langchain import chat_models, prompts, smith
from langchain.chat_models import AzureChatOpenAI
from langchain.schema import HumanMessage
from langchain.schema import output_parser

# Load variables from a local .env file into the process environment.
# NOTE(review): no credentials or endpoints are passed explicitly anywhere in
# this notebook, so the Azure OpenAI and LangSmith settings are presumably
# expected to come from here -- confirm the required variable names.
load_dotenv()

In [None]:
# Chat model served by the Azure OpenAI deployment named "chat".
# Temperature is pinned to 0.0; this same model object is used later both as
# the system under test and as the grader for the evaluators.
# No endpoint, API version or key is passed here.
# NOTE(review): presumably those are read from the environment -- confirm.
llm = AzureChatOpenAI(
    temperature=0.0,
    openai_api_type="azure",
    azure_deployment="chat",
)

In [None]:
# Smoke test: send one human message to the model to confirm the deployment
# and credentials work before spending time on dataset evaluation.
message = HumanMessage(
    content="Translate this sentence from English to French. I love programming."
)
# Use the Runnable `invoke` entry point; calling the model object directly
# (`llm([message])`) is deprecated in newer LangChain releases.
# Left as the last bare expression so the notebook displays the reply.
llm.invoke([message])

In [None]:
# Prompts the model will be evaluated on. Each string is one free-form
# request; later cells store each one as a dataset example.
example_inputs = [
    "a rap battle between Atticus Finch and Cicero",
    "a rap battle between Barbie and Oppenheimer",
    "a Pythonic rap battle between two swallows: one European and one African",
    "a rap battle between Aubrey Plaza and Stephen Colbert",
]


In [None]:
dataset_name = "Rap Battle Dataset"

client = langsmith.Client()

# Storing inputs in a dataset lets us
# run chains and LLMs over a shared set of examples.
#
# Dataset names are unique, so an unconditional create_dataset() fails the
# second time this cell runs (e.g. after Restart & Run All). Look for an
# existing dataset with this name first and only create one when none is
# found. Filtering client-side keeps this independent of which filter
# arguments the installed langsmith version supports.
dataset = next(
    (ds for ds in client.list_datasets() if ds.name == dataset_name),
    None,
)
if dataset is None:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Rap battle prompts.",
    )

In [None]:
# Each example must be unique and have inputs defined.
# Outputs are optional.
#
# Nothing stops the same prompt from being inserted twice, so re-running this
# cell would silently duplicate every example and skew the evaluation. Collect
# the questions already stored in the dataset and skip those.
existing_questions = {
    (example.inputs or {}).get("question")
    for example in client.list_examples(dataset_id=dataset.id)
}

for input_prompt in example_inputs:
    if input_prompt in existing_questions:
        continue
    client.create_example(
        inputs={"question": input_prompt},
        outputs=None,
        dataset_id=dataset.id,
    )
    # Also guards against repeated entries inside example_inputs itself.
    existing_questions.add(input_prompt)


In [None]:
# Evaluation setup: which evaluators score each model output, and which model
# does the grading (`eval_llm`).
eval_config = smith.RunEvalConfig(
    eval_llm=llm,
    evaluators=[
        # An evaluator can be selected by bare name/enum; for "criteria" the
        # default criterion is "helpfulness".
        "criteria",
        # ...or configured explicitly, here with criteria referenced by name.
        smith.RunEvalConfig.Criteria("harmfulness"),
        smith.RunEvalConfig.Criteria("misogyny"),
        # A custom criterion: a one-entry dict mapping the criterion name to
        # the question the grader is asked.
        smith.RunEvalConfig.Criteria(
            {
                "cliche": (
                    "Are the lyrics cliche? Respond Y if they are, "
                    "N if they're entirely unique."
                )
            }
        ),
    ],
)

# eval_config = smith.RunEvalConfig(
#     evaluators=[
#         smith.RunEvalConfig.Criteria("misogyny"),
#         smith.RunEvalConfig.Criteria("controversiality")
#     ],
#     custom_evaluators=[],
#     eval_llm=chat_models.ChatOpenAI(model="gpt-4", temperature=0)
# )


In [None]:
# A random UUID suffix gives every execution of this cell its own project
# name, so repeated runs are recorded separately.
# NOTE(review): reusing an existing test project name is presumably rejected
# by run_on_dataset -- confirm against the installed version.
guid = str(uuid.uuid4())

# Run the model over every example in the dataset and score the outputs with
# the evaluators configured in `eval_config`.
llm_result = client.run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=llm,
    evaluation=eval_config,
    project_name=f"test-timely-jug-{guid}",
    concurrency_level=5,
    verbose=True,
)
    