<td>   <a target="_blank" href="https://labelbox.com" ><img src="https://labelbox.com/blog/content/images/2021/02/logo-v4.svg" width=256/></a></td>


<td>
<a href="https://colab.research.google.com/github/Labelbox/labelbox-python/blob/develop/examples/integrations/langchain/langchain.ipynb" target="_blank"><img
src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"></a>
</td>

<td>
<a href="https://github.com/Labelbox/labelbox-python/tree/develop/examples/integrations/langchain/langchain.ipynb" target="_blank"><img
src="https://img.shields.io/badge/GitHub-100000?logo=github&logoColor=white" alt="GitHub"></a>
</td>

# LangChain -> Labelbox
This notebook is used to show an example workflow of getting LangChain traces into Labelbox conversation data format. Please review the [associated written guide](https://labelbox.com/guides/turn-langchain-logs-into-conversational-data-with-labelbox/) for more information.

In [None]:
# Install (or upgrade) the packages this notebook imports. `%pip` is used so
# the install targets the running kernel's environment.
# LangChain, LangSmith and the LangChain hub client:
%pip install --upgrade --quiet  langchain langsmith langchainhub
# OpenAI integration plus supporting packages for the search-enabled agent:
%pip install --upgrade --quiet  langchain-openai tiktoken pandas duckduckgo-search
# Labelbox SDK with its "data" extras:
%pip install --upgrade --quiet  "labelbox[data]"

In [None]:
import labelbox as lb
from uuid import uuid4
import os
import functools

# LangSmith Imports
from langsmith.client import Client
from langchain import hub
from langchain.agents import AgentExecutor
from langchain.agents.format_scratchpad.openai_tools import (
    format_to_openai_tool_messages,)
from langchain.agents.output_parsers.openai_tools import (
    OpenAIToolsAgentOutputParser,)
from langchain_community.tools import DuckDuckGoSearchResults
from langchain_openai import ChatOpenAI
from langsmith.evaluation import EvaluationResult
from langsmith.schemas import Example, Run, DataType
from langchain.smith import run_on_dataset
from langchain.evaluation import EvaluatorType
from langchain.smith import RunEvalConfig

## API Key and Setup
Provide a valid API key below for Labelbox, LangSmith and OpenAI in order for the notebook to work correctly.

In [None]:
# Paste your keys below, or leave a value blank to fall back to a key that is
# already exported in the environment (LABELBOX_API_KEY, LANGCHAIN_API_KEY,
# OPENAI_API_KEY). Prefer the environment so secrets are not saved in the
# notebook file.
LB_API_KEY = ""
LS_API_KEY = ""
OPENAI_API_KEY = ""

# Without this fallback, blank placeholders would overwrite keys that are
# already configured in the environment with empty strings further down.
LB_API_KEY = LB_API_KEY or os.environ.get("LABELBOX_API_KEY", "")
LS_API_KEY = LS_API_KEY or os.environ.get("LANGCHAIN_API_KEY", "")
OPENAI_API_KEY = OPENAI_API_KEY or os.environ.get("OPENAI_API_KEY", "")

# Short random suffix so every notebook run gets its own LangSmith project.
unique_id = uuid4().hex[0:8]
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = f"Tracing Walkthrough - {unique_id}"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"] = LS_API_KEY

os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Labelbox client takes its key explicitly; the LangSmith client is created
# with no arguments, after the LANGCHAIN_* variables above have been set.
lb_client = lb.Client(LB_API_KEY)
client = Client()

### LangSmith Dataset Name
Create a sample chat dataset containing one example chat-based run.

In [None]:
# Unique name so repeated notebook runs do not collide on the dataset name.
dataset_name = f"Sample Dataset - {uuid4()}"

# Conversation history stored as the example's input messages.
sample_messages = [
    {"type": "ai", "data": {"content": "hi how are you"}},
    {"type": "human", "data": {"content": "Im doing great how about you"}},
]

# Custom model output stored alongside the messages.
sample_generation = {"type": "ai", "data": {"content": "Im doing great"}}

dataset = client.create_dataset(
    dataset_name,
    description="A sample dataset in LangSmith.",
    data_type=DataType.chat,
)
client.create_chat_example(
    messages=sample_messages,
    generations=sample_generation,
    dataset_id=dataset.id,
)

### LangSmith
Below is an example of running a set of evaluators against a LangSmith example run using OpenAI's GPT-3.5. Please review the [LangSmith Docs](https://docs.smith.langchain.com/) for more information.

In [None]:
# General internet search using DuckDuckGo
search_tool = DuckDuckGoSearchResults(name="duck_duck_go")
tools = [search_tool]

# Chat model used by the agent; temperature is set to 0.
llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)


# Since chains can be stateful (e.g. they can have memory), we provide
# a way to initialize a new chain for each row in the dataset. This is done
# by passing in a factory function that returns a new chain for each row.
def create_agent(prompt, llm_with_tools):
    """Build a new agent executor from a prompt and a tool-bound model.

    Args:
        prompt: Prompt component placed after the input mapping in the pipeline.
        llm_with_tools: Model component placed after the prompt.

    Returns:
        An ``AgentExecutor`` wrapping the assembled pipeline. It uses the
        module-level ``tools`` list and has ``handle_parsing_errors`` enabled.
    """
    # Forward "input" unchanged and render the intermediate steps into the
    # "agent_scratchpad" slot.
    input_mapping = {
        "input":
            lambda x: x["input"],
        "agent_scratchpad":
            lambda x: format_to_openai_tool_messages(x["intermediate_steps"]),
    }
    output_parser = OpenAIToolsAgentOutputParser()
    runnable_agent = input_mapping | prompt | llm_with_tools | output_parser

    return AgentExecutor(
        agent=runnable_agent,
        tools=tools,
        handle_parsing_errors=True,
    )

In [None]:
def max_pred_length(runs, examples):
    """Batch evaluator reporting the longest prediction across all runs.

    Args:
        runs: Iterable of run objects; each is read via ``run.outputs["output"]``.
        examples: Dataset examples passed alongside ``runs``. Not used here,
            but kept so the evaluator's call signature stays the same.

    Returns:
        EvaluationResult with key ``"max_pred_length"`` whose score is the
        largest ``len()`` of any run's ``"output"`` value, or 0 when no run
        has an ``"output"`` entry.
    """
    lengths = [
        len(run.outputs["output"])
        for run in runs
        # Skip runs without an "output" entry instead of raising
        # (presumably runs that errored -- TODO confirm).
        if run.outputs and "output" in run.outputs
    ]
    # default=0 keeps max() from raising ValueError when nothing was scored.
    return EvaluationResult(key="max_pred_length",
                            score=max(lengths, default=0))

In [None]:
def check_not_idk(run: Run, example: Example):
    """Custom evaluator example: penalise answers that signal uncertainty.

    Scores 0 when the run's ``"output"`` contains "don't know" or "not sure",
    and 1 otherwise. The result is reported under the key ``"not_uncertain"``.
    """
    answer = run.outputs["output"]
    uncertain_phrases = ("don't know", "not sure")
    is_uncertain = any(phrase in answer for phrase in uncertain_phrases)
    # You can access the dataset labels in example.outputs[key]
    # You can also access the model inputs in run.inputs[key]
    return EvaluationResult(key="not_uncertain", score=0 if is_uncertain else 1)

In [None]:
# Rubric for the LabeledScoreString evaluator, which outputs a score on a
# scale from 1-10. You can use default criteria or write your own rubric.
accuracy_rubric = {
    "accuracy":
        """
Score 1: The answer is completely unrelated to the reference.
Score 3: The answer has minor relevance but does not align with the reference.
Score 5: The answer has moderate relevance but contains inaccuracies.
Score 7: The answer aligns with the reference but has minor errors or omissions.
Score 10: The answer is completely accurate and aligns perfectly with the reference."""
}

# Evaluators can either be an evaluator type (e.g., "qa", "criteria",
# "embedding_distance", etc.) or a configuration for that evaluator.
run_evaluators = [
    # Custom evaluator defined in this notebook.
    check_not_idk,
    # Measures whether a QA response is "Correct", based on a reference answer.
    # You can also select via the raw string "qa".
    EvaluatorType.QA,
    # Measure the embedding distance between the output and the reference answer.
    # Equivalent to: EvalConfig.EmbeddingDistance(embeddings=OpenAIEmbeddings())
    EvaluatorType.EMBEDDING_DISTANCE,
    # Grade whether the output satisfies the stated criteria.
    # You can select a default one such as "helpfulness" or provide your own.
    RunEvalConfig.LabeledCriteria("helpfulness"),
    # Scored against the rubric above; normalize_by=10 is passed through as-is.
    RunEvalConfig.LabeledScoreString(accuracy_rubric, normalize_by=10),
]

evaluation_config = RunEvalConfig(
    evaluators=run_evaluators,
    batch_evaluators=[max_pred_length],
)

# Bind the tools to the model so it can emit tool calls.
llm_with_tools = llm.bind_tools(tools)

# Change prompt in LangSmith hub to reflect example run
prompt = hub.pull("gabe/labelboxtutorialdemo")

# Run the agent over every example in the dataset and evaluate each run.
# A factory (functools.partial) is passed so each row gets a new agent.
chain_results = run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=functools.partial(create_agent,
                                           prompt=prompt,
                                           llm_with_tools=llm_with_tools),
    evaluation=evaluation_config,
    verbose=True,
    client=client,
    project_name=f"tools-agent-test-5d466cbc-{unique_id}",
    # Project metadata communicates the experiment parameters,
    # Useful for reviewing the test results
    project_metadata={
        "env": "testing-notebook",
        # Read the name from the configured model so the recorded metadata
        # cannot drift from the model that is actually used.
        "model": llm.model_name,
        "prompt": "5d466cbc",
    },
)

# Sometimes, the agent will error due to parsing issues, incompatible tool inputs, etc.
# These are logged as warnings here and captured as errors in the tracing UI.

## Labelbox
The function below converts the results of the LangSmith run above into the Labelbox conversational text format. Please review the [Labelbox conversation data docs](https://docs.labelbox.com/docs/llm-human-preference) for more information.

In [None]:
def import_conversational(
        chain_results: dict,
        user_id_dict: dict[str, dict[str, str]]) -> list[dict]:
    """Convert LangSmith run results into Labelbox conversational data rows.

    Each entry of ``chain_results["results"]`` becomes one data row that pairs
    the conversation messages with up to two model responses: the reference
    ("Custom Model Response") and the run's own output ("Popular LLM Response").

    Args:
        chain_results: Results of the LangSmith dataset run. Must contain a
            ``"results"`` mapping of example key -> result dict.
        user_id_dict: Maps a message ``"type"`` (e.g. ``"human"``, ``"ai"``)
            to a dict with the Labelbox ``"id"`` and ``"align"`` to use for it.

    Returns:
        A list of data row dicts in the Labelbox conversational text format,
        one per result that has an ``"output"`` entry.

    Raises:
        KeyError: If a message's ``"type"`` is missing from ``user_id_dict``.
    """
    lb_conversations = []
    for key, conversational in chain_results["results"].items():
        run_output = conversational.get("output")
        if run_output is None:
            # No "output" entry to convert (presumably the run errored --
            # TODO confirm); skip it instead of raising KeyError.
            continue

        messages = []
        if "input" in run_output:
            for position, message in enumerate(run_output["input"]):
                speaker = user_id_dict[message["type"]]
                messages.append({
                    "content": message["data"]["content"],
                    # 1-based position keeps the messages in their original order.
                    "timestampUsec": position + 1,
                    "user": {
                        "userId": speaker["id"],
                        "name": message["type"],
                    },
                    "canLabel": True,
                    "align": speaker["align"],
                    "messageId": str(uuid4()),
                })

        model_outputs = []

        # Custom model output
        if "reference" in conversational:
            reference = conversational["reference"]["output"]
            model_outputs.append({
                "title": "Custom Model Response",
                "content": reference["data"]["content"],
                "modelConfigName": "Custom Model - Example Config",
            })

        # Popular model output
        if "output" in run_output:
            model_outputs.append({
                "title": "Popular LLM Response",
                "content": run_output["output"],
                "modelConfigName": "GPT-3.5 - Example Config",
            })

        lb_conversations.append({
            "row_data": {
                "type": "application/vnd.labelbox.conversational",
                "version": 1,
                "messages": messages,
                "modelOutputs": model_outputs,
            },
            "global_key": key,
            "media_type": "CONVERSATIONAL",
        })
    return lb_conversations

### Create Labelbox Dataset

In [None]:
# Create the Labelbox dataset that will receive the conversation data rows.
# NOTE: this rebinds `dataset`, which earlier in the notebook referred to the
# LangSmith dataset returned by `client.create_dataset(...)`.
dataset = lb_client.create_dataset(name="demo_langchain")

### Attach Conversation Text to Dataset

In [None]:
# Labelbox user id and chat alignment to use for each message type.
chat_participants = {
    "human": {
        "id": "human",
        "align": "right"
    },
    "ai": {
        "id": "ai",
        "align": "left"
    },
}

# Convert the LangSmith results, then upload them as data rows.
conversation_rows = import_conversational(chain_results, chat_participants)
task = dataset.create_data_rows(conversation_rows)
task.wait_till_done()

print(task.errors)

### Cleanup

In [None]:
# dataset.delete()