# Experiments

### Setup

In [1]:
# You can set them inline
import os

In [2]:
# Or you can use a .env file
from dotenv import load_dotenv
load_dotenv()

True

Here is the RAG Application that we've been working with throughout this course

In [None]:
import tempfile
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.sitemap import SitemapLoader
from langchain_community.vectorstores import SKLearnVectorStore
from langchain_openai import OpenAIEmbeddings
from langsmith import traceable
from openai import OpenAI
from typing import List
from .autonotebook import tqdm as notebook_tqdm
import nest_asyncio

# TODO: Configure this model!
MODEL_NAME = "gpt-5-mini"
MODEL_PROVIDER = "openai"
APP_VERSION = 1.0
RAG_SYSTEM_PROMPT = """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the latest question in the conversation. 
If you don't know the answer, just say that you don't know. 
Use three sentences maximum and keep the answer concise.
"""

openai_client = OpenAI()

def get_vector_db_retriever():
    persist_path = os.path.join(tempfile.gettempdir(), "union.parquet")
    embd = OpenAIEmbeddings()

    # If vector store exists, then load it
    if os.path.exists(persist_path):
        vectorstore = SKLearnVectorStore(
            embedding=embd,
            persist_path=persist_path,
            serializer="parquet"
        )
        return vectorstore.as_retriever(lambda_mult=0)

    # Otherwise, index LangSmith documents and create new vector store
    ls_docs_sitemap_loader = SitemapLoader(web_path="https://docs.smith.langchain.com/sitemap.xml", continue_on_failure=True)
    ls_docs = ls_docs_sitemap_loader.load()

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=500, chunk_overlap=0
    )
    doc_splits = text_splitter.split_documents(ls_docs)

    vectorstore = SKLearnVectorStore.from_documents(
        documents=doc_splits,
        embedding=embd,
        persist_path=persist_path,
        serializer="parquet"
    )
    vectorstore.persist()
    return vectorstore.as_retriever(lambda_mult=0)

nest_asyncio.apply()
retriever = get_vector_db_retriever()

"""
retrieve_documents
- Returns documents fetched from a vectorstore based on the user's question
"""
@traceable(run_type="chain")
def retrieve_documents(question: str):
    return retriever.invoke(question)

"""
generate_response
- Calls `call_openai` to generate a model response after formatting inputs
"""
@traceable(run_type="chain")
def generate_response(question: str, documents):
    formatted_docs = "\n\n".join(doc.page_content for doc in documents)
    messages = [
        {
            "role": "system",
            "content": RAG_SYSTEM_PROMPT
        },
        {
            "role": "user",
            "content": f"Context: {formatted_docs} \n\n Question: {question}"
        }
    ]
    return call_openai(messages)

"""
call_openai
- Returns the chat completion output from OpenAI
"""
@traceable(
    run_type="llm",
    metadata={
        "ls_provider": MODEL_PROVIDER,
        "ls_model_name": MODEL_NAME
    }
)
def call_openai(messages: List[dict]) -> str:
    return openai_client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
    )

"""
langsmith_rag
- Calls `retrieve_documents` to fetch documents
- Calls `generate_response` to generate a response based on the fetched documents
- Returns the model response
"""
@traceable(run_type="chain")
def langsmith_rag(question: str):
    documents = retrieve_documents(question)
    response = generate_response(question, documents)
    return response.choices[0].message.content


USER_AGENT environment variable not set, consider setting it to identify your requests.


### Experiment

Here is a code snippet that should look similar to what you see from the starter code!

There are a few important components here.

1. We have defined an Evaluator
2. We pipe our dataset examples (dict) to the shape of input that our function `langsmith_rag` takes (str) using a target function

In [4]:
from langsmith import evaluate, Client

client = Client()
dataset_name = "RAG Application Golden Dataset"

def is_concise_enough(reference_outputs: dict, outputs: dict) -> dict:
    score = len(outputs["output"]) < 1.5 * len(reference_outputs["output"])
    return {"key": "is_concise", "score": int(score)}

def target_function(inputs: dict):
    return langsmith_rag(inputs["question"])

evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough],
    experiment_prefix="gpt-5-mini"
)

  from .autonotebook import tqdm as notebook_tqdm


View the evaluation results for experiment: 'gpt-5-mini-37675534' at:
https://smith.langchain.com/o/4b2fe8a0-2e7b-4d38-9887-3df612dd2c6f/datasets/56615187-150f-49b6-8ed8-22566d0a2590/compare?selectedSessions=13114021-f84a-47e3-a93f-83bcf21d2746




16it [03:26, 12.93s/it]


Unnamed: 0,inputs.question,outputs.output,error,reference.output,feedback.is_concise,execution_time,example_id,id
0,Is Langsmith free to use?,I don't know — the provided context doesn't st...,,"Yes, LangSmith offers a free tier with limited...",1,7.31105,e606ea04-f291-45b3-bb69-267ef2b2a22d,4992b4a3-8766-4dbb-ac0f-f6fb63f00e87
1,How do I set up tracing to LangSmith if I'm us...,Run a LangSmith instance and configure LangCha...,,To set up tracing to LangSmith using LangChain...,1,9.45903,213e8c4f-f7f6-4d5c-b393-036858868ba3,ee81233b-00e9-4066-bb69-73163aeb9930
2,Can LangSmith be used for finetuning and model...,"No. LangSmith is focused on observability, eva...",,"Yes, LangSmith can be used for fine-tuning and...",1,6.520541,556184e2-afb9-4445-9f98-caed1815fe03,00230422-3cf3-4198-bfe5-af29a55f33c6
3,Is there a Javascript Langsmith SDK?,Yes — LangSmith provides a JavaScript/TypeScri...,,"Yes, there is a Javascript Langsmith SDK",0,6.43771,77083a6a-f4d3-4314-82f7-27073e7cb73e,7b0d59ed-4de5-499f-aa17-482ab0e9e2a9
4,Is there a way to visualize the data collected...,Yes. LangSmith’s web UI lets you view all expe...,,"Yes, LangSmith provides visualization tools th...",1,7.698908,80c917e4-3d4a-4c2f-9094-d5b8533acdd2,7d9dc9af-7f05-45b9-bc87-75576c4ce740
5,Does LangSmith support offline evaluation?,I don’t know. The provided documentation only ...,,"Yes, LangSmith supports offline evaluation thr...",1,6.830265,82764de6-bcaf-40b6-888e-a0ecd89e2575,59740fb5-316b-4555-9035-d65d7f5fd413
6,Can I integrate LangSmith with my custom appli...,Yes. You can integrate LangSmith with your cus...,,"Yes, LangSmith can be integrated with custom a...",1,78.275839,90fabeec-04fa-48cc-b50c-4fcdded0dfa3,6b5349da-6e9d-483e-a8e5-07547406db03
7,What are the key features of LangSmith?,LangSmith is a modular platform consisting of ...,,"LangSmith offers features such as tracing, eva...",1,8.595617,b95b32ff-9550-4110-b550-e71848bbe0a3,6f150b0f-c229-4cc8-aaf7-8921ead9e9e1
8,Can LangSmith be used to evaluate agents?,Yes. LangSmith is a framework-agnostic platfor...,,"Yes, LangSmith can be used to evaluate agents....",1,5.13345,32147caf-4adb-4894-9540-3548fdfb76bd,0fb7ed26-14ae-4005-b8a8-5331f894a177
9,How do I pass metadata in with @traceable?,Pass it in the options object you give to trac...,,You can pass metadata with the @traceable deco...,1,14.591577,39ad3e4a-8fd3-4b61-99b1-0c162e70f3d6,9dce4672-231d-455e-a5e2-474b2a18e324


### Modifying your Application

Now, let's change our model to gpt-35-turbo and see how it performs!

Make this change, and then run this code snippet!

In [9]:
from langsmith import evaluate, Client
from langsmith.schemas import Example, Run

def target_function(inputs: dict):
    return langsmith_rag(inputs["question"])

evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough],
    experiment_prefix="gpt-3.5-turbo"
)

View the evaluation results for experiment: 'gpt-3.5-turbo-444fd348' at:
https://smith.langchain.com/o/4b2fe8a0-2e7b-4d38-9887-3df612dd2c6f/datasets/56615187-150f-49b6-8ed8-22566d0a2590/compare?selectedSessions=29a9d503-30d5-4af0-ac3a-79afe18c87cc




3it [00:22,  7.51s/it]


KeyboardInterrupt: 

### Running over Different pieces of Data

##### Dataset Version

You can execute an experiment on a specific version of a dataset in the sdk by using the `as_of` parameter in `list_examples`

Let's try running on just our initial dataset.

In [11]:
evaluate(
    target_function,
    data=client.list_examples(dataset_name=dataset_name, split="Base split"),   # We use as_of to specify a version
    evaluators=[is_concise_enough],
    experiment_prefix="Base split"
)

View the evaluation results for experiment: 'Base split-1bbf83a6' at:
https://smith.langchain.com/o/4b2fe8a0-2e7b-4d38-9887-3df612dd2c6f/datasets/56615187-150f-49b6-8ed8-22566d0a2590/compare?selectedSessions=fcf839a8-63f8-4af0-8fb0-8ba1dc7883d8




16it [01:44,  6.51s/it]


Unnamed: 0,inputs.question,outputs.output,error,reference.output,feedback.is_concise,execution_time,example_id,id
0,Is Langsmith free to use?,I don't know — the provided excerpts don't sta...,,"Yes, LangSmith offers a free tier with limited...",0,6.03381,e606ea04-f291-45b3-bb69-267ef2b2a22d,f040b866-afa0-4563-ad66-51c2335807af
1,How do I set up tracing to LangSmith if I'm us...,Point your LangChain tracer (or the LangSmith ...,,To set up tracing to LangSmith using LangChain...,1,6.852783,213e8c4f-f7f6-4d5c-b393-036858868ba3,2f569b53-0053-4f2e-8c5f-5df0bb90cf11
2,Can LangSmith be used for finetuning and model...,"No. LangSmith is a platform for observability,...",,"Yes, LangSmith can be used for fine-tuning and...",1,6.010242,556184e2-afb9-4445-9f98-caed1815fe03,99dc377a-e6be-4c8b-8b13-12ac5647cefa
3,Is there a Javascript Langsmith SDK?,Yes — LangSmith provides a JavaScript/TypeScri...,,"Yes, there is a Javascript Langsmith SDK",0,5.5823,77083a6a-f4d3-4314-82f7-27073e7cb73e,d810f813-f019-4ee3-a399-bd29a1f4e12d
4,Is there a way to visualize the data collected...,Yes. LangSmith's web UI lets you view experime...,,"Yes, LangSmith provides visualization tools th...",1,5.949821,80c917e4-3d4a-4c2f-9094-d5b8533acdd2,5fc71d15-f4b2-4c70-8843-9ec770729742
5,Does LangSmith support offline evaluation?,Yes. In addition to real-time online evaluator...,,"Yes, LangSmith supports offline evaluation thr...",1,9.295455,82764de6-bcaf-40b6-888e-a0ecd89e2575,48ecb07e-d997-4d8e-a1d7-8772cbe102bc
6,Can I integrate LangSmith with my custom appli...,Yes. You can integrate LangSmith into custom a...,,"Yes, LangSmith can be integrated with custom a...",1,5.44614,90fabeec-04fa-48cc-b50c-4fcdded0dfa3,6161e969-95c4-4ac3-9633-cbd53847f863
7,What are the key features of LangSmith?,LangSmith is a deployable platform for buildin...,,"LangSmith offers features such as tracing, eva...",1,6.453071,b95b32ff-9550-4110-b550-e71848bbe0a3,85ba95ee-4961-4839-b0f2-d66ad60717b1
8,Can LangSmith be used to evaluate agents?,Yes. LangSmith supports evaluating agents — it...,,"Yes, LangSmith can be used to evaluate agents....",1,4.550407,32147caf-4adb-4894-9540-3548fdfb76bd,736924d7-0da8-4b21-a990-020b81551b28
9,How do I pass metadata in with @traceable?,Pass it in the traceable options (second argum...,,You can pass metadata with the @traceable deco...,1,10.069163,39ad3e4a-8fd3-4b61-99b1-0c162e70f3d6,42f6f49a-ad26-43cc-ad18-819117daafd7


##### Dataset Split

You can run an experiment on a specific split of your dataset, let's try running on the Crucial Examples split.

In [12]:
evaluate(
    target_function,
    data=client.list_examples(dataset_name=dataset_name, splits=["Crucial"]),  # We pass in a list of Splits
    evaluators=[is_concise_enough],
    experiment_prefix="Crucial split"
)

View the evaluation results for experiment: 'Crucial split-d4fe78e0' at:
https://smith.langchain.com/o/4b2fe8a0-2e7b-4d38-9887-3df612dd2c6f/datasets/56615187-150f-49b6-8ed8-22566d0a2590/compare?selectedSessions=188abec4-3878-48c5-a22c-141958ddb8b6




5it [00:31,  6.25s/it]


Unnamed: 0,inputs.question,outputs.output,error,reference.output,feedback.is_concise,execution_time,example_id,id
0,How do I set up tracing to LangSmith if I'm us...,"Run a LangSmith instance, then configure LangC...",,To set up tracing to LangSmith using LangChain...,1,8.280815,213e8c4f-f7f6-4d5c-b393-036858868ba3,8079a4d0-76e1-499e-a74f-0265165471c3
1,Can LangSmith be used for finetuning and model...,No. LangSmith is an observability/evaluation a...,,"Yes, LangSmith can be used for fine-tuning and...",1,5.96112,556184e2-afb9-4445-9f98-caed1815fe03,77f2f138-db95-41b7-ad4d-9c6bb9c5bb37
2,Is there a Javascript Langsmith SDK?,Yes — LangSmith provides a JavaScript/TypeScri...,,"Yes, there is a Javascript Langsmith SDK",0,5.83899,77083a6a-f4d3-4314-82f7-27073e7cb73e,a8fd862e-b7e2-4822-a485-5ae73158322d
3,Is there a way to visualize the data collected...,Yes. LangSmith provides built‑in UI views to i...,,"Yes, LangSmith provides visualization tools th...",1,4.753686,80c917e4-3d4a-4c2f-9094-d5b8533acdd2,c4b67c61-2f35-40e3-9505-23d3e516f04e
4,Does LangSmith support offline evaluation?,I don't know — the provided docs only describe...,,"Yes, LangSmith supports offline evaluation thr...",1,5.920731,82764de6-bcaf-40b6-888e-a0ecd89e2575,b3729635-9112-4c83-b568-c997d6bde961


##### Specific Data Points

You can specify individual data points to run an experiment over as well

In [13]:
evaluate(
    target_function,
    data=client.list_examples(
        dataset_name=dataset_name, 
        example_ids=[   # We pass in a specific list of example_ids
            # TODO: You will need to paste in your own example ids for this to work!
            "e606ea04-f291-45b3-bb69-267ef2b2a22d",
        ]
    ),
    evaluators=[is_concise_enough],
    experiment_prefix="one specific example id"
)

View the evaluation results for experiment: 'one specific example ids-dc4feadc' at:
https://smith.langchain.com/o/4b2fe8a0-2e7b-4d38-9887-3df612dd2c6f/datasets/56615187-150f-49b6-8ed8-22566d0a2590/compare?selectedSessions=eeb17045-0fc5-4783-b661-5e3f4f80d112




1it [00:04,  4.71s/it]


Unnamed: 0,inputs.question,outputs.output,error,reference.output,feedback.is_concise,execution_time,example_id,id
0,Is Langsmith free to use?,I don't know — the provided excerpt doesn't st...,,"Yes, LangSmith offers a free tier with limited...",1,4.199421,e606ea04-f291-45b3-bb69-267ef2b2a22d,f3f942b7-a4a3-4cfc-81c5-bc5271cd3428


### Other Parameters

##### Repetitions

You can run an experiment several times to make sure you have consistent results

In [None]:
evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough],
    experiment_prefix="two repetitions",
    num_repetitions=2   # This field defaults to 1
)

##### Concurrency
You can also kick off concurrent threads of execution to make your experiments finish faster!

In [None]:
evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough],
    experiment_prefix="concurrency",
    max_concurrency=3,  # This defaults to None, so this is an improvement!
)

##### Metadata 

You can (and should) add metadata to your experiments, to make them easier to find in the UI

In [None]:
evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough],
    experiment_prefix="metadata added",
    metadata={  # We can pass custom metadata for the experiment, such as the model name
        "model_name": MODEL_NAME
    }
)