# Experiments

### Setup

In [2]:
# Or you can use a .env file
from dotenv import load_dotenv
load_dotenv(dotenv_path="../../.env", override=True)

False

Here is the RAG Application that we've been working with throughout this course

In [3]:
import os
import tempfile
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.sitemap import SitemapLoader
from langchain_community.vectorstores import SKLearnVectorStore
from langchain_openai import OpenAIEmbeddings
from langsmith import traceable
from openai import OpenAI
from typing import List
import nest_asyncio

# TODO: Configure this model!
MODEL_NAME = "gpt-4o"
MODEL_PROVIDER = "openai"
APP_VERSION = 1.0
RAG_SYSTEM_PROMPT = """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the latest question in the conversation. 
If you don't know the answer, just say that you don't know. 
Use three sentences maximum and keep the answer concise.
"""

openai_client = OpenAI()

def get_vector_db_retriever():
    persist_path = os.path.join(tempfile.gettempdir(), "union.parquet")
    embd = OpenAIEmbeddings()

    # If vector store exists, then load it
    if os.path.exists(persist_path):
        vectorstore = SKLearnVectorStore(
            embedding=embd,
            persist_path=persist_path,
            serializer="parquet"
        )
        return vectorstore.as_retriever(lambda_mult=0)

    # Otherwise, index LangSmith documents and create new vector store
    ls_docs_sitemap_loader = SitemapLoader(web_path="https://docs.smith.langchain.com/sitemap.xml", continue_on_failure=True)
    ls_docs = ls_docs_sitemap_loader.load()

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=500, chunk_overlap=0
    )
    doc_splits = text_splitter.split_documents(ls_docs)

    vectorstore = SKLearnVectorStore.from_documents(
        documents=doc_splits,
        embedding=embd,
        persist_path=persist_path,
        serializer="parquet"
    )
    vectorstore.persist()
    return vectorstore.as_retriever(lambda_mult=0)

nest_asyncio.apply()
retriever = get_vector_db_retriever()

"""
retrieve_documents
- Returns documents fetched from a vectorstore based on the user's question
"""
@traceable(run_type="chain")
def retrieve_documents(question: str):
    return retriever.invoke(question)

"""
generate_response
- Calls `call_openai` to generate a model response after formatting inputs
"""
@traceable(run_type="chain")
def generate_response(question: str, documents):
    formatted_docs = "\n\n".join(doc.page_content for doc in documents)
    messages = [
        {
            "role": "system",
            "content": RAG_SYSTEM_PROMPT
        },
        {
            "role": "user",
            "content": f"Context: {formatted_docs} \n\n Question: {question}"
        }
    ]
    return call_openai(messages)

"""
call_openai
- Returns the chat completion output from OpenAI
"""
@traceable(
    run_type="llm",
    metadata={
        "ls_provider": MODEL_PROVIDER,
        "ls_model_name": MODEL_NAME
    }
)
def call_openai(messages: List[dict]) -> str:
    return openai_client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
    )

"""
langsmith_rag
- Calls `retrieve_documents` to fetch documents
- Calls `generate_response` to generate a response based on the fetched documents
- Returns the model response
"""
@traceable(run_type="chain")
def langsmith_rag(question: str):
    documents = retrieve_documents(question)
    response = generate_response(question, documents)
    return response.choices[0].message.content


USER_AGENT environment variable not set, consider setting it to identify your requests.


### Experiment

Here is a code snippet that should look similar to what you see from the starter code!

There are a few important components here.

1. We have defined an Evaluator
2. We pipe our dataset examples (dict) to the shape of input that our function `langsmith_rag` takes (str) using a target function

In [8]:
from langsmith import evaluate, Client

client = Client()
dataset_name = "RAG_APPLICATION_GOLDENSET"

def is_concise_enough(reference_outputs: dict, outputs: dict) -> dict:
    score = len(outputs["output"]) < 1.5 * len(reference_outputs["output"])
    return {"key": "is_concise", "score": int(score)}

def target_function(inputs: dict):
    return langsmith_rag(inputs["question"])

evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough],
    experiment_prefix="gpt-4o"
)

  from .autonotebook import tqdm as notebook_tqdm


View the evaluation results for experiment: 'gpt-4o-d1f75659' at:
https://smith.langchain.com/o/42e50e11-148e-4f9c-8b19-47e3bd6993cb/datasets/f99cd610-6db1-470e-9057-95d5fda623ae/compare?selectedSessions=47ffe2f2-a357-4e11-a41e-473ee2055539




60it [06:42,  6.70s/it]


Unnamed: 0,inputs.question,outputs.output,error,reference.output,feedback.is_concise,execution_time,example_id,id
0,What testing capabilities does LangSmith have?,LangSmith provides off-the-shelf evaluators an...,,LangSmith offers capabilities for creating dat...,1,2.703597,004d0dcd-88f1-4327-82cd-c50efc399605,d4543e57-6fd0-481a-9307-fa6208a54df4
1,Can LangSmith be used for finetuning and model...,LangSmith is primarily a platform for monitori...,,"Yes, LangSmith can be used for fine-tuning and...",1,4.284395,33c29d9e-79af-473a-b037-9f6b41784fc7,92116bdd-c30c-4ccd-b7bf-1283b03a32ae
2,What is LangSmith used for in three sentences?,LangSmith is a platform designed for building ...,,LangSmith is a platform designed for the devel...,1,2.08118,54477beb-206d-4787-9be4-28fe0e3ea412,96901716-8946-4106-8148-8e910079795d
3,How do I pass metadata in with @traceable?,To pass metadata with the `@traceable` functio...,,You can pass metadata with the @traceable deco...,1,4.756402,668716de-05cf-4525-964e-88c0e41be715,0f02ebe5-dc20-4a67-8668-d21cacb1654f
4,How can I trace with the @traceable decorator?,To trace with the @traceable decorator in Pyth...,,To trace with the @traceable decorator in Pyth...,1,2.854872,6af21ca3-dcf1-423f-bf73-216333deb110,5f7f9e91-ea23-49f5-9e45-d12c5edffb71
5,who is the goat,"I don't have enough context to determine who ""...",,"The term ""GOAT"" stands for ""Greatest of All Ti...",1,2.119927,79190fb3-5853-4e83-bfcf-5a577c389b29,11bbefce-fd46-4049-ac22-7531844b13c5
6,Can LangSmith be used to evaluate agents?,"Yes, LangSmith can be used to evaluate agents....",,"Yes, LangSmith can be used to evaluate agents....",1,3.416949,83c9615d-7c8a-4d24-bb9b-52790602c0e6,99d4fd8d-f32f-40d3-936e-55cfea97e190
7,How do I create user feedback with the LangSmi...,To create user feedback using the LangSmith SD...,,To create user feedback with the LangSmith SDK...,1,1.9683,a36e8f8d-3217-4d86-8092-8eb9f4ce03aa,b1fe6a90-ab73-4373-b1d1-c77adbb298c9
8,Does LangSmith support offline evaluation?,The provided context specifically discusses La...,,"Yes, LangSmith supports offline evaluation thr...",1,1.803144,cb681615-e370-4d82-b757-8c1fb6a3b3a7,b00c3618-18ee-493d-adfa-156b3431e870
9,Does LangSmith support online evaluation?,"Yes, LangSmith supports online evaluation. It ...",,"Yes, LangSmith supports online evaluation as a...",1,2.081657,d46d0430-2802-44b5-9006-58d4b83b0b96,e208e028-5535-41a8-a488-b0b2fc373c12


### Modifying your Application

Now, let's change our model to gpt-35-turbo and see how it performs!

Make this change, and then run this code snippet!

In [9]:
from langsmith import evaluate, Client
from langsmith.schemas import Example, Run

def target_function(inputs: dict):
    return langsmith_rag(inputs["question"])

evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough],
    experiment_prefix="gpt-3.5-turbo"
)

View the evaluation results for experiment: 'gpt-3.5-turbo-be312a99' at:
https://smith.langchain.com/o/42e50e11-148e-4f9c-8b19-47e3bd6993cb/datasets/f99cd610-6db1-470e-9057-95d5fda623ae/compare?selectedSessions=b0209bf8-9967-4d7a-9f0c-6829a4891041




60it [04:39,  4.65s/it]


Unnamed: 0,inputs.question,outputs.output,error,reference.output,feedback.is_concise,execution_time,example_id,id
0,What testing capabilities does LangSmith have?,LangSmith provides testing capabilities that i...,,LangSmith offers capabilities for creating dat...,1,2.255147,004d0dcd-88f1-4327-82cd-c50efc399605,6358ef99-6803-42a0-b493-420e2d66a4de
1,Can LangSmith be used for finetuning and model...,LangSmith is primarily designed for LLM observ...,,"Yes, LangSmith can be used for fine-tuning and...",1,3.956869,33c29d9e-79af-473a-b037-9f6b41784fc7,97ffd99c-320d-4826-8507-dc29247dca23
2,What is LangSmith used for in three sentences?,LangSmith is used for building production-grad...,,LangSmith is a platform designed for the devel...,1,4.190504,54477beb-206d-4787-9be4-28fe0e3ea412,be8ff2ab-561a-4a5c-99c2-29fb2673837f
3,How do I pass metadata in with @traceable?,To pass metadata with the `@traceable` decorat...,,You can pass metadata with the @traceable deco...,1,3.686474,668716de-05cf-4525-964e-88c0e41be715,8361ca48-1e65-4b84-a356-b354189e389d
4,How can I trace with the @traceable decorator?,"To trace with the @traceable decorator, simply...",,To trace with the @traceable decorator in Pyth...,1,2.159964,6af21ca3-dcf1-423f-bf73-216333deb110,33cf4dcb-8fdc-4c2c-b0e9-f2f96926f17b
5,who is the goat,I don't know.,,"The term ""GOAT"" stands for ""Greatest of All Ti...",1,1.341921,79190fb3-5853-4e83-bfcf-5a577c389b29,1232a98d-ae41-4f55-bd71-65a9033bae50
6,Can LangSmith be used to evaluate agents?,"Yes, LangSmith can be used to evaluate agents....",,"Yes, LangSmith can be used to evaluate agents....",1,1.589107,83c9615d-7c8a-4d24-bb9b-52790602c0e6,db2b7d04-cc2a-4755-9579-29d03098c649
7,How do I create user feedback with the LangSmi...,To create user feedback with the LangSmith SDK...,,To create user feedback with the LangSmith SDK...,1,2.885517,a36e8f8d-3217-4d86-8092-8eb9f4ce03aa,39eccd84-eba1-4351-a2fe-e2cb3080b5e6
8,Does LangSmith support offline evaluation?,"No, LangSmith focuses on online evaluations, p...",,"Yes, LangSmith supports offline evaluation thr...",1,1.664425,cb681615-e370-4d82-b757-8c1fb6a3b3a7,44509c42-31c9-49fa-85be-db67c749f40a
9,Does LangSmith support online evaluation?,"Yes, LangSmith supports online evaluation. It ...",,"Yes, LangSmith supports online evaluation as a...",1,1.776159,d46d0430-2802-44b5-9006-58d4b83b0b96,d1e18542-9ae1-4e4b-b787-6e9626afd542


### Running over Different pieces of Data

##### Dataset Version

You can execute an experiment on a specific version of a dataset in the sdk by using the `as_of` parameter in `list_examples`

Let's try running on just our initial dataset.

In [11]:
examples = client.list_examples(dataset_name=dataset_name)
if not examples:
    raise ValueError(f"No examples found in dataset '{dataset_name}'. Upload examples first.")

evaluate(
    target_function,
    data=examples,
    evaluators=[is_concise_enough],
    experiment_prefix="initial dataset version"
)


View the evaluation results for experiment: 'initial dataset version-59080c13' at:
https://smith.langchain.com/o/42e50e11-148e-4f9c-8b19-47e3bd6993cb/datasets/f99cd610-6db1-470e-9057-95d5fda623ae/compare?selectedSessions=eed2a119-7a5a-4f5b-855d-81336c0fbea5




60it [11:36, 11.61s/it]


Unnamed: 0,inputs.question,outputs.output,error,reference.output,feedback.is_concise,execution_time,example_id,id
0,What testing capabilities does LangSmith have?,LangSmith has capabilities for evaluating and ...,,LangSmith offers capabilities for creating dat...,1,2.050156,004d0dcd-88f1-4327-82cd-c50efc399605,b4135f5d-d0af-4051-9615-1e7061fb4b48
1,Can LangSmith be used for finetuning and model...,LangSmith is primarily a platform for LLM obse...,,"Yes, LangSmith can be used for fine-tuning and...",1,1.921885,33c29d9e-79af-473a-b037-9f6b41784fc7,27ac6e26-ed41-42d8-851a-0ab3ab2cce2b
2,What is LangSmith used for in three sentences?,LangSmith is a platform designed for building ...,,LangSmith is a platform designed for the devel...,1,1.77182,54477beb-206d-4787-9be4-28fe0e3ea412,b4cf4212-7a4d-44d2-ba97-d75f4155bd06
3,How do I pass metadata in with @traceable?,To pass metadata with the `@traceable` decorat...,,You can pass metadata with the @traceable deco...,1,193.922243,668716de-05cf-4525-964e-88c0e41be715,4b2b8d83-3cc9-4cc5-b744-d1fd00881445
4,How can I trace with the @traceable decorator?,To trace with the @traceable decorator in Pyth...,,To trace with the @traceable decorator in Pyth...,1,2.142741,6af21ca3-dcf1-423f-bf73-216333deb110,32f5dc37-6ee9-45ae-8c81-80944cfc936e
5,who is the goat,"The term ""GOAT"" stands for ""Greatest of All Ti...",,"The term ""GOAT"" stands for ""Greatest of All Ti...",1,165.675826,79190fb3-5853-4e83-bfcf-5a577c389b29,461d1da0-049f-44b8-91aa-d903ad5c8b3a
6,Can LangSmith be used to evaluate agents?,"Yes, LangSmith can be used to evaluate agents....",,"Yes, LangSmith can be used to evaluate agents....",1,2.167572,83c9615d-7c8a-4d24-bb9b-52790602c0e6,d6cc7697-7256-4dad-be4b-916da0b52fff
7,How do I create user feedback with the LangSmi...,To create user feedback with the LangSmith SDK...,,To create user feedback with the LangSmith SDK...,1,72.895918,a36e8f8d-3217-4d86-8092-8eb9f4ce03aa,4a6f122d-7ac8-48df-a69a-73c84e74cbbd
8,Does LangSmith support offline evaluation?,LangSmith primarily supports online evaluation...,,"Yes, LangSmith supports offline evaluation thr...",1,2.175571,cb681615-e370-4d82-b757-8c1fb6a3b3a7,6ad5ba0e-a255-4363-ab09-c71306d84a5d
9,Does LangSmith support online evaluation?,"Yes, LangSmith supports online evaluation. It ...",,"Yes, LangSmith supports online evaluation as a...",1,2.147019,d46d0430-2802-44b5-9006-58d4b83b0b96,ecbc8810-2a87-40e1-afc2-f4b8f45aa461


##### Dataset Split

You can run an experiment on a specific split of your dataset, let's try running on the Crucial Examples split.

In [None]:
evaluate(
    target_function,
    data=client.list_examples(dataset_name=dataset_name, splits=["Crucial Examples"]),  # We pass in a list of Splits
    evaluators=[is_concise_enough],
    experiment_prefix="Crucial Examples split"
)


TypeError: object of type 'generator' has no len()

##### Specific Data Points

You can specify individual data points to run an experiment over as well

In [None]:
evaluate(
    target_function,
    data=client.list_examples(
        dataset_name=dataset_name, 
        example_ids=[   # We pass in a specific list of example_ids
            # TODO: You will need to paste in your own example ids for this to work!
            "",
            ""
        ]
    ),
    evaluators=[is_concise_enough],
    experiment_prefix="two specific example ids"
)

### Other Parameters

##### Repetitions

You can run an experiment several times to make sure you have consistent results

In [20]:
evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough],
    experiment_prefix="two repetitions",
    num_repetitions=2   # This field defaults to 1
)

View the evaluation results for experiment: 'two repetitions-e4d26a66' at:
https://smith.langchain.com/o/42e50e11-148e-4f9c-8b19-47e3bd6993cb/datasets/f99cd610-6db1-470e-9057-95d5fda623ae/compare?selectedSessions=bf22a1dd-68a8-47c3-8e2f-fe083b33a5f8




120it [16:35,  8.30s/it]


Unnamed: 0,inputs.question,outputs.output,error,reference.output,feedback.is_concise,execution_time,example_id,id
0,What testing capabilities does LangSmith have?,LangSmith offers testing capabilities through ...,,LangSmith offers capabilities for creating dat...,1,1.920850,004d0dcd-88f1-4327-82cd-c50efc399605,2f26096f-fb0d-48f5-b787-687300d289d3
1,Can LangSmith be used for finetuning and model...,"No, LangSmith is not designed for finetuning a...",,"Yes, LangSmith can be used for fine-tuning and...",1,1.682784,33c29d9e-79af-473a-b037-9f6b41784fc7,b4bee408-fd94-4c48-84f1-d34412a2291d
2,What is LangSmith used for in three sentences?,LangSmith is a platform designed for building ...,,LangSmith is a platform designed for the devel...,1,1.832005,54477beb-206d-4787-9be4-28fe0e3ea412,0abf8f39-b3ff-49f0-ab6b-5af5d0b56fc9
3,How do I pass metadata in with @traceable?,"To pass metadata with `@traceable`, you can in...",,You can pass metadata with the @traceable deco...,1,2.640297,668716de-05cf-4525-964e-88c0e41be715,10e4d607-a64d-4150-8f5c-875f6c08f628
4,How can I trace with the @traceable decorator?,"To trace with the @traceable decorator, you ne...",,To trace with the @traceable decorator in Pyth...,1,358.287350,6af21ca3-dcf1-423f-bf73-216333deb110,cef7a070-9d98-4b78-9498-e982b4a33413
...,...,...,...,...,...,...,...,...
115,"My item arrived damaged, what should I do?","If your item arrived damaged, you should conta...",,We are sorry to hear that. Please contact our ...,0,1.928860,359162da-dfc9-4dc3-aaad-79587975c6c3,b14ff5f0-3992-4bbd-a95e-f4a182d43ada
116,How do I apply a discount code?,The provided context does not mention any spec...,,You can apply your discount code at checkout. ...,0,1.618449,592d3c5b-d2ef-42fb-8e20-3ce704806994,438080b2-ba01-46f8-8885-d0236386259e
117,What is your return policy?,"I'm sorry, but I don't have information about ...",,You can return any item within 30 days of purc...,1,2.907023,91bf9a13-93df-492d-a778-c216c692c1ac,223a2e1f-059d-4db1-a9cf-237e2307fe7c
118,How can I track my order?,"I'm sorry, but there is no information provide...",,"Once your order has shipped, you will receive ...",1,2.212856,c20cd740-4d52-40ac-a220-caf710464216,663f0bdb-3ff1-430d-98cf-4d5d5ce2d685


##### Concurrency
You can also kick off concurrent threads of execution to make your experiments finish faster!

In [19]:
evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough],
    experiment_prefix="concurrency",
    max_concurrency=3,  # This defaults to None, so this is an improvement!
)

View the evaluation results for experiment: 'concurrency-8e608f46' at:
https://smith.langchain.com/o/42e50e11-148e-4f9c-8b19-47e3bd6993cb/datasets/f99cd610-6db1-470e-9057-95d5fda623ae/compare?selectedSessions=485ec480-80d1-4e0a-8422-aab497a772dc




60it [01:50,  1.84s/it]


Unnamed: 0,inputs.question,outputs.output,error,reference.output,feedback.is_concise,execution_time,example_id,id
0,What is LangSmith used for in three sentences?,LangSmith is used for building and managing pr...,,LangSmith is a platform designed for the devel...,1,2.435124,54477beb-206d-4787-9be4-28fe0e3ea412,f33c6b41-36c7-4758-abe9-8eaa2d646a1f
1,What testing capabilities does LangSmith have?,LangSmith offers testing capabilities like eva...,,LangSmith offers capabilities for creating dat...,1,3.326116,004d0dcd-88f1-4327-82cd-c50efc399605,b0b6357d-98e6-4311-a7e3-c35857c19fb3
2,How do I pass metadata in with @traceable?,"To pass metadata with `@traceable`, you can ut...",,You can pass metadata with the @traceable deco...,1,3.284772,668716de-05cf-4525-964e-88c0e41be715,a7c606cc-7242-48d4-80b5-1006977e8ea0
3,How can I trace with the @traceable decorator?,To trace with the @traceable decorator in Pyth...,,To trace with the @traceable decorator in Pyth...,1,2.445694,6af21ca3-dcf1-423f-bf73-216333deb110,659965c1-6185-40bc-bbac-21e14a4b5822
4,Can LangSmith be used for finetuning and model...,LangSmith is designed for observability and ev...,,"Yes, LangSmith can be used for fine-tuning and...",1,7.822879,33c29d9e-79af-473a-b037-9f6b41784fc7,c944fb70-c74f-4556-8c51-441108bb44a8
5,Can LangSmith be used to evaluate agents?,"Yes, LangSmith can be used to evaluate agents....",,"Yes, LangSmith can be used to evaluate agents....",1,2.346841,83c9615d-7c8a-4d24-bb9b-52790602c0e6,18d5cbb3-983c-4475-8069-fadb0aa9753d
6,who is the goat,"The term ""GOAT"" stands for ""Greatest of All Ti...",,"The term ""GOAT"" stands for ""Greatest of All Ti...",1,2.863065,79190fb3-5853-4e83-bfcf-5a577c389b29,17fc0c59-adcb-47c4-ac14-5a3cd9471581
7,How do I create user feedback with the LangSmi...,To create user feedback with the LangSmith SDK...,,To create user feedback with the LangSmith SDK...,1,1.81062,a36e8f8d-3217-4d86-8092-8eb9f4ce03aa,c98bc2a2-a0a6-4f25-a495-a24286f98208
8,Does LangSmith support online evaluation?,"Yes, LangSmith supports online evaluation. It ...",,"Yes, LangSmith supports online evaluation as a...",1,2.183072,d46d0430-2802-44b5-9006-58d4b83b0b96,5df9117a-72e0-49e2-9040-89b7a626d6bc
9,How do I pass metadata in with @traceable?,To pass metadata with the `@traceable` decorat...,,You can pass metadata with the @traceable deco...,1,3.088984,05ccb055-9509-4dcf-b6c3-1d4d128b5a6a,94cd431a-0ea5-4f94-95dd-d936f6955c68


##### Metadata 

You can (and should) add metadata to your experiments, to make them easier to find in the UI

In [None]:
evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough],
    experiment_prefix="metadata added",
    metadata={  # We can pass custom metadata for the experiment, such as the model name
        "model_name": MODEL_NAME
    }
)

View the evaluation results for experiment: 'metadata added-a759c96a' at:
https://smith.langchain.com/o/42e50e11-148e-4f9c-8b19-47e3bd6993cb/datasets/f99cd610-6db1-470e-9057-95d5fda623ae/compare?selectedSessions=f65fe924-42a0-492f-ac52-800327ee3a57




19it [00:49,  2.12s/it]