# LangChain: Evaluation

## Outline:

* Example generation
* Manual evaluation (and debuging)
* LLM-assisted evaluation
* LangChain evaluation platform

In [1]:
import os

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

Note: LLM's do not always produce the same results. When executing the code in your notebook, you may get slightly different answers that those in the video.

In [2]:
# account for deprecation of LLM model
import datetime
# Get the current date
current_date = datetime.datetime.now().date()

# Define the date after which the model should be set to "gpt-3.5-turbo"
target_date = datetime.date(2024, 6, 12)

# Set the model variable based on the current date
if current_date > target_date:
    llm_model = "gpt-3.5-turbo"
else:
    llm_model = "gpt-3.5-turbo-0301"

In [6]:
import pandas as pd

table_data = {
    "name": [
        "Women's Campside Oxfords",
        "Recycled Waterhog Dog Mat, Chevron Weave",
        "Infant and Toddler Girls' Coastal Chill Swimsuit",
        "Refresh Swimwear, V-Neck Tankini Contrasts",
        "EcoFlex 3L Storm Pants"
    ],
    "description": [
        "This ultracomfortable lace-to-toe Oxford boasts durability and style, perfect for all-day wear.",
        "Protect your floors from spills and splashing. This waterhog mat is durable, stain-resistant, and easy to clean.",
        "She'll love the bright colors, ruffles, and exciting design. Ideal for swimming or beach adventures.",
        "Whether you're going for a swim or heading out, this tankini offers great support and comfort.",
        "Our new TEK O2 technology makes our four-season pants lightweight, breathable, and waterproof."
    ]
}

pd.DataFrame(table_data).to_csv("data/OutdoorClothingCatalog.csv")

## Create our Q and A application

In [13]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import CSVLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import DocArrayInMemorySearch

In [9]:
file = 'data/OutdoorClothingCatalog.csv'
loader = CSVLoader(file_path=file)
data = loader.load()

In [10]:
index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch
).from_loaders([loader])

ValidationError: 1 validation error for VectorstoreIndexCreator
embedding
  field required (type=value_error.missing)

In [12]:
llm = ChatOpenAI(temperature = 0.0)
qa = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=index.vectorstore.as_retriever(), 
    verbose=True,
    chain_type_kwargs = {
        "document_separator": "<<<<>>>>>"
    }
)

NameError: name 'index' is not defined

### Coming up with test datapoints

In [None]:
data[10]

In [None]:
data[11]

### Hard-coded examples

In [14]:
examples = [
    {
        "query": "Do the Cozy Comfort Pullover Set\
        have side pockets?",
        "answer": "Yes"
    },
    {
        "query": "What collection is the Ultra-Lofty \
        850 Stretch Down Hooded Jacket from?",
        "answer": "The DownTek collection"
    }
]

### LLM-Generated examples

In [15]:
from langchain.evaluation.qa import QAGenerateChain


In [17]:
example_gen_chain = QAGenerateChain.from_llm(ChatOpenAI())

In [None]:
# the warning below can be safely ignored

In [18]:
new_examples = example_gen_chain.apply_and_parse(
    [{"doc": t} for t in data[:5]]
)



TypeError: unsupported operand type(s) for +=: 'dict' and 'dict'

In [19]:
new_examples[0]

NameError: name 'new_examples' is not defined

In [None]:
data[0]

### Combine examples

In [None]:
examples += new_examples

In [None]:
qa.run(examples[0]["query"])

## Manual Evaluation

In [None]:
import langchain
langchain.debug = True

In [None]:
qa.run(examples[0]["query"])

In [None]:
# Turn off the debug mode
langchain.debug = False

## LLM assisted evaluation

In [None]:
predictions = qa.apply(examples)

In [None]:
from langchain.evaluation.qa import QAEvalChain

In [None]:
llm = ChatOpenAI(temperature=0, model=llm_model)
eval_chain = QAEvalChain.from_llm(llm)

In [None]:
graded_outputs = eval_chain.evaluate(examples, predictions)

In [None]:
for i, eg in enumerate(examples):
    print(f"Example {i}:")
    print("Question: " + predictions[i]['query'])
    print("Real Answer: " + predictions[i]['answer'])
    print("Predicted Answer: " + predictions[i]['result'])
    print("Predicted Grade: " + graded_outputs[i]['text'])
    print()

In [None]:
graded_outputs[0]

## LangChain evaluation platform

The LangChain evaluation platform, LangChain Plus, can be accessed here https://www.langchain.plus/.  
Use the invite code `lang_learners_2023`

Reminder: Download your notebook to you local computer to save your work.