# Lab | Langchain Evaluation

## Intro

Pick different sets of data and re-run this notebook. The point is for you to understand all the steps involved and the many different ways one can, and should, evaluate LLM applications.

What did you learn? - Let's discuss that in class

## LangChain: Evaluation

### Outline:

* Example generation
* Manual evaluation (and debugging)
* LLM-assisted evaluation

In [None]:
#!pip install python-dotenv

In [None]:
from dotenv import load_dotenv, find_dotenv
import os
# Load the variables from the nearest .env file into the process environment.
# To use a .env stored somewhere else (for example on a mounted Google Drive),
# pass that file's path to load_dotenv() instead of relying on find_dotenv().
_ = load_dotenv(find_dotenv())

# Read the three credentials used by this notebook; each is None when unset.
OPENAI_API_KEY, LANGCHAIN_API_KEY, HUGGINGFACEHUB_API_TOKEN = (
    os.environ.get(variable_name)
    for variable_name in (
        "OPENAI_API_KEY",
        "LANGCHAIN_API_KEY",
        "HUGGINGFACEHUB_API_TOKEN",
    )
)

In [None]:
# Report, one line per credential, whether a non-empty value was loaded.
print("\n".join(
    f"{label} loaded: {bool(value)}"
    for label, value in (
        ("OPENAI_API_KEY", OPENAI_API_KEY),
        ("LANGCHAIN_API_KEY", LANGCHAIN_API_KEY),
        ("HUGGINGFACEHUB_API_TOKEN", HUGGINGFACEHUB_API_TOKEN),
    )
))


In [None]:
# Show which key is which WITHOUT echoing the secrets in full: cell output is
# stored inside the .ipynb file, so printing raw keys leaks them to anyone the
# notebook is shared with (or to version control).
def mask_secret(secret, visible=4):
    """Return a redacted, display-safe form of ``secret``.

    Args:
        secret: The secret string, or None/"" when it is not configured.
        visible: Number of leading and trailing characters left readable.

    Returns:
        "<not set>" for a missing secret, a run of "*" for secrets too short
        to reveal anything safely, otherwise "<head>...<tail>".
    """
    if not secret:
        return "<not set>"
    if len(secret) <= 2 * visible:
        return "*" * len(secret)
    return f"{secret[:visible]}...{secret[-visible:]}"


for env_name in ("OPENAI_API_KEY", "LANGCHAIN_API_KEY", "HUGGINGFACEHUB_API_TOKEN"):
    print(f"{env_name}: {mask_secret(os.getenv(env_name))}")


In [None]:
# os.environ only accepts strings: assigning a missing (None) key would raise an
# opaque "str expected, not NoneType" TypeError, so validate the keys explicitly.
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set - add it to your .env file before continuing."
    )
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

if LANGCHAIN_API_KEY:
    # LangSmith tracing is optional: enable it only when a key is available.
    os.environ["LANGSMITH_TRACING"] = "true"
    os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
    os.environ["LANGSMITH_API_KEY"] = LANGCHAIN_API_KEY
    os.environ["LANGSMITH_PROJECT"] = "langchain_evaluation-IronHack_Lab"
else:
    os.environ["LANGSMITH_TRACING"] = "false"
    print("LANGCHAIN_API_KEY not found - LangSmith tracing is disabled.")

### Example 1

#### Create our QandA application

In [None]:
#!pip install langchain_openai

In [None]:
#!pip install langchain-community

In [None]:
#!pip install langchain-huggingface

In [None]:
#!pip install -U langchain-openai

In [None]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
from langchain.llms import OpenAI
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.document_loaders import CSVLoader, TextLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.chains import LLMChain
from langchain_openai import ChatOpenAI


In [None]:
# Folder that holds the lab's data files. It defaults to the Colab / Google Drive
# location used when this notebook was written; set the LAB_DATA_DIR environment
# variable to run the notebook anywhere else without editing the code.
file = os.path.join(
    os.getenv("LAB_DATA_DIR", "/content/drive/MyDrive/Ironhack/langchain_evaluation"),
    "OutdoorClothingCatalog_1000.csv",
)
if not os.path.isfile(file):
    raise FileNotFoundError(
        f"Catalog CSV not found at {file!r}; set LAB_DATA_DIR to the folder that contains it."
    )

# One document is loaded per CSV row.
loader = CSVLoader(file_path=file)
data = loader.load()

In [None]:
# !pip install --upgrade --force-reinstall sentence-transformers

In [None]:
#!pip install docarray

In [None]:
#!pip install --upgrade langchain pydantic

In [None]:
# Choose the embedding device at run time: a hard-coded "cuda" fails on machines
# (or Colab runtimes) without a GPU.
import torch  # assumed installed as the backend of the sentence-transformers model

embedding_device = "cuda" if torch.cuda.is_available() else "cpu"

# Embed the catalog documents and keep them in an in-memory vector store.
index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch,
    embedding=HuggingFaceEmbeddings(
        model_name="all-MiniLM-L6-v2",
        model_kwargs={"device": embedding_device},
    ),
).from_loaders([loader])

In [None]:
# Chat model at temperature 0, wired into a "stuff" RetrievalQA chain over the
# catalog index; retrieved documents are joined with the separator given below.
llm = ChatOpenAI(api_key=OPENAI_API_KEY, temperature=0.0)
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=index.vectorstore.as_retriever(),
    chain_type="stuff",
    chain_type_kwargs=dict(document_separator="<<<<>>>>>"),
    verbose=True,
)

#### Coming up with test datapoints

In [None]:
data[10]

In [None]:
data[11]

#### Hard-coded examples

In [None]:
from langchain.prompts import PromptTemplate

In [None]:
from langchain.prompts import PromptTemplate
from langchain.schema import BaseOutputParser
from pydantic import BaseModel, Field

# Hand-written question/answer pairs used as ground truth for the catalog QA chain.
# Long queries are split with adjacent string literals (joined by the parser);
# a backslash continuation inside a literal would embed the next line's
# indentation - a run of spaces - in the middle of the query text.
examples = [
    {
        "query": "Do the Cozy Comfort Pullover Set "
                 "have side pockets?",
        "answer": "Yes"
    },
    {
        "query": "What collection is the Ultra-Lofty "
                 "850 Stretch Down Hooded Jacket from?",
        "answer": "The DownTek collection"
    }
]

# Few-shot prompt: the two examples above are shown to the model, followed by the new query.
prompt_template = PromptTemplate(
    input_variables=["query"],
    template="Examples:\n"
             "1. Query: Do the Cozy Comfort Pullover Set have side pockets?\n"
             "   Answer: Yes\n"
             "2. Query: What collection is the Ultra-Lofty 850 Stretch Down Hooded Jacket from?\n"
             "   Answer: The DownTek collection\n"
             "Query: {query}\n"
             "Answer:"
)


class Answer(BaseModel):
    """Structured result produced by AnswerOutputParser."""

    answer: str = Field(description="The answer to the query")


class AnswerOutputParser(BaseOutputParser):
    """Output parser that wraps the model's reply in an Answer."""

    def parse(self, text: str) -> Answer:
        """Return the text after the last "Answer:" marker (or the whole reply), stripped."""
        answer = text.strip().split("Answer:")[-1].strip()
        return Answer(answer=answer)


# Initialize the LLM (default settings; this rebinds `llm` for the cells that follow)
llm = ChatOpenAI()

# Create the LLMChain
llm_chain = LLMChain(
    llm=llm,
    prompt=prompt_template,
    output_parser=AnswerOutputParser()
)

# Example query
query = "Is the Cozy Comfort Pullover Set available in different colors?"

# Run the chain
result = llm_chain.run({"query": query})

# Print the result
print(result)


In [None]:
#!pip install --upgrade langchain

#### LLM-Generated examples

In [None]:
from langchain.evaluation.qa import QAGenerateChain

In [None]:
example_gen_chain = QAGenerateChain.from_llm(ChatOpenAI())

In [None]:
llm_chain = LLMChain(llm=llm, prompt=prompt_template)

In [None]:
# Feed the first five loaded documents to the example-generation chain.
first_five_docs = data[:5]
new_examples = example_gen_chain.apply_and_parse(
    [{"doc": document} for document in first_five_docs]
)

In [None]:
new_examples[0]

In [None]:
data[0]

In [None]:
# Pull the value stored under "qa_pairs" out of every generated item.
d_flattened = [generated["qa_pairs"] for generated in new_examples]
d_flattened

#### Combine examples

In [None]:
# Append the LLM-generated pairs to the hand-written ones (in place).
examples.extend(d_flattened)

In [None]:
examples[0]

In [None]:
qa.invoke(examples[0]["query"])

### Manual Evaluation - Fun part

In [None]:
import langchain
langchain.debug = True

In [None]:
qa.invoke(examples[0]["query"])

In [None]:
# Turn off the debug mode
langchain.debug = False

### LLM assisted evaluation

In [None]:
# The generated pairs were already appended under "Combine examples"; appending
# them unconditionally again would evaluate every generated example twice.
# Add only the pairs that are still missing, so this cell is safe to re-run.
examples.extend([pair for pair in d_flattened if pair not in examples])

In [None]:
examples

In [None]:
predictions = qa.batch(examples)

In [None]:
predictions

In [None]:
from langchain.evaluation.qa import QAEvalChain

In [None]:
llm = ChatOpenAI(temperature=0)
eval_chain = QAEvalChain.from_llm(llm)

In [None]:
graded_outputs = eval_chain.evaluate(examples, predictions)

In [None]:
graded_outputs

In [None]:
# Show each question with its reference answer, the chain's answer and the grade
# assigned by the evaluation chain.
for i, (prediction, graded) in enumerate(zip(predictions, graded_outputs)):
    print(f"Example {i}:")
    print("Question: " + prediction['query'])
    print("Real Answer: " + prediction['answer'])
    print("Predicted Answer: " + prediction['result'])
    # The key holding the grade differs between LangChain versions
    # ("results" in recent releases, "text" in older ones), so accept either.
    grade = graded.get("results", graded.get("text", "<no grade returned>"))
    print("Predicted Grade: " + str(grade))
    print()

### Example 2
You can also evaluate your QA chains with the metrics offered by ragas.

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
# Data folder: defaults to the original Colab / Google Drive location; set the
# LAB_DATA_DIR environment variable to run elsewhere.
nyc_text_path = os.path.join(
    os.getenv("LAB_DATA_DIR", "/content/drive/MyDrive/Ironhack/langchain_evaluation"),
    "nyc_text.txt",
)
loader = TextLoader(nyc_text_path)

# Use the GPU only when one is actually available; a hard-coded "cuda" fails on CPU-only machines.
import torch  # assumed installed as the backend of the sentence-transformers model

index = VectorstoreIndexCreator(
    embedding=HuggingFaceEmbeddings(
        model_name="all-MiniLM-L6-v2",
        model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
    )
).from_loaders([loader])


# return_source_documents=True keeps the retrieved documents in the chain output,
# which the evaluation cells below turn into the "contexts" field.
llm = ChatOpenAI(temperature=0.0, api_key=OPENAI_API_KEY)
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=index.vectorstore.as_retriever(),
    return_source_documents=True,
)

In [None]:
# testing it out

question = "How did New York City get its name?"
result = qa_chain.invoke({"query": question})
result["result"]

In [None]:
result

Now, in order to evaluate the QA system, we need a few relevant questions with reference answers. We've generated a few questions for you, but feel free to add any you want.

In [None]:
# Evaluation questions about the NYC text ...
eval_questions = [
    "What is the population of New York City as of 2020?",
    "Which borough of New York City has the highest population?",
    "What is the economic significance of New York City?",
    "How did New York City get its name?",
    "What is the significance of the Statue of Liberty in New York City?",
]

# ... and the reference answer for each question, in the same order.
eval_answers = [
    "8,804,190",
    "Brooklyn",
    "New York City's economic significance is vast, as it serves as the global financial capital, housing Wall Street and major financial institutions. Its diverse economy spans technology, media, healthcare, education, and more, making it resilient to economic fluctuations. NYC is a hub for international business, attracting global companies, and boasts a large, skilled labor force. Its real estate market, tourism, cultural industries, and educational institutions further fuel its economic prowess. The city's transportation network and global influence amplify its impact on the world stage, solidifying its status as a vital economic player and cultural epicenter.",
    "New York City got its name when it came under British control in 1664. King Charles II of England granted the lands to his brother, the Duke of York, who named the city New York in his own honor.",
    "The Statue of Liberty in New York City holds great significance as a symbol of the United States and its ideals of liberty and peace. It greeted millions of immigrants who arrived in the U.S. by ship in the late 19th and early 20th centuries, representing hope and freedom for those seeking a better life. It has since become an iconic landmark and a global symbol of cultural diversity and freedom.",
]

# Pair every question with its reference answer; the answer is wrapped in a
# one-element list under "ground_truths".
examples = [
    {"query": question_text, "ground_truths": [reference_answer]}
    for question_text, reference_answer in zip(eval_questions, eval_answers)
]

In [None]:
examples

#### Introducing RagasEvaluatorChain

`RagasEvaluatorChain` (imported below as `EvaluatorChain` from `ragas.integrations.langchain`) is a wrapper around the metrics ragas provides (documented [here](https://github.com/explodinggradients/ragas/blob/main/docs/metrics.md)), making it easier to run these evaluations with langchain and langsmith.

The evaluator chain has the following APIs

- `__call__()`: call the `RagasEvaluatorChain` directly on the result of a QA chain.
- `evaluate()`: evaluate on a list of examples (with the input queries) and predictions (outputs from the QA chain).
- `evaluate_run()`: method implemented that is called by langsmith evaluators to evaluate langsmith datasets.

Let's see each of them in action to learn more.

In [None]:
result = qa_chain.invoke({"query": eval_questions[1]})
result["result"]

In [None]:
# QA-chain output key -> name expected by the evaluation step.
key_mapping = dict(
    query="question",
    result="answer",
    source_documents="contexts",
)

# Copy over every mapped key that is present in the chain result, under its new name.
result_updated = {
    renamed: result[original]
    for original, renamed in key_mapping.items()
    if original in result
}


In [None]:
result_updated

In [None]:
#!pip install --no-cache-dir recordclass

In [None]:
#!pip install ragas==0.1.9

In [None]:
from pydantic import BaseModel

In [None]:
from ragas.integrations.langchain import EvaluatorChain
# from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_relevancy,
    context_recall,
)

# Create one evaluation chain per ragas metric by wrapping the metric in an EvaluatorChain.
# faithfulness_chain and context_recall_chain are called in the cells below.
# NOTE(review): answer_rel_chain and context_rel_chain are not used in the cells visible here.
faithfulness_chain   = EvaluatorChain(metric=faithfulness)
answer_rel_chain     = EvaluatorChain(metric=answer_relevancy)
context_rel_chain    = EvaluatorChain(metric=context_relevancy)
context_recall_chain = EvaluatorChain(metric=context_recall)

1. `__call__()`

Directly run the evaluation chain with the results from the QA chain. Do note that metrics like context_relevancy and faithfulness require the `source_documents` to be present.

In [None]:
# Recheck the result that we are going to validate.
result

**Faithfulness**

In [None]:
# Rename the QA-chain output keys to the field names the ragas chains read.
key_mapping = {
    "query": "question",
    "result": "answer",
    "source_documents": "contexts"
}

result_updated = {}
for old_key, new_key in key_mapping.items():
    if old_key not in result:
        continue
    if old_key == "source_documents":
        # ragas expects `contexts` to be a LIST of strings, one per retrieved chunk.
        # Collapsing the chunks into a single string makes metrics that join the
        # contexts themselves iterate over it character by character.
        list_context = [doc.page_content for doc in result[old_key]]
        result_updated[new_key] = list_context
    else:
        result_updated[new_key] = result[old_key]

In [None]:
result_updated

In [None]:
import nest_asyncio
nest_asyncio.apply()

In [None]:
eval_result

In [None]:
eval_result = faithfulness_chain(result_updated)
eval_result["faithfulness"]

A high faithfulness score means that the answer is consistent with the source documents.

You can check lower faithfulness scores by changing the result (answer from LLM) or source_documents to something else.

In [None]:
# Build a second, independent copy of the evaluation input so its answer can be
# replaced with deliberately wrong text in the next cells.
key_mapping = {
    "query": "question",
    "result": "answer",
    "source_documents": "contexts"
}

fake_result = {}
for old_key, new_key in key_mapping.items():
    if old_key not in result:
        continue
    if old_key == "source_documents":
        # ragas expects `contexts` to be a LIST of strings, one per retrieved chunk.
        # Collapsing the chunks into a single string makes metrics that join the
        # contexts themselves iterate over it character by character.
        list_context = [doc.page_content for doc in result[old_key]]
        fake_result[new_key] = list_context
    else:
        fake_result[new_key] = result[old_key]

In [None]:
fake_result

In [None]:
fake_result["answer"] = "we are the champions"

In [None]:
fake_result

In [None]:
#fake_result = result.copy()
#fake_result["answer"] = "we are the champions"
eval_result = faithfulness_chain(fake_result)
eval_result["faithfulness"]

In [None]:
fake_result_2 = fake_result.copy()
fake_result_2["answer"] = "Brooklyn is the biggest borough of NY"
eval_result = faithfulness_chain(fake_result_2)
eval_result["faithfulness"]

In [None]:
fake_result_2

**Context Recall**

In [None]:
# Build the context-recall input: same record, with 'answer' renamed to 'ground_truth'.
# Work on a COPY and pop from that copy. Popping from `result_updated` itself
# would strip 'answer' from the original dictionary, so re-running this cell
# would fail with KeyError and the original could no longer be evaluated.
# Note that the chain's own answer is used as the ground truth here.
result_updated_gt = result_updated.copy()
result_updated_gt["ground_truth"] = result_updated_gt.pop("answer")

# Verify the updated result structure
print(result_updated_gt)

In [None]:
eval_result = context_recall_chain(result_updated_gt)
eval_result["context_recall"]

High context_recall_score means that the ground truth is present in the source documents.

You can check lower context recall scores by changing the source_documents to something else.

In [None]:
fake_result

In [None]:
# Create a new fake_result_03 based on the current fake_result structure
fake_result_03 = fake_result.copy()  # Start with a copy of fake_result

# Update the answer to the correct response
fake_result_03["answer"] = "Manhattan (New York County) has the highest population density of any borough in New York City."

# Add the ground_truth key with the correct response
fake_result_03["ground_truth"] = "Manhattan (New York County) has the highest population density of any borough in New York City."

# Modify contexts to be a single irrelevant string
fake_result_03["contexts"] = "I love christmas"

# Print the updated fake_result_03 for verification
print(fake_result_03)

In [None]:
#from langchain.schema import Document
#fake_result = result.copy()
#fake_result["source_documents"] = [Document(page_content="I love christmas")]
eval_result = context_recall_chain(fake_result_03)
eval_result["context_recall"]

In [None]:
fake_result_04 = fake_result_03.copy()
fake_result_04["contexts"] = (
    "New York City has a large population, and Manhattan is one of its boroughs. "
    "However, Brooklyn also has a significant population density."
)

In [None]:
fake_result_04

In [None]:
eval_result = context_recall_chain(fake_result_04)
eval_result["context_recall"]

In [None]:
fake_result_05 = fake_result_04.copy()
fake_result_05["contexts"] = (
    "Manhattan has a high population density. "
    "Brooklyn also has a significant population density, but Manhattan's is higher."
)

In [None]:
fake_result_05

In [None]:
eval_result = context_recall_chain(fake_result_05)
eval_result["context_recall"]

2. `evaluate()`

Evaluate a list of inputs/queries and the outputs/predictions from the QA chain.

In [None]:
examples

In [None]:
predictions

In [None]:
# run the queries as a batch for efficiency
predictions = qa_chain.batch(examples)

In [None]:
predictions

In [None]:
# Map keys as defined
key_mapping_p = {
    "query": "question",
    "ground_truths": "ground_truth",
    "result": "answer",
    "source_documents": "contexts"
}

formatted_predictions = {}
for old_key, new_key in key_mapping_p.items():
    if old_key in predictions:
        if old_key == "source_documents":  # Handle contexts specifically
            # Extract 'page_content' from each Document and ensure all are strings
            list_context = [doc.page_content for doc in predictions[old_key]]
            formatted_predictions[new_key] = ' '.join(list_context)

        else:
            formatted_predictions[new_key] = predictions[old_key]

In [None]:
# Define the key mapping
key_mapping_p = {
    "query": "question",
    "ground_truths": "ground_truth",
    "result": "answer",
    "source_documents": "contexts",
}

# Initialize the list to store formatted predictions
formatted_predictions = []

# Iterate over each prediction in the list
for prediction in predictions:
    formatted_prediction = {}
    for old_key, new_key in key_mapping_p.items():
        if old_key == "source_documents":
            # Handle 'source_documents' specifically
            list_context = [doc.page_content for doc in prediction.get(old_key, [])]
            formatted_prediction[new_key] = " ".join(list_context)
        else:
            # Map other keys directly
            formatted_prediction[new_key] = prediction.get(old_key)
    formatted_predictions.append(formatted_prediction)

# Print the formatted predictions
formatted_predictions

In [None]:
# evaluate
print("evaluating...")
#r = faithfulness_chain.evaluate(examples, predictions)
r = await faithfulness_chain.abatch(formatted_predictions)
r

In [None]:
# evaluate context recall
print("evaluating...")
cr = await context_recall_chain.abatch(formatted_predictions)
cr