Here we evaluate the performance of our application. Interestingly, we can use LLMs themselves to gauge how our RAG pipeline is performing.

In [3]:
import os
from dotenv import load_dotenv, find_dotenv
import datetime

from langchain.chains import RetrievalQA
from langchain.chat_models import ChatGooglePalm
from langchain.embeddings import GooglePalmEmbeddings
from langchain.document_loaders import CSVLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores.pinecone import Pinecone
from langchain.evaluation.qa import QAGenerateChain, QAEvalChain

### Setting up

In [4]:
palm_key = os.getenv('PALM_API_KEY')

In [7]:
loader = CSVLoader("data/sample.csv")
data = loader.load()

In [10]:
llm = ChatGooglePalm(google_api_key=palm_key, temperature=0.0)
embeddings = GooglePalmEmbeddings(google_api_key=palm_key)

In [20]:
import os
import pinecone

# initialize pinecone
pinecone.init(
    api_key=os.getenv("PINECONE_API_KEY_02"),  # find at app.pinecone.io
    environment=os.getenv("PINECONE_ENV"),  # next to api key in console
)

In [22]:
docsearch = Pinecone.from_documents(data, embeddings, index_name='langchain-demo')

### Retrieval

In [42]:

qa = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=docsearch.as_retriever(), 
    verbose=True,
)

### QAGenerateChain
Automates the creation of question answer sets for evaluatiom.

In [24]:
llm = ChatGooglePalm(google_api_key=palm_key)

In [25]:
# LLM-Generated examples
example_gen_chain = QAGenerateChain.from_llm(llm=llm)

# the warning below can be safely ignored
examples = example_gen_chain.apply_and_parse(
    [{"doc": t} for t in data[:5]]
)



In [33]:
examples

[{'qa_pairs': {'query': "What is John Doe's email address?",
   'answer': 'john.doe@email.com'}},
 {'qa_pairs': {'query': "What is Jane Smith's email address?",
   'answer': 'jane.smith@email.com'}},
 {'qa_pairs': {'query': "What is Bob Johnson's email address?",
   'answer': 'bob.johnson@email.com'}},
 {'qa_pairs': {'query': "What is Alice Williams's email address?",
   'answer': 'alice.williams@email.com'}},
 {'qa_pairs': {'query': "What is Charlie Brown's email address?",
   'answer': 'charlie.brown@email.com'}}]

### Evaluation

In [43]:
qa.run(examples[0]['qa_pairs']['answer'])



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


'John Doe is a 28-year-old man who lives in New York City. His phone number is 555-1234.'

#### Debugging
This allows us to see the details of our chain execution. You set `debug=True`

In [44]:

# Manual Evaluation
import langchain
langchain.debug = True

qa.run(examples[0]['qa_pairs']['answer'])

# Turn off the debug mode
langchain.debug = False

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "john.doe@email.com"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "john.doe@email.com",
  "context": "Name: John Doe\nAge: 28\nEmail: john.doe@email.com\nPhone: 555-1234\nCity: New York\n\nName: Bob Johnson\nAge: 42\nEmail: bob.johnson@email.com\nPhone: 555-9876\nCity: Chicago\n\nName: Larry Green\nAge: 40\nEmail: larry.green@email.com\nPhone: 555-1234\nCity: San Diego\n\nName: Jane Smith\nAge: 35\nEmail: jane.smith@email.com\nPhone: 555-5678\nCity: Los Angeles"
}
[32;1m[1;3m[llm/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain > 5:llm:ChatGooglePalm] Entering LLM run with input:
[0m{
  "prompts": [
    

### QAEvalChain

In [49]:
# Postprocessing
examples_ = [example['qa_pairs'] for example in examples]

In [None]:

# LLM assisted evaluation
predictions = qa.apply(examples_)

eval_chain = QAEvalChain.from_llm(llm)

graded_outputs = eval_chain.evaluate(examples_, predictions)

In [53]:

for i, eg in enumerate(examples_):
    print(f"Example {i}:")
    print("Question: " + predictions[i]['query'])
    print("Real Answer: " + predictions[i]['answer'])
    print("Predicted Answer: " + predictions[i]['result'])
    # print("Predicted Grade: " + graded_outputs[i]['text'])
    print()

graded_outputs[0]

# LangChain evaluation platform
# The LangChain evaluation platform, LangChain Plus, can be accessed here https://www.langchain.plus/.
# Use the invite code `lang_learners_2023`
# Reminder: Download your notebook to your local computer to save your work.


Example 0:
Question: What is John Doe's email address?
Real Answer: john.doe@email.com
Predicted Answer: John Doe's email address is john.doe@email.com.

Example 1:
Question: What is Jane Smith's email address?
Real Answer: jane.smith@email.com
Predicted Answer: Jane Smith's email address is jane.smith@email.com.

Example 2:
Question: What is Bob Johnson's email address?
Real Answer: bob.johnson@email.com
Predicted Answer: Bob Johnson's email address is bob.johnson@email.com.

Example 3:
Question: What is Alice Williams's email address?
Real Answer: alice.williams@email.com
Predicted Answer: Alice Williams's email address is alice.williams@email.com.

Example 4:
Question: What is Charlie Brown's email address?
Real Answer: charlie.brown@email.com
Predicted Answer: Charlie Brown's email address is charlie.brown@email.com. He is 25 years old and lives in Seattle.



{'results': "CORRECT\n\nThe student's answer is the same as the true answer."}

### LangChain evaluation platform
Allows evaluation in a similar fashion as we have done but with the aid of a nice UI.

The LangChain evaluation platform, `LangChain Plus`, can be accessed [here](https://www.langchain.plus/). Use the **invite code**: `lang_learners_2023`