**Evaluation with RAGAS and Advanced Retrieval Methods Using LangChain**

In [None]:
%pip install -U -q langchain openai ragas arxiv pymupdf chromadb wandb tiktoken

In [None]:
import os
import openai
from getpass import getpass

openai.api_key = getpass("Please provide your OpenAI Key: ")
os.environ["OPENAI_API_KEY"] = openai.api_key

**Data Collection**

we will use legal contract provided as context.

In [None]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("../data/Raptor Contract.pdf")
base_docs = loader.load()

In [None]:
for doc in base_docs:
  print(doc.metadata)

**Creating an Index**

Let's use a naive index creation strategy of just using RecursiveCharacterTextSplitter on our documents and embedding each into our VectorStore using OpenAIEmbeddings().

RecursiveCharacterTextSplitter()

Chroma

OpenAIEmbeddings()

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=250)

docs = text_splitter.split_documents(base_docs)

vectorstore = Chroma.from_documents(docs, OpenAIEmbeddings())

In [None]:
len(docs)

In [None]:
print(max([len(chunk.page_content) for chunk in docs]))

In [None]:
docs[0].page_content

convert our Chroma vectorstore into a retriever with the .as_retriever() method.

In [None]:
base_retriever = vectorstore.as_retriever(search_kwargs={"k" : 2})

In [None]:
relevant_docs = base_retriever.get_relevant_documents("How much is the escrow amount?")

In [None]:
len(relevant_docs)

In [None]:
relevant_docs[1]

**Creating a Retrieval Augmented Generation Prompt**

In [None]:
from langchain.prompts import ChatPromptTemplate

template = """Answer the question based only on the following context. If you cannot answer the question with the context, please respond with 'I don't know':

### CONTEXT
{context}

### QUESTION
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

**Setting Up  Basic QA Chain**

In [None]:
from operator import itemgetter

from langchain.chat_models import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough

primary_qa_llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

retrieval_augmented_qa_chain = (
    # INVOKE CHAIN WITH: {"question" : "<<SOME USER QUESTION>>"}
    # "question" : populated by getting the value of the "question" key
    # "context"  : populated by getting the value of the "question" key and chaining it into the base_retriever
    {"context": itemgetter("question") | base_retriever, "question": itemgetter("question")}
    # "context"  : is assigned to a RunnablePassthrough object (will not be called or considered in the next step)
    #              by getting the value of the "context" key from the previous step
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # "response" : the "context" and "question" values are used to format our prompt object and then piped
    #              into the LLM and stored in a key called "response"
    # "context"  : populated by getting the value of the "context" key from the previous step
    | {"response": prompt | primary_qa_llm, "context": itemgetter("context")}
)

In [None]:
question = "How much is the retention amount?"

result = retrieval_augmented_qa_chain.invoke({"question" : question})

print(result)

**Ground Truth Dataset Creation Using GPT-3.5-turbo and GPT-4**

The next section might take you a long time to run, so the evaluation dataset is provided.

The basic idea is that we can use LangChain to create questions based on our contexts, and then answer those questions.

Let's look at how that works in the code!

In [None]:
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser

question_schema = ResponseSchema(
    name="question",
    description="a question about the context."
)

question_response_schemas = [
    question_schema,
]

In [None]:
question_output_parser = StructuredOutputParser.from_response_schemas(question_response_schemas)
format_instructions = question_output_parser.get_format_instructions()

In [None]:
question_generation_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")

bare_prompt_template = "{content}"
bare_template = ChatPromptTemplate.from_template(template=bare_prompt_template)

In [None]:
from langchain.prompts import ChatPromptTemplate

qa_template = """\
You are a University Professor creating a test for advanced students. For each context, create a question that is specific to the context. Avoid creating generic or general questions.

question: a question about the context.

Format the output as JSON with the following keys:
question

context: {context}
"""

prompt_template = ChatPromptTemplate.from_template(template=qa_template)

messages = prompt_template.format_messages(
    context=docs[0],
    format_instructions=format_instructions
)

question_generation_chain = bare_template | question_generation_llm

response = question_generation_chain.invoke({"content" : messages})
output_dict = question_output_parser.parse(response.content)

In [None]:
for k, v in output_dict.items():
  print(k)
  print(v)

In [None]:
%pip install -q -U tqdm

In [None]:
from tqdm import tqdm

qac_triples = []

for text in tqdm(docs[:10]):
  messages = prompt_template.format_messages(
      context=text,
      format_instructions=format_instructions
  )
  response = question_generation_chain.invoke({"content" : messages})
  try:
    output_dict = question_output_parser.parse(response.content)
  except Exception as e:
    continue
  output_dict["context"] = text
  qac_triples.append(output_dict)

In [None]:
qac_triples[5]

In [None]:
answer_generation_llm = ChatOpenAI(model="gpt-4-1106-preview", temperature=0)

answer_schema = ResponseSchema(
    name="answer",
    description="an answer to the question"
)

answer_response_schemas = [
    answer_schema,
]

answer_output_parser = StructuredOutputParser.from_response_schemas(answer_response_schemas)
format_instructions = answer_output_parser.get_format_instructions()

qa_template = """\
You are a University Professor creating a test for advanced students. For each question and context, create an answer.

answer: a answer about the context.

Format the output as JSON with the following keys:
answer

question: {question}
context: {context}
"""

prompt_template = ChatPromptTemplate.from_template(template=qa_template)

messages = prompt_template.format_messages(
    context=qac_triples[0]["context"],
    question=qac_triples[0]["question"],
    format_instructions=format_instructions
)

answer_generation_chain = bare_template | answer_generation_llm

response = answer_generation_chain.invoke({"content" : messages})
output_dict = answer_output_parser.parse(response.content)

In [None]:
for k, v in output_dict.items():
  print(k)
  print(v)

In [1]:
for triple in tqdm(qac_triples):
  messages = prompt_template.format_messages(
      context=triple["context"],
      question=triple["question"],
      format_instructions=format_instructions
  )
  response = answer_generation_chain.invoke({"content" : messages})
  try:
    output_dict = answer_output_parser.parse(response.content)
  except Exception as e:
    continue
  triple["answer"] = output_dict["answer"]

NameError: name 'tqdm' is not defined

In [None]:
%pip install -q -U datasets

In [None]:
import pandas as pd
from datasets import Dataset

ground_truth_qac_set = pd.DataFrame(qac_triples)
ground_truth_qac_set["context"] = ground_truth_qac_set["context"].map(lambda x: str(x.page_content))
ground_truth_qac_set = ground_truth_qac_set.rename(columns={"answer" : "ground_truth"})

In [None]:
eval_dataset = Dataset.from_pandas(ground_truth_qac_set)

In [None]:
eval_dataset

In [None]:
eval_dataset[3]

In [None]:
eval_dataset.to_csv("groundtruth_eval_dataset.csv")

**Evaluating RAG Pipelines**

Evaluation Using RAGAS

Now we can evaluate using RAGAS!

In [None]:
%pip install ragas

In [None]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    context_relevancy,
    answer_correctness,
    answer_similarity
)

from ragas.metrics.critique import harmfulness
from ragas import evaluate

def create_ragas_dataset(rag_pipeline, eval_dataset):
  rag_dataset = []
  for row in tqdm(eval_dataset):
    answer = rag_pipeline.invoke({"question" : row["question"]})
    rag_dataset.append(
        {"question" : row["question"],
         "answer" : answer["response"].content,
         "contexts" : [context.page_content for context in answer["context"]],
         "ground_truths" : [row["ground_truth"]]
         }
    )
  rag_df = pd.DataFrame(rag_dataset)
  rag_eval_dataset = Dataset.from_pandas(rag_df)
  return rag_eval_dataset

def evaluate_ragas_dataset(ragas_dataset):
  result = evaluate(
    ragas_dataset,
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
        context_relevancy,
        answer_correctness,
        answer_similarity
    ],
  )
  return result

In [None]:
from tqdm import tqdm
import pandas as pd

basic_qa_ragas_dataset = create_ragas_dataset(retrieval_augmented_qa_chain, eval_dataset)

save data

In [None]:
basic_qa_ragas_dataset.to_csv("basic_qa_ragas_dataset.csv")

In [None]:
basic_qa_result = evaluate_ragas_dataset(basic_qa_ragas_dataset)

In [None]:
basic_qa_result

**Testing Other Retrievers**

Now we can test our how changing our Retriever impacts our RAGAS evaluation!

In [None]:
def create_qa_chain(retriever):
  primary_qa_llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
  created_qa_chain = (
    {"context": itemgetter("question") | retriever,
     "question": itemgetter("question")
    }
    | RunnablePassthrough.assign(
        context=itemgetter("context")
      )
    | {
         "response": prompt | primary_qa_llm,
         "context": itemgetter("context"),
      }
  )

  return created_qa_chain

Parent Document Retriever

In [None]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

parent_splitter = RecursiveCharacterTextSplitter(chunk_size=1500)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=200)

vectorstore = Chroma(collection_name="split_parents", embedding_function=OpenAIEmbeddings())

store = InMemoryStore()

In [None]:
parent_document_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

In [None]:
parent_document_retriever.add_documents(base_docs)

Let's create, test, and then evaluate our new chain!

In [None]:
parent_document_retriever_qa_chain = create_qa_chain(parent_document_retriever)

In [None]:
parent_document_retriever_qa_chain.invoke({"question" : "What is the retention amount"})["response"].content


In [None]:
parent_document_retriever_qa_chain.invoke({"question" : "What is the retention amount"})["response"].content

In [None]:
pdr_qa_ragas_dataset.to_csv("pdr_qa_ragas_dataset.csv")