In [61]:
import pandas as pd
import numpy as np
from copy import deepcopy
import json
import re

from typing import List

from datasets import load_dataset, Dataset

from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

from langchain_community.document_loaders import DataFrameLoader
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.faiss import DistanceStrategy
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents.base import Document
from langchain_openai import ChatOpenAI

In [32]:
OPENAI_API_KEY = ""
OPENAI_MODEL = "gpt-3.5-turbo"

EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

DATASET = "PrimeQA/clapnq"

FAISS_PATH = "./vectorstore/clapnq"
RESULTS_PATH = "./results/clapnq"

### Embedding Model

In [33]:
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL, 
    model_kwargs={"device": "cpu"}
)

### Dataset

In [76]:
full_dataset = load_dataset(DATASET, split="validation", name="default")

dataset = Dataset.from_list([row for row in full_dataset if row["output"][0]["answer"] != ""])
dataset = dataset.select(range(100, 300))

dataset

Dataset({
    features: ['id', 'input', 'passages', 'output'],
    num_rows: 201
})

In [35]:
document_df = pd.DataFrame({
    "id": full_dataset["id"],
    "title": [passage[0]["title"] for passage in full_dataset["passages"]],
    "text": [passage[0]["text"] for passage in full_dataset["passages"]],
    "type": "chunk"
})

document_df = document_df.drop_duplicates(["text"])
document_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 597 entries, 0 to 599
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      597 non-null    object
 1   title   597 non-null    object
 2   text    597 non-null    object
 3   type    597 non-null    object
dtypes: object(4)
memory usage: 23.3+ KB


In [36]:
loader = DataFrameLoader(document_df, page_content_column="text")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 300,
    chunk_overlap = 100
)

document_chunks = loader.load()
# document_chunks = text_splitter.split_documents(document_chunks)

In [37]:
vectorstore_db = FAISS.from_documents(
    documents=document_chunks, 
    embedding=embedding_model, 
    normalize_L2=True
)

vectorstore_db.save_local(FAISS_PATH)

### Retriever

In [38]:
chunks_vectorstore_db = FAISS.load_local(
    folder_path=FAISS_PATH, 
    embeddings=embedding_model, 
    normalize_L2=True,
    allow_dangerous_deserialization=True,
)

In [39]:
question_vectorstore_db = FAISS.from_documents(
    documents=[
        Document(page_content="Mock Question", metadata={
            "id": 0, 
            "type": "question", 
            "connections": []
        })],
    embedding=embedding_model
)

### LLM Client

In [40]:
llm_agent = ChatOpenAI(
    api_key=OPENAI_API_KEY,
    model=OPENAI_MODEL,
    temperature=0.0
)

### RAG Agent

In [72]:
class RAGAgent:
    def __init__(
        self,
        client: ChatOpenAI,
        chunk_retriever: FAISS,
        question_retriever: FAISS,
        dataframe: pd.DataFrame
    ):
        self.dataframe = dataframe
        self.client = client
        self.chunk_retriever = chunk_retriever
        self.question_retriever = question_retriever

    
    def retrieve_context(
        self,
        query: str,
        top_k: int,
        distance_threshold: float,
        retrieve_questions: bool = False,
        **krawgs
    ):
        """
        Retrieve relevant documents given query
        """
        retriever = self.chunk_retriever if not retrieve_questions else self.question_retriever
        
        docs_with_metadata = retriever.similarity_search_with_score(
            query=query,
            k=top_k,
            **krawgs
        )
        filtered_docs = [doc for doc, score in docs_with_metadata if score <= distance_threshold] 
        
        if retrieve_questions:
            doc_ids = [doc for relevant_docs in filtered_docs for doc in relevant_docs.metadata["connections"]]
        else:
            doc_ids = [doc.metadata["id"] for doc in filtered_docs]

        filtered_docs = [DataFrameLoader(pd.DataFrame(self.dataframe[self.dataframe["id"] == doc_id])).load()[0] for doc_id in set(doc_ids)]
        
        return filtered_docs
    

    def generate_response(
        self,
        question: str,
        retrieved_docs: List[Document]
    ):
        """
        Generate response based on query and context documents
        """
        documents = ["Title:" + str(doc.metadata["title"]) + "\n" + str(doc.page_content) for doc in retrieved_docs]
        context_str = "\n\n".join(documents)
    
        prompt = ChatPromptTemplate.from_messages([
            # ("system", """
            #     Answer the users QUESTION using the CONTEXT text above. 
            #     Keep your answer short, direct, and relevant to the QUESTION. Strictly ground your answers to the sentences in the provided CONTEXT.
            #     If the CONTENT doesn't contain the necessary facts to answer the QUESTION, just say "I don't know".
            #  """),
             ("system", """
                ### INSTRUCTION
                Answer the users QUESTION with the CONTEXT text above. 
                Strictly ground your answer to the provided CONTEXT. You should use the sentences or keywords from the provided CONTEXT to form your answer.
                Keep your answer concise and relevant to the QUESTION.
              """),
            ("human", "### CONTEXT\n{context}\n### QUESTION\n{question}")
        ])

        chain = prompt | self.client
        response = chain.invoke({"question": question, "context": context_str})

        return response.content
        

In [70]:
rag_agent = RAGAgent(
    client=llm_agent,
    chunk_retriever=chunks_vectorstore_db,
    question_retriever=question_vectorstore_db,
    dataframe=document_df
)

### Evaluation

In [67]:
def compute_bleu_score(ref: str, cand: str):
    smoothing_func = SmoothingFunction().method1

    reference = word_tokenize(re.sub(r'[^\w\s]','', ref.lower()))
    candidate = word_tokenize(re.sub(r'[^\w\s]','', cand.lower()))

    weight_configs = (1, 0, 0, 0)

    bleu_score = sentence_bleu(
        references=[reference], 
        hypothesis=candidate, 
        weights=weight_configs, 
        smoothing_function=smoothing_func
    )

    return bleu_score


def compute_retrieval_score(ground_truth_context: str, retrieved_context: List[Document]):
    retrieved_context_str = [doc.page_content for doc in retrieved_context]
    return 1 if ground_truth_context in retrieved_context_str else 0

In [65]:
def evaluation(
    agent: RAGAgent, 
    test_set: Dataset, 
    top_k: int, 
    q_top_k: int,
    distance_threshold: float,
    q_distance_threshold: float
):
    results = []

    for i in range(len(test_set)):
        id = test_set[i]["id"]
        question = test_set[i]["input"]
        ground_truth_answer = test_set[i]["output"][0]["answer"]
        ground_truth_context = test_set[i]["passages"][0]["text"]

        relevant_chunks = agent.retrieve_context(
            query=question, 
            top_k=top_k, 
            distance_threshold=distance_threshold,
            retrieve_questions=False
        )

        relevant_question_chunks = agent.retrieve_context(
            query=question, 
            top_k=q_top_k, 
            distance_threshold=q_distance_threshold,
            retrieve_questions=True
        )

        context = relevant_question_chunks + relevant_chunks
        response = agent.generate_response(question, context)

        bleu_score = compute_bleu_score(ground_truth_answer, response)
        retrieval_score = compute_retrieval_score(ground_truth_context, context)

        results.append({
            "question": question,
            "answer": response,
            "ground_truth_answer": ground_truth_answer,
            "context": [chunk.page_content for chunk in relevant_chunks],
            "q_context": [chunk.page_content for chunk in relevant_question_chunks],
            "ground_truth_context": ground_truth_context,
            "bleu_score": bleu_score,
            "retrieval_score": retrieval_score,
            "id": id
        })
        
    return results

In [74]:
results = evaluation(
    agent=rag_agent,
    test_set=dataset,
    top_k=5,
    q_top_k=1,
    distance_threshold=1.5,
    q_distance_threshold=0.25
)

[{'question': 'what is meant by sanctification in the bible',
  'answer': 'Sanctification in the Bible refers to the act or process of acquiring sanctity, of being made or becoming holy. It is a gift given through the power of God to a person or thing which is then considered sacred or set apart in an official capacity within the religion. To sanctify is to literally "set apart for particular use in a special purpose or work and to make holy or sacred."',
  'ground_truth_answer': "Sanctification is the act or process of acquiring sanctity , of being made or becoming holy .\nTo sanctify is to literally `` set apart for particular use in a special purpose or work and to make holy or sacred . ''",
  'context': ["Sanctification is the act or process of acquiring sanctity , of being made or becoming holy . It is a gift given through the power of God to a person or thing which is then considered sacred or set apart in an official capacity within the religion . In general anything from a temp

In [16]:
with open(f"{RESULTS_PATH}/run-0.json", "w") as file:
    file.write(json.dumps(results))

In [17]:
bleu_scores = [result["bleu_score"] for result in results] 
np.mean(bleu_scores)

0.4382711866509373

In [18]:
retrieval_scores = [result["retrieval_score"] for result in results] 
np.mean(retrieval_scores)

0.9950248756218906

### Q-RAG Loop

In [19]:
new_question_vectorstore_db = deepcopy(question_vectorstore_db)

In [20]:
for i, score in enumerate(bleu_scores):
    if score < 0.5:
        id = dataset[i]["id"]
        question = dataset[i]["input"]
        ground_truth_answer = (" ").join(dataset[i]["output"][0]["answer"])

        relevant_chunk_list = rag_agent.retrieve_context(
            query=question + " " + ground_truth_answer,
            top_k=2,
            distance_threshold=1.5
        )

        relevant_chunk_id_list = [chunk.metadata["id"] for chunk in relevant_chunk_list]

        question_document = Document(
            page_content=question,
            metadata={
                "id": id,
                "type": "question", 
                "connections": relevant_chunk_id_list
            }
        )

        new_question_vectorstore_db.add_documents([question_document])

In [21]:
new_rag_agent = RAGAgent(
    client=llm_agent,
    chunk_retriever=chunks_vectorstore_db,
    question_retriever=new_question_vectorstore_db,
    dataframe=document_df
)

In [22]:
new_results = evaluation(
    agent=new_rag_agent,
    test_set=dataset,
    top_k=5,
    q_top_k=2,
    distance_threshold=1.5,
    q_distance_threshold=0.25
)

In [23]:
with open(f"{RESULTS_PATH}/run-1.json", "w") as file:
    file.write(json.dumps(new_results))

In [24]:
new_bleu_scores = [result["bleu_score"] for result in new_results] 
np.mean(new_bleu_scores)

0.4449976317072134

In [25]:
new_retrieval_scores = [result["retrieval_score"] for result in new_results] 
np.mean(new_retrieval_scores)

0.9950248756218906

### Analysis