## Short-form QnA

### Set-up

In [1]:
import json
import os
import random
import re
from copy import deepcopy
from typing import List

import numpy as np
import pandas as pd
from datasets import Dataset, load_dataset
from langchain_community.document_loaders import DataFrameLoader
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.faiss import DistanceStrategy
from langchain_core.documents.base import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from nltk import word_tokenize
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

In [11]:
# Credentials and model name for the answer-generating chat model.
# The key is read from the environment so it never has to be pasted into the
# notebook; the fallback is the empty string that used to be hard-coded here.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
OPENAI_MODEL = "gpt-3.5-turbo"

# Sentence-embedding model handed to every FAISS store in this notebook.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# Hugging Face Hub identifier of the dataset that is loaded for evaluation.
DATASET = "rajpurkar/squad"

# Where the FAISS index and the per-run evaluation results are written.
FAISS_PATH = "./vectorstore/squad"
RESULTS_PATH = "./results/squad"

### Embedding Model

In [3]:
# Embedding model shared by the passage store and the question store.
# model_kwargs pins inference to the CPU.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL, 
    model_kwargs={"device": "cpu"}
)

### Dataset

In [4]:
# Full validation split; it is also the source of the passage corpus below.
full_dataset = load_dataset(DATASET, split="validation")

# Seed the global RNG so the same 200-question evaluation subset is drawn
# every time the notebook is executed.
random.seed(13)
rand_indices = random.sample(range(len(full_dataset)), k=200)
dataset = full_dataset.select(rand_indices)

# Trailing expression: shows the sampled subset as the cell output.
dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 200
})

In [5]:
# Passage corpus: one row per distinct context text. Duplicate contexts are
# collapsed on "text", and drop_duplicates keeps the first occurrence, so the
# surviving "id" is the one attached to the first row with that context.
# Every row is tagged with the constant type "chunk".
document_df = pd.DataFrame(
    {
        "id": full_dataset["id"],
        "title": full_dataset["title"],
        "text": full_dataset["context"],
        "type": "chunk",
    }
).drop_duplicates(subset=["text"])
document_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2067 entries, 0 to 10565
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      2067 non-null   object
 1   title   2067 non-null   object
 2   text    2067 non-null   object
 3   type    2067 non-null   object
dtypes: object(4)
memory usage: 80.7+ KB


In [6]:
# Turn each passage row into a Document whose content is the "text" column.
loader = DataFrameLoader(document_df, page_content_column="text")
# Splitter configured for pieces of at most 300 characters with 100 characters
# of overlap between neighbouring pieces.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 300,
    chunk_overlap = 100
)

# document_chunks first holds the whole passages, then is rebound to the
# split pieces that actually get indexed.
document_chunks = loader.load()
document_chunks = text_splitter.split_documents(document_chunks)

In [7]:
# Embed every chunk and build the FAISS index over them, with L2
# normalisation requested on the store.
vectorstore_db = FAISS.from_documents(
    documents=document_chunks, 
    embedding=embedding_model, 
    normalize_L2=True
)

# Persist the index so the retriever section can reload it from FAISS_PATH.
vectorstore_db.save_local(FAISS_PATH)

### Retriever

In [8]:
# Reload the passage index that was saved to FAISS_PATH earlier in this notebook.
# SECURITY NOTE: allow_dangerous_deserialization=True opts in to unpickling the
# stored docstore. That is only acceptable for index files this notebook wrote
# itself; never point FAISS_PATH at an index obtained from an untrusted source.
chunks_vectorstore_db = FAISS.load_local(
    folder_path=FAISS_PATH, 
    embeddings=embedding_model, 
    normalize_L2=True,
    allow_dangerous_deserialization=True,
)

In [9]:
# Question store, seeded with a single placeholder entry.
# NOTE(review): presumably the placeholder exists because from_documents needs
# at least one document to build an index - confirm.
# "connections" is the list of passage ids a stored question points to; the
# placeholder's list is empty, so matching it contributes no passages.
question_vectorstore_db = FAISS.from_documents(
    documents=[
        Document(page_content="Mock Question", metadata={
            "id": 0, 
            "type": "question", 
            "connections": []
        })],
    embedding=embedding_model
)

### LLM Client

In [12]:
# Chat model used to generate answers; temperature is set to 0.0 for every call.
llm_agent = ChatOpenAI(
    api_key=OPENAI_API_KEY,
    model=OPENAI_MODEL,
    temperature=0.0
)

### RAG Agent

In [13]:
class RAGAgent:
    """Retrieval-augmented question answering over two FAISS stores.

    The chunk store indexes passage pieces; the question store indexes
    questions whose metadata carries a "connections" list of passage ids.
    Whichever store is searched, hits are mapped back to full passage rows
    of ``dataframe`` (matched on its "id" column) before being returned.
    """

    def __init__(
        self,
        client: ChatOpenAI,
        chunk_retriever: FAISS,
        question_retriever: FAISS,
        dataframe: pd.DataFrame
    ):
        """
        Store the collaborators.

        Args:
            client: chat model used by generate_response.
            chunk_retriever: vector store searched for passage chunks.
            question_retriever: vector store searched for stored questions.
            dataframe: passage table with at least "id", "title" and "text"
                columns; retrieved ids are resolved against it.
        """
        self.dataframe = dataframe
        self.client = client
        self.chunk_retriever = chunk_retriever
        self.question_retriever = question_retriever

    def retrieve_context(
        self,
        query: str,
        top_k: int,
        distance_threshold: float,
        retrieve_questions: bool = False,
        **kwargs
    ):
        """
        Retrieve the full passages relevant to a query.

        Args:
            query: text to search with.
            top_k: number of nearest neighbours requested from the store.
            distance_threshold: hits whose score is above this value are
                discarded (the score is treated as a distance: lower is closer).
            retrieve_questions: search the question store and follow each hit's
                "connections" instead of searching the chunk store directly.
            **kwargs: forwarded to similarity_search_with_score.

        Returns:
            One Document per distinct passage id, in the order the ids were
            first produced by the retriever.

        Raises:
            IndexError: if a retrieved id has no row in the dataframe.
        """
        retriever = self.question_retriever if retrieve_questions else self.chunk_retriever

        scored_hits = retriever.similarity_search_with_score(
            query=query,
            k=top_k,
            **kwargs
        )
        hits = [doc for doc, score in scored_hits if score <= distance_threshold]

        if retrieve_questions:
            # A stored question contributes every passage it is connected to.
            doc_ids = [doc_id for hit in hits for doc_id in hit.metadata["connections"]]
        else:
            # Several chunks of one passage share that passage's id.
            doc_ids = [hit.metadata["id"] for hit in hits]

        # dict.fromkeys de-duplicates while keeping first-seen order. A set
        # would drop the retrieval ranking, and its iteration order for string
        # ids changes between interpreter runs, which reshuffles the prompt.
        unique_ids = list(dict.fromkeys(doc_ids))

        # Reload each id as a full-passage Document (content taken from "text",
        # the remaining columns become metadata).
        return [
            DataFrameLoader(
                self.dataframe[self.dataframe["id"] == doc_id],
                page_content_column="text"
            ).load()[0]
            for doc_id in unique_ids
        ]

    def generate_response(
        self,
        question: str,
        retrieved_docs: List[Document]
    ):
        """
        Generate a short answer to the question from the retrieved passages.

        Args:
            question: the user question.
            retrieved_docs: passages to ground the answer in; each must carry
                a "title" entry in its metadata.

        Returns:
            The text content of the chat model's reply.
        """
        documents = ["Title:" + str(doc.metadata["title"]) + "\n" + str(doc.page_content) for doc in retrieved_docs]
        context_str = "\n\n".join(documents)

        # NOTE(review): the instruction refers to the CONTEXT "above", yet the
        # context is sent in the human message that follows it. The wording is
        # kept as is so results stay comparable with earlier runs.
        prompt = ChatPromptTemplate.from_messages([
             ("system", """
                ### INSTRUCTION
                Answer the users QUESTION by extracting from the CONTEXT text above. 
                You should only use keywords from the provided CONTEXT to form your answer.
                Keep your answer concise and short, just in a few words.
              """),
            ("human", "### CONTEXT\n{context}\n### QUESTION\n{question}")
        ])

        chain = prompt | self.client
        response = chain.invoke({"question": question, "context": context_str})

        return response.content
        

### Evaluation

In [14]:
# Baseline agent: its question store holds only the placeholder entry.
rag_agent = RAGAgent(
    client=llm_agent,
    chunk_retriever=chunks_vectorstore_db,
    question_retriever=question_vectorstore_db,
    dataframe=document_df
)

In [15]:
def compute_bleu_score(ref_list: List[str], cand: str):
    """
    Unigram BLEU score of a candidate answer against reference answers.

    Text is lower-cased and stripped of every character that is neither a
    word character nor whitespace before it is tokenized.

    Args:
        ref_list: acceptable reference answers. A single string is also
            accepted and treated as one reference.
        cand: the generated answer to score.

    Returns:
        The sentence-level BLEU score with all weight on unigrams.
    """
    # Without this, a bare string would be iterated character by character and
    # every character would become its own reference.
    if isinstance(ref_list, str):
        ref_list = [ref_list]

    def _tokenize(text: str) -> List[str]:
        # Lower-case, drop punctuation, then split into word tokens.
        return word_tokenize(re.sub(r'[^\w\s]', '', text.lower()))

    references = [_tokenize(ref) for ref in ref_list]
    hypothesis = _tokenize(cand)

    # Weight only 1-grams: the answers are a few words long, so higher-order
    # n-grams would rarely match.
    unigram_weights = (1, 0, 0, 0)

    return sentence_bleu(
        references=references,
        hypothesis=hypothesis,
        weights=unigram_weights,
        smoothing_function=SmoothingFunction().method1
    )


def compute_retrieval_score(ground_truth_context: str, retrieved_context: List[Document]):
    """
    Return 1 when the text of some retrieved document is exactly equal to the
    ground truth context, otherwise 0
    """
    retrieved_texts = [document.page_content for document in retrieved_context]
    found = any(text == ground_truth_context for text in retrieved_texts)
    return int(found)

In [16]:
def evaluation(
    agent: RAGAgent, 
    test_set: Dataset, 
    top_k: int, 
    q_top_k: int,
    distance_threshold: float,
    q_distance_threshold: float
):
    """
    Run the agent over every sample of the test set and score the outcome.

    Args:
        agent: the RAG agent under evaluation.
        test_set: iterable of samples with "id", "question", "context" and
            "answers" (whose "text" entry lists the reference answers).
        top_k: neighbours requested from the chunk store.
        q_top_k: neighbours requested from the question store.
        distance_threshold: maximum accepted distance for chunk hits.
        q_distance_threshold: maximum accepted distance for question hits.

    Returns:
        One dict per sample holding the question, the generated answer, the
        references, both retrieved context lists, the ground truth context,
        the BLEU score, the retrieval score and the sample id.
    """
    results = []

    # Iterate rows directly: each sample is materialised once instead of once
    # per field, and no local ends up shadowing the builtin id().
    for sample in test_set:
        question = sample["question"]
        ground_truth_answer = sample["answers"]["text"]
        ground_truth_context = sample["context"]

        relevant_chunks = agent.retrieve_context(
            query=question,
            top_k=top_k,
            distance_threshold=distance_threshold,
            retrieve_questions=False
        )

        relevant_question_chunks = agent.retrieve_context(
            query=question,
            top_k=q_top_k,
            distance_threshold=q_distance_threshold,
            retrieve_questions=True
        )

        # Passages reached through stored questions are placed ahead of the
        # directly retrieved ones in the prompt.
        context = relevant_question_chunks + relevant_chunks
        response = agent.generate_response(question, context)

        results.append({
            "question": question,
            "answer": response,
            "ground_truth_answer": ground_truth_answer,
            "context": [chunk.page_content for chunk in relevant_chunks],
            "q_context": [chunk.page_content for chunk in relevant_question_chunks],
            "ground_truth_context": ground_truth_context,
            "bleu_score": compute_bleu_score(ground_truth_answer, response),
            "retrieval_score": compute_retrieval_score(ground_truth_context, context),
            "id": sample["id"]
        })

    return results

In [17]:
# Baseline run: at most 5 chunk hits and 1 question hit per query. The
# question threshold (0.25) is far tighter than the chunk threshold (1.5).
results = evaluation(
    agent=rag_agent,
    test_set=dataset,
    top_k=5,
    q_top_k=1,
    distance_threshold=1.5,
    q_distance_threshold=0.25
)

In [18]:
# open() does not create parent directories, so make sure the results
# directory exists before writing the baseline run.
os.makedirs(RESULTS_PATH, exist_ok=True)
with open(f"{RESULTS_PATH}/run-0.json", "w", encoding="utf-8") as file:
    json.dump(results, file)

In [19]:
# Per-question BLEU scores of the baseline run, in evaluation order; the
# trailing expression displays their mean.
bleu_scores = [result["bleu_score"] for result in results] 
np.mean(bleu_scores)

0.7302206150734558

In [20]:
# Per-question 0/1 retrieval hits of the baseline run; their mean is the
# fraction of questions whose ground truth passage was retrieved.
retrieval_scores = [result["retrieval_score"] for result in results] 
np.mean(retrieval_scores)

0.925

### Q-RAG Loop

In [21]:
# Work on a deep copy so the baseline question store stays untouched.
new_question_vectorstore_db = deepcopy(question_vectorstore_db)

In [22]:
# Q-RAG update: every question the baseline answered poorly (BLEU below 0.5)
# is stored in the question index together with the ids of the passages
# retrieved for "question + longest reference answer".
# NOTE(review): the reference answers come from the same samples that are
# evaluated again afterwards, so the second run is not a held-out measurement
# - confirm this is intended.
# bleu_scores is in the same order as dataset, so the two are zipped; each row
# is read once, and the id is bound to question_id so the builtin id() is not
# overwritten for the rest of the notebook.
for row, score in zip(dataset, bleu_scores):
    if score < 0.5:
        question_id = row["id"]
        question = row["question"]
        # Sorted by length, so the last element is the longest reference answer.
        ground_truth_answer_list = sorted(row["answers"]["text"], key=len)
        ground_truth_answer = ground_truth_answer_list[-1]

        relevant_chunk_list = rag_agent.retrieve_context(
            query=question + " " + ground_truth_answer,
            top_k=2,
            distance_threshold=1.5
        )

        relevant_chunk_id_list = [chunk.metadata["id"] for chunk in relevant_chunk_list]

        question_document = Document(
            page_content=question,
            metadata={
                "id": question_id,
                "type": "question",
                "connections": relevant_chunk_id_list
            }
        )

        new_question_vectorstore_db.add_documents([question_document])

In [23]:
# Same client, chunk store and passage table as the baseline agent; only the
# question store differs (the copy extended by the Q-RAG loop).
new_rag_agent = RAGAgent(
    client=llm_agent,
    chunk_retriever=chunks_vectorstore_db,
    question_retriever=new_question_vectorstore_db,
    dataframe=document_df
)

In [24]:
# Second run on the same 200 questions with the extended question store.
# NOTE(review): q_top_k is 2 here but 1 in the baseline run, so the two runs
# differ in more than the question store - confirm this is intended.
new_results = evaluation(
    agent=new_rag_agent,
    test_set=dataset,
    top_k=5,
    q_top_k=2,
    distance_threshold=1.5,
    q_distance_threshold=0.25
)

In [25]:
# open() does not create parent directories, so make sure the results
# directory exists before writing the second run.
os.makedirs(RESULTS_PATH, exist_ok=True)
with open(f"{RESULTS_PATH}/run-1.json", "w", encoding="utf-8") as file:
    json.dump(new_results, file)

In [26]:
# Per-question BLEU scores of the second run; the trailing expression
# displays their mean.
new_bleu_scores = [result["bleu_score"] for result in new_results] 
np.mean(new_bleu_scores)

0.77242408767743

In [27]:
# Per-question 0/1 retrieval hits of the second run; their mean is the
# fraction of questions whose ground truth passage was retrieved.
new_retrieval_scores = [result["retrieval_score"] for result in new_results] 
np.mean(new_retrieval_scores)

0.97