In [5]:
import pandas as pd
from dotenv import load_dotenv
import os 
load_dotenv()
OPENAIAPI = os.getenv("OPENAIAPI")
HuggingFaceToken = os.getenv("HuggingFaceToken")
data = pd.read_excel("data.xlsx")

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer


def split_texts(
    chunk_size: int,
    knowledge_base: str,
    tokenizer_name: str,
) :
    """
    Split documents into chunks of size `chunk_size` characters and return a list of documents.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n", ".", " ", ""],
    )


    
    docs_processed = text_splitter.split_text(knowledge_base)
    return docs_processed


In [5]:
# import
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
def load_embidding(
    texts:list[str]
    )->Chroma:
    embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    db = Chroma.from_texts(texts, embedding_function)
    return db

In [40]:
texts = split_texts(
    chunk_size = 400,
    knowledge_base="\n".join(data["paragraph A"].values),
    tokenizer_name="mistralai/Mistral-7B-Instruct-v0.2",
) 
db_A = load_embidding(texts)

In [41]:
texts = split_texts(
    chunk_size = 400,
    knowledge_base="\n".join(data["paragraph B"].values),
    tokenizer_name="mistralai/Mistral-7B-Instruct-v0.2",
) 
db_B = load_embidding(texts)

In [74]:
RAG_PROMPT_TEMPLATE = """
<|system|>
You are a financial trade expert. You will be provided with excerpts from two documents.
Your task is to answer the question based on the information provided for each documents. 
Use only the information from the document as the source for answer of the document.
Separate the answer of the first documents from teh second document.
If the answer cannot be deduced from the document, do not give an answer.
Structure your output with descreption of the question, then question itself, then its answer in first document an the last its answer in the second document.
for example , if we have this question 'show me an outline of the risk factors'
its answer should be in this format :
   The first document, the risk factors outlined in  are:
        1. …
        2. …
            a. …
            b. …
        3. …

    The second document, the risk factors outlined in are:
        1. …
        2. …
        3. …
            a.
            b.
            c.
            d.
        4. …
            a. …
            b. …
                i. …
        5. …
</s>
<|user|>
first document: {first_document}

second document: {second_document}

---
Now here is the question you need to answer.

Question: {question}
</s>
<|assistant|>
"""

In [75]:
from langchain_community.llms import HuggingFaceHub

repo_id = "mistralai/Mistral-7B-Instruct-v0.2"


READER_LLM = HuggingFaceHub(
    repo_id=repo_id,
    task="text-generation",
    model_kwargs={
        "max_new_tokens": 512,
        "temperature": 0.1,
        "repetition_penalty": 1.03,
    },
    huggingfacehub_api_token = HuggingFaceToken
)

In [76]:
from ragatouille import RAGPretrainedModel
from langchain_core.language_models.llms import LLM


def answer_with_rag(
    question: str,
    llm: LLM,
    knowledge_index_A: Chroma,
    knowledge_index_B: Chroma,
    reranker: RAGPretrainedModel = None,
    num_retrieved_docs: int = 6,
    num_docs_final: int = 2,
):
    """Answer a question using RAG with the given knowledge index."""
    # Gather documents with retriever
    relevant_docs = knowledge_index_A.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs_A = [doc.page_content for doc in relevant_docs]
    relevant_docs = knowledge_index_B.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs_B = [doc.page_content for doc in relevant_docs]

    # Optionally rerank results
    if reranker:
        relevant_docs = reranker.rerank(question, relevant_docs, k=num_docs_final)
        relevant_docs = [doc["content"] for doc in relevant_docs]

    relevant_docs_A = relevant_docs_A[:num_docs_final]
    relevant_docs_B = relevant_docs_B[:num_docs_final]
    # Build the final prompt
    context_document_A = "\n\n"
    context_document_A += "".join([f"\n" + doc for i, doc in enumerate(relevant_docs_A)])
    
    context_document_B = "\n\n"
    context_document_B += "".join([f"\n" + doc for i, doc in enumerate(relevant_docs_B)])
    
    
    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, first_document=context_document_A, second_document=context_document_B)

    # Redact an answer
    answer = llm(final_prompt)

    return answer[len(final_prompt):]

In [78]:
print(answer_with_rag(
    question=data["question"].iloc[0],
    llm=READER_LLM,
    knowledge_index_A=db_A,
    knowledge_index_B=db_B,
))

The first document, the potential conflicts of interest involving the Collateral Manager as outlined in the document are:
1. Ongoing relationships with, rendering services to, engaging in transactions with, and investing in other issuers of collateralized debt obligations that invest in assets of a similar nature to those securing the Notes, and with companies whose securities are pledged to secure the Notes.
2. Possession of information relating to issuers of Portfolio Assets that is not known to the individuals at the Collateral Manager responsible for monitoring the Portfolio Assets and performing the other obligations under the Collateral Management Agreement.

The second document, the potential conflicts of interest involving the Collateral Manager as outlined in the document are:
1. Ongoing relationships with, rendering services to, engaging in transactions with, and investing in other issuers of collateralized debt obligations that invest in assets of a similar nature to those s

In [94]:
from tqdm import tqdm
def run_rag_tests(
    eval_dataset: pd.DataFrame,
    llm,
    knowledge_index_A: Chroma,
    knowledge_index_B: Chroma,
):

    outputs = []
    for index,row in tqdm(eval_dataset.iterrows()):
        question = row["question"]


        answer = answer_with_rag(question, llm, knowledge_index_A, knowledge_index_B)
        result = {
            "question": question,
            "true_answer": row["Answer in document A"] + "\n \n" + row["Answer in document B"],
            "generated_answer": answer,
        }
        outputs.append(result)
        
    return pd.DataFrame(outputs)


In [95]:
response = run_rag_tests(
    eval_dataset=data,
    llm=READER_LLM,
    knowledge_index_A=db_A,
    knowledge_index_B=db_B,
)

29it [01:20,  2.78s/it]


In [96]:
response

Unnamed: 0,question,true_answer,generated_answer
0,What are the potential conflicts of interest i...,The potential conflicts of interest involving ...,"The first document, the potential conflicts of..."
1,What is the potential conflict of interest for...,The potential conflict of interest for the Rat...,"The first document, the potential conflict of ..."
2,what are the potential conflicts of interest f...,The potential conflicts of interest for the Ho...,The documents provided do not directly address...
3,What happens to Bank Subordinated Notes in the...,"In the event of the issuer's insolvency, the B...","The first document, the risks outlined for Ban..."
4,What are the geographical concentrations of th...,The investment portfolio in the first document...,The first document does not provide specific i...
5,What are the conditions and limitations on the...,"In the first document, the Collateral Manager ...","The first document, the conditions and limitat..."
6,What are the potential risks and limitations a...,The potential risks and limitations associated...,"The first document, the potential risks and li..."
7,Can you explain the concept of Negative Carry ...,The concept of Negative Carry described in the...,"The concept of Negative Carry, as outlined in ..."
8,How do the documents describe the reliability ...,The first document describes credit ratings as...,The documents describe credit ratings as opini...
9,What are the procedures and potential impacts ...,"In the first document, collections from option...",The first document describes that collections ...


In [100]:
from langchain.output_parsers.openai_tools import PydanticToolsParser
from langchain.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.utils.function_calling import convert_to_openai_tool
from langchain_openai import ChatOpenAI
from operator import itemgetter

In [101]:
class Answer(BaseModel):
    prefix: str = Field(description="analyse of generated answer and perfect answer")
    score: int = Field(description="the score of the generated answer")

In [104]:
model = ChatOpenAI(temperature=0.01, model="gpt-4-0125-preview", api_key=OPENAIAPI)

answer_tool_oai = convert_to_openai_tool(Answer)
llm_with_tool = model.bind(
        tools=[answer_tool_oai],
        tool_choice={"type": "function", "function": {"name": "Answer"}},
    )
parser_tool = PydanticToolsParser(tools=[Answer])

In [111]:
class Evaluate:
    @classmethod
    def generate(self, perfect_answer, generated_answer,question):      
        
        
        template =  f"""
            You are a fair evaluator language model. you will be provided with question ,the perfect answer and generated answer. 
            Your task is to compare the generated answer with perfect answer base the score below:
            
            Score 1: The response is completely incorrect, inaccurate, and/or not factual.
            Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
            Score 3: The response is somewhat correct, accurate, and/or factual.
            Score 4: The response is mostly correct, accurate, and factual.
            Score 5: The response is completely correct, accurate, and factual.
            
            
            perfect answer: {perfect_answer}
            
            generated answer: {generated_answer}
            
            
            question : {question}
            
            structure your answer by analysing of generated answer and perfect answer then the score of generated answer
         """ 
         
        prompt = PromptTemplate(
            template=template,
            input_variables=["perfect_answer", "generated_answer"]
        )
        chain = (
            {
                "perfect_answer": itemgetter("perfect_answer"),
                "second_document": itemgetter("generated_answer"),
                "question": itemgetter("question"),
            }
            | prompt
            | llm_with_tool
            | parser_tool
        )
        solution = chain.invoke(
                {"perfect_answer": perfect_answer, "generated_answer": generated_answer, "question":question}
            )
        return solution

In [115]:
outputs = []
for index, row in tqdm(response.iterrows()):
    answer = Evaluate.generate(row["true_answer"], row["generated_answer"], row["question"])[0]
    outputs.append({
        "prefix":answer.prefix,
        "score":answer.score
    })
    

29it [03:39,  7.57s/it]


In [117]:
scores_using_shunks_400_words = pd.DataFrame(outputs)
scores_using_shunks_400_words

Unnamed: 0,prefix,score
0,The generated answer captures some elements of...,3
1,The generated answer essentially repeats the s...,5
2,The generated answer diverges significantly fr...,2
3,The generated answer provides a concise summar...,4
4,The generated answer fails to accurately refle...,1
5,The generated answer provides a summary of the...,3
6,The generated answer closely aligns with the p...,4
7,The generated answer accurately captures the e...,5
8,The generated answer accurately captures the e...,4
9,The generated answer provides a comprehensive ...,4


Now, let s use another method of splitting. rather than split by shunks, i will use the splitting from data

In [122]:
db_A = load_embidding(list(data["paragraph A"].values))
db_B = load_embidding(list(data["paragraph B"].values))

In [123]:
response = run_rag_tests(
    eval_dataset=data,
    llm=READER_LLM,
    knowledge_index_A=db_A,
    knowledge_index_B=db_B,
)

29it [01:49,  3.77s/it]


In [124]:
outputs = []
for index, row in tqdm(response.iterrows()):
    answer = Evaluate.generate(row["true_answer"], row["generated_answer"], row["question"])[0]
    outputs.append({
        "prefix":answer.prefix,
        "score":answer.score
    })
scores_using_original_shunks = pd.DataFrame(outputs)

29it [03:30,  7.28s/it]


In [125]:
scores_using_original_shunks

Unnamed: 0,prefix,score
0,The generated answer simplifies the potential ...,2
1,The generated answer elaborates on the potenti...,4
2,The generated answer diverges significantly fr...,2
3,The generated answer provides a detailed list ...,4
4,The generated answer fails to accurately refle...,1
5,The generated answer accurately captures the e...,3
6,The generated answer closely aligns with the p...,4
7,The generated answer provides a concise and ac...,5
8,The generated answer accurately captures the e...,4
9,The generated answer accurately captures the e...,4


In [126]:
scores_using_shunks_400_words["score"].mean()

2.9310344827586206

In [127]:
scores_using_original_shunks["score"].mean()

2.8620689655172415

Note:
As we notice, the score of Misral with original paragraphs is lower than when using chunks of 400 words, and that may be due to several reasons, such as when the input to the model is larger, it becomes less accurate.

This is just a simple RAG using chroma. By default, chroma uses the L2 metric to calculate similarity. We can test using cosine similarity instead. Additionally, we can change the size of the chunk or the embedding model to achieve a more accurate RAG system.

We can use rerank rag as another strategy:

- By having the model provide answers from zero-shot learning, then using the question along with its answer to search for ranked chunks, and finally selecting the best 'n' from the chunks.
- By having the model generate other questions similar to ours, then using them to rank chunks and choosing the best 'n' from them.

Afterward, we can use only our question to rerank them. 

These methods will help to retrieve only the relevant chunks

For evaluation, I use GPT-4 to give scores because it is more accurate in evaluating the meaning of text. However, we can also use Rouge-1 or Rouge-2 for evaluation