In [None]:
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model handed to the vector store as its embedding function.
embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large-instruct")

# Open the Chroma collection stored under `persist_directory`.
persist_directory = "../database/anwesha_chroma"
vectorstore = Chroma(embedding_function=embeddings, persist_directory=persist_directory)

# Retriever with the store's default search settings (no arguments passed).
retriever = vectorstore.as_retriever()

In [12]:
from langchain_community.vectorstores import Chroma

In [13]:
# Same store/retriever setup as the first cell of the notebook; it relies on
# `embeddings` having been created there.
persist_directory = "../database/anwesha_chroma"
vectorstore = Chroma(embedding_function=embeddings, persist_directory=persist_directory)
retriever = vectorstore.as_retriever()

In [14]:
from langchain_groq import ChatGroq

# Chat model used for answer generation and, later in the notebook, as the ragas evaluator LLM.
llm = ChatGroq(model="moonshotai/kimi-k2-instruct")

In [15]:
from langchain.prompts import ChatPromptTemplate

# RAG-Fusion: prompt that asks the model to expand one question into several
# related search queries. `{question}` is the only template variable.
template = (
    "You are a helpful assistant that generates multiple search queries based on a single input query. \n\n"
    "Generate multiple search queries related to: {question} \n\n"
    "Output (4 queries):"
)
prompt_rag_fusion = ChatPromptTemplate.from_template(template)

In [16]:
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

def _split_queries(text):
    """Split the model's reply into one query per line.

    Blank or whitespace-only lines are dropped so that empty strings are never
    forwarded to the retriever as search queries.

    Args:
        text: Raw string produced by the LLM.

    Returns:
        List of non-empty, whitespace-stripped lines in their original order.
    """
    return [line.strip() for line in text.split("\n") if line.strip()]


# Query-generation chain: fill the RAG-Fusion prompt, call the chat model,
# take the reply as plain text, then split it into individual queries.
# Reuses `llm` (same Groq model) instead of constructing a second client.
generate_queries = (
    prompt_rag_fusion
    | llm
    | StrOutputParser()
    | _split_queries
)

In [17]:
# Sample question (Bengali): "How many years younger was Anupam than his maternal uncle?"
question = "অনুপম তার মামার চেয়ে কত বছরের ছোট ছিল?"

In [18]:
from langchain.load import dumps, loads


def reciprocal_rank_fusion(results: list[list], k=60):
    """Fuse several ranked document lists into one ranking (Reciprocal Rank Fusion).

    Every document earns ``1 / (rank + k)`` for each list it appears in, where
    ``rank`` is its zero-based position in that list; the contributions are
    summed across all lists.

    Args:
        results: One ranked list of documents per query. Each document must be
            serializable with ``dumps`` and restorable with ``loads``.
        k: Constant added to the rank in the denominator of the RRF formula.

    Returns:
        A list of ``(document, fused_score)`` tuples sorted by descending
        score. Documents with equal scores keep their first-seen order
        because ``sorted`` is stable.
    """
    fused_scores = {}

    for docs in results:
        for rank, doc in enumerate(docs):
            # The serialized form is the dict key, so the same document
            # returned for different queries accumulates into one entry.
            doc_str = dumps(doc)
            fused_scores[doc_str] = fused_scores.get(doc_str, 0) + 1 / (rank + k)

    ranked = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
    return [(loads(doc_str), score) for doc_str, score in ranked]


# Retrieval pipeline: expand the question into several queries, run the
# retriever once per query, then fuse the per-query rankings.
retrieval_chain_rag_fusion = (
    generate_queries
    | retriever.map()
    | reciprocal_rank_fusion
)

# Smoke test on the sample question; the cell displays how many fused results came back.
docs = retrieval_chain_rag_fusion.invoke({"question": question})
len(docs)

6

In [19]:
from langchain_core.runnables import RunnablePassthrough
from operator import itemgetter
# RAG: answer-generation prompt with two variables, `{context}` and `{question}`.
template = (
    "Answer the following question based on this context:\n"
    "\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(template)

# End-to-end chain. `context` receives whatever the fusion retrieval chain
# returns (the list of (document, score) tuples from reciprocal_rank_fusion);
# `question` is copied through from the input dict.
final_rag_chain = (
    {
        "context": retrieval_chain_rag_fusion,
        "question": itemgetter("question"),
    }
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question": question})

'উপরের প্রসঙ্গ অনুযায়ী, “অপরিচিতা” গল্পে মামার সঙ্গে অনুপমের বয়সের পার্থক্য “বছর চারেক” (অর্থাৎ প্রায় ৪ বছর)।  \n\nঅতএব, অনুপম তার মামার চেয়ে **৪ বছরের ছোট ছিল।**'

In [20]:
# Evaluation questions (Bengali). Index-aligned with `expected_responses`:
# sample_queries[i] is answered by expected_responses[i].
# NOTE(review): the first and third questions contain a closing ' with no
# matching opening quote — confirm whether that is intended.
sample_queries = [
    "অপরিচিতা' গল্পে, অনুপমের মতে কে আসর জমাতে অদ্বিতীয়?",
    "অনুপম তার মামার চেয়ে কত বছরের ছোট ছিল?",
    "মন্দ নয় হে! খাঁটি সোনা বটে!' - এই উক্তিটি কার?",
    "কল্যাণীর বাবার নাম কী?",
    "বিবাহ-উপলক্ষ্যে কন্যাপক্ষকে কোথায় আসতে হয়েছিল?",
    "শম্ভুনাথ সেন পেশায় কী ছিলেন?",
    "অনুপম এবং তার মা কোন বাহনে তীর্থে যাচ্ছিলেন?",
    "রেলগাড়িতে কল্যাণীর সাথে কয়টি ছোট ছোট মেয়ে ছিল?",
    "বিবাহ ভাঙার পর কল্যাণী কী ব্রত গ্রহণ করে?",
    "গল্পের শেষে অনুপমের বয়স কত?"
]

# Reference answers, one per question above, in the same order.
expected_responses = [
    "হরিশ",
    "বছর ছয়েক",
    "বিনুদা",
    "শম্ভুনাথ সেন",
    "কলিকাতা",
    "ডাক্তার",
    "রেলগাড়ি",
    "দুটি-তিনটি",
    "মেয়েদের শিক্ষার ব্রত",
    "সাতাশ"
]

# The two lists are consumed pairwise with zip(), which would silently drop
# trailing items if they ever got out of sync — fail loudly instead.
assert len(sample_queries) == len(expected_responses), (
    "sample_queries and expected_responses must have the same length"
)

In [None]:
from ragas import EvaluationDataset


# Generation step only (prompt -> LLM -> text). It mirrors the tail of
# `final_rag_chain`, but takes already-retrieved documents as `context`, so
# each answer is produced from exactly the documents recorded as its contexts.
answer_chain = prompt | llm | StrOutputParser()

dataset = []

for query, reference in zip(sample_queries, expected_responses):
    # Retrieve once through the RAG-Fusion chain. Using plain
    # `retriever.invoke(query)` here would record contexts the answer was
    # never generated from, which skews the context-based metrics.
    fused_docs = retrieval_chain_rag_fusion.invoke({"question": query})
    response = answer_chain.invoke({"context": fused_docs, "question": query})
    dataset.append(
        {
            "user_input": query,
            # fused_docs is a list of (document, score) tuples; keep the text only.
            "retrieved_contexts": [doc.page_content for doc, _score in fused_docs],
            "response": response,
            "reference": reference,
        }
    )

evaluation_dataset = EvaluationDataset.from_list(dataset)

In [21]:
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness

# Wrap the LangChain chat model so ragas can use it as the judge LLM.
evaluator_llm = LangchainLLMWrapper(llm)

# Metrics to score each sample with.
ragas_metrics = [LLMContextRecall(), Faithfulness(), FactualCorrectness()]

# Requires `evaluation_dataset`, which is built in the dataset-construction
# cell; that cell must be run before this one.
result = evaluate(dataset=evaluation_dataset, metrics=ragas_metrics, llm=evaluator_llm)

result

NameError: name 'evaluation_dataset' is not defined