In [None]:
#****************************** Step1 ********************************
# Importing standard and third-party libraries for the RAG pipeline
import pandas as pd
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
import torch

In [None]:
# ************************ Step 2 ************************
# Load the FAISS vector store and its embedding model.

# Same embedding model as in Task 2, so queries are embedded the same way
# as the chunks already stored in the index.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Vector store built in Task 2.
# NOTE(review): allow_dangerous_deserialization=True permits unpickling the
# saved store; only load an index that you created yourself.
vector_db = FAISS.load_local(
    folder_path="../notebooks/vector_store",
    embeddings=embedding_model,
    allow_dangerous_deserialization=True,
)


In [None]:
# ************************ Step 3 ************************
# Define the retrieval function.

def retrieve_top_chunks(query: str, k: int = 5):
    """
    Look up the complaint chunks most similar to a question.

    Args:
        query (str): The user's question.
        k (int): How many chunks to request from the FAISS index.

    Returns:
        The result of ``vector_db.similarity_search`` for the query,
        i.e. a list of Document objects.
    """
    top_chunks = vector_db.similarity_search(query, k=k)
    return top_chunks

In [None]:
# ************************ Step 4 ************************
# Define the RAG prompt template.

# The template exposes two placeholders: {context} for the retrieved complaint
# text and {question} for the user's query. It instructs the model to answer
# from the supplied context only.
rag_prompt_template = PromptTemplate(
    template="""
You are a financial analyst assistant at CrediTrust.
Your task is to answer customer complaint-related questions using only the provided context.

Context:
{context}

Question:
{question}

Answer (based only on the context above):
""",
    input_variables=["context", "question"],
)


In [None]:
# ************************* Step 5 *************************
# Load a lightweight LLM.

# distilgpt2 is DistilGPT-2, a small causal language model that is light enough
# to run on CPU. (It is not DistilBERT, which is an encoder-only model and is
# not suited to text generation.)

# Generation below samples tokens (do_sample=True), so seed torch's RNG first.
# Without a seed the evaluation answers differ on every run; with it, a
# Restart & Run All on the same setup should reproduce the same answers.
GENERATION_SEED = 42
torch.manual_seed(GENERATION_SEED)

llm_pipeline = pipeline(
    "text-generation",
    model="distilgpt2",  # Can be switched to Mistral, Llama, etc., depending on available resources
    max_new_tokens=200,  # Upper bound on the length of each generated answer
    temperature=0.7,
    do_sample=True
)

# Wrap the transformers pipeline so LangChain chains can use it as an LLM.
llm = HuggingFacePipeline(pipeline=llm_pipeline)

In [None]:
# ************************ Step 6 ************************
# Combine the retriever and the LLM into a RetrievalQA chain.

# Retriever view over the FAISS store, configured to fetch 5 chunks per query.
top5_retriever = vector_db.as_retriever(search_kwargs={"k": 5})

# "stuff" chain type with the custom prompt defined in Step 4.
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=top5_retriever,
    chain_type_kwargs={"prompt": rag_prompt_template},
)

In [None]:
# ************************ Step 7 ************************
# Define the example queries used for evaluation.

# One row of the evaluation table is produced per question in this list.
example_questions = [
    "Why are users unhappy with Buy Now, Pay Later services?",
    "What issues do customers report about savings accounts?",
    "Are there any complaints about money transfers being delayed?",
    "Why do people dislike credit cards?",
    "What is the most common complaint about personal loans?",
]

In [None]:
# ************************** Step 8 **************************
# Run the evaluation and store the results.

# One dict per question; turned into a DataFrame for manual review later.
evaluation_results = []

for question in example_questions:
    # Generate the answer with the RAG chain.
    result = rag_chain({"query": question})
    # Fetch the top chunks separately so their text can be shown next to the answer.
    retrieved_docs = retrieve_top_chunks(question)

    # Short (250-character) previews of the first two sources. Slicing instead of
    # indexing means a retrieval that returns fewer than two chunks (or none)
    # yields "" for the missing columns rather than raising IndexError.
    source_previews = [doc.page_content[:250] for doc in retrieved_docs[:2]]
    source_previews += [""] * (2 - len(source_previews))

    evaluation_results.append({
        "Question": question,
        "Generated Answer": result["result"],
        "Retrieved Source 1": source_previews[0],
        "Retrieved Source 2": source_previews[1],
        "Quality Score (1-5)": "",  # Leave blank to score manually
        "Comments": ""
    })

In [None]:
# ************************ Step 9 ************************
# Display the evaluation table for manual review.

# Columns in the order they should appear in the review table.
review_columns = [
    "Question",
    "Generated Answer",
    "Retrieved Source 1",
    "Retrieved Source 2",
    "Quality Score (1-5)",
    "Comments",
]

eval_df = pd.DataFrame(evaluation_results)
# Last expression of the cell, so the notebook renders it as a table.
eval_df.loc[:, review_columns]
