<a href="https://colab.research.google.com/github/MohitTiwari-07/Contents/blob/main/Assignment_8_(RAG_Q%26A_chatbot)_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project Title: Hybrid RAG Chatbot for Intelligent Document-Grounded Q&A

In [None]:
import pandas as pd

# Read the loan records, keep only fully populated rows, and render every value as text
df = pd.read_csv('Training Dataset.csv').dropna().astype(str)

# One retrieval document per row, written as space-separated "column: value" pairs
docs = [
    ' '.join(f"{col}: {value}" for col, value in zip(df.columns, values))
    for values in df.itertuples(index=False, name=None)
]


In [None]:
!pip install faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Sentence-embedding model shared by the documents and, later, the queries
embedder = SentenceTransformer('all-MiniLM-L6-v2')
doc_embeddings = embedder.encode(docs)

# Flat L2 index sized to the width of the document embeddings
embedding_dim = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(np.array(doc_embeddings))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
from transformers import pipeline

# Load the model that drafts answers (can swap with licensed API if available).
# NOTE: "gpt2" is loaded through the generic "text-generation" task, so it continues the
# prompt text; it is not a dedicated question-answering model.
qa_pipeline = pipeline("text-generation", model="gpt2")

def generate_response(query, k=3, max_new_tokens=100):
    """Answer ``query`` with the text generator, grounded on the nearest indexed rows.

    Relies on the notebook globals ``embedder``, ``index``, ``docs`` and ``qa_pipeline``.

    Args:
        query: The user's question.
        k: Number of nearest documents to place in the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated continuation with surrounding whitespace removed.
    """
    query_embedding = np.asarray(embedder.encode([query]), dtype="float32")
    _, retrieved_ids = index.search(query_embedding, k=k)

    # FAISS pads the id array with -1 when the index holds fewer than k vectors;
    # without this filter docs[-1] would silently pull in the last document.
    context = "\n".join(docs[i] for i in retrieved_ids[0] if i >= 0)

    # Compose prompt
    prompt = f"Answer the question using this data:\n{context}\n\nQuestion: {query}\nAnswer:"

    # max_new_tokens bounds only the generated part. The previous max_length=200 also
    # counted the prompt tokens, so a prompt holding several rows left little or no
    # room for an answer.
    # return_full_text=False returns just the continuation, so the answer no longer has
    # to be recovered by splitting on "Answer:" (which breaks if the model repeats it).
    outputs = qa_pipeline(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        return_full_text=False,
    )
    return outputs[0]['generated_text'].strip()


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


In [None]:
def generate_response(query, df):
    """Answer a few keyword-matched loan questions with statistics computed from ``df``.

    NOTE: this reuses the name of the retrieval-based ``generate_response(query)`` defined
    in an earlier cell, so running this cell replaces that function.

    Args:
        query: Free-text question; keywords are matched case-insensitively.
        df: Loan DataFrame providing the ``Loan_Status``, ``LoanAmount`` and
            ``Property_Area`` columns.

    Returns:
        A human-readable answer string.
    """
    query_lower = query.lower()

    if "gender" in query_lower and "loan" in query_lower:
        # TODO: placeholder answer; the gender analysis has not been implemented yet.
        return "Gender-based loan approval stats..."

    if "average loan amount" in query_lower:
        approved_amounts = df.loc[df['Loan_Status'] == 'Y', 'LoanAmount']
        # The first cell casts every column to str, which makes a plain .mean() fail;
        # coerce back to numbers and ignore values that cannot be parsed.
        approved_amounts = pd.to_numeric(approved_amounts, errors='coerce').dropna()
        if approved_amounts.empty:
            return "No approved applications with a recorded loan amount were found."
        avg = approved_amounts.mean()
        return f"The average loan amount for approved applications is around {avg:.2f} units."

    if "property area" in query_lower:
        # Share of 'Y' among rows with a known status, per property area.
        rated = df.dropna(subset=['Loan_Status'])
        approval_rate = rated['Loan_Status'].eq('Y').groupby(rated['Property_Area']).mean()
        # Previously a frame without any 'Y' status raised KeyError on area_group['Y'].
        if approval_rate.empty or approval_rate.max() == 0:
            return "No approved applications were found for any property area."
        top_area = approval_rate.idxmax()
        rate = approval_rate.max() * 100
        return f"Applicants from {top_area} areas have the highest approval rate at {rate:.2f}%."

    return "I'm still learning to answer that—try asking about loan approval factors like gender, property area, or income."


# Install Sentence Transformers

In [1]:
!pip install -U sentence-transformers


Collecting sentence-transformers
  Using cached sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux201

Generate Local Embeddings

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used for documents and queries (lightweight and fast)
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode the whole corpus in one batched call instead of one model call per document.
# list(...) keeps doc_embeddings a list of 1-D vectors, the shape later cells iterate over.
doc_embeddings = list(model.encode(docs))


Load the Dataset

In [None]:
import pandas as pd

# Adjust path if needed
# NOTE: this reload replaces the `df` built in the first cell, so rows with missing values
# are no longer dropped and the columns are no longer cast to str.
# NOTE(review): the df.head() output recorded in this notebook shows an "Unnamed: 0"
# column, which suggests the CSV carries a saved index column; consider index_col=0 after
# confirming against the file.
df = pd.read_csv("/content/Training Dataset.csv")


In [None]:
# Preview the first rows to check the column names and spot missing values
df.head()


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


# Convert Rows to Text Documents

In [None]:
def create_documents(df, fields=None, missing=""):
    """Render each row of ``df`` as a newline-separated block of ``Field: value`` lines.

    Args:
        df: DataFrame holding the loan records.
        fields: Column names to include, in output order. Defaults to the ten loan
            columns this notebook has always used.
        missing: Text written for a column that is absent from ``df`` or whose value
            is NaN/None in a given row.

    Returns:
        A list with one document string per row, in row order.
    """
    if fields is None:
        fields = (
            'Loan_ID',
            'Gender',
            'Married',
            'Education',
            'ApplicantIncome',
            'CoapplicantIncome',
            'LoanAmount',
            'Loan_Status',
            'Property_Area',
            'Credit_History',
        )

    documents = []
    for _, row in df.iterrows():
        lines = []
        for field in fields:
            value = row.get(field, missing)
            # Missing cells would otherwise be embedded as the literal text "nan".
            if pd.api.types.is_scalar(value) and pd.isna(value):
                value = missing
            lines.append(f"{field}: {value}".rstrip())
        # Joining explicit lines avoids the source-code indentation that the previous
        # triple-quoted template left in front of every line but the first.
        documents.append("\n".join(lines))
    return documents

# Rebuild the retrieval corpus from the freshly loaded frame. This replaces the `docs`
# list built in the first cell, so embeddings computed from the old list are stale.
docs = create_documents(df)


# Create Embeddings and Retrieve Context

In [None]:
!pip install -U sentence-transformers




In [None]:
from sentence_transformers import SentenceTransformer

# Embedding model read by retrieve_context() through the global name `model`.
model = SentenceTransformer("all-MiniLM-L6-v2")  # Fast, accurate, lightweight


# Generate Embeddings for Your Document Chunks

In [None]:
# Encode the whole corpus in one batched call instead of one model call per document.
# list(...) keeps doc_embeddings a list of 1-D vectors, index-aligned with `docs`.
doc_embeddings = list(model.encode(docs))


Add Context Retriever (Cosine Similarity)

In [None]:
import numpy as np

def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Returns 0.0 when either vector has zero length. Dividing by the zero norm would
    otherwise yield NaN, and ``np.argmax`` treats NaN as the maximum.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

def retrieve_context(query, documents, embeddings):
    """Return the document whose embedding is most similar to ``query``.

    Relies on the notebook global ``model`` to embed the query and on
    ``cosine_similarity`` to score it against each entry of ``embeddings``.

    Args:
        query: The user's question.
        documents: Candidate document strings.
        embeddings: One embedding per document, in the same order as ``documents``.

    Returns:
        The best-matching document, or an empty string when there is nothing to search.
    """
    # np.argmax raises on an empty sequence; an empty context is the graceful answer.
    if len(documents) == 0 or len(embeddings) == 0:
        return ""

    query_embedding = model.encode(query)
    similarities = np.array(
        [cosine_similarity(query_embedding, emb) for emb in embeddings],
        dtype=float,
    )
    # np.argmax treats NaN as the maximum, so an undefined score would always win.
    similarities[np.isnan(similarities)] = -np.inf
    best_index = int(np.argmax(similarities))
    return documents[best_index]


# Answer Questions Using a Hugging Face LLM

Install & Load the Model

In [None]:
!pip install -U transformers accelerate

from transformers import pipeline

# Load a lightweight, instruction-tuned model
generator = pipeline("text2text-generation", model="google/flan-t5-base")




config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu


Build Your Answer Function

In [None]:
def answer_query(query):
    """Answer ``query`` from the single best-matching loan record.

    The closest document is fetched with ``retrieve_context`` and handed, together with
    the question, to the global ``generator`` pipeline.

    NOTE: a later cell defines ``answer_query`` again; whichever cell runs last wins.
    """
    best_match = retrieve_context(query, docs, doc_embeddings)

    prompt_lines = [
        "You are a loan approval assistant. Based on the context below, answer the user's question clearly and factually.",
        "",
        "Context:",
        f"{best_match}",
        "",
        "Question:",
        f"{query}",
        "",
        "Answer:",
    ]
    prompt = "\n".join(prompt_lines)

    outputs = generator(prompt, max_new_tokens=200)
    return outputs[0]["generated_text"]


In [None]:
def answer_query(query):
    """Answer ``query``, short-circuiting blank input and simple greetings.

    NOTE: this reuses the name of the ``answer_query`` defined in the previous cell, so
    running this cell replaces that version.

    Args:
        query: Raw text typed by the user.

    Returns:
        A canned reply for blank input or a greeting; otherwise the text produced by the
        global ``generator`` for the best-matching record from ``retrieve_context``.
    """
    # Normalise for matching only: case, surrounding whitespace and trailing punctuation
    # are ignored so that "Hi!" or " hello. " still count as greetings.
    query_clean = query.lower().strip().strip("!.,?").strip()

    # Blank input carries no question, so skip retrieval and generation entirely.
    if not query_clean:
        return "Please type a question about loan approvals."

    # Handle greetings
    if query_clean in {"hi", "hello", "hey", "hlo", "yo"}:
        return "👋 Hi there! I’m your loan approval assistant. Ask me anything about how income, gender, education, or property area influence loan outcomes."

    # Retrieve relevant context
    context = retrieve_context(query, docs, doc_embeddings)

    # Build prompt for local Hugging Face model
    prompt = f"""You are an AI assistant that answers questions about loan approval. Based on the information below, respond helpfully:

Context:
{context}

Question:
{query}

Answer:"""

    result = generator(prompt, max_new_tokens=200)[0]["generated_text"]
    return result


In [None]:
# Interactive chat loop around answer_query(); type 'exit' or interrupt the cell to stop.
while True:
    try:
        user_query = input("Ask your question (or type 'exit' to quit): ")
    except (EOFError, KeyboardInterrupt):
        # Interrupting the cell or a closed stdin ends the session cleanly instead of
        # leaving a traceback in the notebook output.
        print("\nChatbot session ended. Goodbye! 👋")
        break
    if user_query.lower().strip() == "exit":
        print("Chatbot session ended. Goodbye! 👋")
        break
    print("\nBot:", answer_query(user_query), "\n")


Ask your question (or type 'exit' to quit): hi

Bot: 👋 Hi there! I’m your loan approval assistant. Ask me anything about how income, gender, education, or property area influence loan outcomes. 

Ask your question (or type 'exit' to quit): What is the average approved loan amount?

Bot: 140.0 

Ask your question (or type 'exit' to quit): What role does education play in loan approval?

Bot: Not Graduate 

Ask your question (or type 'exit' to quit): Are loans more likely to be approved in urban or rural areas?

Bot: urban 

Ask your question (or type 'exit' to quit): Which property area has the highest approval rate?

Bot: Urban Credit History 

Ask your question (or type 'exit' to quit): Which applicant types are most likely to get approved?

Bot: Male 

Ask your question (or type 'exit' to quit): Can you summarize the key factors that affect loan approval?

Bot: Gender, Married: Yes, Education: Not Graduate, Loan Amount: 140.0 

Ask your question (or type 'exit' to quit): exit
Chatbot