In [None]:
!pip install faiss-cpu


In [None]:
import pandas as pd
import faiss
import logging
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np
import torch
from transformers import pipeline
from google.colab import drive
from datetime import datetime
import itertools

In [None]:
# Mount Google Drive at /content/drive; the CSV, FAISS index and output files
# used in the cells below are all addressed under this mount point.
drive.mount('/content/drive')

# Load Chunked Data and FAISS Index

In [None]:
import pandas as pd
import faiss
import logging

# Set up logging.
# force=True replaces any handler already attached to the root logger. Without
# it, basicConfig() does nothing when a handler is already installed and the
# INFO messages below would not be shown.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

# Columns the retriever reads from the chunk table
preview_columns = ['complaint_id', 'product', 'chunk_idx', 'chunk_text']

# Load chunked data
data_path = '/content/drive/My Drive/Colab Notebooks/chunked_complaints.csv'
try:
    df_chunks = pd.read_csv(data_path)
except Exception as e:
    logging.error(f"Failed to load chunked dataset: {e}")
    raise

# The preview lives outside the try block so that a missing column surfaces as
# its own KeyError instead of being reported as a failed load.
logging.info(f"Loaded chunked dataset with shape: {df_chunks.shape}")
print("Columns:", df_chunks.columns.tolist())
print("\nSample Chunks (first 5):")
print(df_chunks[preview_columns].head())

# Load FAISS index
index_path = '/content/drive/My Drive/Colab Notebooks/complaint_index.faiss'
try:
    index = faiss.read_index(index_path)
except Exception as e:
    logging.error(f"Failed to load FAISS index: {e}")
    raise
logging.info(f"FAISS index size: {index.ntotal}")

# Retrieval maps FAISS result positions to rows with df_chunks.iloc, so the
# index and the table must describe the same rows in the same order.
if index.ntotal != len(df_chunks):
    logging.warning(
        f"FAISS index holds {index.ntotal} vectors but the dataset has "
        f"{len(df_chunks)} rows; positional lookups may be misaligned"
    )

# Implement the Retriever

In [None]:

# Load embedding model
# NOTE: the name `model` is rebound to the causal language model in the
# "Set Up the LLM" cell further down. Re-running this cell's retriever test
# after that cell has run requires re-executing this assignment first.
model = SentenceTransformer('all-MiniLM-L6-v2')

def retrieve_chunks(query, index, df_chunks, model, top_k=5):
    """Return the chunks whose embeddings are nearest to ``query``.

    Args:
        query: Question text to embed.
        index: FAISS index; only its ``search(vectors, k)`` method is used.
        df_chunks: DataFrame whose row positions correspond to the index's
            vector positions and which has the columns ``complaint_id``,
            ``product``, ``chunk_idx`` and ``chunk_text``.
        model: Encoder exposing ``encode(list_of_str, show_progress_bar=...)``.
        top_k: Number of neighbours to request from the index.

    Returns:
        ``(chunks, distances)`` where ``chunks`` is a list of row dicts and
        ``distances`` is a 1-D NumPy array of the matching scores as returned
        by ``index.search``. Both are empty when nothing is found or when an
        error occurs (the error is logged, not raised).
    """
    columns = ['complaint_id', 'product', 'chunk_idx', 'chunk_text']
    try:
        # Encode query
        query_embedding = model.encode([query], show_progress_bar=False)
        # Search FAISS index
        distances, indices = index.search(np.array(query_embedding, dtype=np.float32), top_k)
        hit_rows = np.asarray(indices[0])
        hit_distances = np.asarray(distances[0])
        # FAISS pads the result with label -1 when fewer than top_k neighbours
        # exist; iloc[-1] would silently return the last row, so drop those.
        found = hit_rows >= 0
        retrieved_chunks = df_chunks.iloc[hit_rows[found]][columns].to_dict('records')
        return retrieved_chunks, hit_distances[found]
    except Exception as e:
        logging.error(f"Error retrieving chunks for query '{query}': {e}")
        # Same types as the success path so callers can index or .tolist() it.
        return [], np.empty(0, dtype=np.float32)

# Smoke-test the retriever: run a few representative questions and print every
# returned chunk together with its product and distance.
sample_queries = [
    "Why are people unhappy with BNPL?",
    "What are common issues with Credit Card fraud?",
    "Why do Savings Account complaints happen?"
]
for query in sample_queries:
    chunks, distances = retrieve_chunks(query, index, df_chunks, model)
    print(f"\nQuery: {query}")
    print("Retrieved Chunks:")
    # chunks and distances are parallel sequences; number the hits from 1
    for rank, (chunk, distance) in enumerate(zip(chunks, distances), start=1):
        header = f"Chunk {rank} (Product: {chunk['product']}, Distance: {distance:.4f}):"
        print(header, chunk['chunk_text'], sep="\n")

In [None]:
!pip install -U transformers accelerate bitsandbytes


# Set Up the LLM

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch

model_name = "HuggingFaceH4/zephyr-7b-beta"

# 4-bit NF4 quantization with double quantization; matmuls are computed in fp16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# trust_remote_code is deliberately not enabled: it lets Python code shipped in
# the model repository run locally, and this checkpoint loads with the model
# classes built into transformers.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

# Create text-generation pipeline
llm = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Example inputs
system_prompt = "You are a helpful financial assistant."
user_question = "What are the main risks in microloans?"
retrieved_chunks = [
    "Microloans often face high default rates due to limited borrower credit history.",
    "Operational costs can be disproportionately high relative to loan size."
]

# Build the prompt with the tokenizer's own chat template instead of
# hand-written tags. The template adds the turn terminators and the assistant
# generation prompt the model expects. The retrieved passages go inside the
# user turn, since the chat format has no separate role for retrieved context.
context = "\n".join(retrieved_chunks)
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": f"{user_question}\n\nRetrieved context:\n{context}"},
]
full_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Generate; return_full_text=False yields only the newly generated answer
# instead of the prompt followed by the answer.
response = llm(
    full_prompt,
    max_new_tokens=200,
    do_sample=True,
    temperature=0.7,
    return_full_text=False,
)

print(response[0]["generated_text"])


# Load and Verify Files

In [None]:
import pandas as pd
import faiss
import logging

# Set up logging.
# force=True replaces any handler already attached to the root logger. Without
# it, basicConfig() does nothing when a handler is already installed (for
# example by an earlier basicConfig call in this session) and the INFO messages
# below would not be shown.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

# Columns the retriever reads from the chunk table
preview_columns = ['complaint_id', 'product', 'chunk_idx', 'chunk_text']

# Load chunked data
data_path = '/content/drive/My Drive/Colab Notebooks/chunked_complaints.csv'
try:
    df_chunks = pd.read_csv(data_path)
except Exception as e:
    logging.error(f"Failed to load chunked dataset: {e}")
    raise

# The preview lives outside the try block so that a missing column surfaces as
# its own KeyError instead of being reported as a failed load.
logging.info(f"Loaded chunked dataset with shape: {df_chunks.shape}")
print("Columns:", df_chunks.columns.tolist())
print("\nSample Chunks (first 5):")
print(df_chunks[preview_columns].head())

# Load FAISS index
index_path = '/content/drive/My Drive/Colab Notebooks/complaint_index.faiss'
try:
    index = faiss.read_index(index_path)
except Exception as e:
    logging.error(f"Failed to load FAISS index: {e}")
    raise
logging.info(f"FAISS index size: {index.ntotal}")

# Retrieval maps FAISS result positions to rows with df_chunks.iloc, so the
# index and the table must describe the same rows in the same order.
if index.ntotal != len(df_chunks):
    logging.warning(
        f"FAISS index holds {index.ntotal} vectors but the dataset has "
        f"{len(df_chunks)} rows; positional lookups may be misaligned"
    )

# Integrate FAISS Retriever with the LLM

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load embedding model
# Bound to `embedder` (not `model`) because `model` now refers to the causal
# language model created in the "Set Up the LLM" cell.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

def retrieve_chunks(query, index, df_chunks, embedder, top_k=5):
    """Return the chunks whose embeddings are nearest to ``query``.

    Args:
        query: Question text to embed.
        index: FAISS index; only its ``search(vectors, k)`` method is used.
        df_chunks: DataFrame whose row positions correspond to the index's
            vector positions and which has the columns ``complaint_id``,
            ``product``, ``chunk_idx`` and ``chunk_text``.
        embedder: Encoder exposing ``encode(list_of_str, show_progress_bar=...)``.
        top_k: Number of neighbours to request from the index.

    Returns:
        ``(chunks, distances)`` where ``chunks`` is a list of row dicts and
        ``distances`` is a 1-D NumPy array of the matching scores as returned
        by ``index.search``. Both are empty when nothing is found or when an
        error occurs (the error is logged, not raised).
    """
    columns = ['complaint_id', 'product', 'chunk_idx', 'chunk_text']
    try:
        query_embedding = embedder.encode([query], show_progress_bar=False)
        distances, indices = index.search(np.array(query_embedding, dtype=np.float32), top_k)
        hit_rows = np.asarray(indices[0])
        hit_distances = np.asarray(distances[0])
        # FAISS pads the result with label -1 when fewer than top_k neighbours
        # exist; iloc[-1] would silently return the last row, so drop those.
        found = hit_rows >= 0
        retrieved_chunks = df_chunks.iloc[hit_rows[found]][columns].to_dict('records')
        return retrieved_chunks, hit_distances[found]
    except Exception as e:
        logging.error(f"Error retrieving chunks for query '{query}': {e}")
        # Same types as the success path so callers can index or .tolist() it.
        return [], np.empty(0, dtype=np.float32)

def rag_pipeline(query, index, df_chunks, embedder, llm, top_k=5):
    """Answer ``query`` using retrieved complaint chunks as context.

    Args:
        query: The user's question.
        index: FAISS index passed through to ``retrieve_chunks``.
        df_chunks: Chunk table passed through to ``retrieve_chunks``.
        embedder: Sentence encoder passed through to ``retrieve_chunks``.
        llm: Text-generation pipeline. It is called with the prompt string and
            its ``tokenizer.apply_chat_template`` is used to build that prompt.
        top_k: Number of chunks to retrieve.

    Returns:
        ``(answer, chunks, distances)``. ``answer`` is the newly generated text
        only (the prompt is not echoed). When nothing is retrieved or an error
        occurs, a fixed message is returned with an empty chunk list and an
        empty NumPy distance array; errors are logged, not raised.
    """
    # Empty result with the same type as a real distance vector, so callers can
    # treat every return path uniformly (e.g. call .tolist()).
    no_distances = np.empty(0, dtype=np.float32)
    try:
        # Retrieve chunks
        chunks, distances = retrieve_chunks(query, index, df_chunks, embedder, top_k)
        if not chunks:
            return "No relevant complaints found.", [], no_distances

        system_prompt = "You are a financial complaint analysis assistant."
        context = "\n".join([f"Complaint (Product: {chunk['product']}): {chunk['chunk_text']}" for chunk in chunks])

        # Build the prompt with the tokenizer's chat template so the turn
        # terminators and the assistant generation prompt are present. The
        # retrieved complaints go inside the user turn, since the chat format
        # has no separate role for retrieved context.
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"{query}\n\nRetrieved complaints:\n{context}"},
        ]
        full_prompt = llm.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        # return_full_text=False: keep only the model's answer, not prompt + answer
        response = llm(
            full_prompt,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7,
            return_full_text=False,
        )
        return response[0]["generated_text"], chunks, distances
    except Exception as e:
        logging.error(f"Error in RAG pipeline for query '{query}': {e}")
        return "Error processing query.", [], no_distances

# Test the RAG Pipeline

In [None]:
# Test RAG pipeline: run each sample question end to end, print the answer with
# its supporting chunks, and collect everything in `results` for saving later.
sample_queries = [
    "Why are people unhappy with BNPL?",
    "What are common issues with Credit Card fraud?",
    "Why do Savings Account complaints happen?",
    "What issues do people face with Money Transfers?",
    "Why are Personal Loan complaints common?"
]
results = []
for query in sample_queries:
    response, chunks, distances = rag_pipeline(query, index, df_chunks, embedder, llm)
    print(f"\nQuery: {query}")
    print("Response:", response)
    print("Retrieved Chunks:")
    for i, chunk in enumerate(chunks):
        print(f"Chunk {i+1} (Product: {chunk['product']}, Distance: {distances[i]:.4f}):")
        print(chunk['chunk_text'])
    results.append({
        'query': query,
        'response': response,
        'retrieved_chunks': chunks,
        # rag_pipeline returns a plain list on its empty and error paths;
        # np.asarray makes .tolist() valid for both a list and an ndarray.
        'distances': np.asarray(distances).tolist()
    })

# Save Pipeline and Results

In [None]:
from google.colab import files
# Output locations on the mounted Drive (defined once, reused below)
output_dir = '/content/drive/My Drive/Colab Notebooks'
pipeline_script_path = f'{output_dir}/rag_pipeline.py'
results_path = f'{output_dir}/rag_results.json'

# Save RAG pipeline script
# NOTE: only a placeholder comment is written here; the pipeline code itself
# still has to be pasted into the file by hand.
with open(pipeline_script_path, 'w') as f:
    f.write("""
# [Copy the full RAG pipeline code from Steps 3-5 here manually]
""")
logging.info(f"RAG pipeline saved as {pipeline_script_path}")

# Save results as JSON Lines (one record per query)
pd.DataFrame(results).to_json(results_path, orient='records', lines=True)
logging.info(f"RAG results saved as {results_path}")

# Download files through the browser
files.download(pipeline_script_path)
files.download(results_path)

In [None]:
!pip install nbformat --quiet

import nbformat

# Path to the uploaded notebook (adjust filename if needed)
notebook_path = "C:/Users/HP/10 Acadamy PRojects/New folder (6)/Complaint-Analysis-RAG/notebooks/task_3_ Building_the_RAG.ipynb"

# Load
with open(notebook_path, "r", encoding="utf-8") as f:
    nb = nbformat.read(f, as_version=4)

# Clean
if "widgets" in nb.metadata:
    del nb.metadata["widgets"]

for cell in nb.cells:
    if "outputs" in cell:
        cell["outputs"] = []
    if "execution_count" in cell:
        cell["execution_count"] = None

# Save cleaned version
cleaned_path = "C:\Users\HP\10 Acadamy PRojects\New folder (6)\Complaint-Analysis-RAG\notebooks\cleaned_task_3_Building_the_RAG.ipynb"
with open(cleaned_path, "w", encoding="utf-8") as f:
    nbformat.write(nb, f)

print("Cleaned notebook saved to:", cleaned_path)
