In [4]:
!pip install faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [5]:
import pandas as pd
import faiss
import logging
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np
import torch
from transformers import pipeline
from google.colab import drive
from datetime import datetime
import itertools

In [9]:
drive.mount('/content/drive')

Mounted at /content/drive


# Load Chunked Data and FAISS Index

In [15]:
import pandas as pd
import faiss
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Update paths



# Load chunked data
data_path = '/content/drive/My Drive/Colab Notebooks/chunked_complaints.csv'
try:
    df_chunks = pd.read_csv(data_path)
    logging.info(f"Loaded chunked dataset with shape: {df_chunks.shape}")
    print("Columns:", df_chunks.columns.tolist())
    print("\nSample Chunks (first 5):")
    print(df_chunks[['complaint_id', 'product', 'chunk_idx', 'chunk_text']].head())
except Exception as e:
    logging.error(f"Failed to load chunked dataset: {e}")
    raise

# Load FAISS index
index_path = '/content/drive/My Drive/Colab Notebooks/complaint_index.faiss'
try:
    index = faiss.read_index(index_path)
    logging.info(f"FAISS index size: {index.ntotal}")
except Exception as e:
    logging.error(f"Failed to load FAISS index: {e}")
    raise

Columns: ['complaint_id', 'product', 'chunk_idx', 'chunk_text', 'chunk_length']

Sample Chunks (first 5):
   complaint_id          product  chunk_idx  \
0      14069121      Credit Card          0   
1      14061897  Savings Account          0   
2      14047085      Credit Card          0   
3      14040217      Credit Card          0   
4      14040217      Credit Card          1   

                                          chunk_text  
0  a card was opened under my name by a fraudster...  
1  i made the mistake of using my wellsfargo debi...  
2  i have a secured credit card with citibank whi...  
3  i have a citi rewards cards the credit balance...  
4  prior to the notification about reaching my li...  


# Implement the Retriever

In [16]:

# Load embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

def retrieve_chunks(query, index, df_chunks, model, top_k=5):
    try:
        # Encode query
        query_embedding = model.encode([query], show_progress_bar=False)
        # Search FAISS index
        distances, indices = index.search(np.array(query_embedding, dtype=np.float32), top_k)
        # Get corresponding chunks
        retrieved_chunks = df_chunks.iloc[indices[0]][['complaint_id', 'product', 'chunk_idx', 'chunk_text']].to_dict('records')
        return retrieved_chunks, distances[0]
    except Exception as e:
        logging.error(f"Error retrieving chunks for query '{query}': {e}")
        return [], []

# Test retriever
sample_queries = [
    "Why are people unhappy with BNPL?",
    "What are common issues with Credit Card fraud?",
    "Why do Savings Account complaints happen?"
]
for query in sample_queries:
    chunks, distances = retrieve_chunks(query, index, df_chunks, model)
    print(f"\nQuery: {query}")
    print("Retrieved Chunks:")
    for i, chunk in enumerate(chunks):
        print(f"Chunk {i+1} (Product: {chunk['product']}, Distance: {distances[i]:.4f}):")
        print(chunk['chunk_text'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Query: Why are people unhappy with BNPL?
Retrieved Chunks:
Chunk 1 (Product: Buy Now, Pay Later (BNPL), Distance: 0.9367):
practices of bnpl companies reporting only negative data creates an incomplete and potentially damaging picture of a consumer s creditworthiness it is my understanding that the cfpb has been looking into the bnpl sector and the unfair practices that are being used difficulty accessing assistance during financial hardship furthermore affirm does not provide easily accessible avenues for customers to seek assistance during periods of financial hardship navigating their customer service channels to request payment arrangements or other forms of support is unnecessarily difficult and frustrating this lack of transparency and accessibility exacerbates the negative impact of late payments particularly during unforeseen financial challenges i have attempted to contact them and have received no assistance
Chunk 2 (Product: Credit Card, Distance: 1.0844):
to deceive consum

In [1]:
!pip install -U transformers accelerate bitsandbytes


Collecting transformers
  Downloading transformers-4.53.1-py3-none-any.whl.metadata (40 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/40.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70

# Set Up the LLM

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch

model_name = "HuggingFaceH4/zephyr-7b-beta"

# ✅ 4-bit quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# ✅ Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True
)

# ✅ Create text-generation pipeline
llm = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Example inputs
system_prompt = "You are a helpful financial assistant."
user_question = "What are the main risks in microloans?"
retrieved_chunks = [
    "Microloans often face high default rates due to limited borrower credit history.",
    "Operational costs can be disproportionately high relative to loan size."
]

# Build prompt in Zephyr format
full_prompt = f"<|system|>\n{system_prompt}\n<|user|>\n{user_question}\n<|retrieved|>\n" + "\n".join(retrieved_chunks)

# Generate
response = llm(full_prompt, max_new_tokens=200, do_sample=True, temperature=0.7)

print(response[0]["generated_text"])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Device set to use cuda:0


<|system|>
You are a helpful financial assistant.
<|user|>
What are the main risks in microloans?
<|retrieved|>
Microloans often face high default rates due to limited borrower credit history.
Operational costs can be disproportionately high relative to loan size.
Political instability, natural disasters, and economic shocks can lead to high losses.
Exposure to systemic risks in the financial sector, such as bank failures or regulatory changes, can impact MFIs and their borrowers.
Fraud and corruption risks can arise in MFI operations and disbursement processes.
Interest rate and currency risks can impact the profitability and solvency of MFIs and their clients.
Credit risks arise from the possibility that borrowers may default on their loan obligations.
Inadequate collateral or guarantees can increase the risks associated with microloans.
Lack of access to finance for borrowers who live in remote areas or belong to marginalized communities can create barriers to repayment.

Source: Wo

# Load and Verify Files

In [10]:
import pandas as pd
import faiss
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Update paths



# Load chunked data
data_path = '/content/drive/My Drive/Colab Notebooks/chunked_complaints.csv'
try:
    df_chunks = pd.read_csv(data_path)
    logging.info(f"Loaded chunked dataset with shape: {df_chunks.shape}")
    print("Columns:", df_chunks.columns.tolist())
    print("\nSample Chunks (first 5):")
    print(df_chunks[['complaint_id', 'product', 'chunk_idx', 'chunk_text']].head())
except Exception as e:
    logging.error(f"Failed to load chunked dataset: {e}")
    raise

# Load FAISS index
index_path = '/content/drive/My Drive/Colab Notebooks/complaint_index.faiss'
try:
    index = faiss.read_index(index_path)
    logging.info(f"FAISS index size: {index.ntotal}")
except Exception as e:
    logging.error(f"Failed to load FAISS index: {e}")
    raise

Columns: ['complaint_id', 'product', 'chunk_idx', 'chunk_text', 'chunk_length']

Sample Chunks (first 5):
   complaint_id          product  chunk_idx  \
0      14069121      Credit Card          0   
1      14061897  Savings Account          0   
2      14047085      Credit Card          0   
3      14040217      Credit Card          0   
4      14040217      Credit Card          1   

                                          chunk_text  
0  a card was opened under my name by a fraudster...  
1  i made the mistake of using my wellsfargo debi...  
2  i have a secured credit card with citibank whi...  
3  i have a citi rewards cards the credit balance...  
4  prior to the notification about reaching my li...  


# Integrate FAISS Retriever with the LLM

In [11]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load embedding model
embedder = SentenceTransformer('all-MiniLM-L6-v2')

def retrieve_chunks(query, index, df_chunks, embedder, top_k=5):
    try:
        query_embedding = embedder.encode([query], show_progress_bar=False)
        distances, indices = index.search(np.array(query_embedding, dtype=np.float32), top_k)
        retrieved_chunks = df_chunks.iloc[indices[0]][['complaint_id', 'product', 'chunk_idx', 'chunk_text']].to_dict('records')
        return retrieved_chunks, distances[0]
    except Exception as e:
        logging.error(f"Error retrieving chunks for query '{query}': {e}")
        return [], []

def rag_pipeline(query, index, df_chunks, embedder, llm, top_k=5):
    try:
        # Retrieve chunks
        chunks, distances = retrieve_chunks(query, index, df_chunks, embedder, top_k)
        if not chunks:
            return "No relevant complaints found.", [], []

        # Build prompt in Zephyr format
        system_prompt = "You are a financial complaint analysis assistant."
        context = "\n".join([f"Complaint (Product: {chunk['product']}): {chunk['chunk_text']}" for chunk in chunks])
        full_prompt = f"<|system|>\n{system_prompt}\n<|user|>\n{query}\n<|retrieved|>\n{context}"

        # Generate response
        response = llm(full_prompt, max_new_tokens=200, do_sample=True, temperature=0.7)
        return response[0]["generated_text"], chunks, distances
    except Exception as e:
        logging.error(f"Error in RAG pipeline for query '{query}': {e}")
        return "Error processing query.", [], []

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Test the RAG Pipeline

In [12]:
# Test RAG pipeline
sample_queries = [
    "Why are people unhappy with BNPL?",
    "What are common issues with Credit Card fraud?",
    "Why do Savings Account complaints happen?",
    "What issues do people face with Money Transfers?",
    "Why are Personal Loan complaints common?"
]
results = []
for query in sample_queries:
    response, chunks, distances = rag_pipeline(query, index, df_chunks, embedder, llm)
    print(f"\nQuery: {query}")
    print("Response:", response)
    print("Retrieved Chunks:")
    for i, chunk in enumerate(chunks):
        print(f"Chunk {i+1} (Product: {chunk['product']}, Distance: {distances[i]:.4f}):")
        print(chunk['chunk_text'])
    results.append({
        'query': query,
        'response': response,
        'retrieved_chunks': chunks,
        'distances': distances.tolist()
    })


Query: Why are people unhappy with BNPL?
Response: <|system|>
You are a financial complaint analysis assistant.
<|user|>
Why are people unhappy with BNPL?
<|retrieved|>
Complaint (Product: Buy Now, Pay Later (BNPL)): practices of bnpl companies reporting only negative data creates an incomplete and potentially damaging picture of a consumer s creditworthiness it is my understanding that the cfpb has been looking into the bnpl sector and the unfair practices that are being used difficulty accessing assistance during financial hardship furthermore affirm does not provide easily accessible avenues for customers to seek assistance during periods of financial hardship navigating their customer service channels to request payment arrangements or other forms of support is unnecessarily difficult and frustrating this lack of transparency and accessibility exacerbates the negative impact of late payments particularly during unforeseen financial challenges i have attempted to contact them and h

# Save Pipeline and Results

In [14]:
from google.colab import files
# Save RAG pipeline script
with open('/content/drive/My Drive/Colab Notebooks/rag_pipeline.py', 'w') as f:
    f.write("""
# [Copy the full RAG pipeline code from Steps 3-5 here manually]
""")
logging.info("RAG pipeline saved as/content/drive/My Drive/Colab Notebooks/rag_pipeline.py")

# Save results
pd.DataFrame(results).to_json('/content/drive/My Drive/Colab Notebooks/rag_results.json', orient='records', lines=True)
logging.info("RAG results saved as/content/drive/My Drive/Colab Notebooks/rag_results.json")

# Download files
files.download('/content/drive/My Drive/Colab Notebooks/rag_pipeline.py')
files.download('/content/drive/My Drive/Colab Notebooks/rag_results.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>