# Text Chunking, Embedding, and Vector Store Indexing

In [3]:
# Mount Google Drive so files under /content/drive are readable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Install the sentence-transformers package into the kernel's environment.
# NOTE(review): the version is not pinned, so a fresh runtime may resolve a different release — consider pinning.
%pip install sentence_transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence_transformers)
 

In [5]:
%pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


## Import Libraries

In [6]:
import pandas as pd
import numpy as np
import os
import sys
# Add the parent of the current working directory to sys.path so that the
# `src` package imported below can be found.
# NOTE(review): this assumes the notebook is started from a direct subdirectory
# of the project root — confirm this holds on Colab.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:  # avoid appending a duplicate entry when the cell is re-run
    sys.path.append(module_path)
from src.rag import (retrieve_similar_complaints, load_faiss_index, load_metadata, load_embedding_model,
                    initialize_faiss_index, save_faiss_index, embed_chunks, prepare_chunks_and_metadata)
from sentence_transformers import SentenceTransformer
import faiss

## Load Chunked Data

In [7]:
# Read the pre-chunked complaints CSV from the mounted Google Drive.
# NOTE(review): this is a hard-coded Drive path; it only resolves after the
# drive-mount cell has run in this Colab session.
file_path = '/content/drive/MyDrive/data/chunked_complaints.csv'
df = pd.read_csv(file_path)

In [8]:
# Preview the first rows of the loaded frame as a quick sanity check.
df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,...,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID,narrative_length,cleaned_narrative,cleaned_narrative_length,narrative_chunks
0,2025-06-13,Credit card,Store credit card,Getting a credit card,Card opened without my consent or knowledge,A XXXX XXXX card was opened under my name by a...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",TX,78230,...,Web,2025-06-13,Closed with non-monetary relief,Yes,,14069121,91,a xxxx xxxx card was opened under my name by a...,91,['a xxxx xxxx card was opened under my name by...
1,2025-06-12,Credit card,General-purpose credit card or charge card,"Other features, terms, or problems",Other problem,"Dear CFPB, I have a secured credit card with c...",Company has responded to the consumer and the ...,"CITIBANK, N.A.",NY,11220,...,Web,2025-06-13,Closed with monetary relief,Yes,,14047085,156,dear cfpb i have a secured credit card with ci...,156,['dear cfpb i have a secured credit card with ...
2,2025-06-12,Credit card,General-purpose credit card or charge card,Incorrect information on your report,Account information incorrect,I have a Citi rewards cards. The credit balanc...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",IL,60067,...,Web,2025-06-12,Closed with explanation,Yes,,14040217,233,i have a citi rewards cards the credit balance...,231,['i have a citi rewards cards the credit balan...
3,2025-06-09,Credit card,General-purpose credit card or charge card,Problem with a purchase shown on your statement,Credit card company isn't resolving a dispute ...,b'I am writing to dispute the following charge...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",TX,78413,...,Web,2025-06-09,Closed with monetary relief,Yes,,13968411,454,bi am writing to dispute the following charges...,454,['bi am writing to dispute the following charg...
4,2025-06-09,Credit card,General-purpose credit card or charge card,Problem when making payments,Problem during payment process,"Although the account had been deemed closed, I...",Company believes it acted appropriately as aut...,Atlanticus Services Corporation,NY,11212,...,Web,2025-06-09,Closed with monetary relief,Yes,,13965746,170,although the account had been deemed closed i ...,170,['although the account had been deemed closed ...


## Embedding Model Choice

For this project, I used the `all-MiniLM-L6-v2` model from the SentenceTransformers library. This model is lightweight, fast, and provides high-quality sentence embeddings suitable for semantic search and retrieval tasks. It is widely used in industry and research for its balance of performance and efficiency.

In [9]:
# Prepare chunks and metadata from the loaded DataFrame using the project helper.
# NOTE(review): presumably `all_chunks` holds the chunk texts and `metadata` one
# record per chunk (the retrieval cell reads 'complaint_id' and 'product' from
# each record) — confirm against src.rag.
all_chunks, metadata = prepare_chunks_and_metadata(df)

In [10]:
# Load the embedding model by name ('all-MiniLM-L6-v2') through the project helper.
embedding_model = load_embedding_model('all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [11]:
# Generate embeddings for all chunks with the loaded model.
# NOTE(review): this is likely the most expensive cell in the notebook;
# consider caching the result to disk so a full re-run stays feasible.
embeddings = embed_chunks(all_chunks, embedding_model)

In [12]:
# Report the shape of the embeddings as a sanity check before indexing.
print('Embeddings shape:', embeddings.shape)

Embeddings shape: (1609126, 384)


## Vector Store Indexing

In [13]:
# Create the FAISS index from the embeddings through the project helper.
# NOTE(review): the index type and distance metric are chosen inside
# src.rag.initialize_faiss_index — check there when interpreting distances.
index = initialize_faiss_index(embeddings)

In [14]:
import os

# Output locations for the persisted index and its companion metadata.
# NOTE(review): these paths are relative to the current working directory; on
# Colab that directory is not persisted across sessions — confirm this is intended.
index_path = './vector_store/complaint_chunks.index'
metadata_path = './vector_store/complaint_chunks_metadata.pkl'

# Create the parent directory of *each* output file, so the save still works if
# the two paths are later pointed at different folders. `or '.'` covers a bare
# filename, for which os.path.dirname returns '' (os.makedirs('') would raise).
for output_path in (index_path, metadata_path):
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)

save_faiss_index(index, metadata, index_path, metadata_path)

In [15]:
# Smoke-test retrieval: request k=5 results for a sample question.
question = "Why are people unhappy with personal loans?"
results = retrieve_similar_complaints(question, embedding_model, index, metadata, all_chunks, k=5)
# Each result unpacks into (chunk text, metadata record, distance); ranks are numbered from 1.
for i, (chunk, meta, dist) in enumerate(results, 1):
    print(f"Result {i} (Distance: {dist:.4f})")
    print(f"Complaint ID: {meta['complaint_id']}, Product: {meta['product']}")
    print(f"Text: {chunk}\n")

Result 1 (Distance: 0.5832)
Complaint ID: 3875658, Product: Payday loan, title loan, or personal loan
Text: to them for help they are preying on consumers that are financially unstable and are only adding to that instability it is extremely hard to get back on your feet without a highinterest loan over your head and nearly impossible with one i am speaking from

Result 2 (Distance: 0.6675)
Complaint ID: 4044329, Product: Payday loan, title loan, or personal loan
Text: in person if this is true one main financial is intentionally making it more difficult for people to pay off their loans with the intent of burdening people with debt so they can make a profit

Result 3 (Distance: 0.6739)
Complaint ID: 3934137, Product: Payday loan, title loan, or personal loan
Text: that has nothing to do with personal fault is the height of corporate greed and displays a total lack of compassion or humanity especially when the loan is almost paid off

Result 4 (Distance: 0.7078)
Complaint ID: 10709657, 

Designing a robust prompt template is crucial for getting the desired output from the LLM. A good template should include:

1.  **Role-playing**: Clearly define the persona the LLM should adopt (e.g., financial analyst assistant).
2.  **Task Description**: Explain what the LLM needs to do (e.g., answer questions about customer complaints).
3.  **Constraints**: Specify limitations, such as only using the provided context and stating when information is insufficient.
4.  **Context Inclusion**: Provide a placeholder for the retrieved relevant information.
5.  **Question Inclusion**: Provide a placeholder for the user's query.
6.  **Output Format**: Suggest how the answer should be structured (e.g., directly answering the question).

Here is a prompt template that incorporates these elements:

In [16]:
# Prompt template for the RAG answer step. The `{context}` and `{question}`
# placeholders are filled with str.format, so any other literal braces added to
# this text would have to be doubled ({{ }}).
prompt_template = """You are a helpful financial analyst assistant for CrediTrust.
Your primary task is to answer questions about customer complaints based *only* on the provided context.
Adhere strictly to the information presented in the context.
If the provided context does not contain enough information to answer the question, respond with "I do not have enough information from the provided complaints to answer this question."
Do not use any external knowledge or make assumptions.

Context:
{context}

Question:
{question}

Answer:
"""

In [17]:
def generate_response_from_rag(question, retrieved_chunks, prompt_template, llm=None):
    """
    Build a RAG prompt from the retrieved chunks and, if an LLM is given, query it.

    Args:
        question (str): The user's question.
        retrieved_chunks (list): Retrieved text chunks. ``None`` (the whole
            argument or individual entries) is tolerated: a missing list is
            treated as empty, ``None`` entries are skipped, and any other
            entry is converted with ``str()`` before joining.
        prompt_template (str): Template containing ``{context}`` and
            ``{question}`` placeholders, filled with ``str.format``.
        llm (callable, optional): Called with the formatted prompt as its only
            argument. Defaults to None.

    Returns:
        The value returned by ``llm(formatted_prompt)`` when ``llm`` is
        provided; otherwise the formatted prompt string itself, which is
        useful for inspecting exactly what would be sent to the model.
    """
    if retrieved_chunks is None:
        retrieved_chunks = []

    # Combine the retrieved chunks into one context string, separated by blank
    # lines. Coercing to str keeps a stray non-string entry from crashing join().
    context = "\n\n".join(
        str(chunk) for chunk in retrieved_chunks if chunk is not None
    )

    # Fill the template placeholders with the context and the question.
    formatted_prompt = prompt_template.format(context=context, question=question)

    # Without an LLM there is nothing to query: return the prompt for inspection.
    if llm is None:
        return formatted_prompt

    # The LLM is treated as a plain callable; its return value is passed through
    # unchanged, so any model-specific post-processing stays with the caller.
    return llm(formatted_prompt)

# Example usage, reusing `results` from the retrieval cell above.
# The call itself is left commented out because no LLM object ('my_llm') is
# defined in this notebook; replace 'my_llm' with your actual LLM integration
# before uncommenting.
retrieved_chunks = [chunk for chunk, meta, dist in results]
question = "Why are people unhappy with personal loans?"
# generated_answer = generate_response_from_rag(question, retrieved_chunks, prompt_template, my_llm)
# print(generated_answer)

NameError: name 'my_llm' is not defined