# Dataset Preparation for RAG Pipeline

### import libraries 


In [6]:
!pip install nltk



In [15]:
import nltk
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
import numpy as np

In [16]:
import logging
import os


In [17]:
import faiss

# Load the Preprocessed Dataset and look it up 

In [21]:
# Step 1: Load the preprocessed dataset
data_path = r'C:\Users\HP\10 Acadamy PRojects\New folder (6)\Complaint-Analysis-RAG\data\preprocessed_complaints.csv'
try:
    df = pd.read_csv(data_path)
    logging.info(f"Loaded dataset with shape: {df.shape}")
    print("Columns:", df.columns.tolist())
    print("\nSample Data (first 5 rows):")
    print(df[['mapped_product', 'cleaned_narrative']].head())
except Exception as e:
    logging.error(f"Failed to load dataset: {e}")
    raise

2025-07-05 13:06:58,775 - INFO - Loaded dataset with shape: (465679, 21)


Columns: ['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue', 'Consumer complaint narrative', 'Company public response', 'Company', 'State', 'ZIP code', 'Tags', 'Consumer consent provided?', 'Submitted via', 'Date sent to company', 'Company response to consumer', 'Timely response?', 'Consumer disputed?', 'Complaint ID', 'narrative_length', 'mapped_product', 'cleaned_narrative']

Sample Data (first 5 rows):
    mapped_product                                  cleaned_narrative
0      Credit Card  a card was opened under my name by a fraudster...
1  Savings Account  i made the mistake of using my wellsfargo debi...
2      Credit Card  i have a secured credit card with citibank whi...
3      Credit Card  i have a citi rewards cards the credit balance...
4      Credit Card  b the following charges on my citi credit card...


# Chunk the Narratives &  Generate Embeddings

In [19]:

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Ensure NLTK tokenizer is ready
nltk.download('punkt')



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:

# Chunk the narratives
def chunk_narrative(narrative, chunk_size=200, overlap=40):
    try:
        if not isinstance(narrative, str) or not narrative.strip():
            logging.warning("Skipping empty or non-string narrative")
            return []
        words = word_tokenize(narrative)
        if len(words) <= chunk_size:
            return [(narrative, 0)]
        chunks = []
        start = 0
        index = 0
        while start < len(words):
            end = min(start + chunk_size, len(words))
            chunk_words = words[start:end]
            chunk = ' '.join(chunk_words)
            chunks.append((chunk, index))
            start += chunk_size - overlap
            index += 1
        return chunks
    except Exception as e:
        logging.warning(f"Error chunking narrative: {e}")
        return []

# Process in batches to reduce memory usage
batch_size = 5000  # Reduced for local machine
chunked_data = []
for start_idx in range(0, len(df), batch_size):
    batch = df.iloc[start_idx:start_idx + batch_size]
    logging.info(f"Processing batch {start_idx // batch_size + 1}/{len(df) // batch_size + 1}")
    for idx, row in batch.iterrows():
        complaint_id = row['Complaint ID']
        product = row['mapped_product']
        narrative = row['cleaned_narrative']
        chunks = chunk_narrative(narrative)
        for chunk, chunk_idx in chunks:
            chunked_data.append({
                'complaint_id': complaint_id,
                'product': product,
                'chunk_idx': chunk_idx,
                'chunk_text': chunk
            })

df_chunks = pd.DataFrame(chunked_data)
logging.info(f"Chunked dataset shape: {df_chunks.shape}")
print("\nSample Chunks (first 5):")
print(df_chunks[['complaint_id', 'product', 'chunk_idx', 'chunk_text']].head())
df_chunks['chunk_length'] = df_chunks['chunk_text'].apply(lambda x: len(word_tokenize(x)) if isinstance(x, str) else 0)
print("\nChunk Length Summary (words):")
print(df_chunks['chunk_length'].describe())

# Save chunked data
save_path = r'C:\Users\HP\10 Acadamy PRojects\New folder (6)\Complaint-Analysis-RAG\data\chunked_complaints.csv'
df_chunks.to_csv(save_path, index=False)
logging.info(f"Chunked dataset saved as {save_path}")

# Generate embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
batch_size = 256  # Smaller batch size for local machine
embeddings = []
for start_idx in range(0, len(df_chunks), batch_size):
    batch = df_chunks['chunk_text'].iloc[start_idx:start_idx + batch_size].tolist()
    logging.info(f"Encoding batch {start_idx // batch_size + 1}/{len(df_chunks) // batch_size + 1}")
    try:
        batch_embeddings = model.encode(batch, show_progress_bar=True)
        embeddings.append(batch_embeddings)
    except Exception as e:
        logging.error(f"Error encoding batch {start_idx // batch_size + 1}: {e}")
        continue
embeddings = np.vstack(embeddings)
logging.info(f"Embedding shape: {embeddings.shape}")
print("\nSample Embedding (first chunk, first 5 dimensions):")
print(embeddings[0][:5])

# Store embeddings in FAISS
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
logging.info(f"FAISS index size: {index.ntotal}")
faiss.write_index(index, r'C:\Users\HP\10 Acadamy PRojects\New folder (6)\Complaint-Analysis-RAG\data\complaint_index.faiss')

#Re-save chunked data (for consistency)
df_chunks.to_csv(save_path, index=False)
logging.info(f"Chunked dataset re-saved as {save_path}")

# Verify chunk quality
for product in df_chunks['product'].unique():
    print(f"\nSample Chunks for {product}:")
    print(df_chunks[df_chunks['product'] == product][['chunk_idx', 'chunk_text', 'chunk_length']].sample(3))

2025-07-05 20:38:54,646 - INFO - Processing batch 1/94


2025-07-05 20:39:00,431 - INFO - Processing batch 2/94
2025-07-05 20:39:05,116 - INFO - Processing batch 3/94
2025-07-05 20:39:08,106 - INFO - Processing batch 4/94
2025-07-05 20:39:11,352 - INFO - Processing batch 5/94
2025-07-05 20:39:15,360 - INFO - Processing batch 6/94
2025-07-05 20:39:18,893 - INFO - Processing batch 7/94
2025-07-05 20:39:22,161 - INFO - Processing batch 8/94
2025-07-05 20:39:25,899 - INFO - Processing batch 9/94
2025-07-05 20:39:29,379 - INFO - Processing batch 10/94
2025-07-05 20:39:32,277 - INFO - Processing batch 11/94
2025-07-05 20:39:35,881 - INFO - Processing batch 12/94
2025-07-05 20:39:38,917 - INFO - Processing batch 13/94
2025-07-05 20:39:41,755 - INFO - Processing batch 14/94
2025-07-05 20:39:45,005 - INFO - Processing batch 15/94
2025-07-05 20:39:48,753 - INFO - Processing batch 16/94
2025-07-05 20:39:52,981 - INFO - Processing batch 17/94
2025-07-05 20:39:56,976 - INFO - Processing batch 18/94
2025-07-05 20:40:01,078 - INFO - Processing batch 19/94



Sample Chunks (first 5):
   complaint_id          product  chunk_idx  \
0      14069121      Credit Card          0   
1      14061897  Savings Account          0   
2      14047085      Credit Card          0   
3      14040217      Credit Card          0   
4      14040217      Credit Card          1   

                                          chunk_text  
0  a card was opened under my name by a fraudster...  
1  i made the mistake of using my wellsfargo debi...  
2  i have a secured credit card with citibank whi...  
3  i have a citi rewards cards the credit balance...  
4  prior to the notification about reaching my li...  

Chunk Length Summary (words):
count    763160.000000
mean        133.157777
std          64.043164
min           1.000000
25%          78.000000
50%         140.000000
75%         200.000000
max         200.000000
Name: chunk_length, dtype: float64


2025-07-05 20:51:31,428 - INFO - Chunked dataset saved as C:\Users\HP\10 Acadamy PRojects\New folder (6)\Complaint-Analysis-RAG\data\chunked_complaints.csv
2025-07-05 20:51:31,450 - INFO - Use pytorch device_name: cpu
2025-07-05 20:51:31,451 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-07-05 20:51:38,644 - INFO - Encoding batch 1/2982
Batches: 100%|██████████| 8/8 [00:10<00:00,  1.29s/it]
2025-07-05 20:51:48,990 - INFO - Encoding batch 2/2982
Batches: 100%|██████████| 8/8 [00:10<00:00,  1.35s/it]
2025-07-05 20:51:59,819 - INFO - Encoding batch 3/2982
Batches: 100%|██████████| 8/8 [00:10<00:00,  1.37s/it]
2025-07-05 20:52:10,813 - INFO - Encoding batch 4/2982
Batches: 100%|██████████| 8/8 [00:11<00:00,  1.41s/it]
2025-07-05 20:52:22,137 - INFO - Encoding batch 5/2982
Batches: 100%|██████████| 8/8 [00:11<00:00,  1.43s/it]
2025-07-05 20:52:33,611 - INFO - Encoding batch 6/2982
Batches: 100%|██████████| 8/8 [00:11<00:00,  1.43s/it]
2025-07-05 20:52:45,104 - INFO - En


Sample Embedding (first chunk, first 5 dimensions):
[-0.03698349  0.02451562 -0.07761159  0.01777064  0.00104698]


2025-07-06 07:52:32,624 - INFO - FAISS index size: 763160
2025-07-06 07:52:52,871 - INFO - Chunked dataset re-saved as C:\Users\HP\10 Acadamy PRojects\New folder (6)\Complaint-Analysis-RAG\data\chunked_complaints.csv



Sample Chunks for Credit Card:
        chunk_idx                                         chunk_text  \
75684           2  federal laws 2 request the removal of the deli...   
103629          6  somebody made a mistake in the back office ple...   
332797          0  i received a notification from paypal that the...   

        chunk_length  
75684             79  
103629           200  
332797           200  

Sample Chunks for Savings Account:
        chunk_idx                                         chunk_text  \
555400          0  a scam was conducted international scam it was...   
701427          2  an answer i dont think thats right they tell p...   
489390          0  i received a letter from us bank saying i have...   

        chunk_length  
555400           121  
701427            86  
489390            95  

Sample Chunks for Buy Now, Pay Later (BNPL):
        chunk_idx                                         chunk_text  \
108132          1  scam website the loan was not mad

# Checking the FAISS index,chunked dataset size and the quality of the data !

In [27]:
import faiss

# Check FAISS index
index = faiss.read_index(r'C:\Users\HP\10 Acadamy PRojects\New folder (6)\Complaint-Analysis-RAG\data\complaint_index.faiss')
print("\nFAISS Index Size:", index.ntotal)


FAISS Index Size: 763160


In [None]:
df_chunks = pd.read_csv(r'C:\Users\HP\10 Acadamy PRojects\New folder (6)\Complaint-Analysis-RAG\data\chunked_complaints.csv')
print("Chunked Dataset Shape:", df_chunks.shape)
print("\nSample Chunks (first 5):")
print(df_chunks[['complaint_id', 'product', 'chunk_idx', 'chunk_text']].head())
df_chunks['chunk_length'] = df_chunks['chunk_text'].apply(lambda x: len(word_tokenize(x)) if isinstance(x, str) else 0)
print("\nChunk Length Summary (words):")
print(df_chunks['chunk_length'].describe())

Chunked Dataset Shape: (763160, 5)

Sample Chunks (first 5):
   complaint_id          product  chunk_idx  \
0      14069121      Credit Card          0   
1      14061897  Savings Account          0   
2      14047085      Credit Card          0   
3      14040217      Credit Card          0   
4      14040217      Credit Card          1   

                                          chunk_text  
0  a card was opened under my name by a fraudster...  
1  i made the mistake of using my wellsfargo debi...  
2  i have a secured credit card with citibank whi...  
3  i have a citi rewards cards the credit balance...  
4  prior to the notification about reaching my li...  

Chunk Length Summary (words):
count    763160.000000
mean        133.157777
std          64.043164
min           1.000000
25%          78.000000
50%         140.000000
75%         200.000000
max         200.000000
Name: chunk_length, dtype: float64


In [28]:
for product in df_chunks['product'].unique():
    print(f"\nSample Chunks for {product}:")
    print(df_chunks[df_chunks['product'] == product][['chunk_idx', 'chunk_text', 'chunk_length']].sample(3))


Sample Chunks for Credit Card:
        chunk_idx                                         chunk_text  \
380520          1  at they confirmed it was the replacement card ...   
589167          0  i had fraud on my credit card which i cancelle...   
750900          0  its netspend by my card and phone were stolen ...   

        chunk_length  
380520            65  
589167            63  
750900            71  

Sample Chunks for Savings Account:
        chunk_idx                                         chunk_text  \
409285          5  if compelling evidence is provided by the acqu...   
419169          3  my acct with the boa app and change my phone n...   
602557          1  a couple of years ago in a phone call today bo...   

        chunk_length  
409285           200  
419169           100  
602557           121  

Sample Chunks for Buy Now, Pay Later (BNPL):
        chunk_idx                                         chunk_text  \
290353          0  xxxxxxxx i went to upgrade online

# Check short chunks 

In [29]:
# Load chunked dataset
df_chunks = pd.read_csv(r'C:\Users\HP\10 Acadamy PRojects\New folder (6)\Complaint-Analysis-RAG\data\chunked_complaints.csv')

# Check short chunks (<=10 words)
short_chunks = df_chunks[df_chunks['chunk_length'] <= 10]
print("\nShort Chunks (<=10 words):")
print(f"Number of short chunks: {len(short_chunks)}")
print(short_chunks[['complaint_id', 'product', 'chunk_idx', 'chunk_text', 'chunk_length']].head(10))


Short Chunks (<=10 words):
Number of short chunks: 10209
     complaint_id          product  chunk_idx  \
110      13081204      Credit Card          2   
142      13873157  Savings Account          2   
203      13111263      Credit Card          2   
231      14039311  Savings Account          2   
314      13948165  Savings Account          2   
327      13729370      Credit Card          2   
395      13909146  Money Transfers          2   
516      12720611      Credit Card          2   
769      13874580      Credit Card          2   
799      13750104      Credit Card          2   

                                            chunk_text  chunk_length  
110                                        protections             1  
142  correctly or i will take legal action please t...            10  
203                 of the comptroller of the currency             6  
231                                          sincerely             1  
314   marked removed by wells fargo but not rei