In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

### Text Cleaning and Sampling 

In [3]:
# Source CSV of filtered complaints and destination CSV for the sample
# produced further down in this cell.
input_file, output_file = (
    "../Data/filtered/filtered_complaints.csv",
    "../Data/filtered/sampled_complaints.csv",
)

# Boilerplate openers plus redaction masks, removed in a single pass.
# The letter lookarounds around x{2,} make sure only standalone masks
# ("xxxx", "xx/xx/xxxx", "xxxx1234") are stripped; real words that merely
# contain a double x ("exxon", "maxx") are left intact.
_NOISE_RE = re.compile(
    r'i am writing to file a complaint|to whom it may concern|dear cfpb'
    r'|(?<![a-z])x{2,}(?![a-z])'
)
# Anything that is not a lowercase letter, digit, whitespace or basic sentence
# punctuation. The text is already lower-cased when this runs, so no A-Z range
# is needed.
_DISALLOWED_CHARS_RE = re.compile(r'[^a-z0-9\s.,!?]')


def clean_narrative(text: object) -> str:
    """Normalise one complaint narrative for downstream chunking.

    Steps, in order:
      1. Non-string input (e.g. NaN for a missing narrative) yields "".
      2. Lower-case the text.
      3. Remove fixed boilerplate phrases and runs of two or more "x" that
         are not part of a longer alphabetic word.
      4. Drop every character other than letters, digits, whitespace and
         the punctuation ``. , ! ?``.
      5. Collapse all whitespace runs to single spaces and trim the ends.

    Parameters
    ----------
    text : object
        Raw narrative value; anything that is not a ``str`` is treated as
        missing.

    Returns
    -------
    str
        The cleaned narrative, possibly empty.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = _NOISE_RE.sub('', text)
    text = _DISALLOWED_CHARS_RE.sub('', text)
    # split()/join collapses newlines, tabs and repeated spaces in one go.
    return " ".join(text.split())

# Tunable settings for this cell.
MIN_ROWS_PER_PRODUCT = 10   # products with fewer rows than this are dropped
SAMPLE_SIZE = 15000         # target number of rows in the stratified sample
RANDOM_STATE = 42           # fixed seed so the sample is reproducible

# Pass 1: identify valid products (>= MIN_ROWS_PER_PRODUCT rows).
# Only the 'Product' column is read, chunk by chunk, and per-chunk counts are
# accumulated; fill_value=0 handles products absent from one side.
chunk_iter = pd.read_csv(input_file, chunksize=100000, usecols=['Product'])
all_counts = pd.Series(dtype=int)
for chunk in chunk_iter:
    all_counts = all_counts.add(chunk['Product'].value_counts(), fill_value=0)

valid_products = all_counts[all_counts >= MIN_ROWS_PER_PRODUCT].index.tolist()

# Pass 2: process, clean, and collect all eligible rows.
processed_data = []
for chunk in pd.read_csv(input_file, chunksize=50000):
    # .copy() so the new column is added to an independent frame, not a view.
    chunk = chunk[chunk['Product'].isin(valid_products)].copy()
    chunk['cleaned_narrative'] = chunk['Consumer complaint narrative'].apply(clean_narrative)
    processed_data.append(chunk)

df_all = pd.concat(processed_data)

# Stratified sampling: keep each product's share of rows in the sample.
# train_test_split needs a non-empty remainder, so when the eligible data is
# not larger than the target we keep every row instead of raising.
if len(df_all) > SAMPLE_SIZE:
    df_sample, _ = train_test_split(
        df_all,
        train_size=SAMPLE_SIZE,
        stratify=df_all['Product'],
        random_state=RANDOM_STATE
    )
else:
    df_sample = df_all.copy()

df_sample.to_csv(output_file, index=False)
# Report the number of rows actually written rather than a hard-coded figure.
print(f"Success. Saved {len(df_sample):,} rows to {output_file}")
print(df_sample['Product'].value_counts())

Success. Saved 15,000 rows to ../Data/filtered/sampled_complaints.csv
Product
Debt collection                                            5091
Checking or savings account                                4251
Money transfer, virtual currency, or money service         2944
Credit card                                                2444
Payday loan, title loan, personal loan, or advance loan     270
Name: count, dtype: int64


### Chunking, Embedding, and Storing in a Vector Store

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Use the longest cleaned narrative in the sample as the "worst case" input.
test_text = df_sample.loc[
    df_sample['cleaned_narrative'].str.len().idxmax(),
    'cleaned_narrative',
]
print(f"Original Length: {len(test_text)} characters")

# Experiment settings, passed to the splitter below.
chunk_size, chunk_overlap = 600, 100

# Separators are listed from coarsest (blank line) to finest (empty string).
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)

# Split the narrative and report how many pieces came out.
test_chunks = splitter.split_text(test_text)

print(f"Total Chunks Created: {len(test_chunks)}\n")

# Show the first two chunks back to back so the shared text at the start of
# the second one (the overlap) is easy to spot.
chunk_headers = (
    " CHUNK 1 ---",
    "\n CHUNK 2 (Look for the overlap at the start) ---",
)
for position, header in enumerate(chunk_headers):
    print(header)
    print(test_chunks[position])

Original Length: 24630 characters
Total Chunks Created: 57

 CHUNK 1 ---
the county court is a corporation. judge was acting as an administrator and not listening or properly reviewing the court record. objected multiple times that lawyer was abusing the legal process, since already met the burden of proof with several certified letters sent out withing 5 days of recieving notice from the parties working with the judge committing fraud and , then , . judge did not have consent showing up in court the day , in special appearance as a live man. reserved their rights without prejudice ucc 1308

 CHUNK 2 (Look for the overlap at the start) ---
. reserved their rights without prejudice ucc 1308. also reserved their given rights, common law rights, civil rights and constitutional protections. they also invoked common law and described with legal citations that that matter was blended with law and common law equity, and cited ucc 103.6. the judge was unfamiliar with blended law and the citati

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Re-select the longest cleaned narrative so this cell reports the same
# "worst case" input length as the 600/100 experiment.
test_text = df_sample.loc[df_sample['cleaned_narrative'].str.len().idxmax(), 'cleaned_narrative']
print(f"Original Length: {len(test_text)} characters")

# Larger-chunk experiment: bigger chunks to cut down the number of fragments
# produced by the 600/100 settings. These are the values actually passed to
# the splitter below.
bigger_chunk_size = 1000    # increased from 600
bigger_chunk_overlap = 200  # increased from 100 to carry more context across chunks

bigger_splitter = RecursiveCharacterTextSplitter(
    chunk_size=bigger_chunk_size,
    chunk_overlap=bigger_chunk_overlap,
    separators=["\n\n", "\n", ". ", " ", ""]
)

bigger_chunks = bigger_splitter.split_text(test_text)
print(f"New Total Chunks: {len(bigger_chunks)}")
# Show the start of the second chunk, i.e. the first one that begins with
# overlapping text from its predecessor.
print(f"Sample Chunk:\n{bigger_chunks[1][:300]}")

Original Length: 24630 characters
New Total Chunks: 32
Sample Chunk:
. the judge was unfamiliar with blended law and the citations of v. , and v. , he did not know about any more recent cases that the case was based on being retired since . explained about v. since there were fraudulent assignments out of a irs static trust, ie tax evasion. the trust was set up confo
