# Text Chunking

## Import Libraries

In [1]:
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter

## Load Filtered Data

In [2]:
# Relative path to the filtered complaints CSV (resolved against the current working directory).
file_path = '../data/filtered_complaints.csv'
# Read the whole file into `df`; later cells add derived columns to this same DataFrame.
df = pd.read_csv(file_path)

In [10]:
# Preview the first rows of the loaded data.
# NOTE(review): this cell's execution count (10) is out of sequence, and its saved output
# already shows `narrative_chunks`, a column that is only created further down. It appears
# to have been re-run after chunking; restart the kernel and run all cells to refresh it.
df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,...,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID,narrative_length,cleaned_narrative,cleaned_narrative_length,narrative_chunks
0,2025-06-13,Credit card,Store credit card,Getting a credit card,Card opened without my consent or knowledge,A XXXX XXXX card was opened under my name by a...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",TX,78230,...,Web,2025-06-13,Closed with non-monetary relief,Yes,,14069121,91,a xxxx xxxx card was opened under my name by a...,91,[a xxxx xxxx card was opened under my name by ...
1,2025-06-12,Credit card,General-purpose credit card or charge card,"Other features, terms, or problems",Other problem,"Dear CFPB, I have a secured credit card with c...",Company has responded to the consumer and the ...,"CITIBANK, N.A.",NY,11220,...,Web,2025-06-13,Closed with monetary relief,Yes,,14047085,156,dear cfpb i have a secured credit card with ci...,156,[dear cfpb i have a secured credit card with c...
2,2025-06-12,Credit card,General-purpose credit card or charge card,Incorrect information on your report,Account information incorrect,I have a Citi rewards cards. The credit balanc...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",IL,60067,...,Web,2025-06-12,Closed with explanation,Yes,,14040217,233,i have a citi rewards cards the credit balance...,231,[i have a citi rewards cards the credit balanc...
3,2025-06-09,Credit card,General-purpose credit card or charge card,Problem with a purchase shown on your statement,Credit card company isn't resolving a dispute ...,b'I am writing to dispute the following charge...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",TX,78413,...,Web,2025-06-09,Closed with monetary relief,Yes,,13968411,454,bi am writing to dispute the following charges...,454,[bi am writing to dispute the following charge...
4,2025-06-09,Credit card,General-purpose credit card or charge card,Problem when making payments,Problem during payment process,"Although the account had been deemed closed, I...",Company believes it acted appropriately as aut...,Atlanticus Services Corporation,NY,11212,...,Web,2025-06-09,Closed with monetary relief,Yes,,13965746,170,although the account had been deemed closed i ...,170,[although the account had been deemed closed i...


## Text Chunking

In [4]:
def _count_words(narrative):
    """Return the number of whitespace-separated tokens in `narrative`."""
    return len(str(narrative).split())


# Length is a WORD count (not a character count); missing narratives are treated as '' and count as 0.
df['cleaned_narrative_length'] = df['cleaned_narrative'].fillna('').map(_count_words)

In [5]:
# Summary statistics of the per-narrative word counts.
narrative_lengths = df['cleaned_narrative_length']
print(narrative_lengths.describe())

# Keep the extremes in their own variables so they can be reported explicitly.
min_narrative_length, max_narrative_length = narrative_lengths.min(), narrative_lengths.max()

print(f"Minimum narrative length: {min_narrative_length}")
print(f"Maximum narrative length: {max_narrative_length}")

count    272371.000000
mean        195.221356
std         214.504954
min           1.000000
25%          87.000000
50%         126.000000
75%         242.000000
max        6469.000000
Name: cleaned_narrative_length, dtype: float64
Minimum narrative length: 1
Maximum narrative length: 6469


In [6]:
# Example function to experiment with chunk_size and chunk_overlap
def experiment_text_splitter(text, chunk_sizes, chunk_overlaps):
    """Count the chunks produced for every (chunk_size, chunk_overlap) combination.

    A fresh RecursiveCharacterTextSplitter is built for each pair, `text` is split
    with it, and one summary line per pair is printed.

    Args:
        text: The string to split.
        chunk_sizes: Iterable of `chunk_size` values to try.
        chunk_overlaps: Iterable of `chunk_overlap` values to try for each size.

    Returns:
        dict mapping each `(chunk_size, chunk_overlap)` tuple to the number of
        chunks the splitter returned for `text`.
    """
    chunk_counts = {}
    # Sizes vary slowest and overlaps fastest, i.e. the same order as a nested loop.
    grid = ((size, overlap) for size in chunk_sizes for overlap in chunk_overlaps)
    for size, overlap in grid:
        pieces = RecursiveCharacterTextSplitter(
            chunk_size=size,
            chunk_overlap=overlap,
        ).split_text(text)
        n_pieces = len(pieces)
        chunk_counts[(size, overlap)] = n_pieces
        print(f"chunk_size={size}, chunk_overlap={overlap} => {n_pieces} chunks")
    return chunk_counts

# Use a sample narrative for testing
# Takes the first non-null cleaned narrative, so the results below reflect a single example only.
sample_narrative = df['cleaned_narrative'].dropna().iloc[0]
# Candidate settings; every (size, overlap) pair is tried: 3 x 3 = 9 combinations.
chunk_sizes = [256, 512, 1024]
chunk_overlaps = [0, 50, 100]
# Prints one line per combination; the returned dict is shown as the cell's output.
experiment_text_splitter(sample_narrative, chunk_sizes, chunk_overlaps)

chunk_size=256, chunk_overlap=0 => 2 chunks
chunk_size=256, chunk_overlap=50 => 3 chunks
chunk_size=256, chunk_overlap=100 => 3 chunks
chunk_size=512, chunk_overlap=0 => 1 chunks
chunk_size=512, chunk_overlap=50 => 1 chunks
chunk_size=512, chunk_overlap=100 => 1 chunks
chunk_size=1024, chunk_overlap=0 => 1 chunks
chunk_size=1024, chunk_overlap=50 => 1 chunks
chunk_size=1024, chunk_overlap=100 => 1 chunks


{(256, 0): 2,
 (256, 50): 3,
 (256, 100): 3,
 (512, 0): 1,
 (512, 50): 1,
 (512, 100): 1,
 (1024, 0): 1,
 (1024, 50): 1,
 (1024, 100): 1}

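- The length statistics above are **word** counts (75% of narratives have at most 242 words), whereas `chunk_size` is measured in **characters** by default, so the two numbers are not directly comparable.
- At `chunk_size=256` even the 91-word sample narrative is split into 2–3 chunks, so most narratives (not just ~25%) will likely be split into multiple chunks — which is where overlap helps keep context across chunk boundaries.
- The chosen balance is therefore `chunk_size=256`, `chunk_overlap=50`, which yields 3 chunks for the sample narrative.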

In [7]:
# Production splitter, using the settings chosen above (chunk_size=256, chunk_overlap=50).
final_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=50)

# Missing narratives are replaced by '' before splitting; each row receives the
# splitter's result for its own narrative.
df['narrative_chunks'] = df['cleaned_narrative'].fillna('').map(final_splitter.split_text)

# Show each narrative next to its chunks for the first few rows.
df[['cleaned_narrative', 'narrative_chunks']].head()

Unnamed: 0,cleaned_narrative,narrative_chunks
0,a xxxx xxxx card was opened under my name by a...,[a xxxx xxxx card was opened under my name by ...
1,dear cfpb i have a secured credit card with ci...,[dear cfpb i have a secured credit card with c...
2,i have a citi rewards cards the credit balance...,[i have a citi rewards cards the credit balanc...
3,bi am writing to dispute the following charges...,[bi am writing to dispute the following charge...
4,although the account had been deemed closed i ...,[although the account had been deemed closed i...


## Save Chunked Data

In [9]:
# Save the DataFrame to a CSV file
# index=False leaves the DataFrame index out of the written file.
# NOTE(review): `narrative_chunks` holds the splitter's per-row output (lists, judging by
# the preview above); CSV stores such values as their string representation, so a reader
# will presumably need to parse that column back into lists — confirm downstream handling.
df.to_csv('../data/chunked_complaints.csv', index=False)

# NOTE(review): the message omits the leading '../' used in the path written above.
print("Chunked dataset saved to data/chunked_complaints.csv")

Chunked dataset saved to data/chunked_complaints.csv
