# Exploratory Data Analysis and Preprocessing
This notebook loads the CFPB complaints dataset, performs EDA, and preprocesses the data for the RAG pipeline.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re

In [None]:
# Read the raw complaints CSV into a DataFrame, then preview the first rows.
df = pd.read_csv(filepath_or_buffer='../data/complaints.csv')
df.head()

## Data Overview
- Number of records
- Columns
- Sample rows

In [None]:
# Report the size and schema of the loaded frame.
n_records = df.shape[0]
column_names = list(df.columns)
print('Number of records:', n_records)
print('Columns:', column_names)
# Show five randomly chosen rows as the cell output.
df.sample(n=5)

## Distribution of Complaints by Product

In [None]:
# Count complaints per product (value_counts sorts most frequent first)
# and draw them as a bar chart on an explicit Axes.
product_counts = df['Product'].value_counts()
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=product_counts.index, y=product_counts.values, ax=ax)
ax.set_title('Number of Complaints by Product')
ax.set_xlabel('Product')
ax.set_ylabel('Count')
# Tilt the category labels so long product names do not overlap.
plt.xticks(rotation=45)
plt.show()

## Narrative Length Analysis

In [None]:
def _word_count(narrative):
    """Return the number of whitespace-separated tokens in *narrative*."""
    return len(narrative.split())


# Missing narratives are treated as empty strings, so they count as 0 words.
narratives = df['Consumer complaint narrative'].fillna('')
df['narrative_length'] = narratives.map(_word_count)

# Histogram (50 bins) of word counts with a KDE overlay.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df['narrative_length'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Narrative Lengths')
ax.set_xlabel('Word Count')
ax.set_ylabel('Number of Complaints')
plt.show()

## Complaints With and Without Narratives

In [None]:
# One boolean mask marks rows whose narrative is present; its negation
# marks the rows where the narrative is null.
has_narrative = df['Consumer complaint narrative'].notna()
with_narrative = has_narrative.sum()
without_narrative = (~has_narrative).sum()
print(f'Complaints with narrative: {with_narrative}')
print(f'Complaints without narrative: {without_narrative}')

## Filter Dataset for Project Requirements
- Only keep specified products
- Remove empty narratives

In [None]:
# Product categories kept for the project.
# NOTE(review): isin() matches labels exactly, so any entry here that does not
# appear verbatim in df['Product'] silently selects zero rows for that
# category — confirm these against df['Product'].unique().
products = ['Credit card', 'Personal loan', 'Buy Now, Pay Later', 'Savings account', 'Money transfer']

narrative_col = df['Consumer complaint narrative']
in_scope = df['Product'].isin(products)
# Keep rows whose narrative is present and not whitespace-only.
has_text = narrative_col.notnull() & (narrative_col.str.strip() != '')

# .copy() makes `filtered` an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning or depend on view/copy
# semantics of the boolean-indexed slice.
filtered = df[in_scope & has_text].copy()

## Clean Text Narratives

In [None]:
def clean_text(text):
    """Normalize a complaint narrative for downstream text processing.

    Steps, in order: lowercase; delete every character that is not an ASCII
    letter, digit, or whitespace; collapse whitespace runs to a single space;
    remove the boilerplate phrase "i am writing to file a complaint";
    collapse whitespace again and trim the ends.

    Args:
        text: Raw narrative text. Non-string values (e.g. NaN) yield ''.

    Returns:
        The cleaned, lowercased, single-spaced string.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # Whitespace must survive this step; otherwise every word is fused into
    # one token and the boilerplate phrase below can never match.
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Normalize spacing first so the phrase matches across newlines/tabs and
    # repeated spaces in the raw narrative.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'i am writing to file a complaint', '', text)
    # Removing the phrase can leave doubled or leading spaces behind.
    return re.sub(r'\s+', ' ', text).strip()
# Run clean_text over every raw narrative, store the result in a new column,
# and preview the first rows as the cell output.
raw_narratives = filtered['Consumer complaint narrative']
filtered['cleaned_narrative'] = raw_narratives.map(clean_text)
filtered.head()

## Save Cleaned and Filtered Dataset

In [None]:
# Write the filtered frame to disk without the index column, then confirm
# the destination.
filtered.to_csv(path_or_buf='../data/filtered_complaints.csv', index=False)
print('Filtered dataset saved to ../data/filtered_complaints.csv')