In [None]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src.data.preprocess import load_cfpb, preprocess_complaints
from src.config import RAW_DIR

# Turn on seaborn's theme once so every figure below shares the same look.
sns.set_theme()

# Raw CFPB export, resolved against the project's configured raw-data directory.
RAW_FILENAME = 'cfpb_complaints.csv'  # change if needed
raw_path = RAW_DIR / RAW_FILENAME
raw_path

In [None]:
# Coerce to a Path before handing it to the project loader, then display the
# shape of what was loaded as the cell output.
raw_csv = Path(raw_path)
df_raw = load_cfpb(raw_csv)
df_raw.shape

## Quick EDA

In [None]:
# Column names reused by the EDA cells that follow.
product_col, narrative_col = 'Product', 'Consumer complaint narrative'

# value_counts() sorts by frequency (descending), so head(15) shows the
# 15 most common product values.
product_counts = df_raw[product_col].value_counts()
product_counts.head(15)

In [None]:
# A narrative counts as present when at least one character survives
# whitespace stripping; missing values are treated as empty strings.
has_narr = (
    df_raw[narrative_col]
    .fillna('')
    .astype(str)
    .str.strip()
    .str.len()
    .gt(0)
)
has_narr.value_counts()

In [None]:
# Words per narrative: split on whitespace and take the length of each token
# list. Missing narratives become '' first, so they count as 0 words.
word_counts = (
    df_raw[narrative_col]
    .fillna('')
    .astype(str)
    .str.split()
    .str.len()
)
word_counts.describe()

In [None]:
# Histogram of narrative lengths; zero-word rows are excluded from the plot.
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(word_counts.loc[word_counts > 0], bins=50, ax=ax)
ax.set_title('Narrative length distribution (word count)')
ax.set_xlabel('Words')
plt.show()

## Filtering + Cleaning (Project Scope)

We keep only the in-scope product categories and remove empty narratives. Then we apply a conservative text cleaning step to improve embedding quality.

In [None]:
# Run the project preprocessing step on the raw frame. The returned object
# exposes the processed frame as `.df` and a summary as `.stats`; the summary
# is displayed as the cell output.
result = preprocess_complaints(df_raw)
df, preprocess_stats = result.df, result.stats
preprocess_stats

In [None]:
# Bar chart of complaint counts per product category, with the bars ordered
# by frequency (value_counts() sorts descending).
fig, ax = plt.subplots(figsize=(8, 4))
category_order = df['product_category'].value_counts().index
sns.countplot(data=df, x='product_category', order=category_order, ax=ax)
# Tilt the tick labels so long category names do not overlap.
plt.xticks(rotation=25, ha='right')
ax.set_title('Filtered complaints by product category')
plt.show()

In [None]:
# Write the filtered frame to CSV (without the index) and echo the destination
# together with the frame's shape.
# NOTE(review): this path is relative, so the file lands under whatever the
# kernel's working directory is -- confirm that is the intended location.
out_path = Path('data/filtered_complaints.csv')
out_dir = out_path.parent
out_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)
(out_path, df.shape)

## Summary (write-up starter)

- The dataset contains a wide spread of narrative lengths, with many short/empty narratives that should be removed for semantic search.
- After filtering to the in-scope product categories and removing empty narratives, we produce a clean dataset suitable for chunking/embeddings.
- Next, we will create a stratified sample and build a vector index for semantic retrieval (Task 2).