# Task 1: Exploratory Data Analysis and Data Preprocessing

## Objective
Understand the structure, content, and quality of the CFPB complaint dataset and prepare it for use in a Retrieval-Augmented Generation (RAG) pipeline.

## 1. Load Dataset

We load the full CFPB complaints dataset containing structured metadata and free-text consumer complaint narratives.


In [None]:
import os
import sys
from pathlib import Path
# Make the `src` package importable from this notebook by putting the parent
# of the current working directory at the front of the import search path.
_project_root = Path.cwd().parent.resolve()
sys.path.insert(0, str(_project_root))
print('Added project root to sys.path:', sys.path[0])
print(Path.cwd())


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from src.data_loader import load_complaints_csv
from src.eda import (
    plot_product_distribution,
    analyze_narrative_length,
    count_missing_narratives
)
from src.preprocessing import (
    filter_products_and_narratives,
    apply_text_cleaning,
    get_product_distribution,
    get_narrative_length_stats,
    count_narrative_presence,
    clean_narrative_text,
)

# Paths
# Resolve paths relative to project root (notebooks/ is the notebook CWD)
PROJECT_ROOT = Path.cwd().parent.resolve()
RAW_DATA_PATH = PROJECT_ROOT / "data" / "raw" / "complaints.csv"
OUTPUT_PATH = PROJECT_ROOT / "data" / "processed" / "filtered_complaints.csv"

# Column names shared by every cell below.
PRODUCT_COL = "Product"
TEXT_COL = "Consumer complaint narrative"

# Products of interest. "Money transfers" is spelled in the plural so this
# list agrees with the canonical name used by ALLOWED_PRODUCTS in the
# filtering cell.
TARGET_PRODUCTS = [
    "Credit card",
    "Personal loan",
    "Savings account",
    "Money transfers",
]

# Load data
df = load_complaints_csv(RAW_DATA_PATH)
print(df.shape)
# A bare `df.head()` in the middle of a cell is evaluated and then discarded
# (only a cell's last expression is auto-displayed), so print the preview.
print(df.head())
df.info()

## Narrative Presence Analysis

In [None]:
# Summarise narrative presence with the project helper. The column name comes
# from TEXT_COL rather than a repeated string literal, so the narrative column
# is defined in exactly one place.
presence_stats = count_narrative_presence(
    df,
    text_col=TEXT_COL,
)
print("Narrative presence:", presence_stats)

### Insight:
- A significant portion of the records contains no narrative; these records must be removed before any semantic analysis.

## Product Distribution (Raw Data)

In [None]:

# Compute the product distribution of the unfiltered data with the project
# helper, print it under a heading, then draw the matching chart.
product_dist = get_product_distribution(df, product_col=PRODUCT_COL)
print("Raw product distribution:", product_dist, sep="\n")

plot_product_distribution(df, PRODUCT_COL)

## Narrative Length Analysis

In [None]:
# Word count per narrative: rows without a narrative are dropped first, and a
# "word" is any whitespace-delimited token.
lengths = df[TEXT_COL].dropna().astype(str).map(
    lambda narrative: len(narrative.split())
)

print("Narrative length statistics:")
print(lengths.describe())

# histplot returns the Axes it drew on, so label that object directly.
ax = sns.histplot(x=lengths, bins=50)
ax.set_title("Consumer Complaint Narrative Word Count Distribution")
ax.set_xlabel("Word Count")
ax.set_ylabel("Frequency")
plt.show()

## Cleaning and Filtering

In [None]:

# Report missing-narrative counts for the raw frame via the project helper.
# NOTE(review): the shape of the returned value is not visible from this
# notebook — confirm against count_missing_narratives in src/eda.py.
missing_counts = count_missing_narratives(df, TEXT_COL)
print(missing_counts)


### Filtering to Relevant Products and Valid Narratives

In [None]:
# Restrict the data to the allow-listed products and to rows whose narrative
# is non-empty.
ALLOWED_PRODUCTS = [
    "Credit card", "Personal loan", "Savings account",
    "Money transfers",  # canonical name
]

df_filtered = filter_products_and_narratives(
    df, product_col=PRODUCT_COL, text_col=TEXT_COL,
    allowed_products=ALLOWED_PRODUCTS,
    debug=True,  # optional, prints diagnostics
)

print("Filtered product distribution:")
print(df_filtered[PRODUCT_COL].value_counts())

# Flag every allow-listed product that has no rows left after filtering.
_surviving_products = set(df_filtered[PRODUCT_COL].unique())
missing_products = set(ALLOWED_PRODUCTS) - _surviving_products
if missing_products:
    print(
        "\nWarning: These allowed products are missing after filtering:",
        missing_products,
    )

### Text Cleaning

In [None]:
# Clean narrative text (enhanced)
# -----------------------------

# Apply enhanced text cleaning.
# The column is added with .assign(), which returns a new DataFrame, instead
# of item assignment on df_filtered. df_filtered is the result of a filter, so
# assigning into it in place can raise SettingWithCopyWarning or write into a
# view of the raw frame.
df_filtered = df_filtered.assign(
    cleaned_narrative=df_filtered[TEXT_COL].apply(
        lambda x: clean_narrative_text(x, remove_stopwords=True, lemmatize=True)
    )
)

# Remove any rows where cleaning wiped content.
# NaN entries are not dropped by this comparison (NaN != "" is True); the
# isna() sanity check below is what catches them.
_has_content = df_filtered["cleaned_narrative"].str.strip() != ""
df_filtered = df_filtered[_has_content].reset_index(drop=True)

# Show distribution after cleaning
print("\nFiltered product distribution (post-cleaning):")
print(df_filtered[PRODUCT_COL].value_counts())

print("\nShape after cleaning:", df_filtered.shape)

# Sanity checks: stop the notebook early if cleaning removed too much data or
# left missing values behind.
num_products = df_filtered[PRODUCT_COL].nunique()
assert num_products >= 3, f"Too few products remain after cleaning! ({num_products} found)"

num_empty_narratives = df_filtered["cleaned_narrative"].isna().sum()
assert num_empty_narratives == 0, f"Some cleaned narratives are empty! ({num_empty_narratives} found)"

In [None]:

# Persist the cleaned frame as CSV (without the index column), creating the
# output directory first if it does not exist yet.
_output_dir = OUTPUT_PATH.parent
_output_dir.mkdir(parents=True, exist_ok=True)
df_filtered.to_csv(OUTPUT_PATH, index=False)

print(f"Cleaned dataset saved to {OUTPUT_PATH}")
