# Task 1: Exploratory Data Analysis and Preprocessing

This notebook performs EDA and preprocessing on the CFPB complaint dataset for the CredTrust Financial RAG chatbot.

In [None]:
import re
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Load dataset
df = pd.read_csv('path_to_cfpb_complaints.csv')  # Replace with actual path

# EDA: Complaint distribution by product
# value_counts() returns counts sorted in descending order, so the most
# frequent product is drawn first in the horizontal bar chart.
product_counts = df['Product'].value_counts()
plt.figure(figsize=(10, 6))
sns.barplot(x=product_counts.values, y=product_counts.index)
plt.title('Complaint Distribution by Product')
plt.xlabel('Number of Complaints')
plt.ylabel('Product')
plt.show()

# Narrative length analysis
# Word count = number of whitespace-separated tokens; missing narratives count
# as 0 words. The column is added to `df`, so it is carried into `df_filtered`.
df['narrative_length'] = df['Consumer complaint narrative'].apply(
    lambda x: len(str(x).split()) if pd.notna(x) else 0
)
plt.figure(figsize=(10, 6))
sns.histplot(df['narrative_length'], bins=50)
plt.title('Distribution of Narrative Word Counts')
plt.xlabel('Word Count')
plt.ylabel('Frequency')
plt.show()

# Missing narratives
missing_narratives = df['Consumer complaint narrative'].isna().sum()
print(f'Missing narratives: {missing_narratives} ({missing_narratives / len(df) * 100:.2f}%)')

# Filter for specified products
# NOTE(review): isin() compares labels exactly (case-sensitive); confirm these
# strings match the values actually present in the dataset's 'Product' column.
products = ['Credit Card', 'Personal Loan', 'Buy Now Pay Later', 'Savings Account', 'Money Transfer']

# Remove empty narratives
# Both conditions are combined into one mask and the copy is taken *after* all
# row filtering. Copying before the last boolean filter would leave
# `df_filtered` as a slice of another frame, and the later column assignment
# on it can raise pandas' SettingWithCopyWarning.
keep_rows = df['Product'].isin(products) & df['Consumer complaint narrative'].notna()
df_filtered = df.loc[keep_rows].copy()

# Clean narratives
def clean_text(text):
    """Normalize a complaint narrative for downstream text processing.

    Steps, in order:
      1. Lowercase the text.
      2. Delete every character that is not a-z, 0-9 or whitespace.
      3. Delete the boilerplate phrase "i am writing to file a complaint".
      4. Collapse runs of whitespace (including newlines) to a single space
         and trim both ends.

    Args:
        text: The raw narrative. Non-string values (e.g. NaN or None) are
            treated as an empty narrative.

    Returns:
        The cleaned narrative as a string; '' when the input is not a string.
    """
    # Missing values arrive as float NaN / None, which have no .lower().
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    text = re.sub(r'\bi am writing to file a complaint\b', '', text)
    # Removing punctuation and the boilerplate phrase leaves doubled spaces
    # behind; squeeze them so the output has single-space separators only.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Replace each raw narrative with its cleaned form (overwrites the column in place).
df_filtered['Consumer complaint narrative'] = df_filtered['Consumer complaint narrative'].apply(clean_text)

# Save cleaned dataset
output_path = Path('data/filtered_complaints.csv')
# to_csv does not create missing directories; make sure 'data/' exists so the
# write does not fail with OSError on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)
df_filtered.to_csv(output_path, index=False)
print('Cleaned dataset saved to data/filtered_complaints.csv')