In [None]:
# Import essential libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

In [None]:
# Apply seaborn's white-grid look to every figure drawn in this notebook
sns.set(style="whitegrid")

# Location of the raw complaints export (change this if the file lives elsewhere)
data_path = "data/raw/complaints.csv"

# Read the whole CSV; low_memory=False disables chunked parsing so column
# dtypes are inferred from each full column
df = pd.read_csv(filepath_or_buffer=data_path, low_memory=False)

# Report the dimensions, then display the first five rows
print(f"Shape of the dataset: {df.shape}")
df.head(n=5)

In [None]:
# Horizontal bar chart of complaint counts per product.
# value_counts() sorts descending, so its index puts the most frequent product first.
product_order = df["Product"].value_counts().index

plt.figure(figsize=(12, 6))
ax = sns.countplot(data=df, y="Product", order=product_order)
ax.set_title("Distribution of Complaints by Product")
ax.set_xlabel("Number of Complaints")
ax.set_ylabel("Product")
plt.tight_layout()
plt.show()

In [None]:
# Analyze narrative length
# Count whitespace-separated words in each narrative.
# Missing narratives are replaced with "" before the cast: converting NaN
# straight to str yields the literal text "nan", which would be counted as a
# one-word narrative. With the fill, rows without a narrative get a count of 0.
df["Narrative_Word_Count"] = (
    df["Consumer complaint narrative"]
    .fillna("")
    .astype(str)
    .str.split()   # split on any run of whitespace -> list of words
    .str.len()     # length of each list -> word count
)

# Plot histogram of narrative lengths (rows without a narrative appear at 0)
plt.figure(figsize=(10, 5))
sns.histplot(df["Narrative_Word_Count"], bins=50, kde=True)
plt.title("Distribution of Consumer Narrative Word Count")
plt.xlabel("Word Count")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()

In [None]:
# Tally how many complaints carry a narrative and how many do not
has_narrative = df["Consumer complaint narrative"].notna()
with_narrative = has_narrative.sum()
without_narrative = len(df) - with_narrative

print(f"Complaints with narrative: {with_narrative}")
print(f"Complaints without narrative: {without_narrative}")


In [None]:
# Filter to the five target products and non-empty narratives
# Define the five products we care about.
# NOTE(review): isin() below matches these labels exactly — confirm they agree
# with the values actually present in df["Product"].
target_products = [
    "Credit card",
    "Personal loan",
    "Buy Now, Pay Later (BNPL)",
    "Savings account",
    "Money transfers"
]

# Build one boolean mask: target product AND narrative present AND narrative
# not just whitespace
narratives = df["Consumer complaint narrative"]
keep_mask = (
    df["Product"].isin(target_products)
    & narratives.notna()
    & (narratives.str.strip() != "")
)

# .copy() makes filtered_df an independent frame, so the column assignment
# performed on it later does not raise SettingWithCopyWarning or write to an
# ambiguous view of df
filtered_df = df.loc[keep_mask].copy()

# Check result
print("Filtered dataset shape:", filtered_df.shape)
filtered_df[["Product", "Consumer complaint narrative"]].head()


In [None]:
# Clean the narrative text
def clean_text(text):
    """Normalize a complaint narrative for downstream text processing.

    Steps, in order: lowercase; drop the "i am writing ... complaint" opening
    boilerplate; remove every character that is not an ASCII letter, digit or
    whitespace; collapse whitespace runs to a single space and trim the ends.

    Args:
        text: The raw narrative. Anything that is not a ``str`` (e.g. ``None``
            or a NaN float from a missing cell) is treated as empty.

    Returns:
        The cleaned narrative as a ``str``; ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()  # Lowercase

    # Inside a raw string a regex escape takes a single backslash: r"\b" is a
    # word boundary and r"\s" is whitespace. Doubling it (r"\\s") matches a
    # literal backslash followed by "s", which deleted every space and never
    # collapsed whitespace.
    text = re.sub(r"i am writing.*?complaint.*?\b", "", text)  # Remove boilerplate
    # Text is already lowercase, so a-z covers all remaining ASCII letters
    text = re.sub(r"[^a-z0-9\s]", "", text)  # Remove special characters
    text = re.sub(r"\s+", " ", text).strip()  # Collapse and trim whitespace
    return text

# Run clean_text over every narrative and store the result in a new column
raw_narratives = filtered_df["Consumer complaint narrative"]
filtered_df["Cleaned_Narrative"] = raw_narratives.map(clean_text)

# Show the first rows of product alongside the cleaned text
preview_columns = ["Product", "Cleaned_Narrative"]
filtered_df[preview_columns].head(n=5)


In [None]:
# Save cleaned data
output_path = "data/filtered_complaints.csv"

# Make sure the destination folder exists before writing
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Write the filtered, cleaned frame without the index column
filtered_df.to_csv(output_path, index=False)
print(f"✅ Cleaned dataset saved to: {output_path}")
