# In [None]:
# 01_eda_and_cleaning.ipynb

### 📌 Task 1: Exploratory Data Analysis and Preprocessing
# Author: Miskir
# Dataset: complaints.csv

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os

# Make sure the data and report folders are present before anything is
# read from or written to them.
for _folder in ("../data", "../reports"):
    os.makedirs(_folder, exist_ok=True)

# Read the raw complaints file into memory and report its dimensions.
df = pd.read_csv("../data/complaints.csv")
print(f"Dataset shape: {df.shape}")
# Preview of the first rows (only rendered when this is the last
# expression of a notebook cell).
df.head()

# Horizontal bar chart of complaint counts per product, most frequent first.
product_order = df["Product"].value_counts().index
fig, ax = plt.subplots(figsize=(10, 5))
# NOTE(review): recent seaborn releases reportedly warn when `palette` is
# passed without `hue` — confirm against the installed version.
sns.countplot(y="Product", data=df, order=product_order, palette='viridis', ax=ax)
ax.set_title("Complaint Count by Product")
ax.set_xlabel("Count")
ax.set_ylabel("Product")
fig.tight_layout()
# Write the figure to disk before showing it.
fig.savefig("../reports/product_distribution.png")
plt.show()

# Word count distribution
# Missing narratives are replaced with "" before counting so they contribute
# 0 words. Casting NaN directly to str would produce the literal "nan",
# which would be counted as a one-word narrative.
narrative_text = df["Consumer complaint narrative"].fillna("").astype(str)
# Whitespace-split each narrative and count the resulting tokens.
df["narrative_length"] = narrative_text.str.split().str.len()
plt.figure(figsize=(8, 4))
plt.hist(df["narrative_length"], bins=50, color="steelblue")
plt.title("Narrative Word Count Distribution")
plt.xlabel("Words")
plt.ylabel("Frequency")
plt.tight_layout()
# Write the figure to disk before showing it.
plt.savefig("../reports/narrative_wordcount.png")
plt.show()

# Filter relevant products
# NOTE(review): `isin` matches these labels exactly (case and punctuation
# included) — verify they appear verbatim in the "Product" column.
relevant_products = [
    "Credit card",
    "Personal loan",
    "Buy Now, Pay Later",
    "Savings account",
    "Money transfer, virtual currency, or money service",
]
# Keep only rows whose product is in the list AND that have a narrative.
is_relevant_product = df["Product"].isin(relevant_products)
has_narrative = df["Consumer complaint narrative"].notnull()
# .copy() makes df_filtered an independent frame, so adding columns to it
# later is an unambiguous write rather than an assignment on a slice of df.
df_filtered = df[is_relevant_product & has_narrative].copy()

# Clean narrative
def clean_text(text):
    """Normalize a complaint narrative to lowercase alphanumeric text.

    Args:
        text: Raw narrative. Normally a string. ``None`` and float NaN are
            treated as an empty narrative; any other non-string value is
            converted with ``str()`` before cleaning.

    Returns:
        The lowercased text with newlines and carriage returns replaced by
        spaces, every character other than ``a-z``, ``0-9`` and whitespace
        removed, and leading/trailing whitespace stripped.
    """
    # Missing values have no .lower(); map them to an empty narrative
    # instead of raising. (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    # Turn line breaks into spaces so words on adjacent lines stay separated.
    text = re.sub(r"\n|\r", " ", text)
    # Drop punctuation and any other non-alphanumeric, non-whitespace symbol.
    text = re.sub(r"[^a-z0-9\s]", "", text)
    return text.strip()

# Build the normalized narrative column from the raw complaint text.
raw_narratives = df_filtered["Consumer complaint narrative"]
df_filtered["cleaned_narrative"] = raw_narratives.apply(clean_text)

# Persist the filtered rows (without the index column) and confirm on stdout.
output_csv = "../data/filtered_complaints.csv"
df_filtered.to_csv(output_csv, index=False)
print("✅ Cleaned data saved to ../data/filtered_complaints.csv")
