In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os

%matplotlib inline


def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Args:
        file_path: Path (or any other input accepted by ``pd.read_csv``)
            of the CSV to load. It is passed through unchanged.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    frame = pd.read_csv(file_path)
    return frame

# Read the complaints CSV, then report its dimensions and preview the
# first rows.
input_path = "data/raw/complaints.csv"
df = load_data(input_path)

print(f"Dataset shape: {df.shape}")
preview_rows = df.head()
print(preview_rows)


# Horizontal bar chart of the number of complaints per product. Bars follow
# the order returned by value_counts() on the 'Product' column.
product_order = df['Product'].value_counts().index

plt.figure(figsize=(10, 6))
sns.countplot(data=df, y='Product', order=product_order)
plt.xlabel('Number of Complaints')
plt.ylabel('Product')
plt.title('Distribution of Complaints by Product')
plt.show()

def _narrative_word_count(narrative):
    """Return the whitespace-separated word count of a narrative, or 0 if it is null."""
    if pd.notnull(narrative):
        return len(str(narrative).split())
    return 0


# Per-complaint word count; complaints without a narrative get 0.
df['narrative_length'] = df['Consumer complaint narrative'].apply(_narrative_word_count)

# Histogram of the word counts across all complaints.
plt.figure(figsize=(10, 6))
sns.histplot(df['narrative_length'], bins=50)
plt.xlabel('Word Count')
plt.ylabel('Frequency')
plt.title('Distribution of Narrative Length (Word Count)')
plt.show()

# Summary statistics over the whole 'narrative_length' column.
# NOTE(review): every row of the column is included, so any 0-length entries
# pull the mean and median down -- confirm that is the intended population.
narrative_lengths = df['narrative_length']
mean_narrative_length = narrative_lengths.mean()
median_narrative_length = narrative_lengths.median()
max_narrative_length = narrative_lengths.max()

print(f"Mean narrative length: {mean_narrative_length:.2f} words")
print(f"Median narrative length: {median_narrative_length:.2f} words")
print(f"Max narrative length: {max_narrative_length} words")

# Count complaints with and without a narrative. value_counts() on the
# isna() mask is keyed by True (narrative missing) and False (narrative
# present); .get(..., 0) covers the case where one of the keys never occurs.
narrative_counts = df['Consumer complaint narrative'].isna().value_counts()
complaints_with_narratives = narrative_counts.get(False, 0)
complaints_without_narratives = narrative_counts.get(True, 0)
total_complaints = len(df)

# Guard the division so an empty dataset reports 0% instead of raising
# ZeroDivisionError.
if total_complaints:
    percentage_with_narratives = (complaints_with_narratives / total_complaints) * 100
else:
    percentage_with_narratives = 0.0

print(f"Complaints with narratives: {complaints_with_narratives}")
print(f"Complaints without narratives: {complaints_without_narratives}")
# The percentage was previously computed but never shown.
print(f"Percentage of complaints with narratives: {percentage_with_narratives:.2f}%")

def clean_text(text):
    """Normalize a narrative string for downstream processing.

    The text is lowercased, every character other than ``a-z``, ``0-9`` and
    whitespace is removed, leading/trailing whitespace is stripped, and each
    remaining run of whitespace is collapsed to a single space.

    Args:
        text: The string to clean. Must support ``.lower()``.

    Returns:
        The cleaned string.
    """
    lowered = text.lower()
    # Characters are deleted, not replaced by a space, so "a-b" becomes "ab".
    alnum_only = re.sub(r'[^a-z0-9\s]', '', lowered)
    return re.sub(r'\s+', ' ', alnum_only.strip())

def preprocess_data(df, target_products, output_path):
    """Filter complaints to the target products, clean narratives, save to CSV.

    Rows are kept when their 'Product' value is in ``target_products`` and
    their 'Consumer complaint narrative' is not null. The narrative of each
    kept row is passed through ``clean_text``. The result is written to
    ``output_path`` without the index column.

    Args:
        df: DataFrame with 'Product' and 'Consumer complaint narrative'
            columns. It is not modified.
        target_products: Product values to keep; matched exactly via ``isin``.
        output_path: Destination CSV path. Its parent directory is created
            when it does not exist; a bare filename is written to the
            current working directory.

    Returns:
        The filtered DataFrame with cleaned narratives.
    """
    narrative_col = 'Consumer complaint narrative'
    keep_mask = df['Product'].isin(target_products) & df[narrative_col].notnull()

    # Copy once, after all filtering, so the column assignment below writes
    # to an independent frame. Assigning into a filtered slice can trigger
    # pandas' SettingWithCopyWarning on versions that emit it.
    df_filtered = df.loc[keep_mask].copy()
    df_filtered[narrative_col] = df_filtered[narrative_col].apply(clean_text)

    # os.path.dirname() returns '' for a bare filename and os.makedirs('')
    # raises FileNotFoundError, so only create a directory when one is named.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df_filtered.to_csv(output_path, index=False)
    return df_filtered

# Products kept in the filtered dataset. preprocess_data matches these
# against the 'Product' column with isin(), i.e. exact string equality.
# NOTE(review): confirm these spellings and capitalisations match the values
# actually present in the data.
target_products = [
    'Credit card',
    'Consumer Loan',
    'Payday loan',
    'Checking or savings account',
    'Money transfer',
]
output_path = "data/filtered_complaints.csv"

df_filtered = preprocess_data(df, target_products, output_path)
print(f"Filtered dataset shape: {df_filtered.shape}")
print(f"Filtered dataset saved to {output_path}")
