In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os, sys
import re

# Add '../scripts' (resolved against the working directory) to the module search path.
sys.path.append('../scripts') 
import nltk

from nltk.corpus import stopwords
# Fetch the NLTK stopwords corpus, then build a set of the English stopwords
# for membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

In [None]:
def load_and_explore_data(file_path):
    """Load a CSV dataset and print an initial exploratory summary.

    Writes to standard output, in order: the frame's shape, its column
    names, the first five rows, the ``DataFrame.info()`` report, the
    per-column missing-value counts and ``describe(include='all')``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; passed unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded data, exactly as returned by ``pandas.read_csv``.

    Raises
    ------
    Any exception raised by ``pandas.read_csv`` (for example when the file
    does not exist) propagates to the caller; nothing is caught here.
    """
    print(" LOADING AND EXPLORING DATA")
    print("=" * 50)

    # Load the dataset
    df = pd.read_csv(file_path)

    # Basic information
    print(f"Dataset Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print("\nFirst 5 rows:")
    print(df.head())

    print("\nDataset Info:")
    # DataFrame.info() writes its report itself and returns None, so it must
    # not be wrapped in print() -- doing so emits a stray "None" line.
    df.info()

    print("\nMissing Values:")
    print(df.isnull().sum())

    print("\nBasic Statistics:")
    print(df.describe(include='all'))

    return df

# Read the raw complaints file, then echo its dimensions once more below the report.
raw_complaints_path = '../data/complaints.csv'
df = load_and_explore_data(raw_complaints_path)
print(f"Dataset Shape: {df.shape}")

In [None]:
# Bar chart of the ten products that received the most complaints
product_counts = df['Product'].value_counts()
top_products = product_counts.head(10)  # value_counts() sorts by descending count

plt.figure(figsize=(12, 6))
sns.barplot(x=top_products.index, y=top_products.values)
plt.xticks(rotation=45)
plt.ylabel("Number of Complaints")
plt.title("Top 10 Complaint Counts by Product")
plt.show()


In [None]:
# Word count of each complaint narrative, plus its distribution and extremes
def _count_words(narrative):
    """Return the number of whitespace-separated tokens in ``narrative``."""
    return len(str(narrative).split())

present_narratives = df['Consumer complaint narrative'].dropna()
# Assignment aligns on the index, so rows dropped above receive NaN.
df['narrative_length'] = present_narratives.apply(_count_words)

df['narrative_length'].hist(bins=50)
plt.xlabel("Word Count")
plt.ylabel("Frequency")
plt.title("Distribution of Narrative Lengths")
plt.show()

narrative_lengths = df['narrative_length']
print("Shortest narrative length:", narrative_lengths.min())
print("Longest narrative length:", narrative_lengths.max())


In [None]:
# How many complaints carry a free-text narrative, and how many do not
narrative_is_present = df['Consumer complaint narrative'].notnull()
with_narrative = narrative_is_present.sum()
without_narrative = (~narrative_is_present).sum()

print(f"Complaints with narrative: {with_narrative}")
print(f"Complaints without narrative: {without_narrative}")


In [None]:
# Keep only complaints for the five target products that also have a narrative
target_products = [
    'Credit card',
    'Personal loan',
    'Buy Now, Pay Later (BNPL)',
    'Savings account',
    'Money transfer, virtual currency',
]

# NOTE(review): isin() is an exact string match -- confirm these labels match
# the values actually present in the 'Product' column of the CSV.
is_target_product = df['Product'].isin(target_products)
narrative_available = df['Consumer complaint narrative'].notnull()

# .copy() detaches the subset so later column assignments do not touch df.
filtered_df = df.loc[is_target_product & narrative_available].copy()

print(f"Filtered dataset size: {filtered_df.shape}")


In [None]:
# Clean text narratives
def clean_text(text):
    """Normalize one complaint narrative for downstream text processing.

    Steps, in order: lowercase; delete every character that is not
    ``a-z``, ``0-9`` or whitespace; remove a leading-style boilerplate
    phrase of the form "i am writing ... complaint(s)"; collapse runs of
    whitespace to a single space and strip the ends.

    Parameters
    ----------
    text : str
        Raw narrative. Any non-string value (``None``, ``NaN``, numbers)
        is treated as "no text".

    Returns
    -------
    str
        The cleaned narrative, or ``''`` when ``text`` is not a string.
    """
    # Missing values arrive as None/float NaN; .lower() would raise on them.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Remove the boilerplate opener only. The span between "i am writing" and
    # "complaint" is capped at 10 words so a distant, unrelated "complaint"
    # cannot cause real content to be deleted, and the optional "s" plus word
    # boundaries avoid leaving fragments such as a stray "s" from "complaints".
    text = re.sub(r'\bi am writing(?:\s+\w+){0,10}?\s+complaints?\b', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Clean every retained narrative and store the result next to the raw text.
raw_narratives = filtered_df['Consumer complaint narrative']
filtered_df['cleaned_narrative'] = raw_narratives.apply(clean_text)


In [None]:
# Write the filtered, cleaned complaints to a CSV file (row index omitted)
output_path = '../data/filtered_complaints.csv'
filtered_df.to_csv(path_or_buf=output_path, index=False)
print(f"Cleaned dataset saved to: {output_path}")
