# Deep Dive Exploratory Data Analysis

In [None]:
import sys
import os
sys.path.append(os.path.abspath('..'))

import pandas as pd
import matplotlib.pyplot as plt

# Debugging Imports
# Print the on-disk location of the `src` package and of `src.utils` as
# actually imported, so it is visible which copy the sys.path entry added
# above resolved to.
import src
print(f"SRC PKG: {src.__file__}")
import src.utils
print(f"UTILS FILE: {src.utils.__file__}")

from src.config import FILTERED_CSV, IMAGES_DIR
from src.utils import save_plot, generate_wordcloud

# Show up to 100 characters per cell when text columns are displayed.
pd.options.display.max_colwidth = 100

# low_memory=False makes pandas infer each column's dtype from the whole
# file instead of chunk by chunk, avoiding mixed-type columns.
df = pd.read_csv(FILTERED_CSV, low_memory=False)

In [None]:
# Viz 1: Histogram of Complaint Word Counts
# Missing narratives are replaced with '' so they count as 0 words; a bare
# astype(str) would turn NaN into the string "nan" and count it as 1 word.
narratives = df['Consumer complaint narrative'].fillna('').astype(str)
# Whitespace-delimited token count per narrative.
df['word_count'] = narratives.str.split().str.len()

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['word_count'], bins=50, color='teal', edgecolor='black')
ax.set_title('Distribution of Complaint Word Counts')
ax.set_xlabel('Words')
ax.set_ylabel('Frequency')
save_plot(fig, 'word_len_dist.png')
plt.show()

In [None]:
# Viz 2: Top 10 Sub-products
# value_counts() is already ordered by descending frequency, so the first
# ten rows are the ten most common sub-products.
sub_product_counts = df['Sub-product'].value_counts()
top_sub = sub_product_counts.head(10)

fig, ax = plt.subplots(figsize=(10, 6))
# Sort ascending before plotting: horizontal bars are drawn bottom-up, so
# the most frequent sub-product ends up at the top of the chart.
top_sub.sort_values().plot.barh(ax=ax, color='salmon')
ax.set(title='Top 10 Sub-products')
save_plot(fig, 'sub_products.png')
plt.show()

In [None]:
# Viz 3: Monthly complaint volume over time (requires the 'Date received' column)
if 'Date received' in df.columns:
    # Parse in place so the column is datetime-typed from here on.
    df['Date received'] = pd.to_datetime(df['Date received'])
    # Use the MonthEnd offset object rather than the 'M' string alias: the
    # alias is deprecated in pandas 2.2+ (renamed 'ME'), while the offset
    # object gives the same month-end bins on old and new versions alike.
    time_trend = (
        df.set_index('Date received')
        .resample(pd.offsets.MonthEnd())
        .size()
    )
    fig, ax = plt.subplots(figsize=(12, 6))
    time_trend.plot(ax=ax, color='purple')
    ax.set_title('Complaints Over Time')
    save_plot(fig, 'time_trend.png')
    plt.show()
else:
    # Make the skip visible instead of silently producing no output.
    print("Skipping time trend: 'Date received' column not found.")

In [None]:
# Viz 4: WordCloud
from IPython.display import Image

# Drop missing narratives before converting to str: astype(str) would
# otherwise turn every NaN into the literal token "nan" and feed it into
# the word cloud.
narratives = df['Consumer complaint narrative'].dropna().astype(str).tolist()
generate_wordcloud(narratives, 'wordcloud.png')

# NOTE(review): presumably generate_wordcloud writes 'wordcloud.png' into the
# project's images directory; this relative path also assumes the notebook's
# working directory is one level below the project root — confirm.
Image(filename='../images/wordcloud.png')