In [None]:
import os
import sys
from pathlib import Path
# Ensure project root is on sys.path so `src` imports work in the notebook.
# The project root is taken to be the parent of the current working directory.
_project_root = str(Path.cwd().parent.resolve())
# Re-running this cell must not stack duplicate entries on sys.path: drop any
# existing occurrence and put the root back at index 0 so it keeps the same
# import priority the plain `insert(0, ...)` gave it.
sys.path[:] = [_project_root] + [entry for entry in sys.path if entry != _project_root]
print('Added project root to sys.path:', sys.path[0])
print(Path.cwd())


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from src.data_loader import load_complaints_csv
from src.eda import (
    plot_product_distribution,
    analyze_narrative_length,
    count_missing_narratives
)
from src.preprocessing import (
    filter_products_and_narratives,
    apply_text_cleaning
)

# --- Locations ---------------------------------------------------------------
# Paths are anchored at the project root; the notebook's CWD is notebooks/,
# so the root is one directory up.
PROJECT_ROOT = Path.cwd().parent.resolve()
RAW_DATA_PATH = PROJECT_ROOT.joinpath("data", "raw", "complaints.csv")
OUTPUT_PATH = PROJECT_ROOT.joinpath("data", "processed", "filtered_complaints.csv")

# --- Column names ------------------------------------------------------------
# Product category column and free-text complaint column of the raw CSV.
PRODUCT_COL, TEXT_COL = "Product", "Consumer complaint narrative"

# --- Product whitelist -------------------------------------------------------
# Passed as `allowed_products` to the filtering step further down.
TARGET_PRODUCTS = ["Credit card", "Personal loan", "Savings account", "Money transfer"]

# Load data
df = load_complaints_csv(RAW_DATA_PATH)
print(df.shape)

# Only the final expression of a cell is rendered automatically, so a bare
# `df.head()` in the middle of the cell is evaluated and thrown away.
# `display` is injected by IPython/Jupyter and needs no import there.
display(df.head())

# Assuming `df` is a pandas DataFrame, info() prints its summary itself.
df.info()

# Ten columns with the highest missing-value counts; as the cell's last
# expression this is rendered as the cell output.
df.isna().sum().sort_values(ascending=False).head(10)

In [None]:

# EDA
# Product distribution over the full, unfiltered frame, keyed on PRODUCT_COL.
# NOTE(review): presumably draws a chart inline — confirm against src.eda.

plot_product_distribution(df, PRODUCT_COL)


In [None]:

# Narrative-length analysis of the TEXT_COL column on the unfiltered frame.
# NOTE(review): the return type of analyze_narrative_length is not visible
# here; if it is a DataFrame/Series, ending the cell with a bare
# `length_stats` would render it more readably than print() — confirm.
length_stats = analyze_narrative_length(df, TEXT_COL)
print(length_stats)


In [None]:

# Missing-narrative counts for the TEXT_COL column, computed on the
# unfiltered frame (before any product filtering or cleaning).
# NOTE(review): exact shape of the result depends on src.eda — confirm.
missing_counts = count_missing_narratives(df, TEXT_COL)
print(missing_counts)


In [None]:

# Filtering
# Builds `df_filtered` from the raw frame; `df` is not reassigned here.
# NOTE(review): presumably keeps only rows whose product is in
# TARGET_PRODUCTS and that have a narrative — confirm in src.preprocessing.
df_filtered = filter_products_and_narratives(
    df,
    product_col=PRODUCT_COL,
    text_col=TEXT_COL,
    allowed_products=TARGET_PRODUCTS
)

# Shape after filtering, then a preview as the cell's rendered output.
print(df_filtered.shape)
df_filtered.head()

In [None]:

# Cleaning
# NOTE(review): `df_filtered` is rebound to its own cleaned version, so
# re-running this cell feeds already-cleaned text back into
# apply_text_cleaning. That is only harmless if the cleaning is idempotent —
# confirm in src.preprocessing, or re-run the filtering cell first.
df_filtered = apply_text_cleaning(df_filtered, TEXT_COL)
print(df_filtered.shape)
df_filtered.head()

In [None]:

# Save
# Create the output directory (and any missing parents) if needed;
# exist_ok=True makes this a no-op when it is already there.
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
# index=False: do not write the row index as a CSV column.
# An existing file at OUTPUT_PATH is overwritten.
df_filtered.to_csv(OUTPUT_PATH, index=False)

print(f"Cleaned dataset saved to {OUTPUT_PATH}")
