# Load and Process Downloaded Data

**Author:** Lang Min  
**Date:** 2 December 2025  
**Email:** min.la@northeastern.edu  

## 1. Setup

In [1]:
import pandas as pd

## 2. Load file and filter

In [2]:
# Settings for this step, gathered in one place so they are easy to tune
NARRATIVE_COL = 'Consumer complaint narrative'  # column holding the free-text narrative
MAX_COMPLAINTS = 10000  # upper bound on the number of rows kept
RANDOM_SEED = 42        # fixed seed so the random sample is reproducible

# Read the input CSV
df_all = pd.read_csv('Complaints_0925.csv') # Use the downloaded file <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
print("File read successfully.")
print(f'Number of complaints before filtering: {df_all.shape[0]}')
print()

# Keep only the complaints whose narrative is present and not just whitespace.
# Missing values are mapped to '' and the column is cast to str before using
# the .str accessor: a narrative column with no text at all is read as a
# numeric NaN column, on which .str would raise AttributeError instead of
# simply producing an empty result.
narrative_text = df_all[NARRATIVE_COL].fillna('').astype(str).str.strip()
df_clean = df_all[narrative_text != '']
print(f'Number of complaints after filtering: {df_clean.shape[0]}')
print()

# Cap the number of complaints at MAX_COMPLAINTS by drawing a seeded random
# sample; smaller datasets are passed through unchanged
if len(df_clean) > MAX_COMPLAINTS:
    df = df_clean.sample(n=MAX_COMPLAINTS, random_state=RANDOM_SEED)
else:
    df = df_clean

# Check dataframe
print(f'Number of complaints: {df.shape[0]}')
print(df.columns.tolist())


File read successfully.
Number of complaints before filtering: 47422

Number of complaints after filtering: 7140

Number of complaints: 7140
['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue', 'Consumer complaint narrative', 'Company public response', 'Company', 'State', 'ZIP code', 'Tags', 'Consumer consent provided?', 'Submitted via', 'Date sent to company', 'Company response to consumer', 'Timely response?', 'Consumer disputed?', 'Complaint ID']


## 3. Output

In [None]:
# Destination path for the filtered complaints
output_file = 'Complaints_0925_filtered.csv'  # Choose the name <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

# Write the dataframe to disk; index=False leaves the row index out of the CSV
df.to_csv(path_or_buf=output_file, index=False)
print("File filtered successfully.")
