In [2]:
# ==================================================
# CELL 1: Import All Necessary Libraries
# ==================================================

# Data manipulation libraries
import pandas as pd
import numpy as np

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Utility libraries
import warnings
import os

# Ignore warning messages to keep output clean
warnings.filterwarnings('ignore')

# Make plots appear in the notebook
%matplotlib inline

# Set plot style for beautiful visualizations
sns.set_style("whitegrid")
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (14, 7)
plt.rcParams['font.size'] = 11

# Display all columns in pandas (don't truncate)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)

print("="*60)
print("‚úÖ ALL LIBRARIES IMPORTED SUCCESSFULLY!")
print("="*60)
print("üìä Pandas version:", pd.__version__)
print("üî¢ NumPy version:", np.__version__)
print("="*60)
print("üéØ READY TO LOAD YOUR DATASET!")
print("="*60)


‚úÖ ALL LIBRARIES IMPORTED SUCCESSFULLY!
üìä Pandas version: 2.3.3
üî¢ NumPy version: 1.26.4
üéØ READY TO LOAD YOUR DATASET!


In [3]:
# Location of the raw complaints export, relative to the notebook's working directory
data_path = '../data/raw/complaints.csv'

print("üîÑ Loading dataset... This might take 30-60 seconds...")
print(f"üìÇ Loading from: {data_path}")
print("-" * 60)

# Read the whole file in one pass (low_memory=False) instead of in chunks
df = pd.read_csv(data_path, low_memory=False)

# Summarise what was loaded: shape plus deep memory footprint in MB
row_count, column_count = df.shape
size_in_mb = round(df.memory_usage(deep=True).sum() / 1024**2, 2)
section_rule = "=" * 60

print("‚úÖ DATASET LOADED SUCCESSFULLY!")
print(section_rule)
print(f"üìä Total Rows (Records): {row_count:,}")
print(f"üìã Total Columns (Features): {column_count}")
print(section_rule)
print("üíæ Dataset Size in Memory:", size_in_mb, "MB")
print(section_rule)

üîÑ Loading dataset... This might take 30-60 seconds...
üìÇ Loading from: ../data/raw/complaints.csv
------------------------------------------------------------
‚úÖ DATASET LOADED SUCCESSFULLY!
üìä Total Rows (Records): 777,959
üìã Total Columns (Features): 18
üíæ Dataset Size in Memory: 868.67 MB


In [4]:
# ==================================================
# CELL 3: Preview the First 5 Records
# ==================================================

# Heading and rule in one call; the bare last expression is what the
# notebook renders as a table.
print("üëÄ FIRST 5 ROWS OF YOUR DATASET:", "=" * 60, sep="\n")
df.head(5)

üëÄ FIRST 5 ROWS OF YOUR DATASET:


Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,03/21/2017,Credit reporting,,Incorrect information on credit report,Information is not mine,,Company has responded to the consumer and the CFPB and chooses not to provide a public response,EXPERIAN DELAWARE GP,TX,77075,Older American,,Phone,03/21/2017,Closed with non-monetary relief,Yes,No,2397100
1,04/19/2017,Debt collection,"Other (i.e. phone, health club, etc.)",Disclosure verification of debt,Not disclosed as an attempt to collect,,,"Security Credit Services, LLC",IL,60643,,,Web,04/20/2017,Closed with explanation,Yes,No,2441777
2,04/19/2017,Credit card,,Other,,,Company has responded to the consumer and the CFPB and chooses not to provide a public response,"CITIBANK, N.A.",IL,62025,,,Referral,04/20/2017,Closed with explanation,Yes,No,2441830
3,04/14/2017,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,Company believes it acted appropriately as authorized by contract or law,"Shellpoint Partners, LLC",CA,90305,,,Referral,04/14/2017,Closed with explanation,Yes,No,2436165
4,04/19/2017,Credit card,,Credit determination,,,Company has responded to the consumer and the CFPB and chooses not to provide a public response,U.S. BANCORP,LA,70571,,,Postal mail,04/21/2017,Closed with explanation,Yes,No,2441726


In [5]:
# ==================================================
# CELL 4: Column Names, Non-Null Counts and Dtypes
# ==================================================

info_rule = "=" * 60
column_total = len(df.columns)

print("üìã COLUMN INFORMATION:")
print(info_rule)
print(f"Total Columns: {column_total}")
print(info_rule)
print("\nüìù Column Names and Data Types:\n")

# DataFrame.info() writes its report straight to stdout
df.info()

üìã COLUMN INFORMATION:
Total Columns: 18

üìù Column Names and Data Types:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 777959 entries, 0 to 777958
Data columns (total 18 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   Date received                 777959 non-null  object
 1   Product                       777959 non-null  object
 2   Sub-product                   542822 non-null  object
 3   Issue                         777959 non-null  object
 4   Sub-issue                     320986 non-null  object
 5   Consumer complaint narrative  157865 non-null  object
 6   Company public response       197884 non-null  object
 7   Company                       777959 non-null  object
 8   State                         772056 non-null  object
 9   ZIP code                      772001 non-null  object
 10  Tags                          109264 non-null  object
 11  Consumer consent provided?    288311 n

In [6]:
# ==================================================
# CELL 5: Analyze Missing Values
# ==================================================
# Builds `missing_df`: one row per column that has at least one null,
# with the null count and the share of rows it represents, sorted from
# most to least incomplete.

print("üîç MISSING VALUES ANALYSIS:")
print("="*60)

# Scan the frame for nulls once; the percentage is derived from the
# counts instead of running a second full isnull() pass.
missing_values = df.isnull().sum()
missing_percent = (missing_values / len(df)) * 100

# Create a summary dataframe
missing_df = pd.DataFrame({
    'Column': missing_values.index,
    'Missing_Count': missing_values.values,
    'Missing_Percent': missing_percent.values
})

# Keep only columns that actually have gaps, worst first
has_missing = missing_df['Missing_Count'] > 0
missing_df = missing_df[has_missing].sort_values('Missing_Percent', ascending=False)

if not missing_df.empty:
    print(f"‚ö†Ô∏è  Found {len(missing_df)} columns with missing values:\n")
    print(missing_df.to_string(index=False))
else:
    print("‚úÖ No missing values found in any column!")

print("\n" + "="*60)

üîç MISSING VALUES ANALYSIS:
‚ö†Ô∏è  Found 9 columns with missing values:

                      Column  Missing_Count  Missing_Percent
                        Tags         668695        85.955044
Consumer complaint narrative         620094        79.707800
     Company public response         580075        74.563698
  Consumer consent provided?         489648        62.940078
                   Sub-issue         456973        58.739985
                 Sub-product         235137        30.224858
          Consumer disputed?           9545         1.226928
                    ZIP code           5958         0.765850
                       State           5903         0.758780



In [7]:
# ==================================================
# CELL 6: Descriptive Statistics (DataFrame.describe)
# ==================================================
# NOTE(review): in the recorded output the only column summarised is
# `Complaint ID`, which looks like an identifier rather than a measure -
# confirm whether these statistics are meaningful for the analysis.

print("üìä STATISTICAL SUMMARY OF NUMERICAL COLUMNS:", "=" * 60, sep="\n")
df.describe()

üìä STATISTICAL SUMMARY OF NUMERICAL COLUMNS:


Unnamed: 0,Complaint ID
count,777959.0
mean,1310413.0
std,728727.6
min,1.0
25%,691289.0
50%,1353374.0
75%,1956984.0
max,2488370.0


In [8]:
# ==================================================
# CELL 7: List Columns and Flag Dispute-Related Ones
# ==================================================

list_rule = "=" * 60

# Numbered inventory of every column (numbering starts at 1)
print("üìù ALL AVAILABLE COLUMNS IN YOUR DATASET:")
print(list_rule)
for position, column_name in enumerate(df.columns, start=1):
    print(f"{position}. {column_name}")
print(list_rule)

# Keyword scan over the column names
print("\nüéØ LOOKING FOR KEY COLUMNS...")
print(list_rule)

# Substrings (lower-case) that mark a column as potentially relevant
important_keywords = ['complaint', 'narrative', 'issue', 'product',
                     'company', 'response', 'resolution', 'consumer',
                     'timely', 'disputed']

# A column is flagged when its lower-cased name contains any keyword
flagged_columns = [
    column_name
    for column_name in df.columns
    if any(keyword in column_name.lower() for keyword in important_keywords)
]

print("\nüîç Columns that might contain dispute information:\n")
for column_name in flagged_columns:
    print(f"‚úì {column_name}")

print("\n" + list_rule)

üìù ALL AVAILABLE COLUMNS IN YOUR DATASET:
1. Date received
2. Product
3. Sub-product
4. Issue
5. Sub-issue
6. Consumer complaint narrative
7. Company public response
8. Company
9. State
10. ZIP code
11. Tags
12. Consumer consent provided?
13. Submitted via
14. Date sent to company
15. Company response to consumer
16. Timely response?
17. Consumer disputed?
18. Complaint ID

üéØ LOOKING FOR KEY COLUMNS...

üîç Columns that might contain dispute information:

‚úì Product
‚úì Sub-product
‚úì Issue
‚úì Sub-issue
‚úì Consumer complaint narrative
‚úì Company public response
‚úì Company
‚úì Consumer consent provided?
‚úì Date sent to company
‚úì Company response to consumer
‚úì Timely response?
‚úì Consumer disputed?
‚úì Complaint ID



In [9]:
# ==================================================
# CELL 8: Find and Analyze the Resolution/Response Column
# ==================================================
# Builds `possible_response_cols` (column names mentioning a response or
# resolution) and prints the value distribution of each one.

# Only 'response' / 'resolution' are matched. The previous 'company'
# keyword additionally selected `Company` and `Date sent to company`,
# which are not response columns, while every response column whose
# name contains 'company' also contains 'response' and is still found.
response_keywords = ('response', 'resolution')
possible_response_cols = [
    col for col in df.columns
    if any(keyword in col.lower() for keyword in response_keywords)
]

print("üéØ POSSIBLE RESPONSE/RESOLUTION COLUMNS:")
print("="*60)
for col in possible_response_cols:
    # value_counts() excludes nulls by default, so its length is the
    # number of distinct non-null values (same figure as nunique()).
    value_distribution = df[col].value_counts()

    print(f"\nüìå Column: {col}")
    print("-"*60)
    print(f"Unique values count: {len(value_distribution)}")
    print("\nUnique values:")
    print(value_distribution)
    print("="*60)

üéØ POSSIBLE RESPONSE/RESOLUTION COLUMNS:

üìå Column: Company public response
------------------------------------------------------------
Unique values count: 10

Unique values:
Company public response
Company has responded to the consumer and the CFPB and chooses not to provide a public response                            97492
Company chooses not to provide a public response                                                                           52473
Company believes it acted appropriately as authorized by contract or law                                                   34323
Company believes the complaint is the result of a misunderstanding                                                          3149
Company disputes the facts presented in the complaint                                                                       2887
Company believes complaint caused principally by actions of third party outside the control or direction of the company     2579
Company believes com

In [10]:
# ==================================================
# CELL 9: Explore Complaint Text/Narrative Column
# ==================================================
# Builds `narrative_cols` (text columns whose name suggests free-form
# complaint text), then prints fill statistics and up to three sample
# entries, each truncated to 300 characters.

# Find narrative/text columns
narrative_cols = []

for col in df.columns:
    col_lower = col.lower()
    if 'narrative' in col_lower or 'description' in col_lower or 'complaint' in col_lower:
        # Accept both object-dtype columns and the dedicated pandas string
        # dtype; comparing against 'object' alone would skip the latter.
        col_dtype = df[col].dtype
        if pd.api.types.is_object_dtype(col_dtype) or pd.api.types.is_string_dtype(col_dtype):
            narrative_cols.append(col)

print("üìù NARRATIVE/TEXT COLUMNS FOUND:")
print("="*60)

for col in narrative_cols:
    # Count non-null entries once and derive the other figures from it
    non_null_count = df[col].count()
    missing_count = len(df) - non_null_count

    print(f"\nüìå Column: {col}")
    print("-"*60)
    print(f"Total entries: {non_null_count:,}")
    print(f"Missing entries: {missing_count:,}")
    print(f"Non-null percentage: {(non_null_count / len(df)) * 100:.2f}%")

    # Show a sample narrative
    sample_narratives = df[col].dropna().head(3)
    if len(sample_narratives) > 0:
        print(f"\nüìñ Sample narratives from '{col}':")
        print("-"*60)
        for idx, narrative in enumerate(sample_narratives, 1):
            # Coerce once so the length check and the slice operate on the
            # same string, even if the cell holds a non-string object.
            narrative_text = str(narrative)
            print(f"\nExample {idx}:")
            print(narrative_text[:300] + "..." if len(narrative_text) > 300 else narrative_text)
            print("-"*40)

    print("="*60)

üìù NARRATIVE/TEXT COLUMNS FOUND:

üìå Column: Consumer complaint narrative
------------------------------------------------------------
Total entries: 157,865
Missing entries: 620,094
Non-null percentage: 20.29%

üìñ Sample narratives from 'Consumer complaint narrative':
------------------------------------------------------------

Example 1:
Started the refinance of home mortgage process with cash out option on XX/XX/2016. Necessary documents were submitted by XXXX. After initial review, got good faith estimate with loan amount and closing cost. Based on this estimate, a deposit of {$350.00} was made towards appraisal. Appraisal came wi...
----------------------------------------

Example 2:
My wife and I visited the Chase Bank branch at XXXX, XXXX, KY on XX/XX/2017 around XXXX AM to open new checking accounts. We were prompted to open these checking accounts because Chase mailed and emailed us vouchers to receive {$300.00} for opening each account and personal friends had also re

In [11]:
# ==================================================
# CELL 10: Wrap-Up - Dataset Size, Completeness, Next Steps
# ==================================================

summary_rule = "=" * 70

print(summary_rule)
print(" üéØ DATASET SUMMARY FOR ESCROW DISPUTE RESOLUTION MODEL")
print(summary_rule)

# Size: row/column counts and deep memory footprint in MB
n_rows, n_columns = df.shape
memory_mb = round(df.memory_usage(deep=True).sum() / 1024**2, 2)
print("\nüìä DATASET SIZE:")
print(f"   ‚Ä¢ Total complaints (rows): {n_rows:,}")
print(f"   ‚Ä¢ Total features (columns): {n_columns}")
print(f"   ‚Ä¢ Memory usage: {memory_mb} MB")

# Completeness: share of all cells that are not null
print("\nüìù DATA COMPLETENESS:")
total_cells = n_rows * n_columns
total_missing = df.isnull().sum().sum()
completeness_pct = (total_cells - total_missing) / total_cells * 100
print(f"   ‚Ä¢ Total data cells: {total_cells:,}")
print(f"   ‚Ä¢ Missing cells: {total_missing:,}")
print(f"   ‚Ä¢ Completeness: {completeness_pct:.2f}%")

# Roadmap, printed one step per line
print("\nüéØ NEXT STEPS FOR BUILDING DISPUTE MODEL:")
roadmap_steps = (
    "   1. ‚úì Data loaded successfully",
    "   2. ‚è≠Ô∏è  Create target labels (favour_customer, favor_seller, split_payment)",
    "   3. ‚è≠Ô∏è  Clean and preprocess text data",
    "   4. ‚è≠Ô∏è  Feature engineering from complaint narratives",
    "   5. ‚è≠Ô∏è  Build machine learning model",
    "   6. ‚è≠Ô∏è  Evaluate model performance",
    "   7. ‚è≠Ô∏è  Create prediction system",
)
for step in roadmap_steps:
    print(step)

print("\n" + summary_rule)
print("üéâ DATA EXPLORATION COMPLETE!")
print(summary_rule)
print("\nüí° TIP: Look at the columns carefully above.")
print("   We need to identify which column shows dispute outcomes!")
print(summary_rule)

 üéØ DATASET SUMMARY FOR ESCROW DISPUTE RESOLUTION MODEL

üìä DATASET SIZE:
   ‚Ä¢ Total complaints (rows): 777,959
   ‚Ä¢ Total features (columns): 18
   ‚Ä¢ Memory usage: 868.68 MB

üìù DATA COMPLETENESS:
   ‚Ä¢ Total data cells: 14,003,262
   ‚Ä¢ Missing cells: 3,072,028
   ‚Ä¢ Completeness: 78.06%

üéØ NEXT STEPS FOR BUILDING DISPUTE MODEL:
   1. ‚úì Data loaded successfully
   2. ‚è≠Ô∏è  Create target labels (favour_customer, favor_seller, split_payment)
   3. ‚è≠Ô∏è  Clean and preprocess text data
   4. ‚è≠Ô∏è  Feature engineering from complaint narratives
   5. ‚è≠Ô∏è  Build machine learning model
   6. ‚è≠Ô∏è  Evaluate model performance
   7. ‚è≠Ô∏è  Create prediction system

üéâ DATA EXPLORATION COMPLETE!

üí° TIP: Look at the columns carefully above.
   We need to identify which column shows dispute outcomes!
