In [1]:
%pip install --user nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')  # Style sheet for our imports
import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng') 
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [4]:
df = pd.read_csv(r'C:\Users\USER\Desktop\SQL and Excel Files\complaints-2025-09-15_05_03.csv')
df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,06/07/23,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,,,"EQUIFAX, INC.",NC,28314,,Consent not provided,Web,06/07/23,Closed with non-monetary relief,Yes,,7084840
1,08/10/23,Debt collection,Credit card debt,Communication tactics,"You told them to stop contacting you, but they...",,,ENCORE CAPITAL GROUP INC.,SC,29137,,Consent not provided,Web,08/10/23,Closed with non-monetary relief,Yes,,7384009
2,05/17/23,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,Credit inquiries on your report that you don't...,,,"EQUIFAX, INC.",AR,72120,,Consent not provided,Web,05/17/23,Closed with explanation,Yes,,6989930
3,11/02/23,Credit reporting or other personal consumer re...,Credit reporting,Improper use of your report,Reporting company used your report improperly,,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,AL,35022,,Consent not provided,Web,11/02/23,Closed with non-monetary relief,Yes,,7791286
4,11/16/23,Credit reporting or other personal consumer re...,Credit reporting,Incorrect information on your report,Information belongs to someone else,,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,TX,78539,,Consent not provided,Web,11/16/23,Closed with explanation,Yes,,7860148


In [5]:
df.shape

(1172203, 18)

In [6]:
nulls = df.isnull().sum().reset_index()
nulls.rename(columns={'index': 'column_name', 0: 'null_values'}, inplace=True)

nulls['nulls(%)'] = (nulls['null_values'] / len(df)) * 100
nulls

Unnamed: 0,column_name,null_values,nulls(%)
0,Date received,0,0.0
1,Product,0,0.0
2,Sub-product,0,0.0
3,Issue,0,0.0
4,Sub-issue,4896,0.417675
5,Consumer complaint narrative,753628,64.291595
6,Company public response,463036,39.501349
7,Company,0,0.0
8,State,2180,0.185975
9,ZIP code,76,0.006484


In [7]:
# Dropping unnecessary columns
new_df = df.drop(columns=['Consumer disputed?', 'Tags', 'Sub-product', 'Sub-issue'])

In [8]:
new_df.shape

(1172203, 14)

## Quick Review of Customer Complaint

In [9]:
complaints = new_df['Consumer complaint narrative'].unique().reset_index()
complaints.head()

AttributeError: 'numpy.ndarray' object has no attribute 'reset_index'

In [10]:
# Get unique complaint narratives as a Series
complaints = pd.Series(new_df['Consumer complaint narrative'].unique())

# Convert to DataFrame
complaints_df = complaints.to_frame(name='complaint').reset_index(drop=True)

print(len(complaints_df))
complaints_df.head(20)

282430


Unnamed: 0,complaint
0,
1,"Back in XXXX XXXX XXXX, we were finally approv..."
2,XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX X...
3,I am very concerned because my credit reports ...
4,I am sincerely pleading with you to remove thi...
5,Contacted agency no information matched mine. ...
6,To let the credit bureaus know that the inquir...
7,AS OF XX/XX/XXXX I HAVE NOTICED FRAUD ACCOUNTS...
8,I've been having a lot of issues with Experian...
9,This account has caused me significant distres...


## Basic NLTK

In [11]:
example = complaints_df['complaint'][16]
print(example)

I went to redeem my reward points valued at {$1200.00} from Chase on a credit card ending in XXXX. I received the confirmation email, but the money never made it to my account. I called chase and they confirmed my account number and routing number were correct. They said the points should be corrected and in my account within 1-2 business days, but they still havent shown. Ive tried calling the bank multiple times and my calls wont go through.


In [12]:
# Lets tokenize our sentence
tokens = nltk.word_tokenize(example)
tokens[:20] 

['I',
 'went',
 'to',
 'redeem',
 'my',
 'reward',
 'points',
 'valued',
 'at',
 '{',
 '$',
 '1200.00',
 '}',
 'from',
 'Chase',
 'on',
 'a',
 'credit',
 'card',
 'ending']

In [13]:
# Using NLTK to find part of speech
tagged = nltk.pos_tag(tokens) # Each word has been given its part of speech
tagged[:20]

[('I', 'PRP'),
 ('went', 'VBD'),
 ('to', 'TO'),
 ('redeem', 'VB'),
 ('my', 'PRP$'),
 ('reward', 'NN'),
 ('points', 'NNS'),
 ('valued', 'VBN'),
 ('at', 'IN'),
 ('{', '('),
 ('$', '$'),
 ('1200.00', 'CD'),
 ('}', ')'),
 ('from', 'IN'),
 ('Chase', 'NNP'),
 ('on', 'IN'),
 ('a', 'DT'),
 ('credit', 'NN'),
 ('card', 'NN'),
 ('ending', 'VBG')]

In [14]:
# For our tags part of speech we put them into an entity
entity = nltk.chunk.ne_chunk(tagged)
entity.pprint()

(S
  I/PRP
  went/VBD
  to/TO
  redeem/VB
  my/PRP$
  reward/NN
  points/NNS
  valued/VBN
  at/IN
  {/(
  $/$
  1200.00/CD
  }/)
  from/IN
  (PERSON Chase/NNP)
  on/IN
  a/DT
  credit/NN
  card/NN
  ending/VBG
  in/IN
  (GPE XXXX/NNP)
  ./.
  I/PRP
  received/VBD
  the/DT
  confirmation/NN
  email/NN
  ,/,
  but/CC
  the/DT
  money/NN
  never/RB
  made/VBD
  it/PRP
  to/TO
  my/PRP$
  account/NN
  ./.
  I/PRP
  called/VBD
  chase/NN
  and/CC
  they/PRP
  confirmed/VBD
  my/PRP$
  account/NN
  number/NN
  and/CC
  routing/VBG
  number/NN
  were/VBD
  correct/JJ
  ./.
  They/PRP
  said/VBD
  the/DT
  points/NNS
  should/MD
  be/VB
  corrected/VBN
  and/CC
  in/IN
  my/PRP$
  account/NN
  within/IN
  1-2/JJ
  business/NN
  days/NNS
  ,/,
  but/CC
  they/PRP
  still/RB
  havent/VBD
  shown/VBN
  ./.
  (PERSON Ive/NNP)
  tried/VBD
  calling/VBG
  the/DT
  bank/NN
  multiple/JJ
  times/NNS
  and/CC
  my/PRP$
  calls/NNS
  wont/VBP
  go/VB
  through/IN
  ./.)


## VADER Sentiment Scoring

In [15]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

sia = SentimentIntensityAnalyzer()

In [16]:
# Example
sia.polarity_scores('I love going for hikes')

{'neg': 0.0, 'neu': 0.417, 'pos': 0.583, 'compound': 0.6369}

For our Example, it was tagged mostly as positive

In [17]:
# Example
sia.polarity_scores('You are in big trouble')

{'neg': 0.403, 'neu': 0.597, 'pos': 0.0, 'compound': -0.4019}

Tagged mostly on the negative and neutral side

In [18]:
# Lets see for our previos example
sia.polarity_scores(example)

{'neg': 0.0, 'neu': 0.887, 'pos': 0.113, 'compound': 0.7184}

Tagged as neutral score

## Know lets run the polarity score on the entire dataset.

In [19]:
from tqdm import tqdm
tqdm.pandas()   # enable progress bar for pandas apply

# This is how we would handle our Nan Values
new_df['text_status'] = new_df['Consumer complaint narrative'].apply(
    lambda x: "No text" if pd.isna(x) else "To analyze"
)

# Sentiment analysis function with NaN handling
def compute_sentiment(text):
    if pd.isna(text):  # handle missing text
        return pd.Series({'sentiment_score': None, 'sentiment_label': 'No text'})

    score = sia.polarity_scores(text)['compound']
    if score >= 0.05:
        label = 'Positive'
    elif score <= -0.05:
        label = 'Negative'
    else:
        label = 'Neutral'
    return pd.Series({'sentiment_score': score, 'sentiment_label': label})

# Apply with progress bar
new_df[['sentiment_score', 'sentiment_label']] = new_df['Consumer complaint narrative'].progress_apply(compute_sentiment)

100%|███████████████████████████████████████████████████████████████████████| 1172203/1172203 [33:32<00:00, 582.60it/s]


In [21]:
new_df.shape

(1172203, 17)

In [22]:
# Save to CSV
new_df.to_csv("consumer_complaints_sentiment.csv", index=False, encoding="utf-8")